From d701956ac5d42433b72ddd7cf4dc05b0662f9028 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Thu, 27 Jun 2024 13:54:54 +0200 Subject: [PATCH 01/14] Add eval_during_augmentation attribute --- baybe/constraints/base.py | 4 ++++ baybe/constraints/discrete.py | 8 ++++++++ baybe/searchspace/core.py | 5 +++++ 3 files changed, 17 insertions(+) diff --git a/baybe/constraints/base.py b/baybe/constraints/base.py index 3509c27f0..00797d085 100644 --- a/baybe/constraints/base.py +++ b/baybe/constraints/base.py @@ -36,6 +36,10 @@ class Constraint(ABC, SerialMixin): eval_during_modeling: ClassVar[bool] """Class variable encoding whether the condition is evaluated during modeling.""" + eval_during_augmentation: ClassVar[bool] = False + """Class variable encoding whether the constraint could be considered during data + augmentation.""" + numerical_only: ClassVar[bool] = False """Class variable encoding whether the constraint is valid only for numerical parameters.""" diff --git a/baybe/constraints/discrete.py b/baybe/constraints/discrete.py index ee8cf9d0e..468df14ba 100644 --- a/baybe/constraints/discrete.py +++ b/baybe/constraints/discrete.py @@ -133,6 +133,10 @@ class DiscreteDependenciesConstraint(DiscreteConstraint): a single constraint. """ + # class variables + eval_during_augmentation: ClassVar[bool] = True + # See base class + # object variables conditions: list[Condition] = field() """The list of individual conditions.""" @@ -220,6 +224,10 @@ class DiscretePermutationInvarianceConstraint(DiscreteConstraint): evaluated during modeling to make use of the invariance. 
""" + # class variables + eval_during_augmentation: ClassVar[bool] = True + # See base class + # object variables dependencies: DiscreteDependenciesConstraint | None = field(default=None) """Dependencies connected with the invariant parameters.""" diff --git a/baybe/searchspace/core.py b/baybe/searchspace/core.py index 17f1f49f9..b724fd793 100644 --- a/baybe/searchspace/core.py +++ b/baybe/searchspace/core.py @@ -380,6 +380,11 @@ def transform( return comp_rep + @property + def constraints_augmentable(self) -> tuple[Constraint, ...]: + """The searchspace constraints that can be considered during augmentation.""" + return tuple(c for c in self.constraints if c.eval_during_augmentation) + def validate_searchspace_from_config(specs: dict, _) -> None: """Validate the search space specifications while skipping costly creation steps.""" From ac843f4f3dcd4471a3beadc0e288de33f2027baf Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Thu, 27 Jun 2024 15:21:16 +0200 Subject: [PATCH 02/14] Add permutation augmentation utility --- baybe/utils/augmentation.py | 33 ++++++++++++++ tests/test_utils.py | 89 +++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 baybe/utils/augmentation.py diff --git a/baybe/utils/augmentation.py b/baybe/utils/augmentation.py new file mode 100644 index 000000000..c3800bf80 --- /dev/null +++ b/baybe/utils/augmentation.py @@ -0,0 +1,33 @@ +"""Utilities related to data augmentation.""" + +from collections.abc import Sequence +from itertools import permutations + +import pandas as pd + + +def df_apply_permutation_augmentation( + df: pd.DataFrame, columns: Sequence[str] +) -> pd.DataFrame: + """Bla.""" + new_rows: list[pd.DataFrame] = [] + for index, row in df.iterrows(): + # Extract the values from the specified columns + original_values = row[columns].tolist() + + # Generate all permutations of these values + all_perms = list(permutations(original_values)) + + # For each permutation, create a new row if it's not 
already in the DataFrame + for perm in all_perms: + # Create a new row dictionary with the permuted values + new_row = row.copy().to_frame().T + new_row[columns] = perm + new_rows.append(new_row) + + augmented_df = pd.concat([df] + new_rows) + + # Drop duplicates if any created inadvertently + augmented_df.drop_duplicates(inplace=True) + + return augmented_df diff --git a/tests/test_utils.py b/tests/test_utils.py index 956a7b51e..09eb1bc4c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -6,6 +6,7 @@ import pytest from pytest import param +from baybe.utils.augmentation import df_apply_permutation_augmentation from baybe.utils.basic import register_hooks from baybe.utils.memory import bytes_to_human_readable from baybe.utils.numerical import closest_element @@ -120,3 +121,91 @@ def test_invalid_register_hooks(target, hook): """Passing inconsistent signatures to `register_hooks` raises an error.""" with pytest.raises(TypeError): register_hooks(target, [hook]) + + +@pytest.mark.parametrize( + ("data", "columns", "data_expected"), + [ + param( # 2 invariant cols and 1 unaffected col + { + "A": [1, 1], + "B": [2, 2], + "C": ["x", "y"], + }, + ["A", "B"], + { + "A": [1, 2, 1, 2], + "B": [2, 1, 2, 1], + "C": ["x", "x", "y", "y"], + }, + id="2inv+1add", + ), + param( # 2 invariant cols with identical values + {"A": [1, 1], "B": [2, 2]}, + ["A", "B"], + { + "A": [1, 2], + "B": [2, 1], + }, + id="2inv_degen", + ), + param( # 2 invariant cols with identical values but different targets + {"A": [1, 1], "B": [2, 2], "T": ["x", "y"]}, + ["A", "B"], + { + "A": [1, 1, 2, 2], + "B": [2, 2, 1, 1], + "T": ["x", "y", "x", "y"], + }, + id="2inv_degen+target_unique", + ), + param( # 2 invariant cols with identical values but different targets + {"A": [1, 1], "B": [2, 2], "T": ["x", "x"]}, + ["A", "B"], + { + "A": [1, 2], + "B": [2, 1], + "T": ["x", "x"], + }, + id="2inv_degen+target_degen", + ), + param( # 3 invariant cols + {"A": [1, 1], "B": [2, 4], "C": [3, 5]}, + ["A", 
"B", "C"], + { + "A": [1, 1, 2, 2, 3, 3, 1, 1, 4, 4, 5, 5], + "B": [2, 3, 1, 3, 2, 1, 4, 5, 1, 5, 1, 4], + "C": [3, 2, 3, 1, 1, 2, 5, 4, 5, 1, 4, 1], + }, + id="3inv", + ), + param( # 3 invariant cols + {"A": [1, 1], "B": [2, 4], "C": [3, 5], "D": ["x", "y"]}, + ["A", "B", "C"], + { + "A": [1, 1, 2, 2, 3, 3, 1, 1, 4, 4, 5, 5], + "B": [2, 3, 1, 3, 2, 1, 4, 5, 1, 5, 1, 4], + "C": [3, 2, 3, 1, 1, 2, 5, 4, 5, 1, 4, 1], + "D": ["x", "x", "x", "x", "x", "x", "y", "y", "y", "y", "y", "y"], + }, + id="3inv+1add", + ), + ], +) +def test_df_invariance_augmentation(data, columns, data_expected): + """Test invariance data augmentation is done correctly.""" + # Create all needed dataframes + df = pd.DataFrame(data) + df_augmented = df_apply_permutation_augmentation(df, columns) + df_expected = pd.DataFrame(data_expected) + + # Determine equality ignoring row order + are_equal = ( + pd.merge(left=df_augmented, right=df_expected, how="outer", indicator=True)[ + "_merge" + ] + .eq("both") + .all() + ) + + assert are_equal, (df, df_augmented, df_expected) From dd3b94c4cf4f8e102f181befe5049a768eab5e20 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Thu, 27 Jun 2024 19:35:38 +0200 Subject: [PATCH 03/14] Add dependency augmentation utility --- baybe/utils/augmentation.py | 60 +++++++++++++++++++++++-- tests/test_utils.py | 89 ++++++++++++++++++++++++++++++++++++- 2 files changed, 144 insertions(+), 5 deletions(-) diff --git a/baybe/utils/augmentation.py b/baybe/utils/augmentation.py index c3800bf80..fde4a022d 100644 --- a/baybe/utils/augmentation.py +++ b/baybe/utils/augmentation.py @@ -6,12 +6,25 @@ import pandas as pd +def _row_in_df(row: pd.Series | pd.DataFrame, df: pd.DataFrame) -> bool: + """Bla.""" + if isinstance(row, pd.DataFrame): + if len(row) != 1: + raise ValueError( + f"{_row_in_df.__name__} can only be called with pd.Series or " + f"pd.DataFrame's that have exactly one row." 
+ ) + row = row.iloc[0] + + return (df == row).all(axis=1).any() + + def df_apply_permutation_augmentation( df: pd.DataFrame, columns: Sequence[str] ) -> pd.DataFrame: """Bla.""" new_rows: list[pd.DataFrame] = [] - for index, row in df.iterrows(): + for _, row in df.iterrows(): # Extract the values from the specified columns original_values = row[columns].tolist() @@ -23,11 +36,50 @@ def df_apply_permutation_augmentation( # Create a new row dictionary with the permuted values new_row = row.copy().to_frame().T new_row[columns] = perm - new_rows.append(new_row) + if not _row_in_df(new_row, df): + new_rows.append(new_row) augmented_df = pd.concat([df] + new_rows) - # Drop duplicates if any created inadvertently - augmented_df.drop_duplicates(inplace=True) + return augmented_df + + +def df_apply_dependency_augmentation( + df: pd.DataFrame, + causing: tuple[str, Sequence], + affected: Sequence[tuple[str, Sequence]], +) -> pd.DataFrame: + """Bla.""" + new_rows: list[pd.DataFrame] = [] + + # Iterate through all rows that have an invariance-causing value in the respective + # column + col_causing, vals_causing = causing + df_filtered = df.loc[df[col_causing].isin(vals_causing), :] + for _, row in df_filtered.iterrows(): + # Augment the specific row by growing a dataframe iteratively going through + # the affected columns. In each iteration augmented rows with that column + # changed to all possible values are added. 
+ original_row = row.to_frame().T + + current_augmented = original_row.copy() + for col_affected, vals_affected in affected: + to_add = [] + for _, temp_row in current_augmented.iterrows(): + to_add += [ + new_row + for val in vals_affected + if not _row_in_df( + new_row := temp_row.to_frame().T.assign(**{col_affected: val}), + current_augmented, + ) + ] + current_augmented = pd.concat([current_augmented] + to_add) + + # Drop first entry because it's the original row + current_augmented = current_augmented.iloc[1:, :] + new_rows.append(current_augmented) + + augmented_df = pd.concat([df] + new_rows) return augmented_df diff --git a/tests/test_utils.py b/tests/test_utils.py index 09eb1bc4c..e6eb435de 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -6,7 +6,10 @@ import pytest from pytest import param -from baybe.utils.augmentation import df_apply_permutation_augmentation +from baybe.utils.augmentation import ( + df_apply_dependency_augmentation, + df_apply_permutation_augmentation, +) from baybe.utils.basic import register_hooks from baybe.utils.memory import bytes_to_human_readable from baybe.utils.numerical import closest_element @@ -209,3 +212,87 @@ def test_df_invariance_augmentation(data, columns, data_expected): ) assert are_equal, (df, df_augmented, df_expected) + + +@pytest.mark.parametrize( + ("data", "causing", "affected", "data_expected"), + [ + param( # 1 causing val, 1 col affected (with 3 values) + { + "A": [0, 1], + "B": [3, 4], + "C": ["x", "y"], + }, + ("A", [0]), + [("B", [3, 4, 5])], + { + "A": [0, 1, 0, 0], + "B": [3, 4, 4, 5], + "C": ["x", "y", "x", "x"], + }, + id="1causing_1affected", + ), + param( # 1 causing val, 2 cols affected (with 2 values each) + { + "A": [0, 1], + "B": [3, 4], + "C": ["x", "y"], + }, + ("A", [0]), + [("B", [3, 4]), ("C", ["x", "y"])], + { + "A": [0, 1, 0, 0, 0], + "B": [3, 4, 4, 3, 4], + "C": ["x", "y", "x", "y", "y"], + }, + id="1causing_2affected", + ), + param( # 2 causing vals, 1 col affected (with 3 
values) + { + "A": [0, 1, 2], + "B": [3, 4, 3], + "C": ["x", "y", "z"], + }, + ("A", [0, 1]), + [("B", [3, 4, 5])], + { + "A": [0, 1, 2, 0, 0, 1, 1], + "B": [3, 4, 3, 4, 5, 3, 5], + "C": ["x", "y", "z", "x", "x", "y", "y"], + }, + id="2causing_1affected", + ), + param( # 2 causing vals, 2 cols affected (with 2 values each) + { + "A": [0, 1, 2], + "B": [3, 4, 3], + "C": ["x", "y", "x"], + }, + ("A", [0, 1]), + [("B", [3, 4]), ("C", ["x", "y"])], + { + "A": [0, 1, 2, 0, 0, 0, 1, 1, 1], + "B": [3, 4, 3, 4, 3, 4, 3, 3, 4], + "C": ["x", "y", "x", "x", "y", "y", "y", "x", "x"], + }, + id="2causing_2affected", + ), + ], +) +def test_df_dependency_augmentation(data, causing, affected, data_expected): + """Test dependency data augmentation is done correctly.""" + # Create all needed dataframes + df = pd.DataFrame(data) + df_augmented = df_apply_dependency_augmentation(df, causing, affected) + df_expected = pd.DataFrame(data_expected) + + # Determine equality ignoring row order + are_equal = ( + pd.merge(left=df_augmented, right=df_expected, how="outer", indicator=True)[ + "_merge" + ] + .eq("both") + .all() + ) + + assert are_equal, (df, df_augmented, df_expected) From e760c4c967ba51d1991cdefdd6bc5b0858cf0c4f Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Thu, 27 Jun 2024 20:00:15 +0200 Subject: [PATCH 04/14] Fill docstrings --- baybe/utils/augmentation.py | 78 ++++++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 18 deletions(-) diff --git a/baybe/utils/augmentation.py b/baybe/utils/augmentation.py index fde4a022d..02bd5726f 100644 --- a/baybe/utils/augmentation.py +++ b/baybe/utils/augmentation.py @@ -7,7 +7,18 @@ def _row_in_df(row: pd.Series | pd.DataFrame, df: pd.DataFrame) -> bool: - """Bla.""" + """Check whether a row is fully contained in a dataframe. + + Args: + row: The row to be checked. + df: The dataframe to be checked. + + Returns: + Boolean result. + + Raises: + ValueError: If `row` is a dataframe that contains more than one row. 
+ """ if isinstance(row, pd.DataFrame): if len(row) != 1: raise ValueError( @@ -22,16 +33,27 @@ def _row_in_df(row: pd.Series | pd.DataFrame, df: pd.DataFrame) -> bool: def df_apply_permutation_augmentation( df: pd.DataFrame, columns: Sequence[str] ) -> pd.DataFrame: - """Bla.""" + """Augment a dataframe if permutation invariant columns are present. + + Indices are preserved so that each augmented row will have the same index as its + original. + + Args: + df: The dataframe that should be augmented. + columns: Sequence indicating the permutation invariant columns. + + Returns: + The augmented dataframe containing the original one. + """ new_rows: list[pd.DataFrame] = [] for _, row in df.iterrows(): # Extract the values from the specified columns - original_values = row[columns].tolist() + original_values = row[columns].tolist() # type: ignore[call-overload] # Generate all permutations of these values all_perms = list(permutations(original_values)) - # For each permutation, create a new row if it's not already in the DataFrame + # For each permutation, create a new row if it's not already in the dataframe for perm in all_perms: # Create a new row dictionary with the permuted values new_row = row.copy().to_frame().T @@ -49,36 +71,56 @@ def df_apply_dependency_augmentation( causing: tuple[str, Sequence], affected: Sequence[tuple[str, Sequence]], ) -> pd.DataFrame: - """Bla.""" + """Augment a dataframe if dependency invariant columns are present. + + This works with the concept of column-values pairs for causing and affected column. + Any row present where the specified causing column has one of the provided values + will trigger an augmentation on the affected columns. The latter are augmented by + going through all their invariant values and adding respective new rows. + + Args: + df: The dataframe that should be augmented. + causing: Causing column name and its causing values. + affected: List of affected columns and their invariant values. 
+ + Returns: + The augmented dataframe containing the original one. + """ new_rows: list[pd.DataFrame] = [] - # Iterate through all rows that have an invariance-causing value in the respective - # column + # Iterate through all rows that have a causing value in the respective column. col_causing, vals_causing = causing df_filtered = df.loc[df[col_causing].isin(vals_causing), :] for _, row in df_filtered.iterrows(): # Augment the specific row by growing a dataframe iteratively going through # the affected columns. In each iteration augmented rows with that column - # changed to all possible values are added. + # changed to all possible values are added. If there is more than one affected + # column, it is important to include the augmented rows stemming from the + # preceding columns as well. original_row = row.to_frame().T - current_augmented = original_row.copy() - for col_affected, vals_affected in affected: + currently_added = original_row.copy() # Start with the original row + for col_affected, vals_invariant in affected: to_add = [] - for _, temp_row in current_augmented.iterrows(): + + # Go through all previously added rows + the original row + for _, temp_row in currently_added.iterrows(): to_add += [ new_row - for val in vals_affected + for val in vals_invariant if not _row_in_df( - new_row := temp_row.to_frame().T.assign(**{col_affected: val}), - current_augmented, + new_row := temp_row.to_frame().T.assign( + **{col_affected: val} + ), # this takes the current row and replaces the affected value + currently_added, ) ] - current_augmented = pd.concat([current_augmented] + to_add) + # Update the currently added rows + currently_added = pd.concat([currently_added] + to_add) - # Drop first entry because it's the original row - current_augmented = current_augmented.iloc[1:, :] - new_rows.append(current_augmented) + # Drop first entry because it's the original row and store added rows + currently_added = currently_added.iloc[1:, :] + 
new_rows.append(currently_added) augmented_df = pd.concat([df] + new_rows) From 535f277881d1a4f8cd05a631222a34b2d04b2759 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Thu, 27 Jun 2024 23:19:45 +0200 Subject: [PATCH 05/14] Add searchspace parameter name utility --- baybe/searchspace/continuous.py | 13 +++++++++++++ baybe/searchspace/core.py | 13 +++++++++++++ baybe/searchspace/discrete.py | 13 +++++++++++++ 3 files changed, 39 insertions(+) diff --git a/baybe/searchspace/continuous.py b/baybe/searchspace/continuous.py index ab9c2b715..aad40dfb8 100644 --- a/baybe/searchspace/continuous.py +++ b/baybe/searchspace/continuous.py @@ -458,6 +458,19 @@ def full_factorial(self) -> pd.DataFrame: return pd.DataFrame(index=index).reset_index() + def get_parameters_by_name( + self, names: Sequence[str] + ) -> tuple[NumericalContinuousParameter, ...]: + """Return parameters with the specified names. + + Args: + names: Sequence of names. + + Returns: + The named parameters. + """ + return tuple(p for p in self.parameters if p.name in names) + # Register deserialization hook converter.register_structure_hook(SubspaceContinuous, select_constructor_hook) diff --git a/baybe/searchspace/core.py b/baybe/searchspace/core.py index b724fd793..868339477 100644 --- a/baybe/searchspace/core.py +++ b/baybe/searchspace/core.py @@ -385,6 +385,19 @@ def constraints_augmentable(self) -> tuple[Constraint, ...]: """The searchspace constraints that can be considered during augmentation.""" return tuple(c for c in self.constraints if c.eval_during_augmentation) + def get_parameters_by_name(self, names: Sequence[str]) -> tuple[Parameter, ...]: + """Return parameters with the specified names. + + Args: + names: Sequence of names. + + Returns: + The named parameters. 
+ """ + return self.discrete.get_parameters_by_name( + names + ) + self.continuous.get_parameters_by_name(names) + def validate_searchspace_from_config(specs: dict, _) -> None: """Validate the search space specifications while skipping costly creation steps.""" diff --git a/baybe/searchspace/discrete.py b/baybe/searchspace/discrete.py index 9d41f2f14..11e179e26 100644 --- a/baybe/searchspace/discrete.py +++ b/baybe/searchspace/discrete.py @@ -713,6 +713,19 @@ def transform( except AttributeError: return comp_rep + def get_parameters_by_name( + self, names: Sequence[str] + ) -> tuple[DiscreteParameter, ...]: + """Return parameters with the specified names. + + Args: + names: Sequence of names. + + Returns: + The named parameters. + """ + return tuple(p for p in self.parameters if p.name in names) + def _apply_constraint_filter( df: pd.DataFrame, constraints: Collection[DiscreteConstraint] From f00442ea26bd04b0b0327e4e72e05a8557ddb012 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Fri, 28 Jun 2024 12:47:10 +0200 Subject: [PATCH 06/14] Add dependents to invariance augmentation --- baybe/utils/augmentation.py | 126 ++++++++++++++++++++++++++++++++++-- tests/test_utils.py | 33 +++++++++- 2 files changed, 150 insertions(+), 9 deletions(-) diff --git a/baybe/utils/augmentation.py b/baybe/utils/augmentation.py index 02bd5726f..a043ef222 100644 --- a/baybe/utils/augmentation.py +++ b/baybe/utils/augmentation.py @@ -31,33 +31,93 @@ def _row_in_df(row: pd.Series | pd.DataFrame, df: pd.DataFrame) -> bool: def df_apply_permutation_augmentation( - df: pd.DataFrame, columns: Sequence[str] + df: pd.DataFrame, + columns: Sequence[str], + dependents: Sequence[str] | None = None, ) -> pd.DataFrame: """Augment a dataframe if permutation invariant columns are present. Indices are preserved so that each augmented row will have the same index as its - original. + original. `dependent` columns are augmented in the same order as the `columns`. 
+ + * Original + + +---+---+---+---+ + | A | B | C | D | + +===+===+===+===+ + | a | b | x | y | + +---+---+---+---+ + | b | a | x | z | + +---+---+---+---+ + + * Result with ``columns = ["A", "B"]`` + + +---+---+---+---+ + | A | B | C | D | + +===+===+===+===+ + | a | b | x | y | + +---+---+---+---+ + | b | a | x | z | + +---+---+---+---+ + | b | a | x | y | + +---+---+---+---+ + | a | b | x | z | + +---+---+---+---+ + + * Result with ``columns = ["A", "B"]``, ``dependents = ["C", "D"]`` + + +---+---+---+---+ + | A | B | C | D | + +===+===+===+===+ + | a | b | x | y | + +---+---+---+---+ + | b | a | x | z | + +---+---+---+---+ + | b | a | y | x | + +---+---+---+---+ + | a | b | z | x | + +---+---+---+---+ Args: df: The dataframe that should be augmented. - columns: Sequence indicating the permutation invariant columns. + columns: The permutation invariant columns. + dependents: Columns that are connected to `columns` and should be permuted in + the same manner. Returns: The augmented dataframe containing the original one. + + Raises: + ValueError: If `dependents` has length incompatible with `columns`. """ + dependents = dependents or [] new_rows: list[pd.DataFrame] = [] + + if dependents and len(columns) != len(dependents): + raise ValueError( + "When augmenting permutation invariance with dependent columns, there must " + "be exactly the same amount of 'dependents' as there are 'columns'." 
+ ) + for _, row in df.iterrows(): # Extract the values from the specified columns original_values = row[columns].tolist() # type: ignore[call-overload] + dependent_values = row[dependents].tolist() if dependents else None # type: ignore[call-overload] # Generate all permutations of these values - all_perms = list(permutations(original_values)) + column_perms = list(permutations(original_values)) + dependent_perms = ( + list(permutations(dependent_values)) if dependent_values else None + ) # For each permutation, create a new row if it's not already in the dataframe - for perm in all_perms: + for k, perm in enumerate(column_perms): # Create a new row dictionary with the permuted values new_row = row.copy().to_frame().T new_row[columns] = perm + if dependent_perms: + new_row[dependents] = dependent_perms[k] + if not _row_in_df(new_row, df): new_rows.append(new_row) @@ -78,10 +138,64 @@ def df_apply_dependency_augmentation( will trigger an augmentation on the affected columns. The latter are augmented by going through all their invariant values and adding respective new rows. 
+ * Original + + +---+---+---+---+ + | A | B | C | D | + +===+===+===+===+ + | 0 | 2 | 5 | y | + +---+---+---+---+ + | 1 | 3 | 5 | z | + +---+---+---+---+ + + * Result with ``causing = ("A", [0])``, ``affected = [("B", [2,3,4])]`` + + +---+---+---+---+ + | A | B | C | D | + +===+===+===+===+ + | 0 | 2 | 5 | y | + +---+---+---+---+ + | 1 | 3 | 5 | z | + +---+---+---+---+ + | 0 | 3 | 5 | y | + +---+---+---+---+ + | 0 | 4 | 5 | y | + +---+---+---+---+ + + * Result with ``causing = ("A", [0, 1])`, `affected = [("B", [2,3])]`` + + +---+---+---+---+ + | A | B | C | D | + +===+===+===+===+ + | 0 | 2 | 5 | y | + +---+---+---+---+ + | 1 | 3 | 5 | z | + +---+---+---+---+ + | 0 | 3 | 5 | y | + +---+---+---+---+ + | 1 | 2 | 5 | z | + +---+---+---+---+ + + * Result with ``causing = ("A", [0])`, `affected = [("B", [2,3]), ("C", [5, 6])]`` + + +---+---+---+---+ + | A | B | C | D | + +===+===+===+===+ + | 0 | 2 | 5 | y | + +---+---+---+---+ + | 1 | 3 | 5 | z | + +---+---+---+---+ + | 0 | 3 | 5 | y | + +---+---+---+---+ + | 0 | 2 | 6 | y | + +---+---+---+---+ + | 0 | 3 | 6 | y | + +---+---+---+---+ + Args: df: The dataframe that should be augmented. causing: Causing column name and its causing values. - affected: List of affected columns and their invariant values. + affected: Affected columns and their invariant values. Returns: The augmented dataframe containing the original one. 
diff --git a/tests/test_utils.py b/tests/test_utils.py index e6eb435de..af97dc8aa 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -127,7 +127,7 @@ def test_invalid_register_hooks(target, hook): @pytest.mark.parametrize( - ("data", "columns", "data_expected"), + ("data", "columns", "dependents", "data_expected"), [ param( # 2 invariant cols and 1 unaffected col { @@ -136,6 +136,7 @@ def test_invalid_register_hooks(target, hook): "C": ["x", "y"], }, ["A", "B"], + None, { "A": [1, 2, 1, 2], "B": [2, 1, 2, 1], @@ -146,6 +147,7 @@ def test_invalid_register_hooks(target, hook): param( # 2 invariant cols with identical values {"A": [1, 1], "B": [2, 2]}, ["A", "B"], + None, { "A": [1, 2], "B": [2, 1], @@ -155,6 +157,7 @@ def test_invalid_register_hooks(target, hook): param( # 2 invariant cols with identical values but different targets {"A": [1, 1], "B": [2, 2], "T": ["x", "y"]}, ["A", "B"], + None, { "A": [1, 1, 2, 2], "B": [2, 2, 1, 1], @@ -165,6 +168,7 @@ def test_invalid_register_hooks(target, hook): param( # 2 invariant cols with identical values but different targets {"A": [1, 1], "B": [2, 2], "T": ["x", "x"]}, ["A", "B"], + None, { "A": [1, 2], "B": [2, 1], @@ -175,6 +179,7 @@ def test_invalid_register_hooks(target, hook): param( # 3 invariant cols {"A": [1, 1], "B": [2, 4], "C": [3, 5]}, ["A", "B", "C"], + None, { "A": [1, 1, 2, 2, 3, 3, 1, 1, 4, 4, 5, 5], "B": [2, 3, 1, 3, 2, 1, 4, 5, 1, 5, 1, 4], @@ -185,6 +190,7 @@ def test_invalid_register_hooks(target, hook): param( # 3 invariant cols {"A": [1, 1], "B": [2, 4], "C": [3, 5], "D": ["x", "y"]}, ["A", "B", "C"], + None, { "A": [1, 1, 2, 2, 3, 3, 1, 1, 4, 4, 5, 5], "B": [2, 3, 1, 3, 2, 1, 4, 5, 1, 5, 1, 4], @@ -193,13 +199,34 @@ def test_invalid_register_hooks(target, hook): }, id="3inv+1add", ), + param( # 2 invariant cols, 2 dependent ones, 2 additional ones + { + "Slot1": ["s1", "s2"], + "Slot2": ["s2", "s4"], + "Frac1": [0.1, 0.6], + "Frac2": [0.9, 0.4], + "Other1": ["A", "B"], + "Other2": ["C", 
"D"], + }, + ["Slot1", "Slot2"], + ["Frac1", "Frac2"], + { + "Slot1": ["s1", "s2", "s2", "s4"], + "Slot2": ["s2", "s4", "s1", "s2"], + "Frac1": [0.1, 0.6, 0.9, 0.4], + "Frac2": [0.9, 0.4, 0.1, 0.6], + "Other1": ["A", "B", "A", "B"], + "Other2": ["C", "D", "C", "D"], + }, + id="2inv_degen+2dependent+2add", + ), ], ) -def test_df_invariance_augmentation(data, columns, data_expected): +def test_df_invariance_augmentation(data, columns, dependents, data_expected): """Test invariance data augmentation is done correctly.""" # Create all needed dataframes df = pd.DataFrame(data) - df_augmented = df_apply_permutation_augmentation(df, columns) + df_augmented = df_apply_permutation_augmentation(df, columns, dependents) df_expected = pd.DataFrame(data_expected) # Determine equality ignoring row order From 3b752a5415f4565df3be563df29c38e5a7fb0025 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Fri, 28 Jun 2024 12:48:09 +0200 Subject: [PATCH 07/14] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b75c6722c..597318b25 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ _ `_optional` subpackage for managing optional dependencies - `register_hooks` utility enabling user-defined augmentation of arbitrary callables - `transform` methods of `SearchSpace`, `SubspaceDiscrete` and `SubspaceContinuous` now take additional `allow_missing` and `allow_extra` keyword arguments +- Utilities for permutation and dependency data augmentation ### Changed - Passing an `Objective` to `Campaign` is now optional From f4c013af4553c284fe02ba22d835fe87f1541b5b Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Mon, 1 Jul 2024 16:36:08 +0200 Subject: [PATCH 08/14] Fix row conversion --- baybe/utils/augmentation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/baybe/utils/augmentation.py b/baybe/utils/augmentation.py index a043ef222..73c198148 100644 --- a/baybe/utils/augmentation.py +++ 
b/baybe/utils/augmentation.py @@ -113,7 +113,7 @@ def df_apply_permutation_augmentation( # For each permutation, create a new row if it's not already in the dataframe for k, perm in enumerate(column_perms): # Create a new row dictionary with the permuted values - new_row = row.copy().to_frame().T + new_row = pd.DataFrame([row]) new_row[columns] = perm if dependent_perms: new_row[dependents] = dependent_perms[k] @@ -211,7 +211,7 @@ def df_apply_dependency_augmentation( # changed to all possible values are added. If there is more than one affected # column, it is important to include the augmented rows stemming from the # preceding columns as well. - original_row = row.to_frame().T + original_row = pd.DataFrame([row]) currently_added = original_row.copy() # Start with the original row for col_affected, vals_invariant in affected: @@ -223,7 +223,7 @@ def df_apply_dependency_augmentation( new_row for val in vals_invariant if not _row_in_df( - new_row := temp_row.to_frame().T.assign( + new_row := temp_row.pipe(lambda x: pd.DataFrame([x])).assign( **{col_affected: val} ), # this takes the current row and replaces the affected value currently_added, From 82b9cfd6cdbcb716cc9dbe387efe4a6dcb99fc32 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Mon, 1 Jul 2024 16:52:38 +0200 Subject: [PATCH 09/14] Fix row comparison --- baybe/utils/augmentation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/baybe/utils/augmentation.py b/baybe/utils/augmentation.py index 73c198148..20decdd00 100644 --- a/baybe/utils/augmentation.py +++ b/baybe/utils/augmentation.py @@ -27,6 +27,7 @@ def _row_in_df(row: pd.Series | pd.DataFrame, df: pd.DataFrame) -> bool: ) row = row.iloc[0] + row = row.reindex(df.columns) return (df == row).all(axis=1).any() From 6b46d975bb7888ed36b0d9ebf1ec631e42901c85 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Tue, 2 Jul 2024 13:18:11 +0200 Subject: [PATCH 10/14] Improve strings --- baybe/utils/augmentation.py | 15 ++++++++------- tests/test_utils.py | 12 
++++++++---- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/baybe/utils/augmentation.py b/baybe/utils/augmentation.py index 20decdd00..865fb70be 100644 --- a/baybe/utils/augmentation.py +++ b/baybe/utils/augmentation.py @@ -17,13 +17,13 @@ def _row_in_df(row: pd.Series | pd.DataFrame, df: pd.DataFrame) -> bool: Boolean result. Raises: - ValueError: If `row` is a dataframe that contains more than one row. + ValueError: If ``row`` is a dataframe that contains more than one row. """ if isinstance(row, pd.DataFrame): if len(row) != 1: raise ValueError( f"{_row_in_df.__name__} can only be called with pd.Series or " - f"pd.DataFrame's that have exactly one row." + f"pd.DataFrames that have exactly one row." ) row = row.iloc[0] @@ -39,7 +39,7 @@ def df_apply_permutation_augmentation( """Augment a dataframe if permutation invariant columns are present. Indices are preserved so that each augmented row will have the same index as its - original. `dependent` columns are augmented in the same order as the `columns`. + original. ``dependent`` columns are augmented in the same order as the ``columns``. * Original @@ -82,14 +82,14 @@ def df_apply_permutation_augmentation( Args: df: The dataframe that should be augmented. columns: The permutation invariant columns. - dependents: Columns that are connected to `columns` and should be permuted in + dependents: Columns that are connected to ``columns`` and should be permuted in the same manner. Returns: The augmented dataframe containing the original one. Raises: - ValueError: If `dependents` has length incompatible with `columns`. + ValueError: If ``dependents`` has length incompatible with ``columns``. 
""" dependents = dependents or [] new_rows: list[pd.DataFrame] = [] @@ -163,7 +163,7 @@ def df_apply_dependency_augmentation( | 0 | 4 | 5 | y | +---+---+---+---+ - * Result with ``causing = ("A", [0, 1])`, `affected = [("B", [2,3])]`` + * Result with ``causing = ("A", [0, 1])``, ``affected = [("B", [2,3])]`` +---+---+---+---+ | A | B | C | D | @@ -177,7 +177,8 @@ def df_apply_dependency_augmentation( | 1 | 2 | 5 | z | +---+---+---+---+ - * Result with ``causing = ("A", [0])`, `affected = [("B", [2,3]), ("C", [5, 6])]`` + * Result with ``causing = ("A", [0])``, + ``affected = [("B", [2,3]), ("C", [5, 6])]`` +---+---+---+---+ | A | B | C | D | diff --git a/tests/test_utils.py b/tests/test_utils.py index af97dc8aa..ab10e5dae 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -222,8 +222,8 @@ def test_invalid_register_hooks(target, hook): ), ], ) -def test_df_invariance_augmentation(data, columns, dependents, data_expected): - """Test invariance data augmentation is done correctly.""" +def test_df_permutation_augmentation(data, columns, dependents, data_expected): + """Test permutation invariance data augmentation is done correctly.""" # Create all needed dataframes df = pd.DataFrame(data) df_augmented = df_apply_permutation_augmentation(df, columns, dependents) @@ -238,7 +238,9 @@ def test_df_invariance_augmentation(data, columns, dependents, data_expected): .all() ) - assert are_equal, (df, df_augmented, df_expected) + assert ( + are_equal + ), f"\norig:\n{df}\n\naugmented:\n{df_augmented}\n\nexpected:\n{df_expected}" @pytest.mark.parametrize( @@ -322,4 +324,6 @@ def test_df_dependency_augmentation(data, causing, affected, data_expected): .all() ) - assert are_equal, (df, df_augmented, df_expected) + assert ( + are_equal + ), f"\norig:\n{df}\n\naugmented:\n{df_augmented}\n\nexpected:\n{df_expected}" From ff5315bbee6e2e859da5e818dd1bc1f3dd2e4344 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Tue, 2 Jul 2024 13:39:47 +0200 Subject: [PATCH 11/14] 
Compress dependency augmentation --- baybe/utils/augmentation.py | 45 ++++++++++--------------------------- 1 file changed, 12 insertions(+), 33 deletions(-) diff --git a/baybe/utils/augmentation.py b/baybe/utils/augmentation.py index 865fb70be..d11a1700d 100644 --- a/baybe/utils/augmentation.py +++ b/baybe/utils/augmentation.py @@ -1,7 +1,7 @@ """Utilities related to data augmentation.""" from collections.abc import Sequence -from itertools import permutations +from itertools import permutations, product import pandas as pd @@ -203,40 +203,19 @@ def df_apply_dependency_augmentation( The augmented dataframe containing the original one. """ new_rows: list[pd.DataFrame] = [] - - # Iterate through all rows that have a causing value in the respective column. col_causing, vals_causing = causing df_filtered = df.loc[df[col_causing].isin(vals_causing), :] - for _, row in df_filtered.iterrows(): - # Augment the specific row by growing a dataframe iteratively going through - # the affected columns. In each iteration augmented rows with that column - # changed to all possible values are added. If there is more than one affected - # column, it is important to include the augmented rows stemming from the - # preceding columns as well. 
- original_row = pd.DataFrame([row]) - - currently_added = original_row.copy() # Start with the original row - for col_affected, vals_invariant in affected: - to_add = [] - - # Go through all previously added rows + the original row - for _, temp_row in currently_added.iterrows(): - to_add += [ - new_row - for val in vals_invariant - if not _row_in_df( - new_row := temp_row.pipe(lambda x: pd.DataFrame([x])).assign( - **{col_affected: val} - ), # this takes the current row and replaces the affected value - currently_added, - ) - ] - # Update the currently added rows - currently_added = pd.concat([currently_added] + to_add) - - # Drop first entry because it's the original row and store added rows - currently_added = currently_added.iloc[1:, :] - new_rows.append(currently_added) + affected_cols, affected_inv_vals = zip(*affected) + affected_inv_vals_combinations = list(product(*affected_inv_vals)) + + # Iterate through all rows that have a causing value in the respective column. + for _, r in df_filtered.iterrows(): + to_add = [ + pd.Series({**r.to_dict(), **dict(zip(affected_cols, values))}) + for values in affected_inv_vals_combinations + ] + to_add = [r2 for r2 in to_add if not _row_in_df(r2, df_filtered)] + new_rows.append(pd.DataFrame(to_add)) augmented_df = pd.concat([df] + new_rows) From 5d9d0d31b8c6e2caf6928c8bb173361e7ec75548 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Tue, 2 Jul 2024 19:05:02 +0200 Subject: [PATCH 12/14] Add permutation support for multiple dependents per columns --- baybe/utils/augmentation.py | 64 +++++++++++++++----------- tests/test_utils.py | 90 ++++++++++++++++++++++++++++++------- 2 files changed, 111 insertions(+), 43 deletions(-) diff --git a/baybe/utils/augmentation.py b/baybe/utils/augmentation.py index d11a1700d..8cd886c5d 100644 --- a/baybe/utils/augmentation.py +++ b/baybe/utils/augmentation.py @@ -34,7 +34,7 @@ def _row_in_df(row: pd.Series | pd.DataFrame, df: pd.DataFrame) -> bool: def 
df_apply_permutation_augmentation( df: pd.DataFrame, columns: Sequence[str], - dependents: Sequence[str] | None = None, + dependents: Sequence[Sequence[str]] | None = None, ) -> pd.DataFrame: """Augment a dataframe if permutation invariant columns are present. @@ -65,7 +65,7 @@ def df_apply_permutation_augmentation( | a | b | x | z | +---+---+---+---+ - * Result with ``columns = ["A", "B"]``, ``dependents = ["C", "D"]`` + * Result with ``columns = ["A", "B"]``, ``dependents = [["C"], ["D"]]`` +---+---+---+---+ | A | B | C | D | @@ -83,45 +83,52 @@ def df_apply_permutation_augmentation( df: The dataframe that should be augmented. columns: The permutation invariant columns. dependents: Columns that are connected to ``columns`` and should be permuted in - the same manner. + the same manner. Can be multiple per entry in ``affected`` but all must be + of same length. Returns: The augmented dataframe containing the original one. Raises: ValueError: If ``dependents`` has length incompatible with ``columns``. + ValueError: If entries in ``dependents`` are not of same length. """ + # Validation dependents = dependents or [] + if dependents: + if len(columns) != len(dependents): + raise ValueError( + "When augmenting permutation invariance with dependent columns, " + "'dependents' must have exactly as many entries as 'columns'." + ) + if len({len(d) for d in dependents}) != 1 or len(dependents[0]) < 1: + raise ValueError( + "Augmentation with dependents can only work if the amount of dependent " + "columns provided as entries of 'dependents' is the same for all " + "affected columns. If there are no dependents, set 'dependents' to " + "None." 
+ ) + + # Augmentation Loop new_rows: list[pd.DataFrame] = [] + idx_permutation = list(permutations(range(len(columns)))) + for _, row in df.iterrows(): + to_add = [] + for _, perm in enumerate(idx_permutation): + new_row = row.copy() - if dependents and len(columns) != len(dependents): - raise ValueError( - "When augmenting permutation invariance with dependent columns, there must " - "be exactly the same amount of 'dependents' as there are 'columns'." - ) + # Permute columns + new_row[columns] = row[[columns[k] for k in perm]] - for _, row in df.iterrows(): - # Extract the values from the specified columns - original_values = row[columns].tolist() # type: ignore[call-overload] - dependent_values = row[dependents].tolist() if dependents else None # type: ignore[call-overload] - - # Generate all permutations of these values - column_perms = list(permutations(original_values)) - dependent_perms = ( - list(permutations(dependent_values)) if dependent_values else None - ) - - # For each permutation, create a new row if it's not already in the dataframe - for k, perm in enumerate(column_perms): - # Create a new row dictionary with the permuted values - new_row = pd.DataFrame([row]) - new_row[columns] = perm - if dependent_perms: - new_row[dependents] = dependent_perms[k] + # Permute dependent columns + for deps in map(list, zip(*dependents)): + new_row[deps] = row[[deps[k] for k in perm]] + # Check whether the new row is an existing permutation if not _row_in_df(new_row, df): - new_rows.append(new_row) + to_add.append(new_row) + new_rows.append(pd.DataFrame(to_add)) augmented_df = pd.concat([df] + new_rows) return augmented_df @@ -210,10 +217,13 @@ def df_apply_dependency_augmentation( # Iterate through all rows that have a causing value in the respective column. 
for _, r in df_filtered.iterrows(): + # Create augmented rows to_add = [ pd.Series({**r.to_dict(), **dict(zip(affected_cols, values))}) for values in affected_inv_vals_combinations ] + + # Do not include rows that were present in the original to_add = [r2 for r2 in to_add if not _row_in_df(r2, df_filtered)] new_rows.append(pd.DataFrame(to_add)) diff --git a/tests/test_utils.py b/tests/test_utils.py index ab10e5dae..f16b44ab6 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -142,20 +142,27 @@ def test_invalid_register_hooks(target, hook): "B": [2, 1, 2, 1], "C": ["x", "x", "y", "y"], }, - id="2inv+1add", + id="2inv_1add", ), param( # 2 invariant cols with identical values - {"A": [1, 1], "B": [2, 2]}, + { + "A": [1, 1], + "B": [2, 2], + }, ["A", "B"], None, { - "A": [1, 2], - "B": [2, 1], + "A": [1, 1, 2], + "B": [2, 2, 1], }, - id="2inv_degen", + id="2inv+degen", ), param( # 2 invariant cols with identical values but different targets - {"A": [1, 1], "B": [2, 2], "T": ["x", "y"]}, + { + "A": [1, 1], + "B": [2, 2], + "T": ["x", "y"], + }, ["A", "B"], None, { @@ -163,10 +170,14 @@ def test_invalid_register_hooks(target, hook): "B": [2, 2, 1, 1], "T": ["x", "y", "x", "y"], }, - id="2inv_degen+target_unique", + id="2inv+degen_target", ), param( # 2 invariant cols with identical values but different targets - {"A": [1, 1], "B": [2, 2], "T": ["x", "x"]}, + { + "A": [1, 1], + "B": [2, 2], + "T": ["x", "x"], + }, ["A", "B"], None, { @@ -174,10 +185,14 @@ def test_invalid_register_hooks(target, hook): "B": [2, 1], "T": ["x", "x"], }, - id="2inv_degen+target_degen", + id="2inv+degen_target+degen", ), param( # 3 invariant cols - {"A": [1, 1], "B": [2, 4], "C": [3, 5]}, + { + "A": [1, 1], + "B": [2, 4], + "C": [3, 5], + }, ["A", "B", "C"], None, { @@ -188,7 +203,12 @@ def test_invalid_register_hooks(target, hook): id="3inv", ), param( # 3 invariant cols - {"A": [1, 1], "B": [2, 4], "C": [3, 5], "D": ["x", "y"]}, + { + "A": [1, 1], + "B": [2, 4], + "C": [3, 5], + 
"D": ["x", "y"], + }, ["A", "B", "C"], None, { @@ -197,7 +217,7 @@ def test_invalid_register_hooks(target, hook): "C": [3, 2, 3, 1, 1, 2, 5, 4, 5, 1, 4, 1], "D": ["x", "x", "x", "x", "x", "x", "y", "y", "y", "y", "y", "y"], }, - id="3inv+1add", + id="3inv_1add", ), param( # 2 invariant cols, 2 dependent ones, 2 additional ones { @@ -209,7 +229,7 @@ def test_invalid_register_hooks(target, hook): "Other2": ["C", "D"], }, ["Slot1", "Slot2"], - ["Frac1", "Frac2"], + [["Frac1"], ["Frac2"]], { "Slot1": ["s1", "s2", "s2", "s4"], "Slot2": ["s2", "s4", "s1", "s2"], @@ -218,11 +238,34 @@ def test_invalid_register_hooks(target, hook): "Other1": ["A", "B", "A", "B"], "Other2": ["C", "D", "C", "D"], }, - id="2inv_degen+2dependent+2add", + id="2inv_2dependent_2add", + ), + param( # 2 invariant cols, 2 dependent ones, 2 additional ones + { + "Slot1": ["s1", "s2"], + "Slot2": ["s2", "s4"], + "Frac1": [0.1, 0.6], + "Frac2": [0.9, 0.4], + "Temp1": [10, 20], + "Temp2": [50, 60], + "Other": ["x", "y"], + }, + ["Slot1", "Slot2"], + [["Frac1", "Temp1"], ["Frac2", "Temp2"]], + { + "Slot1": ["s1", "s2", "s2", "s4"], + "Slot2": ["s2", "s4", "s1", "s2"], + "Frac1": [0.1, 0.6, 0.9, 0.4], + "Frac2": [0.9, 0.4, 0.1, 0.6], + "Temp1": [10, 20, 50, 60], + "Temp2": [50, 60, 10, 20], + "Other": ["x", "y", "x", "y"], + }, + id="2inv_4dependent2each_1add", ), ], ) -def test_df_permutation_augmentation(data, columns, dependents, data_expected): +def test_df_permutation_aug(data, columns, dependents, data_expected): """Test permutation invariance data augmentation is done correctly.""" # Create all needed dataframes df = pd.DataFrame(data) @@ -243,6 +286,21 @@ def test_df_permutation_augmentation(data, columns, dependents, data_expected): ), f"\norig:\n{df}\n\naugmented:\n{df_augmented}\n\nexpected:\n{df_expected}" +@pytest.mark.parametrize( + ("columns", "dependents", "msg"), + [ + param(["A"], [["B"], ["C"]], "exactly as many", id="too_manydependents"), + param(["A", "B"], [[], []], "same for all", 
id="dep_length_zero"), + param(["A", "B"], [["C"], []], "same for all", id="different_dep_lengths"), + ], +) +def test_df_permutation_aug_invalid(columns, dependents, msg): + """Test correct errors for invalid permutation attempts.""" + df = pd.DataFrame({"A": [1, 1], "B": [2, 2], "C": ["x", "y"]}) + with pytest.raises(ValueError, match=msg): + df_apply_permutation_augmentation(df, columns, dependents) + + @pytest.mark.parametrize( ("data", "causing", "affected", "data_expected"), [ @@ -308,7 +366,7 @@ def test_df_permutation_augmentation(data, columns, dependents, data_expected): ), ], ) -def test_df_dependency_augmentation(data, causing, affected, data_expected): +def test_df_dependency_aug(data, causing, affected, data_expected): """Test dependency data augmentation is done correctly.""" # Create all needed dataframes df = pd.DataFrame(data) From 62bdb468db3291cd733c8d7d7091d238c54a3a8d Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Tue, 2 Jul 2024 19:27:16 +0200 Subject: [PATCH 13/14] Simplify permutation --- baybe/utils/augmentation.py | 112 ++++++++++++++++-------------------- tests/test_utils.py | 55 ++++++------------ 2 files changed, 67 insertions(+), 100 deletions(-) diff --git a/baybe/utils/augmentation.py b/baybe/utils/augmentation.py index 8cd886c5d..b4b6c5e6e 100644 --- a/baybe/utils/augmentation.py +++ b/baybe/utils/augmentation.py @@ -33,58 +33,53 @@ def _row_in_df(row: pd.Series | pd.DataFrame, df: pd.DataFrame) -> bool: def df_apply_permutation_augmentation( df: pd.DataFrame, - columns: Sequence[str], - dependents: Sequence[Sequence[str]] | None = None, + columns: Sequence[Sequence[str]], ) -> pd.DataFrame: """Augment a dataframe if permutation invariant columns are present. - Indices are preserved so that each augmented row will have the same index as its - original. ``dependent`` columns are augmented in the same order as the ``columns``. 
- * Original - +---+---+---+---+ - | A | B | C | D | - +===+===+===+===+ - | a | b | x | y | - +---+---+---+---+ - | b | a | x | z | - +---+---+---+---+ - - * Result with ``columns = ["A", "B"]`` - - +---+---+---+---+ - | A | B | C | D | - +===+===+===+===+ - | a | b | x | y | - +---+---+---+---+ - | b | a | x | z | - +---+---+---+---+ - | b | a | x | y | - +---+---+---+---+ - | a | b | x | z | - +---+---+---+---+ - - * Result with ``columns = ["A", "B"]``, ``dependents = [["C"], ["D"]]`` - - +---+---+---+---+ - | A | B | C | D | - +===+===+===+===+ - | a | b | x | y | - +---+---+---+---+ - | b | a | x | z | - +---+---+---+---+ - | b | a | y | x | - +---+---+---+---+ - | a | b | z | x | - +---+---+---+---+ + +----+----+----+----+ + | A1 | A2 | B1 | B2 | + +====+====+====+====+ + | a | b | x | y | + +----+----+----+----+ + | b | a | x | z | + +----+----+----+----+ + + * Result with ``columns = [["A1"], ["A2"]]`` + + +----+----+----+----+ + | A1 | A2 | B1 | B2 | + +====+====+====+====+ + | a | b | x | y | + +----+----+----+----+ + | b | a | x | z | + +----+----+----+----+ + | b | a | x | y | + +----+----+----+----+ + | a | b | x | z | + +----+----+----+----+ + + * Result with ``columns = [["A1", "B1"], ["A2", "B2"]]`` + + +----+----+----+----+ + | A1 | A2 | B1 | B2 | + +====+====+====+====+ + | a | b | x | y | + +----+----+----+----+ + | b | a | x | z | + +----+----+----+----+ + | b | a | y | x | + +----+----+----+----+ + | a | b | z | x | + +----+----+----+----+ Args: df: The dataframe that should be augmented. - columns: The permutation invariant columns. - dependents: Columns that are connected to ``columns`` and should be permuted in - the same manner. Can be multiple per entry in ``affected`` but all must be - of same length. + columns: Sequences of permutation invariant columns. The n'th column in each + sequence will be permuted together with each n'th column in the other + sequences. Returns: The augmented dataframe containing the original one. 
@@ -94,20 +89,16 @@ def df_apply_permutation_augmentation(
         ValueError: If entries in ``dependents`` are not of same length.
     """
     # Validation
-    dependents = dependents or []
-    if dependents:
-        if len(columns) != len(dependents):
-            raise ValueError(
-                "When augmenting permutation invariance with dependent columns, "
-                "'dependents' must have exactly as many entries as 'columns'."
-            )
-        if len({len(d) for d in dependents}) != 1 or len(dependents[0]) < 1:
-            raise ValueError(
-                "Augmentation with dependents can only work if the amount of dependent "
-                "columns provided as entries of 'dependents' is the same for all "
-                "affected columns. If there are no dependents, set 'dependents' to "
-                "None."
-            )
+    if len(columns) < 2:
+        raise ValueError(
+            "When augmenting permutation invariance, at least two column sequences "
+            "must be given."
+        )
+    if len({len(seq) for seq in columns}) != 1 or len(columns[0]) < 1:
+        raise ValueError(
+            "Permutation augmentation can only work if the amount of columns in each "
+            "sequence is the same and the sequences are not empty."
+ ) # Augmentation Loop new_rows: list[pd.DataFrame] = [] @@ -118,10 +109,7 @@ def df_apply_permutation_augmentation( new_row = row.copy() # Permute columns - new_row[columns] = row[[columns[k] for k in perm]] - - # Permute dependent columns - for deps in map(list, zip(*dependents)): + for deps in map(list, zip(*columns)): new_row[deps] = row[[deps[k] for k in perm]] # Check whether the new row is an existing permutation diff --git a/tests/test_utils.py b/tests/test_utils.py index f16b44ab6..a2bc6612e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -127,7 +127,7 @@ def test_invalid_register_hooks(target, hook): @pytest.mark.parametrize( - ("data", "columns", "dependents", "data_expected"), + ("data", "columns", "data_expected"), [ param( # 2 invariant cols and 1 unaffected col { @@ -135,8 +135,7 @@ def test_invalid_register_hooks(target, hook): "B": [2, 2], "C": ["x", "y"], }, - ["A", "B"], - None, + [["A"], ["B"]], { "A": [1, 2, 1, 2], "B": [2, 1, 2, 1], @@ -149,8 +148,7 @@ def test_invalid_register_hooks(target, hook): "A": [1, 1], "B": [2, 2], }, - ["A", "B"], - None, + [["A"], ["B"]], { "A": [1, 1, 2], "B": [2, 2, 1], @@ -163,8 +161,7 @@ def test_invalid_register_hooks(target, hook): "B": [2, 2], "T": ["x", "y"], }, - ["A", "B"], - None, + [["A"], ["B"]], { "A": [1, 1, 2, 2], "B": [2, 2, 1, 1], @@ -178,8 +175,7 @@ def test_invalid_register_hooks(target, hook): "B": [2, 2], "T": ["x", "x"], }, - ["A", "B"], - None, + [["A"], ["B"]], { "A": [1, 2], "B": [2, 1], @@ -187,21 +183,6 @@ def test_invalid_register_hooks(target, hook): }, id="2inv+degen_target+degen", ), - param( # 3 invariant cols - { - "A": [1, 1], - "B": [2, 4], - "C": [3, 5], - }, - ["A", "B", "C"], - None, - { - "A": [1, 1, 2, 2, 3, 3, 1, 1, 4, 4, 5, 5], - "B": [2, 3, 1, 3, 2, 1, 4, 5, 1, 5, 1, 4], - "C": [3, 2, 3, 1, 1, 2, 5, 4, 5, 1, 4, 1], - }, - id="3inv", - ), param( # 3 invariant cols { "A": [1, 1], @@ -209,8 +190,7 @@ def test_invalid_register_hooks(target, hook): "C": [3, 5], 
"D": ["x", "y"], }, - ["A", "B", "C"], - None, + [["A"], ["B"], ["C"]], { "A": [1, 1, 2, 2, 3, 3, 1, 1, 4, 4, 5, 5], "B": [2, 3, 1, 3, 2, 1, 4, 5, 1, 5, 1, 4], @@ -228,8 +208,7 @@ def test_invalid_register_hooks(target, hook): "Other1": ["A", "B"], "Other2": ["C", "D"], }, - ["Slot1", "Slot2"], - [["Frac1"], ["Frac2"]], + [["Slot1", "Frac1"], ["Slot2", "Frac2"]], { "Slot1": ["s1", "s2", "s2", "s4"], "Slot2": ["s2", "s4", "s1", "s2"], @@ -250,8 +229,7 @@ def test_invalid_register_hooks(target, hook): "Temp2": [50, 60], "Other": ["x", "y"], }, - ["Slot1", "Slot2"], - [["Frac1", "Temp1"], ["Frac2", "Temp2"]], + [["Slot1", "Frac1", "Temp1"], ["Slot2", "Frac2", "Temp2"]], { "Slot1": ["s1", "s2", "s2", "s4"], "Slot2": ["s2", "s4", "s1", "s2"], @@ -265,11 +243,11 @@ def test_invalid_register_hooks(target, hook): ), ], ) -def test_df_permutation_aug(data, columns, dependents, data_expected): +def test_df_permutation_aug(data, columns, data_expected): """Test permutation invariance data augmentation is done correctly.""" # Create all needed dataframes df = pd.DataFrame(data) - df_augmented = df_apply_permutation_augmentation(df, columns, dependents) + df_augmented = df_apply_permutation_augmentation(df, columns) df_expected = pd.DataFrame(data_expected) # Determine equality ignoring row order @@ -287,18 +265,19 @@ def test_df_permutation_aug(data, columns, dependents, data_expected): @pytest.mark.parametrize( - ("columns", "dependents", "msg"), + ("columns", "msg"), [ - param(["A"], [["B"], ["C"]], "exactly as many", id="too_manydependents"), - param(["A", "B"], [[], []], "same for all", id="dep_length_zero"), - param(["A", "B"], [["C"], []], "same for all", id="different_dep_lengths"), + param([], "at least two column sequences", id="no_seqs"), + param([["A"]], "at least two column sequences", id="just_one_seq"), + param([["A"], ["B", "C"]], "sequence is the same", id="different_lengths"), + param([[], []], "sequence is the same", id="empty_seqs"), ], ) -def 
test_df_permutation_aug_invalid(columns, dependents, msg): +def test_df_permutation_aug_invalid(columns, msg): """Test correct errors for invalid permutation attempts.""" df = pd.DataFrame({"A": [1, 1], "B": [2, 2], "C": ["x", "y"]}) with pytest.raises(ValueError, match=msg): - df_apply_permutation_augmentation(df, columns, dependents) + df_apply_permutation_augmentation(df, columns) @pytest.mark.parametrize( From d4025233f6b25fb9c972c5c71383dbe4edc16011 Mon Sep 17 00:00:00 2001 From: Martin Fitzner Date: Mon, 1 Jul 2024 18:50:34 +0200 Subject: [PATCH 14/14] Add tests --- baybe/exceptions.py | 8 ++ baybe/utils/dataframe.py | 14 ++-- tests/test_input_output.py | 152 ++++++++++++++++++++++++++++++++++--- 3 files changed, 156 insertions(+), 18 deletions(-) diff --git a/baybe/exceptions.py b/baybe/exceptions.py index d92e20569..94d0324c0 100644 --- a/baybe/exceptions.py +++ b/baybe/exceptions.py @@ -9,6 +9,14 @@ class UnusedObjectWarning(UserWarning): """ +class NoSearchspaceMatchWarning(UserWarning): + """The provided input has no match in the searchspace.""" + + +class TooManySearchspaceMatchesWarning(UserWarning): + """The provided input has multiple matches in the searchspace.""" + + ##### Exceptions ##### class NotEnoughPointsLeftError(Exception): """ diff --git a/baybe/utils/dataframe.py b/baybe/utils/dataframe.py index 5bc09c270..b8d4ed02e 100644 --- a/baybe/utils/dataframe.py +++ b/baybe/utils/dataframe.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import warnings from collections.abc import Iterable, Iterator, Sequence from typing import ( TYPE_CHECKING, @@ -13,6 +14,7 @@ import numpy as np import pandas as pd +from baybe.exceptions import NoSearchspaceMatchWarning, TooManySearchspaceMatchesWarning from baybe.targets.enum import TargetMode from baybe.utils.numerical import DTypeFloatNumpy @@ -417,17 +419,17 @@ def fuzzy_row_match( # We expect exactly one match. If that's not the case, print a warning. 
inds_found = left_df.index[match].to_list() if len(inds_found) == 0 and len(num_cols) > 0: - _logger.warning( - "Input row with index %s could not be matched to the search space. " + warnings.warn( + f"Input row with index {ind} could not be matched to the search space. " "This could indicate that something went wrong.", - ind, + NoSearchspaceMatchWarning, ) elif len(inds_found) > 1: - _logger.warning( - "Input row with index %s has multiple matches with " + warnings.warn( + f"Input row with index {ind} has multiple matches with " "the search space. This could indicate that something went wrong. " "Matching only first occurrence.", - ind, + TooManySearchspaceMatchesWarning, ) inds_matched.append(inds_found[0]) else: diff --git a/tests/test_input_output.py b/tests/test_input_output.py index cc1060795..4ec96184d 100644 --- a/tests/test_input_output.py +++ b/tests/test_input_output.py @@ -1,13 +1,18 @@ """Tests for basic input-output and iterative loop.""" +import warnings + import numpy as np +import pandas as pd import pytest +from baybe.constraints import DiscreteNoLabelDuplicatesConstraint +from baybe.exceptions import NoSearchspaceMatchWarning +from baybe.utils.augmentation import ( + df_apply_dependency_augmentation, + df_apply_permutation_augmentation, +) from baybe.utils.dataframe import add_fake_results -# List of tests that are expected to fail (still missing implementation etc) -param_xfails = [] -target_xfails = [] - @pytest.mark.parametrize( "bad_val", @@ -16,9 +21,6 @@ ) def test_bad_parameter_input_value(campaign, good_reference_values, bad_val, request): """Test attempting to read in an invalid parameter value.""" - if request.node.callspec.id in param_xfails: - pytest.xfail() - rec = campaign.recommend(batch_size=3) add_fake_results( rec, @@ -27,7 +29,11 @@ def test_bad_parameter_input_value(campaign, good_reference_values, bad_val, req ) # Add an invalid value - rec.Num_disc_1.iloc[0] = bad_val + with warnings.catch_warnings(): + # Ignore warning 
about incompatible data type assignment + warnings.simplefilter("ignore", FutureWarning) + rec.iloc[0, rec.columns.get_loc("Num_disc_1")] = bad_val + with pytest.raises((ValueError, TypeError)): campaign.add_measurements(rec) @@ -39,9 +45,6 @@ def test_bad_parameter_input_value(campaign, good_reference_values, bad_val, req ) def test_bad_target_input_value(campaign, good_reference_values, bad_val, request): """Test attempting to read in an invalid target value.""" - if request.node.callspec.id in target_xfails: - pytest.xfail() - rec = campaign.recommend(batch_size=3) add_fake_results( rec, @@ -50,6 +53,131 @@ def test_bad_target_input_value(campaign, good_reference_values, bad_val, reques ) # Add an invalid value - rec.Target_max.iloc[0] = bad_val + with warnings.catch_warnings(): + # Ignore warning about incompatible data type assignment + warnings.simplefilter("ignore", FutureWarning) + rec.iloc[0, rec.columns.get_loc("Target_max")] = bad_val + with pytest.raises((ValueError, TypeError)): campaign.add_measurements(rec) + + +# Reused parameter names for the mixture mock example +_mixture_columns = [ + "Solvent_1", + "Solvent_2", + "Solvent_3", + "Fraction_1", + "Fraction_2", + "Fraction_3", +] + + +@pytest.mark.parametrize("n_grid_points", [5]) +@pytest.mark.parametrize( + "entry", + [ + pd.DataFrame.from_records( + [["THF", "Water", "DMF", 0.0, 25.0, 75.0]], columns=_mixture_columns + ), + ], +) +@pytest.mark.parametrize("parameter_names", [_mixture_columns]) +@pytest.mark.parametrize( + "constraint_names", [["Constraint_7", "Constraint_11", "Constraint_12"]] +) +def test_permutation_invariant_input(campaign, entry): + """Test whether permutation invariant measurements can be added.""" + add_fake_results(entry, campaign) + + # Create augmented combinations + entries = df_apply_permutation_augmentation( + entry, + columns=["Solvent_1", "Solvent_2", "Solvent_3"], + dependents=["Fraction_1", "Fraction_2", "Fraction_3"], + ) + + for _, row in entries.iterrows(): + # 
Reset searchspace metadata + campaign.searchspace.discrete.metadata["was_measured"] = False + + # Assert that not NoSearchspaceMatchWarning is thrown + with warnings.catch_warnings(): + print(row.to_frame().T) + warnings.simplefilter("error", category=NoSearchspaceMatchWarning) + campaign.add_measurements(pd.DataFrame([row])) + + # Assert exactly one searchspace entry has been marked + num_nonzero = campaign.searchspace.discrete.metadata["was_measured"].sum() + assert num_nonzero == 1, ( + "Measurement ingestion was successful, but did not correctly update the " + f"searchspace metadata. Number of non-zero entries: {num_nonzero} " + f"(expected 1)" + ) + + +@pytest.mark.parametrize("n_grid_points", [5], ids=["grid5"]) +@pytest.mark.parametrize( + "entry", + [ + pd.DataFrame.from_records( + [["THF", "Water", "DMF", 0.0, 25.0, 75.0]], + columns=_mixture_columns, + ), + pd.DataFrame.from_records( + [["THF", "Water", "DMF", 0.0, 0.0, 50.0]], + columns=_mixture_columns, + ), + ], + ids=["single_degen", "double_degen"], +) +@pytest.mark.parametrize("parameter_names", [_mixture_columns]) +@pytest.mark.parametrize( + "constraint_names", [["Constraint_7", "Constraint_11", "Constraint_12"]] +) +def test_dependency_invariant_input(campaign, entry): + """Test whether dependency invariant measurements can be added.""" + # Get an entry from the searchspace + add_fake_results(entry, campaign) + sol_vals = campaign.searchspace.get_parameters_by_name(["Solvent_1"])[0].values + + # Create augmented combinations + entries = df_apply_dependency_augmentation( + entry, causing=("Fraction_1", [0.0]), affected=[("Solvent_1", sol_vals)] + ) + entries = df_apply_dependency_augmentation( + entries, causing=("Fraction_2", [0.0]), affected=[("Solvent_2", sol_vals)] + ) + entries = df_apply_dependency_augmentation( + entries, causing=("Fraction_3", [0.0]), affected=[("Solvent_3", sol_vals)] + ) + + # Remove falsely created label duplicates + entries.reset_index(drop=True, inplace=True) + for c 
in campaign.searchspace.discrete.constraints:
+        if isinstance(c, DiscreteNoLabelDuplicatesConstraint):
+            entries.drop(index=c.get_invalid(entries), inplace=True)
+
+    # Add nan entries for testing nan input in the invariant parameters
+    entry_nan = entry.copy()
+    entry_nan.loc[entry_nan["Fraction_1"] == 0.0, "Solvent_1"] = np.nan
+    entry_nan.loc[entry_nan["Fraction_2"] == 0.0, "Solvent_2"] = np.nan
+    entry_nan.loc[entry_nan["Fraction_3"] == 0.0, "Solvent_3"] = np.nan
+
+    for _, row in pd.concat([entries, entry_nan]).iterrows():
+        # Reset searchspace metadata
+        campaign.searchspace.discrete.metadata["was_measured"] = False
+
+        # Assert that no NoSearchspaceMatchWarning is thrown
+        with warnings.catch_warnings():
+            print(row.to_frame().T)
+            warnings.simplefilter("error", category=NoSearchspaceMatchWarning)
+            campaign.add_measurements(pd.DataFrame([row]))
+
+        # Assert exactly one searchspace entry has been marked
+        num_nonzero = campaign.searchspace.discrete.metadata["was_measured"].sum()
+        assert num_nonzero == 1, (
+            "Measurement ingestion was successful, but did not correctly update the "
+            f"searchspace metadata. Number of non-zero entries: {num_nonzero} "
+            f"(expected 1)"
+        )