From b18a06dce090a1bb9b6e3c858b83cd8b6277e280 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Tue, 28 Mar 2023 16:06:31 +0200 Subject: [PATCH] feat: improve transformers for tabular data (#108) Closes #61. Closes #90. ### Summary of Changes * Common superclasses `TableTransformer` and `InvertibleTableTransformer` * Common interface for `fit`, `transform`, `fit_transform`, `inverse_transform` * Return new transformer when calling `fit` * More thorough tests --------- Co-authored-by: lars-reimann --- .../data/tabular/transformation/__init__.py | 2 +- .../data/tabular/transformation/_imputer.py | 131 +++++++------ .../tabular/transformation/_label_encoder.py | 155 +++++++-------- .../transformation/_one_hot_encoder.py | 162 ++++++++-------- .../transformation/_table_transformer.py | 99 ++++++++++ .../transformation/_imputer/__init__.py | 0 .../transformation/_imputer/test_imputer.py | 69 ------- .../transformation/_label_encoder/__init__.py | 0 .../_label_encoder/test_fit_transform.py | 15 -- .../_label_encoder/test_inverse_transform.py | 28 --- .../_label_encoder/test_transform.py | 28 --- .../_one_hot_encoder/__init__.py | 0 .../_one_hot_encoder/test_fit_transform.py | 32 ---- .../test_inverse_transform.py | 36 ---- .../_one_hot_encoder/test_transform.py | 20 -- .../tabular/transformation/test_imputer.py | 175 +++++++++++++++++ .../transformation/test_label_encoder.py | 168 +++++++++++++++++ .../transformation/test_one_hot_encoder.py | 176 ++++++++++++++++++ 18 files changed, 843 insertions(+), 453 deletions(-) create mode 100644 src/safeds/data/tabular/transformation/_table_transformer.py delete mode 100644 tests/safeds/data/tabular/transformation/_imputer/__init__.py delete mode 100644 tests/safeds/data/tabular/transformation/_imputer/test_imputer.py delete mode 100644 tests/safeds/data/tabular/transformation/_label_encoder/__init__.py delete mode 100644 tests/safeds/data/tabular/transformation/_label_encoder/test_fit_transform.py delete mode 100644 tests/safeds/data/tabular/transformation/_label_encoder/test_inverse_transform.py delete mode 100644 tests/safeds/data/tabular/transformation/_label_encoder/test_transform.py delete mode 100644 tests/safeds/data/tabular/transformation/_one_hot_encoder/__init__.py delete mode 100644 tests/safeds/data/tabular/transformation/_one_hot_encoder/test_fit_transform.py delete mode 100644 tests/safeds/data/tabular/transformation/_one_hot_encoder/test_inverse_transform.py delete mode 100644 tests/safeds/data/tabular/transformation/_one_hot_encoder/test_transform.py create mode 100644 tests/safeds/data/tabular/transformation/test_imputer.py create mode 100644 tests/safeds/data/tabular/transformation/test_label_encoder.py create mode 100644 tests/safeds/data/tabular/transformation/test_one_hot_encoder.py diff --git a/src/safeds/data/tabular/transformation/__init__.py b/src/safeds/data/tabular/transformation/__init__.py index 17e046538..58931d053 100644 --- a/src/safeds/data/tabular/transformation/__init__.py +++ b/src/safeds/data/tabular/transformation/__init__.py @@ -1,3 +1,3 @@ -from ._imputer import Imputer +from ._imputer import Imputer, ImputerStrategy from ._label_encoder import LabelEncoder from ._one_hot_encoder import OneHotEncoder diff --git a/src/safeds/data/tabular/transformation/_imputer.py b/src/safeds/data/tabular/transformation/_imputer.py index 45862778f..4a23cca35 100644 --- a/src/safeds/data/tabular/transformation/_imputer.py +++ b/src/safeds/data/tabular/transformation/_imputer.py @@ -5,17 +5,18 @@ import pandas as pd from safeds.data.tabular.containers import Table -from sklearn.impute import SimpleImputer +from safeds.data.tabular.transformation._table_transformer import TableTransformer +from safeds.exceptions import NotFittedError, UnknownColumnNameError +from sklearn.impute import SimpleImputer as sk_SimpleImputer class ImputerStrategy(ABC): @abstractmethod - def _augment_imputer(self, imputer: SimpleImputer) -> None: + def _augment_imputer(self, imputer: sk_SimpleImputer) -> None: pass -# noinspection PyProtectedMember -class Imputer: +class Imputer(TableTransformer): """ Impute the data for a given Table. @@ -39,7 +40,10 @@ class Constant(ImputerStrategy): def __init__(self, value: Any): self._value = value - def _augment_imputer(self, imputer: SimpleImputer) -> None: + def __str__(self) -> str: + return f"Constant({self._value})" + + def _augment_imputer(self, imputer: sk_SimpleImputer) -> None: imputer.strategy = "constant" imputer.fill_value = self._value @@ -48,7 +52,10 @@ class Mean(ImputerStrategy): An imputation strategy for imputing missing data with mean values. """ - def _augment_imputer(self, imputer: SimpleImputer) -> None: + def __str__(self) -> str: + return "Mean" + + def _augment_imputer(self, imputer: sk_SimpleImputer) -> None: imputer.strategy = "mean" class Median(ImputerStrategy): @@ -56,7 +63,10 @@ class Median(ImputerStrategy): An imputation strategy for imputing missing data with median values. """ - def _augment_imputer(self, imputer: SimpleImputer) -> None: + def __str__(self) -> str: + return "Median" + + def _augment_imputer(self, imputer: sk_SimpleImputer) -> None: imputer.strategy = "median" class Mode(ImputerStrategy): @@ -64,81 +74,90 @@ class Mode(ImputerStrategy): An imputation strategy for imputing missing data with mode values. """ - def _augment_imputer(self, imputer: SimpleImputer) -> None: + def __str__(self) -> str: + return "Mode" + + def _augment_imputer(self, imputer: sk_SimpleImputer) -> None: imputer.strategy = "most_frequent" def __init__(self, strategy: ImputerStrategy): - self._imp = SimpleImputer() - strategy._augment_imputer(self._imp) - self._column_names: list[str] = [] + self._strategy = strategy - def fit(self, table: Table, column_names: Optional[list[str]] = None) -> None: + self._wrapped_transformer: Optional[sk_SimpleImputer] = None + self._column_names: Optional[list[str]] = None + + # noinspection PyProtectedMember + def fit(self, table: Table, column_names: Optional[list[str]] = None) -> Imputer: """ - Fit the imputer on the dataset. + Learn a transformation for a set of columns in a table. Parameters ---------- table : Table - The table used to learn the imputation values. + The table used to fit the transformer. column_names : Optional[list[str]] - An optional list of column names, if the imputer is only supposed to run on specific columns. + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer : TableTransformer + The fitted transformer. """ if column_names is None: - column_names = table.schema.get_column_names() + column_names = table.get_column_names() + else: + missing_columns = set(column_names) - set(table.get_column_names()) + if len(missing_columns) > 0: + raise UnknownColumnNameError(list(missing_columns)) - if self._imp.strategy == "most_frequent": + if isinstance(self._strategy, Imputer.Strategy.Mode): for name in column_names: - if 1 < len(table.get_column(name).mode()): - raise IndexError( - "There are multiple frequent values in a column given for the Imputer" - ) + if len(table.get_column(name).mode()) > 1: + raise IndexError("There are multiple most frequent values in a column given for the Imputer") + + indices = [table.schema._get_column_index_by_name(name) for name in column_names] + + wrapped_transformer = sk_SimpleImputer() + self._strategy._augment_imputer(wrapped_transformer) + wrapped_transformer.fit(table._data[indices]) + + result = Imputer(self._strategy) + result._wrapped_transformer = wrapped_transformer + result._column_names = column_names - self._column_names = column_names - indices = [ - table.schema._get_column_index_by_name(name) for name in self._column_names - ] - self._imp.fit(table._data[indices]) + return result + # noinspection PyProtectedMember def transform(self, table: Table) -> Table: """ - Impute the missing values on the dataset. + Apply the learned transformation to a table. Parameters ---------- table : Table - The dataset to be imputed. + The table to which the learned transformation is applied. Returns ------- - table : Table - The dataset with missing values imputed by the given strategy. - """ - data = table._data.copy() - indices = [ - table.schema._get_column_index_by_name(name) for name in self._column_names - ] - data[indices] = pd.DataFrame( - self._imp.transform(data[indices]), columns=indices - ) - return Table(data, table.schema) + transformed_table : Table + The transformed table. - def fit_transform( - self, table: Table, column_names: Optional[list[str]] = None - ) -> Table: + Raises + ---------- + NotFittedError + If the transformer has not been fitted yet. """ - Fit the imputer on the dataset and impute the missing values. - Parameters - ---------- - table : Table - The table used to learn the imputation values. - column_names : Optional[list[str]] - An optional list of column names, if the imputer is only supposed to run on specific columns. + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise NotFittedError() - Returns - ------- - table : Table - The dataset with missing values imputed by the given strategy. - """ - self.fit(table, column_names) - return self.transform(table) + # Input table does not contain all columns used to fit the transformer + missing_columns = set(self._column_names) - set(table.get_column_names()) + if len(missing_columns) > 0: + raise UnknownColumnNameError(list(missing_columns)) + + data = table._data.copy() + indices = [table.schema._get_column_index_by_name(name) for name in self._column_names] + data[indices] = pd.DataFrame(self._wrapped_transformer.transform(data[indices]), columns=indices) + return Table(data, table.schema) diff --git a/src/safeds/data/tabular/transformation/_label_encoder.py b/src/safeds/data/tabular/transformation/_label_encoder.py index ad28adeed..561627f47 100644 --- a/src/safeds/data/tabular/transformation/_label_encoder.py +++ b/src/safeds/data/tabular/transformation/_label_encoder.py @@ -1,12 +1,14 @@ from __future__ import annotations import warnings -from typing import Any +from typing import Any, Optional -import pandas from safeds.data.tabular.containers import Table -from safeds.exceptions import LearningError, NotFittedError -from sklearn import exceptions, preprocessing +from safeds.data.tabular.transformation._table_transformer import ( + InvertibleTableTransformer, +) +from safeds.exceptions import NotFittedError, UnknownColumnNameError +from sklearn.preprocessing import OrdinalEncoder as sk_OrdinalEncoder def warn(*_: Any, **__: Any) -> None: @@ -17,133 +19,108 @@ def warn(*_: Any, **__: Any) -> None: # noinspection PyProtectedMember - - -class LabelEncoder: +class LabelEncoder(InvertibleTableTransformer): """ The LabelEncoder encodes one or more given columns into labels. """ def __init__(self) -> None: - self._is_fitted = 0 - self._le = preprocessing.LabelEncoder() + self._wrapped_transformer: Optional[sk_OrdinalEncoder] = None + self._column_names: Optional[list[str]] = None - def fit(self, table: Table, column: str) -> None: + def fit(self, table: Table, column_names: Optional[list[str]] = None) -> LabelEncoder: """ - Fit the label encoder with the values in the table. + Learn a transformation for a set of columns in a table. Parameters ---------- table : Table - The table containing the data used to fit the label encoder. - column : str - The list of columns supposed to be label-encoded. + The table used to fit the transformer. + column_names : Optional[list[str]] + The list of columns from the table used to fit the transformer. If `None`, all columns are used. Returns ------- - None - This function does not return any value. It updates the internal state of the label encoder object. - - Raises - ------- - LearningError - If the model fitting was unsuccessful. + fitted_transformer : TableTransformer + The fitted transformer. """ - try: - self._le.fit(table.keep_only_columns([column])._data) - except exceptions.NotFittedError as exc: - raise LearningError("") from exc + if column_names is None: + column_names = table.get_column_names() + else: + missing_columns = set(column_names) - set(table.get_column_names()) + if len(missing_columns) > 0: + raise UnknownColumnNameError(list(missing_columns)) + + indices = [table.schema._get_column_index_by_name(name) for name in column_names] + + wrapped_transformer = sk_OrdinalEncoder() + wrapped_transformer.fit(table._data[indices]) - def transform(self, table: Table, column: str) -> Table: + result = LabelEncoder() + result._wrapped_transformer = wrapped_transformer + result._column_names = column_names + + return result + + def transform(self, table: Table) -> Table: """ - Transform the given table to a normalized encoded table. + Apply the learned transformation to a table. Parameters ---------- table : Table - The table with target values. - column : str - The name of the column. + The table to which the learned transformation is applied. Returns ------- - result : Table - Table with normalized encodings. + transformed_table : Table + The transformed table. Raises - ------ + ---------- NotFittedError - If the Model wasn't fitted before transforming. + If the transformer has not been fitted yet. """ - p_df = table._data - p_df.columns = table.schema.get_column_names() - try: - p_df[column] = self._le.transform(p_df[column]) - return Table(p_df) - except Exception as exc: - raise NotFittedError from exc - - def fit_transform(self, table: Table, columns: list[str]) -> Table: - """ - Label-encode the table with the label encoder. - Parameters - ---------- - table : Table - The table to be transformed. - columns : list[str] - The list of column names to be encoded. + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise NotFittedError() - Returns - ------- - table : Table - The label-encoded table. + # Input table does not contain all columns used to fit the transformer + missing_columns = set(self._column_names) - set(table.get_column_names()) + if len(missing_columns) > 0: + raise UnknownColumnNameError(list(missing_columns)) - Raises - ------- - NotFittedError - If the encoder wasn't fitted before transforming. + data = table._data.copy() + data.columns = table.get_column_names() + data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) + return Table(data) + def inverse_transform(self, transformed_table: Table) -> Table: """ - p_df = table._data - p_df.columns = table.schema.get_column_names() - try: - for col in columns: - # Fit the LabelEncoder on the Column - self._le.fit(p_df[col]) - - # transform the column using the trained Label Encoder - p_df[col] = self._le.transform(p_df[col]) - return Table(pandas.DataFrame(p_df)) - except exceptions.NotFittedError as exc: - raise NotFittedError from exc - - def inverse_transform(self, table: Table, column: str) -> Table: - """ - Inverse-transform the table back to its original encodings. + Undo the learned transformation. Parameters ---------- - table : Table - The table to be inverse-transformed. - column : str - The column to be inverse-transformed. + transformed_table : Table + The table to be transformed back to the original version. Returns ------- table : Table - The inverse-transformed table. + The original table. Raises - ------- + ---------- NotFittedError - If the encoder wasn't fitted before transforming. + If the transformer has not been fitted yet. """ - try: - p_df = table._data - p_df.columns = table.schema.get_column_names() - p_df[column] = self._le.inverse_transform(p_df[column]) - return Table(p_df) - except exceptions.NotFittedError as exc: - raise NotFittedError from exc + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise NotFittedError() + + data = transformed_table._data.copy() + data.columns = transformed_table.get_column_names() + data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names]) + return Table(data) diff --git a/src/safeds/data/tabular/transformation/_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_one_hot_encoder.py index a3e58d8ad..033f81004 100644 --- a/src/safeds/data/tabular/transformation/_one_hot_encoder.py +++ b/src/safeds/data/tabular/transformation/_one_hot_encoder.py @@ -1,127 +1,131 @@ +from __future__ import annotations + +from typing import Optional + import pandas as pd from safeds.data.tabular.containers import Table -from safeds.exceptions import LearningError, NotFittedError -from sklearn import exceptions -from sklearn.preprocessing import OneHotEncoder as OHE_sklearn +from safeds.data.tabular.transformation._table_transformer import ( + InvertibleTableTransformer, +) +from safeds.exceptions import NotFittedError, UnknownColumnNameError +from sklearn.preprocessing import OneHotEncoder as sk_OneHotEncoder -class OneHotEncoder: +class OneHotEncoder(InvertibleTableTransformer): """ The OneHotEncoder encodes categorical columns to numerical features [0,1] that represent the existence for each value. """ def __init__(self) -> None: - self._encoder = OHE_sklearn() + self._wrapped_transformer: Optional[sk_OneHotEncoder] = None + self._column_names: Optional[list[str]] = None - def fit(self, table: Table, columns: list[str]) -> None: + # noinspection PyProtectedMember + def fit(self, table: Table, column_names: Optional[list[str]] = None) -> OneHotEncoder: """ - Fit the encoder to a table. + Learn a transformation for a set of columns in a table. Parameters ---------- table : Table - The table used to fit the encoder. - columns : list[str]: - The list of columns from the table used to fit the encoder. + The table used to fit the transformer. + column_names : Optional[list[str]] + The list of columns from the table used to fit the transformer. If `None`, all columns are used. - Raises - ---------- - LearningError - If there was an error during fitting. + Returns + ------- + fitted_transformer : TableTransformer + The fitted transformer. """ - try: - table_k_columns = table.keep_only_columns(column_names=columns) - df = table_k_columns._data - df.columns = table_k_columns.schema.get_column_names() - self._encoder.fit(df) - except exceptions.NotFittedError as exc: - raise LearningError("") from exc + if column_names is None: + column_names = table.get_column_names() + else: + missing_columns = set(column_names) - set(table.get_column_names()) + if len(missing_columns) > 0: + raise UnknownColumnNameError(list(missing_columns)) + + data = table._data.copy() + data.columns = table.get_column_names() + + wrapped_transformer = sk_OneHotEncoder() + wrapped_transformer.fit(data[column_names]) + + result = OneHotEncoder() + result._wrapped_transformer = wrapped_transformer + result._column_names = column_names + return result + + # noinspection PyProtectedMember def transform(self, table: Table) -> Table: """ - Transform the data with the trained encoder. + Apply the learned transformation to a table. Parameters ---------- table : Table - The data to be transformed. + The table to which the learned transformation is applied. Returns - ---------- - table : Table + ------- + transformed_table : Table The transformed table. Raises ---------- NotFittedError - If the encoder wasn't fitted before transforming. - """ - try: - table_k_columns = table.keep_only_columns(self._encoder.feature_names_in_) - df_k_columns = table_k_columns._data - df_k_columns.columns = table_k_columns.schema.get_column_names() - df_new = pd.DataFrame(self._encoder.transform(df_k_columns).toarray()) - df_new.columns = self._encoder.get_feature_names_out() - df_concat = table._data.copy() - df_concat.columns = table.schema.get_column_names() - data_new = pd.concat([df_concat, df_new], axis=1).drop( - self._encoder.feature_names_in_, axis=1 - ) - return Table(data_new) - except Exception as exc: - raise NotFittedError from exc - - def fit_transform(self, table: Table, columns: list[str]) -> Table: + If the transformer has not been fitted yet. """ - Fit and transform data with a OneHotEncoder. - Parameters - ---------- - table : Table - The table used to fit the encoder and subsequently to be transformed - columns : list[str]: - The list of columns from the table used to fit the encoder and subsequently to be transformed. + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise NotFittedError() - Returns - ---------- - table : Table - The transformed table. + # Input table does not contain all columns used to fit the transformer + missing_columns = set(self._column_names) - set(table.get_column_names()) + if len(missing_columns) > 0: + raise UnknownColumnNameError(list(missing_columns)) - """ - self.fit(table, columns) - return self.transform(table) + original = table._data.copy() + original.columns = table.schema.get_column_names() + + one_hot_encoded = pd.DataFrame(self._wrapped_transformer.transform(original[self._column_names]).toarray()) + one_hot_encoded.columns = self._wrapped_transformer.get_feature_names_out() + + unchanged = original.drop(self._column_names, axis=1) - def inverse_transform(self, table: Table) -> Table: + return Table(pd.concat([unchanged, one_hot_encoded], axis=1)) + + # noinspection PyProtectedMember + def inverse_transform(self, transformed_table: Table) -> Table: """ - Reset a transformed table to its original state. + Undo the learned transformation. Parameters ---------- - table : Table - The table to be inverse-transformed. + transformed_table : Table + The table to be transformed back to the original version. Returns - ---------- + ------- table : Table - The inverse-transformed table. + The original table. Raises ---------- NotFittedError - If the encoder wasn't fitted before transforming. - + If the transformer has not been fitted yet. """ - try: - data = self._encoder.inverse_transform( - table.keep_only_columns(self._encoder.get_feature_names_out())._data - ) - df = pd.DataFrame(data) - df.columns = self._encoder.feature_names_in_ - new_table = Table(df) - for col in table.drop_columns( - self._encoder.get_feature_names_out() - ).to_columns(): - new_table = new_table.add_column(col) - return new_table - except exceptions.NotFittedError as exc: - raise NotFittedError from exc + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise NotFittedError() + + data = transformed_table._data.copy() + data.columns = transformed_table.get_column_names() + + decoded = pd.DataFrame( + self._wrapped_transformer.inverse_transform(transformed_table._data), columns=self._column_names + ) + unchanged = data.drop(self._wrapped_transformer.get_feature_names_out(), axis=1) + + return Table(pd.concat([unchanged, decoded], axis=1)) diff --git a/src/safeds/data/tabular/transformation/_table_transformer.py b/src/safeds/data/tabular/transformation/_table_transformer.py new file mode 100644 index 000000000..b2844de22 --- /dev/null +++ b/src/safeds/data/tabular/transformation/_table_transformer.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Optional + +from safeds.data.tabular.containers import Table + + +class TableTransformer(ABC): + """ + A `TableTransformer` learns a transformation for a set of columns in a `Table` and can then apply the learned + transformation to another `Table` with the same columns. + """ + + @abstractmethod + def fit(self, table: Table, column_names: Optional[list[str]] = None) -> TableTransformer: + """ + Learn a transformation for a set of columns in a table. + + Parameters + ---------- + table : Table + The table used to fit the transformer. + column_names : Optional[list[str]] + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer : TableTransformer + The fitted transformer. + """ + + @abstractmethod + def transform(self, table: Table) -> Table: + """ + Apply the learned transformation to a table. + + Parameters + ---------- + table : Table + The table to which the learned transformation is applied. + + Returns + ------- + transformed_table : Table + The transformed table. + + Raises + ---------- + NotFittedError + If the transformer has not been fitted yet. + """ + + def fit_transform(self, table: Table, column_names: Optional[list[str]] = None) -> Table: + """ + Learn a transformation for a set of columns in a table and apply the learned transformation to the same table. + If you also need the fitted transformer, use `fit` and `transform` separately. + + Parameters + ---------- + table : Table + The table used to fit the transformer. The transformer is then applied to this table. + column_names : Optional[list[str]] + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + transformed_table : Table + The transformed table. + """ + return self.fit(table, column_names).transform(table) + + +class InvertibleTableTransformer(TableTransformer): + """ + An `InvertibleTableTransformer` is a `TableTransformer` that can also undo the learned transformation after it has + been applied. + """ + + @abstractmethod + def inverse_transform(self, transformed_table: Table) -> Table: + """ + Undo the learned transformation. + + Parameters + ---------- + transformed_table : Table + The table to be transformed back to the original version. + + Returns + ------- + table : Table + The original table. + + Raises + ---------- + NotFittedError + If the transformer has not been fitted yet. + """ diff --git a/tests/safeds/data/tabular/transformation/_imputer/__init__.py b/tests/safeds/data/tabular/transformation/_imputer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/safeds/data/tabular/transformation/_imputer/test_imputer.py b/tests/safeds/data/tabular/transformation/_imputer/test_imputer.py deleted file mode 100644 index 663823ef7..000000000 --- a/tests/safeds/data/tabular/transformation/_imputer/test_imputer.py +++ /dev/null @@ -1,69 +0,0 @@ -import numpy as np -import pandas as pd -import pytest -from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation import Imputer - - -def test_imputer_mean() -> None: - table = Table(pd.DataFrame(data={"col1": [np.nan, 2, 3, 4, 5]})) - column = table.get_column("col1") - imp = Imputer(Imputer.Strategy.Mean()) - new_table = imp.fit_transform(table) - - assert new_table.get_column("col1")._data[0] == column.mean() - - -def test_imputer_median() -> None: - table = Table(pd.DataFrame(data={"col1": [np.nan, 2, 3, 4, 5]})) - column = table.get_column("col1") - imp = Imputer(Imputer.Strategy.Median()) - new_table = imp.fit_transform(table) - - assert new_table.get_column("col1")._data[0] == column.median() - - -def test_imputer_mode() -> None: - table = Table(pd.DataFrame(data={"col1": [np.nan, 2, 2, 4, 5]})) - column = table.get_column("col1") - imp = Imputer(Imputer.Strategy.Mode()) - new_table = imp.fit_transform(table) - - assert new_table.get_column("col1")._data[0] == column.mode()[0] - - -def test_imputer_mode_invalid() -> None: - table = Table(pd.DataFrame(data={"col1": [np.nan, 2, 3, 4, 5]})) - imp = Imputer(Imputer.Strategy.Mode()) - with pytest.raises(IndexError): - imp.fit_transform(table) - - -def test_imputer_constant() -> None: - table = Table(pd.DataFrame(data={"col1": [np.nan, 2, 3, 4, 5]})) - imp = Imputer(Imputer.Strategy.Constant(0)) - new_table = imp.fit_transform(table) - - assert new_table.get_column("col1")._data[0] == 0 - - -def test_imputer_specific_column() -> None: - table = Table( - pd.DataFrame(data={"col1": [np.nan, 2, 3, 4, 5], "col2": [np.nan, 2, 3, 4, 5]}) - ) - imp = Imputer(Imputer.Strategy.Constant(0)) - new_table = imp.fit_transform(table, ["col1"]) - - assert new_table.get_column("col1")._data[0] == 0 - assert np.isnan(new_table.get_column("col2")._data[0]) - - -def test_imputer_all_columns() -> None: - table = Table( - pd.DataFrame(data={"col1": [np.nan, 2, 3, 4, 5], "col2": [np.nan, 2, 3, 4, 5]}) - ) - imp = Imputer(Imputer.Strategy.Constant(0)) - new_table = imp.fit_transform(table) - - assert new_table.get_column("col1")._data[0] == 0 - assert new_table.get_column("col2")._data[0] == 0 diff --git a/tests/safeds/data/tabular/transformation/_label_encoder/__init__.py b/tests/safeds/data/tabular/transformation/_label_encoder/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/safeds/data/tabular/transformation/_label_encoder/test_fit_transform.py b/tests/safeds/data/tabular/transformation/_label_encoder/test_fit_transform.py deleted file mode 100644 index d83366407..000000000 --- a/tests/safeds/data/tabular/transformation/_label_encoder/test_fit_transform.py +++ /dev/null @@ -1,15 +0,0 @@ -import pandas as pd -from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation import LabelEncoder - - -def test_fit_transform_valid() -> None: - test_table = Table( - pd.DataFrame({"cities": ["paris", "paris", "tokyo", "amsterdam"]}) - ) - le = LabelEncoder() - test_table = le.fit_transform(test_table, ["cities"]) - assert test_table.schema.has_column("cities") - assert test_table.to_columns()[0].get_value(0) == 1 - assert test_table.to_columns()[0].get_value(2) == 2 - assert test_table.to_columns()[0].get_value(3) == 0 diff --git a/tests/safeds/data/tabular/transformation/_label_encoder/test_inverse_transform.py b/tests/safeds/data/tabular/transformation/_label_encoder/test_inverse_transform.py deleted file mode 100644 index 63ab545a5..000000000 --- a/tests/safeds/data/tabular/transformation/_label_encoder/test_inverse_transform.py +++ /dev/null @@ -1,28 +0,0 @@ -import pandas as pd -import pytest -from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation import LabelEncoder -from safeds.exceptions import NotFittedError - - -def test_inverse_transform_valid() -> None: - test_table = Table( - pd.DataFrame({"cities": ["paris", "paris", "tokyo", "amsterdam"]}) - ) - le = LabelEncoder() - test_table = le.fit_transform(test_table, ["cities"]) - test_table = le.inverse_transform(test_table, "cities") - assert test_table.schema.has_column("cities") - assert test_table.to_columns()[0].get_value(0) == "paris" - assert test_table.to_columns()[0].get_value(2) == "tokyo" - assert test_table.to_columns()[0].get_value(3) == "amsterdam" - - -def test_inverse_transform_invalid() -> None: - test_table = Table( - pd.DataFrame({"cities": ["paris", "paris", "tokyo", "amsterdam"]}) - ) - le = LabelEncoder() - # le.fit(test_table) removed to force NotFittedError - with pytest.raises(NotFittedError): - le.inverse_transform(test_table, "cities") diff --git a/tests/safeds/data/tabular/transformation/_label_encoder/test_transform.py b/tests/safeds/data/tabular/transformation/_label_encoder/test_transform.py deleted file mode 100644 index 1bed487ee..000000000 --- a/tests/safeds/data/tabular/transformation/_label_encoder/test_transform.py +++ /dev/null @@ -1,28 +0,0 @@ -import pandas as pd -import pytest -from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation import LabelEncoder -from safeds.exceptions import NotFittedError - - -def test_transform_valid() -> None: - test_table = Table( - pd.DataFrame({"cities": ["paris", "paris", "tokyo", "amsterdam"]}) - ) - le = LabelEncoder() - le.fit(test_table, "cities") - test_table = le.transform(test_table, "cities") - assert test_table.schema.has_column("cities") - assert test_table.to_columns()[0].get_value(0) == 1 - assert test_table.to_columns()[0].get_value(2) == 2 - assert test_table.to_columns()[0].get_value(3) == 0 - - -def test_transform_invalid() -> None: - test_table = Table( - pd.DataFrame({"cities": ["paris", "paris", "tokyo", "amsterdam"]}) - ) - le = LabelEncoder() - # le.fit(test_table) removed to force NotFittedError - with pytest.raises(NotFittedError): - le.transform(test_table, "cities") diff --git a/tests/safeds/data/tabular/transformation/_one_hot_encoder/__init__.py b/tests/safeds/data/tabular/transformation/_one_hot_encoder/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/safeds/data/tabular/transformation/_one_hot_encoder/test_fit_transform.py b/tests/safeds/data/tabular/transformation/_one_hot_encoder/test_fit_transform.py deleted file mode 100644 index ef6239c8b..000000000 --- a/tests/safeds/data/tabular/transformation/_one_hot_encoder/test_fit_transform.py +++ /dev/null @@ -1,32 +0,0 @@ -import pandas as pd -from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation import OneHotEncoder - - -def test_fit_transform() -> None: - table = Table( - pd.DataFrame( - data={ - "col1": ["A", "B", "C", "A"], - "col2": ["Test1", "Test1", "Test3", "Test1"], - "col3": [1, 2, 3, 4], - } - ) - ) - ohe = OneHotEncoder() - table_ohe = ohe.fit_transform(table, ["col1", "col2"]) - assert table_ohe.count_columns() == 6 - assert table_ohe.get_row(0).get_value("col1_A") == 1 - assert table_ohe.get_row(1).get_value("col1_B") == 1 - assert table_ohe.get_row(2).get_value("col1_C") == 1 - assert table_ohe.get_row(3).get_value("col1_A") == 1 - assert table_ohe.get_row(0).get_value("col2_Test1") == 1 - assert table_ohe.get_row(1).get_value("col2_Test1") == 1 - assert table_ohe.get_row(2).get_value("col2_Test3") == 1 - assert table_ohe.get_row(3).get_value("col2_Test1") == 1 - assert table_ohe.get_column("col1_A").sum() == 2 - assert table_ohe.get_column("col1_B").sum() == 1 - assert table_ohe.get_column("col1_C").sum() == 1 - assert table_ohe.get_column("col2_Test1").sum() == 3 - assert table_ohe.get_column("col2_Test3").sum() == 1 - assert table_ohe.get_column("col3").sum() == 10 diff --git a/tests/safeds/data/tabular/transformation/_one_hot_encoder/test_inverse_transform.py b/tests/safeds/data/tabular/transformation/_one_hot_encoder/test_inverse_transform.py deleted file mode 100644 index f7b4de2a0..000000000 --- a/tests/safeds/data/tabular/transformation/_one_hot_encoder/test_inverse_transform.py +++ /dev/null @@ -1,36 +0,0 @@ -import pandas as pd -import pytest -from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation import OneHotEncoder -from safeds.exceptions import NotFittedError - - -def test_fit_transform() -> None: - table = Table( - pd.DataFrame( - data={ - "col1": ["A", "B", "C", "A"], - "col2": ["Test1", "Test1", "Test3", "Test1"], - "col3": [1, 2, 3, 4], - } - ) - ) - ohe = OneHotEncoder() - table_ohe = ohe.fit_transform(table, ["col1", "col2"]) - table_old = ohe.inverse_transform(table_ohe) - assert table_old == table - - -def test_fit_transform_invalid() -> None: - table = Table( - pd.DataFrame( - data={ - "col1": ["A", "B", "C", "A"], - "col2": ["Test1", "Test1", "Test3", "Test1"], - "col3": [1, 2, 3, 4], - } - ) - ) - ohe = OneHotEncoder() - with pytest.raises(NotFittedError): - ohe.inverse_transform(table) diff --git a/tests/safeds/data/tabular/transformation/_one_hot_encoder/test_transform.py b/tests/safeds/data/tabular/transformation/_one_hot_encoder/test_transform.py deleted file mode 100644 index 1a1405dce..000000000 --- a/tests/safeds/data/tabular/transformation/_one_hot_encoder/test_transform.py +++ /dev/null @@ -1,20 +0,0 @@ -import pandas as pd -import pytest -from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation import OneHotEncoder -from safeds.exceptions import NotFittedError - - -def test_transform_invalid() -> None: - table = Table( - pd.DataFrame( - data={ - "col1": ["A", "B", "C", "A"], - "col2": ["Test1", "Test1", "Test3", "Test1"], - "col3": [1, 2, 3, 4], - } - ) - ) - ohe = OneHotEncoder() - with pytest.raises(NotFittedError): - ohe.transform(table) diff --git a/tests/safeds/data/tabular/transformation/test_imputer.py b/tests/safeds/data/tabular/transformation/test_imputer.py new file mode 100644 index 000000000..84d039571 --- /dev/null +++ b/tests/safeds/data/tabular/transformation/test_imputer.py @@ -0,0 +1,175 @@ +from typing import Optional + +import pytest +from safeds.data.tabular.containers import Column, Table +from safeds.data.tabular.transformation import Imputer, ImputerStrategy +from safeds.exceptions import NotFittedError, UnknownColumnNameError + + +class TestFit: + def test_should_raise_if_column_not_found(self) -> None: + table = Table.from_columns( + [ + Column("a", [1, 3, None]), + ] + ) + + with pytest.raises(UnknownColumnNameError): + Imputer(Imputer.Strategy.Constant(0)).fit(table, ["b"]) + + def test_should_not_change_original_transformer(self) -> None: + table = Table.from_columns( + [ + Column("a", [1, 3, None]), + ] + ) + + transformer = Imputer(Imputer.Strategy.Constant(0)) + transformer.fit(table) + + assert transformer._wrapped_transformer is None + assert transformer._column_names is None + + +class TestTransform: + def test_should_raise_if_column_not_found(self) -> None: + table_to_fit = Table.from_columns( + [ + Column("a", [1, 3, None]), + ] + ) + + transformer = Imputer(Imputer.Strategy.Constant(0)).fit(table_to_fit) + + table_to_transform = Table.from_columns( + [ + Column("b", [1, 3, None]), + ] + ) + + with pytest.raises(UnknownColumnNameError): + transformer.transform(table_to_transform) + + def test_should_raise_if_not_fitted(self) -> None: + table = Table.from_columns( + [ + Column("a", [1, 3, None]), + ] + ) + + transformer = Imputer(Imputer.Strategy.Constant(0)) + + with pytest.raises(NotFittedError): + transformer.transform(table) + + +class TestFitTransform: + @pytest.mark.parametrize( + ("table", "column_names", "strategy", "expected"), + [ + ( + Table.from_columns( + [ + Column("a", [1.0, 3.0, None]), + ] + ), + None, + Imputer.Strategy.Constant(0.0), + Table.from_columns( + [ + Column("a", [1.0, 3.0, 0.0]), + ] + ), + ), + ( + Table.from_columns( + [ + Column("a", [1.0, 3.0, None]), + ] + ), + None, + Imputer.Strategy.Mean(), + Table.from_columns( + [ + Column("a", [1.0, 3.0, 2.0]), + ] + ), + ), + ( + Table.from_columns( + [ + Column("a", [1.0, 3.0, 1.0, None]), + ] + ), + None, + Imputer.Strategy.Median(), + Table.from_columns( + [ + Column("a", [1.0, 3.0, 1.0, 1.0]), + Column("a", [1.0, 3.0, 1.0, 1.0]), + ] + ), + ), + ( + Table.from_columns( + [ + Column("a", [1.0, 3.0, 3.0, None]), + ] + ), + None, + Imputer.Strategy.Mode(), + Table.from_columns( + [ + Column("a", [1.0, 3.0, 3.0, 3.0]), + ] + ), + ), + ( + Table.from_columns( + [ + Column("a", [1.0, 3.0, None]), + Column("b", [1.0, 3.0, None]), + ] + ), + ["a"], + Imputer.Strategy.Constant(0.0), + Table.from_columns( + [ + Column("a", [1.0, 3.0, 0.0]), + Column("b", [1.0, 3.0, None]), + ] + ), + ), + ], + ) + def test_should_return_transformed_table( + self, table: Table, column_names: Optional[list[str]], strategy: ImputerStrategy, expected: Table + ) -> None: + assert Imputer(strategy).fit_transform(table, column_names) == expected + + def test_should_raise_if_strategy_is_mode_but_multiple_values_are_most_frequent(self) -> None: + table = Table.from_columns( + [ + Column("a", [1, 2, 3, None]), + ] + ) + + with pytest.raises(IndexError): + Imputer(Imputer.Strategy.Mode()).fit_transform(table) + + def test_should_not_change_original_table(self) -> None: + table = Table.from_columns( + [ + Column("a", [1, None, None]), + ] + ) + + Imputer(strategy=Imputer.Strategy.Constant(1)).fit_transform(table) + + expected = Table.from_columns( + [ + Column("a", [1, None, None]), + ] + ) + + assert table == expected diff --git a/tests/safeds/data/tabular/transformation/test_label_encoder.py b/tests/safeds/data/tabular/transformation/test_label_encoder.py new file mode 100644 index 000000000..e29e74742 --- /dev/null +++ b/tests/safeds/data/tabular/transformation/test_label_encoder.py @@ -0,0 +1,168 @@ +from typing import Optional + +import pytest +from safeds.data.tabular.containers import Column, Table +from safeds.data.tabular.transformation import LabelEncoder +from safeds.exceptions import NotFittedError, UnknownColumnNameError + + +class TestFit: + def test_should_raise_if_column_not_found(self) -> None: + table = Table.from_columns( + [ + Column("col1", ["a", "b", "c"]), + ] + ) + + with pytest.raises(UnknownColumnNameError): + LabelEncoder().fit(table, ["col2"]) + + def test_should_not_change_original_transformer(self) -> None: + table = Table.from_columns( + [ + Column("col1", ["a", "b", "c"]), + ] + ) + + transformer = LabelEncoder() + transformer.fit(table) + + assert transformer._wrapped_transformer is None + assert transformer._column_names is None + + +class TestTransform: + def test_should_raise_if_column_not_found(self) -> None: + table_to_fit = Table.from_columns( + [ + Column("col1", ["a", "b", "c"]), + ] + ) + + transformer = LabelEncoder().fit(table_to_fit) + + table_to_transform = Table.from_columns( + [ + Column("col2", ["a", "b", "c"]), + ] + ) + + with pytest.raises(UnknownColumnNameError): + transformer.transform(table_to_transform) + + def test_should_raise_if_not_fitted(self) -> None: + table = Table.from_columns( + [ + Column("col1", ["a", "b", "c"]), + ] + ) + + transformer = LabelEncoder() + + with pytest.raises(NotFittedError): + transformer.transform(table) + + +class TestFitTransform: + @pytest.mark.parametrize( + ("table", "column_names", "expected"), + [ + ( + Table.from_columns( + [ + Column("col1", ["a", "b", "b", "c"]), + ] + ), + None, + Table.from_columns( + [ + Column("col1", [0.0, 1.0, 1.0, 2.0]), + ] + ), + ), + ( + Table.from_columns( + [ + Column("col1", ["a", "b", "b", "c"]), + Column("col2", ["a", "b", "b", "c"]), + ] + ), + ["col1"], + Table.from_columns( + [ + Column("col1", [0.0, 1.0, 1.0, 2.0]), + Column("col2", ["a", "b", "b", "c"]), + ] + ), + ), + ], + ) + def test_should_return_transformed_table( + self, table: Table, column_names: Optional[list[str]], expected: Table + ) -> None: + assert LabelEncoder().fit_transform(table, column_names) == expected + + def test_should_not_change_original_table(self) -> None: + table = Table.from_columns( + [ + Column("col1", ["a", "b", "c"]), + ] + ) + + LabelEncoder().fit_transform(table) + + expected = Table.from_columns( + [ + Column("col1", ["a", "b", "c"]), + ] + ) + + assert table == expected + + +class TestInverseTransform: + @pytest.mark.parametrize( + "table", + [ + Table.from_columns( + [ + Column("col1", ["a", "b", "b", "c"]), + ] + ), + ], + ) + def test_should_return_original_table(self, table: Table) -> None: + transformer = LabelEncoder().fit(table) + + assert transformer.inverse_transform(transformer.transform(table)) == table + + def test_should_not_change_transformed_table(self) -> None: + table = Table.from_columns( + [ + Column("col1", ["a", "b", "c"]), + ] + ) + + transformer = LabelEncoder().fit(table) + transformed_table = transformer.transform(table) + transformer.inverse_transform(transformed_table) + + expected = Table.from_columns( + [ + Column("col1", [0.0, 1.0, 2.0]), + ] + ) + + assert transformed_table == expected + + def test_should_raise_if_not_fitted(self) -> None: + table = Table.from_columns( + [ + Column("col1", [0.0, 1.0, 1.0, 2.0]), + ] + ) + + transformer = LabelEncoder() + + with pytest.raises(NotFittedError): + transformer.inverse_transform(table) diff --git a/tests/safeds/data/tabular/transformation/test_one_hot_encoder.py b/tests/safeds/data/tabular/transformation/test_one_hot_encoder.py new file mode 100644 index 000000000..061d6d6b9 --- /dev/null +++ b/tests/safeds/data/tabular/transformation/test_one_hot_encoder.py @@ -0,0 +1,176 @@ +from typing import Optional + +import pytest +from safeds.data.tabular.containers import Column, Table +from safeds.data.tabular.transformation import OneHotEncoder +from safeds.exceptions import NotFittedError, UnknownColumnNameError + + +class TestFit: + def test_should_raise_if_column_not_found(self) -> None: + table = Table.from_columns( + [ + Column("col1", ["a", "b", "c"]), + ] + ) + + with pytest.raises(UnknownColumnNameError): + OneHotEncoder().fit(table, ["col2"]) + + def test_should_not_change_original_transformer(self) -> None: + table = Table.from_columns( + [ + Column("col1", ["a", "b", "c"]), + ] + ) + + transformer = OneHotEncoder() + transformer.fit(table) + + assert transformer._wrapped_transformer is None + assert transformer._column_names is None + + +class TestTransform: + def test_should_raise_if_column_not_found(self) -> None: + table_to_fit = Table.from_columns( + [ + Column("col1", ["a", "b", "c"]), + ] + ) + + transformer = OneHotEncoder().fit(table_to_fit) + + table_to_transform = Table.from_columns( + [ + Column("col2", ["a", "b", "c"]), + ] + ) + + with pytest.raises(UnknownColumnNameError): + transformer.transform(table_to_transform) + + def test_should_raise_if_not_fitted(self) -> None: + table = Table.from_columns( + [ + Column("col1", ["a", "b", "c"]), + ] + ) + + transformer = OneHotEncoder() + + with pytest.raises(NotFittedError): + transformer.transform(table) + + +class TestFitTransform: + @pytest.mark.parametrize( + ("table", "column_names", "expected"), + [ + ( + Table.from_columns( + [ + Column("col1", ["a", "b", "b", "c"]), + ] + ), + None, + Table.from_columns( + [ + Column("col1_a", [1.0, 0.0, 0.0, 0.0]), + Column("col1_b", [0.0, 1.0, 1.0, 0.0]), + Column("col1_c", [0.0, 0.0, 0.0, 1.0]), + ] + ), + ), + ( + Table.from_columns( + [ + Column("col1", ["a", "b", "b", "c"]), + Column("col2", ["a", "b", "b", "c"]), + ] + ), + ["col1"], + Table.from_columns( + [ + Column("col2", ["a", "b", "b", "c"]), + Column("col1_a", [1.0, 0.0, 0.0, 0.0]), + Column("col1_b", [0.0, 1.0, 1.0, 0.0]), + Column("col1_c", [0.0, 0.0, 0.0, 1.0]), + ] + ), + ), + ], + ) + def test_should_return_transformed_table( + self, table: Table, column_names: Optional[list[str]], expected: Table + ) -> None: + assert OneHotEncoder().fit_transform(table, column_names) == expected + + def test_should_not_change_original_table(self) -> None: + table = Table.from_columns( + [ + Column("col1", ["a", "b", "c"]), + ] + ) + + OneHotEncoder().fit_transform(table) + + expected = Table.from_columns( + [ + Column("col1", ["a", "b", "c"]), + ] + ) + + assert table == expected + + +class TestInverseTransform: + @pytest.mark.parametrize( + "table", + [ + Table.from_columns( + [ + Column("col1", ["a", "b", "b", "c"]), + ] + ), + ], + ) + def test_should_return_original_table(self, table: Table) -> None: + transformer = OneHotEncoder().fit(table) + + assert transformer.inverse_transform(transformer.transform(table)) == table + + def test_should_not_change_transformed_table(self) -> None: + table = Table.from_columns( + [ + Column("col1", ["a", "b", "b", "c"]), + ] + ) + + transformer = OneHotEncoder().fit(table) + transformed_table = transformer.transform(table) + transformer.inverse_transform(transformed_table) + + expected = Table.from_columns( + [ + Column("col1_a", [1.0, 0.0, 0.0, 0.0]), + Column("col1_b", [0.0, 1.0, 1.0, 0.0]), + Column("col1_c", [0.0, 0.0, 0.0, 1.0]), + ] + ) + + assert transformed_table == expected + + def test_should_raise_if_not_fitted(self) -> None: + table = Table.from_columns( + [ + Column("a", [1.0, 0.0, 0.0, 0.0]), + Column("b", [0.0, 1.0, 1.0, 0.0]), + Column("c", [0.0, 0.0, 0.0, 1.0]), + ] + ) + + transformer = OneHotEncoder() + + with pytest.raises(NotFittedError): + transformer.inverse_transform(table)