From 62320a385e4383a492125c453a34aa98d26358a6 Mon Sep 17 00:00:00 2001 From: Simon <58087411+Tarmandan@users.noreply.github.com> Date: Mon, 1 Jul 2024 14:45:13 +0200 Subject: [PATCH] feat: add RobustScaler (#874) Closes #650 ### Summary of Changes Adds a RobustScaler class that works like the StandardScaler but uses median instead of mean and interquartile range instead of standard deviation. If the interquartile range is 0, it will only subtract the median from all rows. For now, it cannot handle columns containing NaN values. See Issue #873 --------- Co-authored-by: srose <118634249+wastedareas@users.noreply.github.com> Co-authored-by: Simon Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> --- .../data/tabular/transformation/__init__.py | 3 + .../tabular/transformation/_robust_scaler.py | 199 +++++++++++++++ .../transformation/test_robust_scaler.py | 236 ++++++++++++++++++ 3 files changed, 438 insertions(+) create mode 100644 src/safeds/data/tabular/transformation/_robust_scaler.py create mode 100644 tests/safeds/data/tabular/transformation/test_robust_scaler.py diff --git a/src/safeds/data/tabular/transformation/__init__.py b/src/safeds/data/tabular/transformation/__init__.py index b7f19d22e..920f0b7e5 100644 --- a/src/safeds/data/tabular/transformation/__init__.py +++ b/src/safeds/data/tabular/transformation/__init__.py @@ -10,6 +10,7 @@ from ._label_encoder import LabelEncoder from ._one_hot_encoder import OneHotEncoder from ._range_scaler import RangeScaler + from ._robust_scaler import RobustScaler from ._simple_imputer import SimpleImputer from ._standard_scaler import StandardScaler from ._table_transformer import TableTransformer @@ -22,6 +23,7 @@ "LabelEncoder": "._label_encoder:LabelEncoder", "OneHotEncoder": "._one_hot_encoder:OneHotEncoder", "RangeScaler": "._range_scaler:RangeScaler", + "RobustScaler": "._robust_scaler:RobustScaler", "SimpleImputer": "._simple_imputer:SimpleImputer", "StandardScaler": 
"._standard_scaler:StandardScaler", "TableTransformer": "._table_transformer:TableTransformer", @@ -34,6 +36,7 @@ "LabelEncoder", "OneHotEncoder", "RangeScaler", + "RobustScaler", "SimpleImputer", "StandardScaler", "TableTransformer", diff --git a/src/safeds/data/tabular/transformation/_robust_scaler.py b/src/safeds/data/tabular/transformation/_robust_scaler.py new file mode 100644 index 000000000..970b32d6c --- /dev/null +++ b/src/safeds/data/tabular/transformation/_robust_scaler.py @@ -0,0 +1,199 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds._validation import _check_columns_exist +from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric +from safeds.data.tabular.containers import Table +from safeds.exceptions import TransformerNotFittedError + +from ._invertible_table_transformer import InvertibleTableTransformer + +if TYPE_CHECKING: + import polars as pl + + +class RobustScaler(InvertibleTableTransformer): + """ + The RobustScaler transforms column values to a range by removing the median and scaling to the interquartile range. + + Currently, for columns with high stability (IQR == 0), it will only subtract the median and not scale to avoid dividing by zero. + + Parameters + ---------- + column_names: + The list of columns used to fit the transformer. If `None`, all numeric columns are used. 
+ """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__(self, *, column_names: str | list[str] | None = None) -> None: + super().__init__(column_names) + + # Internal state + self._data_median: pl.DataFrame | None = None + self._data_scale: pl.DataFrame | None = None + + def __hash__(self) -> int: + # Leave out the internal state for faster hashing + return super().__hash__() + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + return self._data_median is not None and self._data_scale is not None + + # ------------------------------------------------------------------------------------------------------------------ + # Learning and transformation + # ------------------------------------------------------------------------------------------------------------------ + + def fit(self, table: Table) -> RobustScaler: + """ + Learn a transformation for a set of columns in a table. + + **Note:** This transformer is not modified. + + Parameters + ---------- + table: + The table used to fit the transformer. + + Returns + ------- + fitted_transformer: + The fitted transformer. + + Raises + ------ + ColumnNotFoundError + If column_names contain a column name that is missing in the table. + ColumnTypeError + If at least one of the specified columns in the table contains non-numerical data. + ValueError + If the table contains 0 rows. 
+ """ + import polars as pl + + if self._column_names is None: + column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric] + else: + column_names = self._column_names + _check_columns_exist(table, column_names) + _check_columns_are_numeric(table, column_names, operation="fit a RobustScaler") + + if table.row_count == 0: + raise ValueError("The RobustScaler cannot be fitted because the table contains 0 rows") + + _data_median = table._lazy_frame.select(column_names).median().collect() + q1 = table._lazy_frame.select(column_names).quantile(0.25).collect() + q3 = table._lazy_frame.select(column_names).quantile(0.75).collect() + _data_scale = q3 - q1 + + # To make sure there is no division by zero + for col_e in column_names: + _data_scale = _data_scale.with_columns( + pl.when(pl.col(col_e) == 0).then(1).otherwise(pl.col(col_e)).alias(col_e), + ) + + # Create a copy with the learned transformation + result = RobustScaler(column_names=column_names) + result._data_median = _data_median + result._data_scale = _data_scale + + return result + + def transform(self, table: Table) -> Table: + """ + Apply the learned transformation to a table. + + **Note:** The given table is not modified. + + Parameters + ---------- + table: + The table to which the learned transformation is applied. + + Returns + ------- + transformed_table: + The transformed table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + ColumnNotFoundError + If the input table does not contain all columns used to fit the transformer. + ColumnTypeError + If at least one of the columns in the input table that is used to fit contains non-numerical data. 
+ """ + import polars as pl + + # Used in favor of is_fitted, so the type checker is happy + if self._column_names is None or self._data_median is None or self._data_scale is None: + raise TransformerNotFittedError + + _check_columns_exist(table, self._column_names) + _check_columns_are_numeric(table, self._column_names, operation="transform with a RobustScaler") + + columns = [ + (pl.col(name) - self._data_median.get_column(name)) / self._data_scale.get_column(name) + for name in self._column_names + ] + + return Table._from_polars_lazy_frame( + table._lazy_frame.with_columns(columns), + ) + + def inverse_transform(self, transformed_table: Table) -> Table: + """ + Undo the learned transformation. + + **Note:** The given table is not modified. + + Parameters + ---------- + transformed_table: + The table to be transformed back to the original version. + + Returns + ------- + original_table: + The original table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + ColumnNotFoundError + If the input table does not contain all columns used to fit the transformer. + ColumnTypeError + If the transformed columns of the input table contain non-numerical data. 
+ """ + import polars as pl + + # Used in favor of is_fitted, so the type checker is happy + if self._column_names is None or self._data_median is None or self._data_scale is None: + raise TransformerNotFittedError + + _check_columns_exist(transformed_table, self._column_names) + _check_columns_are_numeric( + transformed_table, + self._column_names, + operation="inverse-transform with a RobustScaler", + ) + + columns = [ + pl.col(name) * self._data_scale.get_column(name) + self._data_median.get_column(name) + for name in self._column_names + ] + + return Table._from_polars_lazy_frame( + transformed_table._lazy_frame.with_columns(columns), + ) diff --git a/tests/safeds/data/tabular/transformation/test_robust_scaler.py b/tests/safeds/data/tabular/transformation/test_robust_scaler.py new file mode 100644 index 000000000..6c18b4878 --- /dev/null +++ b/tests/safeds/data/tabular/transformation/test_robust_scaler.py @@ -0,0 +1,236 @@ +import pytest +from safeds.data.tabular.containers import Table +from safeds.data.tabular.transformation import RobustScaler +from safeds.exceptions import ColumnNotFoundError, ColumnTypeError, TransformerNotFittedError + +from tests.helpers import assert_tables_equal + + +class TestFit: + def test_should_raise_if_column_not_found(self) -> None: + table = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + with pytest.raises(ColumnNotFoundError): + RobustScaler(column_names=["col2", "col3"]).fit(table) + + def test_should_raise_if_table_contains_non_numerical_data(self) -> None: + with pytest.raises(ColumnTypeError): + RobustScaler(column_names=["col1", "col2"]).fit( + Table({"col1": ["one", "two", "apple"], "col2": ["three", "four", "banana"]}), + ) + + def test_should_raise_if_table_contains_no_rows(self) -> None: + with pytest.raises(ValueError, match=r"The RobustScaler cannot be fitted because the table contains 0 rows"): + RobustScaler().fit(Table({"col1": []})) + + def test_should_not_change_original_transformer(self) -> None: + table 
= Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + transformer = RobustScaler() + transformer.fit(table) + + assert transformer._column_names is None + assert transformer._data_median is None + assert transformer._data_scale is None + + # TODO: Tests for None and NaN values should be moved to their own function + def test_should_not_divide_by_zero(self) -> None: + table = Table( + { + "col1": [1.0, 1.0, 2.0, 1.0], + "col2": [3.0, 3.0, 3.0, 3.0], + # "col3": [1.0, float("nan"), float("nan"), float("nan")], + "col4": [1.0, None, None, None], + }, + ) + target = Table( + { + "col1": [0.0, 0.0, 1.0, 0.0], + "col2": [0.0, 0.0, 0.0, 0.0], + # "col3": [0.0, float("nan"), float("nan"), float("nan")], + "col4": [0.0, None, None, None], + }, + ) + transformer = RobustScaler() + f_transformer = transformer.fit(table) + table = f_transformer.transform(table) + assert table == target + + +class TestTransform: + def test_should_raise_if_column_not_found(self) -> None: + table_to_fit = Table( + { + "col1": [0.0, 5.0, 10.0], + "col2": [5.0, 50.0, 100.0], + }, + ) + + transformer = RobustScaler().fit(table_to_fit) + + table_to_transform = Table( + { + "col3": ["a", "b", "c"], + }, + ) + + with pytest.raises(ColumnNotFoundError): + transformer.transform(table_to_transform) + + def test_should_raise_if_not_fitted(self) -> None: + table = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + transformer = RobustScaler() + + with pytest.raises(TransformerNotFittedError, match=r"The transformer has not been fitted yet."): + transformer.transform(table) + + def test_should_raise_if_table_contains_non_numerical_data(self) -> None: + with pytest.raises(ColumnTypeError): + RobustScaler(column_names=["col1", "col2"]).fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]})).transform( + Table({"col1": ["a", "b", "c"], "col2": ["b", "c", "e"]}), + ) + + +class TestIsFitted: + def test_should_return_false_before_fitting(self) -> None: + transformer = RobustScaler() + assert not transformer.is_fitted 
+ + def test_should_return_true_after_fitting(self) -> None: + table = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + transformer = RobustScaler() + fitted_transformer = transformer.fit(table) + assert fitted_transformer.is_fitted + + +class TestFitAndTransform: + @pytest.mark.parametrize( + ("table", "column_names", "expected"), + [ + ( + Table( + { + "col1": [1.0, 2.0, 3.0, 4.0], + "col2": [1.0, 2.0, 3.0, 4.0], + }, + ), + None, + Table( + { + "col1": [-1.5, -0.5, 0.5, 1.5], + "col2": [-1.5, -0.5, 0.5, 1.5], + }, + ), + ), + ], + ids=["two_columns"], + ) + def test_should_return_fitted_transformer_and_transformed_table( + self, + table: Table, + column_names: list[str] | None, + expected: Table, + ) -> None: + fitted_transformer, transformed_table = RobustScaler(column_names=column_names).fit_and_transform(table) + assert fitted_transformer.is_fitted + assert_tables_equal(transformed_table, expected) + + def test_should_not_change_original_table(self) -> None: + table = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + RobustScaler().fit_and_transform(table) + + expected = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + assert table == expected + + +class TestInverseTransform: + @pytest.mark.parametrize( + "table", + [ + Table( + { + "col1": [1.0, 2.0, 3.0, 4.0], + }, + ), + ], + ids=["one_column"], + ) + def test_should_return_original_table(self, table: Table) -> None: + transformer = RobustScaler().fit(table) + + assert transformer.inverse_transform(transformer.transform(table)) == table + + def test_should_not_change_transformed_table(self) -> None: + table = Table( + { + "col1": [0.0, 0.5, 1.0, 1.5, 2.0], + }, + ) + + transformer = RobustScaler().fit(table) + transformed_table = transformer.transform(table) + transformed_table = transformer.inverse_transform(transformed_table) + + expected = Table( + { + "col1": [0.0, 0.5, 1.0, 1.5, 2.0], + }, + ) + + assert_tables_equal(transformed_table, expected) + + def test_should_raise_if_not_fitted(self) -> 
None: + table = Table( + { + "col1": [1.0, 2.0, 3.0, 4.0], + }, + ) + + transformer = RobustScaler() + + with pytest.raises(TransformerNotFittedError, match=r"The transformer has not been fitted yet."): + transformer.inverse_transform(table) + + def test_should_raise_if_column_not_found(self) -> None: + with pytest.raises(ColumnNotFoundError): + RobustScaler(column_names=["col1", "col2"]).fit( + Table({"col1": [1, 2, 3, 4], "col2": [2, 3, 4, 5]}), + ).inverse_transform( + Table({"col3": [0, 1, 2, 3]}), + ) + + def test_should_raise_if_table_contains_non_numerical_data(self) -> None: + with pytest.raises(ColumnTypeError): + RobustScaler(column_names=["col1", "col2"]).fit( + Table({"col1": [1, 2, 3, 4], "col2": [2, 3, 4, 5]}), + ).inverse_transform( + Table({"col1": ["one", "two", "apple"], "col2": ["three", "four", "banana"]}), + )