diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index fd62f988f..fbfeeb962 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -596,7 +596,9 @@ def drop_columns_with_non_numerical_values(self) -> Table: A table without the columns that contain non-numerical values. """ - return Table.from_columns(self._list_columns_with_numerical_values()) + return Table.from_columns( + [column for column in self.to_columns() if column.type.is_numeric()] + ) def drop_duplicate_rows(self) -> Table: """ @@ -626,27 +628,26 @@ def drop_rows_with_missing_values(self) -> Table: def drop_rows_with_outliers(self) -> Table: """ - Remove all rows from the table that contain at least one outlier defined as having a value that has a distance - of more than 3 standard deviations from the column average. + Remove all rows from the table that contain at least one outlier. + + We define an outlier as a value that has a distance of more than 3 standard deviations from the column mean. + Missing values are not considered outliers. They are also ignored during the calculation of the standard + deviation. Returns ------- new_table : Table A new table without rows containing outliers. """ - result = self._data.copy(deep=True) + copy = self._data.copy(deep=True) - table_without_nonnumericals = Table.from_columns( - self._list_columns_with_numerical_values() + table_without_nonnumericals = self.drop_columns_with_non_numerical_values() + z_scores = np.absolute( + stats.zscore(table_without_nonnumericals._data, nan_policy="omit") ) + filter_ = ((z_scores < 3) | np.isnan(z_scores)).all(axis=1) - result = result[ - (np.absolute(stats.zscore(table_without_nonnumericals._data)) < 3).all( - axis=1 - ) - ] - - return Table(result, self._schema) + return Table(copy[filter_], self._schema) def filter_rows(self, query: Callable[[Row], bool]) -> Table: """ @@ -1098,18 +1099,3 @@ def _ipython_display_(self) -> DisplayHandle: "display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1] ): return display(tmp) - - def _list_columns_with_numerical_values(self) -> list[Column]: - """ - Return a list of columns only containing numerical values. - - Returns - ------- - cols : list[Column] - The list with only numerical columns. - """ - cols = [] - for column_name, data_type in self._schema._schema.items(): - if data_type.is_numeric(): - cols.append(self.get_column(column_name)) - return cols diff --git a/tests/safeds/data/tabular/containers/_table/test_drop_rows_with_outliers.py b/tests/safeds/data/tabular/containers/_table/test_drop_rows_with_outliers.py index 25b12a458..cfe37163c 100644 --- a/tests/safeds/data/tabular/containers/_table/test_drop_rows_with_outliers.py +++ b/tests/safeds/data/tabular/containers/_table/test_drop_rows_with_outliers.py @@ -22,7 +22,7 @@ def test_drop_rows_with_outliers_no_outliers() -> None: def test_drop_rows_with_outliers_with_outliers() -> None: - table = Table( + input_ = Table( pd.DataFrame( data={ "col1": [ @@ -39,14 +39,24 @@ def test_drop_rows_with_outliers_with_outliers() -> None: "a", "a", ], - "col2": [1.0, 2.0, 3.0, 4.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + "col2": [1.0, 2.0, 3.0, 4.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, None], "col3": [2, 3, 1, 1_000_000_000, 1, 1, 1, 1, 1, 1, 1, 1], } ) ) - result = table.drop_rows_with_outliers() - assert result.count_rows() == 11 - assert result.count_columns() == 3 + result = input_.drop_rows_with_outliers() + + expected = Table( + pd.DataFrame( + data={ + "col1": ["A", "B", "C", "a", "a", "a", "a", "a", "a", "a", "a"], + "col2": [1.0, 2.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, None], + "col3": [2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1], + } + ) + ) + + assert result == expected def test_drop_rows_with_outliers_no_rows() -> None: diff --git a/tests/safeds/data/tabular/containers/_table/test_list_columns_with_numerical_values.py b/tests/safeds/data/tabular/containers/_table/test_list_columns_with_numerical_values.py deleted file mode 100644 index bc5cba885..000000000 --- a/tests/safeds/data/tabular/containers/_table/test_list_columns_with_numerical_values.py +++ /dev/null @@ -1,30 +0,0 @@ -import numpy as np -import pandas as pd -from safeds.data.tabular.containers import Table -from safeds.data.tabular.typing import ColumnType, TableSchema - - -def test_list_columns_with_numerical_values_valid() -> None: - table = Table( - pd.DataFrame( - data={ - "col1": ["A", "B", "C", "A"], - "col2": ["Test1", "Test1", "Test3", "Test1"], - "col3": [1, 2, 3, 4], - "col4": [2, 3, 1, 4], - } - ) - ) - columns = table._list_columns_with_numerical_values() - assert columns[0] == table.get_column("col3") - assert columns[1] == table.get_column("col4") - assert len(columns) == 2 - - -def test_list_columns_with_numerical_values_invalid() -> None: - table = Table( - [], TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))}) - ) - columns = table._list_columns_with_numerical_values() - assert columns[0] == table.get_column("col1") - assert len(columns) == 1