Skip to content

Commit

Permalink
feat: Added and improved errors and warnings in the table transformers (
Browse files Browse the repository at this point in the history
#372)

Closes #152.

### Summary of Changes

feat: Added and improved errors and warnings in the table transformers
feat: Added ability for the `Imputer` with strategy `Mode` to work if
there are multiple most frequent values (warns if this is the case and
uses the lowest of the most frequent values)
feat: Added ability for the `OneHotEncoder` to encode Python NaNs
fix: fixed grammar in `NonNumericColumnError`
feat: Changed `ValueNotPresentWhenFittedError` to allow info about
multiple columns
test: Changed test for `Imputer` to test all strategies

---------

Co-authored-by: alex-senger <[email protected]>
Co-authored-by: megalinter-bot <[email protected]>
Co-authored-by: Simon Breuer <[email protected]>
  • Loading branch information
4 people authored Jun 23, 2023
1 parent c877530 commit 544e307
Show file tree
Hide file tree
Showing 14 changed files with 853 additions and 143 deletions.
2 changes: 0 additions & 2 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,10 +627,8 @@ def add_column(self, column: Column) -> Table:
------
DuplicateColumnNameError
If the new column already exists.
ColumnSizeError
If the size of the column does not match the amount of rows.
"""
if self.has_column(column.name):
raise DuplicateColumnNameError(column.name)
Expand Down
74 changes: 67 additions & 7 deletions src/safeds/data/tabular/transformation/_imputer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import warnings
from typing import Any

import pandas as pd
Expand All @@ -8,7 +9,7 @@
from safeds.data.tabular.containers import Table
from safeds.data.tabular.transformation._table_transformer import TableTransformer
from safeds.data.tabular.typing import ImputerStrategy
from safeds.exceptions import TransformerNotFittedError, UnknownColumnNameError
from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError


class Imputer(TableTransformer):
Expand Down Expand Up @@ -75,7 +76,7 @@ def _augment_imputer(self, imputer: sk_SimpleImputer) -> None:
imputer.strategy = "median"

class Mode(ImputerStrategy):
"""An imputation strategy for imputing missing data with mode values."""
"""An imputation strategy for imputing missing data with mode values. The lowest value will be used if there are multiple values with the same highest count."""

def __str__(self) -> str:
return "Mode"
Expand Down Expand Up @@ -107,18 +108,59 @@ def fit(self, table: Table, column_names: list[str] | None) -> Imputer:
-------
fitted_transformer : TableTransformer
The fitted transformer.
Raises
------
UnknownColumnNameError
If column_names contain a column name that is missing in the table
ValueError
If the table contains 0 rows
NonNumericColumnError
If the strategy is set to either Mean or Median and the specified columns of the table contain non-numerical data
"""
if column_names is None:
column_names = table.column_names
else:
missing_columns = set(column_names) - set(table.column_names)
missing_columns = sorted(set(column_names) - set(table.column_names))
if len(missing_columns) > 0:
raise UnknownColumnNameError(list(missing_columns))
raise UnknownColumnNameError(missing_columns)

if table.number_of_rows == 0:
raise ValueError("The Imputer cannot be fitted because the table contains 0 rows")

if (isinstance(self._strategy, Imputer.Strategy.Mean | Imputer.Strategy.Median)) and table.keep_only_columns(
column_names,
).remove_columns_with_non_numerical_values().number_of_columns < len(
column_names,
):
raise NonNumericColumnError(
str(
sorted(
set(table.keep_only_columns(column_names).column_names)
- set(
table.keep_only_columns(column_names)
.remove_columns_with_non_numerical_values()
.column_names,
),
),
),
)

if isinstance(self._strategy, Imputer.Strategy.Mode):
multiple_most_frequent = {}
for name in column_names:
if len(table.get_column(name).mode()) > 1:
raise IndexError("There are multiple most frequent values in a column given for the Imputer")
multiple_most_frequent[name] = table.get_column(name).mode()
if len(multiple_most_frequent) > 0:
warnings.warn(
(
"There are multiple most frequent values in a column given to the Imputer.\nThe lowest values"
" are being chosen in this cases. The following columns have multiple most frequent"
f" values:\n{multiple_most_frequent}"
),
UserWarning,
stacklevel=2,
)

wrapped_transformer = sk_SimpleImputer()
self._strategy._augment_imputer(wrapped_transformer)
Expand Down Expand Up @@ -151,15 +193,33 @@ def transform(self, table: Table) -> Table:
------
TransformerNotFittedError
If the transformer has not been fitted yet.
UnknownColumnNameError
If the input table does not contain all columns used to fit the transformer
ValueError
If the table contains 0 rows
"""
# Transformer has not been fitted yet
if self._wrapped_transformer is None or self._column_names is None:
raise TransformerNotFittedError

# Input table does not contain all columns used to fit the transformer
missing_columns = set(self._column_names) - set(table.column_names)
missing_columns = sorted(set(self._column_names) - set(table.column_names))
if len(missing_columns) > 0:
raise UnknownColumnNameError(list(missing_columns))
raise UnknownColumnNameError(missing_columns)

if table.number_of_rows == 0:
raise ValueError("The Imputer cannot transform the table because it contains 0 rows")

if table.keep_only_columns(self._column_names).remove_columns_with_missing_values().number_of_columns > 0:
warnings.warn(
(
"The columns"
f" {table.keep_only_columns(self._column_names).remove_columns_with_missing_values().column_names} have"
" no missing values, so the Imputer did not change these columns"
),
UserWarning,
stacklevel=2,
)

data = table._data.copy()
data[self._column_names] = pd.DataFrame(
Expand Down
70 changes: 65 additions & 5 deletions src/safeds/data/tabular/transformation/_label_encoder.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from __future__ import annotations

import warnings

from sklearn.preprocessing import OrdinalEncoder as sk_OrdinalEncoder

from safeds.data.tabular.containers import Table
from safeds.data.tabular.transformation._table_transformer import (
InvertibleTableTransformer,
)
from safeds.exceptions import TransformerNotFittedError, UnknownColumnNameError
from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError


# noinspection PyProtectedMember
Expand Down Expand Up @@ -34,13 +36,35 @@ def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder:
-------
fitted_transformer : TableTransformer
The fitted transformer.
Raises
------
UnknownColumnNameError
If column_names contain a column name that is missing in the table
ValueError
If the table contains 0 rows
"""
if column_names is None:
column_names = table.column_names
else:
missing_columns = set(column_names) - set(table.column_names)
missing_columns = sorted(set(column_names) - set(table.column_names))
if len(missing_columns) > 0:
raise UnknownColumnNameError(list(missing_columns))
raise UnknownColumnNameError(missing_columns)

if table.number_of_rows == 0:
raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows")

if table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns > 0:
warnings.warn(
(
"The columns"
f" {table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain"
" numerical data. The LabelEncoder is designed to encode non-numerical values into numerical"
" values"
),
UserWarning,
stacklevel=2,
)

wrapped_transformer = sk_OrdinalEncoder()
wrapped_transformer.fit(table._data[column_names])
Expand Down Expand Up @@ -71,15 +95,22 @@ def transform(self, table: Table) -> Table:
------
TransformerNotFittedError
If the transformer has not been fitted yet.
UnknownColumnNameError
If the input table does not contain all columns used to fit the transformer
ValueError
If the table contains 0 rows
"""
# Transformer has not been fitted yet
if self._wrapped_transformer is None or self._column_names is None:
raise TransformerNotFittedError

# Input table does not contain all columns used to fit the transformer
missing_columns = set(self._column_names) - set(table.column_names)
missing_columns = sorted(set(self._column_names) - set(table.column_names))
if len(missing_columns) > 0:
raise UnknownColumnNameError(list(missing_columns))
raise UnknownColumnNameError(missing_columns)

if table.number_of_rows == 0:
raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows")

data = table._data.copy()
data.columns = table.column_names
Expand All @@ -106,11 +137,40 @@ def inverse_transform(self, transformed_table: Table) -> Table:
------
TransformerNotFittedError
If the transformer has not been fitted yet.
UnknownColumnNameError
If the input table does not contain all columns used to fit the transformer
NonNumericColumnError
If the specified columns of the input table contain non-numerical data
ValueError
If the table contains 0 rows
"""
# Transformer has not been fitted yet
if self._wrapped_transformer is None or self._column_names is None:
raise TransformerNotFittedError

missing_columns = sorted(set(self._column_names) - set(transformed_table.column_names))
if len(missing_columns) > 0:
raise UnknownColumnNameError(missing_columns)

if transformed_table.keep_only_columns(
self._column_names,
).remove_columns_with_non_numerical_values().number_of_columns < len(self._column_names):
raise NonNumericColumnError(
str(
sorted(
set(self._column_names)
- set(
transformed_table.keep_only_columns(self._column_names)
.remove_columns_with_non_numerical_values()
.column_names,
),
),
),
)

if transformed_table.number_of_rows == 0:
raise ValueError("The LabelEncoder cannot inverse transform the table because it contains 0 rows")

data = transformed_table._data.copy()
data.columns = transformed_table.column_names
data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names])
Expand Down
Loading

0 comments on commit 544e307

Please sign in to comment.