Skip to content

Commit

Permalink
feat: improve transformers for tabular data (#108)
Browse files (browse the repository at this point in the history)
Closes #61.
Closes #90.

### Summary of Changes

* Common superclasses `TableTransformer` and
`InvertibleTableTransformer`
* Common interface for `fit`, `transform`, `fit_transform`,
`inverse_transform`
* `fit` now returns a new, fitted transformer instead of modifying the
  transformer it is called on
* More thorough tests

---------

Co-authored-by: lars-reimann <[email protected]>
  • Loading branch information
lars-reimann and lars-reimann authored Mar 28, 2023
1 parent b92bba5 commit b18a06d
Show file tree
Hide file tree
Showing 18 changed files with 843 additions and 453 deletions.
2 changes: 1 addition & 1 deletion src/safeds/data/tabular/transformation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from ._imputer import Imputer
from ._imputer import Imputer, ImputerStrategy
from ._label_encoder import LabelEncoder
from ._one_hot_encoder import OneHotEncoder
131 changes: 75 additions & 56 deletions src/safeds/data/tabular/transformation/_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,18 @@

import pandas as pd
from safeds.data.tabular.containers import Table
from sklearn.impute import SimpleImputer
from safeds.data.tabular.transformation._table_transformer import TableTransformer
from safeds.exceptions import NotFittedError, UnknownColumnNameError
from sklearn.impute import SimpleImputer as sk_SimpleImputer


class ImputerStrategy(ABC):
@abstractmethod
def _augment_imputer(self, imputer: SimpleImputer) -> None:
def _augment_imputer(self, imputer: sk_SimpleImputer) -> None:
pass


# noinspection PyProtectedMember
class Imputer:
class Imputer(TableTransformer):
"""
Impute the data for a given Table.
Expand All @@ -39,7 +40,10 @@ class Constant(ImputerStrategy):
def __init__(self, value: Any):
self._value = value

def _augment_imputer(self, imputer: SimpleImputer) -> None:
def __str__(self) -> str:
return f"Constant({self._value})"

def _augment_imputer(self, imputer: sk_SimpleImputer) -> None:
imputer.strategy = "constant"
imputer.fill_value = self._value

Expand All @@ -48,97 +52,112 @@ class Mean(ImputerStrategy):
An imputation strategy for imputing missing data with mean values.
"""

def _augment_imputer(self, imputer: SimpleImputer) -> None:
def __str__(self) -> str:
return "Mean"

def _augment_imputer(self, imputer: sk_SimpleImputer) -> None:
imputer.strategy = "mean"

class Median(ImputerStrategy):
"""
An imputation strategy for imputing missing data with median values.
"""

def _augment_imputer(self, imputer: SimpleImputer) -> None:
def __str__(self) -> str:
return "Median"

def _augment_imputer(self, imputer: sk_SimpleImputer) -> None:
imputer.strategy = "median"

class Mode(ImputerStrategy):
"""
An imputation strategy for imputing missing data with mode values.
"""

def _augment_imputer(self, imputer: SimpleImputer) -> None:
def __str__(self) -> str:
return "Mode"

def _augment_imputer(self, imputer: sk_SimpleImputer) -> None:
imputer.strategy = "most_frequent"

def __init__(self, strategy: ImputerStrategy):
self._imp = SimpleImputer()
strategy._augment_imputer(self._imp)
self._column_names: list[str] = []
self._strategy = strategy

def fit(self, table: Table, column_names: Optional[list[str]] = None) -> None:
self._wrapped_transformer: Optional[sk_SimpleImputer] = None
self._column_names: Optional[list[str]] = None

# noinspection PyProtectedMember
def fit(self, table: Table, column_names: Optional[list[str]] = None) -> Imputer:
"""
Fit the imputer on the dataset.
Learn a transformation for a set of columns in a table.
Parameters
----------
table : Table
The table used to learn the imputation values.
The table used to fit the transformer.
column_names : Optional[list[str]]
An optional list of column names, if the imputer is only supposed to run on specific columns.
The list of columns from the table used to fit the transformer. If `None`, all columns are used.
Returns
-------
fitted_transformer : TableTransformer
The fitted transformer.
"""
if column_names is None:
column_names = table.schema.get_column_names()
column_names = table.get_column_names()
else:
missing_columns = set(column_names) - set(table.get_column_names())
if len(missing_columns) > 0:
raise UnknownColumnNameError(list(missing_columns))

if self._imp.strategy == "most_frequent":
if isinstance(self._strategy, Imputer.Strategy.Mode):
for name in column_names:
if 1 < len(table.get_column(name).mode()):
raise IndexError(
"There are multiple frequent values in a column given for the Imputer"
)
if len(table.get_column(name).mode()) > 1:
raise IndexError("There are multiple most frequent values in a column given for the Imputer")

indices = [table.schema._get_column_index_by_name(name) for name in column_names]

wrapped_transformer = sk_SimpleImputer()
self._strategy._augment_imputer(wrapped_transformer)
wrapped_transformer.fit(table._data[indices])

result = Imputer(self._strategy)
result._wrapped_transformer = wrapped_transformer
result._column_names = column_names

self._column_names = column_names
indices = [
table.schema._get_column_index_by_name(name) for name in self._column_names
]
self._imp.fit(table._data[indices])
return result

# noinspection PyProtectedMember
def transform(self, table: Table) -> Table:
"""
Impute the missing values on the dataset.
Apply the learned transformation to a table.
Parameters
----------
table : Table
The dataset to be imputed.
The table to which the learned transformation is applied.
Returns
-------
table : Table
The dataset with missing values imputed by the given strategy.
"""
data = table._data.copy()
indices = [
table.schema._get_column_index_by_name(name) for name in self._column_names
]
data[indices] = pd.DataFrame(
self._imp.transform(data[indices]), columns=indices
)
return Table(data, table.schema)
transformed_table : Table
The transformed table.
def fit_transform(
self, table: Table, column_names: Optional[list[str]] = None
) -> Table:
Raises
----------
NotFittedError
If the transformer has not been fitted yet.
"""
Fit the imputer on the dataset and impute the missing values.

Parameters
----------
table : Table
The table used to learn the imputation values.
column_names : Optional[list[str]]
An optional list of column names, if the imputer is only supposed to run on specific columns.
# Transformer has not been fitted yet
if self._wrapped_transformer is None or self._column_names is None:
raise NotFittedError()

Returns
-------
table : Table
The dataset with missing values imputed by the given strategy.
"""
self.fit(table, column_names)
return self.transform(table)
# Input table does not contain all columns used to fit the transformer
missing_columns = set(self._column_names) - set(table.get_column_names())
if len(missing_columns) > 0:
raise UnknownColumnNameError(list(missing_columns))

data = table._data.copy()
indices = [table.schema._get_column_index_by_name(name) for name in self._column_names]
data[indices] = pd.DataFrame(self._wrapped_transformer.transform(data[indices]), columns=indices)
return Table(data, table.schema)
Loading

0 comments on commit b18a06d

Please sign in to comment.