Skip to content

Categorical encoders #431

Merged
merged 12 commits into from
Jan 18, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions etna/transforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from etna.transforms.decomposition import STLTransform
from etna.transforms.decomposition import TheilSenTrendTransform
from etna.transforms.decomposition import TrendTransform
from etna.transforms.encoders import LabelEncoderTransform
from etna.transforms.encoders import MeanSegmentEncoderTransform
from etna.transforms.encoders import SegmentEncoderTransform
from etna.transforms.feature_selection import FilterFeaturesTransform
Expand Down
1 change: 1 addition & 0 deletions etna/transforms/encoders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from etna.transforms.encoders.categorical import LabelEncoderTransform
from etna.transforms.encoders.mean_segment_encoder import MeanSegmentEncoderTransform
from etna.transforms.encoders.segment_encoder import SegmentEncoderTransform
228 changes: 228 additions & 0 deletions etna/transforms/encoders/categorical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
import warnings
from enum import Enum
from typing import Optional

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.utils._encode import _check_unknown
from sklearn.utils._encode import _encode

from etna.transforms.base import PerSegmentWrapper
from etna.transforms.base import Transform


class ImputerMode(str, Enum):
    """Names of the strategies used to encode values that were not seen while fitting."""

    # Unseen values get a dedicated code.
    new_value = "new_value"
    # Unseen values get the mean of the codes of the known values.
    mean = "mean"
    # Unseen values are left without a code.
    none = "none"


class _LabelEncoder(preprocessing.LabelEncoder):
    """Label encoder whose ``transform`` fills in values that were not seen during ``fit``."""

    def transform(self, y: pd.Series, strategy: str):
        """
        Encode ``y`` with the fitted classes and fill the codes of unseen values.

        Parameters
        ----------
        y:
            values to encode
        strategy:
            how to fill the codes of values that are absent from ``self.classes_``:
            - If "new_value", then use -1
            - If "mean", then use the mean of the codes of the known values in ``y``
            - If "none", then use NaN (the legacy spelling "None" is accepted as well)

        Returns
        -------
        float array with the encoded values

        Raises
        ------
        ValueError:
            if ``strategy`` is not one of the supported values
        """
        diff = _check_unknown(y, known_values=self.classes_)

        # Computed once and reused both for the mean and for the final assignment.
        unknown_mask = np.isin(y, diff)

        # Cast to float so that NaN and a fractional mean can be stored.
        # NOTE(review): for object-dtype input `_encode` may still raise on unseen labels
        # even with check_unknown=False -- confirm against the pinned scikit-learn version.
        encoded = _encode(y, uniques=self.classes_, check_unknown=False).astype(float)

        # `ImputerMode.none` has the value "none"; "None" is kept for backward compatibility.
        if strategy in ("none", "None"):
            filling_value = np.nan
        elif strategy == "new_value":
            filling_value = -1
        elif strategy == "mean":
            filling_value = np.mean(encoded[~unknown_mask])
        else:
            raise ValueError(f"There are no '{strategy}' strategy exists")

        encoded[unknown_mask] = filling_value
        return encoded


class _OneSegmentLabelEncoderTransform(Transform):
    """Replace the values in the column with the Label encoding."""

    def __init__(self, in_column: str, out_column: Optional[str], strategy: str, inplace: bool):
        """
        Create instance of _OneSegmentLabelEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to apply transform to
        out_column:
            name of added column; if empty, the name is generated in ``_get_column_name``
        strategy:
            filling encoding in not fitted values:
            - If "new_value", then replace unseen values with '-1'
            - If "mean", then replace unseen values using the mean in encoded column
            - If "none", then replace unseen values with None
        inplace:
            if True, write the encoding into in_column, if False, add transformed column to dataset
        """
        self.in_column = in_column
        self.out_column = out_column
        self.strategy = strategy
        self.le = _LabelEncoder()
        self.inplace = inplace

    def _get_column_name(self) -> str:
        """Get the `out_column` depending on the transform's parameters."""
        if self.inplace and self.out_column:
            warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
        if self.inplace:
            return self.in_column
        if self.out_column:
            return self.out_column
        # No explicit name was given: derive one from the repr of the public transform
        # built with the same parameters.
        temp_transform = LabelEncoderTransform(
            in_column=self.in_column, inplace=self.inplace, out_column=self.out_column, strategy=self.strategy
        )
        generated_name = temp_transform.__repr__()
        if self.in_column.startswith("regressor"):
            return f"regressor_{generated_name}"
        return generated_name

    def fit(self, df: pd.DataFrame) -> "_OneSegmentLabelEncoderTransform":
        """
        Fit Label encoder.

        Parameters
        ----------
        df:
            dataframe with data to fit the transform.
        Returns
        -------
        self
        """
        self.le.fit(df[self.in_column])
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Encode the `in_column` by fitted Label encoder.

        Parameters
        ----------
        df
            dataframe with data to transform.
        Returns
        -------
        result dataframe
        """
        # Work on a copy so the caller's dataframe is left untouched.
        result_df = df.copy()
        result_df[self._get_column_name()] = self.le.transform(df[self.in_column], self.strategy)
        return result_df


class LabelEncoderTransform(PerSegmentWrapper):
    """Encode a categorical column with labels by wrapping ``_OneSegmentLabelEncoderTransform`` in ``PerSegmentWrapper``."""

    def __init__(
        self, in_column: str, inplace: bool = True, out_column: Optional[str] = None, strategy: str = ImputerMode.mean
    ):
        """
        Init LabelEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to be encoded
        inplace:
            if True, write the encoding into in_column, if False, add transformed column to dataset
        out_column:
            name of added column. If not given, use `self.__repr__()` or `regressor_{self.__repr__()}` if it is a regressor
        strategy:
            filling encoding in not fitted values:
            - If "new_value", then replace missing values with '-1'
            - If "mean", then replace missing values using the mean in encoded column
            - If "none", then replace missing values with None
        """
        self.in_column = in_column
        self.inplace = inplace
        self.strategy = strategy
        self.out_column = out_column
        # Keyword arguments keep this call correct even if the inner signature is reordered.
        super().__init__(
            transform=_OneSegmentLabelEncoderTransform(
                in_column=self.in_column,
                out_column=self.out_column,
                strategy=self.strategy,
                inplace=self.inplace,
            )
        )


class _OneSegmentLabelBinarizerTransform(Transform):
    """Create one-hot encoding columns."""

    def __init__(self, in_column: str, out_column: Optional[str]):
        """
        Create instance of _OneSegmentLabelBinarizerTransform.

        Parameters
        ----------
        in_column:
            name of column to apply transform to
        out_column:
            prefix of the names of added columns; if empty, the prefix is generated in ``_get_column_name``
        """
        self.in_column = in_column
        self.out_column = out_column
        # Configured with handle_unknown="ignore", so categories unseen during fit do not raise.
        self.ohe = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False)

    def _get_column_name(self) -> str:
        """Get the `out_column` depending on the transform's parameters."""
        if self.out_column:
            return self.out_column
        # No explicit prefix was given: derive one from the repr of the public transform
        # built with the same parameters.
        temp_transform = LabelBinarizerTransform(in_column=self.in_column, out_column=self.out_column)
        generated_name = temp_transform.__repr__()
        if self.in_column.startswith("regressor"):
            return f"regressor_{generated_name}"
        return generated_name

    def fit(self, df: pd.DataFrame) -> "_OneSegmentLabelBinarizerTransform":
        """
        Fit Label Binarizer encoder.

        Parameters
        ----------
        df:
            dataframe with data to fit the transform.
        Returns
        -------
        self
        """
        # The encoder is fed a single-column 2D array.
        self.ohe.fit(np.array(df[self.in_column]).reshape(-1, 1))
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Encode the `in_column` by fitted Label Binarize encoder.

        Parameters
        ----------
        df
            dataframe with data to transform.
        Returns
        -------
        result dataframe
        """
        result_df = df.copy()
        # Resolve the prefix once instead of once per category.
        prefix = self._get_column_name()
        # One output column per category learned during fit.
        new_columns = [f"{prefix}_{i}" for i in range(len(self.ohe.categories_[0]))]
        result_df[new_columns] = self.ohe.transform(np.array(df[self.in_column]).reshape(-1, 1))
        return result_df


class LabelBinarizerTransform(PerSegmentWrapper):
    """One-hot encode a column by wrapping ``_OneSegmentLabelBinarizerTransform`` in ``PerSegmentWrapper``."""

    def __init__(self, in_column: str, out_column: Optional[str] = None):
        """
        Init LabelBinarizerTransform.

        Parameters
        ----------
        in_column:
            name of column to be encoded
        out_column:
            prefix of names of added columns. If not given, use `self.__repr__()` or `regressor_{self.__repr__()}` if it is a regressor
        """
        self.in_column = in_column
        self.out_column = out_column
        one_segment_transform = _OneSegmentLabelBinarizerTransform(
            in_column=self.in_column, out_column=self.out_column
        )
        super().__init__(transform=one_segment_transform)
81 changes: 81 additions & 0 deletions tests/test_transforms/test_encoders/test_categorical_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import numpy as np
import pytest

from etna.datasets import TSDataset
from etna.datasets import generate_periodic_df
from etna.transforms.encoders.categorical import LabelBinarizerTransform
from etna.transforms.encoders.categorical import LabelEncoderTransform


@pytest.fixture
def two_df_with_new_values():
    """Return two datasets built from the same generator call except for its fourth argument (2 vs 3)."""
    # presumably the fourth positional argument is the period of the series -- confirm
    # against the signature of generate_periodic_df
    frames = [
        TSDataset.to_dataset(generate_periodic_df(3, "2020-01-01", 10, period, n_segments=2))
        for period in (2, 3)
    ]
    return frames[0], frames[1]


@pytest.fixture
def df_for_categorical_encoding():
    """Return a two-segment periodic dataset converted with ``TSDataset.to_dataset``."""
    periodic_df = generate_periodic_df(4, "2020-01-01", 10, 3, n_segments=2)
    return TSDataset.to_dataset(periodic_df)


def test_label_encoder(df_for_categorical_encoding):
    """Check LabelEncoderTransform output when it is fitted and applied on the same data."""
    encoder = LabelEncoderTransform(in_column="target")
    encoder.fit(df_for_categorical_encoding)
    encoded_values = encoder.transform(df_for_categorical_encoding).values
    expected_values = np.array([[0, 1], [1, 0], [2, 0], [0, 1]])
    np.testing.assert_array_almost_equal(encoded_values, expected_values)


@pytest.mark.parametrize(
    "strategy, expected_values",
    [
        ("new_value", np.array([[0, 0], [1, -1], [-1, -1]])),
        ("None", np.array([[0, 0], [1, np.nan], [np.nan, np.nan]])),
        ("mean", np.array([[0, 0], [1, 0], [0.5, 0]])),
    ],
)
def test_new_value_label(two_df_with_new_values, strategy, expected_values):
    """Check how LabelEncoderTransform fills values absent from the fitting data for each strategy."""
    fit_df, transform_df = two_df_with_new_values
    encoder = LabelEncoderTransform(in_column="target", strategy=strategy)
    encoder.fit(fit_df)
    encoded_values = encoder.transform(transform_df).values
    np.testing.assert_array_almost_equal(encoded_values, expected_values)


def test_value_error_label(df_for_categorical_encoding):
    """Check that an unknown strategy makes LabelEncoderTransform.transform raise ValueError."""
    # "new_vlue" is a deliberately misspelled strategy name.
    le = LabelEncoderTransform(in_column="target", strategy="new_vlue")
    le.fit(df_for_categorical_encoding)
    # Only the call expected to fail sits inside the context, so an unrelated
    # ValueError raised during construction or fit cannot satisfy the test.
    with pytest.raises(ValueError, match="There are no"):
        le.transform(df_for_categorical_encoding)


def test_label_binarizer(df_for_categorical_encoding):
    """Check LabelBinarizerTransform output when it is fitted and applied on the same data."""
    binarizer = LabelBinarizerTransform(in_column="target")
    binarizer.fit(df_for_categorical_encoding)
    rows = [
        [1.0, 0.0, 0.0, 5.0, 0.0, 1.0, 5.0],
        [0.0, 1.0, 0.0, 8.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 9.0, 1.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 0.0, 1.0, 5.0],
    ]
    expected_values = np.array(rows)
    transformed_values = binarizer.transform(df_for_categorical_encoding).values
    np.testing.assert_array_almost_equal(transformed_values, expected_values)


def test_new_value_label_binarizer(two_df_with_new_values):
    """Check LabelBinarizerTransform output on data containing values absent from the fitting data."""
    fit_df, transform_df = two_df_with_new_values
    binarizer = LabelBinarizerTransform(in_column="target", out_column="targets")
    binarizer.fit(fit_df)
    expected_values = np.array(
        [
            [5.0, 1.0, 0.0, 5.0, 1.0, 0.0],
            [8.0, 0.0, 1.0, 0.0, 0.0, 0.0],
            [9.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        ]
    )
    transformed_values = binarizer.transform(transform_df).values
    np.testing.assert_array_almost_equal(transformed_values, expected_values)


def test_naming_label_binarizer(two_df_with_new_values):
    """Check the names of the columns produced by LabelBinarizerTransform with an explicit out_column."""
    fit_df, transform_df = two_df_with_new_values
    binarizer = LabelBinarizerTransform(in_column="target", out_column="targets")
    binarizer.fit(fit_df)
    segment_names = ["segment_0", "segment_1"]
    feature_names = ["target", "targets_0", "targets_1"]
    expected_columns = {(segment, feature) for segment in segment_names for feature in feature_names}
    actual_columns = set(binarizer.transform(transform_df).columns.values)
    assert expected_columns == actual_columns