-
Notifications
You must be signed in to change notification settings - Fork 80
Categorical encoders #431
Categorical encoders #431
Changes from 3 commits
a9382ea
298c45c
b4757aa
6827f2b
55f4a00
3c0d2ea
55d7a0e
dc884c3
dc43c79
e2ade6f
f0741fe
689a7b6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,4 @@ | ||
from etna.transforms.encoders.categorical import LabelEncoderTransform | ||
from etna.transforms.encoders.categorical import OneHotEncoderTransform | ||
from etna.transforms.encoders.mean_segment_encoder import MeanSegmentEncoderTransform | ||
from etna.transforms.encoders.segment_encoder import SegmentEncoderTransform |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,235 @@ | ||
import warnings | ||
from enum import Enum | ||
from typing import Optional | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from sklearn import preprocessing | ||
from sklearn.utils._encode import _check_unknown | ||
from sklearn.utils._encode import _encode | ||
|
||
from etna.transforms.base import PerSegmentWrapper | ||
from etna.transforms.base import Transform | ||
|
||
|
||
class ImputerMode(str, Enum):
    """Names of the strategies used to encode values that were not seen during fit."""

    # an unseen value gets a dedicated code
    new_value = "new_value"
    # an unseen value gets the mean of the codes of the seen values
    mean = "mean"
    # an unseen value is left without a code
    none = "none"
|
||
|
||
class _LabelEncoder(preprocessing.LabelEncoder):
    """Label encoder that imputes the codes of labels unseen during fit instead of raising."""

    def transform(self, y: pd.Series, strategy: str) -> np.ndarray:
        """
        Encode ``y`` with the fitted classes and impute the codes of unseen labels.

        Parameters
        ----------
        y:
            values to encode
        strategy:
            how to fill the codes of labels that are absent from ``classes_``:
            - If "new_value", fill with -1
            - If "mean", fill with the mean code of the known labels of ``y``
            - If "none", fill with NaN (the legacy spelling "None" is accepted as well)

        Returns
        -------
        float array with encoded values

        Raises
        ------
        ValueError:
            if ``strategy`` is not one of the supported values
        """
        diff = _check_unknown(y, known_values=self.classes_)
        # boolean mask of the positions holding labels that were not seen during fit
        is_unknown = np.isin(y, diff)

        # cast to float so that NaN and a fractional mean can be stored
        encoded = _encode(y, uniques=self.classes_, check_unknown=False).astype(float)

        # `ImputerMode` is a str-enum, so its members compare equal to these literals;
        # "None" is kept for backward compatibility with the previous spelling
        if strategy in ("none", "None"):
            filling_value = np.nan
        elif strategy == "new_value":
            filling_value = -1
        elif strategy == "mean":
            filling_value = np.mean(encoded[~is_unknown])
        else:
            raise ValueError(f"The strategy '{strategy}' doesn't exist")

        encoded[is_unknown] = filling_value
        return encoded
|
||
|
||
class _OneSegmentLabelEncoderTransform(Transform):
    """Replace the values in the column with the Label encoding."""

    def __init__(self, in_column: str, out_column: Optional[str], strategy: str, inplace: bool):
        """
        Create instance of _OneSegmentLabelEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to apply transform to
        out_column:
            name of added column; ignored if ``inplace`` is True, generated automatically if not given
        strategy:
            how to encode values that were not seen during fit, see `ImputerMode`;
            the value is forwarded as is to ``_LabelEncoder.transform``:
            - "new_value": unseen values get the code -1
            - "mean": unseen values get the mean code of the seen values
            - "none": unseen values are left without a code
        inplace:
            if True, apply transform inplace to in_column, if False, add transformed column to dataset
        """
        self.in_column = in_column
        self.out_column = out_column
        self.strategy = strategy
        self.le = _LabelEncoder()
        self.inplace = inplace

    def _get_column_name(self) -> str:
        """Get the `out_column` depending on the transform's parameters."""
        if self.inplace:
            if self.out_column:
                warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
            return self.in_column
        if self.out_column:
            return self.out_column
        # no explicit name: derive it from the repr of the public transform with the same parameters
        temp_transform = LabelEncoderTransform(
            in_column=self.in_column, inplace=self.inplace, out_column=self.out_column, strategy=self.strategy
        )
        column_name = repr(temp_transform)
        if self.in_column.startswith("regressor"):
            return f"regressor_{column_name}"
        return column_name

    def fit(self, df: pd.DataFrame) -> "_OneSegmentLabelEncoderTransform":
        """
        Fit Label encoder.

        Parameters
        ----------
        df:
            dataframe with data to fit the transform.
        Returns
        -------
        self
        """
        self.le.fit(df[self.in_column])
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Encode the `in_column` by fitted Label encoder.

        Parameters
        ----------
        df
            dataframe with data to transform.
        Returns
        -------
        result dataframe
        """
        # work on a copy so that the input dataframe is left untouched
        result_df = df.copy()
        result_df[self._get_column_name()] = self.le.transform(df[self.in_column], self.strategy)
        return result_df
|
||
|
||
class LabelEncoderTransform(PerSegmentWrapper):
    """Encode categorical feature with value between 0 and n_classes-1.

    Wraps `_OneSegmentLabelEncoderTransform` into `PerSegmentWrapper`.
    """

    def __init__(
        self, in_column: str, inplace: bool = True, out_column: Optional[str] = None, strategy: str = ImputerMode.mean
    ):
        """
        Init LabelEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to be transformed
        inplace:
            if True, the encoded values overwrite in_column, if False, they are added as a new column
        out_column:
            name of added column. If not given, use `self.__repr__()` or `regressor_{self.__repr__()}` if it is a regressor
        strategy:
            how to encode values that were not seen during fit, see `ImputerMode`:
            - "new_value": unseen values get the code -1
            - "mean": unseen values get the mean code of the seen values
            - "none": unseen values are left without a code
            The value is not validated here, it is forwarded as is to the wrapped transform.
        """
        wrapped_transform = _OneSegmentLabelEncoderTransform(
            in_column=in_column, out_column=out_column, strategy=strategy, inplace=inplace
        )
        self.in_column = in_column
        self.inplace = inplace
        self.strategy = strategy
        self.out_column = out_column
        super().__init__(transform=wrapped_transform)
|
||
|
||
class _OneSegmentOneHotEncoderTransform(Transform):
    """Create one-hot encoding columns."""

    def __init__(self, in_column: str, out_column: Optional[str]):
        """
        Create instance of _OneSegmentOneHotEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to apply transform to
        out_column:
            prefix of names of added columns; generated automatically if not given
        """
        self.in_column = in_column
        self.out_column = out_column
        # handle_unknown="ignore": categories unseen during fit do not raise on transform
        self.ohe = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False)

    def _get_column_name(self) -> str:
        """Get the prefix of the added columns depending on the transform's parameters."""
        if self.out_column:
            return self.out_column
        # no explicit name: derive it from the repr of the public transform with the same parameters
        temp_transform = OneHotEncoderTransform(in_column=self.in_column, out_column=self.out_column)
        column_name = repr(temp_transform)
        if self.in_column.startswith("regressor"):
            return f"regressor_{column_name}"
        return column_name

    def fit(self, df: pd.DataFrame) -> "_OneSegmentOneHotEncoderTransform":
        """
        Fit One Hot encoder.

        Parameters
        ----------
        df:
            dataframe with data to fit the transform.
        Returns
        -------
        self
        """
        # the encoder expects a 2D input, so the column is reshaped to (n_samples, 1)
        self.ohe.fit(np.array(df[self.in_column]).reshape(-1, 1))
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Encode the `in_column` by fitted One Hot encoder.

        One column named `<prefix>_<i>` is added per category found during fit.

        Parameters
        ----------
        df
            dataframe with data to transform.
        Returns
        -------
        result dataframe
        """
        # work on a copy so that the input dataframe is left untouched
        result_df = df.copy()
        # resolve the prefix once instead of once per category
        prefix = self._get_column_name()
        n_categories = len(self.ohe.categories_[0])
        new_columns = [f"{prefix}_{i}" for i in range(n_categories)]
        result_df[new_columns] = self.ohe.transform(np.array(df[self.in_column]).reshape(-1, 1))
        return result_df
|
||
|
||
class OneHotEncoderTransform(PerSegmentWrapper):
    """Encode categorical feature as a one-hot numeric features.

    Wraps `_OneSegmentOneHotEncoderTransform` into `PerSegmentWrapper`. The underlying encoder is created
    with ``handle_unknown="ignore"``, so a value that was not seen during fit does not raise on transform.
    """

    def __init__(self, in_column: str, out_column: Optional[str] = None):
        """
        Init OneHotEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to be encoded
        out_column:
            prefix of names of added columns. If not given, use `self.__repr__()` or `regressor_{self.__repr__()}` if it is a regressor
        """
        wrapped_transform = _OneSegmentOneHotEncoderTransform(in_column=in_column, out_column=out_column)
        self.in_column = in_column
        self.out_column = out_column
        super().__init__(transform=wrapped_transform)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
from etna.datasets import TSDataset | ||
from etna.datasets import generate_periodic_df | ||
from etna.transforms.encoders.categorical import LabelEncoderTransform | ||
from etna.transforms.encoders.categorical import OneHotEncoderTransform | ||
|
||
|
||
@pytest.fixture
def two_df_with_new_values():
    """Build two datasets that differ only in the `period` of the generated series (2 and 3)."""
    shared_params = dict(periods=3, start_time="2020-01-01", scale=10, n_segments=2)
    df1 = TSDataset.to_dataset(generate_periodic_df(period=2, **shared_params))
    df2 = TSDataset.to_dataset(generate_periodic_df(period=3, **shared_params))
    return df1, df2
|
||
|
||
@pytest.fixture
def df_for_categorical_encoding():
    """Build a dataset of two periodic segments with four timestamps each."""
    periodic_df = generate_periodic_df(periods=4, start_time="2020-01-01", scale=10, period=3, n_segments=2)
    return TSDataset.to_dataset(periodic_df)
|
||
|
||
def test_label_encoder(df_for_categorical_encoding):
    """Test that LabelEncoderTransform works correctly on a simple case."""
    transform = LabelEncoderTransform(in_column="target")
    transform.fit(df_for_categorical_encoding)
    encoded_df = transform.transform(df_for_categorical_encoding)
    expected_values = np.array([[0, 1], [1, 0], [2, 0], [0, 1]])
    np.testing.assert_array_almost_equal(encoded_df.values, expected_values)
|
||
|
||
@pytest.mark.parametrize(
    "strategy, expected_values",
    [
        ("new_value", np.array([[0, 0], [1, -1], [-1, -1]])),
        ("None", np.array([[0, 0], [1, np.nan], [np.nan, np.nan]])),
        ("mean", np.array([[0, 0], [1, 0], [0.5, 0]])),
    ],
)
def test_new_value_label(two_df_with_new_values, strategy, expected_values):
    """Test that LabelEncoderTransform works correctly with unknown values."""
    train_df, df_with_new_values = two_df_with_new_values
    transform = LabelEncoderTransform(in_column="target", strategy=strategy)
    transform.fit(train_df)
    encoded_df = transform.transform(df_with_new_values)
    np.testing.assert_array_almost_equal(encoded_df.values, expected_values)
|
||
|
||
def test_value_error_label(df_for_categorical_encoding):
    """Test that LabelEncoderTransform raises ValueError for an unsupported strategy."""
    le = LabelEncoderTransform(in_column="target", strategy="new_vlue")
    le.fit(df_for_categorical_encoding)
    # the strategy is checked while encoding, so only `transform` is expected to raise;
    # keeping construction and fit outside the context avoids masking unrelated errors
    with pytest.raises(ValueError, match="The strategy"):
        le.transform(df_for_categorical_encoding)
|
||
|
||
def test_ohe(df_for_categorical_encoding):
    """Test that OneHotEncoderTransform works correctly on a simple case."""
    transform = OneHotEncoderTransform(in_column="target")
    transform.fit(df_for_categorical_encoding)
    encoded_df = transform.transform(df_for_categorical_encoding)
    expected_values = np.array(
        [
            [1.0, 0.0, 0.0, 5.0, 0.0, 1.0, 5.0],
            [0.0, 1.0, 0.0, 8.0, 1.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 9.0, 1.0, 0.0, 0.0],
            [1.0, 0.0, 0.0, 5.0, 0.0, 1.0, 5.0],
        ]
    )
    np.testing.assert_array_almost_equal(encoded_df.values, expected_values)
|
||
|
||
def test_new_value_ohe(two_df_with_new_values):
    """Test that OneHotEncoderTransform works correctly with unknown values."""
    train_df, df_with_new_values = two_df_with_new_values
    transform = OneHotEncoderTransform(in_column="target", out_column="targets")
    transform.fit(train_df)
    encoded_df = transform.transform(df_with_new_values)
    expected_values = np.array(
        [
            [5.0, 1.0, 0.0, 5.0, 1.0, 0.0],
            [8.0, 0.0, 1.0, 0.0, 0.0, 0.0],
            [9.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        ]
    )
    np.testing.assert_array_almost_equal(encoded_df.values, expected_values)
|
||
|
||
def test_naming_ohe(two_df_with_new_values):
    """Test that OneHotEncoderTransform gives the correct columns."""
    train_df, df_with_new_values = two_df_with_new_values
    transform = OneHotEncoderTransform(in_column="target", out_column="targets")
    transform.fit(train_df)
    segment_names = ["segment_0", "segment_1"]
    feature_names = ["target", "targets_0", "targets_1"]
    expected_columns = {(segment, feature) for segment in segment_names for feature in feature_names}
    actual_columns = set(transform.transform(df_with_new_values).columns.values)
    assert expected_columns == actual_columns
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is no such value as "None", only "none".