-
Notifications
You must be signed in to change notification settings - Fork 80
Categorical encoders #431
Categorical encoders #431
Changes from 2 commits
a9382ea
298c45c
b4757aa
6827f2b
55f4a00
3c0d2ea
55d7a0e
dc884c3
dc43c79
e2ade6f
f0741fe
689a7b6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
from etna.transforms.encoders.categorical import LabelEncoderTransform | ||
from etna.transforms.encoders.mean_segment_encoder import MeanSegmentEncoderTransform | ||
from etna.transforms.encoders.segment_encoder import SegmentEncoderTransform |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,228 @@ | ||
import warnings | ||
from enum import Enum | ||
from typing import Optional | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from sklearn import preprocessing | ||
from sklearn.utils._encode import _check_unknown | ||
from sklearn.utils._encode import _encode | ||
|
||
from etna.transforms.base import PerSegmentWrapper | ||
from etna.transforms.base import Transform | ||
|
||
|
||
class ImputerMode(str, Enum):
    """Enum of imputation strategies for values that were not seen during fit."""

    # unseen values get the code -1
    new_value = "new_value"
    # unseen values get the mean code of the seen values
    mean = "mean"
    # unseen values are meant to get no code (None/NaN)
    none = "none"
|
||
|
||
class _LabelEncoder(preprocessing.LabelEncoder):
    """Label encoder that imputes codes for values unseen during ``fit`` instead of failing."""

    def transform(self, y: pd.Series, strategy: str):
        """
        Encode ``y`` with the fitted classes and impute codes for unseen values.

        Parameters
        ----------
        y:
            values to encode
        strategy:
            how to fill the codes of values that are absent from ``self.classes_``:
            - If "new_value", then use -1
            - If "mean", then use the mean code of the known values in ``y``
            - If "none" (the legacy spelling "None" is accepted too), then use NaN

        Returns
        -------
        float array with the encoded values

        Raises
        ------
        ValueError:
            if ``strategy`` is not one of the supported values
        """
        values = np.asarray(y)
        diff = _check_unknown(values, known_values=self.classes_)
        is_unknown = np.isin(values, diff)
        is_known = ~is_unknown

        # Encode only the known values: for string/object data ``_encode`` raises on
        # unseen labels even with ``check_unknown=False``.
        encoded = np.full(values.shape, np.nan)
        encoded[is_known] = _encode(values[is_known], uniques=self.classes_, check_unknown=False)

        # ``ImputerMode`` members are ``str`` subclasses, so they compare equal to these literals.
        if strategy in ("none", "None"):
            filling_value = np.nan
        elif strategy == "new_value":
            filling_value = -1
        elif strategy == "mean":
            filling_value = np.mean(encoded[is_known])
        else:
            raise ValueError(
                f"There are no strategies named '{strategy}'; expected one of 'new_value', 'mean', 'none'"
            )

        encoded[is_unknown] = filling_value
        return encoded
|
||
|
||
class _OneSegmentLabelEncoderTransform(Transform):
    """Replace the values in the column with their label encoding."""

    def __init__(self, in_column: str, out_column: Optional[str], strategy: str, inplace: bool):
        """
        Create instance of _OneSegmentLabelEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to apply transform to
        out_column:
            name of added column; ignored if ``inplace`` is True
        strategy:
            how to encode values that were not seen during fit:
            - If "new_value", then replace them with '-1'
            - If "mean", then replace them with the mean of the encoded column
            - If "none", then replace them with None
        inplace:
            if True, write the encoding into in_column, if False, add transformed column to dataset
        """
        self.in_column = in_column
        self.out_column = out_column
        self.strategy = strategy
        self.le = _LabelEncoder()
        self.inplace = inplace

    def _get_column_name(self) -> str:
        """Get the `out_column` depending on the transform's parameters."""
        if self.inplace and self.out_column:
            warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
        if self.inplace:
            return self.in_column
        if self.out_column:
            return self.out_column
        # No explicit name was given: derive it from the repr of the public transform
        # built with the same parameters.
        default_name = repr(
            LabelEncoderTransform(
                in_column=self.in_column, inplace=self.inplace, out_column=self.out_column, strategy=self.strategy
            )
        )
        if self.in_column.startswith("regressor"):
            return f"regressor_{default_name}"
        return default_name

    def fit(self, df: pd.DataFrame) -> "_OneSegmentLabelEncoderTransform":
        """
        Fit Label encoder.

        Parameters
        ----------
        df:
            dataframe with data to fit the transform.

        Returns
        -------
        self
        """
        self.le.fit(df[self.in_column])
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Encode the `in_column` by fitted Label encoder.

        Parameters
        ----------
        df:
            dataframe with data to transform.

        Returns
        -------
        result dataframe
        """
        # Work on a copy so the caller's dataframe is left untouched.
        result_df = df.copy()
        result_df[self._get_column_name()] = self.le.transform(df[self.in_column], self.strategy)
        return result_df
|
||
|
||
class LabelEncoderTransform(PerSegmentWrapper):
    """Encode a categorical column with integer labels.

    Wraps ``_OneSegmentLabelEncoderTransform`` in ``PerSegmentWrapper``.
    """

    def __init__(
        self, in_column: str, inplace: bool = True, out_column: Optional[str] = None, strategy: str = ImputerMode.mean
    ):
        """
        Init LabelEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to be encoded
        inplace:
            if True, write the encoding into in_column, if False, add transformed column to dataset
        out_column:
            name of added column. If not given, use `self.__repr__()` or `regressor_{self.__repr__()}` if it is a regressor
        strategy:
            how to encode values that were not seen during fit:
            - If "new_value", then replace them with '-1'
            - If "mean", then replace them with the mean of the encoded column
            - If "none", then replace them with None
        """
        self.in_column = in_column
        self.inplace = inplace
        self.strategy = strategy
        self.out_column = out_column
        super().__init__(
            transform=_OneSegmentLabelEncoderTransform(
                in_column=self.in_column,
                out_column=self.out_column,
                strategy=self.strategy,
                inplace=self.inplace,
            )
        )
|
||
|
||
class _OneSegmentLabelBinarizerTransform(Transform):
    """Add one-hot encoding columns for a single column."""

    def __init__(self, in_column: str, out_column: str):
        """
        Create instance of _OneSegmentLabelBinarizerTransform.

        Parameters
        ----------
        in_column:
            name of column to apply transform to
        out_column:
            prefix of the names of added columns
        """
        self.in_column = in_column
        self.out_column = out_column
        self.ohe = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False)

    def _get_column_name(self) -> str:
        """Return the prefix used to name the generated columns."""
        if self.out_column:
            return self.out_column
        # TODO(review): a reviewer asked whether this temporary transform could be replaced
        # with something simpler.
        fallback_name = LabelBinarizerTransform(in_column=self.in_column, out_column=self.out_column).__repr__()
        if not self.in_column.startswith("regressor"):
            return fallback_name
        return f"regressor_{fallback_name}"

    def fit(self, df: pd.DataFrame) -> "_OneSegmentLabelBinarizerTransform":
        """
        Fit the one-hot encoder on `in_column`.

        Parameters
        ----------
        df:
            dataframe with data to fit the transform.

        Returns
        -------
        self
        """
        # The encoder expects a 2D input, hence the single-column reshape.
        column_values = np.array(df[self.in_column]).reshape(-1, 1)
        self.ohe.fit(column_values)
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add the one-hot columns produced by the fitted encoder for `in_column`.

        Parameters
        ----------
        df:
            dataframe with data to transform.

        Returns
        -------
        result dataframe
        """
        result_df = df.copy()
        one_hot = self.ohe.transform(np.array(df[self.in_column]).reshape(-1, 1))
        # One output column per category learned during fit.
        n_categories = len(self.ohe.categories_[0])
        new_columns = [self._get_column_name() + "_" + str(position) for position in range(n_categories)]
        result_df[new_columns] = one_hot
        return result_df
|
||
|
||
class LabelBinarizerTransform(PerSegmentWrapper):
    """Encode a categorical column with one-hot columns.

    Wraps ``_OneSegmentLabelBinarizerTransform`` in ``PerSegmentWrapper``. One column is
    added per category seen during fit; the names are ``{out_column}_{i}``.
    """

    def __init__(self, in_column: str, out_column: Optional[str] = None):
        """
        Init LabelBinarizerTransform.

        Parameters
        ----------
        in_column:
            name of column to be encoded
        out_column:
            prefix of names of added columns. If not given, use `self.__repr__()` or `regressor_{self.__repr__()}` if it is a regressor
        """
        self.in_column = in_column
        self.out_column = out_column
        super().__init__(
            transform=_OneSegmentLabelBinarizerTransform(in_column=self.in_column, out_column=self.out_column)
        )
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
from etna.datasets import TSDataset | ||
from etna.datasets import generate_periodic_df | ||
from etna.transforms.encoders.categorical import LabelBinarizerTransform | ||
from etna.transforms.encoders.categorical import LabelEncoderTransform | ||
|
||
|
||
@pytest.fixture
def two_df_with_new_values():
    """Return two datasets that differ only in the fourth positional generator argument (2 vs 3)."""
    # TODO(review): pass the generate_periodic_df arguments by keyword; the meaning of the
    # positional values is not obvious.
    fit_df, transform_df = (
        TSDataset.to_dataset(generate_periodic_df(3, "2020-01-01", 10, fourth_arg, n_segments=2))
        for fourth_arg in (2, 3)
    )
    return fit_df, transform_df
|
||
|
||
@pytest.fixture
def df_for_categorical_encoding():
    """Return a two-segment periodic dataset used by the encoder tests."""
    periodic_df = generate_periodic_df(4, "2020-01-01", 10, 3, n_segments=2)
    return TSDataset.to_dataset(periodic_df)
|
||
|
||
def test_label_encoder(df_for_categorical_encoding):
    """Check the label encoding when the same dataframe is used for fit and transform."""
    # TODO(review): a reviewer suggested renaming this test to "test_label_encoder_simple".
    encoder = LabelEncoderTransform(in_column="target")
    encoder.fit(df_for_categorical_encoding)
    encoded_values = encoder.transform(df_for_categorical_encoding).values
    np.testing.assert_array_almost_equal(encoded_values, np.array([[0, 1], [1, 0], [2, 0], [0, 1]]))
|
||
|
||
@pytest.mark.parametrize(
    "strategy, expected_values",
    [
        ("new_value", np.array([[0, 0], [1, -1], [-1, -1]])),
        # NOTE(review): "None" differs in case from ImputerMode.none, whose value is "none".
        ("None", np.array([[0, 0], [1, np.nan], [np.nan, np.nan]])),
        ("mean", np.array([[0, 0], [1, 0], [0.5, 0]])),
    ],
)
def test_new_value_label(two_df_with_new_values, strategy, expected_values):
    """Check how each strategy encodes values that were absent from the fitted dataframe."""
    # TODO(review): a reviewer noted that "label_encoder" is missing from the test name.
    fit_df, transform_df = two_df_with_new_values
    encoder = LabelEncoderTransform(in_column="target", strategy=strategy)
    encoder.fit(fit_df)
    encoded_values = encoder.transform(transform_df).values
    np.testing.assert_array_almost_equal(encoded_values, expected_values)
|
||
|
||
def test_value_error_label(df_for_categorical_encoding):
    """Check that an unknown strategy name ends in a ValueError."""
    source_df = df_for_categorical_encoding
    with pytest.raises(ValueError, match="There are no"):
        encoder = LabelEncoderTransform(in_column="target", strategy="new_vlue")
        encoder.fit(source_df)
        encoder.transform(source_df)
|
||
|
||
def test_label_binarizer(df_for_categorical_encoding):
    """Check the one-hot columns when the same dataframe is used for fit and transform."""
    expected_values = np.array(
        [
            [1.0, 0.0, 0.0, 5.0, 0.0, 1.0, 5.0],
            [0.0, 1.0, 0.0, 8.0, 1.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 9.0, 1.0, 0.0, 0.0],
            [1.0, 0.0, 0.0, 5.0, 0.0, 1.0, 5.0],
        ]
    )
    binarizer = LabelBinarizerTransform(in_column="target")
    binarizer.fit(df_for_categorical_encoding)
    actual_values = binarizer.transform(df_for_categorical_encoding).values
    np.testing.assert_array_almost_equal(actual_values, expected_values)
|
||
|
||
def test_new_value_label_binarizer(two_df_with_new_values):
    """Check the one-hot columns for values that were absent from the fitted dataframe."""
    fit_df, transform_df = two_df_with_new_values
    binarizer = LabelBinarizerTransform(in_column="target", out_column="targets")
    binarizer.fit(fit_df)
    actual_values = binarizer.transform(transform_df).values
    expected_values = np.array(
        [[5.0, 1.0, 0.0, 5.0, 1.0, 0.0], [8.0, 0.0, 1.0, 0.0, 0.0, 0.0], [9.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
    )
    np.testing.assert_array_almost_equal(actual_values, expected_values)
|
||
|
||
def test_naming_label_binarizer(two_df_with_new_values):
    """Check the names of the columns produced when `out_column` is given."""
    fit_df, transform_df = two_df_with_new_values
    binarizer = LabelBinarizerTransform(in_column="target", out_column="targets")
    binarizer.fit(fit_df)
    expected_columns = {
        (segment, feature)
        for segment in ("segment_0", "segment_1")
        for feature in ("target", "targets_0", "targets_1")
    }
    assert expected_columns == set(binarizer.transform(transform_df).columns.values)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is no such value as "None", only "none".