Skip to content

Categorical encoders #431

Merged
merged 12 commits into from
Jan 18, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Method TSDataset.info ([#409](https://github.com/tinkoff-ai/etna/pull/409))
- DifferencingTransform ([#414](https://github.com/tinkoff-ai/etna/pull/414))
- OneHotEncoderTransform and LabelEncoderTransform ([#431](https://github.com/tinkoff-ai/etna/pull/431))

### Changed
- Change method TSDataset.describe ([#409](https://github.com/tinkoff-ai/etna/pull/409))
Expand Down
2 changes: 2 additions & 0 deletions etna/transforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
from etna.transforms.decomposition import STLTransform
from etna.transforms.decomposition import TheilSenTrendTransform
from etna.transforms.decomposition import TrendTransform
from etna.transforms.encoders import LabelEncoderTransform
from etna.transforms.encoders import MeanSegmentEncoderTransform
from etna.transforms.encoders import OneHotEncoderTransform
from etna.transforms.encoders import SegmentEncoderTransform
from etna.transforms.feature_selection import FilterFeaturesTransform
from etna.transforms.feature_selection import GaleShapleyFeatureSelectionTransform
Expand Down
2 changes: 2 additions & 0 deletions etna/transforms/encoders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
from etna.transforms.encoders.categorical import LabelEncoderTransform
from etna.transforms.encoders.categorical import OneHotEncoderTransform
from etna.transforms.encoders.mean_segment_encoder import MeanSegmentEncoderTransform
from etna.transforms.encoders.segment_encoder import SegmentEncoderTransform
235 changes: 235 additions & 0 deletions etna/transforms/encoders/categorical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
import warnings
from enum import Enum
from typing import Optional

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.utils._encode import _check_unknown
from sklearn.utils._encode import _encode

from etna.transforms.base import PerSegmentWrapper
from etna.transforms.base import Transform


class ImputerMode(str, Enum):
    """Strategies for filling the codes of values that were not seen during fit."""

    # Unseen values are encoded as -1.
    new_value = "new_value"
    # Unseen values are encoded as the mean code of the known values.
    mean = "mean"
    # Unseen values are left without a code (missing value).
    none = "none"


class _LabelEncoder(preprocessing.LabelEncoder):
    """Label encoder whose ``transform`` tolerates values that were not seen during ``fit``."""

    def transform(self, y: pd.Series, strategy: str):
        """
        Encode ``y`` with the fitted classes and fill the codes of unseen values.

        Parameters
        ----------
        y:
            values to encode
        strategy:
            filling encoding in not fitted values:
            - If "new_value", then replace unseen values with -1
            - If "mean", then replace unseen values with the mean code of the known values in ``y``
            - If "none", then replace unseen values with NaN
        Returns
        -------
        float array with one code per element of ``y``
        Raises
        ------
        ValueError:
            if ``strategy`` is not one of the supported options
        """
        values = np.asarray(y)
        unseen = _check_unknown(values, known_values=self.classes_)
        unknown_mask = np.isin(values, unseen)
        known_mask = ~unknown_mask

        # Only the known values are passed to the encoder: the unseen ones get their code
        # from the strategy below.
        # NOTE(review): sklearn's private `_encode` may raise on unseen labels for some dtypes
        # (presumably object/string data) -- verify against the installed sklearn version.
        encoded = np.empty(len(values), dtype=float)
        encoded[known_mask] = _encode(values[known_mask], uniques=self.classes_, check_unknown=False)

        # The documented option (and the ImputerMode value) is lowercase "none"; the capitalised
        # spelling that was checked here before is still accepted so existing callers keep working.
        if strategy in ("none", "None"):
            filling_value = np.nan
        elif strategy == "new_value":
            filling_value = -1
        elif strategy == "mean":
            filling_value = np.mean(encoded[known_mask])
        else:
            raise ValueError(f"The strategy '{strategy}' doesn't exist")

        encoded[unknown_mask] = filling_value
        return encoded


class _OneSegmentLabelEncoderTransform(Transform):
    """Replace the values in the column of one segment with the Label encoding."""

    def __init__(self, in_column: str, out_column: Optional[str], strategy: str, inplace: bool):
        """
        Create instance of _OneSegmentLabelEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to apply transform to
        out_column:
            name of added column; if not given, the name is generated (see ``_get_column_name``)
        strategy:
            filling encoding in not fitted values:
            - If "new_value", then replace missing values with '-1'
            - If "mean", then replace missing values using the mean in encoded column
            - If "none", then replace missing values with None
        inplace:
            if True, apply transform inplace to in_column, if False, add transformed column to dataset
        """
        self.in_column = in_column
        self.out_column = out_column
        self.strategy = strategy
        self.le = _LabelEncoder()
        self.inplace = inplace

    def _get_column_name(self) -> str:
        """
        Get the `out_column` depending on the transform's parameters.

        Returns
        -------
        ``in_column`` when inplace, else ``out_column`` when given, else a name generated from the
        repr of the equivalent LabelEncoderTransform (prefixed with ``regressor_`` when ``in_column``
        starts with "regressor")
        """
        if self.inplace and self.out_column:
            warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
        if self.inplace:
            return self.in_column
        if self.out_column:
            return self.out_column
        # NOTE(review): the name is taken from the public LabelEncoderTransform rather than from
        # self.__repr__(); presumably so the generated column matches the transform users create,
        # since this per-segment helper is a different class -- confirm.
        temp_transform = LabelEncoderTransform(
            in_column=self.in_column, inplace=self.inplace, out_column=self.out_column, strategy=self.strategy
        )
        generated_name = repr(temp_transform)
        if self.in_column.startswith("regressor"):
            return f"regressor_{generated_name}"
        return generated_name

    def fit(self, df: pd.DataFrame) -> "_OneSegmentLabelEncoderTransform":
        """
        Fit Label encoder.

        Parameters
        ----------
        df:
            dataframe with data to fit the transform.
        Returns
        -------
        self
        """
        self.le.fit(df[self.in_column])
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Encode the `in_column` by fitted Label encoder.

        Parameters
        ----------
        df
            dataframe with data to transform.
        Returns
        -------
        copy of ``df`` with the encoded values written to the column given by ``_get_column_name``
        """
        result_df = df.copy()
        result_df[self._get_column_name()] = self.le.transform(df[self.in_column], self.strategy)
        return result_df


class LabelEncoderTransform(PerSegmentWrapper):
    """Encode categorical feature with value between 0 and n_classes-1."""

    def __init__(
        self, in_column: str, inplace: bool = True, out_column: Optional[str] = None, strategy: str = ImputerMode.mean
    ):
        """
        Init LabelEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to be transformed
        inplace:
            if True, the encoded values overwrite in_column; if False, they are added as a new column
        out_column:
            name of added column. If not given, use `self.__repr__()` or `regressor_{self.__repr__()}` if it is a regressor
        strategy:
            how values that were not present during fit are encoded:
            - If "new_value", then they are replaced with '-1'
            - If "mean", then they are replaced with the mean of the encoded column
            - If "none", then they are replaced with None
        """
        self.in_column = in_column
        self.out_column = out_column
        self.inplace = inplace
        self.strategy = strategy
        # One per-segment encoder is configured here and handed to the wrapper.
        segment_transform = _OneSegmentLabelEncoderTransform(
            in_column=self.in_column,
            out_column=self.out_column,
            strategy=self.strategy,
            inplace=self.inplace,
        )
        super().__init__(transform=segment_transform)


class _OneSegmentOneHotEncoderTransform(Transform):
    """Create one-hot encoding columns for one segment."""

    def __init__(self, in_column: str, out_column: Optional[str]):
        """
        Create instance of _OneSegmentOneHotEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to apply transform to
        out_column:
            prefix of names of added columns; if not given, the prefix is generated
            (see ``_get_column_name``)
        """
        self.in_column = in_column
        self.out_column = out_column
        # handle_unknown="ignore": categories unseen during fit do not raise on transform.
        # NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`; kept as-is
        # for the version this project targets -- confirm before upgrading.
        self.ohe = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False)

    def _get_column_name(self) -> str:
        """
        Get the `out_column` depending on the transform's parameters.

        Returns
        -------
        ``out_column`` when given, else a prefix generated from the repr of the equivalent
        OneHotEncoderTransform (prefixed with ``regressor_`` when ``in_column`` starts with "regressor")
        """
        if self.out_column:
            return self.out_column
        temp_transform = OneHotEncoderTransform(in_column=self.in_column, out_column=self.out_column)
        generated_name = repr(temp_transform)
        if self.in_column.startswith("regressor"):
            return f"regressor_{generated_name}"
        return generated_name

    def fit(self, df: pd.DataFrame) -> "_OneSegmentOneHotEncoderTransform":
        """
        Fit One Hot encoder.

        Parameters
        ----------
        df:
            dataframe with data to fit the transform.
        Returns
        -------
        self
        """
        self.ohe.fit(np.array(df[self.in_column]).reshape(-1, 1))
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Encode the `in_column` by fitted One Hot encoder.

        Parameters
        ----------
        df
            dataframe with data to transform.
        Returns
        -------
        copy of ``df`` with one added column per fitted category, named ``{prefix}_{i}``
        """
        # Resolve the prefix once: generating it may build a temporary transform.
        prefix = self._get_column_name()
        n_categories = len(self.ohe.categories_[0])
        out_columns = [f"{prefix}_{i}" for i in range(n_categories)]

        result_df = df.copy()
        result_df[out_columns] = self.ohe.transform(np.array(df[self.in_column]).reshape(-1, 1))
        return result_df


class OneHotEncoderTransform(PerSegmentWrapper):
    """Encode categorical feature as a one-hot numeric features.

    One column is added per category seen during fit. The underlying encoder is created with
    ``handle_unknown="ignore"``, so a value that was not seen during fit does not raise on
    transform; it gets zeros in all of the added columns.
    """

    def __init__(self, in_column: str, out_column: Optional[str] = None):
        """
        Init OneHotEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to be encoded
        out_column:
            prefix of names of added columns. If not given, use `self.__repr__()` or `regressor_{self.__repr__()}` if it is a regressor
        """
        self.in_column = in_column
        self.out_column = out_column
        # One per-segment encoder is configured here and handed to the wrapper.
        segment_transform = _OneSegmentOneHotEncoderTransform(
            in_column=self.in_column,
            out_column=self.out_column,
        )
        super().__init__(transform=segment_transform)
93 changes: 93 additions & 0 deletions tests/test_transforms/test_encoders/test_categorical_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import numpy as np
import pytest

from etna.datasets import TSDataset
from etna.datasets import generate_periodic_df
from etna.transforms.encoders.categorical import LabelEncoderTransform
from etna.transforms.encoders.categorical import OneHotEncoderTransform


@pytest.fixture
def two_df_with_new_values():
    """Build two periodic datasets that differ only in period (2 vs 3).

    The tests fit on the first frame and transform the second one.
    """
    shared_params = dict(periods=3, start_time="2020-01-01", scale=10, n_segments=2)
    fit_df = TSDataset.to_dataset(generate_periodic_df(period=2, **shared_params))
    transform_df = TSDataset.to_dataset(generate_periodic_df(period=3, **shared_params))
    return fit_df, transform_df


@pytest.fixture
def df_for_categorical_encoding():
    """Build a two-segment periodic dataset (4 timestamps, period 3) for the encoder tests."""
    periodic_df = generate_periodic_df(
        periods=4,
        start_time="2020-01-01",
        scale=10,
        period=3,
        n_segments=2,
    )
    return TSDataset.to_dataset(periodic_df)


def test_label_encoder(df_for_categorical_encoding):
    """Test that LabelEncoderTransform works correctly in a simple case."""
    expected_values = np.array([[0, 1], [1, 0], [2, 0], [0, 1]])
    encoder = LabelEncoderTransform(in_column="target")
    encoder.fit(df_for_categorical_encoding)
    encoded_values = encoder.transform(df_for_categorical_encoding).values
    np.testing.assert_array_almost_equal(encoded_values, expected_values)


@pytest.mark.parametrize(
    "strategy, expected_values",
    [
        ("new_value", np.array([[0, 0], [1, -1], [-1, -1]])),
        ("None", np.array([[0, 0], [1, np.nan], [np.nan, np.nan]])),
        ("mean", np.array([[0, 0], [1, 0], [0.5, 0]])),
    ],
)
def test_new_value_label(two_df_with_new_values, strategy, expected_values):
    """Test that LabelEncoderTransform works correctly with unknown values."""
    fit_df, transform_df = two_df_with_new_values
    encoder = LabelEncoderTransform(in_column="target", strategy=strategy)
    encoder.fit(fit_df)
    encoded_values = encoder.transform(transform_df).values
    np.testing.assert_array_almost_equal(encoded_values, expected_values)


def test_value_error_label(df_for_categorical_encoding):
    """Test that LabelEncoderTransform raises ValueError on transform with a wrong strategy."""
    le = LabelEncoderTransform(in_column="target", strategy="new_vlue")
    le.fit(df_for_categorical_encoding)
    # Only the call expected to raise is inside the context manager, so an unrelated
    # ValueError from construction or fit cannot make the test pass by accident.
    with pytest.raises(ValueError, match="The strategy"):
        le.transform(df_for_categorical_encoding)


def test_ohe(df_for_categorical_encoding):
    """Test that OneHotEncoderTransform works correctly in a simple case."""
    expected_rows = [
        [1.0, 0.0, 0.0, 5.0, 0.0, 1.0, 5.0],
        [0.0, 1.0, 0.0, 8.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 9.0, 1.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 0.0, 1.0, 5.0],
    ]
    expected_values = np.array(expected_rows)
    encoder = OneHotEncoderTransform(in_column="target")
    encoder.fit(df_for_categorical_encoding)
    encoded_values = encoder.transform(df_for_categorical_encoding).values
    np.testing.assert_array_almost_equal(encoded_values, expected_values)


def test_new_value_ohe(two_df_with_new_values):
    """Test that OneHotEncoderTransform works correctly with unknown values."""
    fit_df, transform_df = two_df_with_new_values
    expected_values = np.array(
        [
            [5.0, 1.0, 0.0, 5.0, 1.0, 0.0],
            [8.0, 0.0, 1.0, 0.0, 0.0, 0.0],
            [9.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        ]
    )
    encoder = OneHotEncoderTransform(in_column="target", out_column="targets")
    encoder.fit(fit_df)
    encoded_values = encoder.transform(transform_df).values
    np.testing.assert_array_almost_equal(encoded_values, expected_values)


def test_naming_ohe(two_df_with_new_values):
    """Test that OneHotEncoderTransform gives the correct columns."""
    fit_df, transform_df = two_df_with_new_values
    encoder = OneHotEncoderTransform(in_column="target", out_column="targets")
    encoder.fit(fit_df)

    segments = ["segment_0", "segment_1"]
    features = ["target", "targets_0", "targets_1"]
    expected_columns = {(segment, feature) for segment in segments for feature in features}

    actual_columns = set(encoder.transform(transform_df).columns.values)
    assert expected_columns == actual_columns