Skip to content

Add TreeFeatureSelectionTransform #229

Merged
merged 8 commits into from
Oct 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Ensembles notebook ([#218](https://github.com/tinkoff-ai/etna-ts/pull/218))
- Function plot_backtest_interactive ([#225](https://github.com/tinkoff-ai/etna-ts/pull/225))
- Confidence intervals in Pipeline ([#221](https://github.com/tinkoff-ai/etna-ts/pull/221))
- TreeFeatureSelectionTransform ([#229](https://github.com/tinkoff-ai/etna-ts/pull/229))

### Changed
- Delete offset from WindowStatisticsTransform ([#111](https://github.com/tinkoff-ai/etna-ts/pull/111))
Expand Down
1 change: 1 addition & 0 deletions etna/transforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from etna.transforms.datetime_flags import TimeFlagsTransform
from etna.transforms.detrend import LinearTrendTransform
from etna.transforms.detrend import TheilSenTrendTransform
from etna.transforms.feature_importance import TreeFeatureSelectionTransform
from etna.transforms.imputation import TimeSeriesImputerTransform
from etna.transforms.lags import LagTransform
from etna.transforms.log import LogTransform
Expand Down
134 changes: 134 additions & 0 deletions etna/transforms/feature_importance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import warnings
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor

from etna.datasets import TSDataset
from etna.transforms.base import Transform

TreeBasedRegressor = Union[
DecisionTreeRegressor,
ExtraTreeRegressor,
RandomForestRegressor,
ExtraTreesRegressor,
GradientBoostingRegressor,
CatBoostRegressor,
]


class TreeFeatureSelectionTransform(Transform):
    """Transform that selects regressors according to tree-based models feature importance."""

    def __init__(self, model: TreeBasedRegressor, top_k: int):
        """
        Init TreeFeatureSelectionTransform.

        Parameters
        ----------
        model:
            model to make selection, it should have feature_importances_ property
            (e.g. all tree-based regressors in sklearn)
        top_k:
            num of regressors to select; if there are not enough regressors, then all will be selected

        Raises
        ------
        ValueError:
            if top_k is not an integer or is negative
        """
        if not isinstance(top_k, int) or top_k < 0:
            raise ValueError("Parameter top_k should be positive integer")

        self.model = model
        self.top_k = top_k
        # filled in by fit; stays None if fit wasn't called or saw no regressors
        self.selected_regressors: Optional[List[str]] = None

    @staticmethod
    def _get_regressors(df: pd.DataFrame) -> List[str]:
        """Get sorted list of unique regressor columns in the dataframe."""
        features = df.columns.get_level_values("feature")
        return sorted({column for column in features if column.startswith("regressor_")})

    @staticmethod
    def _get_train(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
        """Get train features (regressor columns) and target with NaN rows dropped."""
        regressors = TreeFeatureSelectionTransform._get_regressors(df)
        # TODO: fix when TSDataset.to_pandas became static
        ts = TSDataset(df, freq=pd.infer_freq(df.index))
        df = ts.to_pandas(flatten=True).dropna()
        train_target = df["target"]
        train_data = df[regressors]
        return train_data, train_target

    def _get_regressors_weights(self, df: pd.DataFrame) -> Dict[str, float]:
        """Get weights for regressors based on model feature importances."""
        train_data, train_target = self._get_train(df)
        self.model.fit(train_data, train_target)
        weights_array = self.model.feature_importances_
        weights_dict = {
            column: weights_array[i] for i, column in enumerate(train_data.columns) if column.startswith("regressor_")
        }
        return weights_dict

    @staticmethod
    def _select_top_k_regressors(weights: Dict[str, float], top_k: int) -> List[str]:
        """Return names of at most top_k regressors with the largest weights."""
        keys = np.array(list(weights.keys()))
        values = np.array(list(weights.values()))
        idx_sort = np.argsort(values)[::-1]  # descending by importance
        idx_selected = idx_sort[:top_k]
        return keys[idx_selected].tolist()

    def fit(self, df: pd.DataFrame) -> "TreeFeatureSelectionTransform":
        """
        Fit the model and remember features to select.

        Parameters
        ----------
        df:
            dataframe with all segments data

        Returns
        -------
        result: TreeFeatureSelectionTransform
            instance after fitting
        """
        if len(self._get_regressors(df)) == 0:
            warnings.warn("It is not possible to select regressors if there aren't any")
            return self
        weights = self._get_regressors_weights(df)
        self.selected_regressors = self._select_top_k_regressors(weights, self.top_k)
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Select top_k regressors.

        Parameters
        ----------
        df:
            dataframe with all segments data

        Returns
        -------
        result: pd.DataFrame
            Dataframe with only selected regressors
        """
        result = df.copy()
        # non-regressor columns short-circuit the first condition, so they survive even
        # when selected_regressors is None (fit that saw no regressors);
        # NOTE(review): transforming regressor columns without a prior successful fit
        # raises TypeError here because selected_regressors is still None
        selected_columns = sorted(
            [
                column
                for column in df.columns.get_level_values("feature").unique()
                if not column.startswith("regressor_") or column in self.selected_regressors
            ]
        )
        result = result.loc[:, pd.IndexSlice[:, selected_columns]]
        return result
205 changes: 205 additions & 0 deletions tests/test_transforms/test_feature_importance_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
import pandas as pd
import pytest
from catboost import CatBoostRegressor
from numpy.random import RandomState
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor

from etna.datasets import TSDataset
from etna.datasets import generate_ar_df
from etna.models import LinearPerSegmentModel
from etna.pipeline import Pipeline
from etna.transforms import SegmentEncoderTransform
from etna.transforms.feature_importance import TreeFeatureSelectionTransform


@pytest.fixture
def ts_with_regressors():
    """Build a 3-segment AR dataset with 12 useless and 3 useful exogenous regressors.

    The useless regressors are independent AR series; the useful ones copy the
    target with small Gaussian noise, so a sane selector should prefer them.
    Target history is truncated to the first 201 points, while regressors cover
    all 300 timestamps (so they are known into the future).
    """
    num_segments = 3
    df = generate_ar_df(
        start_time="2020-01-01", periods=300, ar_coef=[1], sigma=1, n_segments=num_segments, random_seed=0, freq="D"
    )

    example_segment = df["segment"].unique()[0]
    timestamp = df[df["segment"] == example_segment]["timestamp"]
    df_exog = pd.DataFrame({"timestamp": timestamp})

    # useless regressors
    num_useless = 12
    df_regressors_useless = generate_ar_df(
        start_time="2020-01-01", periods=300, ar_coef=[1], sigma=1, n_segments=num_useless, random_seed=1, freq="D"
    )
    for i, segment in enumerate(df_regressors_useless["segment"].unique()):
        regressor = df_regressors_useless[df_regressors_useless["segment"] == segment]["target"].values
        df_exog[f"regressor_useless_{i}"] = regressor

    # useful regressors: the same as target but with little noise
    df_regressors_useful = df.copy()
    sampler = RandomState(seed=2).normal
    for i, segment in enumerate(df_regressors_useful["segment"].unique()):
        regressor = df_regressors_useful[df_regressors_useful["segment"] == segment]["target"].values
        noise = sampler(scale=0.05, size=regressor.shape)
        df_exog[f"regressor_useful_{i}"] = regressor + noise

    # construct exog
    classic_exog_list = []
    for segment in df["segment"].unique():
        tmp = df_exog.copy(deep=True)
        tmp["segment"] = segment
        classic_exog_list.append(tmp)
    df_exog_all_segments = pd.concat(classic_exog_list)

    # construct TSDataset
    df = df[df["timestamp"] <= timestamp[200]]
    return TSDataset(df=TSDataset.to_dataset(df), df_exog=TSDataset.to_dataset(df_exog_all_segments), freq="D")


@pytest.mark.parametrize(
    "model",
    [
        DecisionTreeRegressor(random_state=42),
        ExtraTreeRegressor(random_state=42),
        RandomForestRegressor(n_estimators=10, random_state=42),
        ExtraTreesRegressor(n_estimators=10, random_state=42),
        GradientBoostingRegressor(n_estimators=10, random_state=42),
        CatBoostRegressor(iterations=10, random_state=42, silent=True, cat_features=["regressor_segment_code"]),
    ],
)
@pytest.mark.parametrize("top_k", [0, 1, 5, 15, 50])
def test_selected_top_k_regressors(model, top_k, ts_with_regressors):
    """Check that transform selects exactly top_k regressors if there are at least that many,
    and all of them otherwise."""
    df = ts_with_regressors.to_pandas()
    le_encoder = SegmentEncoderTransform()
    df_encoded = le_encoder.fit_transform(df)
    selector = TreeFeatureSelectionTransform(model=model, top_k=top_k)
    df_selected = selector.fit_transform(df_encoded)

    # the encoder adds one more regressor on top of the fixture's regressors
    all_regressors = ts_with_regressors.regressors
    all_regressors.append("regressor_segment_code")
    selected_regressors = set()
    for column in df_selected.columns.get_level_values("feature"):
        if column.startswith("regressor"):
            selected_regressors.add(column)

    assert len(selected_regressors) == min(len(all_regressors), top_k)


@pytest.mark.parametrize(
    "model",
    [
        DecisionTreeRegressor(random_state=42),
        ExtraTreeRegressor(random_state=42),
        RandomForestRegressor(n_estimators=10, random_state=42),
        ExtraTreesRegressor(n_estimators=10, random_state=42),
        GradientBoostingRegressor(n_estimators=10, random_state=42),
        CatBoostRegressor(iterations=10, random_state=42, silent=True, cat_features=["regressor_segment_code"]),
    ],
)
@pytest.mark.parametrize("top_k", [0, 1, 5, 15, 50])
def test_retain_values(model, top_k, ts_with_regressors):
    """Check that transform doesn't change values of columns."""
    df = ts_with_regressors.to_pandas()
    df_encoded = SegmentEncoderTransform().fit_transform(df)
    df_selected = TreeFeatureSelectionTransform(model=model, top_k=top_k).fit_transform(df_encoded)

    # every column that survived selection must be identical to its original
    for segment in ts_with_regressors.segments:
        for feature in df_selected.columns.get_level_values("feature").unique():
            original_column = df_encoded.loc[:, pd.IndexSlice[segment, feature]]
            selected_column = df_selected.loc[:, pd.IndexSlice[segment, feature]]
            assert (selected_column == original_column).all()


@pytest.mark.parametrize(
    "model",
    [
        DecisionTreeRegressor(random_state=42),
        ExtraTreeRegressor(random_state=42),
        RandomForestRegressor(n_estimators=10, random_state=42),
        ExtraTreesRegressor(n_estimators=10, random_state=42),
        GradientBoostingRegressor(n_estimators=10, random_state=42),
        CatBoostRegressor(iterations=10, random_state=42, silent=True, cat_features=["regressor_segment_code"]),
    ],
)
def test_fails_negative_top_k(model, ts_with_regressors):
    """Check that constructing the transform with a negative top_k raises ValueError."""
    with pytest.raises(ValueError, match="positive integer"):
        _ = TreeFeatureSelectionTransform(model=model, top_k=-1)


@pytest.mark.parametrize(
    "model",
    [
        DecisionTreeRegressor(random_state=42),
        ExtraTreeRegressor(random_state=42),
        RandomForestRegressor(n_estimators=10, random_state=42),
        ExtraTreesRegressor(n_estimators=10, random_state=42),
        GradientBoostingRegressor(n_estimators=10, random_state=42),
        CatBoostRegressor(iterations=10, random_state=42, silent=True),
    ],
)
def test_warns_no_regressors(model, example_tsds):
    """Check that fitting on a dataset without regressors succeeds but emits a warning."""
    df = example_tsds.to_pandas()
    selector = TreeFeatureSelectionTransform(model=model, top_k=3)
    with pytest.warns(UserWarning, match="not possible to select regressors"):
        df_selected = selector.fit_transform(df)
    # nothing should be dropped or altered when there is nothing to select
    assert (df == df_selected).all().all()


@pytest.mark.parametrize(
    "model",
    [
        DecisionTreeRegressor(random_state=42),
        ExtraTreeRegressor(random_state=42),
        RandomForestRegressor(n_estimators=10, random_state=42),
        ExtraTreesRegressor(n_estimators=10, random_state=42),
        GradientBoostingRegressor(n_estimators=10, random_state=42),
        CatBoostRegressor(iterations=700, random_state=42, silent=True, cat_features=["regressor_segment_code"]),
    ],
)
def test_sanity_selected(model, ts_with_regressors):
    """Check that transform correctly finds meaningful regressors."""
    df_encoded = SegmentEncoderTransform().fit_transform(ts_with_regressors.to_pandas())
    selector = TreeFeatureSelectionTransform(model=model, top_k=8)
    df_selected = selector.fit_transform(df_encoded)

    # all 3 "useful" regressors from the fixture must survive selection
    remaining_features = df_selected.columns.get_level_values("feature").unique()
    useful_regressors = [
        column for column in remaining_features if column.startswith("regressor_") and "useful" in column
    ]
    assert len(useful_regressors) == 3


@pytest.mark.parametrize(
    "model",
    [
        DecisionTreeRegressor(random_state=42),
        ExtraTreeRegressor(random_state=42),
        RandomForestRegressor(n_estimators=10, random_state=42),
        ExtraTreesRegressor(n_estimators=10, random_state=42),
        GradientBoostingRegressor(n_estimators=10, random_state=42),
        CatBoostRegressor(iterations=500, silent=True, random_state=42, cat_features=["regressor_segment_code"]),
    ],
)
def test_sanity_model(model, ts_with_regressors):
    """Check that training with this transform can utilize selected regressors."""
    ts_train, ts_test = ts_with_regressors.train_test_split(test_size=30)
    le_encoder = SegmentEncoderTransform()
    selector = TreeFeatureSelectionTransform(model=model, top_k=8)

    # separate name so the selection model from parametrize isn't shadowed
    forecasting_model = LinearPerSegmentModel()
    pipeline = Pipeline(model=forecasting_model, transforms=[le_encoder, selector], horizon=30)
    pipeline.fit(ts=ts_train)
    ts_forecast = pipeline.forecast()

    for segment in ts_forecast.segments:
        test_target = ts_test[:, segment, "target"]
        forecasted_target = ts_forecast[:, segment, "target"]
        # r2_score signature is (y_true, y_pred): ground truth goes first
        r2 = r2_score(test_target, forecasted_target)
        assert r2 > 0.99