Skip to content

Don't fill first timestamps in TimeSeriesImputerTransform #634

Merged
merged 3 commits into from
Apr 5, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixed unsafe comparison in plots ([#611](https://github.com/tinkoff-ai/etna/pull/611))
- Fixed plot_trend does not work with Linear and TheilSen transforms ([#617](https://github.com/tinkoff-ai/etna/pull/617))
-
-
- Don't fill first timestamps in TimeSeriesImputerTransform ([#634](https://github.com/tinkoff-ai/etna/pull/634))
-

## [1.7.0] - 2022-03-16
Expand Down
28 changes: 14 additions & 14 deletions etna/transforms/missing_values/imputation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ class ImputerMode(str, Enum):
class _OneSegmentTimeSeriesImputerTransform(Transform):
"""One segment version of transform to fill NaNs in series of a given dataframe.

- It is assumed that given series begins with first non NaN value.

- This transform can't fill NaNs in the future, only on train data.
- This transform can't fill NaNs in non-zero strategy if all values are Nans. In this case exception is raised.
- In 'forward_fill' strategy very first value and first NaNs are replaced with zero.

- This transform can't fill NaNs if all values are NaNs. In this case exception is raised.
"""

def __init__(self, in_column: str = "target", strategy: str = ImputerMode.zero, window: int = -1):
Expand Down Expand Up @@ -69,11 +71,15 @@ def fit(self, df: pd.DataFrame) -> "_OneSegmentTimeSeriesImputerTransform":
self: _OneSegmentTimeSeriesImputerTransform
fitted preprocess
"""
self.nan_timestamps = df[df[self.in_column].isna()].index
raw_series = df[self.in_column]
if np.all(raw_series.isna()):
raise ValueError("Series hasn't non NaN values which means it is empty and can't be filled.")
series = raw_series[raw_series.first_valid_index() :]
self.nan_timestamps = series[series.isna()].index
if self.strategy == ImputerMode.zero:
self.fill_value = 0
elif self.strategy == ImputerMode.mean:
self.fill_value = df[self.in_column].mean()
self.fill_value = series.mean()
return self

def transform(self, df: pd.DataFrame) -> pd.DataFrame:
Expand All @@ -93,12 +99,6 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
result_df = df.copy()
cur_nans = result_df[result_df[self.in_column].isna()].index

# check if all values are nans
if cur_nans.shape[0] == result_df.shape[0] and self.strategy != ImputerMode.zero:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are you sure we can drop this check?
how about transform call in TSDataset.make_future?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In make_future we have train_values + future_values. If all values are NaNs in make_future then on fit stage they also was NaN (because there was only train_values) and we will get an exception on fit stage.

However, if in the future we will make a transform in make_future on part of train data (for optimization) we can face the situation when we have non-nan values on fit, but all nans on transform. But this whole situation looks troublesome: we want to make imputation by train values and we have non-nans train values on fit, but we lost them on transform stage and can't make a transformation. That means that we've already made a mistake by this separation of data on fit and transform and this mistake isn't really a problem of our transform.

raise ValueError(
f"It isn't possible to make imputation in {self.strategy.value} mode if all values are NaNs"
)

result_df[self.in_column] = self._fill(result_df[self.in_column])

# restore nans not in self.nan_timestamps
Expand Down Expand Up @@ -145,8 +145,6 @@ def _fill(self, df: pd.Series) -> pd.Series:
df = df.fillna(value=self.fill_value)
elif self.strategy == ImputerMode.forward_fill:
df = df.fillna(method="ffill")
# very first value or first NaNs should be filled
df = df.fillna(value=0)
elif self.strategy == ImputerMode.running_mean:
for i, val in enumerate(df):
if pd.isnull(val):
Expand All @@ -158,9 +156,11 @@ def _fill(self, df: pd.Series) -> pd.Series:
class TimeSeriesImputerTransform(PerSegmentWrapper):
"""Transform to fill NaNs in series of a given dataframe.

- It is assumed that given series begins with first non NaN value.

- This transform can't fill NaNs in the future, only on train data.
- This transform can't fill NaNs in non-zero strategy if all values are Nans. In this case exception is raised.
- In 'forward_fill' strategy very first value and first NaNs are replaced with zero.

- This transform can't fill NaNs if all values are NaNs. In this case exception is raised.

Warning
-------
Expand Down
54 changes: 36 additions & 18 deletions tests/test_transforms/test_missing_values/test_impute_transform.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from copy import deepcopy

import numpy as np
import pandas as pd
import pytest
Expand All @@ -8,6 +10,21 @@
from etna.transforms.missing_values.imputation import _OneSegmentTimeSeriesImputerTransform


@pytest.fixture
def ts_nans_beginning(example_reg_tsds):
"""Example dataset with NaNs at the beginning."""
ts = deepcopy(example_reg_tsds)

# nans at the beginning (shouldn't be filled)
ts.loc[ts.index[:5], pd.IndexSlice["segment_1", "target"]] = np.NaN

# nans in the middle (should be filled)
ts.loc[ts.index[8], pd.IndexSlice["segment_1", "target"]] = np.NaN
ts.loc[ts.index[10], pd.IndexSlice["segment_2", "target"]] = np.NaN
ts.loc[ts.index[40], pd.IndexSlice["segment_2", "target"]] = np.NaN
return ts


def test_wrong_init_one_segment():
"""Check that imputer for one segment fails to init with wrong imputing strategy."""
with pytest.raises(ValueError):
Expand Down Expand Up @@ -39,33 +56,19 @@ def test_all_dates_present_impute_two_segments(all_date_present_df_two_segments:
np.testing.assert_array_equal(all_date_present_df_two_segments[segment]["target"], result[segment]["target"])


def test_all_missing_impute_zero(df_all_missing: pd.DataFrame):
"""Check that imputer fills zero value if all values are nans and strategy is zero."""
imputer = _OneSegmentTimeSeriesImputerTransform(strategy="zero")
result = imputer.fit_transform(df_all_missing)
assert np.all(result == 0)


def test_all_missing_impute_zero_two_segments(df_all_missing_two_segments: pd.DataFrame):
"""Check that imputer fills zero value if all values are nans and strategy is zero."""
imputer = TimeSeriesImputerTransform(strategy="zero")
result = imputer.fit_transform(df_all_missing_two_segments)
assert np.all(result == 0)


@pytest.mark.parametrize("fill_strategy", ["mean", "running_mean", "forward_fill"])
@pytest.mark.parametrize("fill_strategy", ["zero", "mean", "running_mean", "forward_fill"])
def test_all_missing_impute_fail(df_all_missing: pd.DataFrame, fill_strategy: str):
"""Check that imputer can't fill nans if all values are nans."""
imputer = _OneSegmentTimeSeriesImputerTransform(strategy=fill_strategy)
with pytest.raises(ValueError, match="It isn't possible to make imputation"):
with pytest.raises(ValueError, match="Series hasn't non NaN values which means it is empty and can't be filled"):
_ = imputer.fit_transform(df_all_missing)


@pytest.mark.parametrize("fill_strategy", ["mean", "running_mean", "forward_fill"])
def test_all_missing_impute_fail_two_segments(df_all_missing_two_segments: pd.DataFrame, fill_strategy: str):
"""Check that imputer can't fill nans if all values are nans."""
imputer = TimeSeriesImputerTransform(strategy=fill_strategy)
with pytest.raises(ValueError, match="It isn't possible to make imputation"):
with pytest.raises(ValueError, match="Series hasn't non NaN values which means it is empty and can't be filled"):
_ = imputer.fit_transform(df_all_missing_two_segments)


Expand Down Expand Up @@ -209,7 +212,22 @@ def test_inverse_transform_in_forecast(df_with_missing_range_x_index_two_segment


@pytest.mark.parametrize("fill_strategy", ["mean", "zero", "running_mean", "forward_fill"])
def test_fit_transform_with_nans(fill_strategy, ts_diff_endings):
def test_fit_transform_nans_at_the_beginning(fill_strategy, ts_nans_beginning):
"""Check that transform doesn't fill NaNs at the beginning."""
imputer = TimeSeriesImputerTransform(in_column="target", strategy=fill_strategy)
df_init = ts_nans_beginning.to_pandas()
ts_nans_beginning.fit_transform([imputer])
df_filled = ts_nans_beginning.to_pandas()
for segment in ts_nans_beginning.segments:
df_segment_init = df_init.loc[:, pd.IndexSlice[segment, "target"]]
df_segment_filled = df_filled.loc[:, pd.IndexSlice[segment, "target"]]
first_valid_index = df_segment_init.first_valid_index()
assert df_segment_init[:first_valid_index].equals(df_segment_filled[:first_valid_index])
assert not df_segment_filled[first_valid_index:].isna().any()


@pytest.mark.parametrize("fill_strategy", ["mean", "zero", "running_mean", "forward_fill"])
def test_fit_transform_nans_at_the_end(fill_strategy, ts_diff_endings):
"""Check that transform correctly works with NaNs at the end."""
imputer = TimeSeriesImputerTransform(in_column="target", strategy=fill_strategy)
ts_diff_endings.fit_transform([imputer])
Expand Down