diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a6818e88..7fea2a57e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,7 +35,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed unsafe comparison in plots ([#611](https://github.com/tinkoff-ai/etna/pull/611)) - Fixed plot_trend does not work with Linear and TheilSen transforms ([#617](https://github.com/tinkoff-ai/etna/pull/617)) - -- +- Don't fill first timestamps in TimeSeriesImputerTransform ([#634](https://github.com/tinkoff-ai/etna/pull/634)) - ## [1.7.0] - 2022-03-16 diff --git a/etna/transforms/missing_values/imputation.py b/etna/transforms/missing_values/imputation.py index 112e612ed..175e2e82d 100644 --- a/etna/transforms/missing_values/imputation.py +++ b/etna/transforms/missing_values/imputation.py @@ -20,9 +20,11 @@ class ImputerMode(str, Enum): class _OneSegmentTimeSeriesImputerTransform(Transform): """One segment version of transform to fill NaNs in series of a given dataframe. + - It is assumed that given series begins with first non NaN value. + - This transform can't fill NaNs in the future, only on train data. - - This transform can't fill NaNs in non-zero strategy if all values are Nans. In this case exception is raised. - - In 'forward_fill' strategy very first value and first NaNs are replaced with zero. + + - This transform can't fill NaNs if all values are NaNs. In this case exception is raised. """ def __init__(self, in_column: str = "target", strategy: str = ImputerMode.zero, window: int = -1): @@ -69,11 +71,15 @@ def fit(self, df: pd.DataFrame) -> "_OneSegmentTimeSeriesImputerTransform": self: _OneSegmentTimeSeriesImputerTransform fitted preprocess """ - self.nan_timestamps = df[df[self.in_column].isna()].index + raw_series = df[self.in_column] + if np.all(raw_series.isna()): + raise ValueError("Series hasn't non NaN values which means it is empty and can't be filled.") + series = raw_series[raw_series.first_valid_index() :] + self.nan_timestamps = series[series.isna()].index if self.strategy == ImputerMode.zero: self.fill_value = 0 elif self.strategy == ImputerMode.mean: - self.fill_value = df[self.in_column].mean() + self.fill_value = series.mean() return self def transform(self, df: pd.DataFrame) -> pd.DataFrame: @@ -93,12 +99,6 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: result_df = df.copy() cur_nans = result_df[result_df[self.in_column].isna()].index - # check if all values are nans - if cur_nans.shape[0] == result_df.shape[0] and self.strategy != ImputerMode.zero: - raise ValueError( - f"It isn't possible to make imputation in {self.strategy.value} mode if all values are NaNs" - ) - result_df[self.in_column] = self._fill(result_df[self.in_column]) # restore nans not in self.nan_timestamps @@ -145,8 +145,6 @@ def _fill(self, df: pd.Series) -> pd.Series: df = df.fillna(value=self.fill_value) elif self.strategy == ImputerMode.forward_fill: df = df.fillna(method="ffill") - # very first value or first NaNs should be filled - df = df.fillna(value=0) elif self.strategy == ImputerMode.running_mean: for i, val in enumerate(df): if pd.isnull(val): @@ -158,9 +156,11 @@ def _fill(self, df: pd.Series) -> pd.Series: class TimeSeriesImputerTransform(PerSegmentWrapper): """Transform to fill NaNs in series of a given dataframe. + - It is assumed that given series begins with first non NaN value. + - This transform can't fill NaNs in the future, only on train data. - - This transform can't fill NaNs in non-zero strategy if all values are Nans. In this case exception is raised. - - In 'forward_fill' strategy very first value and first NaNs are replaced with zero. + + - This transform can't fill NaNs if all values are NaNs. In this case exception is raised. Warning ------- diff --git a/tests/test_transforms/test_missing_values/test_impute_transform.py b/tests/test_transforms/test_missing_values/test_impute_transform.py index 420796cfb..7b2f2648a 100644 --- a/tests/test_transforms/test_missing_values/test_impute_transform.py +++ b/tests/test_transforms/test_missing_values/test_impute_transform.py @@ -1,3 +1,5 @@ +from copy import deepcopy + import numpy as np import pandas as pd import pytest @@ -8,6 +10,21 @@ from etna.transforms.missing_values.imputation import _OneSegmentTimeSeriesImputerTransform +@pytest.fixture +def ts_nans_beginning(example_reg_tsds): + """Example dataset with NaNs at the beginning.""" + ts = deepcopy(example_reg_tsds) + + # nans at the beginning (shouldn't be filled) + ts.loc[ts.index[:5], pd.IndexSlice["segment_1", "target"]] = np.NaN + + # nans in the middle (should be filled) + ts.loc[ts.index[8], pd.IndexSlice["segment_1", "target"]] = np.NaN + ts.loc[ts.index[10], pd.IndexSlice["segment_2", "target"]] = np.NaN + ts.loc[ts.index[40], pd.IndexSlice["segment_2", "target"]] = np.NaN + return ts + + def test_wrong_init_one_segment(): """Check that imputer for one segment fails to init with wrong imputing strategy.""" with pytest.raises(ValueError): @@ -39,25 +56,11 @@ def test_all_dates_present_impute_two_segments(all_date_present_df_two_segments: np.testing.assert_array_equal(all_date_present_df_two_segments[segment]["target"], result[segment]["target"]) -def test_all_missing_impute_zero(df_all_missing: pd.DataFrame): - """Check that imputer fills zero value if all values are nans and strategy is zero.""" - imputer = _OneSegmentTimeSeriesImputerTransform(strategy="zero") - result = imputer.fit_transform(df_all_missing) - assert np.all(result == 0) - - -def test_all_missing_impute_zero_two_segments(df_all_missing_two_segments: pd.DataFrame): - """Check that imputer fills zero value if all values are nans and strategy is zero.""" - imputer = TimeSeriesImputerTransform(strategy="zero") - result = imputer.fit_transform(df_all_missing_two_segments) - assert np.all(result == 0) - - -@pytest.mark.parametrize("fill_strategy", ["mean", "running_mean", "forward_fill"]) +@pytest.mark.parametrize("fill_strategy", ["zero", "mean", "running_mean", "forward_fill"]) def test_all_missing_impute_fail(df_all_missing: pd.DataFrame, fill_strategy: str): """Check that imputer can't fill nans if all values are nans.""" imputer = _OneSegmentTimeSeriesImputerTransform(strategy=fill_strategy) - with pytest.raises(ValueError, match="It isn't possible to make imputation"): + with pytest.raises(ValueError, match="Series hasn't non NaN values which means it is empty and can't be filled"): _ = imputer.fit_transform(df_all_missing) @@ -65,7 +68,7 @@ def test_all_missing_impute_fail(df_all_missing: pd.DataFrame, fill_strategy: st def test_all_missing_impute_fail_two_segments(df_all_missing_two_segments: pd.DataFrame, fill_strategy: str): """Check that imputer can't fill nans if all values are nans.""" imputer = TimeSeriesImputerTransform(strategy=fill_strategy) - with pytest.raises(ValueError, match="It isn't possible to make imputation"): + with pytest.raises(ValueError, match="Series hasn't non NaN values which means it is empty and can't be filled"): _ = imputer.fit_transform(df_all_missing_two_segments) @@ -209,7 +212,22 @@ def test_inverse_transform_in_forecast(df_with_missing_range_x_index_two_segment @pytest.mark.parametrize("fill_strategy", ["mean", "zero", "running_mean", "forward_fill"]) -def test_fit_transform_with_nans(fill_strategy, ts_diff_endings): +def test_fit_transform_nans_at_the_beginning(fill_strategy, ts_nans_beginning): + """Check that transform doesn't fill NaNs at the beginning.""" + imputer = TimeSeriesImputerTransform(in_column="target", strategy=fill_strategy) + df_init = ts_nans_beginning.to_pandas() + ts_nans_beginning.fit_transform([imputer]) + df_filled = ts_nans_beginning.to_pandas() + for segment in ts_nans_beginning.segments: + df_segment_init = df_init.loc[:, pd.IndexSlice[segment, "target"]] + df_segment_filled = df_filled.loc[:, pd.IndexSlice[segment, "target"]] + first_valid_index = df_segment_init.first_valid_index() + assert df_segment_init[:first_valid_index].equals(df_segment_filled[:first_valid_index]) + assert not df_segment_filled[first_valid_index:].isna().any() + + +@pytest.mark.parametrize("fill_strategy", ["mean", "zero", "running_mean", "forward_fill"]) +def test_fit_transform_nans_at_the_end(fill_strategy, ts_diff_endings): """Check that transform correctly works with NaNs at the end.""" imputer = TimeSeriesImputerTransform(in_column="target", strategy=fill_strategy) ts_diff_endings.fit_transform([imputer])