tinkoff-ai · Mr-Geekman · Apr 5, 2022 · Apr 1, 2022 · Apr 1, 2022 · Apr 5, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -35,7 +35,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fixed unsafe comparison in plots ([#611](https://github.com/tinkoff-ai/etna/pull/611))
 - Fixed plot_trend does not work with Linear and TheilSen transforms ([#617](https://github.com/tinkoff-ai/etna/pull/617))
 - 
-- 
+- Don't fill NaNs at the beginning of the series in TimeSeriesImputerTransform ([#634](https://github.com/tinkoff-ai/etna/pull/634))
 - 
 
 ## [1.7.0] - 2022-03-16

diff --git a/etna/transforms/missing_values/imputation.py b/etna/transforms/missing_values/imputation.py
@@ -20,9 +20,11 @@ class ImputerMode(str, Enum):
 class _OneSegmentTimeSeriesImputerTransform(Transform):
     """One segment version of transform to fill NaNs in series of a given dataframe.
 
+    - The series is assumed to begin at its first non-NaN value, so leading NaNs are left unfilled.
+
     - This transform can't fill NaNs in the future, only on train data.
-    - This transform can't fill NaNs in non-zero strategy if all values are Nans. In this case exception is raised.
-    - In 'forward_fill' strategy very first value and first NaNs are replaced with zero.
+
+    - This transform can't fill NaNs if all values are NaNs. In this case a ValueError is raised.
     """
 
     def __init__(self, in_column: str = "target", strategy: str = ImputerMode.zero, window: int = -1):
@@ -69,11 +71,15 @@ def fit(self, df: pd.DataFrame) -> "_OneSegmentTimeSeriesImputerTransform":
         self: _OneSegmentTimeSeriesImputerTransform
             fitted preprocess
         """
-        self.nan_timestamps = df[df[self.in_column].isna()].index
+        raw_series = df[self.in_column]
+        if np.all(raw_series.isna()):
+            raise ValueError("Series hasn't non NaN values which means it is empty and can't be filled.")
+        series = raw_series[raw_series.first_valid_index() :]
+        self.nan_timestamps = series[series.isna()].index
         if self.strategy == ImputerMode.zero:
             self.fill_value = 0
         elif self.strategy == ImputerMode.mean:
-            self.fill_value = df[self.in_column].mean()
+            self.fill_value = series.mean()
         return self
 
     def transform(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -93,12 +99,6 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
         result_df = df.copy()
         cur_nans = result_df[result_df[self.in_column].isna()].index
 
-        # check if all values are nans
-        if cur_nans.shape[0] == result_df.shape[0] and self.strategy != ImputerMode.zero:
-            raise ValueError(
-                f"It isn't possible to make imputation in {self.strategy.value} mode if all values are NaNs"
-            )
-
         result_df[self.in_column] = self._fill(result_df[self.in_column])
 
         # restore nans not in self.nan_timestamps
@@ -145,8 +145,6 @@ def _fill(self, df: pd.Series) -> pd.Series:
             df = df.fillna(value=self.fill_value)
         elif self.strategy == ImputerMode.forward_fill:
             df = df.fillna(method="ffill")
-            # very first value or first NaNs should be filled
-            df = df.fillna(value=0)
         elif self.strategy == ImputerMode.running_mean:
             for i, val in enumerate(df):
                 if pd.isnull(val):
@@ -158,9 +156,11 @@ def _fill(self, df: pd.Series) -> pd.Series:
 class TimeSeriesImputerTransform(PerSegmentWrapper):
     """Transform to fill NaNs in series of a given dataframe.
 
+    - The series is assumed to begin at its first non-NaN value, so leading NaNs are left unfilled.
+
     - This transform can't fill NaNs in the future, only on train data.
-    - This transform can't fill NaNs in non-zero strategy if all values are Nans. In this case exception is raised.
-    - In 'forward_fill' strategy very first value and first NaNs are replaced with zero.
+
+    - This transform can't fill NaNs if all values are NaNs. In this case a ValueError is raised.
 
     Warning
     -------

diff --git a/tests/test_transforms/test_missing_values/test_impute_transform.py b/tests/test_transforms/test_missing_values/test_impute_transform.py
@@ -1,3 +1,5 @@
+from copy import deepcopy
+
 import numpy as np
 import pandas as pd
 import pytest
@@ -8,6 +10,21 @@
 from etna.transforms.missing_values.imputation import _OneSegmentTimeSeriesImputerTransform
 
 
+@pytest.fixture
+def ts_nans_beginning(example_reg_tsds):
+    """Example dataset with NaNs at the beginning."""
+    ts = deepcopy(example_reg_tsds)
+
+    # nans at the beginning (shouldn't be filled)
+    ts.loc[ts.index[:5], pd.IndexSlice["segment_1", "target"]] = np.NaN
+
+    # nans in the middle (should be filled)
+    ts.loc[ts.index[8], pd.IndexSlice["segment_1", "target"]] = np.NaN
+    ts.loc[ts.index[10], pd.IndexSlice["segment_2", "target"]] = np.NaN
+    ts.loc[ts.index[40], pd.IndexSlice["segment_2", "target"]] = np.NaN
+    return ts
+
+
 def test_wrong_init_one_segment():
     """Check that imputer for one segment fails to init with wrong imputing strategy."""
     with pytest.raises(ValueError):
@@ -39,33 +56,19 @@ def test_all_dates_present_impute_two_segments(all_date_present_df_two_segments:
         np.testing.assert_array_equal(all_date_present_df_two_segments[segment]["target"], result[segment]["target"])
 
 
-def test_all_missing_impute_zero(df_all_missing: pd.DataFrame):
-    """Check that imputer fills zero value if all values are nans and strategy is zero."""
-    imputer = _OneSegmentTimeSeriesImputerTransform(strategy="zero")
-    result = imputer.fit_transform(df_all_missing)
-    assert np.all(result == 0)
-
-
-def test_all_missing_impute_zero_two_segments(df_all_missing_two_segments: pd.DataFrame):
-    """Check that imputer fills zero value if all values are nans and strategy is zero."""
-    imputer = TimeSeriesImputerTransform(strategy="zero")
-    result = imputer.fit_transform(df_all_missing_two_segments)
-    assert np.all(result == 0)
-
-
-@pytest.mark.parametrize("fill_strategy", ["mean", "running_mean", "forward_fill"])
+@pytest.mark.parametrize("fill_strategy", ["zero", "mean", "running_mean", "forward_fill"])
 def test_all_missing_impute_fail(df_all_missing: pd.DataFrame, fill_strategy: str):
     """Check that imputer can't fill nans if all values are nans."""
     imputer = _OneSegmentTimeSeriesImputerTransform(strategy=fill_strategy)
-    with pytest.raises(ValueError, match="It isn't possible to make imputation"):
+    with pytest.raises(ValueError, match="Series hasn't non NaN values which means it is empty and can't be filled"):
         _ = imputer.fit_transform(df_all_missing)
 
 
 @pytest.mark.parametrize("fill_strategy", ["mean", "running_mean", "forward_fill"])
 def test_all_missing_impute_fail_two_segments(df_all_missing_two_segments: pd.DataFrame, fill_strategy: str):
     """Check that imputer can't fill nans if all values are nans."""
     imputer = TimeSeriesImputerTransform(strategy=fill_strategy)
-    with pytest.raises(ValueError, match="It isn't possible to make imputation"):
+    with pytest.raises(ValueError, match="Series hasn't non NaN values which means it is empty and can't be filled"):
         _ = imputer.fit_transform(df_all_missing_two_segments)
 
 
@@ -209,7 +212,22 @@ def test_inverse_transform_in_forecast(df_with_missing_range_x_index_two_segment
 
 
 @pytest.mark.parametrize("fill_strategy", ["mean", "zero", "running_mean", "forward_fill"])
-def test_fit_transform_with_nans(fill_strategy, ts_diff_endings):
+def test_fit_transform_nans_at_the_beginning(fill_strategy, ts_nans_beginning):
+    """Check that transform doesn't fill NaNs at the beginning."""
+    imputer = TimeSeriesImputerTransform(in_column="target", strategy=fill_strategy)
+    df_init = ts_nans_beginning.to_pandas()
+    ts_nans_beginning.fit_transform([imputer])
+    df_filled = ts_nans_beginning.to_pandas()
+    for segment in ts_nans_beginning.segments:
+        df_segment_init = df_init.loc[:, pd.IndexSlice[segment, "target"]]
+        df_segment_filled = df_filled.loc[:, pd.IndexSlice[segment, "target"]]
+        first_valid_index = df_segment_init.first_valid_index()
+        assert df_segment_init[:first_valid_index].equals(df_segment_filled[:first_valid_index])
+        assert not df_segment_filled[first_valid_index:].isna().any()
+
+
+@pytest.mark.parametrize("fill_strategy", ["mean", "zero", "running_mean", "forward_fill"])
+def test_fit_transform_nans_at_the_end(fill_strategy, ts_diff_endings):
     """Check that transform correctly works with NaNs at the end."""
     imputer = TimeSeriesImputerTransform(in_column="target", strategy=fill_strategy)
     ts_diff_endings.fit_transform([imputer])