Merge branch 'master' into fix-pl-trainer-depreceated-flag

tinkoff-ai · Aug 22, 2022 · 7ea6029 · 7ea6029
2 parents 14c6e9b + 61d9a80
commit 7ea6029
Show file tree

Hide file tree

Showing 16 changed files with 494 additions and 69 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,7 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - 
 - 
 - 
-- 
+- Add `ChangePointsSegmentationTransform`, `RupturesChangePointsModel` ([#821](https://github.com/tinkoff-ai/etna/issues/821))
 - 
 - 
 ### Changed
@@ -35,6 +35,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - 
 - 
 ### Fixed
+- Type hints for external users by [PEP 561](https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-library-stubs-or-py-typed-marker) ([#868](https://github.com/tinkoff-ai/etna/pull/868))
 - Type hints for `Pipeline.model` match `models.nn`([#768](https://github.com/tinkoff-ai/etna/pull/840))
 - 
 - 

diff --git a/etna/analysis/change_points_trend/search.py b/etna/analysis/change_points_trend/search.py
@@ -1,35 +1,12 @@
 from typing import Dict
 from typing import List
 
-import numpy as np
 import pandas as pd
 from ruptures.base import BaseEstimator
-from ruptures.costs import CostLinear
 
 from etna.datasets import TSDataset
 
 
-def _prepare_signal(series: pd.Series, model: BaseEstimator) -> np.ndarray:
-    """Prepare series for change point model."""
-    signal = series.to_numpy()
-    if isinstance(model.cost, CostLinear):
-        signal = signal.reshape((-1, 1))
-    return signal
-
-
-def _find_change_points_segment(
-    series: pd.Series, change_point_model: BaseEstimator, **model_predict_params
-) -> List[pd.Timestamp]:
-    """Find trend change points within one segment."""
-    signal = _prepare_signal(series=series, model=change_point_model)
-    timestamp = series.index
-    change_point_model.fit(signal=signal)
-    # last point in change points is the first index after the series
-    change_points_indices = change_point_model.predict(**model_predict_params)[:-1]
-    change_points = [timestamp[idx] for idx in change_points_indices]
-    return change_points
-
-
 def find_change_points(
     ts: TSDataset, in_column: str, change_point_model: BaseEstimator, **model_predict_params
 ) -> Dict[str, List[pd.Timestamp]]:
@@ -51,13 +28,12 @@ def find_change_points(
     Dict[str, List[pd.Timestamp]]
         dictionary with list of trend change points for each segment
     """
+    from etna.transforms.decomposition.base_change_points import RupturesChangePointsModel
+
     result: Dict[str, List[pd.Timestamp]] = {}
     df = ts.to_pandas()
+    ruptures = RupturesChangePointsModel(change_point_model, **model_predict_params)
     for segment in ts.segments:
         df_segment = df[segment]
-        raw_series = df_segment[in_column]
-        series = raw_series.loc[raw_series.first_valid_index() : raw_series.last_valid_index()]
-        result[segment] = _find_change_points_segment(
-            series=series, change_point_model=change_point_model, **model_predict_params
-        )
+        result[segment] = ruptures.get_change_points(df=df_segment, in_column=in_column)
     return result
diff --git a/etna/libs/__init__.py b/etna/libs/__init__.py
diff --git a/etna/libs/pmdarima_utils/arima.pyi b/etna/libs/pmdarima_utils/arima.pyi
@@ -0,0 +1,18 @@
+from typing import Any
+
+from numpy import ndarray
+
+def ARMAtoMA(
+    ar: ndarray, 
+    ma: ndarray, 
+    max_deg: int,
+    ) -> ndarray: ...
+
+def seasonal_prediction_with_confidence(
+    arima_res: Any, 
+    start: Any, 
+    end: Any, 
+    X: Any, 
+    alpha: Any, 
+    **kwargs: Any,
+    ) -> Any: ...
diff --git a/etna/libs/tsfresh/distribution.pyi b/etna/libs/tsfresh/distribution.pyi
@@ -0,0 +1 @@
+def initialize_warnings_in_workers(show_warnings: bool) -> None: ...
diff --git a/etna/libs/tsfresh/relevance.pyi b/etna/libs/tsfresh/relevance.pyi
@@ -0,0 +1,25 @@
+from typing import List
+from typing import Optional
+
+import pandas as pd
+
+def calculate_relevance_table(
+    X: pd.DataFrame, 
+    y: pd.Series, 
+    ml_task: str = ..., 
+    multiclass: bool = ..., 
+    n_significant: int = ..., 
+    n_jobs: int = ..., 
+    show_warnings: bool = ..., 
+    chunksize: Optional[int] = ..., 
+    test_for_binary_target_binary_feature: str = ...,
+    test_for_binary_target_real_feature: str = ...,
+    test_for_real_target_binary_feature: str = ..., 
+    test_for_real_target_real_feature: str = ..., 
+    fdr_level: float = ..., 
+    hypotheses_independent: bool = ...,
+    ) -> pd.DataFrame: ...
+
+def infer_ml_task(y: pd.Series) -> str: ...
+def combine_relevance_tables(relevance_tables: List[pd.DataFrame]) -> pd.DataFrame: ...
+def get_feature_type(feature_column: pd.Series) -> str: ...
diff --git a/etna/libs/tsfresh/significance_tests.pyi b/etna/libs/tsfresh/significance_tests.pyi
@@ -0,0 +1,6 @@
+import pandas as pd
+
+def target_binary_feature_binary_test(x: pd.Series, y: pd.Series) -> float: ...
+def target_binary_feature_real_test(x: pd.Series, y: pd.Series, test: str) -> float: ...
+def target_real_feature_binary_test(x: pd.Series, y: pd.Series) -> float: ...
+def target_real_feature_real_test(x: pd.Series, y: pd.Series) -> float: ...
diff --git a/etna/py.typed b/etna/py.typed
diff --git a/etna/transforms/__init__.py b/etna/transforms/__init__.py
@@ -1,6 +1,7 @@
 from etna.transforms.base import PerSegmentWrapper
 from etna.transforms.base import Transform
 from etna.transforms.decomposition import BinsegTrendTransform
+from etna.transforms.decomposition import ChangePointsSegmentationTransform
 from etna.transforms.decomposition import ChangePointsTrendTransform
 from etna.transforms.decomposition import LinearTrendTransform
 from etna.transforms.decomposition import STLTransform

diff --git a/etna/transforms/decomposition/__init__.py b/etna/transforms/decomposition/__init__.py
@@ -1,4 +1,6 @@
+from etna.transforms.decomposition.base_change_points import RupturesChangePointsModel
 from etna.transforms.decomposition.binseg import BinsegTrendTransform
+from etna.transforms.decomposition.change_points_segmentation import ChangePointsSegmentationTransform
 from etna.transforms.decomposition.change_points_trend import ChangePointsTrendTransform
 from etna.transforms.decomposition.detrend import LinearTrendTransform
 from etna.transforms.decomposition.detrend import TheilSenTrendTransform

diff --git a/etna/transforms/decomposition/base_change_points.py b/etna/transforms/decomposition/base_change_points.py
@@ -0,0 +1,108 @@
+from abc import ABC
+from abc import abstractmethod
+from typing import List
+from typing import Tuple
+from typing import Type
+
+import pandas as pd
+from ruptures.base import BaseEstimator
+from ruptures.costs import CostLinear
+from sklearn.base import RegressorMixin
+
+TTimestampInterval = Tuple[pd.Timestamp, pd.Timestamp]
+TDetrendModel = Type[RegressorMixin]
+
+
+class BaseChangePointsModelAdapter(ABC):
+    """Base class for adapters that expose change point models through one interface."""
+
+    @abstractmethod
+    def get_change_points(self, df: pd.DataFrame, in_column: str) -> List[pd.Timestamp]:
+        """Find change points within one segment.
+
+        Parameters
+        ----------
+        df:
+            dataframe indexed with timestamp
+        in_column:
+            name of column to get change points
+
+        Returns
+        -------
+        change points:
+            change point timestamps
+        """
+        pass
+
+    @staticmethod
+    def _build_intervals(change_points: List[pd.Timestamp]) -> List[TTimestampInterval]:
+        """Create list of stable intervals from list of change points.
+
+        The passed list is left untouched; the result always starts at
+        ``pd.Timestamp.min`` and ends at ``pd.Timestamp.max``.
+        """
+        # Build a new list instead of extending the argument in place, so the
+        # sentinel bounds never leak into the caller's list of change points.
+        bounds = sorted([*change_points, pd.Timestamp.min, pd.Timestamp.max])
+        # Pair every bound with its successor: (b0, b1), (b1, b2), ...
+        intervals = list(zip(bounds[:-1], bounds[1:]))
+        return intervals
+
+    def get_change_points_intervals(self, df: pd.DataFrame, in_column: str) -> List[TTimestampInterval]:
+        """Find change point intervals in given dataframe and column.
+
+        Parameters
+        ----------
+        df:
+            dataframe indexed with timestamp
+        in_column:
+            name of column to get change points
+
+        Returns
+        -------
+        :
+            change points intervals
+        """
+        change_points = self.get_change_points(df=df, in_column=in_column)
+        intervals = self._build_intervals(change_points=change_points)
+        return intervals
+
+
+class RupturesChangePointsModel(BaseChangePointsModelAdapter):
+    """Adapter that runs a ``ruptures`` estimator to detect change points."""
+
+    def __init__(self, change_point_model: BaseEstimator, **change_point_model_predict_params):
+        """Init RupturesChangePointsModel.
+
+        Parameters
+        ----------
+        change_point_model:
+            model to get change points
+        change_point_model_predict_params:
+            params for ``change_point_model.predict`` method
+        """
+        self.change_point_model = change_point_model
+        self.model_predict_params = change_point_model_predict_params
+
+    def get_change_points(self, df: pd.DataFrame, in_column: str) -> List[pd.Timestamp]:
+        """Find change points within one segment.
+
+        Parameters
+        ----------
+        df:
+            dataframe indexed with timestamp
+        in_column:
+            name of column to get change points
+
+        Returns
+        -------
+        change points:
+            change point timestamps
+
+        Raises
+        ------
+        ValueError
+            If the column has NaNs between its first and last valid values
+        """
+        # Trim leading and trailing NaNs; anything left inside is an error.
+        first_valid = df[in_column].first_valid_index()
+        last_valid = df[in_column].last_valid_index()
+        series = df.loc[first_valid:last_valid, in_column]
+        if series.isnull().values.any():
+            raise ValueError("The input column contains NaNs in the middle of the series! Try to use the imputer.")
+
+        model = self.change_point_model
+        signal = series.to_numpy()
+        if isinstance(model.cost, CostLinear):
+            # the linear cost gets the signal as a single-column 2D array
+            signal = signal.reshape((-1, 1))
+        model.fit(signal=signal)
+        breakpoints = model.predict(**self.model_predict_params)
+        index = series.index
+        # last point in change points is the first index after the series
+        return [index[position] for position in breakpoints[:-1]]
diff --git a/etna/transforms/decomposition/change_points_segmentation.py b/etna/transforms/decomposition/change_points_segmentation.py
@@ -0,0 +1,121 @@
+from typing import List
+from typing import Optional
+
+import pandas as pd
+
+from etna.transforms.base import FutureMixin
+from etna.transforms.base import PerSegmentWrapper
+from etna.transforms.base import Transform
+from etna.transforms.decomposition.base_change_points import BaseChangePointsModelAdapter
+from etna.transforms.decomposition.base_change_points import TTimestampInterval
+
+
+class _OneSegmentChangePointsSegmentationTransform(Transform):
+    """Label each timestamp of one segment with the number of its change point interval."""
+
+    def __init__(self, in_column: str, out_column: str, change_point_model: BaseChangePointsModelAdapter):
+        """Init _OneSegmentChangePointsSegmentationTransform.
+
+        Parameters
+        ----------
+        in_column:
+            name of column to apply transform to
+        out_column:
+            result column name
+        change_point_model:
+            model to get change points
+        """
+        self.in_column = in_column
+        self.out_column = out_column
+        # filled by ``fit``; ``None`` means the transform is not fitted yet
+        self.intervals: Optional[List[TTimestampInterval]] = None
+        self.change_point_model = change_point_model
+
+    def _fill_per_interval(self, series: pd.Series) -> pd.Series:
+        """Map every timestamp of ``series`` to the number of the fitted interval it falls in.
+
+        Raises
+        ------
+        ValueError
+            If the transform is not fitted
+        """
+        if self.intervals is None:
+            raise ValueError("Transform is not fitted! Fit the Transform before calling transform method.")
+        # Explicit dtype: a data-less Series without one warns on pandas 1.x
+        # and its default dtype depends on the pandas version.
+        result_series = pd.Series(index=series.index, dtype="float64")
+        for k, interval in enumerate(self.intervals):
+            # Label-based slicing includes both endpoints, so a timestamp equal
+            # to a change point is overwritten by the later interval's number.
+            tmp_series = series[interval[0] : interval[1]]
+            if tmp_series.empty:
+                continue
+            result_series[tmp_series.index] = k
+        return result_series.astype(int).astype("category")
+
+    def fit(self, df: pd.DataFrame) -> "_OneSegmentChangePointsSegmentationTransform":
+        """Fit _OneSegmentChangePointsSegmentationTransform: find change points in ``df`` and build intervals.
+
+        Parameters
+        ----------
+        df:
+            one segment dataframe indexed with timestamp
+
+        Returns
+        -------
+        :
+            instance with trained change points
+
+        Raises
+        ------
+        ValueError
+            If series contains NaNs in the middle
+        """
+        self.intervals = self.change_point_model.get_change_points_intervals(df=df, in_column=self.in_column)
+        return self
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Split df to intervals.
+
+        The new column is written into ``df`` itself and the same object is returned.
+
+        Parameters
+        ----------
+        df:
+            one segment dataframe
+
+        Returns
+        -------
+        df:
+            df with new column
+        """
+        series = df[self.in_column]
+        result_series = self._fill_per_interval(series=series)
+        df.loc[:, self.out_column] = result_series
+        return df
+
+
+class ChangePointsSegmentationTransform(PerSegmentWrapper, FutureMixin):
+    """ChangePointsSegmentationTransform wraps ``_OneSegmentChangePointsSegmentationTransform`` in ``PerSegmentWrapper``.
+
+    Warning
+    -------
+    This transform can suffer from look-ahead bias. For transforming data at some timestamp
+    it uses information from the whole train part.
+    """
+
+    def __init__(
+        self,
+        in_column: str,
+        change_point_model: BaseChangePointsModelAdapter,
+        out_column: Optional[str] = None,
+    ):
+        """Init ChangePointsSegmentationTransform.
+
+        Parameters
+        ----------
+        in_column:
+            name of column to fit change point model
+        out_column:
+            result column name. If not given use ``self.__repr__()``
+        change_point_model:
+            model to get change points
+        """
+        self.in_column = in_column
+        self.out_column = out_column
+        self.change_point_model = change_point_model
+        # The default name is taken from repr(self) after the attributes above
+        # are assigned -- NOTE(review): presumably repr reads them; confirm in the base mixin.
+        if self.out_column is None:
+            self.out_column = repr(self)
+        super().__init__(
+            transform=_OneSegmentChangePointsSegmentationTransform(
+                in_column=self.in_column,
+                out_column=self.out_column,
+                change_point_model=self.change_point_model,
+            )
+        )