From 8f594cfdfc22033d2dd78241918e84282902c1bc Mon Sep 17 00:00:00 2001 From: alex-hse-repository <55380696+alex-hse-repository@users.noreply.github.com> Date: Fri, 18 Feb 2022 18:25:33 +0300 Subject: [PATCH] Change `ProphetModel` and `SARIMAXModel` according to latest architecture (#549) --- CHANGELOG.md | 2 + etna/models/base.py | 16 ++-- etna/models/catboost.py | 27 ++++++ etna/models/holt_winters.py | 18 ++-- etna/models/prophet.py | 103 +++------------------- etna/models/sarimax.py | 108 +++--------------------- etna/models/seasonal_ma.py | 18 ++-- etna/models/sklearn.py | 27 ++++++ etna/pipeline/pipeline.py | 4 +- tests/test_models/test_linear_model.py | 2 +- tests/test_models/test_sarimax_model.py | 2 +- 11 files changed, 112 insertions(+), 215 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e8469a584..53be6bb60 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - - Rename `_CatBoostModel`, `_HoltWintersModel`, `_SklearnModel` ([#543](https://github.com/tinkoff-ai/etna/pull/543)) - +- Rename `_SARIMAXModel` and `_ProphetModel`, make `SARIMAXModel` and `ProphetModel` inherit from `PerSegmentPredictionIntervalModel` ([#549](https://github.com/tinkoff-ai/etna/pull/549)) +- ### Fixed - Fix `TSDataset._update_regressors` logic removing the regressors ([#489](https://github.com/tinkoff-ai/etna/pull/489)) - Fix `TSDataset.info`, `TSDataset.describe` methods ([#519](https://github.com/tinkoff-ai/etna/pull/519)) diff --git a/etna/models/base.py b/etna/models/base.py index d1296393e..92eabe87e 100644 --- a/etna/models/base.py +++ b/etna/models/base.py @@ -10,6 +10,7 @@ from typing import Sequence from typing import Union +import numpy as np import pandas as pd from etna.core.mixins import BaseMixin @@ -181,7 +182,6 @@ def __init__(self, base_model: Any): Internal model which will be used to forecast segments, expected to have fit/predict interface """ self._base_model = base_model - self._segments: Optional[List[str]] = None self._models: Optional[Dict[str, Any]] = None @log_decorator @@ -198,7 +198,6 @@ def fit(self, ts: TSDataset) -> "PerSegmentBaseModel": self: Model after fit """ - self._segments = ts.segments self._models = {} for segment in ts.segments: self._models[segment] = deepcopy(self._base_model) @@ -223,7 +222,7 @@ def get_model(self) -> Dict[str, Any]: dictionary where key is segment and value is internal model """ if self._models is None: - raise ValueError("Can not get the dict with base models from not fitted model!") + raise ValueError("Can not get the dict with base models, the model is not fitted!") return self._models @staticmethod @@ -235,17 +234,12 @@ def _forecast_segment(model: Any, segment: str, ts: TSDataset, *args, **kwargs) dates = segment_features["timestamp"] dates.reset_index(drop=True, inplace=True) segment_predict = model.predict(df=segment_features, *args, **kwargs) - segment_predict = pd.DataFrame({"target": segment_predict}) + if isinstance(segment_predict, np.ndarray): + segment_predict = pd.DataFrame({"target": segment_predict}) segment_predict["segment"] = segment segment_predict["timestamp"] = dates return segment_predict - def _build_models(self): - """Create a dict with models for each segment (if required).""" - self._models = {} - for segment in self._segments: # type: ignore - self._models[segment] = deepcopy(self._base_model) - class PerSegmentModel(PerSegmentBaseModel, ForecastAbstractModel): """Class for holding specific models for per-segment prediction.""" @@ -305,7 +299,7 @@ def __init__(self, base_model: Any): """ super().__init__(base_model=base_model) - @abstractmethod + @log_decorator def forecast( self, ts: TSDataset, prediction_interval: bool = False, quantiles: Sequence[float] = (0.025, 0.975) ) -> TSDataset: diff --git a/etna/models/catboost.py b/etna/models/catboost.py index 445f54b51..7711c3163 100644 --- a/etna/models/catboost.py +++ b/etna/models/catboost.py @@ -36,6 +36,20 @@ def __init__( self._categorical = None def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_CatBoostAdapter": + """ + Fit Catboost model. + + Parameters + ---------- + df: + Features dataframe + regressors: + List of the columns with regressors(ignored in this model) + Returns + ------- + self: + Fitted model + """ features = df.drop(columns=["timestamp", "target"]) target = df["target"] self._categorical = features.select_dtypes(include=["category"]).columns.to_list() @@ -44,6 +58,19 @@ def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_CatBoostAdapter": return self def predict(self, df: pd.DataFrame) -> np.ndarray: + """ + Compute predictions from a Catboost model. + + Parameters + ---------- + df: + Features dataframe + + Returns + ------- + y_pred: + Array with predictions + """ features = df.drop(columns=["timestamp", "target"]) predict_pool = Pool(features, cat_features=self._categorical) pred = self.model.predict(predict_pool) diff --git a/etna/models/holt_winters.py b/etna/models/holt_winters.py index 6182acbc5..2d8ebe465 100644 --- a/etna/models/holt_winters.py +++ b/etna/models/holt_winters.py @@ -7,6 +7,7 @@ from typing import Tuple from typing import Union +import numpy as np import pandas as pd from statsmodels.tsa.holtwinters import ExponentialSmoothing from statsmodels.tsa.holtwinters import HoltWintersResults @@ -171,17 +172,18 @@ def __init__( def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_HoltWintersAdapter": """ - Fits a Holt-Winters' model. + Fit Holt-Winters' model. Parameters ---------- df: Features dataframe - + regressors: + List of the columns with regressors(ignored in this model) Returns ------- - self: _HoltWintersAdapter - fitted model + self: + Fitted model """ self._check_df(df) @@ -213,7 +215,7 @@ def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_HoltWintersAdapter": ) return self - def predict(self, df: pd.DataFrame) -> pd.Series: + def predict(self, df: pd.DataFrame) -> np.ndarray: """ Compute predictions from a Holt-Winters' model. @@ -224,15 +226,15 @@ def predict(self, df: pd.DataFrame) -> pd.Series: Returns ------- - y_pred: pd.Series - Series with predictions + y_pred: + Array with predictions """ if self._result is None or self._model is None: raise ValueError("This model is not fitted! Fit the model before calling predict method!") self._check_df(df) forecast = self._result.predict(start=df["timestamp"].min(), end=df["timestamp"].max()) - y_pred = pd.Series(data=forecast.values, name="target") + y_pred = forecast.values return y_pred def _check_df(self, df: pd.DataFrame): diff --git a/etna/models/prophet.py b/etna/models/prophet.py index 8d0de8454..18b34c780 100644 --- a/etna/models/prophet.py +++ b/etna/models/prophet.py @@ -9,15 +9,13 @@ import pandas as pd from etna import SETTINGS -from etna.datasets import TSDataset -from etna.models.base import PerSegmentModel -from etna.models.base import log_decorator +from etna.models.base import PerSegmentPredictionIntervalModel if SETTINGS.prophet_required: from prophet import Prophet -class _ProphetModel: +class _ProphetAdapter: """Class for holding Prophet model.""" def __init__( @@ -83,7 +81,7 @@ def __init__( self.regressor_columns: Optional[List[str]] = None - def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_ProphetModel": + def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_ProphetAdapter": """ Fits a Prophet model. @@ -104,9 +102,9 @@ def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_ProphetModel": self.model.fit(prophet_df) return self - def predict(self, df: pd.DataFrame, prediction_interval: bool, quantiles: Sequence[float]): + def predict(self, df: pd.DataFrame, prediction_interval: bool, quantiles: Sequence[float]) -> pd.DataFrame: """ - Compute Prophet predictions. + Compute predictions from a Prophet model. Parameters ---------- @@ -119,7 +117,7 @@ def predict(self, df: pd.DataFrame, prediction_interval: bool, quantiles: Sequen Returns ------- - y_pred: pd.DataFrame + y_pred: DataFrame with predictions """ df = df.reset_index() @@ -134,10 +132,14 @@ def predict(self, df: pd.DataFrame, prediction_interval: bool, quantiles: Sequen for quantile in quantiles: percentile = quantile * 100 y_pred[f"yhat_{quantile:.4g}"] = self.model.percentile(sim_values["yhat"], percentile, axis=1) + rename_dict = { + column: column.replace("yhat", "target") for column in y_pred.columns if column.startswith("yhat") + } + y_pred = y_pred.rename(rename_dict, axis=1) return y_pred -class ProphetModel(PerSegmentModel): +class ProphetModel(PerSegmentPredictionIntervalModel): """Class for holding Prophet model. Examples @@ -296,7 +298,7 @@ def __init__( self.additional_seasonality_params = additional_seasonality_params super(ProphetModel, self).__init__( - base_model=_ProphetModel( + base_model=_ProphetAdapter( growth=self.growth, n_changepoints=self.n_changepoints, changepoints=self.changepoints, @@ -316,84 +318,3 @@ def __init__( additional_seasonality_params=self.additional_seasonality_params, ) ) - - @log_decorator - def fit(self, ts: TSDataset) -> "ProphetModel": - """Fit model.""" - self._segments = ts.segments - self._build_models() - - for segment in self._segments: - model = self._models[segment] # type: ignore - segment_features = ts[:, segment, :] - segment_features = segment_features.dropna() - segment_features = segment_features.droplevel("segment", axis=1) - segment_features = segment_features.reset_index() - model.fit(df=segment_features, regressors=ts.regressors) - return self - - @staticmethod - def _forecast_one_segment( - model, - segment: Union[str, List[str]], - ts: TSDataset, - prediction_interval: bool, - quantiles: Sequence[float], - ) -> pd.DataFrame: - segment_features = ts[:, segment, :] - segment_features = segment_features.droplevel("segment", axis=1) - segment_features = segment_features.reset_index() - dates = segment_features["timestamp"] - dates.reset_index(drop=True, inplace=True) - segment_predict = model.predict( - df=segment_features, prediction_interval=prediction_interval, quantiles=quantiles - ) - rename_dict = { - column: column.replace("yhat", "target") for column in segment_predict.columns if column.startswith("yhat") - } - segment_predict = segment_predict.rename(rename_dict, axis=1) - segment_predict["segment"] = segment - segment_predict["timestamp"] = dates - return segment_predict - - @log_decorator - def forecast( - self, ts: TSDataset, prediction_interval: bool = False, quantiles: Sequence[float] = (0.025, 0.975) - ) -> TSDataset: - """Make predictions. - - Parameters - ---------- - ts: - Dataframe with features - prediction_interval: - If True returns prediction interval for forecast - quantiles: - Levels of prediction distribution. By default 2.5% and 97.5% taken to form a 95% prediction interval - - Returns - ------- - TSDataset - Models result - """ - if self._segments is None: - raise ValueError("The model is not fitted yet, use fit() to train it") - - result_list = list() - for segment in self._segments: - model = self._models[segment] # type: ignore - - segment_predict = self._forecast_one_segment(model, segment, ts, prediction_interval, quantiles) - result_list.append(segment_predict) - - # need real case to test - result_df = pd.concat(result_list, ignore_index=True) - result_df = result_df.set_index(["timestamp", "segment"]) - df = ts.to_pandas(flatten=True) - df = df.set_index(["timestamp", "segment"]) - df = df.combine_first(result_df).reset_index() - - df = TSDataset.to_dataset(df) - ts.df = df - ts.inverse_transform() - return ts diff --git a/etna/models/sarimax.py b/etna/models/sarimax.py index 13482bce4..47cd59422 100644 --- a/etna/models/sarimax.py +++ b/etna/models/sarimax.py @@ -4,15 +4,12 @@ from typing import Optional from typing import Sequence from typing import Tuple -from typing import Union import pandas as pd from statsmodels.tools.sm_exceptions import ValueWarning from statsmodels.tsa.statespace.sarimax import SARIMAX -from etna.datasets import TSDataset -from etna.models.base import PerSegmentModel -from etna.models.base import log_decorator +from etna.models.base import PerSegmentPredictionIntervalModel warnings.filterwarnings( message="No frequency information was provided, so inferred frequency .* will be used", @@ -22,7 +19,7 @@ ) -class _SARIMAXModel: +class _SARIMAXAdapter: """ Class for holding Sarimax model. @@ -167,7 +164,7 @@ def __init__( self._result: Optional[SARIMAX] = None self.regressor_columns: Optional[List[str]] = None - def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_SARIMAXModel": + def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_SARIMAXAdapter": """ Fits a SARIMAX model. @@ -179,8 +176,8 @@ def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_SARIMAXModel": List of the columns with regressors Returns ------- - self: SARIMAX - fitted model + self: + Fitted model """ self.regressor_columns = regressors categorical_cols = df.select_dtypes(include=["category"]).columns.tolist() @@ -239,7 +236,7 @@ def predict(self, df: pd.DataFrame, prediction_interval: bool, quantiles: Sequen Returns ------- - y_pred: pd.DataFrame + y_pred: DataFrame with predictions """ if self._result is None or self._model is None: @@ -278,7 +275,12 @@ def predict(self, df: pd.DataFrame, prediction_interval: bool, quantiles: Sequen ) y_pred = pd.DataFrame(forecast.predicted_mean) y_pred.rename({"predicted_mean": "mean"}, axis=1, inplace=True) - return y_pred.reset_index(drop=True, inplace=False) + y_pred = y_pred.reset_index(drop=True, inplace=False) + rename_dict = { + column: column.replace("mean", "target") for column in y_pred.columns if column.startswith("mean") + } + y_pred = y_pred.rename(rename_dict, axis=1) + return y_pred def _check_df(self, df: pd.DataFrame, horizon: Optional[int] = None): if self.regressor_columns is None: @@ -306,7 +308,7 @@ def _select_regressors(self, df: pd.DataFrame) -> Optional[pd.DataFrame]: return exog_future -class SARIMAXModel(PerSegmentModel): +class SARIMAXModel(PerSegmentPredictionIntervalModel): """ Class for holding Sarimax model. @@ -448,7 +450,7 @@ def __init__( self.validate_specification = validate_specification self.kwargs = kwargs super(SARIMAXModel, self).__init__( - base_model=_SARIMAXModel( + base_model=_SARIMAXAdapter( order=self.order, seasonal_order=self.seasonal_order, trend=self.trend, @@ -469,85 +471,3 @@ def __init__( **self.kwargs, ) ) - - @log_decorator - def fit(self, ts: TSDataset) -> "SARIMAXModel": - """Fit model.""" - self._segments = ts.segments - self._build_models() - - for segment in self._segments: - model = self._models[segment] # type: ignore - segment_features = ts[:, segment, :] - segment_features = segment_features.dropna() - segment_features = segment_features.droplevel("segment", axis=1) - segment_features = segment_features.reset_index() - model.fit(df=segment_features, regressors=ts.regressors) - return self - - @staticmethod - def _forecast_one_segment( - model, - segment: Union[str, List[str]], - ts: TSDataset, - prediction_interval: bool, - quantiles: Sequence[float], - ) -> pd.DataFrame: - segment_features = ts[:, segment, :] - segment_features = segment_features.droplevel("segment", axis=1) - segment_features = segment_features.reset_index() - dates = segment_features["timestamp"] - dates.reset_index(drop=True, inplace=True) - segment_predict = model.predict( - df=segment_features, prediction_interval=prediction_interval, quantiles=quantiles - ) - rename_dict = { - column: column.replace("mean", "target") for column in segment_predict.columns if column.startswith("mean") - } - segment_predict = segment_predict.rename(rename_dict, axis=1) - segment_predict["segment"] = segment - segment_predict["timestamp"] = dates - return segment_predict - - @log_decorator - def forecast( - self, ts: TSDataset, prediction_interval: bool = False, quantiles: Sequence[float] = (0.025, 0.975) - ) -> TSDataset: - """Make predictions. - - Parameters - ---------- - ts: - Dataframe with features - prediction_interval: - If True returns prediction interval for forecast - quantiles: - Levels of prediction distribution. By default 2.5% and 97.5% taken to form a 95% prediction interval - - Returns - ------- - pd.DataFrame - Models result - """ - if self._segments is None: - raise ValueError("The model is not fitted yet, use fit() to train it") - - result_list = list() - for segment in self._segments: - model = self._models[segment] # type: ignore - - segment_predict = self._forecast_one_segment(model, segment, ts, prediction_interval, quantiles) - result_list.append(segment_predict) - - # need real case to test - result_df = pd.concat(result_list, ignore_index=True) - result_df = result_df.set_index(["timestamp", "segment"]) - df = ts.to_pandas(flatten=True) - df = df.set_index(["timestamp", "segment"]) - # N.B. inplace forecast will not change target values, because `combine_first` only fill nan values - df = df.combine_first(result_df).reset_index() - - df = TSDataset.to_dataset(df) - ts.df = df - ts.inverse_transform() - return ts diff --git a/etna/models/seasonal_ma.py b/etna/models/seasonal_ma.py index cda2800b7..ad5834cb7 100644 --- a/etna/models/seasonal_ma.py +++ b/etna/models/seasonal_ma.py @@ -36,17 +36,19 @@ def __init__(self, window: int = 5, seasonality: int = 7): def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_SeasonalMovingAverageModel": """ - Fitting simple model on given series. + Fit SeasonalMovingAverage model. Parameters ---------- df: pd.DataFrame Ignored. Needed for compatibility with AutoRegressorForecaster. + regressors: + List of the columns with regressors(ignored in this model) Returns ------- - self: SeasonalMovingAverageModel - fitted model + self: + Fitted model """ if set(df.columns) != {"timestamp", "target"}: warnings.warn( @@ -65,9 +67,9 @@ def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_SeasonalMovingAverag self.name = targets.name return self - def predict(self, df: pd.DataFrame) -> pd.Series: + def predict(self, df: pd.DataFrame) -> np.ndarray: """ - Calculate forecast. + Compute predictions from a SeasonalMovingAverage model. Parameters ---------- @@ -77,13 +79,15 @@ def predict(self, df: pd.DataFrame) -> pd.Series: Returns ------- - pd.Series with forecast. + y_pred: + Array with predictions. """ horizon = len(df) res = np.append(self.series, np.zeros(horizon)) for i in range(self.shift, len(res)): res[i] = res[i - self.shift : i : self.seasonality].mean() - return pd.Series(data=res[-horizon:], name=self.name) + y_pred = res[-horizon:] + return y_pred class SeasonalMovingAverageModel(PerSegmentModel): diff --git a/etna/models/sklearn.py b/etna/models/sklearn.py index 4d786090d..6ab65a298 100644 --- a/etna/models/sklearn.py +++ b/etna/models/sklearn.py @@ -17,6 +17,20 @@ def __init__(self, regressor: RegressorMixin): self.regressor_columns: Optional[List[str]] = None def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_SklearnAdapter": + """ + Fit Sklearn model. + + Parameters + ---------- + df: + Features dataframe + regressors: + List of the columns with regressors + Returns + ------- + self: + Fitted model + """ self.regressor_columns = regressors try: features = df[self.regressor_columns].apply(pd.to_numeric) @@ -27,6 +41,19 @@ def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_SklearnAdapter": return self def predict(self, df: pd.DataFrame) -> np.ndarray: + """ + Compute predictions from a Sklearn model. + + Parameters + ---------- + df: + Features dataframe + + Returns + ------- + y_pred: + Array with predictions + """ try: features = df[self.regressor_columns].apply(pd.to_numeric) except ValueError: diff --git a/etna/pipeline/pipeline.py b/etna/pipeline/pipeline.py index 01e0b94af..c74d1bc99 100644 --- a/etna/pipeline/pipeline.py +++ b/etna/pipeline/pipeline.py @@ -1,4 +1,3 @@ -import inspect from copy import deepcopy from enum import Enum from typing import Any @@ -22,6 +21,7 @@ from etna.metrics import Metric from etna.metrics import MetricAggregationMode from etna.models.base import Model +from etna.models.base import PerSegmentPredictionIntervalModel from etna.pipeline.base import BasePipeline from etna.transforms.base import Transform @@ -153,7 +153,7 @@ def forecast(self, prediction_interval: bool = False) -> TSDataset: future = self.ts.make_future(self.horizon) if prediction_interval: - if "prediction_interval" in inspect.signature(self.model.forecast).parameters: + if isinstance(self.model, PerSegmentPredictionIntervalModel): predictions = self.model.forecast( ts=future, prediction_interval=prediction_interval, quantiles=self.quantiles ) diff --git a/tests/test_models/test_linear_model.py b/tests/test_models/test_linear_model.py index 25ff7ed5b..2451d5079 100644 --- a/tests/test_models/test_linear_model.py +++ b/tests/test_models/test_linear_model.py @@ -86,7 +86,7 @@ def test_not_fitted(model, linear_segments_ts_unique): train.fit_transform([lags]) to_forecast = train.make_future(3) - with pytest.raises(ValueError, match="not fitted model!"): + with pytest.raises(ValueError, match="model is not fitted!"): model.forecast(to_forecast) diff --git a/tests/test_models/test_sarimax_model.py b/tests/test_models/test_sarimax_model.py index 1ec9e9f01..74600bda8 100644 --- a/tests/test_models/test_sarimax_model.py +++ b/tests/test_models/test_sarimax_model.py @@ -101,5 +101,5 @@ def test_prediction_interval_run_infuture(example_tsds): def test_forecast_raise_error_if_not_fitted(example_tsds): """Test that SARIMAX raise error when calling forecast without being fit.""" model = SARIMAXModel() - with pytest.raises(ValueError, match="The model is not fitted yet"): + with pytest.raises(ValueError, match="model is not fitted!"): _ = model.forecast(ts=example_tsds)