diff --git a/CHANGELOG.md b/CHANGELOG.md index 01e9f5f39..0ecf93714 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,7 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - - - -- +- Fix behavior of SARIMAXModel if simple_differencing=True is set ([#837](https://github.com/tinkoff-ai/etna/pull/837)) - - - diff --git a/etna/libs/pmdarima_utils/__init__.py b/etna/libs/pmdarima_utils/__init__.py new file mode 100644 index 000000000..bff45169f --- /dev/null +++ b/etna/libs/pmdarima_utils/__init__.py @@ -0,0 +1 @@ +from etna.libs.pmdarima_utils.arima import seasonal_prediction_with_confidence diff --git a/etna/libs/pmdarima_utils/arima.py b/etna/libs/pmdarima_utils/arima.py new file mode 100644 index 000000000..ae7bdb3be --- /dev/null +++ b/etna/libs/pmdarima_utils/arima.py @@ -0,0 +1,153 @@ +""" +MIT License + +Copyright (c) 2017 Taylor G Smith + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +# Note: Copied from pmdarima package (https://github.com/blue-yonder/tsfresh/blob/https://github.com/alkaline-ml/pmdarima/blob/v1.8.5/pmdarima/arima/arima.py) + +import numpy as np +import numpy.polynomial.polynomial as np_polynomial +from sklearn.utils.validation import check_array +from pmdarima.utils import diff +from pmdarima.utils import diff_inv +from pmdarima.utils import check_endog + + +def ARMAtoMA(ar, ma, max_deg): + r""" + Convert ARMA coefficients to infinite MA coefficients. + Compute coefficients of MA model equivalent to given ARMA model. + MA coefficients are cut off at max_deg. + The same function as ARMAtoMA() in stats library of R + Parameters + ---------- + ar : array-like, shape=(n_orders,) + The array of AR coefficients. + ma : array-like, shape=(n_orders,) + The array of MA coefficients. + max_deg : int + Coefficients are computed up to the order of max_deg. + Returns + ------- + np.ndarray, shape=(max_deg,) + Equivalent MA coefficients. + Notes + ----- + Here is the derivation. Suppose ARMA model is defined as + .. math:: + x_t - ar_1*x_{t-1} - ar_2*x_{t-2} - ... - ar_p*x_{t-p}\\ + = e_t + ma_1*e_{t-1} + ma_2*e_{t-2} + ... + ma_q*e_{t-q} + namely + .. math:: + (1 - \sum_{i=1}^p[ar_i*B^i]) x_t = (1 + \sum_{i=1}^q[ma_i*B^i]) e_t + where :math:`B` is a backward operator. + Equivalent MA model is + .. math:: + x_t = (1 - \sum_{i=1}^p[ar_i*B^i])^{-1}\\ + * (1 + \sum_{i=1}^q[ma_i*B^i]) e_t\\ + = (1 + \sum_{i=1}[ema_i*B^i]) e_t + where :math:``ema_i`` is a coefficient of equivalent MA model. + The :math:``ema_i`` satisfies + .. math:: + (1 - \sum_{i=1}^p[ar_i*B^i]) * (1 + \sum_{i=1}[ema_i*B^i]) \\ + = 1 + \sum_{i=1}^q[ma_i*B^i] + thus + .. math:: + \sum_{i=1}[ema_i*B^i] = \sum_{i=1}^p[ar_i*B^i] \\ + + \sum_{i=1}^p[ar_i*B^i] * \sum_{j=1}[ema_j*B^j] \\ + + \Sum_{i=1}^q[ma_i*B^i] + therefore + .. math:: + ema_i = ar_i (but 0 if i>p) \\ + + \Sum_{j=1}^{min(i-1,p)}[ar_j*ema_{i-j}] + ma_i(but 0 if i>q) \\ + = \sum_{j=1}{min(i,p)}[ar_j*ema_{i-j}(but 1 if j=i)] \\ + + ma_i(but 0 if i>q) + """ + p = len(ar) + q = len(ma) + ema = np.empty(max_deg) + for i in range(0, max_deg): + temp = ma[i] if i < q else 0.0 + for j in range(0, min(i + 1, p)): + temp += ar[j] * (ema[i - j - 1] if i - j - 1 >= 0 else 1.0) + ema[i] = temp + return ema + + +# Note: Originally copied from pmdarima package (https://github.com/blue-yonder/tsfresh/blob/https://github.com/alkaline-ml/pmdarima/blob/v1.8.5/pmdarima/arima/arima.py) +def seasonal_prediction_with_confidence(arima_res, + start, + end, + X, + alpha, + **kwargs): + """Compute the prediction for a SARIMAX and get a conf interval + + Unfortunately, SARIMAX does not really provide a nice way to get the + confidence intervals out of the box, so we have to perform the + ``get_prediction`` code here and unpack the confidence intervals manually. + """ + results = arima_res.get_prediction( + start=start, + end=end, + exog=X, + **kwargs) + + f = results.predicted_mean + conf_int = results.conf_int(alpha=alpha) + if arima_res.specification['simple_differencing']: + # If simple_differencing == True, statsmodels.get_prediction returns + # mid and confidence intervals on differenced time series. + # We have to invert differencing the mid and confidence intervals + y_org = arima_res.model.orig_endog + d = arima_res.model.orig_k_diff + D = arima_res.model.orig_k_seasonal_diff + period = arima_res.model.seasonal_periods + # Forecast mid: undifferencing non-seasonal part + if d > 0: + y_sdiff = y_org if D == 0 else diff(y_org, period, D) + f_temp = np.append(y_sdiff[-d:], f) + f_temp = diff_inv(f_temp, 1, d) + f = f_temp[(2 * d):] + # Forecast mid: undifferencing seasonal part + if D > 0 and period > 1: + f_temp = np.append(y_org[-(D * period):], f) + f_temp = diff_inv(f_temp, period, D) + f = f_temp[(2 * D * period):] + # confidence interval + ar_poly = arima_res.polynomial_reduced_ar + poly_diff = np_polynomial.polypow(np.array([1., -1.]), d) + sdiff = np.zeros(period + 1) + sdiff[0] = 1. + sdiff[-1] = 1. + poly_sdiff = np_polynomial.polypow(sdiff, D) + ar = -np.polymul(ar_poly, np.polymul(poly_diff, poly_sdiff))[1:] + ma = arima_res.polynomial_reduced_ma[1:] + n_predMinus1 = end - start + ema = ARMAtoMA(ar, ma, n_predMinus1) + sigma2 = arima_res._params_variance[0] + var = np.cumsum(np.append(1., ema * ema)) * sigma2 + q = results.dist.ppf(1. - alpha / 2, *results.dist_args) + conf_int[:, 0] = f - q * np.sqrt(var) + conf_int[:, 1] = f + q * np.sqrt(var) + + return check_endog(f, dtype=None, copy=False), \ + check_array(conf_int, copy=False, dtype=None) diff --git a/etna/models/sarimax.py b/etna/models/sarimax.py index a26263317..de1d19b8f 100644 --- a/etna/models/sarimax.py +++ b/etna/models/sarimax.py @@ -9,8 +9,10 @@ from statsmodels.tools.sm_exceptions import ValueWarning from statsmodels.tsa.statespace.sarimax import SARIMAX +from etna.libs.pmdarima_utils import seasonal_prediction_with_confidence from etna.models.base import BaseAdapter from etna.models.base import PerSegmentPredictionIntervalModel +from etna.models.utils import determine_num_steps warnings.filterwarnings( message="No frequency information was provided, so inferred frequency .* will be used", @@ -164,6 +166,8 @@ def __init__( self._model: Optional[SARIMAX] = None self._result: Optional[SARIMAX] = None self.regressor_columns: Optional[List[str]] = None + self._freq = None + self._first_train_timestamp = None def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_SARIMAXAdapter": """ @@ -193,8 +197,8 @@ def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_SARIMAXAdapter": self._check_df(df) - targets = df["target"] - targets.index = df["timestamp"] + # make it a numpy array for forgetting about indices, it is necessary for _seasonal_prediction_with_confidence + targets = df["target"].values exog_train = self._select_regressors(df) @@ -221,6 +225,14 @@ def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_SARIMAXAdapter": **self.kwargs, ) self._result = self._model.fit() + + freq = pd.infer_freq(df["timestamp"], warn=False) + if freq is None: + raise ValueError("Can't determine frequency of a given dataframe") + self._freq = freq + + self._first_train_timestamp = df["timestamp"].min() + return self def predict(self, df: pd.DataFrame, prediction_interval: bool, quantiles: Sequence[float]) -> pd.DataFrame: @@ -256,30 +268,39 @@ def predict(self, df: pd.DataFrame, prediction_interval: bool, quantiles: Sequen ) exog_future = self._select_regressors(df) + start_timestamp = df["timestamp"].min() + end_timestamp = df["timestamp"].max() + # determine index of start_timestamp if counting from first timestamp of train + start_idx = determine_num_steps( + start_timestamp=self._first_train_timestamp, end_timestamp=start_timestamp, freq=self._freq # type: ignore + ) + # determine index of end_timestamp if counting from first timestamp of train + end_idx = determine_num_steps( + start_timestamp=self._first_train_timestamp, end_timestamp=end_timestamp, freq=self._freq # type: ignore + ) + if prediction_interval: - forecast = self._result.get_prediction( - start=df["timestamp"].min(), end=df["timestamp"].max(), dynamic=False, exog=exog_future + forecast, _ = seasonal_prediction_with_confidence( + arima_res=self._result, start=start_idx, end=end_idx, X=exog_future, alpha=0.05 ) - y_pred = forecast.predicted_mean - y_pred.name = "mean" - y_pred = pd.DataFrame(y_pred) + y_pred = pd.DataFrame({"mean": forecast}) for quantile in quantiles: # set alpha in the way to get a desirable quantile alpha = min(quantile * 2, (1 - quantile) * 2) - borders = forecast.conf_int(alpha=alpha) + _, borders = seasonal_prediction_with_confidence( + arima_res=self._result, start=start_idx, end=end_idx, X=exog_future, alpha=alpha + ) if quantile < 1 / 2: - series = borders["lower target"] + series = borders[:, 0] else: - series = borders["upper target"] + series = borders[:, 1] y_pred[f"mean_{quantile:.4g}"] = series else: - forecast = self._result.get_prediction( - start=df["timestamp"].min(), end=df["timestamp"].max(), dynamic=False, exog=exog_future + forecast, _ = seasonal_prediction_with_confidence( + arima_res=self._result, start=start_idx, end=end_idx, X=exog_future, alpha=0.05 ) - y_pred = forecast.predicted_mean - y_pred.name = "mean" - y_pred = pd.DataFrame(y_pred) - y_pred = y_pred.reset_index(drop=True, inplace=False) + y_pred = pd.DataFrame({"mean": forecast}) + rename_dict = { column: column.replace("mean", "target") for column in y_pred.columns if column.startswith("mean") } diff --git a/etna/models/tbats.py b/etna/models/tbats.py index 42631245a..629e63728 100644 --- a/etna/models/tbats.py +++ b/etna/models/tbats.py @@ -11,7 +11,7 @@ from etna.models.base import BaseAdapter from etna.models.base import PerSegmentPredictionIntervalModel -from etna.models.utils import determine_num_steps_to_forecast +from etna.models.utils import determine_num_steps class _TBATSAdapter(BaseAdapter): @@ -43,8 +43,8 @@ def predict(self, df: pd.DataFrame, prediction_interval: bool, quantiles: Iterab "In-sample predictions aren't supported by current implementation." ) - steps_to_forecast = determine_num_steps_to_forecast( - last_train_timestamp=self._last_train_timestamp, last_test_timestamp=df["timestamp"].max(), freq=self._freq + steps_to_forecast = determine_num_steps( + start_timestamp=self._last_train_timestamp, end_timestamp=df["timestamp"].max(), freq=self._freq ) steps_to_skip = steps_to_forecast - df.shape[0] diff --git a/etna/models/utils.py b/etna/models/utils.py index 49cb5bb93..05dc0fde0 100644 --- a/etna/models/utils.py +++ b/etna/models/utils.py @@ -1,20 +1,15 @@ import pandas as pd -def determine_num_steps_to_forecast( - last_train_timestamp: pd.Timestamp, last_test_timestamp: pd.Timestamp, freq: str -) -> int: - """Determine number of steps to make a forecast in future. - - It is useful for out-sample forecast with gap if model predicts only on a certain number of steps - in autoregressive manner. +def determine_num_steps(start_timestamp: pd.Timestamp, end_timestamp: pd.Timestamp, freq: str) -> int: + """Determine how many steps of ``freq`` should we make from ``start_timestamp`` to reach ``end_timestamp``. Parameters ---------- - last_train_timestamp: - last timestamp in train data - last_test_timestamp: - last timestamp in test data, should be after ``last_train_timestamp`` + start_timestamp: + timestamp to start counting from + end_timestamp: + timestamp to end counting, should be not earlier than ``start_timestamp`` freq: pandas frequency string: `Offset aliases `_ @@ -26,26 +21,30 @@ def determine_num_steps_to_forecast( Raises ------ ValueError: - Value of last test timestamp is less or equal than last train timestamp + Value of end timestamp is less than start timestamp ValueError: - Last train timestamp isn't correct according to a given frequency + Start timestamp isn't correct according to a given frequency ValueError: - Last test timestamps isn't reachable with a given frequency + End timestamp isn't reachable with a given frequency """ - if last_test_timestamp <= last_train_timestamp: - raise ValueError("Last train timestamp should be less than last test timestamp!") + if start_timestamp > end_timestamp: + raise ValueError("Start train timestamp should be less or equal than end timestamp!") + + # check if start_timestamp is normalized + normalized_start_timestamp = pd.date_range(start=start_timestamp, periods=1, freq=freq) + if normalized_start_timestamp != start_timestamp: + raise ValueError(f"Start timestamp isn't correct according to given frequency: {freq}") - # check if last_train_timestamp is normalized - normalized_last_train_timestamp = pd.date_range(start=last_train_timestamp, periods=1, freq=freq) - if normalized_last_train_timestamp != last_train_timestamp: - raise ValueError(f"Last train timestamp isn't correct according to given frequency: {freq}") + # check a simple case + if start_timestamp == end_timestamp: + return 0 # make linear probing, because for complex offsets there is a cycle in `pd.date_range` cur_value = 1 while True: - timestamps = pd.date_range(start=last_train_timestamp, periods=cur_value + 1, freq=freq) - if timestamps[-1] == last_test_timestamp: + timestamps = pd.date_range(start=start_timestamp, periods=cur_value + 1, freq=freq) + if timestamps[-1] == end_timestamp: return cur_value - elif timestamps[-1] > last_test_timestamp: - raise ValueError(f"Last test timestamps isn't reachable with freq: {freq}") + elif timestamps[-1] > end_timestamp: + raise ValueError(f"End timestamp isn't reachable with freq: {freq}") cur_value += 1 diff --git a/tests/test_models/test_sarimax_model.py b/tests/test_models/test_sarimax_model.py index 3a7d6c299..4a6e581c0 100644 --- a/tests/test_models/test_sarimax_model.py +++ b/tests/test_models/test_sarimax_model.py @@ -5,22 +5,23 @@ from etna.pipeline import Pipeline +def _check_prediction(ts, model, horizon): + model.fit(ts) + future_ts = ts.make_future(future_steps=horizon) + res = model.forecast(future_ts) + res = res.to_pandas(flatten=True) + + assert not res.isnull().values.any() + assert len(res) == 14 + + def test_sarimax_forecaster_run(example_tsds): """ Given: I have dataframe with 2 segments When: Then: I get 7 periods per dataset as a forecast """ - - horizon = 7 - model = SARIMAXModel() - model.fit(example_tsds) - future_ts = example_tsds.make_future(future_steps=horizon) - res = model.forecast(future_ts) - res = res.to_pandas(flatten=True) - - assert not res.isnull().values.any() - assert len(res) == 14 + _check_prediction(ts=example_tsds, model=SARIMAXModel(), horizon=7) def test_sarimax_save_regressors_on_fit(example_reg_tsds): @@ -40,21 +41,22 @@ def test_sarimax_select_regressors_correctly(example_reg_tsds): assert (segment_regressors == segment_regressors_expected).all().all() -def test_sarimax_forecaster_run_with_reg(example_reg_tsds): +def test_sarimax_forecaster_run_with_simple_differencing(example_tsds): """ Given: I have dataframe with 2 segments When: Then: I get 7 periods per dataset as a forecast """ - horizon = 7 - model = SARIMAXModel() - model.fit(example_reg_tsds) - future_ts = example_reg_tsds.make_future(future_steps=horizon) - res = model.forecast(future_ts) - res = res.to_pandas(flatten=True) + _check_prediction(ts=example_tsds, model=SARIMAXModel(simple_differencing=True), horizon=7) - assert not res.isnull().values.any() - assert len(res) == 14 + +def test_sarimax_forecaster_run_with_reg(example_reg_tsds): + """ + Given: I have dataframe with 2 segments + When: + Then: I get 7 periods per dataset as a forecast + """ + _check_prediction(ts=example_reg_tsds, model=SARIMAXModel(), horizon=7) def test_sarimax_forececaster_run_with_reg_custom_order(example_reg_tsds): @@ -63,15 +65,7 @@ def test_sarimax_forececaster_run_with_reg_custom_order(example_reg_tsds): When: Sarimax have non standard `order` param Then: I get 7 periods per dataset as a forecast """ - horizon = 7 - model = SARIMAXModel(order=(3, 1, 0)) - model.fit(example_reg_tsds) - future_ts = example_reg_tsds.make_future(future_steps=horizon) - res = model.forecast(future_ts) - res = res.to_pandas(flatten=True) - - assert not res.isnull().values.any() - assert len(res) == 14 + _check_prediction(ts=example_reg_tsds, model=SARIMAXModel(order=(3, 1, 0)), horizon=7) def test_prediction_interval_run_insample(example_tsds): diff --git a/tests/test_models/test_utils.py b/tests/test_models/test_utils.py index d269a5b99..2a4bf7153 100644 --- a/tests/test_models/test_utils.py +++ b/tests/test_models/test_utils.py @@ -1,64 +1,56 @@ import pandas as pd import pytest -from etna.models.utils import determine_num_steps_to_forecast +from etna.models.utils import determine_num_steps @pytest.mark.parametrize( - "last_train_timestamp, last_test_timestamp, freq, answer", + "start_timestamp, end_timestamp, freq, answer", [ (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02"), "D", 1), (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-11"), "D", 10), + (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-01"), "D", 0), (pd.Timestamp("2020-01-05"), pd.Timestamp("2020-01-19"), "W-SUN", 2), (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-15"), pd.offsets.Week(), 2), (pd.Timestamp("2020-01-31"), pd.Timestamp("2021-02-28"), "M", 13), (pd.Timestamp("2020-01-01"), pd.Timestamp("2021-06-01"), "MS", 17), ], ) -def test_determine_num_steps_to_forecast_ok(last_train_timestamp, last_test_timestamp, freq, answer): - result = determine_num_steps_to_forecast( - last_train_timestamp=last_train_timestamp, last_test_timestamp=last_test_timestamp, freq=freq - ) +def test_determine_num_steps_ok(start_timestamp, end_timestamp, freq, answer): + result = determine_num_steps(start_timestamp=start_timestamp, end_timestamp=end_timestamp, freq=freq) assert result == answer @pytest.mark.parametrize( - "last_train_timestamp, last_test_timestamp, freq", + "start_timestamp, end_timestamp, freq", [ - (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-01"), "D"), (pd.Timestamp("2020-01-02"), pd.Timestamp("2020-01-01"), "D"), ], ) -def test_determine_num_steps_to_forecast_fail_wrong_order(last_train_timestamp, last_test_timestamp, freq): - with pytest.raises(ValueError, match="Last train timestamp should be less than last test timestamp"): - _ = determine_num_steps_to_forecast( - last_train_timestamp=last_train_timestamp, last_test_timestamp=last_test_timestamp, freq=freq - ) +def test_determine_num_steps_fail_wrong_order(start_timestamp, end_timestamp, freq): + with pytest.raises(ValueError, match="Start train timestamp should be less or equal than end timestamp"): + _ = determine_num_steps(start_timestamp=start_timestamp, end_timestamp=end_timestamp, freq=freq) @pytest.mark.parametrize( - "last_train_timestamp, last_test_timestamp, freq", + "start_timestamp, end_timestamp, freq", [ (pd.Timestamp("2020-01-02"), pd.Timestamp("2020-06-01"), "M"), (pd.Timestamp("2020-01-02"), pd.Timestamp("2020-06-01"), "MS"), ], ) -def test_determine_num_steps_to_forecast_fail_wrong_start(last_train_timestamp, last_test_timestamp, freq): - with pytest.raises(ValueError, match="Last train timestamp isn't correct according to given frequency"): - _ = determine_num_steps_to_forecast( - last_train_timestamp=last_train_timestamp, last_test_timestamp=last_test_timestamp, freq=freq - ) +def test_determine_num_steps_fail_wrong_start(start_timestamp, end_timestamp, freq): + with pytest.raises(ValueError, match="Start timestamp isn't correct according to given frequency"): + _ = determine_num_steps(start_timestamp=start_timestamp, end_timestamp=end_timestamp, freq=freq) @pytest.mark.parametrize( - "last_train_timestamp, last_test_timestamp, freq", + "start_timestamp, end_timestamp, freq", [ (pd.Timestamp("2020-01-31"), pd.Timestamp("2020-06-05"), "M"), (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-06-05"), "MS"), ], ) -def test_determine_num_steps_to_forecast_fail_wrong_end(last_train_timestamp, last_test_timestamp, freq): - with pytest.raises(ValueError, match="Last test timestamps isn't reachable with freq"): - _ = determine_num_steps_to_forecast( - last_train_timestamp=last_train_timestamp, last_test_timestamp=last_test_timestamp, freq=freq - ) +def test_determine_num_steps_fail_wrong_end(start_timestamp, end_timestamp, freq): + with pytest.raises(ValueError, match="End timestamp isn't reachable with freq"): + _ = determine_num_steps(start_timestamp=start_timestamp, end_timestamp=end_timestamp, freq=freq)