diff --git a/CHANGELOG.md b/CHANGELOG.md index 6897648ab..5897612ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add inverse_transform in *OutliersTransform ([#160](https://github.com/tinkoff-ai/etna-ts/pull/160)) - Examples for CatBoostModelMultiSegment and CatBoostModelPerSegment ([#181](https://github.com/tinkoff-ai/etna-ts/pull/181)) - Simplify TSDataset.train_test_split method by allowing to pass not all values ([#191](https://github.com/tinkoff-ai/etna-ts/pull/191)) +- Confidence interval anomalies detection to EDA ([#182](https://github.com/tinkoff-ai/etna-ts/pull/182)) ### Changed - Delete offset from WindowStatisticsTransform ([#111](https://github.com/tinkoff-ai/etna-ts/pull/111)) diff --git a/etna/analysis/__init__.py b/etna/analysis/__init__.py index 736f6d4ad..77eed80d6 100644 --- a/etna/analysis/__init__.py +++ b/etna/analysis/__init__.py @@ -1,6 +1,7 @@ from etna.analysis.eda_utils import cross_corr_plot from etna.analysis.eda_utils import distribution_plot from etna.analysis.eda_utils import sample_pacf_plot +from etna.analysis.outliers.confidence_interval_outliers import get_anomalies_confidence_interval from etna.analysis.outliers.density_outliers import get_anomalies_density from etna.analysis.outliers.hist_outliers import get_anomalies_hist from etna.analysis.outliers.median_outliers import get_anomalies_median diff --git a/etna/analysis/outliers/__init__.py b/etna/analysis/outliers/__init__.py index baf853487..c31f46a32 100644 --- a/etna/analysis/outliers/__init__.py +++ b/etna/analysis/outliers/__init__.py @@ -1,3 +1,4 @@ +from etna.analysis.outliers.confidence_interval_outliers import get_anomalies_confidence_interval from etna.analysis.outliers.density_outliers import get_anomalies_density from etna.analysis.outliers.hist_outliers import get_anomalies_hist from etna.analysis.outliers.median_outliers import get_anomalies_median diff --git 
from copy import deepcopy
from typing import TYPE_CHECKING
from typing import Dict
from typing import List
from typing import Type
from typing import Union

import numpy as np
import pandas as pd

if TYPE_CHECKING:
    from etna.datasets import TSDataset
    from etna.models import ProphetModel
    from etna.models import SARIMAXModel


def get_anomalies_confidence_interval(
    ts: "TSDataset",
    model: Union[Type["ProphetModel"], Type["SARIMAXModel"]],
    interval_width: float = 0.95,
    **model_params,
) -> Dict[str, List[pd.Timestamp]]:
    """
    Get point outliers in time series using confidence intervals (estimation model-based method).

    The model class is instantiated, fitted on ``ts`` and then asked to forecast the very same
    dataset with ``confidence_interval=True``. A point is reported as an outlier when its
    ``target`` value is strictly above ``target_upper`` or strictly below ``target_lower``.

    Parameters
    ----------
    ts:
        TSDataset with time series data (it should contain all the features the model needs).
    model:
        Model class (not an instance) used for confidence interval estimation.
        It is constructed as ``model(interval_width=interval_width, **model_params)``.
    interval_width:
        Confidence level of the interval, forwarded to the model constructor.
        The default 0.95 requests a 95% confidence interval.
    model_params:
        Additional keyword arguments forwarded unchanged to the model constructor.

    Returns
    -------
    dict of outliers: Dict[str, List[pd.Timestamp]]
        Dict of outliers in format {segment: [outliers_timestamps]}. Segments without outliers
        map to an empty list. The timestamps are elements of the forecast index ``.values``
        array, i.e. ``numpy.datetime64`` scalars for a datetime index.
    """
    model_instance = model(interval_width=interval_width, **model_params)
    model_instance.fit(ts)
    # Forecast on a copy so the caller's dataset is left untouched whatever the model does with its input.
    forecast = model_instance.forecast(deepcopy(ts), confidence_interval=True)

    outliers_per_segment = {}
    for segment in ts.segments:
        segment_slice = forecast[:, segment, :][segment]
        target = segment_slice["target"]
        anomalies_mask = (target > segment_slice["target_upper"]) | (target < segment_slice["target_lower"])
        # Take the timestamps from the same frame the mask was built from, and index with a plain
        # boolean array, so labels and mask are aligned by construction.
        time_points = segment_slice.index.values
        outliers_per_segment[segment] = list(time_points[anomalies_mask.values])
    return outliers_per_segment
import numpy as np
import pytest

from etna.analysis import get_anomalies_confidence_interval
from etna.models import ProphetModel
from etna.models import SARIMAXModel


@pytest.mark.parametrize("model", (ProphetModel, SARIMAXModel))
def test_interface(outliers_tsds, model):
    """Check the shape of the result: a dict keyed by every segment, holding lists of np.datetime64."""
    detected = get_anomalies_confidence_interval(outliers_tsds, model=model, interval_width=0.95)
    assert isinstance(detected, dict)
    assert sorted(detected) == sorted(outliers_tsds.segments)
    for timestamps in detected.values():
        assert isinstance(timestamps, list)
        for moment in timestamps:
            assert isinstance(moment, np.datetime64)


@pytest.mark.parametrize(
    "model,interval_width, true_anomalies",
    (
        (
            ProphetModel,
            0.95,
            {"1": [np.datetime64("2021-01-11")], "2": [np.datetime64("2021-01-09"), np.datetime64("2021-01-27")]},
        ),
        (SARIMAXModel, 0.999, {"1": [], "2": [np.datetime64("2021-01-27")]}),
    ),
)
def test_confidence_interval_outliers(outliers_tsds, model, interval_width, true_anomalies):
    """Check that each model flags exactly the expected timestamps on the fixture dataset."""
    found = get_anomalies_confidence_interval(outliers_tsds, model, interval_width)
    assert found == true_anomalies