From da9f2b8ad9524120fb5063eb5e0c23ed3b8e5a73 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Tue, 12 Oct 2021 08:16:12 +0300 Subject: [PATCH 1/5] Add confidence intervals anomalies detection --- etna/analysis/__init__.py | 1 + etna/analysis/outliers/__init__.py | 1 + .../outliers/confidence_interval_outliers.py | 50 +++++++++++++++++++ .../test_confidence_interval_outliers.py | 26 ++++++++++ 4 files changed, 78 insertions(+) create mode 100644 etna/analysis/outliers/confidence_interval_outliers.py create mode 100644 tests/test_analysis/test_outliers/test_confidence_interval_outliers.py diff --git a/etna/analysis/__init__.py b/etna/analysis/__init__.py index 736f6d4ad..77eed80d6 100644 --- a/etna/analysis/__init__.py +++ b/etna/analysis/__init__.py @@ -1,6 +1,7 @@ from etna.analysis.eda_utils import cross_corr_plot from etna.analysis.eda_utils import distribution_plot from etna.analysis.eda_utils import sample_pacf_plot +from etna.analysis.outliers.confidence_interval_outliers import get_anomalies_confidence_interval from etna.analysis.outliers.density_outliers import get_anomalies_density from etna.analysis.outliers.hist_outliers import get_anomalies_hist from etna.analysis.outliers.median_outliers import get_anomalies_median diff --git a/etna/analysis/outliers/__init__.py b/etna/analysis/outliers/__init__.py index baf853487..c31f46a32 100644 --- a/etna/analysis/outliers/__init__.py +++ b/etna/analysis/outliers/__init__.py @@ -1,3 +1,4 @@ +from etna.analysis.outliers.confidence_interval_outliers import get_anomalies_confidence_interval from etna.analysis.outliers.density_outliers import get_anomalies_density from etna.analysis.outliers.hist_outliers import get_anomalies_hist from etna.analysis.outliers.median_outliers import get_anomalies_median diff --git a/etna/analysis/outliers/confidence_interval_outliers.py b/etna/analysis/outliers/confidence_interval_outliers.py new file mode 100644 index 000000000..77f664484 --- /dev/null +++ 
b/etna/analysis/outliers/confidence_interval_outliers.py @@ -0,0 +1,50 @@ +from typing import TYPE_CHECKING +from typing import Dict +from typing import List +from typing import Union + +import numpy as np +import pandas as pd + +if TYPE_CHECKING: + from etna.datasets import TSDataset + from etna.models import ProphetModel + from etna.models import SARIMAXModel + + +def get_anomalies_confidence_interval( + ts: "TSDataset", + model: Union["ProphetModel", "SARIMAXModel"], + interval_width: float = 0.95, + **model_params, +) -> Dict[str, List[pd.Timestamp]]: + """ + Get point outliers in time series using confidence intervals (estimation model-based method). + Outliers are all points out of the confidence interval predicted with the model. + + Parameters + ---------- + ts: + TSDataset with timeseries data(should contains all the necessary features) + model: + model for confidence interval estimation + interval_width: + width of the confidence interval + + Returns + ------- + dict of outliers: Dict[str, List[pd.Timestamp]] + dict of outliers in format {segment: [outliers_timestamps]} + """ + outliers_per_segment = {} + time_points = np.array(ts.index.values) + model = model(interval_width=interval_width, **model_params) + model.fit(ts) + confidence_interval = model.forecast(ts, confidence_interval=True) + for segment in ts.segments: + segment_slice = confidence_interval[:, segment, :][segment] + anomalies_mask = (segment_slice["target"] > segment_slice["target_upper"]) | ( + segment_slice["target"] < segment_slice["target_lower"] + ) + outliers_per_segment[segment] = list(time_points[anomalies_mask]) + return outliers_per_segment diff --git a/tests/test_analysis/test_outliers/test_confidence_interval_outliers.py b/tests/test_analysis/test_outliers/test_confidence_interval_outliers.py new file mode 100644 index 000000000..d6707c8d8 --- /dev/null +++ b/tests/test_analysis/test_outliers/test_confidence_interval_outliers.py @@ -0,0 +1,26 @@ +import numpy as np +import 
pytest + +from etna.analysis import get_anomalies_confidence_interval +from etna.models import ProphetModel +from etna.models import SARIMAXModel + + +@pytest.mark.parametrize("model", (ProphetModel, SARIMAXModel)) +def test_interface(outliers_tsds, model): + anomalies = get_anomalies_confidence_interval(outliers_tsds, model=model, interval_width=0.95) + assert isinstance(anomalies, dict) + assert sorted(list(anomalies.keys())) == sorted(outliers_tsds.segments) + for segment in anomalies.keys(): + assert isinstance(anomalies[segment], list) + for date in anomalies[segment]: + assert isinstance(date, np.datetime64) + + +@pytest.mark.parametrize("model", (ProphetModel, SARIMAXModel)) +@pytest.mark.parametrize( + "interval_width, true_anomalies", + ((0.95, {"1": [np.datetime64("2021-01-11")], "2": [np.datetime64("2021-01-09"), np.datetime64("2021-01-27")]}),), +) +def test_confidence_interval_outliers(outliers_tsds, model, interval_width, true_anomalies): + assert get_anomalies_confidence_interval(outliers_tsds, model, interval_width) == true_anomalies From 147eae95dc371e3a4231b0e003b1ad04b4b8e911 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Tue, 12 Oct 2021 13:55:08 +0300 Subject: [PATCH 2/5] Fixes --- etna/analysis/outliers/confidence_interval_outliers.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/etna/analysis/outliers/confidence_interval_outliers.py b/etna/analysis/outliers/confidence_interval_outliers.py index 77f664484..c9fd8c6f9 100644 --- a/etna/analysis/outliers/confidence_interval_outliers.py +++ b/etna/analysis/outliers/confidence_interval_outliers.py @@ -1,6 +1,8 @@ +from copy import deepcopy from typing import TYPE_CHECKING from typing import Dict from typing import List +from typing import Type from typing import Union import numpy as np @@ -14,7 +16,7 @@ def get_anomalies_confidence_interval( ts: "TSDataset", - model: Union["ProphetModel", "SARIMAXModel"], + model: Union[Type["ProphetModel"], 
Type["SARIMAXModel"]], interval_width: float = 0.95, **model_params, ) -> Dict[str, List[pd.Timestamp]]: @@ -38,9 +40,9 @@ def get_anomalies_confidence_interval( """ outliers_per_segment = {} time_points = np.array(ts.index.values) - model = model(interval_width=interval_width, **model_params) - model.fit(ts) - confidence_interval = model.forecast(ts, confidence_interval=True) + model_instance = model(interval_width=interval_width, **model_params) + model_instance.fit(ts) + confidence_interval = model_instance.forecast(deepcopy(ts), confidence_interval=True) for segment in ts.segments: segment_slice = confidence_interval[:, segment, :][segment] anomalies_mask = (segment_slice["target"] > segment_slice["target_upper"]) | ( From 1ee5ff6f2cbebc1afd5d019c680cf5a1806e3a43 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Tue, 12 Oct 2021 15:32:12 +0300 Subject: [PATCH 3/5] Fixes --- etna/models/sarimax.py | 10 +++++++--- .../test_confidence_interval_outliers.py | 12 +++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/etna/models/sarimax.py b/etna/models/sarimax.py index 5dfbb250a..6aac4e228 100644 --- a/etna/models/sarimax.py +++ b/etna/models/sarimax.py @@ -254,12 +254,15 @@ def predict(self, df: pd.DataFrame, confidence_interval: bool = False) -> pd.Dat ) exog_future = self._select_regressors(df) - forecast = self._result.get_prediction( - start=df["timestamp"].min(), end=df["timestamp"].max(), dynamic=True, exog=exog_future - ) if confidence_interval: + forecast = self._result.get_prediction( + start=df["timestamp"].min(), end=df["timestamp"].max(), dynamic=False, exog=exog_future + ) y_pred = forecast.summary_frame(alpha=1 - self.interval_width)[["mean_ci_lower", "mean", "mean_ci_upper"]] else: + forecast = self._result.get_prediction( + start=df["timestamp"].min(), end=df["timestamp"].max(), dynamic=True, exog=exog_future + ) y_pred = pd.DataFrame(forecast.predicted_mean) y_pred.rename({"predicted_mean": "mean"}, axis=1, inplace=True) 
return y_pred.reset_index(drop=True, inplace=False) @@ -456,6 +459,7 @@ def __init__( freq=self.freq, missing=self.missing, validate_specification=self.validate_specification, + interval_width=self.interval_width, **self.kwargs, ) ) diff --git a/tests/test_analysis/test_outliers/test_confidence_interval_outliers.py b/tests/test_analysis/test_outliers/test_confidence_interval_outliers.py index d6707c8d8..494d5396d 100644 --- a/tests/test_analysis/test_outliers/test_confidence_interval_outliers.py +++ b/tests/test_analysis/test_outliers/test_confidence_interval_outliers.py @@ -17,10 +17,16 @@ def test_interface(outliers_tsds, model): assert isinstance(date, np.datetime64) -@pytest.mark.parametrize("model", (ProphetModel, SARIMAXModel)) @pytest.mark.parametrize( - "interval_width, true_anomalies", - ((0.95, {"1": [np.datetime64("2021-01-11")], "2": [np.datetime64("2021-01-09"), np.datetime64("2021-01-27")]}),), + "model,interval_width, true_anomalies", + ( + ( + ProphetModel, + 0.95, + {"1": [np.datetime64("2021-01-11")], "2": [np.datetime64("2021-01-09"), np.datetime64("2021-01-27")]}, + ), + (SARIMAXModel, 0.999, {"1": [], "2": [np.datetime64("2021-01-27")]}), + ), ) def test_confidence_interval_outliers(outliers_tsds, model, interval_width, true_anomalies): assert get_anomalies_confidence_interval(outliers_tsds, model, interval_width) == true_anomalies From b5449c005c5a3695f940c4c41f1205d6d194b3a9 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Tue, 12 Oct 2021 15:35:40 +0300 Subject: [PATCH 4/5] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 822d2c1fc..ba96d8f1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Pipeline.backtest method ([#161](https://github.com/tinkoff-ai/etna-ts/pull/161)) - STLTransform class ([#158](https://github.com/tinkoff-ai/etna-ts/pull/158)) - NN_examples notebook 
([#159](https://github.com/tinkoff-ai/etna-ts/pull/159)) +- Confidence interval anomalies detection to EDA ([#182](https://github.com/tinkoff-ai/etna-ts/pull/182)) ### Changed - Delete offset from WindowStatisticsTransform ([#111](https://github.com/tinkoff-ai/etna-ts/pull/111)) From dab6c94cae18104579e97edc4399805014e663bb Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Fri, 15 Oct 2021 12:10:15 +0300 Subject: [PATCH 5/5] Fixes --- etna/analysis/outliers/confidence_interval_outliers.py | 8 ++++---- etna/models/sarimax.py | 8 ++++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/etna/analysis/outliers/confidence_interval_outliers.py b/etna/analysis/outliers/confidence_interval_outliers.py index c9fd8c6f9..620eeaa3f 100644 --- a/etna/analysis/outliers/confidence_interval_outliers.py +++ b/etna/analysis/outliers/confidence_interval_outliers.py @@ -27,16 +27,16 @@ def get_anomalies_confidence_interval( Parameters ---------- ts: - TSDataset with timeseries data(should contains all the necessary features) + TSDataset with timeseries data (should contain all the necessary features). model: - model for confidence interval estimation + Model for confidence interval estimation. interval_width: - width of the confidence interval + The confidence level for the interval (e.g. 0.95 for a 95% interval). By default a 95% confidence interval is taken. Returns ------- dict of outliers: Dict[str, List[pd.Timestamp]] - dict of outliers in format {segment: [outliers_timestamps]} + Dict of outliers in format {segment: [outliers_timestamps]}. 
""" outliers_per_segment = {} time_points = np.array(ts.index.values) diff --git a/etna/models/sarimax.py b/etna/models/sarimax.py index 6aac4e228..45c3c6ad8 100644 --- a/etna/models/sarimax.py +++ b/etna/models/sarimax.py @@ -6,12 +6,20 @@ from typing import Union import pandas as pd +from statsmodels.tools.sm_exceptions import ValueWarning from statsmodels.tsa.statespace.sarimax import SARIMAX from etna.datasets import TSDataset from etna.models.base import PerSegmentModel from etna.models.base import log_decorator +warnings.filterwarnings( + message="No frequency information was provided, so inferred frequency .* will be used", + action="ignore", + category=ValueWarning, + module="statsmodels.tsa.base.tsa_model", +) + class _SARIMAXModel: """