tinkoff-ai · martins0n · Oct 15, 2021 · Oct 12, 2021 · Oct 12, 2021 · Oct 12, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add inverse_transform in *OutliersTransform ([#160](https://github.com/tinkoff-ai/etna-ts/pull/160))
 - Examples for CatBoostModelMultiSegment and CatBoostModelPerSegment ([#181](https://github.com/tinkoff-ai/etna-ts/pull/181))
 - Simplify TSDataset.train_test_split method by allowing to pass not all values ([#191](https://github.com/tinkoff-ai/etna-ts/pull/191))
+- Add confidence interval anomaly detection to EDA ([#182](https://github.com/tinkoff-ai/etna-ts/pull/182))
 
 ### Changed
 - Delete offset from WindowStatisticsTransform ([#111](https://github.com/tinkoff-ai/etna-ts/pull/111))

diff --git a/etna/analysis/__init__.py b/etna/analysis/__init__.py
@@ -1,6 +1,7 @@
 from etna.analysis.eda_utils import cross_corr_plot
 from etna.analysis.eda_utils import distribution_plot
 from etna.analysis.eda_utils import sample_pacf_plot
+from etna.analysis.outliers.confidence_interval_outliers import get_anomalies_confidence_interval
 from etna.analysis.outliers.density_outliers import get_anomalies_density
 from etna.analysis.outliers.hist_outliers import get_anomalies_hist
 from etna.analysis.outliers.median_outliers import get_anomalies_median

diff --git a/etna/analysis/outliers/__init__.py b/etna/analysis/outliers/__init__.py
@@ -1,3 +1,4 @@
+from etna.analysis.outliers.confidence_interval_outliers import get_anomalies_confidence_interval
 from etna.analysis.outliers.density_outliers import get_anomalies_density
 from etna.analysis.outliers.hist_outliers import get_anomalies_hist
 from etna.analysis.outliers.median_outliers import get_anomalies_median

diff --git a/etna/analysis/outliers/confidence_interval_outliers.py b/etna/analysis/outliers/confidence_interval_outliers.py
@@ -0,0 +1,52 @@
+from copy import deepcopy
+from typing import TYPE_CHECKING
+from typing import Dict
+from typing import List
+from typing import Type
+from typing import Union
+
+import numpy as np
+import pandas as pd
+
+if TYPE_CHECKING:
+    from etna.datasets import TSDataset
+    from etna.models import ProphetModel
+    from etna.models import SARIMAXModel
+
+
+def get_anomalies_confidence_interval(
+    ts: "TSDataset",
+    model: Union[Type["ProphetModel"], Type["SARIMAXModel"]],
+    interval_width: float = 0.95,
+    **model_params,
+) -> Dict[str, List[pd.Timestamp]]:
+    """
+    Get point outliers in time series using confidence intervals (estimation model-based method).
+    Outliers are all points that fall outside the confidence interval predicted by the model fitted on ``ts``.
+
+    Parameters
+    ----------
+    ts:
+        TSDataset with timeseries data (should contain all the necessary features).
+    model:
+        Model class for confidence interval estimation; created with ``interval_width`` and ``**model_params``.
+    interval_width:
+        The width (confidence level) of the interval. By default a 95% confidence interval is taken.
+
+    Returns
+    -------
+    dict of outliers: Dict[str, List[pd.Timestamp]]
+        Dict of outliers in format {segment: [outliers_timestamps]}; timestamps are taken from ``ts.index.values``.
+    """
+    outliers_per_segment = {}
+    time_points = np.array(ts.index.values)
+    model_instance = model(interval_width=interval_width, **model_params)
+    model_instance.fit(ts)
+    confidence_interval = model_instance.forecast(deepcopy(ts), confidence_interval=True)  # forecast on a copy of ts
+    for segment in ts.segments:
+        segment_slice = confidence_interval[:, segment, :][segment]
+        anomalies_mask = (segment_slice["target"] > segment_slice["target_upper"]) | (
+            segment_slice["target"] < segment_slice["target_lower"]
+        )
+        outliers_per_segment[segment] = list(time_points[anomalies_mask])  # keep timestamps where the mask is True
+    return outliers_per_segment
diff --git a/etna/models/sarimax.py b/etna/models/sarimax.py
@@ -6,12 +6,20 @@
 from typing import Union
 
 import pandas as pd
+from statsmodels.tools.sm_exceptions import ValueWarning
 from statsmodels.tsa.statespace.sarimax import SARIMAX
 
 from etna.datasets import TSDataset
 from etna.models.base import PerSegmentModel
 from etna.models.base import log_decorator
 
+warnings.filterwarnings(
+    message="No frequency information was provided, so inferred frequency .* will be used",
+    action="ignore",
+    category=ValueWarning,
+    module="statsmodels.tsa.base.tsa_model",
+)
+
 
 class _SARIMAXModel:
     """
@@ -254,12 +262,15 @@ def predict(self, df: pd.DataFrame, confidence_interval: bool = False) -> pd.Dat
             )
 
         exog_future = self._select_regressors(df)
-        forecast = self._result.get_prediction(
-            start=df["timestamp"].min(), end=df["timestamp"].max(), dynamic=True, exog=exog_future
-        )
         if confidence_interval:
+            forecast = self._result.get_prediction(
+                start=df["timestamp"].min(), end=df["timestamp"].max(), dynamic=False, exog=exog_future
+            )
             y_pred = forecast.summary_frame(alpha=1 - self.interval_width)[["mean_ci_lower", "mean", "mean_ci_upper"]]
         else:
+            forecast = self._result.get_prediction(
+                start=df["timestamp"].min(), end=df["timestamp"].max(), dynamic=True, exog=exog_future
+            )
             y_pred = pd.DataFrame(forecast.predicted_mean)
             y_pred.rename({"predicted_mean": "mean"}, axis=1, inplace=True)
         return y_pred.reset_index(drop=True, inplace=False)
@@ -456,6 +467,7 @@ def __init__(
                 freq=self.freq,
                 missing=self.missing,
                 validate_specification=self.validate_specification,
+                interval_width=self.interval_width,
                 **self.kwargs,
             )
         )

diff --git a/tests/test_analysis/test_outliers/test_confidence_interval_outliers.py b/tests/test_analysis/test_outliers/test_confidence_interval_outliers.py
@@ -0,0 +1,32 @@
+import numpy as np
+import pytest
+
+from etna.analysis import get_anomalies_confidence_interval
+from etna.models import ProphetModel
+from etna.models import SARIMAXModel
+
+
+@pytest.mark.parametrize("model", (ProphetModel, SARIMAXModel))
+def test_interface(outliers_tsds, model):  # checks only the shape/types of the result, not its values
+    anomalies = get_anomalies_confidence_interval(outliers_tsds, model=model, interval_width=0.95)
+    assert isinstance(anomalies, dict)
+    assert sorted(list(anomalies.keys())) == sorted(outliers_tsds.segments)  # exactly one key per segment
+    for segment in anomalies.keys():
+        assert isinstance(anomalies[segment], list)
+        for date in anomalies[segment]:
+            assert isinstance(date, np.datetime64)  # elements are numpy datetimes
+
+
+@pytest.mark.parametrize(
+    "model,interval_width, true_anomalies",
+    (  # each case: (model class, interval_width, expected {segment: [timestamps]})
+        (
+            ProphetModel,
+            0.95,
+            {"1": [np.datetime64("2021-01-11")], "2": [np.datetime64("2021-01-09"), np.datetime64("2021-01-27")]},
+        ),
+        (SARIMAXModel, 0.999, {"1": [], "2": [np.datetime64("2021-01-27")]}),  # no outliers expected in "1"
+    ),
+)
+def test_confidence_interval_outliers(outliers_tsds, model, interval_width, true_anomalies):
+    assert get_anomalies_confidence_interval(outliers_tsds, model, interval_width) == true_anomalies  # exact match