Skip to content

get_anomalies_confidence_interval -> EDA #182

Merged
merged 7 commits into from
Oct 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add inverse_transform in *OutliersTransform ([#160](https://github.com/tinkoff-ai/etna-ts/pull/160))
- Examples for CatBoostModelMultiSegment and CatBoostModelPerSegment ([#181](https://github.com/tinkoff-ai/etna-ts/pull/181))
- Simplify TSDataset.train_test_split method by allowing to pass not all values ([#191](https://github.com/tinkoff-ai/etna-ts/pull/191))
- Confidence interval anomalies detection to EDA ([#182](https://github.com/tinkoff-ai/etna-ts/pull/182))

### Changed
- Delete offset from WindowStatisticsTransform ([#111](https://github.com/tinkoff-ai/etna-ts/pull/111))
Expand Down
1 change: 1 addition & 0 deletions etna/analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from etna.analysis.eda_utils import cross_corr_plot
from etna.analysis.eda_utils import distribution_plot
from etna.analysis.eda_utils import sample_pacf_plot
from etna.analysis.outliers.confidence_interval_outliers import get_anomalies_confidence_interval
from etna.analysis.outliers.density_outliers import get_anomalies_density
from etna.analysis.outliers.hist_outliers import get_anomalies_hist
from etna.analysis.outliers.median_outliers import get_anomalies_median
Expand Down
1 change: 1 addition & 0 deletions etna/analysis/outliers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from etna.analysis.outliers.confidence_interval_outliers import get_anomalies_confidence_interval
from etna.analysis.outliers.density_outliers import get_anomalies_density
from etna.analysis.outliers.hist_outliers import get_anomalies_hist
from etna.analysis.outliers.median_outliers import get_anomalies_median
Expand Down
52 changes: 52 additions & 0 deletions etna/analysis/outliers/confidence_interval_outliers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from copy import deepcopy
from typing import TYPE_CHECKING
from typing import Dict
from typing import List
from typing import Type
from typing import Union

import numpy as np
import pandas as pd

if TYPE_CHECKING:
from etna.datasets import TSDataset
from etna.models import ProphetModel
from etna.models import SARIMAXModel


def get_anomalies_confidence_interval(
    ts: "TSDataset",
    model: Union[Type["ProphetModel"], Type["SARIMAXModel"]],
    interval_width: float = 0.95,
    **model_params,
) -> Dict[str, List[pd.Timestamp]]:
    """
    Get point outliers in time series using confidence intervals (estimation model-based method).

    Outliers are all points that fall outside the confidence interval predicted by the model.

    Parameters
    ----------
    ts:
        TSDataset with timeseries data (should contain all the necessary features).
    model:
        Model class used for confidence interval estimation. It is instantiated inside
        this function with ``interval_width`` and ``model_params``.
    interval_width:
        Width (confidence level) of the interval, a value between 0 and 1.
        By default a 95% confidence interval is taken.
    **model_params:
        Other keyword arguments forwarded to the model constructor.

    Returns
    -------
    dict of outliers: Dict[str, List[pd.Timestamp]]
        Dict of outliers in format {segment: [outliers_timestamps]}.
        The timestamps are the corresponding elements of ``ts.index.values``.

    Raises
    ------
    ValueError:
        If ``interval_width`` is not within [0, 1].
    """
    # A width outside [0, 1] is not a valid confidence level; fail loudly instead of
    # letting the model produce meaningless bounds and an empty result.
    if not 0 <= interval_width <= 1:
        raise ValueError(f"interval_width must be within [0, 1], got {interval_width}")

    time_points = np.array(ts.index.values)
    model_instance = model(interval_width=interval_width, **model_params)
    model_instance.fit(ts)
    # The forecast is requested for a copy of the very dataset the model was fitted on,
    # so every observed point gets its own lower/upper bound.
    confidence_interval = model_instance.forecast(deepcopy(ts), confidence_interval=True)

    outliers_per_segment = {}
    for segment in ts.segments:
        segment_slice = confidence_interval[:, segment, :][segment]
        target = segment_slice["target"]
        anomalies_mask = (target > segment_slice["target_upper"]) | (target < segment_slice["target_lower"])
        # Convert to a plain boolean array: time_points is indexed by position, not by label.
        outliers_per_segment[segment] = list(time_points[np.asarray(anomalies_mask, dtype=bool)])
    return outliers_per_segment
18 changes: 15 additions & 3 deletions etna/models/sarimax.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,20 @@
from typing import Union

import pandas as pd
from statsmodels.tools.sm_exceptions import ValueWarning
from statsmodels.tsa.statespace.sarimax import SARIMAX

from etna.datasets import TSDataset
from etna.models.base import PerSegmentModel
from etna.models.base import log_decorator

# Registered when this module is imported: ignore the statsmodels ValueWarning whose
# message matches the pattern below (frequency inferred because none was provided),
# and only when it is issued from the statsmodels.tsa.base.tsa_model module.
# NOTE(review): this relies on `warnings` being imported above the visible part of the file - confirm.
warnings.filterwarnings(
    message="No frequency information was provided, so inferred frequency .* will be used",
    action="ignore",
    category=ValueWarning,
    module="statsmodels.tsa.base.tsa_model",
)


class _SARIMAXModel:
"""
Expand Down Expand Up @@ -254,12 +262,15 @@ def predict(self, df: pd.DataFrame, confidence_interval: bool = False) -> pd.Dat
)

exog_future = self._select_regressors(df)
forecast = self._result.get_prediction(
start=df["timestamp"].min(), end=df["timestamp"].max(), dynamic=True, exog=exog_future
)
if confidence_interval:
forecast = self._result.get_prediction(
start=df["timestamp"].min(), end=df["timestamp"].max(), dynamic=False, exog=exog_future
)
y_pred = forecast.summary_frame(alpha=1 - self.interval_width)[["mean_ci_lower", "mean", "mean_ci_upper"]]
else:
forecast = self._result.get_prediction(
start=df["timestamp"].min(), end=df["timestamp"].max(), dynamic=True, exog=exog_future
)
y_pred = pd.DataFrame(forecast.predicted_mean)
y_pred.rename({"predicted_mean": "mean"}, axis=1, inplace=True)
return y_pred.reset_index(drop=True, inplace=False)
Expand Down Expand Up @@ -456,6 +467,7 @@ def __init__(
freq=self.freq,
missing=self.missing,
validate_specification=self.validate_specification,
interval_width=self.interval_width,
**self.kwargs,
)
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import numpy as np
import pytest

from etna.analysis import get_anomalies_confidence_interval
from etna.models import ProphetModel
from etna.models import SARIMAXModel


@pytest.mark.parametrize("model", (ProphetModel, SARIMAXModel))
def test_interface(outliers_tsds, model):
    """Check the shape of the result: a dict keyed by segment whose values are lists of np.datetime64."""
    detected = get_anomalies_confidence_interval(outliers_tsds, model=model, interval_width=0.95)
    assert isinstance(detected, dict)
    # Iterating a dict yields its keys, so sorting it directly gives the sorted segment names.
    assert sorted(detected) == sorted(outliers_tsds.segments)
    for timestamps in detected.values():
        assert isinstance(timestamps, list)
        assert all(isinstance(timestamp, np.datetime64) for timestamp in timestamps)


@pytest.mark.parametrize(
    "model,interval_width, true_anomalies",
    (
        (
            ProphetModel,
            0.95,
            {
                "1": [np.datetime64("2021-01-11")],
                "2": [np.datetime64("2021-01-09"), np.datetime64("2021-01-27")],
            },
        ),
        (
            SARIMAXModel,
            0.999,
            {
                "1": [],
                "2": [np.datetime64("2021-01-27")],
            },
        ),
    ),
)
def test_confidence_interval_outliers(outliers_tsds, model, interval_width, true_anomalies):
    """Check that each model flags exactly the expected timestamps per segment at the given interval width."""
    detected = get_anomalies_confidence_interval(outliers_tsds, model, interval_width)
    assert detected == true_anomalies