Width and Coverage metrics for prediction intervals (#638)

tinkoff-ai · Apr 13, 2022 · 8ade7d3 · 8ade7d3 · github-actions · Apr 13, 2022
1 parent e5ec89a
commit 8ade7d3
Show file tree

Hide file tree

Showing 5 changed files with 229 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 ### Added
--
+- `Width` and `Coverage` metrics for prediction intervals ([#638](https://github.com/tinkoff-ai/etna/pull/638))
 - 
 - Masked backtest ([#613](https://github.com/tinkoff-ai/etna/pull/613))
 - 

diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst
@@ -17,4 +17,5 @@ See the API documentation for further details on available metrics:
    :template: custom-module-template.rst
    :recursive:
 
-   etna.metrics.metrics
+   etna.metrics.metrics
+   etna.metrics.intervals_metrics
diff --git a/etna/metrics/__init__.py b/etna/metrics/__init__.py
@@ -8,6 +8,8 @@
 from etna.metrics.base import MetricAggregationMode
 from etna.metrics.functional_metrics import mape
 from etna.metrics.functional_metrics import smape
+from etna.metrics.intervals_metrics import Coverage
+from etna.metrics.intervals_metrics import Width
 from etna.metrics.metrics import MAE
 from etna.metrics.metrics import MAPE
 from etna.metrics.metrics import MSE

diff --git a/etna/metrics/intervals_metrics.py b/etna/metrics/intervals_metrics.py
@@ -0,0 +1,152 @@
+from typing import Dict
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+
+import numpy as np
+
+from etna.datasets import TSDataset
+from etna.metrics.base import Metric
+from etna.metrics.base import MetricAggregationMode
+
+
+def dummy():
+    return np.nan
+
+
+class _QuantileMetricMixin:
+    def _validate_tsdataset_quantiles(self, ts: TSDataset, quantiles: Sequence[float]) -> None:
+        """Check that every requested quantile is present in ts."""
+        features = set(ts.df.columns.get_level_values("feature"))
+        for quantile in quantiles:
+            assert f"target_{quantile:.4g}" in features, f"Quantile {quantile} is not presented in tsdataset."
+
+
+class Coverage(Metric, _QuantileMetricMixin):
+    """Coverage metric for prediction intervals - percentage of samples in the interval ``[lower quantile, upper quantile]``.
+
+    .. math::
+        Coverage(y\_true, y\_pred) = \\frac{\\sum_{i=0}^{n-1}{[ y\_true_i \\ge y\_pred_i^{lower\_quantile}] * [y\_true_i \\le y\_pred_i^{upper\_quantile}] }}{n}
+
+    Notes
+    -----
+    Works only if the quantiles are present in y_pred
+    """
+
+    def __init__(
+        self, quantiles: Tuple[float, float] = (0.025, 0.975), mode: str = MetricAggregationMode.per_segment, **kwargs
+    ):
+        """Init metric.
+
+        Parameters
+        ----------
+        mode: 'macro' or 'per-segment'
+            metrics aggregation mode
+        kwargs:
+            metric's computation arguments
+        """
+        super().__init__(mode=mode, metric_fn=dummy, **kwargs)
+        self.quantiles = quantiles
+
+    def __call__(self, y_true: TSDataset, y_pred: TSDataset) -> Union[float, Dict[str, float]]:
+        """
+        Compute metric's value with y_true and y_pred.
+
+        Notes
+        -----
+        Note that if y_true and y_pred are not sorted, Metric will sort them anyway
+
+        Parameters
+        ----------
+        y_true:
+            dataset with true time series values
+        y_pred:
+            dataset with predicted time series values
+
+        Returns
+        -------
+            metric's value aggregated over segments or not (depends on mode)
+        """
+        self._validate_segment_columns(y_true=y_true, y_pred=y_pred)
+        self._validate_tsdataset_quantiles(ts=y_pred, quantiles=self.quantiles)
+
+        segments = set(y_true.df.columns.get_level_values("segment"))
+        metrics_per_segment = {}
+        for segment in segments:
+            self._validate_timestamp_columns(
+                timestamp_true=y_true[:, segment, "target"].dropna().index,
+                timestamp_pred=y_pred[:, segment, "target"].dropna().index,
+            )
+            upper_quantile_flag = y_true[:, segment, "target"] <= y_pred[:, segment, f"target_{self.quantiles[1]:.4g}"]
+            lower_quantile_flag = y_true[:, segment, "target"] >= y_pred[:, segment, f"target_{self.quantiles[0]:.4g}"]
+
+            metrics_per_segment[segment] = np.mean(upper_quantile_flag * lower_quantile_flag)
+        metrics = self._aggregate_metrics(metrics_per_segment)
+        return metrics
+
+
+class Width(Metric, _QuantileMetricMixin):
+    """Mean width of prediction intervals.
+
+    .. math::
+        Width(y\_true, y\_pred) = \\frac{\\sum_{i=0}^{n-1}\\mid y\_pred_i^{upper\_quantile} - y\_pred_i^{lower\_quantile} \\mid}{n}
+
+    Notes
+    -----
+    Works only if the quantiles are present in y_pred
+    """
+
+    def __init__(
+        self, quantiles: Tuple[float, float] = (0.025, 0.975), mode: str = MetricAggregationMode.per_segment, **kwargs
+    ):
+        """Init metric.
+
+        Parameters
+        ----------
+        mode: 'macro' or 'per-segment'
+            metrics aggregation mode
+        kwargs:
+            metric's computation arguments
+        """
+        super().__init__(mode=mode, metric_fn=dummy, **kwargs)
+        self.quantiles = quantiles
+
+    def __call__(self, y_true: TSDataset, y_pred: TSDataset) -> Union[float, Dict[str, float]]:
+        """
+        Compute metric's value with y_true and y_pred.
+
+        Notes
+        -----
+        Note that if y_true and y_pred are not sorted, Metric will sort them anyway
+
+        Parameters
+        ----------
+        y_true:
+            dataset with true time series values
+        y_pred:
+            dataset with predicted time series values
+
+        Returns
+        -------
+            metric's value aggregated over segments or not (depends on mode)
+        """
+        self._validate_segment_columns(y_true=y_true, y_pred=y_pred)
+        self._validate_tsdataset_quantiles(ts=y_pred, quantiles=self.quantiles)
+
+        segments = set(y_true.df.columns.get_level_values("segment"))
+        metrics_per_segment = {}
+        for segment in segments:
+            self._validate_timestamp_columns(
+                timestamp_true=y_true[:, segment, "target"].dropna().index,
+                timestamp_pred=y_pred[:, segment, "target"].dropna().index,
+            )
+            upper_quantile = y_pred[:, segment, f"target_{self.quantiles[1]:.4g}"]
+            lower_quantile = y_pred[:, segment, f"target_{self.quantiles[0]:.4g}"]
+
+            metrics_per_segment[segment] = np.abs(lower_quantile - upper_quantile).mean()
+
+        metrics = self._aggregate_metrics(metrics_per_segment)
+        return metrics
+
+
+__all__ = ["Coverage", "Width"]
diff --git a/tests/test_metrics/test_intervals_metrics.py b/tests/test_metrics/test_intervals_metrics.py
@@ -0,0 +1,72 @@
+import pytest
+
+from etna.datasets import TSDataset
+from etna.metrics import Coverage
+from etna.metrics import Width
+
+
+@pytest.fixture
+def tsdataset_with_zero_width_quantiles(example_df):
+
+    ts_train = TSDataset.to_dataset(example_df)
+    ts_train = TSDataset(ts_train, freq="H")
+    example_df["target_0.025"] = example_df["target"]
+    example_df["target_0.975"] = example_df["target"]
+    ts_test = TSDataset.to_dataset(example_df)
+    ts_test = TSDataset(ts_test, freq="H")
+    return ts_train, ts_test
+
+
+@pytest.fixture
+def tsdataset_with_differnt_width_and_shifted_quantiles(example_df):
+
+    ts_train = TSDataset.to_dataset(example_df)
+    ts_train = TSDataset(ts_train, freq="H")
+    example_df["target_0.025"] = example_df["target"]
+    example_df["target_0.975"] = example_df["target"]
+
+    segment_one_index = example_df[lambda x: x.segment == "segment_1"].index
+
+    example_df.loc[segment_one_index, "target_0.025"] = example_df.loc[segment_one_index, "target_0.025"] + 1
+    example_df.loc[segment_one_index, "target_0.975"] = example_df.loc[segment_one_index, "target_0.975"] + 2
+
+    ts_test = TSDataset.to_dataset(example_df)
+    ts_test = TSDataset(ts_test, freq="H")
+    return ts_train, ts_test
+
+
+def test_width_metric_with_zero_width_quantiles(tsdataset_with_zero_width_quantiles):
+    ts_train, ts_test = tsdataset_with_zero_width_quantiles
+
+    expected_metric = 0.0
+    width_metric = Width(mode="per-segment")(ts_train, ts_test)
+
+    for segment in width_metric:
+        assert width_metric[segment] == expected_metric
+
+
+def test_width_metric_with_differnt_width_and_shifted_quantiles(tsdataset_with_differnt_width_and_shifted_quantiles):
+    ts_train, ts_test = tsdataset_with_differnt_width_and_shifted_quantiles
+
+    expected_metric = {"segment_1": 1.0, "segment_2": 0.0}
+    width_metric = Width(mode="per-segment")(ts_train, ts_test)
+
+    for segment in width_metric:
+        assert width_metric[segment] == expected_metric[segment]
+
+
+def test_coverage_metric_with_differnt_width_and_shifted_quantiles(tsdataset_with_differnt_width_and_shifted_quantiles):
+    ts_train, ts_test = tsdataset_with_differnt_width_and_shifted_quantiles
+
+    expected_metric = {"segment_1": 0.0, "segment_2": 1.0}
+    coverage_metric = Coverage(mode="per-segment")(ts_train, ts_test)
+
+    for segment in coverage_metric:
+        assert coverage_metric[segment] == expected_metric[segment]
+
+
+@pytest.mark.parametrize("metric", [Coverage(quantiles=(0.1, 0.3)), Width(quantiles=(0.1, 0.3))])
+def test_using_not_presented_quantiles(metric, tsdataset_with_zero_width_quantiles):
+    ts_train, ts_test = tsdataset_with_zero_width_quantiles
+    with pytest.raises(AssertionError, match="Quantile .* is not presented in tsdataset."):
+        _ = metric(ts_train, ts_test)