Merge branch 'master' into ETNA-813

tinkoff-ai · Oct 15, 2021 · 2f22bca · 2f22bca
2 parents dab6c94 + 3c19e2f
commit 2f22bca
Show file tree

Hide file tree

Showing 16 changed files with 1,740 additions and 47 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -25,9 +25,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - TrendTransform ([#139](https://github.com/tinkoff-ai/etna-ts/pull/139))
 - Running notebooks in ci ([#134](https://github.com/tinkoff-ai/etna-ts/issues/134))
 - Cluster plotter to EDA ([#169](https://github.com/tinkoff-ai/etna-ts/pull/169))
-- Pipeline.backtest method ([#161](https://github.com/tinkoff-ai/etna-ts/pull/161))
+- Pipeline.backtest method ([#161](https://github.com/tinkoff-ai/etna-ts/pull/161), [#192](https://github.com/tinkoff-ai/etna-ts/pull/192))
 - STLTransform class ([#158](https://github.com/tinkoff-ai/etna-ts/pull/158))
 - NN_examples notebook ([#159](https://github.com/tinkoff-ai/etna-ts/pull/159))
+- Example for ProphetModel ([#178](https://github.com/tinkoff-ai/etna-ts/pull/178))
+- Instruction notebook for custom model and transform creation ([#180](https://github.com/tinkoff-ai/etna-ts/pull/180))
+- Add inverse_transform in *OutliersTransform ([#160](https://github.com/tinkoff-ai/etna-ts/pull/160))
+- Examples for CatBoostModelMultiSegment and CatBoostModelPerSegment ([#181](https://github.com/tinkoff-ai/etna-ts/pull/181))
+- Simplify TSDataset.train_test_split method by allowing some of the border values to be omitted ([#191](https://github.com/tinkoff-ai/etna-ts/pull/191))
 - Confidence interval anomalies detection to EDA ([#182](https://github.com/tinkoff-ai/etna-ts/pull/182))
 
 ### Changed
@@ -38,12 +43,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add Correlation heatmap in EDA notebook ([#144](https://github.com/tinkoff-ai/etna-ts/pull/144))
 - Add `__repr__` for Pipeline ([#151](https://github.com/tinkoff-ai/etna-ts/pull/151))
 - Defined random state for every test case ([#155](https://github.com/tinkoff-ai/etna-ts/pull/155))
-- TimeSeriesCrossValidation returns `Metric.__repr__` as a key in `backtest`'s return values ([#161](https://github.com/tinkoff-ai/etna-ts/pull/161))
 - Add confidence intervals to Prophet ([#153](https://github.com/tinkoff-ai/etna-ts/pull/153))
 - Add confidence intervals to SARIMA ([#172](https://github.com/tinkoff-ai/etna-ts/pull/172))
 
 ### Fixed
 - Set default value of `TSDataset.head` method ([#170](https://github.com/tinkoff-ai/etna-ts/pull/170))
+- Categorical and fillna issues with pandas >=1.2 ([#190](https://github.com/tinkoff-ai/etna-ts/pull/190))
 
 ## [1.1.3] - 2021-10-08
 ### Fixed

diff --git a/docs/source/tutorials.rst b/docs/source/tutorials.rst
@@ -10,4 +10,5 @@ Tutorials
    tutorials/get_started
    tutorials/EDA
    tutorials/backtest
-   tutorials/outliers
+   tutorials/outliers
+   tutorials/custom_transform_and_model
diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py
@@ -466,21 +466,60 @@ def to_dataset(df: pd.DataFrame) -> pd.DataFrame:
         df.columns.names = ["segment", "feature"]
         return df
 
+    def _find_all_borders(
+        self,
+        train_start: Optional[TTimestamp],
+        train_end: Optional[TTimestamp],
+        test_start: Optional[TTimestamp],
+        test_end: Optional[TTimestamp],
+    ) -> Tuple[TTimestamp, TTimestamp, TTimestamp, TTimestamp]:
+        """Find borders for train_test_split if some values weren't specified."""
+        if train_start is None:
+            train_start_defined = self.df.index.min()
+        else:
+            train_start_defined = train_start
+
+        if test_end is None:
+            test_end_defined = self.df.index.max()
+        else:
+            test_end_defined = test_end
+
+        if train_end is None and test_start is None:
+            raise ValueError("One of train_end or test_start should be defined")
+
+        if train_end is None:
+            test_start_idx = self.df.index.get_loc(test_start)
+            train_end_defined = self.df.index[test_start_idx - 1]
+        else:
+            train_end_defined = train_end
+
+        if test_start is None:
+            train_end_idx = self.df.index.get_loc(train_end)
+            test_start_defined = self.df.index[train_end_idx + 1]
+        else:
+            test_start_defined = test_start
+
+        return train_start_defined, train_end_defined, test_start_defined, test_end_defined
+
     def train_test_split(
-        self, train_start: Optional[TTimestamp], train_end: TTimestamp, test_start: TTimestamp, test_end: TTimestamp
+        self,
+        train_start: Optional[TTimestamp],
+        train_end: Optional[TTimestamp],
+        test_start: Optional[TTimestamp],
+        test_end: Optional[TTimestamp],
     ) -> Tuple["TSDataset", "TSDataset"]:
         """Split given df with train-test timestamp indices.
 
         Parameters
         ----------
         train_start:
-            start timestamp of new train dataset
+            start timestamp of new train dataset, if None, the first timestamp is used
         train_end:
-            end timestamp of new train dataset
+            end timestamp of new train dataset, if None, the timestamp previous to test_start is used
         test_start:
-            start timestamp of new test dataset
+            start timestamp of new test dataset, if None, the timestamp next to train_end is used
         test_end:
-            end timestamp of new test dataset
+            end timestamp of new test dataset, if None, the last timestamp is used
 
         Returns
         -------
@@ -517,17 +556,22 @@ def train_test_split(
         2021-02-05     -5.10      0.40      2.15
         2021-02-06     -6.22      0.92      0.97
         """
-        if pd.Timestamp(test_end) > self.df.index.max():
-            raise UserWarning(f"Max timestamp in df is {self.df.index.max()}.")
-        if pd.Timestamp(train_start) < self.df.index.min():
-            raise UserWarning(f"Min timestamp in df is {self.df.index.min()}.")
-        train_df = self.df[train_start:train_end][self.raw_df.columns]  # type: ignore
-        train_raw_df = self.raw_df[train_start:train_end]  # type: ignore
+        train_start_defined, train_end_defined, test_start_defined, test_end_defined = self._find_all_borders(
+            train_start, train_end, test_start, test_end
+        )
+
+        if pd.Timestamp(test_end_defined) > self.df.index.max():
+            warnings.warn(f"Max timestamp in df is {self.df.index.max()}.")
+        if pd.Timestamp(train_start_defined) < self.df.index.min():
+            warnings.warn(f"Min timestamp in df is {self.df.index.min()}.")
+
+        train_df = self.df[train_start_defined:train_end_defined][self.raw_df.columns]  # type: ignore
+        train_raw_df = self.raw_df[train_start_defined:train_end_defined]  # type: ignore
         train = TSDataset(df=train_df, df_exog=self.df_exog, freq=self.freq)
         train.raw_df = train_raw_df
 
-        test_df = self.df[test_start:test_end][self.raw_df.columns]  # type: ignore
-        test_raw_df = self.raw_df[train_start:test_end]  # type: ignore
+        test_df = self.df[test_start_defined:test_end_defined][self.raw_df.columns]  # type: ignore
+        test_raw_df = self.raw_df[train_start_defined:test_end_defined]  # type: ignore
         test = TSDataset(df=test_df, df_exog=self.df_exog, freq=self.freq)
         test.raw_df = test_raw_df
 

diff --git a/etna/ensembles/voting_ensemble.py b/etna/ensembles/voting_ensemble.py
@@ -32,6 +32,15 @@ class VotingEnsemble(Pipeline):
     ...     weights=[0.7, 0.3]
     ... )
     >>> ensemble.fit(ts=ts)
+    VotingEnsemble(pipelines =
+    [Pipeline(model = ProphetModel(growth = 'linear', changepoints = None, n_changepoints = 25,
+    changepoint_range = 0.8, yearly_seasonality = 'auto', weekly_seasonality = 'auto',
+    daily_seasonality = 'auto', holidays = None, seasonality_mode = 'additive',
+    seasonality_prior_scale = 10.0, holidays_prior_scale = 10.0, mcmc_samples = 0,
+    interval_width = 0.8, uncertainty_samples = 1000, stan_backend = None,
+    additional_seasonality_params = (), ), transforms = [], horizon = 7, ),
+    Pipeline(model = NaiveModel(lag = 10, ), transforms = [], horizon = 7, )],
+    weights = [0.7, 0.3], n_jobs = 1, )
     >>> forecast = ensemble.forecast()
     >>> forecast
     segment         segment_0        segment_1       segment_2
@@ -113,6 +122,7 @@ def fit(self, ts: TSDataset) -> "VotingEnsemble":
         self.pipelines = Parallel(n_jobs=self.n_jobs, backend="multiprocessing", verbose=11)(
             delayed(self._fit_pipeline)(pipeline=pipeline, ts=deepcopy(ts)) for pipeline in self.pipelines
         )
+        return self
 
     @staticmethod
     def _forecast_pipeline(pipeline: Pipeline) -> TSDataset:

diff --git a/etna/model_selection/backtest.py b/etna/model_selection/backtest.py
@@ -183,7 +183,7 @@ def _compute_metrics(self, y_true: TSDataset, y_pred: TSDataset) -> Dict[str, fl
         """
         metrics = {}
         for metric in self.metrics:
-            metrics[metric.__repr__()] = metric(y_true=y_true, y_pred=y_pred)
+            metrics[metric.__class__.__name__] = metric(y_true=y_true, y_pred=y_pred)
         return metrics
 
     def get_forecasts(self) -> pd.DataFrame:

diff --git a/etna/models/catboost.py b/etna/models/catboost.py
@@ -49,7 +49,47 @@ def predict(self, df: pd.DataFrame) -> list:
 
 
 class CatBoostModelPerSegment(PerSegmentModel):
-    """Class for holding per segment Catboost model."""
+    """Class for holding per segment Catboost model.
+
+    Examples
+    --------
+    >>> from etna.datasets import generate_periodic_df
+    >>> from etna.datasets import TSDataset
+    >>> from etna.models import CatBoostModelPerSegment
+    >>> from etna.transforms import LagTransform
+    >>> classic_df = generate_periodic_df(
+    ...     periods=100,
+    ...     start_time="2020-01-01",
+    ...     n_segments=4,
+    ...     period=7,
+    ...     sigma=3
+    ... )
+    >>> df = TSDataset.to_dataset(df=classic_df)
+    >>> ts = TSDataset(df, freq="D")
+    >>> horizon = 7
+    >>> transforms = [
+    ...     LagTransform(in_column="target", lags=[horizon, horizon+1, horizon+2])
+    ... ]
+    >>> ts.fit_transform(transforms=transforms)
+    >>> future = ts.make_future(horizon)
+    >>> model = CatBoostModelPerSegment()
+    >>> model.fit(ts=ts)
+    CatBoostModelPerSegment(iterations = None, depth = None, learning_rate = None,
+    logging_level = 'Silent', l2_leaf_reg = None, thread_count = None, )
+    >>> forecast = model.forecast(future)
+    >>> pd.options.display.float_format = '{:,.2f}'.format
+    >>> forecast[:, :, "target"]
+    segment    segment_0 segment_1 segment_2 segment_3
+    feature       target    target    target    target
+    timestamp
+    2020-04-10      9.00      9.00      4.00      6.00
+    2020-04-11      5.00      2.00      7.00      9.00
+    2020-04-12      0.00      4.00      7.00      9.00
+    2020-04-13      0.00      5.00      9.00      7.00
+    2020-04-14      1.00      2.00      1.00      6.00
+    2020-04-15      5.00      7.00      4.00      7.00
+    2020-04-16      8.00      6.00      2.00      0.00
+    """
 
     def __init__(
         self,
@@ -122,7 +162,47 @@ def __init__(
 
 
 class CatBoostModelMultiSegment(Model):
-    """Class for holding Catboost model for all segments."""
+    """Class for holding Catboost model for all segments.
+
+    Examples
+    --------
+    >>> from etna.datasets import generate_periodic_df
+    >>> from etna.datasets import TSDataset
+    >>> from etna.models import CatBoostModelMultiSegment
+    >>> from etna.transforms import LagTransform
+    >>> classic_df = generate_periodic_df(
+    ...     periods=100,
+    ...     start_time="2020-01-01",
+    ...     n_segments=4,
+    ...     period=7,
+    ...     sigma=3
+    ... )
+    >>> df = TSDataset.to_dataset(df=classic_df)
+    >>> ts = TSDataset(df, freq="D")
+    >>> horizon = 7
+    >>> transforms = [
+    ...     LagTransform(in_column="target", lags=[horizon, horizon+1, horizon+2])
+    ... ]
+    >>> ts.fit_transform(transforms=transforms)
+    >>> future = ts.make_future(horizon)
+    >>> model = CatBoostModelMultiSegment()
+    >>> model.fit(ts=ts)
+    CatBoostModelMultiSegment(iterations = None, depth = None, learning_rate = None,
+    logging_level = 'Silent', l2_leaf_reg = None, thread_count = None, )
+    >>> forecast = model.forecast(future)
+    >>> pd.options.display.float_format = '{:,.2f}'.format
+    >>> forecast[:, :, "target"].round()
+    segment    segment_0 segment_1 segment_2 segment_3
+    feature       target    target    target    target
+    timestamp
+    2020-04-10      9.00      9.00      4.00      6.00
+    2020-04-11      5.00      2.00      7.00      9.00
+    2020-04-12     -0.00      4.00      7.00      9.00
+    2020-04-13      0.00      5.00      9.00      7.00
+    2020-04-14      1.00      2.00      1.00      6.00
+    2020-04-15      5.00      7.00      4.00      7.00
+    2020-04-16      8.00      6.00      2.00      0.00
+    """
 
     def __init__(
         self,

diff --git a/etna/models/prophet.py b/etna/models/prophet.py
@@ -129,7 +129,44 @@ def predict(self, df: pd.DataFrame, confidence_interval: bool = False):
 
 
 class ProphetModel(PerSegmentModel):
-    """Class for holding Prophet model."""
+    """Class for holding Prophet model.
+
+    Examples
+    --------
+    >>> from etna.datasets import generate_periodic_df
+    >>> from etna.datasets import TSDataset
+    >>> from etna.models import ProphetModel
+    >>> classic_df = generate_periodic_df(
+    ...     periods=100,
+    ...     start_time="2020-01-01",
+    ...     n_segments=4,
+    ...     period=7,
+    ...     sigma=3
+    ... )
+    >>> df = TSDataset.to_dataset(df=classic_df)
+    >>> ts = TSDataset(df, freq="D")
+    >>> future = ts.make_future(7)
+    >>> model = ProphetModel(growth="flat")
+    >>> model.fit(ts=ts)
+    ProphetModel(growth = 'flat', changepoints = None, n_changepoints = 25,
+    changepoint_range = 0.8, yearly_seasonality = 'auto', weekly_seasonality = 'auto',
+    daily_seasonality = 'auto', holidays = None, seasonality_mode = 'additive',
+    seasonality_prior_scale = 10.0, holidays_prior_scale = 10.0, mcmc_samples = 0,
+    interval_width = 0.8, uncertainty_samples = 1000, stan_backend = None,
+    additional_seasonality_params = (), )
+    >>> forecast = model.forecast(future)
+    >>> forecast
+    segment    segment_0 segment_1 segment_2 segment_3
+    feature       target    target    target    target
+    timestamp
+    2020-04-10      9.00      9.00      4.00      6.00
+    2020-04-11      5.00      2.00      7.00      9.00
+    2020-04-12      0.00      4.00      7.00      9.00
+    2020-04-13      0.00      5.00      9.00      7.00
+    2020-04-14      1.00      2.00      1.00      6.00
+    2020-04-15      5.00      7.00      4.00      7.00
+    2020-04-16      8.00      6.00      2.00      0.00
+    """
 
     def __init__(
         self,
@@ -152,6 +189,7 @@ def __init__(
     ):
         """
         Create instance of Prophet model.
+
         Parameters
         ----------
         growth:
@@ -282,6 +320,7 @@ def _forecast_segment(
     @log_decorator
     def forecast(self, ts: TSDataset, confidence_interval: bool = False) -> TSDataset:
         """Make predictions.
+
         Parameters
         ----------
         ts:
@@ -294,7 +333,7 @@ def forecast(self, ts: TSDataset, confidence_interval: bool = False) -> TSDatase
             Models result
         Notes
         -----
-        The width of the confidence interval is specified in the constructor of ProphetModel setting the interval_width
+        The width of the confidence interval is specified in the constructor of ProphetModel setting the interval_width.
         """
         if self._segments is None:
             raise ValueError("The model is not fitted yet, use fit() to train it")

diff --git a/etna/pipeline/pipeline.py b/etna/pipeline/pipeline.py
@@ -16,7 +16,6 @@
 from etna.loggers import tslogger
 from etna.metrics import Metric
 from etna.metrics import MetricAggregationMode
-from etna.metrics.utils import compute_metrics
 from etna.models.base import Model
 from etna.transforms.base import Transform
 
@@ -150,6 +149,14 @@ def _generate_folds_datasets(
 
             yield train, test
 
+    @staticmethod
+    def _compute_metrics(metrics: List[Metric], y_true: TSDataset, y_pred: TSDataset) -> Dict[str, float]:
+        """Compute metrics for given y_true, y_pred."""
+        metrics_values = {}
+        for metric in metrics:
+            metrics_values[metric.__class__.__name__] = metric(y_true=y_true, y_pred=y_pred)
+        return metrics_values
+
     def _run_fold(
         self,
         train: TSDataset,
@@ -170,7 +177,7 @@ def _run_fold(
             fold[f"{stage_name}_timerange"]["start"] = stage_df.index.min()
             fold[f"{stage_name}_timerange"]["end"] = stage_df.index.max()
         fold["forecast"] = forecast
-        fold["metrics"] = deepcopy(compute_metrics(metrics=metrics, y_true=test, y_pred=forecast))
+        fold["metrics"] = deepcopy(self._compute_metrics(metrics=metrics, y_true=test, y_pred=forecast))
 
         tslogger.log_backtest_run(pd.DataFrame(fold["metrics"]), forecast.to_pandas(), test.to_pandas())
         tslogger.finish_experiment()