Skip to content

Commit

Permalink
Merge branch 'master' into ETNA-813
Browse files Browse the repository at this point in the history
  • Loading branch information
alex-hse-repository authored Oct 15, 2021
2 parents dab6c94 + 3c19e2f commit 2f22bca
Show file tree
Hide file tree
Showing 16 changed files with 1,740 additions and 47 deletions.
9 changes: 7 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- TrendTransform ([#139](https://github.com/tinkoff-ai/etna-ts/pull/139))
- Running notebooks in ci ([#134](https://github.com/tinkoff-ai/etna-ts/issues/134))
- Cluster plotter to EDA ([#169](https://github.com/tinkoff-ai/etna-ts/pull/169))
- Pipeline.backtest method ([#161](https://github.com/tinkoff-ai/etna-ts/pull/161))
- Pipeline.backtest method ([#161](https://github.com/tinkoff-ai/etna-ts/pull/161), [#192](https://github.com/tinkoff-ai/etna-ts/pull/192))
- STLTransform class ([#158](https://github.com/tinkoff-ai/etna-ts/pull/158))
- NN_examples notebook ([#159](https://github.com/tinkoff-ai/etna-ts/pull/159))
- Example for ProphetModel ([#178](https://github.com/tinkoff-ai/etna-ts/pull/178))
- Instruction notebook for custom model and transform creation ([#180](https://github.com/tinkoff-ai/etna-ts/pull/180))
- Add inverse_transform in *OutliersTransform ([#160](https://github.com/tinkoff-ai/etna-ts/pull/160))
- Examples for CatBoostModelMultiSegment and CatBoostModelPerSegment ([#181](https://github.com/tinkoff-ai/etna-ts/pull/181))
- Simplify TSDataset.train_test_split method by allowing some of the border values to be omitted ([#191](https://github.com/tinkoff-ai/etna-ts/pull/191))
- Confidence interval anomalies detection to EDA ([#182](https://github.com/tinkoff-ai/etna-ts/pull/182))

### Changed
Expand All @@ -38,12 +43,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add Correlation heatmap in EDA notebook ([#144](https://github.com/tinkoff-ai/etna-ts/pull/144))
- Add `__repr__` for Pipeline ([#151](https://github.com/tinkoff-ai/etna-ts/pull/151))
- Defined random state for every test case ([#155](https://github.com/tinkoff-ai/etna-ts/pull/155))
- TimeSeriesCrossValidation returns `Metric.__repr__` as a key in `backtest`'s return values ([#161](https://github.com/tinkoff-ai/etna-ts/pull/161))
- Add confidence intervals to Prophet ([#153](https://github.com/tinkoff-ai/etna-ts/pull/153))
- Add confidence intervals to SARIMA ([#172](https://github.com/tinkoff-ai/etna-ts/pull/172))

### Fixed
- Set default value of `TSDataset.head` method ([#170](https://github.com/tinkoff-ai/etna-ts/pull/170))
- Categorical and fillna issues with pandas >=1.2 ([#190](https://github.com/tinkoff-ai/etna-ts/pull/190))

## [1.1.3] - 2021-10-08
### Fixed
Expand Down
3 changes: 2 additions & 1 deletion docs/source/tutorials.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ Tutorials
tutorials/get_started
tutorials/EDA
tutorials/backtest
tutorials/outliers
tutorials/outliers
tutorials/custom_transform_and_model
70 changes: 57 additions & 13 deletions etna/datasets/tsdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,21 +466,60 @@ def to_dataset(df: pd.DataFrame) -> pd.DataFrame:
df.columns.names = ["segment", "feature"]
return df

def _find_all_borders(
    self,
    train_start: Optional[TTimestamp],
    train_end: Optional[TTimestamp],
    test_start: Optional[TTimestamp],
    test_end: Optional[TTimestamp],
) -> Tuple[TTimestamp, TTimestamp, TTimestamp, TTimestamp]:
    """Find borders for train_test_split if some values weren't specified.

    Parameters
    ----------
    train_start:
        start timestamp of train part, if None the minimum of ``self.df.index`` is used
    train_end:
        end timestamp of train part, if None the index entry right before ``test_start`` is used
    test_start:
        start timestamp of test part, if None the index entry right after ``train_end`` is used
    test_end:
        end timestamp of test part, if None the maximum of ``self.df.index`` is used

    Returns
    -------
    Tuple of (train_start, train_end, test_start, test_end) with every missing value filled in.

    Raises
    ------
    ValueError:
        if both ``train_end`` and ``test_start`` are None, if ``test_start`` is the first
        timestamp of the index (there is nothing left for train), or if ``train_end`` is the
        last timestamp of the index (there is nothing left for test)
    """
    if train_end is None and test_start is None:
        raise ValueError("One of train_end or test_start should be defined")

    index = self.df.index
    train_start_defined = index.min() if train_start is None else train_start
    test_end_defined = index.max() if test_end is None else test_end

    if train_end is None:
        # Position lookup fails inside pandas if test_start is not present in the index.
        test_start_idx = index.get_loc(test_start)
        # Without this guard, position -1 would silently wrap around to the last timestamp.
        if test_start_idx == 0:
            raise ValueError(
                "test_start is the first timestamp in df, there is no previous timestamp to use as train_end"
            )
        train_end_defined = index[test_start_idx - 1]
    else:
        train_end_defined = train_end

    if test_start is None:
        train_end_idx = index.get_loc(train_end)
        # Report a clear error instead of an out-of-bounds IndexError.
        if train_end_idx + 1 >= len(index):
            raise ValueError(
                "train_end is the last timestamp in df, there is no next timestamp to use as test_start"
            )
        test_start_defined = index[train_end_idx + 1]
    else:
        test_start_defined = test_start

    return train_start_defined, train_end_defined, test_start_defined, test_end_defined

def train_test_split(
self, train_start: Optional[TTimestamp], train_end: TTimestamp, test_start: TTimestamp, test_end: TTimestamp
self,
train_start: Optional[TTimestamp],
train_end: Optional[TTimestamp],
test_start: Optional[TTimestamp],
test_end: Optional[TTimestamp],
) -> Tuple["TSDataset", "TSDataset"]:
"""Split given df with train-test timestamp indices.
Parameters
----------
train_start:
start timestamp of new train dataset
start timestamp of new train dataset, if None first timestamp is used
train_end:
end timestamp of new train dataset
end timestamp of new train dataset, if None previous to test_start timestamp is used
test_start:
start timestamp of new test dataset
start timestamp of new test dataset, if None next to train_end timestamp is used
test_end:
end timestamp of new test dataset
end timestamp of new test dataset, if None last timestamp is used
Returns
-------
Expand Down Expand Up @@ -517,17 +556,22 @@ def train_test_split(
2021-02-05 -5.10 0.40 2.15
2021-02-06 -6.22 0.92 0.97
"""
if pd.Timestamp(test_end) > self.df.index.max():
raise UserWarning(f"Max timestamp in df is {self.df.index.max()}.")
if pd.Timestamp(train_start) < self.df.index.min():
raise UserWarning(f"Min timestamp in df is {self.df.index.min()}.")
train_df = self.df[train_start:train_end][self.raw_df.columns] # type: ignore
train_raw_df = self.raw_df[train_start:train_end] # type: ignore
train_start_defined, train_end_defined, test_start_defined, test_end_defined = self._find_all_borders(
train_start, train_end, test_start, test_end
)

if pd.Timestamp(test_end_defined) > self.df.index.max():
warnings.warn(f"Max timestamp in df is {self.df.index.max()}.")
if pd.Timestamp(train_start_defined) < self.df.index.min():
warnings.warn(f"Min timestamp in df is {self.df.index.min()}.")

train_df = self.df[train_start_defined:train_end_defined][self.raw_df.columns] # type: ignore
train_raw_df = self.raw_df[train_start_defined:train_end_defined] # type: ignore
train = TSDataset(df=train_df, df_exog=self.df_exog, freq=self.freq)
train.raw_df = train_raw_df

test_df = self.df[test_start:test_end][self.raw_df.columns] # type: ignore
test_raw_df = self.raw_df[train_start:test_end] # type: ignore
test_df = self.df[test_start_defined:test_end_defined][self.raw_df.columns] # type: ignore
test_raw_df = self.raw_df[train_start_defined:test_end_defined] # type: ignore
test = TSDataset(df=test_df, df_exog=self.df_exog, freq=self.freq)
test.raw_df = test_raw_df

Expand Down
10 changes: 10 additions & 0 deletions etna/ensembles/voting_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,15 @@ class VotingEnsemble(Pipeline):
... weights=[0.7, 0.3]
... )
>>> ensemble.fit(ts=ts)
VotingEnsemble(pipelines =
[Pipeline(model = ProphetModel(growth = 'linear', changepoints = None, n_changepoints = 25,
changepoint_range = 0.8, yearly_seasonality = 'auto', weekly_seasonality = 'auto',
daily_seasonality = 'auto', holidays = None, seasonality_mode = 'additive',
seasonality_prior_scale = 10.0, holidays_prior_scale = 10.0, mcmc_samples = 0,
interval_width = 0.8, uncertainty_samples = 1000, stan_backend = None,
additional_seasonality_params = (), ), transforms = [], horizon = 7, ),
Pipeline(model = NaiveModel(lag = 10, ), transforms = [], horizon = 7, )],
weights = [0.7, 0.3], n_jobs = 1, )
>>> forecast = ensemble.forecast()
>>> forecast
segment segment_0 segment_1 segment_2
Expand Down Expand Up @@ -113,6 +122,7 @@ def fit(self, ts: TSDataset) -> "VotingEnsemble":
self.pipelines = Parallel(n_jobs=self.n_jobs, backend="multiprocessing", verbose=11)(
delayed(self._fit_pipeline)(pipeline=pipeline, ts=deepcopy(ts)) for pipeline in self.pipelines
)
return self

@staticmethod
def _forecast_pipeline(pipeline: Pipeline) -> TSDataset:
Expand Down
2 changes: 1 addition & 1 deletion etna/model_selection/backtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def _compute_metrics(self, y_true: TSDataset, y_pred: TSDataset) -> Dict[str, fl
"""
metrics = {}
for metric in self.metrics:
metrics[metric.__repr__()] = metric(y_true=y_true, y_pred=y_pred)
metrics[metric.__class__.__name__] = metric(y_true=y_true, y_pred=y_pred)
return metrics

def get_forecasts(self) -> pd.DataFrame:
Expand Down
84 changes: 82 additions & 2 deletions etna/models/catboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,47 @@ def predict(self, df: pd.DataFrame) -> list:


class CatBoostModelPerSegment(PerSegmentModel):
"""Class for holding per segment Catboost model."""
"""Class for holding per segment Catboost model.
Examples
--------
>>> from etna.datasets import generate_periodic_df
>>> from etna.datasets import TSDataset
>>> from etna.models import CatBoostModelPerSegment
>>> from etna.transforms import LagTransform
>>> classic_df = generate_periodic_df(
... periods=100,
... start_time="2020-01-01",
... n_segments=4,
... period=7,
... sigma=3
... )
>>> df = TSDataset.to_dataset(df=classic_df)
>>> ts = TSDataset(df, freq="D")
>>> horizon = 7
>>> transforms = [
... LagTransform(in_column="target", lags=[horizon, horizon+1, horizon+2])
... ]
>>> ts.fit_transform(transforms=transforms)
>>> future = ts.make_future(horizon)
>>> model = CatBoostModelPerSegment()
>>> model.fit(ts=ts)
CatBoostModelPerSegment(iterations = None, depth = None, learning_rate = None,
logging_level = 'Silent', l2_leaf_reg = None, thread_count = None, )
>>> forecast = model.forecast(future)
>>> pd.options.display.float_format = '{:,.2f}'.format
>>> forecast[:, :, "target"]
segment segment_0 segment_1 segment_2 segment_3
feature target target target target
timestamp
2020-04-10 9.00 9.00 4.00 6.00
2020-04-11 5.00 2.00 7.00 9.00
2020-04-12 0.00 4.00 7.00 9.00
2020-04-13 0.00 5.00 9.00 7.00
2020-04-14 1.00 2.00 1.00 6.00
2020-04-15 5.00 7.00 4.00 7.00
2020-04-16 8.00 6.00 2.00 0.00
"""

def __init__(
self,
Expand Down Expand Up @@ -122,7 +162,47 @@ def __init__(


class CatBoostModelMultiSegment(Model):
"""Class for holding Catboost model for all segments."""
"""Class for holding Catboost model for all segments.
Examples
--------
>>> from etna.datasets import generate_periodic_df
>>> from etna.datasets import TSDataset
>>> from etna.models import CatBoostModelMultiSegment
>>> from etna.transforms import LagTransform
>>> classic_df = generate_periodic_df(
... periods=100,
... start_time="2020-01-01",
... n_segments=4,
... period=7,
... sigma=3
... )
>>> df = TSDataset.to_dataset(df=classic_df)
>>> ts = TSDataset(df, freq="D")
>>> horizon = 7
>>> transforms = [
... LagTransform(in_column="target", lags=[horizon, horizon+1, horizon+2])
... ]
>>> ts.fit_transform(transforms=transforms)
>>> future = ts.make_future(horizon)
>>> model = CatBoostModelMultiSegment()
>>> model.fit(ts=ts)
CatBoostModelMultiSegment(iterations = None, depth = None, learning_rate = None,
logging_level = 'Silent', l2_leaf_reg = None, thread_count = None, )
>>> forecast = model.forecast(future)
>>> pd.options.display.float_format = '{:,.2f}'.format
>>> forecast[:, :, "target"].round()
segment segment_0 segment_1 segment_2 segment_3
feature target target target target
timestamp
2020-04-10 9.00 9.00 4.00 6.00
2020-04-11 5.00 2.00 7.00 9.00
2020-04-12 -0.00 4.00 7.00 9.00
2020-04-13 0.00 5.00 9.00 7.00
2020-04-14 1.00 2.00 1.00 6.00
2020-04-15 5.00 7.00 4.00 7.00
2020-04-16 8.00 6.00 2.00 0.00
"""

def __init__(
self,
Expand Down
43 changes: 41 additions & 2 deletions etna/models/prophet.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,44 @@ def predict(self, df: pd.DataFrame, confidence_interval: bool = False):


class ProphetModel(PerSegmentModel):
"""Class for holding Prophet model."""
"""Class for holding Prophet model.
Examples
--------
>>> from etna.datasets import generate_periodic_df
>>> from etna.datasets import TSDataset
>>> from etna.models import ProphetModel
>>> classic_df = generate_periodic_df(
... periods=100,
... start_time="2020-01-01",
... n_segments=4,
... period=7,
... sigma=3
... )
>>> df = TSDataset.to_dataset(df=classic_df)
>>> ts = TSDataset(df, freq="D")
>>> future = ts.make_future(7)
>>> model = ProphetModel(growth="flat")
>>> model.fit(ts=ts)
ProphetModel(growth = 'flat', changepoints = None, n_changepoints = 25,
changepoint_range = 0.8, yearly_seasonality = 'auto', weekly_seasonality = 'auto',
daily_seasonality = 'auto', holidays = None, seasonality_mode = 'additive',
seasonality_prior_scale = 10.0, holidays_prior_scale = 10.0, mcmc_samples = 0,
interval_width = 0.8, uncertainty_samples = 1000, stan_backend = None,
additional_seasonality_params = (), )
>>> forecast = model.forecast(future)
>>> forecast
segment segment_0 segment_1 segment_2 segment_3
feature target target target target
timestamp
2020-04-10 9.00 9.00 4.00 6.00
2020-04-11 5.00 2.00 7.00 9.00
2020-04-12 0.00 4.00 7.00 9.00
2020-04-13 0.00 5.00 9.00 7.00
2020-04-14 1.00 2.00 1.00 6.00
2020-04-15 5.00 7.00 4.00 7.00
2020-04-16 8.00 6.00 2.00 0.00
"""

def __init__(
self,
Expand All @@ -152,6 +189,7 @@ def __init__(
):
"""
Create instance of Prophet model.
Parameters
----------
growth:
Expand Down Expand Up @@ -282,6 +320,7 @@ def _forecast_segment(
@log_decorator
def forecast(self, ts: TSDataset, confidence_interval: bool = False) -> TSDataset:
"""Make predictions.
Parameters
----------
ts:
Expand All @@ -294,7 +333,7 @@ def forecast(self, ts: TSDataset, confidence_interval: bool = False) -> TSDatase
Models result
Notes
-----
The width of the confidence interval is specified in the constructor of ProphetModel setting the interval_width
The width of the confidence interval is specified in the constructor of ProphetModel setting the interval_width.
"""
if self._segments is None:
raise ValueError("The model is not fitted yet, use fit() to train it")
Expand Down
11 changes: 9 additions & 2 deletions etna/pipeline/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from etna.loggers import tslogger
from etna.metrics import Metric
from etna.metrics import MetricAggregationMode
from etna.metrics.utils import compute_metrics
from etna.models.base import Model
from etna.transforms.base import Transform

Expand Down Expand Up @@ -150,6 +149,14 @@ def _generate_folds_datasets(

yield train, test

@staticmethod
def _compute_metrics(metrics: List[Metric], y_true: TSDataset, y_pred: TSDataset) -> Dict[str, float]:
    """Evaluate each of the given metrics on a pair of true and predicted datasets.

    Parameters
    ----------
    metrics:
        metrics to evaluate; each one is called as ``metric(y_true=..., y_pred=...)``
    y_true:
        dataset with true values
    y_pred:
        dataset with predicted values

    Returns
    -------
    dict that maps the class name of each metric to the value the metric returned;
    if several metrics share a class name, the last one in ``metrics`` wins
    """
    return {metric.__class__.__name__: metric(y_true=y_true, y_pred=y_pred) for metric in metrics}

def _run_fold(
self,
train: TSDataset,
Expand All @@ -170,7 +177,7 @@ def _run_fold(
fold[f"{stage_name}_timerange"]["start"] = stage_df.index.min()
fold[f"{stage_name}_timerange"]["end"] = stage_df.index.max()
fold["forecast"] = forecast
fold["metrics"] = deepcopy(compute_metrics(metrics=metrics, y_true=test, y_pred=forecast))
fold["metrics"] = deepcopy(self._compute_metrics(metrics=metrics, y_true=test, y_pred=forecast))

tslogger.log_backtest_run(pd.DataFrame(fold["metrics"]), forecast.to_pandas(), test.to_pandas())
tslogger.finish_experiment()
Expand Down
Loading

0 comments on commit 2f22bca

Please sign in to comment.