Skip to content

Add target components handling in get_level_dataframe #1179

Merged
merged 25 commits into from
Mar 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
1e75a7a
Rework get_level_dataframe
alex-hse-repository Mar 20, 2023
90fa18e
Rework tests for get_level_dataframe
alex-hse-repository Mar 20, 2023
c205a58
Rework get_level_dataset
alex-hse-repository Mar 21, 2023
e5e480c
Rework tests
alex-hse-repository Mar 21, 2023
f31ef01
Add get_level_dataset test with components
alex-hse-repository Mar 21, 2023
00ba286
Improve reconcile method
alex-hse-repository Mar 21, 2023
e34706a
Add test for reconciliation
alex-hse-repository Mar 21, 2023
e7f5053
fix bug in slicing
alex-hse-repository Mar 23, 2023
b2f3c86
Merge branch 'master' of https://github.com/tinkoff-ai/etna into issu…
alex-hse-repository Mar 23, 2023
b1d1be0
Fixes after merge
alex-hse-repository Mar 23, 2023
668bb1f
Update changelog
alex-hse-repository Mar 23, 2023
93d1c2f
fix tests
alex-hse-repository Mar 23, 2023
59c68bd
Merge branch 'master' of https://github.com/tinkoff-ai/etna into issu…
alex-hse-repository Mar 23, 2023
6deb055
Rename fixtures in tests
alex-hse-repository Mar 24, 2023
fac6459
Add target_quantiles_names property to dataset
alex-hse-repository Mar 24, 2023
e703e2f
Update changelog
alex-hse-repository Mar 27, 2023
4f0a861
Change default value of target_components names
alex-hse-repository Mar 27, 2023
1104cce
Add comments about the slicing
alex-hse-repository Mar 27, 2023
9840e80
Fix reconcilers
alex-hse-repository Mar 27, 2023
f93e6c8
Small fixes
alex-hse-repository Mar 27, 2023
de06345
Merge branch 'master' of https://github.com/tinkoff-ai/etna into issu…
alex-hse-repository Mar 27, 2023
83880b2
Review fixes
alex-hse-repository Mar 29, 2023
dab1247
Merge branch 'master' of https://github.com/tinkoff-ai/etna into issu…
alex-hse-repository Mar 29, 2023
5415a78
Merge branch 'master' of https://github.com/tinkoff-ai/etna into issu…
alex-hse-repository Mar 29, 2023
8e169ef
Fixes for low versions of pandas
alex-hse-repository Mar 29, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased
### Added
- Add target components handling in `get_level_dataframe` ([#1179](https://github.com/tinkoff-ai/etna/pull/1179))
- Forecast decomposition for `SeasonalMovingAverageModel`([#1180](https://github.com/tinkoff-ai/etna/pull/1180))
- Target components logic into base classes of pipelines ([#1173](https://github.com/tinkoff-ai/etna/pull/1173))
- Method `predict_components` for forecast decomposition in `_SklearnAdapter` and `_LinearAdapter` for linear models ([#1164](https://github.com/tinkoff-ai/etna/pull/1164))
Expand Down
50 changes: 31 additions & 19 deletions etna/datasets/tsdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
from etna.datasets.hierarchical_structure import HierarchicalStructure
from etna.datasets.utils import _TorchDataset
from etna.datasets.utils import get_level_dataframe
from etna.datasets.utils import get_target_with_quantiles
from etna.datasets.utils import inverse_transform_target_components
from etna.datasets.utils import match_target_quantiles
from etna.loggers import tslogger

if TYPE_CHECKING:
Expand Down Expand Up @@ -164,7 +164,9 @@ def __init__(
if self.current_df_level == self.current_df_exog_level:
self.df = self._merge_exog(self.df)

self._target_components_names: Optional[List[str]] = None
self._target_components_names: Tuple[str, ...] = tuple()

self.df = self.df.sort_index(axis=1, level=("segment", "feature"))

def _get_dataframe_level(self, df: pd.DataFrame) -> Optional[str]:
"""Return the level of the passed dataframe in hierarchical structure."""
Expand Down Expand Up @@ -429,7 +431,7 @@ def inverse_transform(self, transforms: Sequence["Transform"]):
# TODO: return regressors after inverse_transform
# Logic with target components is here for performance reasons.
# This way we avoid doing the inverse transformation for components several times.
target_components_present = self.target_components_names is not None
target_components_present = len(self.target_components_names) > 0
target_df, target_components_df = None, None
if target_components_present:
target_df = self.to_pandas(features=["target"])
Expand Down Expand Up @@ -494,10 +496,15 @@ def regressors(self) -> List[str]:
return self._regressors

@property
def target_components_names(self) -> Optional[List[str]]:
"""Get list of target components names. Components sum up to target. If there are no components, None is returned."""
def target_components_names(self) -> Tuple[str, ...]:
"""Get tuple with target components names. Components sum up to target. Return the empty tuple in case of components absence."""
return self._target_components_names

@property
def target_quantiles_names(self) -> Tuple[str, ...]:
"""Get tuple with target quantiles names. Return the empty tuple in case of quantile absence."""
return tuple(match_target_quantiles(features=set(self.columns.get_level_values("feature"))))

def plot(
self,
n_segments: int = 10,
Expand Down Expand Up @@ -1044,9 +1051,7 @@ def drop_features(self, features: List[str], drop_from_exog: bool = False):
ValueError:
If ``features`` list contains target components
"""
features_contain_target_components = (self.target_components_names is not None) and (
len(set(features).intersection(self.target_components_names)) != 0
)
features_contain_target_components = len(set(features).intersection(self.target_components_names)) > 0
if features_contain_target_components:
raise ValueError(
"Target components can't be dropped from the dataset using this method! Use `drop_target_components` method!"
Expand Down Expand Up @@ -1112,21 +1117,26 @@ def get_level_dataset(self, target_level: str) -> "TSDataset":
if target_level_index > current_level_index:
raise ValueError("Target level should be higher in the hierarchy than the current level of dataframe!")

target_names = self.target_quantiles_names + self.target_components_names + ("target",)

if target_level_index < current_level_index:
summing_matrix = self.hierarchical_structure.get_summing_matrix(
target_level=target_level, source_level=self.current_df_level
)

target_level_df = get_level_dataframe(
df=self.df,
df=self.to_pandas(features=target_names),
mapping_matrix=summing_matrix,
source_level_segments=current_level_segments,
target_level_segments=target_level_segments,
)

else:
target_names = tuple(get_target_with_quantiles(columns=self.columns))
target_level_df = self[:, current_level_segments, target_names]
target_level_df = self.to_pandas(features=target_names)

target_components_df = target_level_df.loc[:, pd.IndexSlice[:, self.target_components_names]]
if len(self.target_components_names) > 0: # for pandas >=1.1, <1.2
target_level_df = target_level_df.drop(columns=list(self.target_components_names), level="feature")

ts = TSDataset(
df=target_level_df,
Expand All @@ -1135,7 +1145,9 @@ def get_level_dataset(self, target_level: str) -> "TSDataset":
known_future=self.known_future,
hierarchical_structure=self.hierarchical_structure,
)
ts._target_components_names = self._target_components_names

if len(self.target_components_names) > 0:
ts.add_target_components(target_components_df=target_components_df)
Mr-Geekman marked this conversation as resolved.
Show resolved Hide resolved
return ts

def add_target_components(self, target_components_df: pd.DataFrame):
Expand All @@ -1155,7 +1167,7 @@ def add_target_components(self, target_components_df: pd.DataFrame):
ValueError:
If components don't sum up to target
"""
if self._target_components_names is not None:
if len(self.target_components_names) > 0:
Mr-Geekman marked this conversation as resolved.
Show resolved Hide resolved
raise ValueError("Dataset already contains target components!")

components_names = sorted(target_components_df[self.segments[0]].columns.get_level_values("feature"))
Expand All @@ -1170,7 +1182,7 @@ def add_target_components(self, target_components_df: pd.DataFrame):
if not np.allclose(components_sum.values, self[..., "target"].values):
raise ValueError("Components don't sum up to target!")

self._target_components_names = components_names
self._target_components_names = tuple(components_names)
self.df = (
pd.concat((self.df, target_components_df), axis=1)
.loc[self.df.index]
Expand All @@ -1185,15 +1197,15 @@ def get_target_components(self) -> Optional[pd.DataFrame]:
:
Dataframe with target components
"""
if self._target_components_names is None:
if len(self.target_components_names) == 0:
return None
return self.to_pandas(features=self._target_components_names)
return self.to_pandas(features=self.target_components_names)

def drop_target_components(self):
"""Drop target components from dataset."""
if self._target_components_names is not None:
self.df.drop(columns=self.target_components_names, level="feature", inplace=True)
self._target_components_names = None
if len(self.target_components_names) > 0: # for pandas >=1.1, <1.2
self.df.drop(columns=list(self.target_components_names), level="feature", inplace=True)
self._target_components_names = ()

@property
def columns(self) -> pd.core.indexes.multi.MultiIndex:
Expand Down
40 changes: 19 additions & 21 deletions etna/datasets/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,24 +216,20 @@ def get_level_dataframe(
mapping_matrix:
mapping matrix between levels
source_level_segments:
tuple of segments at the source level
list of segments at the source level, set the order of segments matching the mapping matrix
target_level_segments:
tuple of segments at the target level
list of segments at the target level

Returns
-------
:
dataframe at the target level
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved
"""
target_names = tuple(get_target_with_quantiles(columns=df.columns))

num_target_names = len(target_names)
column_names = sorted(set(df.columns.get_level_values("feature")))
num_columns = len(column_names)
num_source_level_segments = len(source_level_segments)
num_target_level_segments = len(target_level_segments)

if len(target_names) == 0:
raise ValueError("Provided dataframe has no columns with the target variable!")

if set(df.columns.get_level_values(level="segment")) != set(source_level_segments):
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved
raise ValueError("Segments mismatch for provided dataframe and `source_level_segments`!")

Expand All @@ -243,30 +239,32 @@ def get_level_dataframe(
if num_target_level_segments != mapping_matrix.shape[0]:
raise ValueError("Number of target level segments do not match mapping matrix number of columns!")

df = df.loc[:, pd.IndexSlice[source_level_segments, target_names]]

source_level_data = df.values # shape: (t, num_source_level_segments * num_target_names)
# Slice should be done by source_level_segments -- to fix the order of segments for mapping matrix,
# by num_columns -- to fix the order of columns to create correct index in the end
source_level_data = df.loc[
pd.IndexSlice[:], pd.IndexSlice[source_level_segments, column_names]
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved
].values # shape: (t, num_source_level_segments * num_columns)

source_level_data = source_level_data.reshape(
(-1, num_source_level_segments, num_target_names)
) # shape: (t, num_source_level_segments, num_target_names)
source_level_data = np.swapaxes(source_level_data, 1, 2) # shape: (t, num_target_names, num_source_level_segments)
(-1, num_source_level_segments, num_columns)
) # shape: (t, num_source_level_segments, num_columns)
source_level_data = np.swapaxes(source_level_data, 1, 2) # shape: (t, num_columns, num_source_level_segments)
source_level_data = source_level_data.reshape(
(-1, num_source_level_segments)
) # shape: (t * num_target_names, num_source_level_segments)
) # shape: (t * num_columns, num_source_level_segments)

target_level_data = source_level_data @ mapping_matrix.T

target_level_data = target_level_data.reshape(
(-1, num_target_names, num_target_level_segments)
) # shape: (t, num_target_names, num_target_level_segments)
target_level_data = np.swapaxes(target_level_data, 1, 2) # shape: (t, num_target_level_segments, num_target_names)
(-1, num_columns, num_target_level_segments)
) # shape: (t, num_columns, num_target_level_segments)
target_level_data = np.swapaxes(target_level_data, 1, 2) # shape: (t, num_target_level_segments, num_columns)
target_level_data = target_level_data.reshape(
(-1, num_target_names * num_target_level_segments)
) # shape: (t, num_target_level_segments * num_target_names)
(-1, num_columns * num_target_level_segments)
) # shape: (t, num_target_level_segments * num_columns)

target_level_segments = pd.MultiIndex.from_product(
[target_level_segments, target_names], names=["segment", "feature"]
[target_level_segments, column_names], names=["segment", "feature"]
)
target_level_df = pd.DataFrame(data=target_level_data, index=df.index, columns=target_level_segments)

Expand Down
11 changes: 10 additions & 1 deletion etna/reconciliation/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from abc import abstractmethod
from typing import Optional

import pandas as pd
from scipy.sparse import csr_matrix

from etna.core import BaseMixin
Expand Down Expand Up @@ -83,18 +84,26 @@ def reconcile(self, ts: TSDataset) -> TSDataset:
current_level_segments = ts.hierarchical_structure.get_level_segments(level_name=self.source_level)
target_level_segments = ts.hierarchical_structure.get_level_segments(level_name=self.target_level)

target_names = ts.target_quantiles_names + ts.target_components_names + ("target",)

df_reconciled = get_level_dataframe(
df=ts.to_pandas(),
df=ts.to_pandas(features=target_names),
mapping_matrix=self.mapping_matrix,
source_level_segments=current_level_segments,
target_level_segments=target_level_segments,
)

target_components_df = df_reconciled.loc[:, pd.IndexSlice[:, ts.target_components_names]]
df_reconciled = df_reconciled.drop(columns=list(ts.target_components_names), level="feature")

ts_reconciled = TSDataset(
df=df_reconciled,
freq=ts.freq,
df_exog=ts.df_exog,
known_future=ts.known_future,
hierarchical_structure=ts.hierarchical_structure,
)

if len(ts.target_components_names) > 0:
ts_reconciled.add_target_components(target_components_df=target_components_df)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May be we should make add_target_components immute to None value.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can create private method which can handle None, but not public

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure it is useful

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here about target_components_df.

return ts_reconciled
2 changes: 1 addition & 1 deletion etna/transforms/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def inverse_transform(self, ts: TSDataset) -> TSDataset:
TSDataset after applying inverse transformation.
"""
required_features = self._get_inverse_transform_required_features(ts)
target_components_present = "target" in required_features and ts.target_components_names is not None
target_components_present = "target" in required_features and len(ts.target_components_names) > 0
target_df = None
if target_components_present:
target_df = ts.to_pandas(flatten=False, features=["target"])
Expand Down
Loading