Skip to content

Add FourierTransform #306

Merged
merged 7 commits into from
Nov 30, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Spell checking for source code and md files ([#303](https://github.com/tinkoff-ai/etna/pull/303))
- ResampleWithDistributionTransform ([#296](https://github.com/tinkoff-ai/etna/pull/296))
- Add function to duplicate exogenous data ([#305](https://github.com/tinkoff-ai/etna/pull/305))
- FourierTransform ([#306](https://github.com/tinkoff-ai/etna/pull/306))

### Changed
- Rename confidence interval to prediction interval, start working with quantiles instead of interval_width ([#285](https://github.com/tinkoff-ai/etna/pull/285))
Expand Down
1 change: 1 addition & 0 deletions etna/transforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from etna.transforms.detrend import TheilSenTrendTransform
from etna.transforms.feature_importance import TreeFeatureSelectionTransform
from etna.transforms.filter import FilterFeaturesTransform
from etna.transforms.fourier import FourierTransform
from etna.transforms.gale_shapley import GaleShapleyFeatureSelectionTransform
from etna.transforms.imputation import TimeSeriesImputerTransform
from etna.transforms.lags import LagTransform
Expand Down
135 changes: 135 additions & 0 deletions etna/transforms/fourier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import math
from typing import Optional
from typing import Sequence

import numpy as np
import pandas as pd

from etna.transforms.base import Transform


class FourierTransform(Transform):
    """Adds fourier features to the dataset.

    Every requested harmonic ("mod") becomes one extra column that is shared by
    all segments: odd mods produce a sine, even mods produce a cosine, and the
    Fourier order of a mod is ``(mod + 1) // 2``.
    """

    def __init__(
        self,
        period: float,
        order: Optional[int] = None,
        mods: Optional[Sequence[int]] = None,
        out_column: Optional[str] = None,
    ):
        """Create instance of FourierTransform.

        Parameters
        ----------
        period:
            the period of the seasonality to capture in frequency units of time series, it should be >= 2
        order:
            upper order of Fourier components to include, it should be >= 1 and <= ceil(period/2)
        mods:
            alternative and precise way of defining which harmonics will be used,
            for example `mods=[1, 3, 4]` means that sin of the first order
            and sin and cos of the second order will be used,
            mods should be non-empty and every mod should be >= 1 and < period
        out_column:
            if set, name of added column, the final name will be '{out_column}_{mod}',
            don't forget to add 'regressor_' prefix
            if not set, name will be 'regressor_{repr}', repr will represent class that creates exactly this column

        Raises
        ------
        ValueError:
            if period < 2
        ValueError:
            if both or none of order, mods is set
        ValueError:
            if order is < 1 or > ceil(period/2)
        ValueError:
            if mods is empty
        ValueError:
            if at least one mod is < 1 or >= period

        Notes
        -----
        To understand how transform works we recommend: https://otexts.com/fpp2/useful-predictors.html#fourier-series

        * Parameter `period` is responsible for the seasonality we want to capture.
        * Parameters `order` and `mods` define which harmonics will be used.

        Parameter `order` is a more user-friendly version of `mods`.
        For example, `order=2` can be represented as `mods=[1, 2, 3, 4]` if `period` > 4,
        as `mods=[1, 2, 3]` if 3 < `period` <= 4 and as `mods=[1, 2]` if 2 < `period` <= 3.
        """
        if period < 2:
            raise ValueError("Period should be at least 2")
        self.period = period
        self.mods: Sequence[int]

        if order is not None and mods is None:
            if order < 1 or order > math.ceil(period / 2):
                raise ValueError("Order should be within [1, ceil(period/2)] range")
            # An order contributes a sin (odd mod) and a cos (even mod); mods that
            # reach the period are dropped.
            self.mods = [mod for mod in range(1, 2 * order + 1) if mod < period]
        elif mods is not None and order is None:
            # Without this guard min()/max() would fail with an unrelated
            # "arg is an empty sequence" message.
            if len(mods) == 0:
                raise ValueError("At least one mod should be given")
            if min(mods) < 1 or max(mods) >= period:
                raise ValueError("Every mod should be within [1, int(period)) range")
            self.mods = mods
        else:
            raise ValueError("There should be exactly one option set: order or mods")

        self.out_column = out_column

    def fit(self, df: pd.DataFrame) -> "FourierTransform":
        """Fit method does nothing and is kept for compatibility.

        Parameters
        ----------
        df:
            dataframe with data.

        Returns
        -------
        result: FourierTransform
        """
        return self

    def _get_column_name(self, mod: int) -> str:
        """Return the name of the column that holds the harmonic `mod`.

        Without `out_column` the name is 'regressor_' followed by the repr of a
        transform configured with the same period and only this mod.
        """
        if self.out_column is None:
            return f"regressor_{repr(FourierTransform(period=self.period, mods=[mod]))}"
        return f"{self.out_column}_{mod}"

    @staticmethod
    def _construct_answer(df: pd.DataFrame, features: pd.DataFrame) -> pd.DataFrame:
        """Join `features` to every segment of `df` and rebuild the two-level columns.

        Parameters
        ----------
        df:
            dataframe whose columns have a "segment" level.
        features:
            single-level dataframe of features indexed like `df`.

        Returns
        -------
        result: pd.DataFrame
            dataframe with sorted ("segment", "feature") columns
        """
        dataframes = []
        for seg in df.columns.get_level_values("segment").unique():
            tmp = df[seg].join(features)
            # Put the segment name back as the first column level.
            _idx = tmp.columns.to_frame()
            _idx.insert(0, "segment", seg)
            tmp.columns = pd.MultiIndex.from_frame(_idx)
            dataframes.append(tmp)

        result = pd.concat(dataframes, axis=1).sort_index(axis=1)
        result.columns.names = ["segment", "feature"]
        return result

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add harmonics to the dataset.

        Parameters
        ----------
        df:
            dataframe with data to transform.

        Returns
        -------
        result: pd.DataFrame
            transformed dataframe
        """
        features = pd.DataFrame(index=df.index)
        # Time is measured as the row position inside `df`, so the phase of every
        # harmonic is anchored to the first row of the frame passed in.
        elapsed = np.arange(features.shape[0]) / self.period

        for mod in self.mods:
            order = (mod + 1) // 2
            is_cos = mod % 2 == 0

            # sin(x + pi/2) == cos(x): even mods are shifted by a quarter of a period.
            features[self._get_column_name(mod)] = np.sin(2 * np.pi * order * elapsed + np.pi / 2 * is_cos)

        return self._construct_answer(df, features)
136 changes: 136 additions & 0 deletions tests/test_transforms/test_fourier_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import numpy as np
import pandas as pd
import pytest

from etna.datasets import TSDataset
from etna.metrics import R2
from etna.models import LinearPerSegmentModel
from etna.transforms import FourierTransform


def add_seasonality(series: pd.Series, period: int, magnitude: float) -> pd.Series:
    """Return a new series equal to `series` plus a sine wave.

    The wave is ``magnitude * sin(2 * pi * i / period)`` where ``i`` is the
    position of the value in the series. The input series is left untouched.
    """
    positions = np.arange(series.shape[0])
    wave = np.sin(2 * np.pi * positions / period) * magnitude
    return series + wave


def get_one_df(period_1, period_2, magnitude_1, magnitude_2):
    """Build a daily frame with "timestamp" and "target" columns.

    The target covers 2020-01-01 .. 2021-01-01 and is the sum of a second-order
    sine with period `period_1`, a third-order cosine with period `period_2`
    and gaussian noise with scale 0.05 drawn from numpy's global random state.
    """
    dates = pd.date_range(start="2020-01-01", end="2021-01-01", freq="D")
    steps = np.arange(dates.shape[0])
    sin_part = np.sin(2 * np.pi * steps * 2 / period_1) * magnitude_1
    cos_part = np.cos(2 * np.pi * steps * 3 / period_2) * magnitude_2
    noise = np.random.normal(scale=0.05, size=dates.shape[0])
    return pd.DataFrame({"timestamp": dates, "target": sin_part + cos_part + noise})


@pytest.fixture
def ts_trend_seasonal(random_seed) -> TSDataset:
    """Return a daily two-segment dataset with weekly and ~monthly seasonality.

    Both segments share the periods 7 and 30.4 and differ only in magnitudes.
    NOTE(review): `random_seed` is presumably a fixture that seeds numpy so the
    noise is reproducible - confirm in conftest.
    """
    segment_settings = (
        ("segment_1", 1, 1 / 2),
        ("segment_2", 1 / 2, 1 / 5),
    )
    frames = []
    for segment_name, magnitude_1, magnitude_2 in segment_settings:
        frame = get_one_df(period_1=7, period_2=30.4, magnitude_1=magnitude_1, magnitude_2=magnitude_2)
        frame["segment"] = segment_name
        frames.append(frame)
    classic_df = pd.concat(frames, ignore_index=True)
    return TSDataset(TSDataset.to_dataset(classic_df), freq="D")


@pytest.mark.parametrize("period", [-1, 0, 1, 1.5])
def test_fail_period(period):
    """Check that a period below 2 is rejected at construction time."""
    with pytest.raises(ValueError, match="Period should be at least 2"):
        FourierTransform(period=period, order=1)


@pytest.mark.parametrize("order", [0, 5])
def test_fail_order(order):
    """Check that an order outside [1, ceil(period/2)] is rejected at construction time."""
    with pytest.raises(ValueError, match="Order should be within"):
        FourierTransform(period=7, order=order)


@pytest.mark.parametrize("mods", [[0], [0, 1, 2, 3], [1, 2, 3, 7], [7]])
def test_fail_mods(mods):
    """Check that mods containing a value below 1 or not below the period are rejected."""
    with pytest.raises(ValueError, match="Every mod should be within"):
        FourierTransform(period=7, mods=mods)


def test_fail_set_none():
    """Check that omitting both order and mods is rejected at construction time."""
    with pytest.raises(ValueError, match="There should be exactly one option set"):
        FourierTransform(period=7)


def test_fail_set_both():
    """Check that passing order and mods together is rejected at construction time."""
    with pytest.raises(ValueError, match="There should be exactly one option set"):
        FourierTransform(period=7, order=1, mods=[1, 2, 3])


@pytest.mark.parametrize(
    "period, order, num_columns", [(6, 2, 4), (7, 2, 4), (6, 3, 5), (7, 3, 6), (5.5, 2, 4), (5.5, 3, 5)]
)
def test_column_names(example_df, period, order, num_columns):
    """Check the number of generated columns and that each name rebuilds its own column.

    Every default column name is 'regressor_' plus the repr of a transform, so
    evaluating that repr must give a transform producing exactly this column.
    """
    wide_df = TSDataset.to_dataset(example_df)
    result_df = FourierTransform(period=period, order=order).fit_transform(wide_df)
    feature_names = result_df.columns.get_level_values("feature").unique().drop("target")
    assert len(feature_names) == num_columns

    prefix_length = len("regressor_")
    for feature_name in feature_names:
        # eval only sees reprs generated by the transform above, never external input.
        rebuilt_transform = eval(feature_name[prefix_length:])
        rebuilt_df = rebuilt_transform.fit_transform(wide_df)
        rebuilt_names = rebuilt_df.columns.get_level_values("feature").unique().drop("target")
        assert len(rebuilt_names) == 1
        assert rebuilt_names[0] == feature_name


def test_column_names_out_column(example_df):
    """Check that columns are named '{out_column}_{mod}' when `out_column` is given."""
    wide_df = TSDataset.to_dataset(example_df)
    result_df = FourierTransform(period=10, order=3, out_column="regressor_fourier").fit_transform(wide_df)
    feature_names = result_df.columns.get_level_values("feature").unique().drop("target")
    # order=3 with period=10 keeps all six mods 1..6.
    assert set(feature_names) == {f"regressor_fourier_{i}" for i in range(1, 7)}


@pytest.mark.parametrize("period, mod", [(24, 1), (24, 2), (24, 9), (24, 20), (24, 23), (7.5, 3), (7.5, 4)])
def test_column_values(example_df, period, mod):
    """Test that transform generates correct values.

    The expected harmonic is recomputed from the timestamps: odd mods are a
    sine and even mods a cosine of order ``(mod + 1) // 2``.
    """
    df = TSDataset.to_dataset(example_df)
    transform = FourierTransform(period=period, mods=[mod], out_column="regressor_fourier")
    transformed_df = transform.fit_transform(df)

    # The reference values depend only on the index, so compute them once
    # instead of once per segment.
    # NOTE(review): assumes `example_df` has hourly frequency - confirm against the fixture.
    # Keyword form avoids the upper-case "H" unit alias deprecated in pandas 2.2.
    freq = pd.Timedelta(hours=1)
    timestamp = df.index
    elapsed = (timestamp - timestamp[0]) / (period * freq)
    order = (mod + 1) // 2
    if mod % 2 == 0:
        expected_values = np.cos(2 * np.pi * order * elapsed).values
    else:
        expected_values = np.sin(2 * np.pi * order * elapsed).values

    for segment in example_df["segment"].unique():
        transform_values = transformed_df.loc[:, pd.IndexSlice[segment, f"regressor_fourier_{mod}"]]
        assert np.allclose(transform_values, expected_values, atol=1e-12)


def test_forecast(ts_trend_seasonal):
    """Check that a linear model on fourier features forecasts the seasonal data with R2 above 0.95."""
    horizon = 10
    fourier_transforms = [
        FourierTransform(period=7, order=3),
        FourierTransform(period=30.4, order=5),
    ]
    ts_train, ts_test = ts_trend_seasonal.train_test_split(test_size=horizon)
    ts_train.fit_transform(transforms=fourier_transforms)

    model = LinearPerSegmentModel()
    model.fit(ts_train)
    ts_forecast = model.forecast(ts_train.make_future(horizon))

    r2 = R2("macro")(ts_test, ts_forecast)
    assert r2 > 0.95