diff --git a/pyforecaster/formatter.py b/pyforecaster/formatter.py index f405d4a..9f3f87c 100644 --- a/pyforecaster/formatter.py +++ b/pyforecaster/formatter.py @@ -216,12 +216,19 @@ def _transform(self, x, time_features=True, holidays=False, return_target=True, for tr in self.transformers: x = tr.transform(x) transformed_columns = [c for c in x.columns if c not in original_columns] + if return_target: for tr in self.target_transformers: target = pd.concat([target, tr.transform(x, augment=False)], axis=1) - # apply normalization if any - if len(self.target_normalizers)>0: - target, x = self.normalize(x, target) + + # apply normalization to target if any and if return_target is True + if len(self.target_normalizers)>0: + normalizing_columns = [nr.name for nr in self.target_normalizers] + x = self.add_normalizing_columns(x) + + # this is needed even if target is not returned, to normalize features correlated to the target + target, x = self.normalize(x, target, return_target=return_target) + transformed_columns = transformed_columns + normalizing_columns if return_target: # remove rows with nans to reconcile impossible dataset entries introduced by shiftin' around @@ -239,7 +246,19 @@ def _transform(self, x, time_features=True, holidays=False, return_target=True, x = self.add_holidays(x, **holidays_kwargs) return x, target - def normalize(self, x, y, normalizing_fun=None, antitransform=False): + def add_normalizing_columns(self, x): + + # if we're doing the direct transform (normalization) we compute the normalizers and add them to the x df + # compute normalizers if any + normalizers = pd.concat([nr.transform(x, augment=False) for nr in self.target_normalizers], axis=1) + + # rename normalizers with tag names + normalizers.columns = [nr.name for nr in self.target_normalizers] + x = pd.concat([x, normalizers], axis=1) + + return x + + def normalize(self, x, y=None, normalizing_fun=None, antitransform=False, return_target=True): """ Columns needed to compute the 
normalization factors are computed by the target transformers and returned in the original x dataframe. The normalizing_fun is a string expression that must be evaluated to normalize the @@ -261,47 +280,42 @@ def normalize(self, x, y, normalizing_fun=None, antitransform=False): '\bor by passing the noralizing_expr argument to this function') return y, x - if not antitransform: - # if we're doing the direct transform (normalization) we compute the normalizers and add them to the x df - # compute normalizers if any - normalizers = pd.concat([nr.transform(x, augment=False) for nr in self.target_normalizers], axis = 1) - # rename normalizers with tag names - normalizers.columns = [nr.name for nr in self.target_normalizers] - x = pd.concat([x, normalizers], axis=1) - else: - # if we're antitransform, we retrieve the normalizers from the x df - normalizers = x[[nr.name for nr in self.target_normalizers]] + normalizers = x[[nr.name for nr in self.target_normalizers]] + # get normalizers names target_to_norm_names = [nr.names for nr in self.target_normalizers] target_to_norm_names = [item for sublist in target_to_norm_names for item in sublist] - # join target and normalizers in a single df - df_n = pd.concat([y, normalizers], axis=1) + # normalize the target if any + if return_target: + # join target and normalizers in a single df + df_n = pd.concat([y, normalizers], axis=1) + for target_to_norm in np.unique(target_to_norm_names): + for tr in self.target_transformers: + nr_columns = (tr.metadata['name'].isin([target_to_norm])).index + for c in nr_columns: + df_n.loc[:, c] = self.normalizing_wrapper(normalizing_fun, df_n, c) + y = df_n[[c for c in y.columns]] + # normalize the features related to the target for target_to_norm in np.unique(target_to_norm_names): - for tr in self.target_transformers: + for tr in self.transformers: # find df_n columns to normalize nr_columns = (tr.metadata['name'].isin([target_to_norm])).index for c in nr_columns: - df_n.loc[:, c] = 
self.normalizing_wrapper(normalizing_fun, df_n, c) - if not antitransform: - for tr in self.transformers: - # find df_n columns to normalize - nr_columns = (tr.metadata['name'].isin([target_to_norm])).index - for c in nr_columns: - x.loc[:, c] = self.normalizing_wrapper(normalizing_fun, x, c) + x.loc[:, c] = self.normalizing_wrapper(normalizing_fun, x, c) + - df_n = df_n[[c for c in y.columns]] - return df_n, x + return y, x def denormalize(self, x, y): if self.denormalizing_fun is None: self.logger.warning('You did not pass any denormalization expression, ** no denormalization will be applied **. ' '\bYou can set a denormalization expression by calling Formatter.add_normalizing_fun ') return y - y, _ = self.normalize(x, y, normalizing_fun=self.denormalizing_fun, antitransform=True) + y, _ = self.normalize(x.copy(), y, normalizing_fun=self.denormalizing_fun) return y def normalizing_wrapper(self, normalizing_fun, df, t): diff --git a/tests/test_formatter.py b/tests/test_formatter.py index 7c792d3..2afc2d9 100644 --- a/tests/test_formatter.py +++ b/tests/test_formatter.py @@ -273,7 +273,7 @@ def test_normalizers(self): def test_normalizers_complex(self): df = pd.DataFrame(np.random.randn(100, 5), index=pd.date_range('01-01-2020', freq='20min', periods=100, tz='Europe/Zurich'), columns=['a', 'b', 'c', 'd', 'e']) - formatter = pyf.Formatter().add_transform(['a', 'b'], lags=np.arange(1, 5), agg_freq='20min') + formatter = pyf.Formatter(augment=False).add_transform(['a', 'b'], lags=np.arange(1, 5), agg_freq='20min') formatter.add_target_transform(['a'], lags=-np.arange(1, 5), agg_freq='20min') formatter.add_target_normalizer(['a'], 'mean', agg_freq='10H', name='a_n') formatter.add_target_normalizer(['a'], 'std', agg_freq='5H', name='b_n') @@ -285,7 +285,7 @@ def test_normalizers_complex(self): y_unnorm = formatter.denormalize(x, y_norm) # check if back-transform works - assert (y_unnorm-y).sum().sum() < 1e-6 + assert (y_unnorm-y).abs().sum().sum() < 1e-6 def 
test_normalizers_impossible(self): diff --git a/tests/test_models.py b/tests/test_models.py index 67a56fc..9ebdc77 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -8,6 +8,7 @@ from pyforecaster.forecasting_models.fast_adaptive_models import Fourier_es, FK, FK_multi from pyforecaster.forecasting_models.random_fourier_features import RFFRegression, AdditiveRFFRegression from pyforecaster.forecasting_models.randomforests import QRF +from pyforecaster.forecasting_models.gradientboosters import LGBMHybrid from pyforecaster.forecaster import LinearForecaster, LGBForecaster from pyforecaster.plot_utils import plot_quantiles from pyforecaster.formatter import Formatter @@ -204,13 +205,13 @@ def test_rffr(self): def test_antinormalize(self): - formatter = Formatter(logger=self.logger).add_transform(['all'], lags=np.arange(144), + formatter = Formatter(logger=self.logger, augment=False).add_transform(['all'], lags=np.arange(144), relative_lags=True) formatter.add_transform(['all'], ['min', 'max'], agg_bins=[1, 2, 15, 20]) - formatter.add_target_transform(['all'], lags=-np.arange(144)) + formatter.add_target_transform(['all'], lags=-np.arange(1, 145)) - formatter.add_target_normalizer(['all'], 'mean', agg_freq='7d', name='a_movingavg') - formatter.add_target_normalizer(['all'], 'std', agg_freq='7d', name='a_movingstd') + formatter.add_target_normalizer(['all'], 'mean', agg_freq='3d', name='a_movingavg') + formatter.add_target_normalizer(['all'], 'std', agg_freq='3d', name='a_movingstd') x, y = formatter.transform(self.data.iloc[:10000]) @@ -218,27 +219,22 @@ def test_antinormalize(self): x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y.iloc[:n_tr].copy(), y.iloc[n_tr:].copy()] - m_lin = LinearForecaster(val_ratio=0.2, formatter=formatter).fit(x_tr, y_tr) - y_hat = m_lin.predict(x_te) - q = m_lin.predict_quantiles(x_te) + #m_lin = LinearForecaster(val_ratio=0.2, formatter=formatter).fit(x_tr, y_tr) + #y_hat_nonorm = 
m_lin.predict(x_te) + #q_nonorm = m_lin.predict_quantiles(x_te) #m_lgb = LGBForecaster(val_ratio=0.5, lgb_pars={'num_leaves':20}, formatter=formatter).fit(x_tr, y_tr) #y_hat_lgb = m_lgb.predict(x_te) - mae = lambda x, y: np.abs(x-y).mean().mean() - print('MAE lin:', mae(y_te, y_hat)) + #mae = lambda x, y: np.abs(x-y).mean().mean() + #print('MAE lin:', mae(y_te, y_hat_nonorm)) - plt.close('all') - plot_quantiles([y_te, y_hat], q, ['y_te', 'y_hat_lin'], n_rows=600) - plt.close('all') - formatter.add_normalizing_fun(expr="(df[t] - df['a_movingavg']) / (df['a_movingstd'] + 1)", inv_expr="df[t]*(df['a_movingstd']+1) + df['a_movingavg']") x, y_norm = formatter.transform(self.data.iloc[:10000]) y = formatter.denormalize(x, y_norm) - x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y_norm.iloc[:n_tr].copy(), - y.iloc[n_tr:].copy()] + x_tr, x_te, y_tr = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y_norm.iloc[:n_tr].copy()] m_lin = LinearForecaster(val_ratio=0.2, formatter=formatter).fit(x_tr, y_tr) y_hat = m_lin.predict(x_te) q = m_lin.predict_quantiles(x_te)