From 0a7ba4ae6897a499cffad9f095f40d12d471602c Mon Sep 17 00:00:00 2001 From: nepslor Date: Mon, 9 Sep 2024 15:42:40 +0200 Subject: [PATCH] refactored to return quantiles as multiindex dataframe --- pyforecaster/forecaster.py | 40 ++++++- pyforecaster/forecasting_models/benchmarks.py | 4 +- .../fast_adaptive_models.py | 8 +- .../forecasting_models/gradientboosters.py | 2 +- .../forecasting_models/holtwinters.py | 5 +- .../random_fourier_features.py | 113 ++++++++++++++---- .../forecasting_models/randomforests.py | 1 + .../forecasting_models/statsmodels_wrapper.py | 4 +- pyforecaster/formatter.py | 30 +++-- pyforecaster/metrics.py | 4 +- pyforecaster/plot_utils.py | 3 +- tests/test_boosters.py | 2 +- tests/test_models.py | 27 +++-- tests/test_scenario.py | 6 +- 14 files changed, 186 insertions(+), 63 deletions(-) diff --git a/pyforecaster/forecaster.py b/pyforecaster/forecaster.py index 688d5bc..0d8361d 100644 --- a/pyforecaster/forecaster.py +++ b/pyforecaster/forecaster.py @@ -54,6 +54,7 @@ def __init__(self, q_vect=None, nodes_at_step=None, val_ratio=None, logger=None, self.additional_node = additional_node self.formatter = formatter self.conditional_to_hour = conditional_to_hour + self.target_cols = None @property def online_tree_reduction(self): @@ -100,9 +101,36 @@ def anti_transform(self, x, y_hat): return y_hat @abstractmethod - def predict_quantiles(self, x, **kwargs): + def _predict_quantiles(self, x, **kwargs): pass + def quantiles_to_df(self, q_hat:np.ndarray, index): + level_0_labels = self.target_cols + level_1_labels = self.q_vect + q_hat = np.swapaxes(q_hat, 1, 2) + q_hat = np.reshape(q_hat, (q_hat.shape[0], q_hat.shape[1] * q_hat.shape[2])) + q_hat = pd.DataFrame(q_hat, index=index, columns=pd.MultiIndex.from_product([level_0_labels, level_1_labels])) + q_hat.columns.names = ['target', 'quantile'] + return q_hat + + @staticmethod + def quantiles_to_numpy(q_hat:pd.DataFrame): + n_taus = len(q_hat.columns.get_level_values(1).unique()) + q_hat = q_hat.values + q_hat = np.reshape(q_hat, (q_hat.shape[0], n_taus, -1)) + q_hat = np.swapaxes(q_hat, 1, 2) + return q_hat + + def predict_quantiles(self, x, dataframe=True, **kwargs): + q_hat = self._predict_quantiles(x, **kwargs) + if isinstance(q_hat, np.ndarray) and dataframe: + # create multiindex dataframe + q_hat = self.quantiles_to_df(q_hat, x.index) + if isinstance(q_hat, pd.DataFrame) and not dataframe: + q_hat = self.quantiles_to_numpy(q_hat) + + return q_hat + def predict_pmf(self, x, discrete_prob_space, **predict_q_kwargs): """ Return probability mass function of the target variable, obtained from quantile predictions @@ -110,7 +138,7 @@ def predict_pmf(self, x, discrete_prob_space, **predict_q_kwargs): :param predict_q_kwargs: :return: """ - quantiles = self.predict_quantiles(x, **predict_q_kwargs) + quantiles = self.predict_quantiles(x, dataframe=False, **predict_q_kwargs) pmf = np.zeros((quantiles.shape[0], quantiles.shape[1], len(discrete_prob_space)-1)) for i in range(quantiles.shape[0]): for j in range(quantiles.shape[1]): @@ -120,7 +148,7 @@ def predict_pmf(self, x, discrete_prob_space, **predict_q_kwargs): def predict_scenarios(self, x, n_scen=None, random_state=None, **predict_q_kwargs): n_scen = self.n_scen_fit if n_scen is None else n_scen # retrieve quantiles from child class - quantiles = self.predict_quantiles(x, **predict_q_kwargs) + quantiles = self.predict_quantiles(x, dataframe=False, **predict_q_kwargs) scenarios = self.scengen.predict_scenarios(quantiles, n_scen=n_scen, x=x, random_state=random_state) 
q_from_scens = np.rollaxis(np.quantile(scenarios, self.q_vect, axis=-1), 0, 3) mean_abs_dev = np.abs(q_from_scens - quantiles).mean(axis=0).mean(axis=0) @@ -142,7 +170,7 @@ def predict_trees(self, x, n_scen=100, nodes_at_step=None, init_obs=None, random if self.online_tree_reduction: # retrieve quantiles from child class - quantiles = self.predict_quantiles(x, **predict_q_kwargs) + quantiles = self.predict_quantiles(x, dataframe=False, **predict_q_kwargs) trees = self.scengen.predict_trees(quantiles=quantiles, n_scen=n_scen, x=x, nodes_at_step=nodes_at_step, init_obs=init_obs, @@ -194,7 +222,7 @@ def predict(self, x:pd.DataFrame, **kwargs): y_hat = self.anti_transform(x, y_hat) return y_hat - def predict_quantiles(self, x:pd.DataFrame, **kwargs): + def _predict_quantiles(self, x:pd.DataFrame, **kwargs): preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect))) for h in np.unique(x.index.hour): preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0) @@ -250,7 +278,7 @@ def predict(self, x, **kwargs): y_hat = self.anti_transform(x, y_hat) return y_hat - def predict_quantiles(self, x, **kwargs): + def _predict_quantiles(self, x, **kwargs): preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect))) for h in np.unique(x.index.hour): preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0) diff --git a/pyforecaster/forecasting_models/benchmarks.py b/pyforecaster/forecasting_models/benchmarks.py index 138cbc3..01b893c 100644 --- a/pyforecaster/forecasting_models/benchmarks.py +++ b/pyforecaster/forecasting_models/benchmarks.py @@ -21,7 +21,7 @@ def predict(self, x:pd.DataFrame, **kwargs): y_hat = x[self.target_col].values.reshape(-1, 1) * np.ones((1, self.n_sa)) return pd.DataFrame(y_hat, index=x.index) - def predict_quantiles(self, x:pd.DataFrame, **kwargs): + def _predict_quantiles(self, x:pd.DataFrame, **kwargs): preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect))) if self.conditional_to_hour: for h in np.unique(x.index.hour): @@ -54,7 +54,7 @@ def predict(self, x: pd.DataFrame, **kwargs): y_hat = hankel(values, self.n_sa) return pd.DataFrame(y_hat[:len(x), :], index=x.index) - def predict_quantiles(self, x: pd.DataFrame, **kwargs): + def _predict_quantiles(self, x: pd.DataFrame, **kwargs): preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect))) if self.conditional_to_hour: for h in np.unique(x.index.hour): diff --git a/pyforecaster/forecasting_models/fast_adaptive_models.py b/pyforecaster/forecasting_models/fast_adaptive_models.py index ab14feb..e156823 100644 --- a/pyforecaster/forecasting_models/fast_adaptive_models.py +++ b/pyforecaster/forecasting_models/fast_adaptive_models.py @@ -135,6 +135,7 @@ def fit(self, x_pd, y_pd=None, **kwargs): hw_target = hankel(y_present[1:], self.n_sa) resid = hw_target - preds self.err_distr = np.quantile(resid, self.q_vect, axis=0).T + self.target_cols = [f'{self.target_name}_t+{i}' for i in range(1, self.n_sa + 1)] return self @abstractmethod @@ -238,7 +239,7 @@ def run(self, x, y, return_coeffs=False, start_from=0, fit=True): return np.vstack(preds[1:]), np.vstack(coeffs_t_history) return np.vstack(preds[1:]) - def predict_quantiles(self, x, **kwargs): + def _predict_quantiles(self, x, **kwargs): preds = self.predict(x) return np.expand_dims(preds, -1) + np.expand_dims(self.err_distr, 0) @@ -397,7 +398,7 @@ def run(self, x, y, return_coeffs=False, start_from=0, fit=True): return np.vstack(preds[1:]), coeffs_t_history.T return 
np.vstack(preds[1:]) - def predict_quantiles(self, x, **kwargs): + def _predict_quantiles(self, x, **kwargs): preds = self.predict(x) return np.expand_dims(preds, -1) + np.expand_dims(self.err_distr, 0) @@ -503,6 +504,7 @@ def fit(self, x_pd, y_pd=None, **kwargs): hw_target = hankel(x_pd[self.target_name].values[1:], self.n_sa) resid = hw_target - preds self.err_distr = np.quantile(resid, self.q_vect, axis=0).T + self.target_cols = ['{}_{}'.format(self.target_name, t) for t in np.arange(self.n_sa)] return self def predict(self, x_pd, **kwargs): @@ -563,7 +565,7 @@ def run(self, x_pd, return_coeffs=True, fit=True): return np.vstack(preds[1:]), np.vstack(coeffs_t_history) return np.vstack(preds[1:]) - def predict_quantiles(self, x, **kwargs): + def _predict_quantiles(self, x, **kwargs): preds = self.predict(x) return np.expand_dims(preds, -1) + np.expand_dims(self.err_distr, 0) diff --git a/pyforecaster/forecasting_models/gradientboosters.py b/pyforecaster/forecasting_models/gradientboosters.py index 522623e..434bedb 100644 --- a/pyforecaster/forecasting_models/gradientboosters.py +++ b/pyforecaster/forecasting_models/gradientboosters.py @@ -165,7 +165,7 @@ def dataset_at_stepahead(df, target_col_num, metadata_features, formatter, logge period=period, keep_last_n_lags=keep_last_n_lags, keep_last_seconds=keep_last_seconds, tol_period=tol_period) - def predict_quantiles(self, x, **kwargs): + def _predict_quantiles(self, x, **kwargs): preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect))) for h in np.unique(x.index.hour): preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0) diff --git a/pyforecaster/forecasting_models/holtwinters.py b/pyforecaster/forecasting_models/holtwinters.py index 90ff975..66701a0 100644 --- a/pyforecaster/forecasting_models/holtwinters.py +++ b/pyforecaster/forecasting_models/holtwinters.py @@ -198,6 +198,7 @@ def fit(self, x_pd, y_pd=None, **kwargs): # concat past inputs and last row of target self.reinit(y) self.err_distr = np.quantile(resid, self.q_vect, axis=0).T + self.target_cols = ['{}_{}'.format(self.target_name, t) for t in np.arange(self.n_sa)] return self def predict(self, x_pd, **kwargs): @@ -231,7 +232,7 @@ def predict(self, x_pd, **kwargs): return y_hat - def predict_quantiles(self, x, **kwargs): + def _predict_quantiles(self, x, **kwargs): preds = self.predict(x) return np.expand_dims(preds, -1) + np.expand_dims(self.err_distr, 0) @@ -448,7 +449,7 @@ def reinit(self, x): for i,m in enumerate(self.models): m.reinit(x) - def predict_quantiles(self, x, **kwargs): + def _predict_quantiles(self, x, **kwargs): preds = self.predict(x) return np.expand_dims(preds, -1) + np.expand_dims(self.err_distr, 0) diff --git a/pyforecaster/forecasting_models/random_fourier_features.py b/pyforecaster/forecasting_models/random_fourier_features.py index d916a9a..487da6d 100644 --- a/pyforecaster/forecasting_models/random_fourier_features.py +++ b/pyforecaster/forecasting_models/random_fourier_features.py @@ -2,6 +2,8 @@ # Author: Tetsuya Ishikawa import functools + +import matplotlib.pyplot as plt import numpy as np import scipy.stats from sklearn.linear_model import LinearRegression @@ -10,47 +12,112 @@ from concurrent.futures import ProcessPoolExecutor from functools import partial from tqdm import tqdm +from numba import njit, prange, typed, types + + +@njit(fastmath=True) +def compute_coeffs(y, x, x_binned, n_bins): + n_features = y.shape[1] + y_means = np.empty((n_bins, n_features)) + x_means = np.zeros(n_bins) + coeffs = 
np.zeros((n_bins, n_features)) + + for j in range(n_bins): + mask = x_binned == j + selected_y = y[mask] + selected_x = x[mask] + + if selected_y.shape[0] > 0: + # Use vectorized operation to calculate the mean + y_means[j, :] = np.sum(selected_y, axis=0) / selected_y.shape[0] + + # Vectorized computation for x_means[j] + #x_means[j] = np.sum(selected_x) / selected_x.shape[0] + + # Vectorized computation for coeffs[j, :] + #x_diff = selected_x - x_means[j] + #y_diff = selected_y - y_means[j, :] + #coeffs[j, :] = y_diff.T @ x_diff / (np.sum(x_diff ** 2) + 1e-12) + else: + y_means[j, :] = np.nan # or 0 if you prefer + + return y_means, x_means, coeffs + +@njit(fastmath=True) +def linear_response(x, x_binned, x_means, y_means, slopes): + return y_means[x_binned] #+ slopes[x_binned]*(x - x_means[x_binned]).reshape(-1, 1) + +@njit(fastmath=True) +def fit_feature(x, x_binned, cuts, y): + y_means, x_means, coeffs = compute_coeffs(y, x, x_binned, len(cuts)-1) + y_hat = linear_response(x, x_binned, x_means, y_means, coeffs) + #y_means = np.array([np.array([y[x_binned == j, c].mean() for c in range(y.shape[1])]) for j in range(n_bins + 1)]) + score = np.mean((y - y_hat) ** 2) ** 0.5 + return cuts, y_means, x_means, coeffs, score + + +@njit +def fit_features(x, y, cuts): + pars = [] + for i in range(x.shape[1]): + x_binned = np.digitize(x[:, i], cuts[i]) - 1 + pars.append(fit_feature(x[:, i], x_binned, cuts[i], y)) + return pars class BrutalRegressor: - def __init__(self, n_iter=10): + def __init__(self, n_iter=10, n_bins=11, learning_rate=0.1, bootstrap_fraction=0.2): self.best_pars = [] self.n_iter = n_iter self.target_cols = None - def fit_feature(self, x, y, n_bins=10): - - cuts = np.quantile(x, np.linspace(0, 1, n_bins+1)) - x_binned = np.digitize(x, cuts)-1 - y_means = np.array([y[x_binned == i].mean() for i in range(n_bins+1)]) - score = np.mean((y - y_means[x_binned, :])**2)**0.5 - pars = {'cuts': cuts, 'y_means': y_means, 'score': score} + self.bootstrap_fraction = bootstrap_fraction + self.n_bins = n_bins + self.learning_rate = learning_rate - return pars def one_level_fit(self, x, y): - with ProcessPoolExecutor(4) as executor: - pars = list(executor.map(partial(self.fit_feature, y=y), [x[:, i] for i in range(x.shape[1])])) - # find best feature - best_feature = np.argmin([s['score'] for s in pars]) - best_pars = pars[best_feature] - return best_feature, best_pars - def fit(self, x, y): + #pars = fit_feature(x[:11, 0], y.values[:11], n_bins=self.n_bins) + #with ProcessPoolExecutor() as executor: + # pars = list(tqdm(executor.map(partial(fit_feature, y=y.values), x.T), total=x.shape[1]) ) + cuts = np.vstack([np.quantile(x_i, np.linspace(0, 1, self.n_bins+2)) for x_i in x.T]) + cuts[:, 0] = -np.inf + cuts[:, -1] = np.inf + pars = fit_features(x, y, cuts) + best_feature = np.argmin([s[-1] for s in pars]) + best_pars = {'cuts':pars[best_feature][0], 'y_means':pars[best_feature][1], 'x_means':pars[best_feature][2], + 'coeffs':pars[best_feature][3], 'best_feature':best_feature} + + return best_pars + def fit(self, x, y, plot=False): if isinstance(x, pd.DataFrame): x = x.values self.target_cols = y.columns - err = y.copy() + if plot: + plt.figure() + err = y.copy().values for i in tqdm(range(self.n_iter)): - best_feature, best_pars = self.one_level_fit(x, err) - y_hat = best_pars['y_means'][np.digitize(x[:, best_feature], best_pars['cuts'])-1] + rand_idx = np.random.choice(x.shape[0], int(self.bootstrap_fraction*x.shape[0]), replace=True) + best_pars = self.one_level_fit(x[rand_idx], 
err[rand_idx])
+            #y_hat = self.learning_rate*best_pars['y_means'][np.digitize(x[:, best_pars['best_feature']], best_pars['cuts'])-1]
+            x_binned = np.digitize(x[:, best_pars['best_feature']], best_pars['cuts']) - 1
+            y_hat = self.learning_rate*linear_response(x[:, best_pars['best_feature']], x_binned, best_pars['x_means'], best_pars['y_means'], best_pars['coeffs'])
             err = err - y_hat
-            print(np.abs(err).mean().mean())
             self.best_pars.append(best_pars)
+            print(np.mean(np.abs(err)))
+            if np.any(np.isnan(err)):
+                print('warning: NaNs in the boosting residuals at iteration {}'.format(i))
+            if plot:
+                plt.cla()
+                plt.scatter(y.iloc[:, 0], err[:, 0], s=1)
+                plt.pause(0.01)
         return self
 
     def predict(self, x):
         y_hat = np.zeros((x.shape[0], self.best_pars[0]['y_means'].shape[1]))
         for i in range(self.n_iter):
-            y_hat += self.best_pars[i]['y_means'][np.digitize(x.iloc[:, i], self.best_pars[i]['cuts'])-1]
+            #y_hat += self.learning_rate*self.best_pars[i]['y_means'][np.digitize(x.iloc[:, self.best_pars[i]['best_feature']], self.best_pars[i]['cuts'])-1]
+            x_binned = np.digitize(x.iloc[:, self.best_pars[i]['best_feature']], self.best_pars[i]['cuts']) - 1
+            y_hat += self.learning_rate*linear_response(x.iloc[:, self.best_pars[i]['best_feature']].values, x_binned, self.best_pars[i]['x_means'], self.best_pars[i]['y_means'], self.best_pars[i]['coeffs'])
         return pd.DataFrame(y_hat, index = x.index, columns=self.target_cols)
@@ -284,7 +351,7 @@ def predict(self, x, **kwargs):
         y_hat = RandomFFRegressor.predict(self, x, **kwargs)
         y_hat = self.anti_transform(x, y_hat)
         return y_hat
-    def predict_quantiles(self, x:pd.DataFrame, **kwargs):
+    def _predict_quantiles(self, x:pd.DataFrame, **kwargs):
         preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
         for h in np.unique(x.index.hour):
             preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)
@@ -325,7 +392,7 @@ def predict(self, x, **kwargs):
         y_hat = self.anti_transform(x, y_hat)
         return y_hat
 
-    def predict_quantiles(self, x:pd.DataFrame, **kwargs):
+    def _predict_quantiles(self, x:pd.DataFrame, **kwargs):
         preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
         for h in np.unique(x.index.hour):
             preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)
diff --git a/pyforecaster/forecasting_models/randomforests.py b/pyforecaster/forecasting_models/randomforests.py
index bf035c9..fa49358 100644
--- a/pyforecaster/forecasting_models/randomforests.py
+++ b/pyforecaster/forecasting_models/randomforests.py
@@ -159,6 +159,7 @@ def predict(self, x, **kwargs):
             if len(preds.shape) == 2:
                 preds = np.expand_dims(preds, 0)
             preds = np.swapaxes(preds, 1, 2)
+            preds = self.quantiles_to_df(preds, index=x.index)
         else:
             preds = pd.DataFrame(np.atleast_2d(np.squeeze(preds)), index=x.index, columns=self.target_cols)
         y_hat = self.anti_transform(x, preds)
diff --git a/pyforecaster/forecasting_models/statsmodels_wrapper.py b/pyforecaster/forecasting_models/statsmodels_wrapper.py
index 8456b97..ea87674 100644
--- a/pyforecaster/forecasting_models/statsmodels_wrapper.py
+++ b/pyforecaster/forecasting_models/statsmodels_wrapper.py
@@ -38,12 +38,12 @@ def fit(self, x_pd:pd.DataFrame, y:pd.DataFrame=None):
         hw_target = hankel(y_present[1:], self.n_sa)
         resid = hw_target - preds
         self.err_distr = np.quantile(resid, self.q_vect, axis=0).T
-
+        self.target_cols = [f'{self.target_name}_t+{i}' for i in range(1, self.n_sa+1)]
 
     def predict(self, x_pd:pd.DataFrame, **kwargs):
         pass
 
-    def predict_quantiles(self, x:pd.DataFrame, **kwargs):
+    def _predict_quantiles(self, x:pd.DataFrame, **kwargs):
         preds =
np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect))) for h in np.unique(x.index.hour): preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0) diff --git a/pyforecaster/formatter.py b/pyforecaster/formatter.py index 9f3f87c..435d76f 100644 --- a/pyforecaster/formatter.py +++ b/pyforecaster/formatter.py @@ -289,15 +289,15 @@ def normalize(self, x, y=None, normalizing_fun=None, antitransform=False, return # normalize the target if any if return_target: - # join target and normalizers in a single df - df_n = pd.concat([y, normalizers], axis=1) - - for target_to_norm in np.unique(target_to_norm_names): - for tr in self.target_transformers: - nr_columns = (tr.metadata['name'].isin([target_to_norm])).index - for c in nr_columns: - df_n.loc[:, c] = self.normalizing_wrapper(normalizing_fun, df_n, c) - y = df_n[[c for c in y.columns]] + if isinstance(y.columns, pd.MultiIndex): + # we are normalizing a Multiindex Dataframe containing quantiles, indicated in level=1 + for tau in y.columns.get_level_values(1).unique(): + y_tau = y.loc[:, (slice(None), tau)] + y_tau = y_tau.droplevel(1, 1) + y.loc[:, (slice(None), tau)] = self.normalize_target_inner(y_tau, normalizers, target_to_norm_names, + normalizing_fun).values + else: + y = self.normalize_target_inner(y, normalizers, target_to_norm_names, normalizing_fun) # normalize the features related to the target for target_to_norm in np.unique(target_to_norm_names): @@ -310,6 +310,18 @@ def normalize(self, x, y=None, normalizing_fun=None, antitransform=False, return return y, x + def normalize_target_inner(self, y, normalizers, target_to_norm_names, normalizing_fun): + # join target and normalizers in a single df + df_n = pd.concat([y, normalizers], axis=1) + + for target_to_norm in np.unique(target_to_norm_names): + for tr in self.target_transformers: + nr_columns = (tr.metadata['name'].isin([target_to_norm])).index + for c in nr_columns: + df_n.loc[:, c] = self.normalizing_wrapper(normalizing_fun, df_n, c) + y = df_n[[c for c in y.columns]] + return y + def denormalize(self, x, y): if self.denormalizing_fun is None: self.logger.warning('You did not pass any denormalization expression, ** no denormalization will be applied **. 
'
diff --git a/pyforecaster/metrics.py b/pyforecaster/metrics.py
index 2cd8739..f784567 100644
--- a/pyforecaster/metrics.py
+++ b/pyforecaster/metrics.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd
 from itertools import permutations
-
+from pyforecaster.forecaster import ScenarioGenerator
 
 def chose_axis(x, agg_index):
     return 0 if len(x) == len(agg_index) else 1
@@ -58,7 +58,7 @@ def quantile_scores(q_hat, t, alphas=None, agg_index=None, **kwargs):
     :return:
     """
     agg_index = np.ones_like(t.index) if agg_index is None else agg_index
-
+    q_hat = ScenarioGenerator.quantiles_to_numpy(q_hat) if isinstance(q_hat, pd.DataFrame) else q_hat
     quantile_axis = 2 if q_hat.shape[1] == t.shape[1] else 1
     n_quantiles = q_hat.shape[quantile_axis]
     if alphas is None:
diff --git a/pyforecaster/plot_utils.py b/pyforecaster/plot_utils.py
index b83d982..356bfa1 100644
--- a/pyforecaster/plot_utils.py
+++ b/pyforecaster/plot_utils.py
@@ -9,6 +9,7 @@
 from matplotlib.gridspec import GridSpec
 import networkx as nx
 from pyforecaster.scenred import plot_from_graph
+from pyforecaster.forecaster import ScenarioGenerator
 
 def basic_setup(subplots_tuple, width, height, b=0.15, l=0.15, w=0.22, r=None, style ='seaborn', **kwargs):
     plt.style.use(style)
@@ -280,7 +281,7 @@ def plot_quantiles(signals, qs, labels, n_rows=50, interval=1, step=1, repeat=Fa
                    remove_spines=True, savepath=None, **kwargs):
     n_max = np.minimum(signals[0].shape[0], int(n_rows*step))
     n_rows = np.minimum(int(np.floor(signals[0].shape[0]/step)), n_rows)
-    qs = qs.values if isinstance(qs, pd.DataFrame) else qs
+    qs = ScenarioGenerator.quantiles_to_numpy(qs) if isinstance(qs, pd.DataFrame) else qs
     fig, ax = plt.subplots(1, **kwargs)
     signals = signals if isinstance(signals, list) else [signals]
     t = np.arange(signals[0].shape[1])
diff --git a/tests/test_boosters.py b/tests/test_boosters.py
index 62771e0..978b908 100644
--- a/tests/test_boosters.py
+++ b/tests/test_boosters.py
@@ -52,7 +52,7 @@ def test_linear_val_split(self):
         y_hat_lgb = m_lgb.predict(x_te)
 
         # plot_quantiles([y_hat_lgbh.iloc[:100, :]], q[:100, :, :], ['y_hat_lgb'], n_rows=100, repeat=True)
-        plot_quantiles([y_hat_lin.iloc[:100, :]], q[:100, :, :], ['y_hat_lgb'], n_rows=100, repeat=False)
+        plot_quantiles([y_hat_lin.iloc[:100, :]], q.iloc[:100, :], ['y_hat_lgb'], n_rows=100, repeat=False)
 
 
     def do_not_test_linear_val_split(self):
diff --git a/tests/test_models.py b/tests/test_models.py
index 6de1248..ec233a3 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -185,18 +185,29 @@ def test_qrf(self):
                                  relative_lags=True)
         formatter.add_transform(['all'], ['min', 'max'], agg_bins=[1, 2, 15, 20])
         formatter.add_target_transform(['all'], lags=-np.arange(6))
+        formatter.add_target_normalizer(['all'], 'mean', agg_freq='3d', name='a_movingavg')
+        formatter.add_target_normalizer(['all'], 'std', agg_freq='3d', name='a_movingstd')
 
-        x, y = formatter.transform(self.data.iloc[:1000])
-        x.columns = x.columns.astype(str)
-        y.columns = y.columns.astype(str)
-        n_tr = int(len(x) * 0.99)
-        x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y.iloc[:n_tr].copy(),
-                                  y.iloc[n_tr:].copy()]
+        #m_lin = LinearForecaster(val_ratio=0.2, formatter=formatter).fit(x_tr, y_tr)
+        #y_hat_nonorm = m_lin.predict(x_te)
+        #q_nonorm = m_lin.predict_quantiles(x_te)
+
+        #m_lgb = LGBForecaster(val_ratio=0.5, lgb_pars={'num_leaves':20}, formatter=formatter).fit(x_tr, y_tr)
+        #y_hat_lgb = m_lgb.predict(x_te)
+        #mae = lambda x, y: np.abs(x-y).mean().mean()
+        #print('MAE lin:', mae(y_te, y_hat_nonorm))
+ + + formatter.add_normalizing_fun(expr="(df[t] - df['a_movingavg']) / (df['a_movingstd'] + 1)", + inv_expr="df[t]*(df['a_movingstd']+1) + df['a_movingavg']") + x, y_norm = formatter.transform(self.data.iloc[:1000]) + n_tr = int(len(x) * 0.9) + x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y_norm.iloc[:n_tr].copy(), y_norm.iloc[n_tr:].copy()] qrf = QRF(val_ratio=0.2, formatter=formatter, n_jobs=4, n_single=6).fit(x_tr, y_tr) y_hat = qrf.predict(x_te) q = qrf.predict_quantiles(x_te) - + y_te = formatter.denormalize(x_te, y_te) #plot_quantiles([y_te, y_hat], q, ['y_te', 'y_hat', 'y_hat_qrf']) qrf = QRF(val_ratio=0.2, formatter=formatter, n_jobs=4, n_single=2).fit(x_tr, y_tr) @@ -255,7 +266,7 @@ def test_statsmodels_wrappers(self): m = ExponentialSmoothing(target_name='all', q_vect=np.arange(31)/30, nodes_at_step=None, val_ratio=0.8, n_sa=24, seasonal=24).fit(data_tr) y_hat = m.predict(data_te) - q_hat = m.predict_quantiles(data_te) + q_hat = m.predict_quantiles(data_te, dataframe=False) y_plot = pd.concat({'y_{:02d}'.format(i): data_te['all'].shift(-i) for i in range(24)}, axis=1) #plot_quantiles([y_plot, y_hat], q_hat, ['y_te', 'y_hat'], n_rows=300) diff --git a/tests/test_scenario.py b/tests/test_scenario.py index 6daa914..6ebd52c 100644 --- a/tests/test_scenario.py +++ b/tests/test_scenario.py @@ -107,7 +107,7 @@ def test_forecaster(self): lf = LinearForecaster(online_tree_reduction=False).fit(self.x, self.target) preds = lf.predict(self.x) - q_preds = lf.predict_quantiles(self.x.iloc[rand_idx,:]) + q_preds = lf.predict_quantiles(self.x.iloc[rand_idx,:], dataframe=False) trees = lf.predict_trees(self.x.iloc[rand_idx,:], scenarios_per_step=np.linspace(1,20,preds.shape[1], dtype=int)) for i, rand_i in enumerate(rand_idx): @@ -158,7 +158,7 @@ def test_additional_node(self): lf = LinearForecaster(online_tree_reduction=False, additional_node=True).fit(self.x, self.target) preds = lf.predict(self.x) - q_preds = lf.predict_quantiles(self.x.iloc[rand_idx, :]) + q_preds = lf.predict_quantiles(self.x.iloc[rand_idx, :], dataframe=False) init_obs = self.target['target_0'].iloc[np.clip(rand_idx - 1, 0, np.inf).astype(int)].values trees = lf.predict_trees(self.x.iloc[rand_idx, :], init_obs=init_obs) @@ -175,7 +175,7 @@ def test_additional_node(self): lf = LinearForecaster(online_tree_reduction=True, additional_node=True).fit(self.x, self.target) preds = lf.predict(self.x) - q_preds = lf.predict_quantiles(self.x.iloc[rand_idx, :]) + q_preds = lf.predict_quantiles(self.x.iloc[rand_idx, :], dataframe=False) trees = lf.predict_trees(self.x.iloc[rand_idx, :], nodes_at_step=np.linspace(4, 20, preds.shape[1], dtype=int)) plt.close('all')
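
Note on the new interface: `predict_quantiles` now returns a MultiIndex DataFrame by default (level 0 = target column, level 1 = quantile level, column names `['target', 'quantile']`); passing `dataframe=False` returns the raw `(n_obs, n_sa, n_quantiles)` ndarray, and the static `ScenarioGenerator.quantiles_to_numpy` converts a quantile frame back to that array. A minimal sketch, assuming a fitted forecaster `m` (e.g. a `LinearForecaster`) whose `q_vect` contains 0.5, and a test feature frame `x_te` — the names `m` and `x_te` are illustrative:

    import numpy as np
    from pyforecaster.forecaster import ScenarioGenerator

    q_df = m.predict_quantiles(x_te)                    # MultiIndex DataFrame (default)
    q_med = q_df.loc[:, (slice(None), 0.5)]             # median for every target column
    q_arr = m.predict_quantiles(x_te, dataframe=False)  # ndarray, shape (n_obs, n_sa, n_quantiles)

    # the two representations round-trip through the static converter
    assert np.allclose(ScenarioGenerator.quantiles_to_numpy(q_df), q_arr)

Downstream consumers accept either representation: `metrics.quantile_scores` and `plot_utils.plot_quantiles` convert frames to arrays internally, while `formatter.normalize` uses the quantile level of the frame to normalize each quantile slice separately.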