Skip to content

Commit

Permalink
refactored to return quantiles as multiindex dataframe
Browse files Browse the repository at this point in the history
  • Loading branch information
nepslor committed Sep 9, 2024
1 parent e4cf2b3 commit 0a7ba4a
Show file tree
Hide file tree
Showing 14 changed files with 186 additions and 63 deletions.
40 changes: 34 additions & 6 deletions pyforecaster/forecaster.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def __init__(self, q_vect=None, nodes_at_step=None, val_ratio=None, logger=None,
self.additional_node = additional_node
self.formatter = formatter
self.conditional_to_hour = conditional_to_hour
self.target_cols = None

@property
def online_tree_reduction(self):
Expand Down Expand Up @@ -100,17 +101,44 @@ def anti_transform(self, x, y_hat):
return y_hat

@abstractmethod
def predict_quantiles(self, x, **kwargs):
def _predict_quantiles(self, x, **kwargs):
pass

def quantiles_to_df(self, q_hat:np.ndarray, index):
level_0_labels = self.target_cols
level_1_labels = self.q_vect
q_hat = np.swapaxes(q_hat, 1, 2)
q_hat = np.reshape(q_hat, (q_hat.shape[0], q_hat.shape[1] * q_hat.shape[2]))
q_hat = pd.DataFrame(q_hat, index=index, columns=pd.MultiIndex.from_product([level_0_labels, level_1_labels]))
q_hat.columns.names = ['target', 'quantile']
return q_hat

@staticmethod
def quantiles_to_numpy(q_hat:pd.DataFrame):
n_taus = len(q_hat.columns.get_level_values(1).unique())
q_hat = q_hat.values
q_hat = np.reshape(q_hat, (q_hat.shape[0], n_taus, -1))
q_hat = np.swapaxes(q_hat, 1, 2)
return q_hat

def predict_quantiles(self, x, dataframe=True, **kwargs):
q_hat = self._predict_quantiles(x, **kwargs)
if isinstance(q_hat, np.ndarray) and dataframe:
# create multiindex dataframe
q_hat = self.quantiles_to_df(q_hat, x.index)
if isinstance(q_hat, pd.DataFrame) and not dataframe:
q_hat = self.quantiles_to_numpy(q_hat)

return q_hat

def predict_pmf(self, x, discrete_prob_space, **predict_q_kwargs):
"""
Return probability mass function of the target variable, obtained from quantile predictions
:param x:
:param predict_q_kwargs:
:return:
"""
quantiles = self.predict_quantiles(x, **predict_q_kwargs)
quantiles = self.predict_quantiles(x, dataframe=False, **predict_q_kwargs)
pmf = np.zeros((quantiles.shape[0], quantiles.shape[1], len(discrete_prob_space)-1))
for i in range(quantiles.shape[0]):
for j in range(quantiles.shape[1]):
Expand All @@ -120,7 +148,7 @@ def predict_pmf(self, x, discrete_prob_space, **predict_q_kwargs):
def predict_scenarios(self, x, n_scen=None, random_state=None, **predict_q_kwargs):
n_scen = self.n_scen_fit if n_scen is None else n_scen
# retrieve quantiles from child class
quantiles = self.predict_quantiles(x, **predict_q_kwargs)
quantiles = self.predict_quantiles(x, dataframe=False, **predict_q_kwargs)
scenarios = self.scengen.predict_scenarios(quantiles, n_scen=n_scen, x=x, random_state=random_state)
q_from_scens = np.rollaxis(np.quantile(scenarios, self.q_vect, axis=-1), 0, 3)
mean_abs_dev = np.abs(q_from_scens - quantiles).mean(axis=0).mean(axis=0)
Expand All @@ -142,7 +170,7 @@ def predict_trees(self, x, n_scen=100, nodes_at_step=None, init_obs=None, random

if self.online_tree_reduction:
# retrieve quantiles from child class
quantiles = self.predict_quantiles(x, **predict_q_kwargs)
quantiles = self.predict_quantiles(x, dataframe=False, **predict_q_kwargs)

trees = self.scengen.predict_trees(quantiles=quantiles, n_scen=n_scen, x=x,
nodes_at_step=nodes_at_step, init_obs=init_obs,
Expand Down Expand Up @@ -194,7 +222,7 @@ def predict(self, x:pd.DataFrame, **kwargs):
y_hat = self.anti_transform(x, y_hat)
return y_hat

def predict_quantiles(self, x:pd.DataFrame, **kwargs):
def _predict_quantiles(self, x:pd.DataFrame, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
for h in np.unique(x.index.hour):
preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)
Expand Down Expand Up @@ -250,7 +278,7 @@ def predict(self, x, **kwargs):
y_hat = self.anti_transform(x, y_hat)
return y_hat

def predict_quantiles(self, x, **kwargs):
def _predict_quantiles(self, x, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
for h in np.unique(x.index.hour):
preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)
Expand Down
4 changes: 2 additions & 2 deletions pyforecaster/forecasting_models/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def predict(self, x:pd.DataFrame, **kwargs):
y_hat = x[self.target_col].values.reshape(-1, 1) * np.ones((1, self.n_sa))
return pd.DataFrame(y_hat, index=x.index)

def predict_quantiles(self, x:pd.DataFrame, **kwargs):
def _predict_quantiles(self, x:pd.DataFrame, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
if self.conditional_to_hour:
for h in np.unique(x.index.hour):
Expand Down Expand Up @@ -54,7 +54,7 @@ def predict(self, x: pd.DataFrame, **kwargs):
y_hat = hankel(values, self.n_sa)
return pd.DataFrame(y_hat[:len(x), :], index=x.index)

def predict_quantiles(self, x: pd.DataFrame, **kwargs):
def _predict_quantiles(self, x: pd.DataFrame, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
if self.conditional_to_hour:
for h in np.unique(x.index.hour):
Expand Down
8 changes: 5 additions & 3 deletions pyforecaster/forecasting_models/fast_adaptive_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ def fit(self, x_pd, y_pd=None, **kwargs):
hw_target = hankel(y_present[1:], self.n_sa)
resid = hw_target - preds
self.err_distr = np.quantile(resid, self.q_vect, axis=0).T
self.target_cols = [f'{self.target_name}_t+{i}' for i in range(1, self.n_sa + 1)]
return self

@abstractmethod
Expand Down Expand Up @@ -238,7 +239,7 @@ def run(self, x, y, return_coeffs=False, start_from=0, fit=True):
return np.vstack(preds[1:]), np.vstack(coeffs_t_history)
return np.vstack(preds[1:])

def predict_quantiles(self, x, **kwargs):
def _predict_quantiles(self, x, **kwargs):
preds = self.predict(x)
return np.expand_dims(preds, -1) + np.expand_dims(self.err_distr, 0)

Expand Down Expand Up @@ -397,7 +398,7 @@ def run(self, x, y, return_coeffs=False, start_from=0, fit=True):
return np.vstack(preds[1:]), coeffs_t_history.T
return np.vstack(preds[1:])

def predict_quantiles(self, x, **kwargs):
def _predict_quantiles(self, x, **kwargs):
preds = self.predict(x)
return np.expand_dims(preds, -1) + np.expand_dims(self.err_distr, 0)

Expand Down Expand Up @@ -503,6 +504,7 @@ def fit(self, x_pd, y_pd=None, **kwargs):
hw_target = hankel(x_pd[self.target_name].values[1:], self.n_sa)
resid = hw_target - preds
self.err_distr = np.quantile(resid, self.q_vect, axis=0).T
self.target_cols = ['{}_{}'.format(self.target_name, t) for t in np.arange(self.n_sa)]
return self

def predict(self, x_pd, **kwargs):
Expand Down Expand Up @@ -563,7 +565,7 @@ def run(self, x_pd, return_coeffs=True, fit=True):
return np.vstack(preds[1:]), np.vstack(coeffs_t_history)
return np.vstack(preds[1:])

def predict_quantiles(self, x, **kwargs):
def _predict_quantiles(self, x, **kwargs):
preds = self.predict(x)
return np.expand_dims(preds, -1) + np.expand_dims(self.err_distr, 0)

Expand Down
2 changes: 1 addition & 1 deletion pyforecaster/forecasting_models/gradientboosters.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def dataset_at_stepahead(df, target_col_num, metadata_features, formatter, logge
period=period, keep_last_n_lags=keep_last_n_lags, keep_last_seconds=keep_last_seconds,
tol_period=tol_period)

def predict_quantiles(self, x, **kwargs):
def _predict_quantiles(self, x, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
for h in np.unique(x.index.hour):
preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)
Expand Down
5 changes: 3 additions & 2 deletions pyforecaster/forecasting_models/holtwinters.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ def fit(self, x_pd, y_pd=None, **kwargs):
# concat past inputs and last row of target
self.reinit(y)
self.err_distr = np.quantile(resid, self.q_vect, axis=0).T
self.target_cols = ['{}_{}'.format(self.target_name, t) for t in np.arange(self.n_sa)]
return self

def predict(self, x_pd, **kwargs):
Expand Down Expand Up @@ -231,7 +232,7 @@ def predict(self, x_pd, **kwargs):

return y_hat

def predict_quantiles(self, x, **kwargs):
def _predict_quantiles(self, x, **kwargs):
preds = self.predict(x)
return np.expand_dims(preds, -1) + np.expand_dims(self.err_distr, 0)

Expand Down Expand Up @@ -448,7 +449,7 @@ def reinit(self, x):
for i,m in enumerate(self.models):
m.reinit(x)

def predict_quantiles(self, x, **kwargs):
def _predict_quantiles(self, x, **kwargs):
preds = self.predict(x)
return np.expand_dims(preds, -1) + np.expand_dims(self.err_distr, 0)

Expand Down
113 changes: 90 additions & 23 deletions pyforecaster/forecasting_models/random_fourier_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
# Author: Tetsuya Ishikawa

import functools

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
from sklearn.linear_model import LinearRegression
Expand All @@ -10,47 +12,112 @@
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from tqdm import tqdm
from numba import njit, prange, typed, types


@njit(fastmath=True)
def compute_coeffs(y, x, x_binned, n_bins):
n_features = y.shape[1]
y_means = np.empty((n_bins, n_features))
x_means = np.zeros(n_bins)
coeffs = np.zeros((n_bins, n_features))

for j in range(n_bins):
mask = x_binned == j
selected_y = y[mask]
selected_x = x[mask]

if selected_y.shape[0] > 0:
# Use vectorized operation to calculate the mean
y_means[j, :] = np.sum(selected_y, axis=0) / selected_y.shape[0]

# Vectorized computation for x_means[j]
#x_means[j] = np.sum(selected_x) / selected_x.shape[0]

# Vectorized computation for coeffs[j, :]
#x_diff = selected_x - x_means[j]
#y_diff = selected_y - y_means[j, :]
#coeffs[j, :] = y_diff.T @ x_diff / (np.sum(x_diff ** 2) + 1e-12)
else:
y_means[j, :] = np.nan # or 0 if you prefer

return y_means, x_means, coeffs

@njit(fastmath=True)
def linear_response(x, x_binned, x_means, y_means, slopes):
return y_means[x_binned] #+ slopes[x_binned]*(x - x_means[x_binned]).reshape(-1, 1)

@njit(fastmath=True)
def fit_feature(x, x_binned, cuts, y):
y_means, x_means, coeffs = compute_coeffs(y, x, x_binned, len(cuts)-1)
y_hat = linear_response(x, x_binned, x_means, y_means, coeffs)
#y_means = np.array([np.array([y[x_binned == j, c].mean() for c in range(y.shape[1])]) for j in range(n_bins + 1)])
score = np.mean((y - y_hat) ** 2) ** 0.5
return cuts, y_means, x_means, coeffs, score


@njit
def fit_features(x, y, cuts):
pars = []
for i in range(x.shape[1]):
x_binned = np.digitize(x[:, i], cuts[i]) - 1
pars.append(fit_feature(x[:, i], x_binned, cuts[i], y))
return pars

class BrutalRegressor:
def __init__(self, n_iter=10):
def __init__(self, n_iter=10, n_bins=11, learning_rate=0.1, bootstrap_fraction=0.2):
self.best_pars = []
self.n_iter = n_iter
self.target_cols = None
def fit_feature(self, x, y, n_bins=10):

cuts = np.quantile(x, np.linspace(0, 1, n_bins+1))
x_binned = np.digitize(x, cuts)-1
y_means = np.array([y[x_binned == i].mean() for i in range(n_bins+1)])
score = np.mean((y - y_means[x_binned, :])**2)**0.5
pars = {'cuts': cuts, 'y_means': y_means, 'score': score}
self.bootstrap_fraction = bootstrap_fraction
self.n_bins = n_bins
self.learning_rate = learning_rate

return pars

def one_level_fit(self, x, y):
with ProcessPoolExecutor(4) as executor:
pars = list(executor.map(partial(self.fit_feature, y=y), [x[:, i] for i in range(x.shape[1])]))
# find best feature
best_feature = np.argmin([s['score'] for s in pars])
best_pars = pars[best_feature]
return best_feature, best_pars
def fit(self, x, y):
#pars = fit_feature(x[:11, 0], y.values[:11], n_bins=self.n_bins)
#with ProcessPoolExecutor() as executor:
# pars = list(tqdm(executor.map(partial(fit_feature, y=y.values), x.T), total=x.shape[1]) )
cuts = np.vstack([np.quantile(x_i, np.linspace(0, 1, self.n_bins+2)) for x_i in x.T])
cuts[:, 0] = -np.inf
cuts[:, -1] = np.inf
pars = fit_features(x, y, cuts)
best_feature = np.argmin([s[-1] for s in pars])
best_pars = {'cuts':pars[best_feature][0], 'y_means':pars[best_feature][1], 'x_means':pars[best_feature][2],
'coeffs':pars[best_feature][3], 'best_feature':best_feature}

return best_pars
def fit(self, x, y, plot=False):
if isinstance(x, pd.DataFrame):
x = x.values
self.target_cols = y.columns
err = y.copy()
if plot:
plt.figure()
err = y.copy().values
for i in tqdm(range(self.n_iter)):
best_feature, best_pars = self.one_level_fit(x, err)
y_hat = best_pars['y_means'][np.digitize(x[:, best_feature], best_pars['cuts'])-1]
rand_idx = np.random.choice(x.shape[0], int(self.bootstrap_fraction*x.shape[0]), replace=True)
best_pars = self.one_level_fit(x[rand_idx], err[rand_idx])
#y_hat = self.learning_rate*best_pars['y_means'][np.digitize(x[:, best_pars['best_feature']], best_pars['cuts'])-1]
x_binned = np.digitize(x[:, best_pars['best_feature']], best_pars['cuts']) - 1
y_hat = self.learning_rate*linear_response(x[:, best_pars['best_feature']], x_binned, best_pars['x_means'], best_pars['y_means'], best_pars['coeffs'])
err = err - y_hat
print(np.abs(err).mean().mean())
self.best_pars.append(best_pars)
print(np.mean(np.abs(err)))
if np.any(np.isnan(err)):
print('asdasdsad')
if plot:
plt.cla()
plt.scatter(y.iloc[:, 0], err[:, 0], s=1)
plt.pause(0.01)
return self

def predict(self, x):

y_hat = np.zeros((x.shape[0], self.best_pars[0]['y_means'].shape[1]))
for i in range(self.n_iter):
y_hat += self.best_pars[i]['y_means'][np.digitize(x.iloc[:, i], self.best_pars[i]['cuts'])-1]
#y_hat += self.learning_rate*self.best_pars[i]['y_means'][np.digitize(x.iloc[:, self.best_pars[i]['best_feature']], self.best_pars[i]['cuts'])-1]
x_binned = np.digitize(x.iloc[:, self.best_pars[i]['best_feature']], self.best_pars[i]['cuts']) - 1
y_hat += self.learning_rate*linear_response(x.iloc[:, self.best_pars[i]['best_feature']].values, x_binned, self.best_pars[i]['x_means'], self.best_pars[i]['y_means'], self.best_pars[i]['coeffs'])
return pd.DataFrame(y_hat, index = x.index, columns=self.target_cols)


Expand Down Expand Up @@ -284,7 +351,7 @@ def predict(self, x, **kwargs):
y_hat = RandomFFRegressor.predict(self, x, **kwargs)
y_hat = self.anti_transform(x, y_hat)
return y_hat
def predict_quantiles(self, x:pd.DataFrame, **kwargs):
def _predict_quantiles(self, x:pd.DataFrame, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
for h in np.unique(x.index.hour):
preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)
Expand Down Expand Up @@ -325,7 +392,7 @@ def predict(self, x, **kwargs):

y_hat = self.anti_transform(x, y_hat)
return y_hat
def predict_quantiles(self, x:pd.DataFrame, **kwargs):
def _predict_quantiles(self, x:pd.DataFrame, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
for h in np.unique(x.index.hour):
preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)
Expand Down
1 change: 1 addition & 0 deletions pyforecaster/forecasting_models/randomforests.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ def predict(self, x, **kwargs):
if len(preds.shape) == 2:
preds = np.expand_dims(preds, 0)
preds = np.swapaxes(preds, 1, 2)
preds = self.quantiles_to_df(preds, index=x.index)
else:
preds = pd.DataFrame(np.atleast_2d(np.squeeze(preds)), index=x.index, columns=self.target_cols)
y_hat = self.anti_transform(x, preds)
Expand Down
4 changes: 2 additions & 2 deletions pyforecaster/forecasting_models/statsmodels_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@ def fit(self, x_pd:pd.DataFrame, y:pd.DataFrame=None):
hw_target = hankel(y_present[1:], self.n_sa)
resid = hw_target - preds
self.err_distr = np.quantile(resid, self.q_vect, axis=0).T

self.target_cols = [f'{self.target_name}_t+{i}' for i in range(1, self.n_sa+1)]

def predict(self, x_pd:pd.DataFrame, **kwargs):
pass

def predict_quantiles(self, x:pd.DataFrame, **kwargs):
def _predict_quantiles(self, x:pd.DataFrame, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
for h in np.unique(x.index.hour):
preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)
Expand Down
Loading

0 comments on commit 0a7ba4a

Please sign in to comment.