refactored to return quantiles as multiindex dataframe

supsi-dacd-isaac · Sep 9, 2024 · 0a7ba4a · 0a7ba4a
1 parent e4cf2b3
commit 0a7ba4a
Show file tree

Hide file tree

Showing 14 changed files with 186 additions and 63 deletions.
diff --git a/pyforecaster/forecaster.py b/pyforecaster/forecaster.py
@@ -54,6 +54,7 @@ def __init__(self, q_vect=None, nodes_at_step=None, val_ratio=None, logger=None,
         self.additional_node = additional_node
         self.formatter = formatter
         self.conditional_to_hour = conditional_to_hour
+        self.target_cols = None
 
     @property
     def online_tree_reduction(self):
@@ -100,17 +101,44 @@ def anti_transform(self, x, y_hat):
         return y_hat
 
     @abstractmethod
-    def predict_quantiles(self, x, **kwargs):
+    def _predict_quantiles(self, x, **kwargs):
         pass
 
+    def quantiles_to_df(self, q_hat:np.ndarray, index):
+        level_0_labels = self.target_cols
+        level_1_labels = self.q_vect
+        q_hat = np.swapaxes(q_hat, 1, 2)
+        q_hat = np.reshape(q_hat, (q_hat.shape[0], q_hat.shape[1] * q_hat.shape[2]))
+        q_hat = pd.DataFrame(q_hat, index=index, columns=pd.MultiIndex.from_product([level_0_labels, level_1_labels]))
+        q_hat.columns.names = ['target', 'quantile']
+        return q_hat
+
+    @staticmethod
+    def quantiles_to_numpy(q_hat:pd.DataFrame):
+        n_taus = len(q_hat.columns.get_level_values(1).unique())
+        q_hat = q_hat.values
+        q_hat = np.reshape(q_hat, (q_hat.shape[0], n_taus, -1))
+        q_hat = np.swapaxes(q_hat, 1, 2)
+        return q_hat
+
+    def predict_quantiles(self, x, dataframe=True, **kwargs):
+        q_hat = self._predict_quantiles(x, **kwargs)
+        if isinstance(q_hat, np.ndarray) and dataframe:
+            # create multiindex dataframe
+            q_hat = self.quantiles_to_df(q_hat, x.index)
+        if isinstance(q_hat, pd.DataFrame) and not dataframe:
+            q_hat = self.quantiles_to_numpy(q_hat)
+
+        return q_hat
+
     def predict_pmf(self, x, discrete_prob_space, **predict_q_kwargs):
         """
         Return probability mass function of the target variable, obtained from quantile predictions
         :param x:
         :param predict_q_kwargs:
         :return:
         """
-        quantiles = self.predict_quantiles(x, **predict_q_kwargs)
+        quantiles = self.predict_quantiles(x, dataframe=False, **predict_q_kwargs)
         pmf = np.zeros((quantiles.shape[0], quantiles.shape[1], len(discrete_prob_space)-1))
         for i in range(quantiles.shape[0]):
             for j in range(quantiles.shape[1]):
@@ -120,7 +148,7 @@ def predict_pmf(self, x, discrete_prob_space, **predict_q_kwargs):
     def predict_scenarios(self, x, n_scen=None, random_state=None, **predict_q_kwargs):
         n_scen = self.n_scen_fit if n_scen is None else n_scen
         # retrieve quantiles from child class
-        quantiles = self.predict_quantiles(x, **predict_q_kwargs)
+        quantiles = self.predict_quantiles(x, dataframe=False, **predict_q_kwargs)
         scenarios = self.scengen.predict_scenarios(quantiles, n_scen=n_scen, x=x, random_state=random_state)
         q_from_scens = np.rollaxis(np.quantile(scenarios, self.q_vect, axis=-1), 0, 3)
         mean_abs_dev = np.abs(q_from_scens - quantiles).mean(axis=0).mean(axis=0)
@@ -142,7 +170,7 @@ def predict_trees(self, x, n_scen=100, nodes_at_step=None, init_obs=None, random
 
         if self.online_tree_reduction:
             # retrieve quantiles from child class
-            quantiles = self.predict_quantiles(x, **predict_q_kwargs)
+            quantiles = self.predict_quantiles(x, dataframe=False, **predict_q_kwargs)
 
             trees = self.scengen.predict_trees(quantiles=quantiles, n_scen=n_scen, x=x,
                                          nodes_at_step=nodes_at_step, init_obs=init_obs,
@@ -194,7 +222,7 @@ def predict(self, x:pd.DataFrame, **kwargs):
         y_hat = self.anti_transform(x, y_hat)
         return y_hat
 
-    def predict_quantiles(self, x:pd.DataFrame, **kwargs):
+    def _predict_quantiles(self, x:pd.DataFrame, **kwargs):
         preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
         for h in np.unique(x.index.hour):
             preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)
@@ -250,7 +278,7 @@ def predict(self, x, **kwargs):
         y_hat = self.anti_transform(x, y_hat)
         return y_hat
 
-    def predict_quantiles(self, x, **kwargs):
+    def _predict_quantiles(self, x, **kwargs):
         preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
         for h in np.unique(x.index.hour):
             preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)

diff --git a/pyforecaster/forecasting_models/benchmarks.py b/pyforecaster/forecasting_models/benchmarks.py
@@ -21,7 +21,7 @@ def predict(self, x:pd.DataFrame, **kwargs):
         y_hat = x[self.target_col].values.reshape(-1, 1) * np.ones((1, self.n_sa))
         return pd.DataFrame(y_hat, index=x.index)
 
-    def predict_quantiles(self, x:pd.DataFrame, **kwargs):
+    def _predict_quantiles(self, x:pd.DataFrame, **kwargs):
         preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
         if self.conditional_to_hour:
             for h in np.unique(x.index.hour):
@@ -54,7 +54,7 @@ def predict(self, x: pd.DataFrame, **kwargs):
         y_hat = hankel(values, self.n_sa)
         return pd.DataFrame(y_hat[:len(x), :], index=x.index)
 
-    def predict_quantiles(self, x: pd.DataFrame, **kwargs):
+    def _predict_quantiles(self, x: pd.DataFrame, **kwargs):
         preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
         if self.conditional_to_hour:
             for h in np.unique(x.index.hour):

diff --git a/pyforecaster/forecasting_models/fast_adaptive_models.py b/pyforecaster/forecasting_models/fast_adaptive_models.py
@@ -135,6 +135,7 @@ def fit(self, x_pd, y_pd=None, **kwargs):
         hw_target = hankel(y_present[1:], self.n_sa)
         resid = hw_target - preds
         self.err_distr = np.quantile(resid, self.q_vect, axis=0).T
+        self.target_cols = [f'{self.target_name}_t+{i}' for i in range(1, self.n_sa + 1)]
         return self
 
     @abstractmethod
@@ -238,7 +239,7 @@ def run(self, x, y, return_coeffs=False, start_from=0, fit=True):
             return np.vstack(preds[1:]), np.vstack(coeffs_t_history)
         return np.vstack(preds[1:])
 
-    def predict_quantiles(self, x, **kwargs):
+    def _predict_quantiles(self, x, **kwargs):
         preds = self.predict(x)
         return np.expand_dims(preds, -1) + np.expand_dims(self.err_distr, 0)
 
@@ -397,7 +398,7 @@ def run(self, x, y, return_coeffs=False, start_from=0, fit=True):
             return np.vstack(preds[1:]), coeffs_t_history.T
         return np.vstack(preds[1:])
 
-    def predict_quantiles(self, x, **kwargs):
+    def _predict_quantiles(self, x, **kwargs):
         preds = self.predict(x)
         return np.expand_dims(preds, -1) + np.expand_dims(self.err_distr, 0)
 
@@ -503,6 +504,7 @@ def fit(self, x_pd, y_pd=None, **kwargs):
         hw_target = hankel(x_pd[self.target_name].values[1:], self.n_sa)
         resid = hw_target - preds
         self.err_distr = np.quantile(resid, self.q_vect, axis=0).T
+        self.target_cols = ['{}_{}'.format(self.target_name, t) for t in np.arange(self.n_sa)]
         return self
 
     def predict(self, x_pd, **kwargs):
@@ -563,7 +565,7 @@ def run(self, x_pd, return_coeffs=True, fit=True):
             return np.vstack(preds[1:]), np.vstack(coeffs_t_history)
         return np.vstack(preds[1:])
 
-    def predict_quantiles(self, x, **kwargs):
+    def _predict_quantiles(self, x, **kwargs):
         preds = self.predict(x)
         return np.expand_dims(preds, -1) + np.expand_dims(self.err_distr, 0)
 

diff --git a/pyforecaster/forecasting_models/gradientboosters.py b/pyforecaster/forecasting_models/gradientboosters.py
@@ -165,7 +165,7 @@ def dataset_at_stepahead(df, target_col_num, metadata_features, formatter, logge
                                                         period=period, keep_last_n_lags=keep_last_n_lags, keep_last_seconds=keep_last_seconds,
                                                         tol_period=tol_period)
 
-    def predict_quantiles(self, x, **kwargs):
+    def _predict_quantiles(self, x, **kwargs):
         preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
         for h in np.unique(x.index.hour):
             preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)

diff --git a/pyforecaster/forecasting_models/holtwinters.py b/pyforecaster/forecasting_models/holtwinters.py
@@ -198,6 +198,7 @@ def fit(self, x_pd, y_pd=None, **kwargs):
         # concat past inputs and last row of target
         self.reinit(y)
         self.err_distr = np.quantile(resid, self.q_vect, axis=0).T
+        self.target_cols = ['{}_{}'.format(self.target_name, t) for t in np.arange(self.n_sa)]
         return self
 
     def predict(self, x_pd, **kwargs):
@@ -231,7 +232,7 @@ def predict(self, x_pd, **kwargs):
 
         return y_hat
 
-    def predict_quantiles(self, x, **kwargs):
+    def _predict_quantiles(self, x, **kwargs):
         preds = self.predict(x)
         return np.expand_dims(preds, -1) + np.expand_dims(self.err_distr, 0)
 
@@ -448,7 +449,7 @@ def reinit(self, x):
         for i,m in enumerate(self.models):
             m.reinit(x)
 
-    def predict_quantiles(self, x, **kwargs):
+    def _predict_quantiles(self, x, **kwargs):
         preds = self.predict(x)
         return np.expand_dims(preds, -1) + np.expand_dims(self.err_distr, 0)
 

diff --git a/pyforecaster/forecasting_models/random_fourier_features.py b/pyforecaster/forecasting_models/random_fourier_features.py
@@ -2,6 +2,8 @@
 # Author: Tetsuya Ishikawa
 
 import functools
+
+import matplotlib.pyplot as plt
 import numpy as np
 import scipy.stats
 from sklearn.linear_model import LinearRegression
@@ -10,47 +12,112 @@
 from concurrent.futures import ProcessPoolExecutor
 from functools import partial
 from tqdm import tqdm
+from numba import njit, prange, typed, types
+
+
+@njit(fastmath=True)
+def compute_coeffs(y, x, x_binned, n_bins):
+    n_features = y.shape[1]
+    y_means = np.empty((n_bins, n_features))
+    x_means = np.zeros(n_bins)
+    coeffs = np.zeros((n_bins, n_features))
+
+    for j in range(n_bins):
+        mask = x_binned == j
+        selected_y = y[mask]
+        selected_x = x[mask]
+
+        if selected_y.shape[0] > 0:
+            # Use vectorized operation to calculate the mean
+            y_means[j, :] = np.sum(selected_y, axis=0) / selected_y.shape[0]
+
+            # Vectorized computation for x_means[j]
+            #x_means[j] = np.sum(selected_x) / selected_x.shape[0]
+
+            # Vectorized computation for coeffs[j, :]
+            #x_diff = selected_x - x_means[j]
+            #y_diff = selected_y - y_means[j, :]
+            #coeffs[j, :] = y_diff.T @ x_diff / (np.sum(x_diff ** 2) + 1e-12)
+        else:
+            y_means[j, :] = np.nan  # or 0 if you prefer
+
+    return y_means, x_means, coeffs
+
+@njit(fastmath=True)
+def linear_response(x, x_binned, x_means, y_means, slopes):
+    return y_means[x_binned] #+ slopes[x_binned]*(x - x_means[x_binned]).reshape(-1, 1)
+
+@njit(fastmath=True)
+def fit_feature(x, x_binned, cuts, y):
+    y_means, x_means, coeffs = compute_coeffs(y, x, x_binned, len(cuts)-1)
+    y_hat = linear_response(x, x_binned, x_means, y_means, coeffs)
+    #y_means = np.array([np.array([y[x_binned == j, c].mean() for c in range(y.shape[1])]) for j in range(n_bins + 1)])
+    score = np.mean((y - y_hat) ** 2) ** 0.5
+    return cuts, y_means, x_means, coeffs, score
+
+
+@njit
+def fit_features(x, y, cuts):
+    pars = []
+    for i in range(x.shape[1]):
+        x_binned = np.digitize(x[:, i], cuts[i]) - 1
+        pars.append(fit_feature(x[:, i], x_binned, cuts[i], y))
+    return pars
 
 class BrutalRegressor:
-    def __init__(self, n_iter=10):
+    def __init__(self, n_iter=10, n_bins=11, learning_rate=0.1, bootstrap_fraction=0.2):
         self.best_pars = []
         self.n_iter = n_iter
         self.target_cols = None
-    def fit_feature(self, x, y, n_bins=10):
-
-        cuts = np.quantile(x, np.linspace(0, 1, n_bins+1))
-        x_binned = np.digitize(x, cuts)-1
-        y_means = np.array([y[x_binned == i].mean() for i in range(n_bins+1)])
-        score = np.mean((y - y_means[x_binned, :])**2)**0.5
-        pars = {'cuts': cuts, 'y_means': y_means, 'score': score}
+        self.bootstrap_fraction = bootstrap_fraction
+        self.n_bins = n_bins
+        self.learning_rate = learning_rate
 
-        return pars
 
     def one_level_fit(self, x, y):
-        with ProcessPoolExecutor(4) as executor:
-            pars = list(executor.map(partial(self.fit_feature, y=y), [x[:, i] for i in range(x.shape[1])]))
-        # find best feature
-        best_feature = np.argmin([s['score'] for s in pars])
-        best_pars = pars[best_feature]
-        return best_feature, best_pars
-    def fit(self, x, y):
+        #pars = fit_feature(x[:11, 0], y.values[:11], n_bins=self.n_bins)
+        #with ProcessPoolExecutor() as executor:
+        #    pars = list(tqdm(executor.map(partial(fit_feature, y=y.values), x.T), total=x.shape[1]) )
+        cuts = np.vstack([np.quantile(x_i,  np.linspace(0, 1, self.n_bins+2)) for x_i in x.T])
+        cuts[:, 0] = -np.inf
+        cuts[:, -1] = np.inf
+        pars = fit_features(x, y, cuts)
+        best_feature = np.argmin([s[-1] for s in pars])
+        best_pars = {'cuts':pars[best_feature][0], 'y_means':pars[best_feature][1], 'x_means':pars[best_feature][2],
+                     'coeffs':pars[best_feature][3], 'best_feature':best_feature}
+
+        return best_pars
+    def fit(self, x, y, plot=False):
         if isinstance(x, pd.DataFrame):
             x = x.values
         self.target_cols = y.columns
-        err = y.copy()
+        if plot:
+            plt.figure()
+        err = y.copy().values
         for i in tqdm(range(self.n_iter)):
-            best_feature, best_pars = self.one_level_fit(x, err)
-            y_hat = best_pars['y_means'][np.digitize(x[:, best_feature], best_pars['cuts'])-1]
+            rand_idx = np.random.choice(x.shape[0], int(self.bootstrap_fraction*x.shape[0]), replace=True)
+            best_pars = self.one_level_fit(x[rand_idx], err[rand_idx])
+            #y_hat = self.learning_rate*best_pars['y_means'][np.digitize(x[:, best_pars['best_feature']], best_pars['cuts'])-1]
+            x_binned = np.digitize(x[:, best_pars['best_feature']], best_pars['cuts']) - 1
+            y_hat = self.learning_rate*linear_response(x[:, best_pars['best_feature']], x_binned, best_pars['x_means'], best_pars['y_means'], best_pars['coeffs'])
             err = err - y_hat
-            print(np.abs(err).mean().mean())
             self.best_pars.append(best_pars)
+            print(np.mean(np.abs(err)))
+            if np.any(np.isnan(err)):
+                print('asdasdsad')
+            if plot:
+                plt.cla()
+                plt.scatter(y.iloc[:, 0], err[:, 0], s=1)
+                plt.pause(0.01)
         return self
 
     def predict(self, x):
 
         y_hat = np.zeros((x.shape[0], self.best_pars[0]['y_means'].shape[1]))
         for i in range(self.n_iter):
-            y_hat += self.best_pars[i]['y_means'][np.digitize(x.iloc[:, i], self.best_pars[i]['cuts'])-1]
+            #y_hat += self.learning_rate*self.best_pars[i]['y_means'][np.digitize(x.iloc[:, self.best_pars[i]['best_feature']], self.best_pars[i]['cuts'])-1]
+            x_binned = np.digitize(x.iloc[:, self.best_pars[i]['best_feature']], self.best_pars[i]['cuts']) - 1
+            y_hat += self.learning_rate*linear_response(x.iloc[:, self.best_pars[i]['best_feature']].values, x_binned, self.best_pars[i]['x_means'], self.best_pars[i]['y_means'], self.best_pars[i]['coeffs'])
         return pd.DataFrame(y_hat, index = x.index, columns=self.target_cols)
 
 
@@ -284,7 +351,7 @@ def predict(self, x, **kwargs):
         y_hat = RandomFFRegressor.predict(self, x, **kwargs)
         y_hat = self.anti_transform(x, y_hat)
         return y_hat
-    def predict_quantiles(self, x:pd.DataFrame, **kwargs):
+    def _predict_quantiles(self, x:pd.DataFrame, **kwargs):
         preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
         for h in np.unique(x.index.hour):
             preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)
@@ -325,7 +392,7 @@ def predict(self, x, **kwargs):
 
         y_hat = self.anti_transform(x, y_hat)
         return y_hat
-    def predict_quantiles(self, x:pd.DataFrame, **kwargs):
+    def _predict_quantiles(self, x:pd.DataFrame, **kwargs):
         preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
         for h in np.unique(x.index.hour):
             preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)

diff --git a/pyforecaster/forecasting_models/randomforests.py b/pyforecaster/forecasting_models/randomforests.py
@@ -159,6 +159,7 @@ def predict(self, x, **kwargs):
                 if len(preds.shape) == 2:
                     preds = np.expand_dims(preds, 0)
                 preds = np.swapaxes(preds, 1, 2)
+                preds = self.quantiles_to_df(preds, index=x.index)
         else:
             preds = pd.DataFrame(np.atleast_2d(np.squeeze(preds)), index=x.index, columns=self.target_cols)
         y_hat = self.anti_transform(x, preds)

diff --git a/pyforecaster/forecasting_models/statsmodels_wrapper.py b/pyforecaster/forecasting_models/statsmodels_wrapper.py
@@ -38,12 +38,12 @@ def fit(self, x_pd:pd.DataFrame, y:pd.DataFrame=None):
         hw_target = hankel(y_present[1:], self.n_sa)
         resid = hw_target - preds
         self.err_distr = np.quantile(resid, self.q_vect, axis=0).T
-
+        self.target_cols = [f'{self.target_name}_t+{i}' for i in range(1, self.n_sa+1)]
 
     def predict(self, x_pd:pd.DataFrame, **kwargs):
         pass
 
-    def predict_quantiles(self, x:pd.DataFrame, **kwargs):
+    def _predict_quantiles(self, x:pd.DataFrame, **kwargs):
         preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
         for h in np.unique(x.index.hour):
             preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)