diff --git a/.coverage b/.coverage index 7db2155a..0b9f5a5c 100644 Binary files a/.coverage and b/.coverage differ diff --git a/pyfixest/estimation/feols_.py b/pyfixest/estimation/feols_.py index 72d1617d..04a77e6b 100644 --- a/pyfixest/estimation/feols_.py +++ b/pyfixest/estimation/feols_.py @@ -4,7 +4,6 @@ from importlib import import_module from typing import Optional, Union -import numba as nb import numpy as np import pandas as pd from formulaic import Formula @@ -12,7 +11,7 @@ from scipy.sparse.linalg import spsolve from scipy.stats import f, norm, t -from pyfixest.errors import NanInClusterVarError, VcovTypeNotSupportedError +from pyfixest.errors import VcovTypeNotSupportedError from pyfixest.estimation.ritest import ( _decode_resampvar, _get_ritest_pvalue, @@ -20,6 +19,14 @@ _get_ritest_stats_slow, _plot_ritest_pvalue, ) +from pyfixest.estimation.vcov_utils import ( + _check_cluster_df, + _compute_bread, + _count_G_for_ssc_correction, + _crv1_meat_loop, + _get_cluster_df, + _prepare_twoway_clustering, +) from pyfixest.utils.dev_utils import ( DataFrameType, _polars_to_pandas, @@ -87,8 +94,6 @@ class Feols: Number of independent variables (or features). _support_crv3_inference : bool Indicates support for CRV3 inference. - _support_iid_inference : bool - Indicates support for IID inference. _data : Any Data used in the regression, to be enriched outside of the class. _fml : Any @@ -215,7 +220,6 @@ def __init__( self._support_crv3_inference = True if self._weights_name is not None: self._supports_wildboottest = False - self._support_iid_inference = True self._supports_wildboottest = True self._supports_cluster_causal_variance = True if self._has_weights or self._is_iv: @@ -352,30 +356,18 @@ def vcov( _has_fixef = self._has_fixef _is_iv = self._is_iv _method = self._method - _support_iid_inference = self._support_iid_inference _support_crv3_inference = self._support_crv3_inference - _weights_name = self._weights_name - _beta_hat = self._beta_hat - - _X = self._X - _Z = self._Z _tXZ = self._tXZ _tZZinv = self._tZZinv _tZX = self._tZX # _tZXinv = self._tZXinv _hessian = self._hessian - _scores = self._scores - _weights = self._weights - _weights_type = self._weights_type _ssc_dict = self._ssc_dict _N = self._N - _N_rows = self._N_rows _k = self._k - _u_hat = self._u_hat - # deparse vcov input _check_vcov_input(vcov, _data) ( @@ -385,18 +377,10 @@ def vcov( self._clustervar, ) = _deparse_vcov_input(vcov, _has_fixef, _is_iv) - if _is_iv: - bread = np.linalg.inv(_tXZ @ _tZZinv @ _tZX) - else: - bread = np.linalg.inv(_hessian) + self._bread = _compute_bread(_is_iv, _tXZ, _tZZinv, _tZX, _hessian) # compute vcov if self._vcov_type == "iid": - if not _support_iid_inference: - raise NotImplementedError( - f"'iid' inference is not supported for {_method} regressions." - ) - self._ssc = get_ssc( ssc_dict=_ssc_dict, N=_N, @@ -406,16 +390,7 @@ def vcov( vcov_type="iid", ) - if self._method == "feols": - sigma2 = np.sum(_u_hat.flatten() ** 2) / (_N - 1) - elif self._method == "fepois": - sigma2 = 1 - else: - raise NotImplementedError( - f"'iid' inference is not supported for {_method} regressions." 
- ) - - self._vcov = self._ssc * bread * sigma2 + self._vcov = self._ssc * self._vcov_iid() elif self._vcov_type == "hetero": self._ssc = get_ssc( @@ -427,84 +402,33 @@ def vcov( vcov_type="hetero", ) - if self._vcov_type_detail in ["hetero", "HC1"]: - u = _u_hat - transformed_scores = _scores - elif self._vcov_type_detail in ["HC2", "HC3"]: - if _is_iv: - raise VcovTypeNotSupportedError( - "HC2 and HC3 inference is not supported for IV regressions." - ) - _tZXinv = np.linalg.inv(_tZX) - leverage = np.sum(_X * (_X @ _tZXinv), axis=1) - if self._vcov_type_detail == "HC2": - u = _u_hat / np.sqrt(1 - leverage) - transformed_scores = _scores / np.sqrt(1 - leverage)[:, None] - else: - transformed_scores = _scores / (1 - leverage)[:, None] - - if _is_iv is False: - meat = transformed_scores.transpose() @ transformed_scores - self._vcov = self._ssc * bread @ meat @ bread - else: - if u.ndim == 1: - u = u.reshape((-1, 1)) - Omega = ( - transformed_scores.transpose() @ transformed_scores - ) # np.transpose( self._Z) @ ( self._Z * (u**2)) # k x k - meat = _tXZ @ _tZZinv @ Omega @ _tZZinv @ _tZX # k x k - self._vcov = self._ssc * bread @ meat @ bread + self._vcov = self._ssc * self._vcov_hetero() elif self._vcov_type == "CRV": if data is not None: - data_pandas = _polars_to_pandas(data) - self._cluster_df = data_pandas[self._clustervar].copy() - elif not self._data.empty: - self._cluster_df = self._data[self._clustervar].copy() - else: - raise AttributeError( - """The input data set needs to be stored in the model object if - you call `vcov()` post estimation with a novel cluster variable. - Please set the function argument `store_data=True` when calling - the regression. - """ + # use input data set + self._cluster_df = _get_cluster_df( + data=data, clustervar=self._clustervar ) - - if np.any(self._cluster_df.isna().any()): - raise NanInClusterVarError( - "CRV inference not supported with missing values in the cluster variable." - "Please drop missing values before running the regression." + _check_cluster_df(cluster_df=self._cluster_df, data=data) + else: + # use stored data + self._cluster_df = _get_cluster_df( + data=self._data, clustervar=self._clustervar ) + _check_cluster_df(cluster_df=self._cluster_df, data=self._data) if self._cluster_df.shape[1] > 1: - # paste both columns together - # set cluster_df to string - - cluster_one = self._clustervar[0] - cluster_two = self._clustervar[1] - - cluster_df_one_str = self._cluster_df[cluster_one].astype(str) - cluster_df_two_str = self._cluster_df[cluster_two].astype(str) - self._cluster_df.loc[:, "cluster_intersection"] = ( - cluster_df_one_str.str.cat(cluster_df_two_str, sep="-") + self._cluster_df = _prepare_twoway_clustering( + clustervar=self._clustervar, cluster_df=self._cluster_df ) - if self._cluster_df.shape[0] != _N_rows: - raise ValueError( - "The cluster variable must have the same length as the data set." 
- ) - - G = [] - for col in self._cluster_df.columns: - G.append(self._cluster_df[col].nunique()) - - if _ssc_dict["cluster_df"] == "min": - G = [min(G)] * 3 + self._G = _count_G_for_ssc_correction( + cluster_df=self._cluster_df, ssc_dict=_ssc_dict + ) # loop over columns of cluster_df vcov_sign_list = [1, 1, -1] - self._G = G - self._vcov = np.zeros((self._k, self._k)) for x, col in enumerate(self._cluster_df.columns): @@ -521,34 +445,13 @@ def vcov( vcov_type="CRV", ) - if x == 0: - self._ssc = np.array([ssc]) - else: - self._ssc = np.append(self._ssc, ssc) + self._ssc = np.array([ssc]) if x == 0 else np.append(self._ssc, ssc) if self._vcov_type_detail == "CRV1": - k_instruments = _Z.shape[1] - meat = np.zeros((k_instruments, k_instruments)) - - # deviance uniquely for Poisson - if self._method == "fepois": - weighted_uhat = _weights.flatten() * _u_hat.flatten() - else: - weighted_uhat = _u_hat - - meat = _crv1_meat_loop( - _Z=_Z.astype(np.float64), - weighted_uhat=weighted_uhat.astype(np.float64).reshape((-1, 1)), - clustid=clustid, - cluster_col=cluster_col, + self._vcov += self._ssc[x] * self._vcov_crv1( + clustid=clustid, cluster_col=cluster_col ) - if _is_iv is False: - self._vcov += self._ssc[x] * bread @ meat @ bread - else: - meat = _tXZ @ _tZZinv @ meat @ _tZZinv @ self._tZX - self._vcov += self._ssc[x] * bread @ meat @ bread - elif self._vcov_type_detail == "CRV3": # check: is fixed effect cluster fixed effect? # if not, either error or turn fixefs into dummies @@ -559,67 +462,205 @@ def vcov( "CRV3 inference is not supported with IV regression." ) - beta_jack = np.zeros((len(clustid), _k)) - if ( - (self._has_fixef is False) - and (self._method == "feols") + (_has_fixef is False) + and (_method == "feols") and (_is_iv is False) ): - # inverse hessian precomputed? 
- tXX = np.transpose(self._X) @ self._X - tXy = np.transpose(self._X) @ self._Y - - # compute leave-one-out regression coefficients (aka clusterjacks') # noqa: W505 - for ixg, g in enumerate(clustid): - Xg = self._X[np.equal(g, cluster_col)] - Yg = self._Y[np.equal(g, cluster_col)] - tXgXg = np.transpose(Xg) @ Xg - # jackknife regression coefficient - beta_jack[ixg, :] = ( - np.linalg.pinv(tXX - tXgXg) - @ (tXy - np.transpose(Xg) @ Yg) - ).flatten() - + self._vcov += self._ssc[x] * self._vcov_crv3_fast( + clustid=clustid, cluster_col=cluster_col + ) else: - # lazy loading to avoid circular import - fixest_module = import_module("pyfixest.estimation") - if self._method == "feols": - fit_ = getattr(fixest_module, "feols") - else: - fit_ = getattr(fixest_module, "fepois") - - for ixg, g in enumerate(clustid): - # direct leave one cluster out implementation - data = _data[~np.equal(g, cluster_col)] - fit = fit_( - fml=self._fml, - data=data, - vcov="iid", - weights=_weights_name, - weights_type=_weights_type, - ) - beta_jack[ixg, :] = fit.coef().to_numpy() - - # optional: beta_bar in MNW (2022) - # center = "estimate" - # if center == 'estimate': - # beta_center = beta_hat - # else: - # beta_center = np.mean(beta_jack, axis = 0) - beta_center = _beta_hat - - vcov_mat = np.zeros((_k, _k)) - for ixg, g in enumerate(clustid): - beta_centered = beta_jack[ixg, :] - beta_center - vcov_mat += np.outer(beta_centered, beta_centered) - - self._vcov += self._ssc[x] * vcov_mat + self._vcov += self._ssc[x] * self._vcov_crv3_slow( + clustid=clustid, cluster_col=cluster_col + ) + # update p-value, t-stat, standard error, confint self.get_inference() return self + def _vcov_iid(self): + _N = self._N + _u_hat = self._u_hat + _method = self._method + _bread = self._bread + + if _method == "feols": + sigma2 = np.sum(_u_hat.flatten() ** 2) / (_N - 1) + elif _method == "fepois": + sigma2 = 1 + else: + raise NotImplementedError( + f"'iid' inference is not supported for {_method} regressions." + ) + + _vcov = _bread * sigma2 + + return _vcov + + def _vcov_hetero(self): + _u_hat = self._u_hat + _scores = self._scores + _vcov_type_detail = self._vcov_type_detail + _tXZ = self._tXZ + _tZZinv = self._tZZinv + _tZX = self._tZX + _X = self._X + _is_iv = self._is_iv + _bread = self._bread + + if _vcov_type_detail in ["hetero", "HC1"]: + u = _u_hat + transformed_scores = _scores + elif _vcov_type_detail in ["HC2", "HC3"]: + if _is_iv: + raise VcovTypeNotSupportedError( + "HC2 and HC3 inference is not supported for IV regressions." 
+ ) + _tZXinv = np.linalg.inv(_tZX) + leverage = np.sum(_X * (_X @ _tZXinv), axis=1) + if _vcov_type_detail == "HC2": + u = _u_hat / np.sqrt(1 - leverage) + transformed_scores = _scores / np.sqrt(1 - leverage)[:, None] + else: + transformed_scores = _scores / (1 - leverage)[:, None] + + if _is_iv is False: + meat = transformed_scores.transpose() @ transformed_scores + _vcov = _bread @ meat @ _bread + else: + if u.ndim == 1: + u = u.reshape((-1, 1)) + Omega = ( + transformed_scores.transpose() @ transformed_scores + ) # np.transpose( _Z) @ ( _Z * (u**2)) # k x k + meat = _tXZ @ _tZZinv @ Omega @ _tZZinv @ _tZX # k x k + _vcov = _bread @ meat @ _bread + + return _vcov + + def _vcov_crv1(self, clustid, cluster_col): + _Z = self._Z + _weights = self._weights + _u_hat = self._u_hat + _method = self._method + _is_iv = self._is_iv + _tXZ = self._tXZ + _tZZinv = self._tZZinv + _tZX = self._tZX + _bread = self._bread + + k_instruments = _Z.shape[1] + meat = np.zeros((k_instruments, k_instruments)) + + # deviance uniquely for Poisson + if _method == "fepois": + weighted_uhat = _weights.flatten() * _u_hat.flatten() + else: + weighted_uhat = _u_hat + + meat = _crv1_meat_loop( + _Z=_Z.astype(np.float64), + weighted_uhat=weighted_uhat.astype(np.float64).reshape((-1, 1)), + clustid=clustid, + cluster_col=cluster_col, + ) + + if _is_iv is False: + _vcov = _bread @ meat @ _bread + else: + meat = _tXZ @ _tZZinv @ meat @ _tZZinv @ _tZX + _vcov = _bread @ meat @ _bread + + return _vcov + + def _vcov_crv3_fast(self, clustid, cluster_col): + _k = self._k + _Y = self._Y + _X = self._X + _beta_hat = self._beta_hat + + beta_jack = np.zeros((len(clustid), _k)) + + # inverse hessian precomputed? + tXX = np.transpose(_X) @ _X + tXy = np.transpose(_X) @ _Y + + # compute leave-one-out regression coefficients (aka clusterjacks') # noqa: W505 + for ixg, g in enumerate(clustid): + Xg = _X[np.equal(g, cluster_col)] + Yg = _Y[np.equal(g, cluster_col)] + tXgXg = np.transpose(Xg) @ Xg + # jackknife regression coefficient + beta_jack[ixg, :] = ( + np.linalg.pinv(tXX - tXgXg) @ (tXy - np.transpose(Xg) @ Yg) + ).flatten() + + # optional: beta_bar in MNW (2022) + # center = "estimate" + # if center == 'estimate': + # beta_center = beta_hat + # else: + # beta_center = np.mean(beta_jack, axis = 0) + beta_center = _beta_hat + + vcov_mat = np.zeros((_k, _k)) + for ixg, g in enumerate(clustid): + beta_centered = beta_jack[ixg, :] - beta_center + vcov_mat += np.outer(beta_centered, beta_centered) + + _vcov = vcov_mat + + return _vcov + + def _vcov_crv3_slow(self, clustid, cluster_col): + _k = self._k + _method = self._method + _fml = self._fml + _data = self._data + _weights_name = self._weights_name + _weights_type = self._weights_type + _beta_hat = self._beta_hat + + beta_jack = np.zeros((len(clustid), _k)) + + # lazy loading to avoid circular import + fixest_module = import_module("pyfixest.estimation") + if _method == "feols": + fit_ = getattr(fixest_module, "feols") + else: + fit_ = getattr(fixest_module, "fepois") + + for ixg, g in enumerate(clustid): + # direct leave one cluster out implementation + data = _data[~np.equal(g, cluster_col)] + fit = fit_( + fml=_fml, + data=data, + vcov="iid", + weights=_weights_name, + weights_type=_weights_type, + ) + beta_jack[ixg, :] = fit.coef().to_numpy() + + # optional: beta_bar in MNW (2022) + # center = "estimate" + # if center == 'estimate': + # beta_center = beta_hat + # else: + # beta_center = np.mean(beta_jack, axis = 0) + beta_center = _beta_hat + + vcov_mat = np.zeros((_k, _k)) + for 
ixg, g in enumerate(clustid): + beta_centered = beta_jack[ixg, :] - beta_center + vcov_mat += np.outer(beta_centered, beta_centered) + + _vcov = vcov_mat + + return _vcov + def get_inference(self, alpha: float = 0.95) -> None: """ Compute standard errors, t-statistics, and p-values for the regression model. @@ -1450,7 +1491,7 @@ def get_performance(self) -> None: self._adj_r2 = np.nan self._adj_r2_within = np.nan - def tidy(self) -> pd.DataFrame: + def tidy(self, alpha=0.05) -> pd.DataFrame: """ Tidy model outputs. @@ -1464,6 +1505,7 @@ def tidy(self) -> pd.DataFrame: estimates, standard errors, t-statistics, and p-values. """ _coefnames = self._coefnames + _se = self._se _tstat = self._tstat _pvalue = self._pvalue @@ -2068,87 +2110,6 @@ def _find_collinear_variables( return id_excl, n_excl, False -# CODE from Styfen Schaer (@styfenschaer) -@nb.njit(parallel=False) -def bucket_argsort(arr: np.ndarray) -> tuple[np.ndarray, np.ndarray]: - """ - Sorts the input array using the bucket sort algorithm. - - Parameters - ---------- - arr : array_like - An array_like object that needs to be sorted. - - Returns - ------- - array_like - A sorted copy of the input array. - - Raises - ------ - ValueError - If the input is not an array_like object. - - Notes - ----- - The bucket sort algorithm works by distributing the elements of an array - into a number of buckets. Each bucket is then sorted individually, either - using a different sorting algorithm, or by recursively applying the bucket - sorting algorithm. - """ - counts = np.zeros(arr.max() + 1, dtype=np.uint32) - for i in range(arr.size): - counts[arr[i]] += 1 - - locs = np.empty(counts.size + 1, dtype=np.uint32) - locs[0] = 0 - pos = np.empty(counts.size, dtype=np.uint32) - for i in range(counts.size): - locs[i + 1] = locs[i] + counts[i] - pos[i] = locs[i] - - args = np.empty(arr.size, dtype=np.uint32) - for i in range(arr.size): - e = arr[i] - args[pos[e]] = i - pos[e] += 1 - - return args, locs - - -# CODE from Styfen Schaer (@styfenschaer) -@nb.njit(parallel=False) -def _crv1_meat_loop( - _Z: np.ndarray, - weighted_uhat: np.ndarray, - clustid: np.ndarray, - cluster_col: np.ndarray, -) -> np.ndarray: - k = _Z.shape[1] - dtype = _Z.dtype - meat = np.zeros((k, k), dtype=dtype) - - g_indices, g_locs = bucket_argsort(cluster_col) - - score_g = np.empty((k, 1), dtype=dtype) - meat_i = np.empty((k, k), dtype=dtype) - - for i in range(clustid.size): - g = clustid[i] - start = g_locs[g] - end = g_locs[g + 1] - g_index = g_indices[start:end] - - Zg = _Z[g_index] - ug = weighted_uhat[g_index] - - np.dot(Zg.T, ug, out=score_g) - np.outer(score_g, score_g, out=meat_i) - meat += meat_i - - return meat - - def _check_vcov_input(vcov: Union[str, dict[str, str]], data: pd.DataFrame): """ Check the input for the vcov argument in the Feols class. 
diff --git a/pyfixest/estimation/vcov_utils.py b/pyfixest/estimation/vcov_utils.py
new file mode 100644
index 00000000..80822703
--- /dev/null
+++ b/pyfixest/estimation/vcov_utils.py
@@ -0,0 +1,143 @@
+import numba as nb
+import numpy as np
+
+from pyfixest.errors import NanInClusterVarError
+from pyfixest.utils.dev_utils import _polars_to_pandas
+
+
+def _compute_bread(_is_iv, _tXZ, _tZZinv, _tZX, _hessian):
+    return np.linalg.inv(_tXZ @ _tZZinv @ _tZX) if _is_iv else np.linalg.inv(_hessian)
+
+
+def _get_cluster_df(data, clustervar):
+    data_pandas = _polars_to_pandas(data)
+    if not data_pandas.empty:
+        cluster_df = data_pandas[clustervar].copy()
+    else:
+        raise AttributeError(
+            """The input data set needs to be stored in the model object if
+            you call `vcov()` post estimation with a novel cluster variable.
+            Please set the function argument `store_data=True` when calling
+            the regression.
+            """
+        )
+
+    return cluster_df
+
+
+def _check_cluster_df(cluster_df, data):
+    if np.any(cluster_df.isna().any()):
+        raise NanInClusterVarError(
+            "CRV inference not supported with missing values in the cluster variable. "
+            "Please drop missing values before running the regression."
+        )
+
+    N = data.shape[0]
+    if cluster_df.shape[0] != N:
+        raise ValueError(
+            "The cluster variable must have the same length as the data set."
+        )
+
+
+def _count_G_for_ssc_correction(cluster_df, ssc_dict):
+    G = []
+    for col in cluster_df.columns:
+        G.append(cluster_df[col].nunique())
+
+    if ssc_dict["cluster_df"] == "min":
+        G = [min(G)] * 3
+
+    return G
+
+
+def _prepare_twoway_clustering(clustervar, cluster_df):
+    cluster_one = clustervar[0]
+    cluster_two = clustervar[1]
+    cluster_df_one_str = cluster_df[cluster_one].astype(str)
+    cluster_df_two_str = cluster_df[cluster_two].astype(str)
+    cluster_df.loc[:, "cluster_intersection"] = cluster_df_one_str.str.cat(
+        cluster_df_two_str, sep="-"
+    )
+
+    return cluster_df
+
+
+# CODE from Styfen Schaer (@styfenschaer)
+@nb.njit(parallel=False)
+def bucket_argsort(arr: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+    """
+    Compute a bucket-sort based argsort of an integer array.
+
+    Parameters
+    ----------
+    arr : np.ndarray
+        Array of non-negative integer codes (e.g., factorized cluster ids).
+
+    Returns
+    -------
+    args : np.ndarray
+        Indices that sort the input array.
+    locs : np.ndarray
+        Bucket boundaries: ``locs[g]:locs[g + 1]`` delimits the positions of
+        value ``g`` within ``args``.
+
+    Notes
+    -----
+    The bucket sort algorithm works by distributing the elements of an
+    array into a number of buckets. Here, each bucket collects the
+    positions of one integer value, so a single linear pass yields both
+    the argsort indices and the start offset of each group, which
+    ``_crv1_meat_loop`` uses to slice cluster blocks without sorting
+    the data itself.
+ """ + counts = np.zeros(arr.max() + 1, dtype=np.uint32) + for i in range(arr.size): + counts[arr[i]] += 1 + + locs = np.empty(counts.size + 1, dtype=np.uint32) + locs[0] = 0 + pos = np.empty(counts.size, dtype=np.uint32) + for i in range(counts.size): + locs[i + 1] = locs[i] + counts[i] + pos[i] = locs[i] + + args = np.empty(arr.size, dtype=np.uint32) + for i in range(arr.size): + e = arr[i] + args[pos[e]] = i + pos[e] += 1 + + return args, locs + + +# CODE from Styfen Schaer (@styfenschaer) +@nb.njit(parallel=False) +def _crv1_meat_loop( + _Z: np.ndarray, + weighted_uhat: np.ndarray, + clustid: np.ndarray, + cluster_col: np.ndarray, +) -> np.ndarray: + k = _Z.shape[1] + dtype = _Z.dtype + meat = np.zeros((k, k), dtype=dtype) + + g_indices, g_locs = bucket_argsort(cluster_col) + + score_g = np.empty((k, 1), dtype=dtype) + meat_i = np.empty((k, k), dtype=dtype) + + for i in range(clustid.size): + g = clustid[i] + start = g_locs[g] + end = g_locs[g + 1] + g_index = g_indices[start:end] + + Zg = _Z[g_index] + ug = weighted_uhat[g_index] + + np.dot(Zg.T, ug, out=score_g) + np.outer(score_g, score_g, out=meat_i) + meat += meat_i + + return meat