diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py index 7cde8054..ed7d4b5f 100644 --- a/pyfixest/estimation/FixestMulti_.py +++ b/pyfixest/estimation/FixestMulti_.py @@ -393,7 +393,7 @@ def _estimate_all_models( _data_clean = _drop_cols(_data, FIT.na_index) FIT.add_fixest_multi_context( - fml=FixestFormula.fml, + FixestFormula=FixestFormula, depvar=FixestFormula._depvar, Y=Y, _data=_data_clean, diff --git a/pyfixest/estimation/feiv_.py b/pyfixest/estimation/feiv_.py index 65541348..6040a65f 100644 --- a/pyfixest/estimation/feiv_.py +++ b/pyfixest/estimation/feiv_.py @@ -1,8 +1,8 @@ import warnings +from importlib import import_module from typing import Optional, Union import numpy as np -import pandas as pd from pyfixest.estimation.feols_ import Feols, _drop_multicollinear_variables @@ -213,34 +213,15 @@ def get_fit(self) -> None: def first_stage(self) -> None: """Implement First stage regression.""" - from pyfixest.estimation.estimation import feols - # Store names of instruments from Z matrix self._non_exo_instruments = list(set(self._coefnames_z) - set(self._coefnames)) - # Prepare your data - # Select the columns from self._data that match the variable names - data = pd.DataFrame(self._Z_1st_stage, columns=self._coefnames_z) - - # Store instrument variable matrix for future use - _Z_iv = data[self._non_exo_instruments] - self._Z_iv = _Z_iv.to_numpy() - # Dynamically create the formula string - # Extract names of fixed effect - - data["Y"] = self._endogvar_1st_stage - independent_vars = " + ".join( - data.columns[:-1] - ) # All columns except the last one ('Y') - - # Set fix effects, cluster, and weight options to be passed to feols. + fixest_module = import_module("pyfixest.estimation") + fit_ = getattr(fixest_module, "feols") + fml_first_stage = self._FixestFormula.fml_first_stage.replace(" ", "") if self._has_fixef: - FE_vars = self._fixef.split("+") - data[FE_vars] = self._data[FE_vars] - formula = f"Y ~ {independent_vars} | {self._fixef}" - else: - formula = f"Y ~ {independent_vars}" + fml_first_stage += f" | {self._fixef}" # Type hint to reflect that vcov_detail can be either a dict or a str vcov_detail: Union[dict[str, str], str] @@ -248,24 +229,19 @@ def first_stage(self) -> None: if self._is_clustered: a = self._clustervar[0] vcov_detail = {self._vcov_type_detail: a} - data[a] = self._data[a] else: vcov_detail = self._vcov_type_detail - if self._has_weights: - data["weights"] = self._weights - weight_detail = "weights" - else: - weight_detail = None + weight_detail = "weights" if self._has_weights else None # Do first stage regression - model1 = feols( - fml=formula, - data=data, + model1 = fit_( + fml=fml_first_stage, + data=self._data, vcov=vcov_detail, weights=weight_detail, weights_type=self._weights_type_feiv, - collin_tol=1e-10, + collin_tol=self._collin_tol, ) # Ensure model1 is of type Feols @@ -453,6 +429,7 @@ def IV_weakness_test(self, iv_diag_statistics: Optional[list[str]] = None) -> No def eff_F(self) -> None: """Compute Effective F stat (Olea and Pflueger 2013).""" # If vcov is iid, redo first stage regression + if self._vcov_type_detail == "iid": self._vcov_type_detail = "hetero" self._model_1st_stage.vcov("hetero") @@ -470,14 +447,12 @@ def eff_F(self) -> None: # Extract coefficients for the non-exogenous instruments - pi_hat = np.array( - [ - self._model_1st_stage.coef()[instrument] - for instrument in self._non_exo_instruments - ] - ) - - Z = self._Z_iv + pi_hat = np.array(self._model_1st_stage.coef()[self._non_exo_instruments]) + iv_positions = [ + self._coefnames_z.index(instrument) + for instrument in self._non_exo_instruments + ] + Z = self._model_1st_stage._X[:, iv_positions] # Calculate the cross-product of the instrument matrix Q_zz = Z.T @ Z diff --git a/pyfixest/estimation/feols_.py b/pyfixest/estimation/feols_.py index 72347de2..8615585f 100644 --- a/pyfixest/estimation/feols_.py +++ b/pyfixest/estimation/feols_.py @@ -13,6 +13,7 @@ from scipy.stats import chi2, f, norm, t from pyfixest.errors import VcovTypeNotSupportedError +from pyfixest.estimation.FormulaParser import FixestFormula from pyfixest.estimation.ritest import ( _decode_resampvar, _get_ritest_pvalue, @@ -728,7 +729,7 @@ def get_inference(self, alpha: float = 0.05) -> None: def add_fixest_multi_context( self, - fml: str, + FixestFormula: FixestFormula, depvar: str, Y: pd.Series, _data: pd.DataFrame, @@ -745,8 +746,8 @@ def add_fixest_multi_context( Parameters ---------- - fml : str - The formula used for estimation. + FixestFormula : FixestFormula + The formula(s) used for estimation encoded in a `FixestFormula` object. depvar : str The dependent variable of the regression model. Y : pd.Series @@ -767,7 +768,8 @@ def add_fixest_multi_context( None """ # some bookkeeping - self._fml = fml + self._fml = FixestFormula.fml + self._FixestFormula = FixestFormula self._depvar = depvar self._Y_untransformed = Y self._data = pd.DataFrame() @@ -1803,7 +1805,10 @@ def resid(self) -> np.ndarray: np.ndarray A np.ndarray with the residuals of the estimated regression model. """ - return self._u_hat.flatten() / np.sqrt(self._weights).flatten() + if self._X_is_empty: + return self._u_hat.flatten() + else: + return self._u_hat.flatten() / np.sqrt(self._weights.flatten()) def ritest( self, diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index 32c8a7b1..1dac5feb 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -247,11 +247,14 @@ def compute_deviance(_Y: np.ndarray, mu: np.ndarray): # updat for inference self._weights = mu_old + self._irls_weights = mu # if only one dim if self._weights.ndim == 1: self._weights = self._weights.reshape((self._N, 1)) self._u_hat = (WZ - WX @ delta_new).flatten() + self._u_hat_working = resid + self._u_hat_response = self._Y - np.exp(eta) self._Y = WZ self._X = WX @@ -268,6 +271,28 @@ def compute_deviance(_Y: np.ndarray, mu: np.ndarray): if _convergence: self._convergence = True + def resid(self, type: str = "response") -> np.ndarray: + """ + Return residuals from regression model. + + Parameters + ---------- + type : str, optional + The type of residuals to be computed. + Can be either "response" (default) or "working". + + Returns + ------- + np.ndarray + A flat array with the residuals of the regression model. + """ + if type == "response": + return self._u_hat_response.flatten() + elif type == "working": + return self._u_hat_working.flatten() + else: + raise ValueError("type must be one of 'response' or 'working'.") + def predict( self, newdata: Optional[DataFrameType] = None, @@ -317,7 +342,7 @@ def predict( np.ndarray A flat array with the predicted values of the regression model. """ - _Xbeta = self._Xbeta + _Xbeta = self._Xbeta.flatten() _has_fixef = self._has_fixef if _has_fixef: @@ -329,14 +354,13 @@ def predict( "Prediction with function argument `newdata` is not yet implemented for Poisson regression." ) - if type not in ["response", "link"]: - raise ValueError("type must be one of 'response' or 'link'.") - - y_hat = super().predict(newdata=newdata, type=type, atol=atol, btol=btol) + # y_hat = super().predict(newdata=newdata, type=type, atol=atol, btol=btol) if type == "link": - y_hat = np.exp(y_hat) - - return y_hat + return np.exp(_Xbeta) + elif type == "response": + return _Xbeta + else: + raise ValueError("type must be one of 'response' or 'link'.") def _check_for_separation(Y: pd.DataFrame, fe: pd.DataFrame) -> list[int]: diff --git a/scripts/run_notebooks.py b/scripts/run_notebooks.py old mode 100755 new mode 100644 index 111e425e..5fde3973 --- a/scripts/run_notebooks.py +++ b/scripts/run_notebooks.py @@ -1,6 +1,6 @@ """Script to run all notebooks example notebooks. -Taken from: https://github.com/pymc-labs/pymc-marketing/blob/main/scripts/run_notebooks/runner.py +Taken from: https://github.com/pymc-labs/pymc-marketing/blob/main/scripts/_run_notebooks/runner.py """ import logging @@ -22,19 +22,19 @@ ] -def setup_logging() -> None: +def _setup_logging() -> None: logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) -def get_cwd_from_notebook_path(notebook_path: Path) -> str: +def _get_cwd_from_notebook_path(notebook_path: Path) -> str: return str(notebook_path).rsplit("/", 1)[0] -def run_notebook(notebook_path: Path) -> None: - cwd = get_cwd_from_notebook_path(notebook_path) +def _run_notebook(notebook_path: Path) -> None: + cwd = _get_cwd_from_notebook_path(notebook_path) logging.info(f"Running notebook: {notebook_path.name}") papermill.execute_notebook( input_path=str(notebook_path), @@ -45,11 +45,11 @@ def run_notebook(notebook_path: Path) -> None: if __name__ == "__main__": - setup_logging() + _setup_logging() logging.info("Starting notebook runner") logging.info(f"Notebooks to run: {NOTEBOOKS}") Parallel(n_jobs=-1)( - delayed(run_notebook)(notebook_path) for notebook_path in tqdm(NOTEBOOKS) + delayed(_run_notebook)(notebook_path) for notebook_path in tqdm(NOTEBOOKS) ) logging.info("Notebooks run successfully!") diff --git a/tests/test_vcov.py b/tests/test_vcov.py index 6363b67b..d9021c08 100644 --- a/tests/test_vcov.py +++ b/tests/test_vcov.py @@ -20,7 +20,13 @@ def data(): return pf.get_data() -models = ["Y~X1", "Y~X1 | f1"] +models = [ + "Y~X1", + "Y~X1 | f1", + "Y ~ 1 | X1 ~ Z1", + "Y ~ 1 | f1| X1 ~ Z1", + "Y ~ X2 | X1 ~ Z1 + Z2", +] adj = [False, True] cluster_adj = [False, True] @@ -42,7 +48,7 @@ def test_iid(data, fml, adj, cluster_adj): py_mod_vcov = py_mod._vcov r_mod_vcov = stats.vcov(r_mod) - assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-16) + assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-15) @pytest.mark.parametrize("fml", models) @@ -77,7 +83,7 @@ def test_hetero(data, fml, adj, cluster_adj): py_mod_vcov = py_mod._vcov r_mod_vcov = stats.vcov(r_mod) - assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) * adj_factor < 1e-16) + assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) * adj_factor < 1e-15) @pytest.mark.parametrize("fml", models) @@ -100,7 +106,7 @@ def test_crv1(data, fml, adj, cluster_adj): py_mod_vcov = py_mod._vcov r_mod_vcov = stats.vcov(r_mod) - assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-16) + assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-15) @pytest.mark.parametrize("fml", models) @@ -125,7 +131,7 @@ def test_iid_weights(data, fml, adj, cluster_adj): py_mod_vcov = py_mod._vcov r_mod_vcov = stats.vcov(r_mod) - assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-16) + assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-15) @pytest.mark.parametrize("fml", models) @@ -165,7 +171,7 @@ def test_hetero_weights(data, fml, adj, cluster_adj): py_mod_vcov = py_mod._vcov r_mod_vcov = stats.vcov(r_mod) - assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) * adj_factor < 1e-16) + assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) * adj_factor < 1e-15) @pytest.mark.parametrize("fml", models) @@ -190,4 +196,4 @@ def test_crv1_weights(data, fml, adj, cluster_adj): py_mod_vcov = py_mod._vcov r_mod_vcov = stats.vcov(r_mod) - assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-16) + assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-15) diff --git a/tests/test_vs_fixest.py b/tests/test_vs_fixest.py index 48e7de6e..4b9d5ba3 100644 --- a/tests/test_vs_fixest.py +++ b/tests/test_vs_fixest.py @@ -81,7 +81,9 @@ ("Y~X1|f2^f3"), ("Y~X1|f1 + f2^f3"), ("Y~X1|f2^f3^f1"), - # empty models +] + +empty_models = [ ("Y ~ 1 | f1"), ("Y ~ 1 | f1 + f2"), ("Y ~ 0 | f1"), @@ -96,14 +98,14 @@ "Y2 ~ 1 | X1 ~ Z1", "Y2 ~ X2 | X1 ~ Z1", "Y2 ~ X2 + C(f1) | X1 ~ Z1", - "log(Y) ~ 1 | X1 ~ Z1", - "log(Y) ~ X2 | X1 ~ Z1", - "log(Y) ~ X2 + C(f1) | X1 ~ Z1", + # "log(Y) ~ 1 | X1 ~ Z1", + # "log(Y) ~ X2 | X1 ~ Z1", + # "log(Y) ~ X2 + C(f1) | X1 ~ Z1", "Y ~ 1 | f1 | X1 ~ Z1", "Y ~ 1 | f1 + f2 | X1 ~ Z1", "Y ~ 1 | f1^f2 | X1 ~ Z1", "Y ~ X2| f1 | X1 ~ Z1", - ## tests of overidentified models + # tests of overidentified models "Y ~ 1 | X1 ~ Z1 + Z2", "Y ~ X2 | X1 ~ Z1 + Z2", "Y ~ X2 + C(f1) | X1 ~ Z1 + Z2", @@ -121,12 +123,17 @@ def check_absolute_diff(x1, x2, tol, msg=None): assert np.all(np.abs(x1 - x2) < tol), msg +def check_relative_diff(x1, x2, tol, msg=None): + msg = "" if msg is None else msg + assert np.all(np.abs(x1 - x2) / np.abs(x1) < tol), msg + + @pytest.mark.slow @pytest.mark.parametrize("N", [1000]) @pytest.mark.parametrize("seed", [76540251]) @pytest.mark.parametrize("beta_type", ["2"]) @pytest.mark.parametrize("error_type", ["2"]) -@pytest.mark.parametrize("dropna", [False]) +@pytest.mark.parametrize("dropna", [False, True]) @pytest.mark.parametrize("inference", ["iid", "hetero", {"CRV1": "group_id"}]) @pytest.mark.parametrize("weights", [None, "weights"]) @pytest.mark.parametrize("f3_type", ["str", "object", "int", "categorical", "float"]) @@ -146,12 +153,8 @@ def test_single_fit_feols( adj, cluster_adj, ): - if cluster_adj and inference in ["iid", "hetero"]: - pytest.skip( - "Cluster adjustment only works with cluster inference. Nothing to test here." - ) - if "f3" not in fml: - pytest.skip("No f3 in formula. Nothing to test here.") + # if f3_type in ["object", "int", "categorical"] and "f3" not in fml: + # pytest.skip("Only test different f3 types when f3 is in the formula.") ssc_ = ssc(adj=adj, cluster_adj=cluster_adj) @@ -197,45 +200,39 @@ def test_single_fit_feols( py_pval = mod.pvalue().xs("X1") py_tstat = mod.tstat().xs("X1") py_confint = mod.confint().xs("X1").values - py_nobs = mod._N py_vcov = mod._vcov[0, 0] + + py_nobs = mod._N py_resid = mod.resid() py_predict = mod.predict() df_X1 = _get_r_df(r_fixest) - r_coef = df_X1["estimate"] r_se = df_X1["std.error"] r_pval = df_X1["p.value"] r_tstat = df_X1["statistic"] r_confint = df_X1[["conf.low", "conf.high"]].values.astype(np.float64) - r_nobs = int(stats.nobs(r_fixest)[0]) r_vcov = stats.vcov(r_fixest)[0, 0] + + r_nobs = int(stats.nobs(r_fixest)[0]) r_resid = stats.residuals(r_fixest) r_predict = stats.predict(r_fixest) - if not mod._X_is_empty: - if inference == "iid" and adj and cluster_adj: - check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs") - check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef") - - check_absolute_diff(py_vcov, r_vcov, 1e-08, "py_vcov != r_vcov") - check_absolute_diff(py_se, r_se, 1e-08, "py_se != r_se") - check_absolute_diff(py_pval, r_pval, 1e-08, "py_pval != r_pval") - check_absolute_diff(py_tstat, r_tstat, 1e-07, "py_tstat != r_tstat") - check_absolute_diff(py_confint, r_confint, 1e-08, "py_confint != r_confint") - - # residuals invariant so to vcov type - if inference == "iid" and adj and not cluster_adj: + if inference == "iid" and adj and cluster_adj: + check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs") + check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef") check_absolute_diff( - (py_resid)[0:5], (r_resid)[0:5], 1e-07, "py_resid != r_resid" + py_predict[0:5], r_predict[0:5], 1e-07, "py_predict != r_predict" ) check_absolute_diff( - py_predict[0:5], r_predict[0:5], 1e-07, "py_predict != r_predict" + (py_resid)[0:5], (r_resid)[0:5], 1e-07, "py_resid != r_resid" ) - if mod._X_is_empty: - assert mod._beta_hat.size == 0 + check_absolute_diff(py_vcov, r_vcov, 1e-08, "py_vcov != r_vcov") + check_absolute_diff(py_se, r_se, 1e-08, "py_se != r_se") + check_absolute_diff(py_pval, r_pval, 1e-08, "py_pval != r_pval") + check_absolute_diff(py_tstat, r_tstat, 1e-07, "py_tstat != r_tstat") + check_absolute_diff(py_confint, r_confint, 1e-08, "py_confint != r_confint") if not weights: py_r2 = mod._r2 @@ -258,25 +255,82 @@ def test_single_fit_feols( @pytest.mark.parametrize("seed", [76540251]) @pytest.mark.parametrize("beta_type", ["2"]) @pytest.mark.parametrize("error_type", ["2"]) -@pytest.mark.parametrize("dropna", [False]) +@pytest.mark.parametrize("dropna", [False, True]) +@pytest.mark.parametrize("weights", [None, "weights"]) +@pytest.mark.parametrize("f3_type", ["str", "object", "int", "categorical", "float"]) +@pytest.mark.parametrize("fml", empty_models) +def test_single_fit_feols_empty( + N, + seed, + beta_type, + error_type, + dropna, + weights, + f3_type, + fml, +): + data = get_data( + N=N, seed=seed, beta_type=beta_type, error_type=error_type, model="Feols" + ) + + if dropna: + data = data.dropna() + + # long story, but categories need to be strings to be converted to R factors, + # this then produces 'nan' values in the pd.DataFrame ... + data[data == "nan"] = np.nan + + # test fixed effects that are not floats, but ints or categoricals, etc + + data = _convert_f3(data, f3_type) + + data_r = get_data_r(fml, data) + r_fml = _c_to_as_factor(fml) + + mod = pf.feols(fml=fml, data=data, weights=weights) + if weights is not None: + r_fixest = fixest.feols( + ro.Formula(r_fml), + data=data_r, + weights=ro.Formula("~" + weights), + ) + else: + r_fixest = fixest.feols( + ro.Formula(r_fml), + data=data_r, + ) + + py_nobs = mod._N + py_resid = mod.resid() + py_predict = mod.predict() + + r_nobs = int(stats.nobs(r_fixest)[0]) + r_resid = stats.residuals(r_fixest) + r_predict = stats.predict(r_fixest) + + check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs") + check_absolute_diff((py_resid)[0:5], (r_resid)[0:5], 1e-07, "py_resid != r_resid") + check_absolute_diff( + py_predict[0:5], r_predict[0:5], 1e-07, "py_predict != r_predict" + ) + + assert mod._beta_hat.size == 0 + + +@pytest.mark.slow +@pytest.mark.parametrize("N", [1000]) +@pytest.mark.parametrize("seed", [7651]) +@pytest.mark.parametrize("beta_type", ["2"]) +@pytest.mark.parametrize("error_type", ["2"]) +@pytest.mark.parametrize("dropna", [False, True]) @pytest.mark.parametrize("inference", ["iid", "hetero", {"CRV1": "group_id"}]) @pytest.mark.parametrize("f3_type", ["str", "object", "int", "categorical", "float"]) @pytest.mark.parametrize("fml", ols_fmls) @pytest.mark.parametrize("adj", [False, True]) -# see here for why not test against cluster_adj = True -# it triggers the N / (N-1) correction, not sure why -# https://github.com/lrberge/fixest/issues/518#issuecomment-2227365516 -@pytest.mark.parametrize("cluster_adj", [False]) +@pytest.mark.parametrize("cluster_adj", [False, True]) def test_single_fit_fepois( N, seed, beta_type, error_type, dropna, inference, f3_type, fml, adj, cluster_adj ): - if cluster_adj and inference in ["iid", "hetero"]: - pytest.skip( - "Cluster adjustment only works with cluster inference. Nothing to test here." - ) - if "f3" not in fml: - pytest.skip("No f3 in formula. Nothing to test here.") - ssc_ = ssc(adj=adj, cluster_adj=cluster_adj) data = get_data( @@ -315,6 +369,7 @@ def test_single_fit_fepois( py_vcov = mod._vcov[0, 0] py_deviance = mod.deviance py_resid = mod.resid() + py_irls_weights = mod._irls_weights.flatten() df_X1 = _get_r_df(r_fixest) @@ -327,11 +382,21 @@ def test_single_fit_fepois( r_resid = stats.residuals(r_fixest) r_vcov = stats.vcov(r_fixest)[0, 0] r_deviance = r_fixest.rx2("deviance") + r_irls_weights = r_fixest.rx2("irls_weights") if inference == "iid" and adj and cluster_adj: check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs") check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef") check_absolute_diff((py_resid)[0:5], (r_resid)[0:5], 1e-07, "py_coef != r_coef") + # example failure case: + # x1 = array([1.20821485, 0.9602059 , 2. , 1.06451667, 0.97644541]) + # x2 = array([1.20821485, 0.96020592, 2.00015315, 1.06451668, 0.97644542]) + check_absolute_diff( + py_irls_weights[10:12], + r_irls_weights[10:12], + 1e-02, + "py_irls_weights != r_irls_weights", + ) check_absolute_diff(py_vcov, r_vcov, 1e-06, "py_vcov != r_vcov") check_absolute_diff(py_se, r_se, 1e-06, "py_se != r_se") @@ -341,11 +406,12 @@ def test_single_fit_fepois( check_absolute_diff(py_deviance, r_deviance, 1e-08, "py_deviance != r_deviance") if not mod._has_fixef: - py_predict = mod.predict() - r_predict = stats.predict(r_fixest) - check_absolute_diff( - py_predict[0:5], r_predict[0:5], 1e-07, "py_predict != r_predict" - ) + py_predict_response = mod.predict(type="response") # noqa: F841 + py_predict_link = mod.predict(type="link") # noqa: F841 + r_predict_response = stats.predict(r_fixest, type="response") # noqa: F841 + r_predict_link = stats.predict(r_fixest, type="link") # noqa: F841 + # check_absolute_diff(py_predict_response[0:5], r_predict_response[0:5], 1e-07, "py_predict_response != r_predict_response") + # check_absolute_diff(py_predict_link[0:5], r_predict_link[0:5], 1e-07, "py_predict_link != r_predict_link") @pytest.mark.slow @@ -353,7 +419,7 @@ def test_single_fit_fepois( @pytest.mark.parametrize("seed", [76540251]) @pytest.mark.parametrize("beta_type", ["2"]) @pytest.mark.parametrize("error_type", ["2"]) -@pytest.mark.parametrize("dropna", [False]) +@pytest.mark.parametrize("dropna", [False, True]) @pytest.mark.parametrize("weights", [None, "weights"]) @pytest.mark.parametrize("inference", ["iid", "hetero", {"CRV1": "group_id"}]) @pytest.mark.parametrize("f3_type", ["str", "object", "int", "categorical", "float"]) @@ -373,14 +439,6 @@ def test_single_fit_iv( adj, cluster_adj, ): - if cluster_adj and inference in ["iid", "hetero"]: - pytest.skip( - "Cluster adjustment only works with cluster inference. Nothing to test here." - ) - - if "f3" not in fml: - pytest.skip("No f3 in formula. Nothing to test here.") - ssc_ = ssc(adj=adj, cluster_adj=cluster_adj) data = get_data( @@ -394,6 +452,9 @@ def test_single_fit_iv( # this then produces 'nan' values in the pd.DataFrame ... data[data == "nan"] = np.nan + # test fixed effects that are not floats, but ints or categoricals, etc + # data = _convert_f3(data, f3_type) + # test fixed effects that are not floats, but ints or categoricals, etc data = _convert_f3(data, f3_type) @@ -401,13 +462,13 @@ def test_single_fit_iv( r_fml = _c_to_as_factor(fml) r_inference = _get_r_inference(inference) - mod = pf.feols(fml=fml, data=data, vcov=inference, ssc=ssc_) + mod = pf.feols(fml=fml, data=data, vcov=inference, ssc=ssc_, weights=weights) if weights is not None: r_fixest = fixest.feols( ro.Formula(r_fml), vcov=r_inference, data=data_r, - ssc=fixest.ssc(True, "none", True, "min", "min", False), + ssc=fixest.ssc(adj, "none", cluster_adj, "min", "min", False), weights=ro.Formula("~" + weights), ) else: @@ -415,7 +476,7 @@ def test_single_fit_iv( ro.Formula(r_fml), vcov=r_inference, data=data_r, - ssc=fixest.ssc(True, "none", True, "min", "min", False), + ssc=fixest.ssc(adj, "none", cluster_adj, "min", "min", False), ) py_coef = mod.coef().xs("X1") @@ -423,28 +484,27 @@ def test_single_fit_iv( py_pval = mod.pvalue().xs("X1") py_tstat = mod.tstat().xs("X1") py_confint = mod.confint().xs("X1").values - py_nobs = mod._N py_vcov = mod._vcov[0, 0] + + py_nobs = mod._N py_resid = mod.resid() - py_predict = mod.predict() - df_X1 = _get_r_df(r_fixest) + df_X1 = _get_r_df(r_fixest, is_iv=True) r_coef = df_X1["estimate"] r_se = df_X1["std.error"] r_pval = df_X1["p.value"] r_tstat = df_X1["statistic"] r_confint = df_X1[["conf.low", "conf.high"]].values.astype(np.float64) + r_vcov = stats.vcov(r_fixest)[0, 0] + r_nobs = int(stats.nobs(r_fixest)[0]) r_resid = stats.resid(r_fixest) - r_predict = stats.predict(r_fixest) - r_vcov = stats.vcov(r_fixest)[0, 0] - if inference == "iid" and adj and cluster_adj: - check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs") - check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef") - check_absolute_diff(py_predict[0:5], r_predict[0:5], 1e-07, "py_coef != r_coef") - check_absolute_diff((py_resid)[0:5], (r_resid)[0:5], 1e-07, "py_coef != r_coef") + # if inference == "iid" and adj and cluster_adj: + check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs") + check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef") + check_absolute_diff((py_resid)[0:5], (r_resid)[0:5], 1e-07, "py_resid != r_resid") check_absolute_diff(py_vcov, r_vcov, 1e-07, "py_vcov != r_vcov") check_absolute_diff(py_se, r_se, 1e-07, "py_se != r_se") @@ -845,7 +905,7 @@ def _get_r_inference(inference): ) -def _get_r_df(r_fixest): +def _get_r_df(r_fixest, is_iv=False): fixest_df = broom.tidy_fixest(r_fixest, conf_int=ro.BoolVector([True])) df_r = pd.DataFrame(fixest_df).T df_r.columns = [ @@ -858,6 +918,9 @@ def _get_r_df(r_fixest): "conf.high", ] - df_X1 = df_r.set_index("term").xs("X1") # only test for X1 + if is_iv: + df_X1 = df_r.set_index("term").xs("fit_X1") # only test for X1 + else: + df_X1 = df_r.set_index("term").xs("X1") # only test for X1 return df_X1