py-econometrics · s3alfisc · Sep 7, 2024 · Sep 1, 2024 · Sep 1, 2024 · Sep 1, 2024
diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py
@@ -393,7 +393,7 @@ def _estimate_all_models(
                 _data_clean = _drop_cols(_data, FIT.na_index)
 
                 FIT.add_fixest_multi_context(
-                    fml=FixestFormula.fml,
+                    FixestFormula=FixestFormula,
                     depvar=FixestFormula._depvar,
                     Y=Y,
                     _data=_data_clean,

diff --git a/pyfixest/estimation/feiv_.py b/pyfixest/estimation/feiv_.py
@@ -1,8 +1,8 @@
 import warnings
+from importlib import import_module
 from typing import Optional, Union
 
 import numpy as np
-import pandas as pd
 
 from pyfixest.estimation.feols_ import Feols, _drop_multicollinear_variables
 
@@ -213,59 +213,35 @@ def get_fit(self) -> None:
 
     def first_stage(self) -> None:
         """Implement First stage regression."""
-        from pyfixest.estimation.estimation import feols
-
         # Store names of instruments from Z matrix
         self._non_exo_instruments = list(set(self._coefnames_z) - set(self._coefnames))
 
-        # Prepare your data
-        # Select the columns from self._data that match the variable names
-        data = pd.DataFrame(self._Z_1st_stage, columns=self._coefnames_z)
-
-        # Store instrument variable matrix for future use
-        _Z_iv = data[self._non_exo_instruments]
-        self._Z_iv = _Z_iv.to_numpy()
-        # Dynamically create the formula string
-        # Extract names of fixed effect
-
-        data["Y"] = self._endogvar_1st_stage
-        independent_vars = " + ".join(
-            data.columns[:-1]
-        )  # All columns except the last one ('Y')
-
-        # Set fix effects, cluster, and weight options to be passed to feols.
+        fixest_module = import_module("pyfixest.estimation")
+        fit_ = getattr(fixest_module, "feols")
 
+        fml_first_stage = self._FixestFormula.fml_first_stage.replace(" ", "")
         if self._has_fixef:
-            FE_vars = self._fixef.split("+")
-            data[FE_vars] = self._data[FE_vars]
-            formula = f"Y ~ {independent_vars} | {self._fixef}"
-        else:
-            formula = f"Y ~ {independent_vars}"
+            fml_first_stage += f" | {self._fixef}"
 
         # Type hint to reflect that vcov_detail can be either a dict or a str
         vcov_detail: Union[dict[str, str], str]
 
         if self._is_clustered:
             a = self._clustervar[0]
             vcov_detail = {self._vcov_type_detail: a}
-            data[a] = self._data[a]
         else:
             vcov_detail = self._vcov_type_detail
 
-        if self._has_weights:
-            data["weights"] = self._weights
-            weight_detail = "weights"
-        else:
-            weight_detail = None
+        weight_detail = "weights" if self._has_weights else None
 
         # Do first stage regression
-        model1 = feols(
-            fml=formula,
-            data=data,
+        model1 = fit_(
+            fml=fml_first_stage,
+            data=self._data,
             vcov=vcov_detail,
             weights=weight_detail,
             weights_type=self._weights_type_feiv,
-            collin_tol=1e-10,
+            collin_tol=self._collin_tol,
         )
 
         # Ensure model1 is of type Feols
@@ -453,6 +429,7 @@ def IV_weakness_test(self, iv_diag_statistics: Optional[list[str]] = None) -> No
     def eff_F(self) -> None:
         """Compute Effective F stat (Olea and Pflueger 2013)."""
         # If vcov is iid, redo first stage regression
+
         if self._vcov_type_detail == "iid":
             self._vcov_type_detail = "hetero"
             self._model_1st_stage.vcov("hetero")
@@ -470,14 +447,12 @@ def eff_F(self) -> None:
 
         # Extract coefficients for the non-exogenous instruments
 
-        pi_hat = np.array(
-            [
-                self._model_1st_stage.coef()[instrument]
-                for instrument in self._non_exo_instruments
-            ]
-        )
-
-        Z = self._Z_iv
+        pi_hat = np.array(self._model_1st_stage.coef()[self._non_exo_instruments])
+        iv_positions = [
+            self._coefnames_z.index(instrument)
+            for instrument in self._non_exo_instruments
+        ]
+        Z = self._model_1st_stage._X[:, iv_positions]
 
         # Calculate the cross-product of the instrument matrix
         Q_zz = Z.T @ Z

diff --git a/pyfixest/estimation/feols_.py b/pyfixest/estimation/feols_.py
@@ -13,6 +13,7 @@
 from scipy.stats import chi2, f, norm, t
 
 from pyfixest.errors import VcovTypeNotSupportedError
+from pyfixest.estimation.FormulaParser import FixestFormula
 from pyfixest.estimation.ritest import (
     _decode_resampvar,
     _get_ritest_pvalue,
@@ -728,7 +729,7 @@ def get_inference(self, alpha: float = 0.05) -> None:
 
     def add_fixest_multi_context(
         self,
-        fml: str,
+        FixestFormula: FixestFormula,
         depvar: str,
         Y: pd.Series,
         _data: pd.DataFrame,
@@ -745,8 +746,8 @@ def add_fixest_multi_context(
 
         Parameters
         ----------
-        fml : str
-            The formula used for estimation.
+        FixestFormula : FixestFormula
+            The formula(s) used for estimation encoded in a `FixestFormula` object.
         depvar : str
             The dependent variable of the regression model.
         Y : pd.Series
@@ -767,7 +768,8 @@ def add_fixest_multi_context(
         None
         """
         # some bookkeeping
-        self._fml = fml
+        self._fml = FixestFormula.fml
+        self._FixestFormula = FixestFormula
         self._depvar = depvar
         self._Y_untransformed = Y
         self._data = pd.DataFrame()
@@ -1803,7 +1805,10 @@ def resid(self) -> np.ndarray:
         np.ndarray
             A np.ndarray with the residuals of the estimated regression model.
         """
-        return self._u_hat.flatten() / np.sqrt(self._weights).flatten()
+        if self._X_is_empty:
+            return self._u_hat.flatten()
+        else:
+            return self._u_hat.flatten() / np.sqrt(self._weights.flatten())
 
     def ritest(
         self,

diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py
@@ -247,11 +247,14 @@ def compute_deviance(_Y: np.ndarray, mu: np.ndarray):
 
         # updat for inference
         self._weights = mu_old
+        self._irls_weights = mu
         # if only one dim
         if self._weights.ndim == 1:
             self._weights = self._weights.reshape((self._N, 1))
 
         self._u_hat = (WZ - WX @ delta_new).flatten()
+        self._u_hat_working = resid
+        self._u_hat_response = self._Y - np.exp(eta)
 
         self._Y = WZ
         self._X = WX
@@ -268,6 +271,28 @@ def compute_deviance(_Y: np.ndarray, mu: np.ndarray):
         if _convergence:
             self._convergence = True
 
+    def resid(self, type: str = "response") -> np.ndarray:
+        """
+        Return residuals from regression model.
+
+        Parameters
+        ----------
+        type : str, optional
+            The type of residuals to be computed.
+            Can be either "response" (default) or "working".
+
+        Returns
+        -------
+        np.ndarray
+            A flat array with the residuals of the regression model.
+        """
+        if type == "response":
+            return self._u_hat_response.flatten()
+        elif type == "working":
+            return self._u_hat_working.flatten()
+        else:
+            raise ValueError("type must be one of 'response' or 'working'.")
+
     def predict(
         self,
         newdata: Optional[DataFrameType] = None,
@@ -317,7 +342,7 @@ def predict(
         np.ndarray
             A flat array with the predicted values of the regression model.
         """
-        _Xbeta = self._Xbeta
+        _Xbeta = self._Xbeta.flatten()
         _has_fixef = self._has_fixef
 
         if _has_fixef:
@@ -329,14 +354,13 @@ def predict(
                 "Prediction with function argument `newdata` is not yet implemented for Poisson regression."
             )
 
-        if type not in ["response", "link"]:
-            raise ValueError("type must be one of 'response' or 'link'.")
-
-        y_hat = super().predict(newdata=newdata, type=type, atol=atol, btol=btol)
+        # y_hat = super().predict(newdata=newdata, type=type, atol=atol, btol=btol)
         if type == "link":
-            y_hat = np.exp(y_hat)
-
-        return y_hat
+            return np.exp(_Xbeta)
+        elif type == "response":
+            return _Xbeta
+        else:
+            raise ValueError("type must be one of 'response' or 'link'.")
 
 
 def _check_for_separation(Y: pd.DataFrame, fe: pd.DataFrame) -> list[int]:

diff --git a/scripts/run_notebooks.py b/scripts/run_notebooks.py
@@ -1,6 +1,6 @@
 """Script to run all notebooks example notebooks.
 
-Taken from: https://github.com/pymc-labs/pymc-marketing/blob/main/scripts/run_notebooks/runner.py
+Taken from: https://github.com/pymc-labs/pymc-marketing/blob/main/scripts/_run_notebooks/runner.py
 """
 
 import logging
@@ -22,19 +22,19 @@
 ]
 
 
-def setup_logging() -> None:
+def _setup_logging() -> None:
     logging.basicConfig(
         level=logging.INFO,
         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
     )
 
 
-def get_cwd_from_notebook_path(notebook_path: Path) -> str:
+def _get_cwd_from_notebook_path(notebook_path: Path) -> str:
     return str(notebook_path).rsplit("/", 1)[0]
 
 
-def run_notebook(notebook_path: Path) -> None:
-    cwd = get_cwd_from_notebook_path(notebook_path)
+def _run_notebook(notebook_path: Path) -> None:
+    cwd = _get_cwd_from_notebook_path(notebook_path)
     logging.info(f"Running notebook: {notebook_path.name}")
     papermill.execute_notebook(
         input_path=str(notebook_path),
@@ -45,11 +45,11 @@ def run_notebook(notebook_path: Path) -> None:
 
 
 if __name__ == "__main__":
-    setup_logging()
+    _setup_logging()
     logging.info("Starting notebook runner")
     logging.info(f"Notebooks to run: {NOTEBOOKS}")
     Parallel(n_jobs=-1)(
-        delayed(run_notebook)(notebook_path) for notebook_path in tqdm(NOTEBOOKS)
+        delayed(_run_notebook)(notebook_path) for notebook_path in tqdm(NOTEBOOKS)
     )
 
     logging.info("Notebooks run successfully!")
diff --git a/tests/test_vcov.py b/tests/test_vcov.py
@@ -20,7 +20,13 @@ def data():
     return pf.get_data()
 
 
-models = ["Y~X1", "Y~X1 | f1"]
+models = [
+    "Y~X1",
+    "Y~X1 | f1",
+    "Y ~ 1 | X1 ~ Z1",
+    "Y ~ 1 | f1| X1 ~ Z1",
+    "Y ~ X2 | X1 ~ Z1 + Z2",
+]
 adj = [False, True]
 cluster_adj = [False, True]
 
@@ -42,7 +48,7 @@ def test_iid(data, fml, adj, cluster_adj):
     py_mod_vcov = py_mod._vcov
     r_mod_vcov = stats.vcov(r_mod)
 
-    assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-16)
+    assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-15)
 
 
 @pytest.mark.parametrize("fml", models)
@@ -77,7 +83,7 @@ def test_hetero(data, fml, adj, cluster_adj):
     py_mod_vcov = py_mod._vcov
     r_mod_vcov = stats.vcov(r_mod)
 
-    assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) * adj_factor < 1e-16)
+    assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) * adj_factor < 1e-15)
 
 
 @pytest.mark.parametrize("fml", models)
@@ -100,7 +106,7 @@ def test_crv1(data, fml, adj, cluster_adj):
     py_mod_vcov = py_mod._vcov
     r_mod_vcov = stats.vcov(r_mod)
 
-    assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-16)
+    assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-15)
 
 
 @pytest.mark.parametrize("fml", models)
@@ -125,7 +131,7 @@ def test_iid_weights(data, fml, adj, cluster_adj):
     py_mod_vcov = py_mod._vcov
     r_mod_vcov = stats.vcov(r_mod)
 
-    assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-16)
+    assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-15)
 
 
 @pytest.mark.parametrize("fml", models)
@@ -165,7 +171,7 @@ def test_hetero_weights(data, fml, adj, cluster_adj):
     py_mod_vcov = py_mod._vcov
     r_mod_vcov = stats.vcov(r_mod)
 
-    assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) * adj_factor < 1e-16)
+    assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) * adj_factor < 1e-15)
 
 
 @pytest.mark.parametrize("fml", models)
@@ -190,4 +196,4 @@ def test_crv1_weights(data, fml, adj, cluster_adj):
     py_mod_vcov = py_mod._vcov
     r_mod_vcov = stats.vcov(r_mod)
 
-    assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-16)
+    assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-15)