py-econometrics · leostimpfle · Oct 26, 2024 · Oct 28, 2024 · Nov 10, 2024 · Nov 10, 2024
diff --git a/docs/changelog.qmd b/docs/changelog.qmd
@@ -2,6 +2,9 @@
 
 ## PyFixest 0.28.0 (In Development, can be installed from github)
 
+- Add support for the `newdata` argument for `Fepois.predict`.
+- Changes `Fepois.predict()` defaults `type` argument to `response`, to match `fixest` defaults.
+- Updates the `Fepois._Y_hat_response` attribute, which would previously return `Fepois._Y_hat_link`.
 - Fix a bug that caused reindexing of `LPDID._coeftable` when calling `LPDID.iplot()`. As a result, a second call of `LPDID.iplot()` would fail.
 
 ## PyFixest 0.27.0

diff --git a/pyfixest/estimation/feols_.py b/pyfixest/estimation/feols_.py
@@ -3,7 +3,7 @@
 import re
 import warnings
 from importlib import import_module
-from typing import Literal, Optional, Union
+from typing import Any, Literal, Optional, Union
 
 import numba as nb
 import numpy as np
@@ -1766,7 +1766,7 @@ def predict(
         newdata: Optional[DataFrameType] = None,
         atol: float = 1e-6,
         btol: float = 1e-6,
-        type: PredictionType = "link",
+        type: PredictionType = "response",
     ) -> np.ndarray:
         """
         Predict values of the model on new data.
@@ -1775,6 +1775,11 @@ def predict(
         If new fixed effect levels are introduced in `newdata`, predicted values
         for such observations will be set to NaN.
 
+        The method always returns predictions for the "link" function; for linear
+        models, this is identical to the "response" function. Transformations to
+        "response" functions for models where this is not the case - GLMs -
+        this happens in the dedicated predict method of the respective GLM class.
+
         Parameters
         ----------
         newdata : Optional[DataFrameType], optional
@@ -1810,10 +1815,7 @@ def predict(
         _validate_literal_argument(type, PredictionType)
 
         if newdata is None:
-            if type == "link" or self._method == "feols":
-                return self._Y_hat_link
-            else:
-                return self._Y_hat_response
+            return self._Y_hat_link
 
         newdata = _narwhals_to_pandas(newdata).reset_index(drop=False)
 
@@ -1835,16 +1837,14 @@ def predict(
 
         if self._has_fixef:
             if self._sumFE is None:
-                self.fixef(atol, btol)
+                self.fixef(atol=atol, btol=btol)
             fvals = self._fixef.split("+")
             df_fe = newdata[fvals].astype(str)
             # populate fixed effect dicts with omitted categories handling
             fixef_dicts = {}
             for f in fvals:
                 fdict = self._fixef_dict[f"C({f})"]
-                omitted_cat = set(self._data[f].unique().astype(str).tolist()) - set(
-                    fdict.keys()
-                )
+                omitted_cat = {str(x) for x in self._data[f].unique() if x not in fdict}
                 if omitted_cat:
                     fdict.update({x: 0 for x in omitted_cat})
                 fixef_dicts[f"C({f})"] = fdict
@@ -2512,7 +2512,7 @@ def _get_vcov_type(vcov: str, fval: str):
 
 def _drop_multicollinear_variables(
     X: np.ndarray, names: list[str], collin_tol: float
-) -> tuple[np.ndarray, list[str], list[str], list[int]]:
+) -> Any:
     """
     Check for multicollinearity in the design matrices X and Z.
 

diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py
@@ -292,9 +292,8 @@ def compute_deviance(_Y: np.ndarray, mu: np.ndarray):
             stop_iterating = crit < _tol
 
         self._beta_hat = delta_new.flatten()
-        self._Y_hat_response = mu.flatten()
         self._Y_hat_link = eta.flatten()
-        # (Y - self._Y_hat)
+        self._Y_hat_response = mu.flatten()
         # needed for the calculation of the vcov
 
         # update for inference
@@ -350,7 +349,7 @@ def predict(
         newdata: Optional[DataFrameType] = None,
         atol: float = 1e-6,
         btol: float = 1e-6,
-        type: PredictionType = "link",
+        type: PredictionType = "response",
     ) -> np.ndarray:
         """
         Return predicted values from regression model.
@@ -392,15 +391,20 @@ def predict(
         np.ndarray
             A flat array with the predicted values of the regression model.
         """
-        if self._has_fixef:
+        if self._has_fixef and newdata is not None:
             raise NotImplementedError(
-                "Prediction with fixed effects is not yet implemented for Poisson regression."
+                "Predictions with new data and fixed effect are not yet supported."
             )
-        if newdata is not None:
-            raise NotImplementedError(
-                "Prediction with function argument `newdata` is not yet implemented for Poisson regression."
-            )
-        return super().predict(newdata=newdata, type=type, atol=atol, btol=btol)
+
+        y_hat = super().predict(newdata=newdata, type="link", atol=atol, btol=btol)
+
+        if type == "response":
+            y_hat = np.exp(y_hat)
+
+        # if self._has_fixef and newdata is not None:
+        #    y_hat = np.log(y_hat)
+
+        return y_hat
 
 
 def _check_for_separation(

diff --git a/pyfixest/estimation/model_matrix_fixest_.py b/pyfixest/estimation/model_matrix_fixest_.py
@@ -209,13 +209,15 @@ def model_matrix_fixest(
     }
 
 
-def _get_na_index(N: int, Y_index: pd.Series) -> np.ndarray:
+def _get_na_index(N: int, Y_index: pd.Index) -> np.ndarray:
     all_indices = np.arange(N)
+
     max_index = all_indices.max() + 1
     mask = np.ones(max_index, dtype=bool)
     Y_index_arr = Y_index.to_numpy()
+
     mask[Y_index_arr] = False
-    na_index = np.nonzero(mask)[0]
+    na_index = np.flatnonzero(mask)
     return na_index
 
 

diff --git a/pyfixest/utils/dev_utils.py b/pyfixest/utils/dev_utils.py
@@ -167,7 +167,7 @@ def _drop_cols(_data: pd.DataFrame, na_index: np.ndarray):
         return _data
 
 
-def _extract_variable_level(fe_string: str):
+def _extract_variable_level(fe_string: str) -> tuple[str, str]:
     """
     Extract the variable and level from a given string.
 

diff --git a/tests/test_predict_resid_fixef.py b/tests/test_predict_resid_fixef.py
@@ -49,10 +49,16 @@ def test_ols_prediction_internally(data, fml, weights):
     Currently only for OLS.
     """
     # predict via feols, without fixed effect
+
+    data = data.dropna()
+
     mod = feols(fml=fml, data=data, vcov="iid", weights=weights)
     original_prediction = mod.predict()
     updated_prediction = mod.predict(newdata=mod._data)
-    np.allclose(original_prediction, updated_prediction)
+
+    assert np.allclose(
+        original_prediction, updated_prediction
+    ), "preditction with newdata should be identical"
     assert mod._data.shape[0] == original_prediction.shape[0]
     assert mod._data.shape[0] == updated_prediction.shape[0]
 
@@ -62,9 +68,22 @@ def test_ols_prediction_internally(data, fml, weights):
         np.allclose(original_prediction, updated_prediction)
 
 
-@pytest.mark.parametrize("fml", ["Y ~ X1", "Y~X1 |f1", "Y ~ X1 | f1 + f2"])
+@pytest.mark.parametrize("fml", ["Y ~ X1", "Y ~ X1*X2", "Y~X1 |f1", "Y ~ X1 | f1 + f2"])
 @pytest.mark.parametrize("weights", ["weights"])
 def test_poisson_prediction_internally(data, weights, fml):
+    data = data.dropna()
+    mod = fepois(fml=fml, data=data, vcov="iid")
+    original_prediction = mod.predict()
+
+    if mod._has_fixef:
+        with pytest.raises(NotImplementedError):
+            updated_prediction = mod.predict(newdata=mod._data)
+    else:
+        updated_prediction = mod.predict(newdata=mod._data)
+        assert np.allclose(
+            original_prediction, updated_prediction
+        ), "preditction with newdata should be identical"
+
     with pytest.raises(TypeError):
         fit = fepois(fml=fml, data=data, vcov="hetero", weights=weights)
         fit.predict(newdata=fit._data)

diff --git a/tests/test_vs_fixest.py b/tests/test_vs_fixest.py
@@ -1,4 +1,5 @@
 import re
+from typing import get_args
 
 import numpy as np
 import pandas as pd
@@ -10,6 +11,7 @@
 from rpy2.robjects.packages import importr
 
 import pyfixest as pf
+from pyfixest.estimation import literals
 from pyfixest.estimation.estimation import feols
 from pyfixest.estimation.FixestMulti_ import FixestMulti
 from pyfixest.utils.set_rpy2_path import update_r_paths
@@ -33,6 +35,9 @@
 iwls_maxiter = 25
 iwls_tol = 1e-08
 
+# currently, bug when using predict with newdata and i() or C() or "^" syntax
+blocked_transforms = ["i(", "^", "poly("]
+
 ols_fmls = [
     ("Y~X1"),
     ("Y~X1+X2"),
@@ -279,10 +284,7 @@ def test_single_fit_feols(
             (py_resid)[0:5], (r_resid)[0:5], 1e-07, "py_resid != r_resid"
         )
 
-        # currently, bug when using predict with newdata and i() or C() or "^" syntax
-        blocked_transforms = ["i(", "^", "poly("]
         blocked_transform_found = any(bt in fml for bt in blocked_transforms)
-
         if blocked_transform_found:
             with pytest.raises(NotImplementedError):
                 py_predict_newsample = mod.predict(
@@ -382,7 +384,7 @@ def test_single_fit_feols_empty(
 
 
 @pytest.mark.slow
-@pytest.mark.parametrize("dropna", [False])
+@pytest.mark.parametrize("dropna", [False, True])
 @pytest.mark.parametrize("inference", ["iid", "hetero", {"CRV1": "group_id"}])
 @pytest.mark.parametrize("f3_type", ["str"])
 @pytest.mark.parametrize("fml", ols_fmls)
@@ -470,23 +472,62 @@ def test_single_fit_fepois(
     check_absolute_diff(py_deviance, r_deviance, 1e-08, "py_deviance != r_deviance")
 
     if not mod._has_fixef:
-        py_predict_response = mod.predict(type="response")
+        py_predict_default = mod.predict(atol=1e-12, btol=1e-12)
+        r_predict_default = stats.predict(r_fixest)
+        py_predict_response = mod.predict(type="response", atol=1e-12, btol=1e-12)
         py_predict_link = mod.predict(type="link")
-        r_predict_response = stats.predict(r_fixest, type="response")
+        r_predict_response = stats.predict(
+            r_fixest, type="response", atol=1e-12, btol=1e-12
+        )
         r_predict_link = stats.predict(r_fixest, type="link")
+
+        check_absolute_diff(
+            py_predict_default[0:5],
+            r_predict_default[0:5],
+            1e-04 if mod._has_fixef else 1e-07,
+            "py_predict_default != r_predict_default",
+        )
+
         check_absolute_diff(
             py_predict_response[0:5],
             r_predict_response[0:5],
-            1e-07,
+            1e-03 if mod._has_fixef else 1e-07,
             "py_predict_response != r_predict_response",
         )
         check_absolute_diff(
             py_predict_link[0:5],
             r_predict_link[0:5],
-            1e-07,
+            1e-03 if mod._has_fixef else 1e-07,
             "py_predict_link != r_predict_link",
         )
 
+        # check prediction with newdata
+        blocked_transform_found = any(bt in fml for bt in blocked_transforms)
+        if blocked_transform_found:
+            with pytest.raises(NotImplementedError):
+                py_predict_newsample = mod.predict(
+                    newdata=data.iloc[0:100], atol=1e-08, btol=1e-08
+                )
+        else:
+            for prediction_type in get_args(literals.PredictionType):
+                py_predict_newsample = mod.predict(
+                    newdata=data.iloc[0:100],
+                    type=prediction_type,
+                    atol=1e-12,
+                    btol=1e-12,
+                )
+                r_predict_newsample = stats.predict(
+                    r_fixest,
+                    newdata=data_r.iloc[0:100],
+                    type=prediction_type,
+                )
+                check_absolute_diff(
+                    na_omit(py_predict_newsample)[0:5],
+                    na_omit(r_predict_newsample)[0:5],
+                    1e-03 if mod._has_fixef else 1e-07,
+                    f"py_predict_newdata != r_predict_newdata when type == '{prediction_type}'",
+                )
+
 
 @pytest.mark.slow
 @pytest.mark.parametrize("dropna", [False])