diff --git a/docs/changelog.qmd b/docs/changelog.qmd index 68205a1d..3e0749cb 100644 --- a/docs/changelog.qmd +++ b/docs/changelog.qmd @@ -2,6 +2,9 @@ ## PyFixest 0.28.0 (In Development, can be installed from github) +- Add support for the `newdata` argument for `Fepois.predict`. +- Changes `Fepois.predict()` defaults `type` argument to `response`, to match `fixest` defaults. +- Updates the `Fepois._Y_hat_response` attribute, which would previously return `Fepois._Y_hat_link`. - Fix a bug that caused reindexing of `LPDID._coeftable` when calling `LPDID.iplot()`. As a result, a second call of `LPDID.iplot()` would fail. ## PyFixest 0.27.0 diff --git a/pyfixest/estimation/feols_.py b/pyfixest/estimation/feols_.py index e4092224..e0405c32 100644 --- a/pyfixest/estimation/feols_.py +++ b/pyfixest/estimation/feols_.py @@ -3,7 +3,7 @@ import re import warnings from importlib import import_module -from typing import Literal, Optional, Union +from typing import Any, Literal, Optional, Union import numba as nb import numpy as np @@ -1766,7 +1766,7 @@ def predict( newdata: Optional[DataFrameType] = None, atol: float = 1e-6, btol: float = 1e-6, - type: PredictionType = "link", + type: PredictionType = "response", ) -> np.ndarray: """ Predict values of the model on new data. @@ -1775,6 +1775,11 @@ def predict( If new fixed effect levels are introduced in `newdata`, predicted values for such observations will be set to NaN. + The method always returns predictions for the "link" function; for linear + models, this is identical to the "response" function. Transformations to + "response" functions for models where this is not the case - GLMs - + this happens in the dedicated predict method of the respective GLM class. + Parameters ---------- newdata : Optional[DataFrameType], optional @@ -1810,10 +1815,7 @@ def predict( _validate_literal_argument(type, PredictionType) if newdata is None: - if type == "link" or self._method == "feols": - return self._Y_hat_link - else: - return self._Y_hat_response + return self._Y_hat_link newdata = _narwhals_to_pandas(newdata).reset_index(drop=False) @@ -1835,16 +1837,14 @@ def predict( if self._has_fixef: if self._sumFE is None: - self.fixef(atol, btol) + self.fixef(atol=atol, btol=btol) fvals = self._fixef.split("+") df_fe = newdata[fvals].astype(str) # populate fixed effect dicts with omitted categories handling fixef_dicts = {} for f in fvals: fdict = self._fixef_dict[f"C({f})"] - omitted_cat = set(self._data[f].unique().astype(str).tolist()) - set( - fdict.keys() - ) + omitted_cat = {str(x) for x in self._data[f].unique() if x not in fdict} if omitted_cat: fdict.update({x: 0 for x in omitted_cat}) fixef_dicts[f"C({f})"] = fdict @@ -2512,7 +2512,7 @@ def _get_vcov_type(vcov: str, fval: str): def _drop_multicollinear_variables( X: np.ndarray, names: list[str], collin_tol: float -) -> tuple[np.ndarray, list[str], list[str], list[int]]: +) -> Any: """ Check for multicollinearity in the design matrices X and Z. diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index 1644b566..0271e5ff 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -292,9 +292,8 @@ def compute_deviance(_Y: np.ndarray, mu: np.ndarray): stop_iterating = crit < _tol self._beta_hat = delta_new.flatten() - self._Y_hat_response = mu.flatten() self._Y_hat_link = eta.flatten() - # (Y - self._Y_hat) + self._Y_hat_response = mu.flatten() # needed for the calculation of the vcov # update for inference @@ -350,7 +349,7 @@ def predict( newdata: Optional[DataFrameType] = None, atol: float = 1e-6, btol: float = 1e-6, - type: PredictionType = "link", + type: PredictionType = "response", ) -> np.ndarray: """ Return predicted values from regression model. @@ -392,15 +391,20 @@ def predict( np.ndarray A flat array with the predicted values of the regression model. """ - if self._has_fixef: + if self._has_fixef and newdata is not None: raise NotImplementedError( - "Prediction with fixed effects is not yet implemented for Poisson regression." + "Predictions with new data and fixed effect are not yet supported." ) - if newdata is not None: - raise NotImplementedError( - "Prediction with function argument `newdata` is not yet implemented for Poisson regression." - ) - return super().predict(newdata=newdata, type=type, atol=atol, btol=btol) + + y_hat = super().predict(newdata=newdata, type="link", atol=atol, btol=btol) + + if type == "response": + y_hat = np.exp(y_hat) + + # if self._has_fixef and newdata is not None: + # y_hat = np.log(y_hat) + + return y_hat def _check_for_separation( diff --git a/pyfixest/estimation/model_matrix_fixest_.py b/pyfixest/estimation/model_matrix_fixest_.py index 0942cbfe..581b3bd8 100644 --- a/pyfixest/estimation/model_matrix_fixest_.py +++ b/pyfixest/estimation/model_matrix_fixest_.py @@ -209,13 +209,15 @@ def model_matrix_fixest( } -def _get_na_index(N: int, Y_index: pd.Series) -> np.ndarray: +def _get_na_index(N: int, Y_index: pd.Index) -> np.ndarray: all_indices = np.arange(N) + max_index = all_indices.max() + 1 mask = np.ones(max_index, dtype=bool) Y_index_arr = Y_index.to_numpy() + mask[Y_index_arr] = False - na_index = np.nonzero(mask)[0] + na_index = np.flatnonzero(mask) return na_index diff --git a/pyfixest/utils/dev_utils.py b/pyfixest/utils/dev_utils.py index bc8966f0..5f200624 100644 --- a/pyfixest/utils/dev_utils.py +++ b/pyfixest/utils/dev_utils.py @@ -167,7 +167,7 @@ def _drop_cols(_data: pd.DataFrame, na_index: np.ndarray): return _data -def _extract_variable_level(fe_string: str): +def _extract_variable_level(fe_string: str) -> tuple[str, str]: """ Extract the variable and level from a given string. diff --git a/tests/test_predict_resid_fixef.py b/tests/test_predict_resid_fixef.py index 18aa711f..1707d2a8 100644 --- a/tests/test_predict_resid_fixef.py +++ b/tests/test_predict_resid_fixef.py @@ -49,10 +49,16 @@ def test_ols_prediction_internally(data, fml, weights): Currently only for OLS. """ # predict via feols, without fixed effect + + data = data.dropna() + mod = feols(fml=fml, data=data, vcov="iid", weights=weights) original_prediction = mod.predict() updated_prediction = mod.predict(newdata=mod._data) - np.allclose(original_prediction, updated_prediction) + + assert np.allclose( + original_prediction, updated_prediction + ), "preditction with newdata should be identical" assert mod._data.shape[0] == original_prediction.shape[0] assert mod._data.shape[0] == updated_prediction.shape[0] @@ -62,9 +68,22 @@ def test_ols_prediction_internally(data, fml, weights): np.allclose(original_prediction, updated_prediction) -@pytest.mark.parametrize("fml", ["Y ~ X1", "Y~X1 |f1", "Y ~ X1 | f1 + f2"]) +@pytest.mark.parametrize("fml", ["Y ~ X1", "Y ~ X1*X2", "Y~X1 |f1", "Y ~ X1 | f1 + f2"]) @pytest.mark.parametrize("weights", ["weights"]) def test_poisson_prediction_internally(data, weights, fml): + data = data.dropna() + mod = fepois(fml=fml, data=data, vcov="iid") + original_prediction = mod.predict() + + if mod._has_fixef: + with pytest.raises(NotImplementedError): + updated_prediction = mod.predict(newdata=mod._data) + else: + updated_prediction = mod.predict(newdata=mod._data) + assert np.allclose( + original_prediction, updated_prediction + ), "preditction with newdata should be identical" + with pytest.raises(TypeError): fit = fepois(fml=fml, data=data, vcov="hetero", weights=weights) fit.predict(newdata=fit._data) diff --git a/tests/test_vs_fixest.py b/tests/test_vs_fixest.py index 70be2a25..1839608d 100644 --- a/tests/test_vs_fixest.py +++ b/tests/test_vs_fixest.py @@ -1,4 +1,5 @@ import re +from typing import get_args import numpy as np import pandas as pd @@ -10,6 +11,7 @@ from rpy2.robjects.packages import importr import pyfixest as pf +from pyfixest.estimation import literals from pyfixest.estimation.estimation import feols from pyfixest.estimation.FixestMulti_ import FixestMulti from pyfixest.utils.set_rpy2_path import update_r_paths @@ -33,6 +35,9 @@ iwls_maxiter = 25 iwls_tol = 1e-08 +# currently, bug when using predict with newdata and i() or C() or "^" syntax +blocked_transforms = ["i(", "^", "poly("] + ols_fmls = [ ("Y~X1"), ("Y~X1+X2"), @@ -279,10 +284,7 @@ def test_single_fit_feols( (py_resid)[0:5], (r_resid)[0:5], 1e-07, "py_resid != r_resid" ) - # currently, bug when using predict with newdata and i() or C() or "^" syntax - blocked_transforms = ["i(", "^", "poly("] blocked_transform_found = any(bt in fml for bt in blocked_transforms) - if blocked_transform_found: with pytest.raises(NotImplementedError): py_predict_newsample = mod.predict( @@ -382,7 +384,7 @@ def test_single_fit_feols_empty( @pytest.mark.slow -@pytest.mark.parametrize("dropna", [False]) +@pytest.mark.parametrize("dropna", [False, True]) @pytest.mark.parametrize("inference", ["iid", "hetero", {"CRV1": "group_id"}]) @pytest.mark.parametrize("f3_type", ["str"]) @pytest.mark.parametrize("fml", ols_fmls) @@ -470,23 +472,62 @@ def test_single_fit_fepois( check_absolute_diff(py_deviance, r_deviance, 1e-08, "py_deviance != r_deviance") if not mod._has_fixef: - py_predict_response = mod.predict(type="response") + py_predict_default = mod.predict(atol=1e-12, btol=1e-12) + r_predict_default = stats.predict(r_fixest) + py_predict_response = mod.predict(type="response", atol=1e-12, btol=1e-12) py_predict_link = mod.predict(type="link") - r_predict_response = stats.predict(r_fixest, type="response") + r_predict_response = stats.predict( + r_fixest, type="response", atol=1e-12, btol=1e-12 + ) r_predict_link = stats.predict(r_fixest, type="link") + + check_absolute_diff( + py_predict_default[0:5], + r_predict_default[0:5], + 1e-04 if mod._has_fixef else 1e-07, + "py_predict_default != r_predict_default", + ) + check_absolute_diff( py_predict_response[0:5], r_predict_response[0:5], - 1e-07, + 1e-03 if mod._has_fixef else 1e-07, "py_predict_response != r_predict_response", ) check_absolute_diff( py_predict_link[0:5], r_predict_link[0:5], - 1e-07, + 1e-03 if mod._has_fixef else 1e-07, "py_predict_link != r_predict_link", ) + # check prediction with newdata + blocked_transform_found = any(bt in fml for bt in blocked_transforms) + if blocked_transform_found: + with pytest.raises(NotImplementedError): + py_predict_newsample = mod.predict( + newdata=data.iloc[0:100], atol=1e-08, btol=1e-08 + ) + else: + for prediction_type in get_args(literals.PredictionType): + py_predict_newsample = mod.predict( + newdata=data.iloc[0:100], + type=prediction_type, + atol=1e-12, + btol=1e-12, + ) + r_predict_newsample = stats.predict( + r_fixest, + newdata=data_r.iloc[0:100], + type=prediction_type, + ) + check_absolute_diff( + na_omit(py_predict_newsample)[0:5], + na_omit(r_predict_newsample)[0:5], + 1e-03 if mod._has_fixef else 1e-07, + f"py_predict_newdata != r_predict_newdata when type == '{prediction_type}'", + ) + @pytest.mark.slow @pytest.mark.parametrize("dropna", [False])