Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fepois predict with newdata #703

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
3 changes: 3 additions & 0 deletions docs/changelog.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## PyFixest 0.28.0 (In Development, can be installed from github)

- Add support for the `newdata` argument for `Fepois.predict`.
- Changes `Fepois.predict()` defaults `type` argument to `response`, to match `fixest` defaults.
- Updates the `Fepois._Y_hat_response` attribute, which would previously return `Fepois._Y_hat_link`.
- Fix a bug that caused reindexing of `LPDID._coeftable` when calling `LPDID.iplot()`. As a result, a second call of `LPDID.iplot()` would fail.

## PyFixest 0.27.0
Expand Down
22 changes: 11 additions & 11 deletions pyfixest/estimation/feols_.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
import warnings
from importlib import import_module
from typing import Literal, Optional, Union
from typing import Any, Literal, Optional, Union

import numba as nb
import numpy as np
Expand Down Expand Up @@ -1766,7 +1766,7 @@ def predict(
newdata: Optional[DataFrameType] = None,
atol: float = 1e-6,
btol: float = 1e-6,
type: PredictionType = "link",
type: PredictionType = "response",
) -> np.ndarray:
"""
Predict values of the model on new data.
Expand All @@ -1775,6 +1775,11 @@ def predict(
If new fixed effect levels are introduced in `newdata`, predicted values
for such observations will be set to NaN.

The method always returns predictions for the "link" function; for linear
models, this is identical to the "response" function. Transformations to
"response" functions for models where this is not the case - GLMs -
this happens in the dedicated predict method of the respective GLM class.

Parameters
----------
newdata : Optional[DataFrameType], optional
Expand Down Expand Up @@ -1810,10 +1815,7 @@ def predict(
_validate_literal_argument(type, PredictionType)

if newdata is None:
if type == "link" or self._method == "feols":
return self._Y_hat_link
else:
return self._Y_hat_response
return self._Y_hat_link

newdata = _narwhals_to_pandas(newdata).reset_index(drop=False)

Expand All @@ -1835,16 +1837,14 @@ def predict(

if self._has_fixef:
if self._sumFE is None:
self.fixef(atol, btol)
self.fixef(atol=atol, btol=btol)
fvals = self._fixef.split("+")
df_fe = newdata[fvals].astype(str)
# populate fixed effect dicts with omitted categories handling
fixef_dicts = {}
for f in fvals:
fdict = self._fixef_dict[f"C({f})"]
omitted_cat = set(self._data[f].unique().astype(str).tolist()) - set(
fdict.keys()
)
omitted_cat = {str(x) for x in self._data[f].unique() if x not in fdict}
if omitted_cat:
fdict.update({x: 0 for x in omitted_cat})
fixef_dicts[f"C({f})"] = fdict
Expand Down Expand Up @@ -2512,7 +2512,7 @@ def _get_vcov_type(vcov: str, fval: str):

def _drop_multicollinear_variables(
X: np.ndarray, names: list[str], collin_tol: float
) -> tuple[np.ndarray, list[str], list[str], list[int]]:
) -> Any:
"""
Check for multicollinearity in the design matrices X and Z.

Expand Down
24 changes: 14 additions & 10 deletions pyfixest/estimation/fepois_.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,9 +292,8 @@ def compute_deviance(_Y: np.ndarray, mu: np.ndarray):
stop_iterating = crit < _tol

self._beta_hat = delta_new.flatten()
self._Y_hat_response = mu.flatten()
self._Y_hat_link = eta.flatten()
# (Y - self._Y_hat)
self._Y_hat_response = mu.flatten()
# needed for the calculation of the vcov

# update for inference
Expand Down Expand Up @@ -350,7 +349,7 @@ def predict(
newdata: Optional[DataFrameType] = None,
atol: float = 1e-6,
btol: float = 1e-6,
type: PredictionType = "link",
type: PredictionType = "response",
) -> np.ndarray:
"""
Return predicted values from regression model.
Expand Down Expand Up @@ -392,15 +391,20 @@ def predict(
np.ndarray
A flat array with the predicted values of the regression model.
"""
if self._has_fixef:
if self._has_fixef and newdata is not None:
raise NotImplementedError(
"Prediction with fixed effects is not yet implemented for Poisson regression."
"Predictions with new data and fixed effect are not yet supported."
)
if newdata is not None:
raise NotImplementedError(
"Prediction with function argument `newdata` is not yet implemented for Poisson regression."
)
return super().predict(newdata=newdata, type=type, atol=atol, btol=btol)

y_hat = super().predict(newdata=newdata, type="link", atol=atol, btol=btol)

if type == "response":
y_hat = np.exp(y_hat)

# if self._has_fixef and newdata is not None:
# y_hat = np.log(y_hat)

return y_hat


def _check_for_separation(
Expand Down
6 changes: 4 additions & 2 deletions pyfixest/estimation/model_matrix_fixest_.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,13 +209,15 @@ def model_matrix_fixest(
}


def _get_na_index(N: int, Y_index: pd.Series) -> np.ndarray:
def _get_na_index(N: int, Y_index: pd.Index) -> np.ndarray:
all_indices = np.arange(N)

max_index = all_indices.max() + 1
mask = np.ones(max_index, dtype=bool)
Y_index_arr = Y_index.to_numpy()

mask[Y_index_arr] = False
na_index = np.nonzero(mask)[0]
na_index = np.flatnonzero(mask)
return na_index


Expand Down
2 changes: 1 addition & 1 deletion pyfixest/utils/dev_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def _drop_cols(_data: pd.DataFrame, na_index: np.ndarray):
return _data


def _extract_variable_level(fe_string: str):
def _extract_variable_level(fe_string: str) -> tuple[str, str]:
"""
Extract the variable and level from a given string.

Expand Down
23 changes: 21 additions & 2 deletions tests/test_predict_resid_fixef.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,16 @@ def test_ols_prediction_internally(data, fml, weights):
Currently only for OLS.
"""
# predict via feols, without fixed effect

data = data.dropna()

mod = feols(fml=fml, data=data, vcov="iid", weights=weights)
original_prediction = mod.predict()
updated_prediction = mod.predict(newdata=mod._data)
np.allclose(original_prediction, updated_prediction)

assert np.allclose(
original_prediction, updated_prediction
), "preditction with newdata should be identical"
assert mod._data.shape[0] == original_prediction.shape[0]
assert mod._data.shape[0] == updated_prediction.shape[0]

Expand All @@ -62,9 +68,22 @@ def test_ols_prediction_internally(data, fml, weights):
np.allclose(original_prediction, updated_prediction)


@pytest.mark.parametrize("fml", ["Y ~ X1", "Y~X1 |f1", "Y ~ X1 | f1 + f2"])
@pytest.mark.parametrize("fml", ["Y ~ X1", "Y ~ X1*X2", "Y~X1 |f1", "Y ~ X1 | f1 + f2"])
@pytest.mark.parametrize("weights", ["weights"])
def test_poisson_prediction_internally(data, weights, fml):
data = data.dropna()
mod = fepois(fml=fml, data=data, vcov="iid")
original_prediction = mod.predict()

if mod._has_fixef:
with pytest.raises(NotImplementedError):
updated_prediction = mod.predict(newdata=mod._data)
else:
updated_prediction = mod.predict(newdata=mod._data)
assert np.allclose(
original_prediction, updated_prediction
), "preditction with newdata should be identical"

with pytest.raises(TypeError):
fit = fepois(fml=fml, data=data, vcov="hetero", weights=weights)
fit.predict(newdata=fit._data)
Expand Down
57 changes: 49 additions & 8 deletions tests/test_vs_fixest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
from typing import get_args

import numpy as np
import pandas as pd
Expand All @@ -10,6 +11,7 @@
from rpy2.robjects.packages import importr

import pyfixest as pf
from pyfixest.estimation import literals
from pyfixest.estimation.estimation import feols
from pyfixest.estimation.FixestMulti_ import FixestMulti
from pyfixest.utils.set_rpy2_path import update_r_paths
Expand All @@ -33,6 +35,9 @@
iwls_maxiter = 25
iwls_tol = 1e-08

# currently, bug when using predict with newdata and i() or C() or "^" syntax
blocked_transforms = ["i(", "^", "poly("]

ols_fmls = [
("Y~X1"),
("Y~X1+X2"),
Expand Down Expand Up @@ -279,10 +284,7 @@ def test_single_fit_feols(
(py_resid)[0:5], (r_resid)[0:5], 1e-07, "py_resid != r_resid"
)

# currently, bug when using predict with newdata and i() or C() or "^" syntax
blocked_transforms = ["i(", "^", "poly("]
blocked_transform_found = any(bt in fml for bt in blocked_transforms)

if blocked_transform_found:
with pytest.raises(NotImplementedError):
py_predict_newsample = mod.predict(
Expand Down Expand Up @@ -382,7 +384,7 @@ def test_single_fit_feols_empty(


@pytest.mark.slow
@pytest.mark.parametrize("dropna", [False])
@pytest.mark.parametrize("dropna", [False, True])
@pytest.mark.parametrize("inference", ["iid", "hetero", {"CRV1": "group_id"}])
@pytest.mark.parametrize("f3_type", ["str"])
@pytest.mark.parametrize("fml", ols_fmls)
Expand Down Expand Up @@ -470,23 +472,62 @@ def test_single_fit_fepois(
check_absolute_diff(py_deviance, r_deviance, 1e-08, "py_deviance != r_deviance")

if not mod._has_fixef:
py_predict_response = mod.predict(type="response")
py_predict_default = mod.predict(atol=1e-12, btol=1e-12)
r_predict_default = stats.predict(r_fixest)
py_predict_response = mod.predict(type="response", atol=1e-12, btol=1e-12)
py_predict_link = mod.predict(type="link")
r_predict_response = stats.predict(r_fixest, type="response")
r_predict_response = stats.predict(
r_fixest, type="response", atol=1e-12, btol=1e-12
)
r_predict_link = stats.predict(r_fixest, type="link")

check_absolute_diff(
py_predict_default[0:5],
r_predict_default[0:5],
1e-04 if mod._has_fixef else 1e-07,
"py_predict_default != r_predict_default",
)

check_absolute_diff(
py_predict_response[0:5],
r_predict_response[0:5],
1e-07,
1e-03 if mod._has_fixef else 1e-07,
"py_predict_response != r_predict_response",
)
check_absolute_diff(
py_predict_link[0:5],
r_predict_link[0:5],
1e-07,
1e-03 if mod._has_fixef else 1e-07,
"py_predict_link != r_predict_link",
)

# check prediction with newdata
blocked_transform_found = any(bt in fml for bt in blocked_transforms)
if blocked_transform_found:
with pytest.raises(NotImplementedError):
py_predict_newsample = mod.predict(
newdata=data.iloc[0:100], atol=1e-08, btol=1e-08
)
else:
for prediction_type in get_args(literals.PredictionType):
py_predict_newsample = mod.predict(
newdata=data.iloc[0:100],
type=prediction_type,
atol=1e-12,
btol=1e-12,
)
r_predict_newsample = stats.predict(
r_fixest,
newdata=data_r.iloc[0:100],
type=prediction_type,
)
check_absolute_diff(
na_omit(py_predict_newsample)[0:5],
na_omit(r_predict_newsample)[0:5],
1e-03 if mod._has_fixef else 1e-07,
f"py_predict_newdata != r_predict_newdata when type == '{prediction_type}'",
)


@pytest.mark.slow
@pytest.mark.parametrize("dropna", [False])
Expand Down
Loading