Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lsqr() for fixed effects solver #546

Merged
merged 20 commits into from
Jul 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@ readme.ipynb
#objects.json
#docs/site_libs
#site_libs
tests/.coverage

18 changes: 12 additions & 6 deletions pyfixest/estimation/feols_.py
Original file line number Diff line number Diff line change
Expand Up @@ -1358,7 +1358,7 @@ def ccv(
n_splits=n_splits,
)

def fixef(self) -> dict[str, dict[str, float]]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add docstrings that explain atol and btol and that refer to the scipy lsqr docs?

https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.lsqr.html

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I've seen you did it already for predict. Thanks!

def fixef(self, atol : float = 1e-06, btol : float = 1e-06) -> dict[str, dict[str, float]]:
"""
Compute the coefficients of (swept out) fixed effects for a regression model.

Expand Down Expand Up @@ -1404,13 +1404,13 @@ def fixef(self) -> dict[str, dict[str, float]]:
X = X[self._coefnames] # drop intercept, potentially multicollinear vars
Y = Y.to_numpy().flatten().astype(np.float64)
X = X.to_numpy()
uhat = csr_matrix(Y - X @ self._beta_hat).transpose()
uhat = (Y - X @ self._beta_hat).flatten()

D2 = Formula("-1+" + fixef_fml).get_model_matrix(_data, output="sparse")
cols = D2.model_spec.column_names
s3alfisc marked this conversation as resolved.
Show resolved Hide resolved

alpha = spsolve(D2.transpose() @ D2, D2.transpose() @ uhat)

alpha = lsqr(D2, uhat, atol=atol, btol=btol)[0]
res: dict[str, dict[str, float]] = {}
for i, col in enumerate(cols):
variable, level = _extract_variable_level(col)
Expand All @@ -1429,7 +1429,7 @@ def fixef(self) -> dict[str, dict[str, float]]:

return self._fixef_dict

def predict(self, newdata: Optional[DataFrameType] = None) -> np.ndarray:
def predict(self, newdata: Optional[DataFrameType] = None, atol: float = 1e-6, btol: float = 1e-6) -> np.ndarray:
"""
Predict values of the model on new data.

Expand All @@ -1442,6 +1442,12 @@ def predict(self, newdata: Optional[DataFrameType] = None) -> np.ndarray:
newdata : Optional[DataFrameType], optional
A pd.DataFrame or pl.DataFrame with the data to be used for prediction.
If None (default), the data used for fitting the model is used.
atol : Float, default 1e-6
Stopping tolerance for scipy.sparse.linalg.lsqr().
See https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.lsqr.html
btol : Float, default 1e-6
Another stopping tolerance for scipy.sparse.linalg.lsqr().
See https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.lsqr.html

Returns
-------
Expand Down Expand Up @@ -1472,7 +1478,7 @@ def predict(self, newdata: Optional[DataFrameType] = None) -> np.ndarray:

if self._has_fixef:
if self._sumFE is None:
self.fixef()
self.fixef(atol, btol)
fvals = self._fixef.split("+")
df_fe = newdata[fvals].astype(str)
# populate fixed effect dicts with omitted categories handling
Expand Down
Binary file modified tests/.coverage
Binary file not shown.
20 changes: 10 additions & 10 deletions tests/test_did.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ def test_event_study(data):
did2s_df = pd.DataFrame(did2s_df).T

if True:
np.testing.assert_allclose(fit_did2s.coef(), stats.coef(fit_did2s_r))
np.testing.assert_allclose(fit_did2s.se(), float(did2s_df[2]))
np.testing.assert_allclose(fit_did2s.coef(), stats.coef(fit_did2s_r), atol = 1e-05, rtol = 1e-05)
np.testing.assert_allclose(fit_did2s.se(), float(did2s_df[2]), atol = 1e-05, rtol = 1e-05)


def test_did2s(data):
Expand Down Expand Up @@ -78,8 +78,8 @@ def test_did2s(data):
did2s_df = broom.tidy_fixest(fit_did2s_r, conf_int=ro.BoolVector([True]))
did2s_df = pd.DataFrame(did2s_df).T

np.testing.assert_allclose(fit_did2s.coef(), stats.coef(fit_did2s_r))
np.testing.assert_allclose(fit_did2s.se(), float(did2s_df[2]))
np.testing.assert_allclose(fit_did2s.coef(), stats.coef(fit_did2s_r), atol = 1e-05, rtol = 1e-05)
np.testing.assert_allclose(fit_did2s.se(), float(did2s_df[2]), atol = 1e-05, rtol = 1e-05)

if True:
# ATT, event study
Expand All @@ -105,8 +105,8 @@ def test_did2s(data):
did2s_df = broom.tidy_fixest(fit_r, conf_int=ro.BoolVector([True]))
did2s_df = pd.DataFrame(did2s_df).T

np.testing.assert_allclose(fit.coef(), stats.coef(fit_r))
np.testing.assert_allclose(fit.se(), did2s_df[2].values.astype(float))
np.testing.assert_allclose(fit.coef(), stats.coef(fit_r), atol = 1e-05, rtol = 1e-05)
np.testing.assert_allclose(fit.se(), did2s_df[2].values.astype(float), atol = 1e-05, rtol = 1e-05)

if True:
# test event study with covariate in first stage
Expand All @@ -131,8 +131,8 @@ def test_did2s(data):
did2s_df = broom.tidy_fixest(fit_r, conf_int=ro.BoolVector([True]))
did2s_df = pd.DataFrame(did2s_df).T

np.testing.assert_allclose(fit.coef(), stats.coef(fit_r))
np.testing.assert_allclose(fit.se(), did2s_df[2].values.astype(float))
np.testing.assert_allclose(fit.coef(), stats.coef(fit_r), atol = 1e-05, rtol = 1e-05)
np.testing.assert_allclose(fit.se(), did2s_df[2].values.astype(float), atol = 1e-05, rtol = 1e-05)

if True:
# test event study with covariate in first stage and second stage
Expand All @@ -157,8 +157,8 @@ def test_did2s(data):
did2s_df = broom.tidy_fixest(fit_r, conf_int=ro.BoolVector([True]))
did2s_df = pd.DataFrame(did2s_df).T

np.testing.assert_allclose(fit.coef(), stats.coef(fit_r))
np.testing.assert_allclose(fit.se(), did2s_df[2].values.astype(float))
np.testing.assert_allclose(fit.coef(), stats.coef(fit_r), atol = 1e-05, rtol = 1e-05)
np.testing.assert_allclose(fit.se(), did2s_df[2].values.astype(float), atol = 1e-05, rtol = 1e-05)

if True:
# binary non boolean treatment variable, just check that it runs
Expand Down
8 changes: 4 additions & 4 deletions tests/test_predict_resid_fixef.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def test_predict_nas():
res = fit.predict(newdata=data)
fit_r = fixest.feols(ro.Formula(fml), data=data)
res_r = stats.predict(fit_r, newdata=data)
np.testing.assert_allclose(res, res_r)
np.testing.assert_allclose(res, res_r, atol=1e-05, rtol=1e-05)
assert data.shape[0] == len(res)
assert len(res) == len(res_r)

Expand All @@ -187,14 +187,14 @@ def test_predict_nas():
res = fit.predict(newdata=newdata)
fit_r = fixest.feols(ro.Formula(fml), data=data)
res_r = stats.predict(fit_r, newdata=newdata)
np.testing.assert_allclose(res, res_r)
np.testing.assert_allclose(res, res_r, atol=1e-05, rtol=1e-05)
assert newdata.shape[0] == len(res)
assert len(res) == len(res_r)

newdata.loc[198, "Y"] = np.nan
res = fit.predict(newdata=newdata)
res_r = stats.predict(fit_r, newdata=newdata)
np.testing.assert_allclose(res, res_r)
np.testing.assert_allclose(res, res_r, atol=1e-05, rtol=1e-05)
assert newdata.shape[0] == len(res)
assert len(res) == len(res_r)

Expand All @@ -204,7 +204,7 @@ def test_predict_nas():
res = fit.predict(newdata=data)
fit_r = fixest.feols(ro.Formula(fml), data=data)
res_r = stats.predict(fit_r, newdata=data)
np.testing.assert_allclose(res, res_r)
np.testing.assert_allclose(res, res_r, atol=1e-05, rtol=1e-05)
assert data.shape[0] == len(res)
assert len(res) == len(res_r)

Expand Down