Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run more tests #603

Merged
merged 10 commits into from
Sep 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyfixest/estimation/FixestMulti_.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,7 @@ def _estimate_all_models(
_data_clean = _drop_cols(_data, FIT.na_index)

FIT.add_fixest_multi_context(
fml=FixestFormula.fml,
FixestFormula=FixestFormula,
depvar=FixestFormula._depvar,
Y=Y,
_data=_data_clean,
Expand Down
59 changes: 17 additions & 42 deletions pyfixest/estimation/feiv_.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import warnings
from importlib import import_module
from typing import Optional, Union

import numpy as np
import pandas as pd

from pyfixest.estimation.feols_ import Feols, _drop_multicollinear_variables

Expand Down Expand Up @@ -213,59 +213,35 @@ def get_fit(self) -> None:

def first_stage(self) -> None:
"""Implement First stage regression."""
from pyfixest.estimation.estimation import feols

# Store names of instruments from Z matrix
self._non_exo_instruments = list(set(self._coefnames_z) - set(self._coefnames))

# Prepare your data
# Select the columns from self._data that match the variable names
data = pd.DataFrame(self._Z_1st_stage, columns=self._coefnames_z)

# Store instrument variable matrix for future use
_Z_iv = data[self._non_exo_instruments]
self._Z_iv = _Z_iv.to_numpy()
# Dynamically create the formula string
# Extract names of fixed effect

data["Y"] = self._endogvar_1st_stage
independent_vars = " + ".join(
data.columns[:-1]
) # All columns except the last one ('Y')

# Set fix effects, cluster, and weight options to be passed to feols.
fixest_module = import_module("pyfixest.estimation")
fit_ = getattr(fixest_module, "feols")

fml_first_stage = self._FixestFormula.fml_first_stage.replace(" ", "")
if self._has_fixef:
FE_vars = self._fixef.split("+")
data[FE_vars] = self._data[FE_vars]
formula = f"Y ~ {independent_vars} | {self._fixef}"
else:
formula = f"Y ~ {independent_vars}"
fml_first_stage += f" | {self._fixef}"

# Type hint to reflect that vcov_detail can be either a dict or a str
vcov_detail: Union[dict[str, str], str]

if self._is_clustered:
a = self._clustervar[0]
vcov_detail = {self._vcov_type_detail: a}
data[a] = self._data[a]
else:
vcov_detail = self._vcov_type_detail

if self._has_weights:
data["weights"] = self._weights
weight_detail = "weights"
else:
weight_detail = None
weight_detail = "weights" if self._has_weights else None

# Do first stage regression
model1 = feols(
fml=formula,
data=data,
model1 = fit_(
fml=fml_first_stage,
data=self._data,
vcov=vcov_detail,
weights=weight_detail,
weights_type=self._weights_type_feiv,
collin_tol=1e-10,
collin_tol=self._collin_tol,
)

# Ensure model1 is of type Feols
Expand Down Expand Up @@ -453,6 +429,7 @@ def IV_weakness_test(self, iv_diag_statistics: Optional[list[str]] = None) -> No
def eff_F(self) -> None:
"""Compute Effective F stat (Olea and Pflueger 2013)."""
# If vcov is iid, redo first stage regression

if self._vcov_type_detail == "iid":
self._vcov_type_detail = "hetero"
self._model_1st_stage.vcov("hetero")
Expand All @@ -470,14 +447,12 @@ def eff_F(self) -> None:

# Extract coefficients for the non-exogenous instruments

pi_hat = np.array(
[
self._model_1st_stage.coef()[instrument]
for instrument in self._non_exo_instruments
]
)

Z = self._Z_iv
pi_hat = np.array(self._model_1st_stage.coef()[self._non_exo_instruments])
iv_positions = [
self._coefnames_z.index(instrument)
for instrument in self._non_exo_instruments
]
Z = self._model_1st_stage._X[:, iv_positions]

# Calculate the cross-product of the instrument matrix
Q_zz = Z.T @ Z
Expand Down
15 changes: 10 additions & 5 deletions pyfixest/estimation/feols_.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from scipy.stats import chi2, f, norm, t

from pyfixest.errors import VcovTypeNotSupportedError
from pyfixest.estimation.FormulaParser import FixestFormula
from pyfixest.estimation.ritest import (
_decode_resampvar,
_get_ritest_pvalue,
Expand Down Expand Up @@ -728,7 +729,7 @@ def get_inference(self, alpha: float = 0.05) -> None:

def add_fixest_multi_context(
self,
fml: str,
FixestFormula: FixestFormula,
depvar: str,
Y: pd.Series,
_data: pd.DataFrame,
Expand All @@ -745,8 +746,8 @@ def add_fixest_multi_context(

Parameters
----------
fml : str
The formula used for estimation.
FixestFormula : FixestFormula
The formula(s) used for estimation encoded in a `FixestFormula` object.
depvar : str
The dependent variable of the regression model.
Y : pd.Series
Expand All @@ -767,7 +768,8 @@ def add_fixest_multi_context(
None
"""
# some bookkeeping
self._fml = fml
self._fml = FixestFormula.fml
self._FixestFormula = FixestFormula
self._depvar = depvar
self._Y_untransformed = Y
self._data = pd.DataFrame()
Expand Down Expand Up @@ -1803,7 +1805,10 @@ def resid(self) -> np.ndarray:
np.ndarray
A np.ndarray with the residuals of the estimated regression model.
"""
return self._u_hat.flatten() / np.sqrt(self._weights).flatten()
if self._X_is_empty:
return self._u_hat.flatten()
else:
return self._u_hat.flatten() / np.sqrt(self._weights.flatten())

def ritest(
self,
Expand Down
40 changes: 32 additions & 8 deletions pyfixest/estimation/fepois_.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,11 +247,14 @@ def compute_deviance(_Y: np.ndarray, mu: np.ndarray):

# updat for inference
self._weights = mu_old
self._irls_weights = mu
# if only one dim
if self._weights.ndim == 1:
self._weights = self._weights.reshape((self._N, 1))

self._u_hat = (WZ - WX @ delta_new).flatten()
self._u_hat_working = resid
self._u_hat_response = self._Y - np.exp(eta)

self._Y = WZ
self._X = WX
Expand All @@ -268,6 +271,28 @@ def compute_deviance(_Y: np.ndarray, mu: np.ndarray):
if _convergence:
self._convergence = True

def resid(self, type: str = "response") -> np.ndarray:
"""
Return residuals from regression model.

Parameters
----------
type : str, optional
The type of residuals to be computed.
Can be either "response" (default) or "working".

Returns
-------
np.ndarray
A flat array with the residuals of the regression model.
"""
if type == "response":
return self._u_hat_response.flatten()
elif type == "working":
return self._u_hat_working.flatten()
else:
raise ValueError("type must be one of 'response' or 'working'.")

def predict(
self,
newdata: Optional[DataFrameType] = None,
Expand Down Expand Up @@ -317,7 +342,7 @@ def predict(
np.ndarray
A flat array with the predicted values of the regression model.
"""
_Xbeta = self._Xbeta
_Xbeta = self._Xbeta.flatten()
_has_fixef = self._has_fixef

if _has_fixef:
Expand All @@ -329,14 +354,13 @@ def predict(
"Prediction with function argument `newdata` is not yet implemented for Poisson regression."
)

if type not in ["response", "link"]:
raise ValueError("type must be one of 'response' or 'link'.")

y_hat = super().predict(newdata=newdata, type=type, atol=atol, btol=btol)
# y_hat = super().predict(newdata=newdata, type=type, atol=atol, btol=btol)
if type == "link":
y_hat = np.exp(y_hat)

return y_hat
return np.exp(_Xbeta)
elif type == "response":
return _Xbeta
else:
raise ValueError("type must be one of 'response' or 'link'.")


def _check_for_separation(Y: pd.DataFrame, fe: pd.DataFrame) -> list[int]:
Expand Down
14 changes: 7 additions & 7 deletions scripts/run_notebooks.py
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Script to run all notebooks example notebooks.

Taken from: https://github.com/pymc-labs/pymc-marketing/blob/main/scripts/run_notebooks/runner.py
Taken from: https://github.com/pymc-labs/pymc-marketing/blob/main/scripts/_run_notebooks/runner.py
"""

import logging
Expand All @@ -22,19 +22,19 @@
]


def setup_logging() -> None:
def _setup_logging() -> None:
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)


def get_cwd_from_notebook_path(notebook_path: Path) -> str:
def _get_cwd_from_notebook_path(notebook_path: Path) -> str:
return str(notebook_path).rsplit("/", 1)[0]


def run_notebook(notebook_path: Path) -> None:
cwd = get_cwd_from_notebook_path(notebook_path)
def _run_notebook(notebook_path: Path) -> None:
cwd = _get_cwd_from_notebook_path(notebook_path)
logging.info(f"Running notebook: {notebook_path.name}")
papermill.execute_notebook(
input_path=str(notebook_path),
Expand All @@ -45,11 +45,11 @@ def run_notebook(notebook_path: Path) -> None:


if __name__ == "__main__":
setup_logging()
_setup_logging()
logging.info("Starting notebook runner")
logging.info(f"Notebooks to run: {NOTEBOOKS}")
Parallel(n_jobs=-1)(
delayed(run_notebook)(notebook_path) for notebook_path in tqdm(NOTEBOOKS)
delayed(_run_notebook)(notebook_path) for notebook_path in tqdm(NOTEBOOKS)
)

logging.info("Notebooks run successfully!")
20 changes: 13 additions & 7 deletions tests/test_vcov.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@ def data():
return pf.get_data()


models = ["Y~X1", "Y~X1 | f1"]
models = [
"Y~X1",
"Y~X1 | f1",
"Y ~ 1 | X1 ~ Z1",
"Y ~ 1 | f1| X1 ~ Z1",
"Y ~ X2 | X1 ~ Z1 + Z2",
]
adj = [False, True]
cluster_adj = [False, True]

Expand All @@ -42,7 +48,7 @@ def test_iid(data, fml, adj, cluster_adj):
py_mod_vcov = py_mod._vcov
r_mod_vcov = stats.vcov(r_mod)

assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-16)
assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-15)


@pytest.mark.parametrize("fml", models)
Expand Down Expand Up @@ -77,7 +83,7 @@ def test_hetero(data, fml, adj, cluster_adj):
py_mod_vcov = py_mod._vcov
r_mod_vcov = stats.vcov(r_mod)

assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) * adj_factor < 1e-16)
assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) * adj_factor < 1e-15)


@pytest.mark.parametrize("fml", models)
Expand All @@ -100,7 +106,7 @@ def test_crv1(data, fml, adj, cluster_adj):
py_mod_vcov = py_mod._vcov
r_mod_vcov = stats.vcov(r_mod)

assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-16)
assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-15)


@pytest.mark.parametrize("fml", models)
Expand All @@ -125,7 +131,7 @@ def test_iid_weights(data, fml, adj, cluster_adj):
py_mod_vcov = py_mod._vcov
r_mod_vcov = stats.vcov(r_mod)

assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-16)
assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-15)


@pytest.mark.parametrize("fml", models)
Expand Down Expand Up @@ -165,7 +171,7 @@ def test_hetero_weights(data, fml, adj, cluster_adj):
py_mod_vcov = py_mod._vcov
r_mod_vcov = stats.vcov(r_mod)

assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) * adj_factor < 1e-16)
assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) * adj_factor < 1e-15)


@pytest.mark.parametrize("fml", models)
Expand All @@ -190,4 +196,4 @@ def test_crv1_weights(data, fml, adj, cluster_adj):
py_mod_vcov = py_mod._vcov
r_mod_vcov = stats.vcov(r_mod)

assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-16)
assert np.all(np.abs(py_mod_vcov) - np.abs(r_mod_vcov) < 1e-15)
Loading
Loading