Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add updates to wald_test method and unit testing files #536

Merged
merged 8 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .coverage
Binary file not shown.
148 changes: 103 additions & 45 deletions pyfixest/estimation/feols_.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@

import numpy as np
import pandas as pd
import polars as pl
from formulaic import Formula
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import lsqr, spsolve
from scipy.stats import f, norm, t
from scipy.stats import chi2, f, norm, t

from pyfixest.errors import VcovTypeNotSupportedError
from pyfixest.estimation.ritest import (
Expand Down Expand Up @@ -383,6 +384,10 @@
Feols
An instance of class [Feols(/reference/Feols.qmd) with updated inference.
"""
# Assuming `data` is the DataFrame in question
if isinstance(data, pl.DataFrame):
data = _polars_to_pandas(data)

Check warning on line 389 in pyfixest/estimation/feols_.py

View check run for this annotation

Codecov / codecov/patch

pyfixest/estimation/feols_.py#L389

Added line #L389 was not covered by tests

_data = self._data
_has_fixef = self._has_fixef
_is_iv = self._is_iv
Expand Down Expand Up @@ -439,7 +444,8 @@
if data is not None:
# use input data set
self._cluster_df = _get_cluster_df(
data=data, clustervar=self._clustervar
data=data,
clustervar=self._clustervar,
)
_check_cluster_df(cluster_df=self._cluster_df, data=data)
else:
Expand Down Expand Up @@ -789,13 +795,29 @@
else:
self._has_fixef = False

def wald_test(self, R=None, q=None, distribution="F") -> None:
def wald_test(self, R=None, q=None, distribution="F"):
"""
Conduct Wald test.

Compute a Wald test for a linear hypothesis of the form Rb = q.
Compute a Wald test for a linear hypothesis of the form R * β = q.
where R is m x k matrix, β is a k x 1 vector of coefficients,
and q is m x 1 vector.
By default, tests the joint null hypothesis that all coefficients are zero.

This method producues the following attriutes

_dfd : int
degree of freedom in denominator
_dfn : int
degree of freedom in numerator
_wald_statistic : scalar
Wald-statistics computed for hypothesis testing
_f_statistic : scalar
Wald-statistics(when R is an indentity matrix, and q being zero vector)
computed for hypothesis testing
_p_value : scalar
corresponding p-value for statistics

Parameters
----------
R : array-like, optional
Expand All @@ -812,65 +834,101 @@
-------
pd.Series
A pd.Series with the Wald statistic and p-value.
"""
raise ValueError("wald_tests will be released as a feature with pyfixest 0.14.")

Examples
--------
import numpy as np
import pandas as pd

from pyfixest.estimation.estimation import feols

data = pd.read_csv("pyfixest/did/data/df_het.csv")
data = data.iloc[1:3000]

R = np.array([[1,-1]] )
q = np.array([0.0])

fml = "dep_var ~ treat"
fit = feols(fml, data, vcov={"CRV1": "year"}, ssc=ssc(adj=False))

# Wald test
fit.wald_test(R=R, q=q, distribution = "chi2")
f_stat = fit._f_statistic
p_stat = fit._p_value

print(f"Python f_stat: {f_stat}")
print(f"Python p_stat: {p_stat}")

# The code above produces the following results :
# Python f_stat: 256.55432910297003
# Python p_stat: 9.67406627744023e-58
"""
_vcov = self._vcov
_N = self._N
_k = self._k
_beta_hat = self._beta_hat
_k_fe = np.sum(self._k_fe.values) if self._has_fixef else 0

dfn = _N - _k_fe - _k
dfd = _k

# if R is not two dimensional, make it two dimensional
if R is not None:
if R.ndim == 1:
R = R.reshape((1, len(R)))
assert (
R.shape[1] == _k
), "R must have the same number of columns as the number of coefficients."
else:
# If R is None, default to the identity matrix
if R is None:
R = np.eye(_k)

if q is not None:
assert isinstance(
q, (int, float, np.ndarray)
), "q must be a numeric scalar."
if isinstance(q, np.ndarray):
assert q.ndim == 1, "q must be a one-dimensional array or a scalar."
assert (
q.shape[0] == R.shape[0]
), "q must have the same number of rows as R."
warnings.warn(
"Note that the argument q is experimental and no unit tests are implemented. Please use with caution / take a look at the source code."
# Ensure R is two-dimensional
if R.ndim == 1:
R = R.reshape((1, len(R)))

if R.shape[1] != _k:
raise ValueError(
"The number of columns of R must be equal to the number of coefficients."
)
else:

# If q is None, default to a vector of zeros
if q is None:
q = np.zeros(R.shape[0])
else:
if not isinstance(q, (int, float, np.ndarray)):
raise ValueError("q must be a numeric scalar or array.")
if isinstance(q, np.ndarray):
if q.ndim != 1:
raise ValueError("q must be a one-dimensional array or a scalar.")
if q.shape[0] != R.shape[0]:
raise ValueError("q must have the same number of rows as R.")

Check warning on line 895 in pyfixest/estimation/feols_.py

View check run for this annotation

Codecov / codecov/patch

pyfixest/estimation/feols_.py#L895

Added line #L895 was not covered by tests

assert distribution in [
"F",
"chi2",
], "distribution must be either 'F' or 'chi2'."
n_restriction = R.shape[0]
self._dfn = n_restriction

if self._is_clustered:
self._dfd = np.min(np.array(self._G)) - 1
else:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it standard to include the fixed effects in the dof computation? I am actually not sure about this and will take a look at the fixest docs!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh sorry for the late reply, I didn't notice your comment until now. According to my impression of clubSanwich package, the fixed effects didn't change anything related with the inference part(it was just like demeaning the whole dataset and implementing estimation on this demeaned data and then doing inference). I will take a close look at this and update if serious corrections are necessary(I will PR this if they really are)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No worries - this is at least what fixest seems to be doing (see the function I posted somewhere in the PR) =) I think clubSandwich is not supporting fixed effects / fixest as far as I am aware (due to difficulties computing CRV2 and CRV3 vcov's with fixed effects I believe) - so I am afraid your chances to find anything there are probably not super big :/

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, I actually meant to say fixest. Thanks for letting me know. :)

self._dfd = _N - _k - _k_fe

bread = R @ _beta_hat - q
meat = np.linalg.inv(R @ _vcov @ R.T)
W = bread.T @ meat @ bread

# this is chi-squared(k) distributed, with k = number of coefficients
self._wald_statistic = W
self._f_statistic = W / dfd

# Check if distribution is "F" and R is not identity matrix
# or q is not zero vector
if distribution == "F" and (
not np.array_equal(R, np.eye(_k)) or not np.all(q == 0)
):
warnings.warn(
"Distribution changed to chi2, as R is not an identity matrix and q is not a zero vector."
)
distribution = "chi2"

if distribution == "F":
self._f_statistic_pvalue = f.sf(self._f_statistic, dfn=dfn, dfd=dfd)
# self._f_statistic_pvalue = 1 - chi2(df = _k).cdf(self._f_statistic)
self._f_statistic = W / self._dfn
self._p_value = 1 - f.cdf(self._f_statistic, dfn=self._dfn, dfd=self._dfd)
res = pd.Series({"statistic": self._f_statistic, "pvalue": self._p_value})
elif distribution == "chi2":
self._f_statistic = W / self._dfn
self._p_value = chi2.sf(self._wald_statistic, self._dfn)
res = pd.Series(
{"statistic": self._f_statistic, "pvalue": self._f_statistic_pvalue}
{"statistic": self._wald_statistic, "pvalue": self._p_value}
)
else:
raise NotImplementedError("chi2 distribution not yet implemented.")
# self._wald_pvalue = 1 - chi2(df = _k).cdf(self._wald_statistic)
raise ValueError("Distribution must be F or chi2")

return res

Expand Down Expand Up @@ -1529,12 +1587,12 @@
Return a tidy pd.DataFrame with the point estimates, standard errors,
t-statistics, and p-values.

Parameters
Parameters
----------
alpha: Optional[float]
The significance level for the confidence intervals. If None,
computes a 95% confidence interval (`alpha = 0.05`).
The significance level for the confidence intervals. If None,
computes a 95% confidence interval (`alpha = 0.05`).

Returns
-------
tidy_df : pd.DataFrame
Expand Down
38 changes: 37 additions & 1 deletion tests/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from pyfixest.estimation.FormulaParser import FixestFormulaParser
from pyfixest.estimation.multcomp import rwolf
from pyfixest.report.summarize import etable, summary
from pyfixest.utils.utils import get_data
from pyfixest.utils.utils import get_data, ssc


@pytest.fixture
Expand Down Expand Up @@ -407,3 +407,39 @@ def test_ritest_error(data):
fit = pf.feols("Y ~ X1", data=data)
fit.ritest(resampvar="X1", reps=100)
fit.plot_ritest()


def test_wald_test_invalid_distribution():
data = pd.read_csv("pyfixest/did/data/df_het.csv")
data = data.iloc[1:3000]

fml = "dep_var ~ treat"
fit = feols(fml, data, vcov={"CRV1": "year"}, ssc=ssc(adj=False))

with pytest.raises(ValueError):
fit.wald_test(R=np.array([[1, -1]]), distribution="abc")


def test_wald_test_R_q_column_consistency():
data = pd.read_csv("pyfixest/did/data/df_het.csv")
data = data.iloc[1:3000]
fml = "dep_var ~ treat"
fit = feols(fml, data, vcov={"CRV1": "year"}, ssc=ssc(adj=False))

# Test with R.size[1] == number of coeffcients
with pytest.raises(ValueError):
fit.wald_test(R=np.array([[1, 0, 0]]))

# Test with q type
with pytest.raises(ValueError):
fit.wald_test(R=np.array([[1, 0]]), q="invalid type q")

# Test with q being a one-dimensional array or a scalar.
with pytest.raises(ValueError):
fit.wald_test(R=np.array([[1, 0], [0, 1]]), q=np.array([[0, 1]]))

# q must have the same number of rows as R
with pytest.raises(ValueError):
fit.wald_test(
R=np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), q=np.array([[0, 1]])
)
Loading