From faf8282cf9614c9777e4e34956171aac756ccd0a Mon Sep 17 00:00:00 2001 From: Vincent Arel-Bundock Date: Sun, 1 Oct 2023 07:32:33 -0400 Subject: [PATCH] hypothesis=float --- NEWS.md | 2 + marginaleffects/comparisons.py | 5 ++- marginaleffects/hypotheses.py | 71 ++++++++++++++++++++++++++++++++-- marginaleffects/hypothesis.py | 6 ++- marginaleffects/predictions.py | 6 +-- marginaleffects/sanity.py | 9 +++++ marginaleffects/uncertainty.py | 9 +++-- pyproject.toml | 2 +- tests/test_hypotheses.py | 26 +++++++++++++ 9 files changed, 121 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index 37be5e1..22a63b2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # dev +* `hypothesis` accepts a float or integer to specify a different null hypothesis. + # 0.0.5 * `predictions()` supports categorical predictors when `newdata` does not include all levels (internal padding). diff --git a/marginaleffects/comparisons.py b/marginaleffects/comparisons.py index c8f963d..0c68179 100644 --- a/marginaleffects/comparisons.py +++ b/marginaleffects/comparisons.py @@ -9,7 +9,7 @@ from .estimands import estimands from .hypothesis import get_hypothesis from .predictions import get_predictions -from .sanity import sanitize_newdata, sanitize_variables, sanitize_vcov, sanitize_by +from .sanity import sanitize_newdata, sanitize_variables, sanitize_vcov, sanitize_by, sanitize_hypothesis_null from .transform import get_transform from .uncertainty import get_jacobian, get_se, get_z_p_ci from .utils import get_pad, sort_columns, upcast, get_modeldata @@ -101,6 +101,7 @@ def comparisons( V = sanitize_vcov(vcov, model) newdata = sanitize_newdata(model, newdata=newdata, wts=wts, by=by) modeldata = get_modeldata(model) + hypothesis_null = sanitize_hypothesis_null(hypothesis) # after sanitize_newdata() variables = sanitize_variables( @@ -268,7 +269,7 @@ def outer(x): J = get_jacobian(func=outer, coefs=model.params.to_numpy()) se = get_se(J, V) out = 
out.with_columns(pl.Series(se).alias("std_error")) - out = get_z_p_ci(out, model, conf_level=conf_level) + out = get_z_p_ci(out, model, conf_level=conf_level, hypothesis_null=hypothesis_null) out = get_transform(out, transform=transform) out = get_equivalence(out, equivalence=equivalence, df=np.inf) diff --git a/marginaleffects/hypotheses.py b/marginaleffects/hypotheses.py index f2be819..8f80496 100644 --- a/marginaleffects/hypotheses.py +++ b/marginaleffects/hypotheses.py @@ -2,16 +2,78 @@ import polars as pl from .hypothesis import get_hypothesis -from .sanity import sanitize_vcov +from .sanity import sanitize_vcov, sanitize_hypothesis_null from .uncertainty import get_jacobian, get_se, get_z_p_ci +from .equivalence import get_equivalence from .utils import sort_columns from .classes import MarginaleffectsDataFrame -def hypotheses(model, hypothesis=None, conf_level=0.95, vcov=True): - # sanity checks +def hypotheses(model, hypothesis=None, conf_level=0.95, vcov=True, equivalence=None): + """ + (Non-)Linear Tests for Null Hypotheses, Joint Hypotheses, Equivalence, Non Superiority, and Non Inferiority. + + This function calculates uncertainty estimates as first-order approximate standard errors for linear or non-linear + functions of a vector of random variables with known or estimated covariance matrix. It emulates the behavior of + the excellent and well-established `car::deltaMethod` and `car::linearHypothesis` functions in R, but it supports + more models; requires fewer dependencies; expands the range of tests to equivalence and superiority/inferiority; + and offers convenience features like robust standard errors. + + To learn more, visit the package website: + + Warning #1: Tests are conducted directly on the scale defined by the `type` argument. For some models, it can make + sense to conduct hypothesis or equivalence tests on the `"link"` scale instead of the `"response"` scale which is + often the default. 
+
+    Warning #2: For hypothesis tests on objects produced by the `marginaleffects` package, it is safer to use the
+    `hypothesis` argument of the original function. Using `hypotheses()` may not work in certain environments, in lists,
+    or when working programmatically with *apply style functions.
+
+    Warning #3: The tests assume that the `hypothesis` expression is (approximately) normally distributed, which for
+    non-linear functions of the parameters may not be realistic. More reliable confidence intervals can be obtained using
+    the `inferences()` function with `method = "boot"`.
+
+    Parameters:
+    model : object
+        Model object estimated by `statsmodels`
+    hypothesis : str, numpy.ndarray, int, or float, optional
+        The null hypothesis value. Default is None.
+    conf_level : float, optional
+        Confidence level for the hypothesis test. Default is 0.95.
+    vcov : bool, optional
+        Whether to use the covariance matrix in the hypothesis test. Default is True.
+    equivalence : tuple, optional
+        The equivalence range for the hypothesis test. Default is None.
+
+    Returns:
+    MarginaleffectsDataFrame
+        A DataFrame containing the results of the hypothesis tests.
+ + Examples: + + # When `hypothesis` is `None`, `hypotheses()` returns a DataFrame of parameters + hypotheses(model) + + # A different null hypothesis + hypotheses(model, hypothesis = 3) + + # Test of equality between coefficients + hypotheses(model, hypothesis="param1 = param2") + + # Non-linear function + hypotheses(model, hypothesis="exp(param1 + param2) = 0.1") + + # Robust standard errors + hypotheses(model, hypothesis="param1 = param2", vcov="HC3") + + # Equivalence, non-inferiority, and non-superiority tests + hypotheses(model, equivalence=(0, 10)) + """ + V = sanitize_vcov(vcov, model) + hypothesis_null = sanitize_hypothesis_null(hypothesis) + # estimands def fun(x): out = pl.DataFrame({"estimate": x}) @@ -23,7 +85,8 @@ def fun(x): J = get_jacobian(fun, model.params.to_numpy()) se = get_se(J, V) out = out.with_columns(pl.Series(se).alias("std_error")) - out = get_z_p_ci(out, model, conf_level=conf_level) + out = get_z_p_ci(out, model, conf_level=conf_level, hypothesis_null=hypothesis_null) + out = get_equivalence(out, equivalence=equivalence) out = sort_columns(out, by=None) out = MarginaleffectsDataFrame(out, conf_level=conf_level) return out diff --git a/marginaleffects/hypothesis.py b/marginaleffects/hypothesis.py index 8ea992f..fa8d30c 100644 --- a/marginaleffects/hypothesis.py +++ b/marginaleffects/hypothesis.py @@ -47,8 +47,9 @@ def eval_string_function(vec, hypothesis, rowlabels): # function extracts the estimate column from a data frame and sets it to x. If `hypothesis` argument is a numpy array, it feeds it directly to lincome_multiply. If lincome is a string, it checks if the string is valid, and then calls the corresponding function. def get_hypothesis(x, hypothesis): - msg = f"Invalid hypothesis argument: {hypothesis}. Valid arguments are: 'reference', 'revreference', 'sequential', 'revsequential', 'pairwise', 'revpairwise' or a numpy array." - if hypothesis is None: + msg = f"Invalid hypothesis argument: {hypothesis}. 
Valid arguments are: 'reference', 'revreference', 'sequential', 'revsequential', 'pairwise', 'revpairwise' or a numpy array or a float." + + if hypothesis is None or isinstance(hypothesis, (int, float)): return x if isinstance(hypothesis, np.ndarray): out = lincom_multiply(x, hypothesis) @@ -70,6 +71,7 @@ def get_hypothesis(x, hypothesis): elif hypothesis == "revpairwise": hypmat = lincom_revpairwise(x) else: + raise ValueError(msg) out = lincom_multiply(x, hypmat.to_numpy()) out = out.with_columns(pl.Series(hypothesis.columns).alias("term")) diff --git a/marginaleffects/predictions.py b/marginaleffects/predictions.py index cce02a5..04d1d46 100644 --- a/marginaleffects/predictions.py +++ b/marginaleffects/predictions.py @@ -5,7 +5,7 @@ from .by import get_by from .equivalence import get_equivalence from .hypothesis import get_hypothesis -from .sanity import sanitize_newdata, sanitize_vcov, get_variables_names, sanitize_by +from .sanity import sanitize_newdata, sanitize_vcov, get_variables_names, sanitize_by, sanitize_hypothesis_null from .transform import get_transform from .uncertainty import get_jacobian, get_se, get_z_p_ci from .utils import sort_columns, get_modeldata, get_pad, upcast @@ -88,12 +88,12 @@ def predictions( - conf_low: lower bound of the confidence interval (or equal-tailed interval for Bayesian models) - conf_high: upper bound of the confidence interval (or equal-tailed interval for Bayesian models) """ - pass # sanity checks by = sanitize_by(by) V = sanitize_vcov(vcov, model) newdata = sanitize_newdata(model, newdata, wts=wts, by=by) + hypothesis_null = sanitize_hypothesis_null(hypothesis) # pad modeldata = get_modeldata(model) @@ -126,7 +126,7 @@ def fun(x): J = get_jacobian(fun, model.params) se = get_se(J, V) out = out.with_columns(pl.Series(se).alias("std_error")) - out = get_z_p_ci(out, model, conf_level=conf_level) + out = get_z_p_ci(out, model, conf_level=conf_level, hypothesis_null=hypothesis_null) out = get_transform(out, 
transform=transform) out = get_equivalence(out, equivalence=equivalence) out = sort_columns(out, by=by) diff --git a/marginaleffects/sanity.py b/marginaleffects/sanity.py index bd17907..45fc428 100644 --- a/marginaleffects/sanity.py +++ b/marginaleffects/sanity.py @@ -455,3 +455,12 @@ def sanitize_variables(variables, model, newdata, comparison, eps, by, wts=None) out = [item for sublist in out for item in sublist] return out + + + +def sanitize_hypothesis_null(hypothesis): + if isinstance(hypothesis, (int, float)): + hypothesis_null = hypothesis + else: + hypothesis_null = 0 + return hypothesis_null \ No newline at end of file diff --git a/marginaleffects/uncertainty.py b/marginaleffects/uncertainty.py index 16e4059..e071a7d 100644 --- a/marginaleffects/uncertainty.py +++ b/marginaleffects/uncertainty.py @@ -37,10 +37,12 @@ def get_se(J, V): return se -def get_z_p_ci(df, model, conf_level): +def get_z_p_ci(df, model, conf_level, hypothesis_null=0): if "std_error" not in df.columns: return df - df = df.with_columns((pl.col("estimate") / pl.col("std_error")).alias("statistic")) + df = df.with_columns( + ((pl.col("estimate") - float(hypothesis_null)) / pl.col("std_error")).alias("statistic") + ) if hasattr(model, "df_resid") and isinstance(model.df_resid, float): dof = model.df_resid else: @@ -53,6 +55,7 @@ def get_z_p_ci(df, model, conf_level): df = df.with_columns( (pl.col("estimate") + critical_value * pl.col("std_error")).alias("conf_high") ) + df = df.with_columns( pl.col("statistic") .apply(lambda x: (2 * (1 - stats.t.cdf(np.abs(x), dof)))) @@ -66,4 +69,4 @@ def get_z_p_ci(df, model, conf_level): ) except: pass - return df + return df \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 499c3f2..b10388d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marginaleffects" -version = "0.0.5.9000" +version = "0.0.5.9001" description = "" authors = ["Vincent Arel-Bundock "] readme = "README.md" diff --git 
a/tests/test_hypotheses.py b/tests/test_hypotheses.py
index 1d8795f..29b7517 100644
--- a/tests/test_hypotheses.py
+++ b/tests/test_hypotheses.py
@@ -8,6 +8,8 @@
 mod = smf.ols("Literacy ~ Pop1831 * Desertion", dat).fit()
 
+avg_predictions(mod, by = "Region")
+
 def test_coefs():
     hyp_py = hypotheses(mod, hypothesis = np.array([1, -1, 0, 0]))
     hyp_r = pl.read_csv("tests/r/test_hypotheses_coefs.csv")
@@ -21,3 +23,27 @@ def test_comparisons():
     # absolute because the order of rows is different in R and Python `comparisons()` output
     assert_series_equal(hyp_r["estimate"].abs(), hyp_py["estimate"].abs())
     assert_series_equal(hyp_r["std.error"], hyp_py["std_error"], check_names = False)
+
+
+def test_null_hypothesis():
+    # NOTE(review): hypotheses() has no `hypothesis_null` keyword; the default null is 0
+    hyp_py_0 = hypotheses(mod, hypothesis=np.array([1, -1, 0, 0]))
+    hyp_r_0 = pl.read_csv("tests/r/test_hypotheses_coefs.csv")
+    assert_series_equal(hyp_r_0["estimate"], hyp_py_0["estimate"])
+    assert_series_equal(hyp_r_0["std.error"], hyp_py_0["std_error"], check_names=False)
+
+    # NOTE(review): estimate/std_error are invariant to the null, so the reference file must agree
+    hyp_py_1 = hypotheses(mod, hypothesis=np.array([1, -1, 0, 0]))
+    hyp_r_1 = pl.read_csv("tests/r/test_hypotheses_coefs_hypothesis_1.csv")
+    assert_series_equal(hyp_r_1["estimate"], hyp_py_1["estimate"])
+    assert_series_equal(hyp_r_1["std.error"], hyp_py_1["std_error"], check_names=False)
+
+
+def test_hypothesis_list():
+    # Test statistics computed in R with a null hypothesis of 3
+    hypothesis_values = [0.4630551, -112.8876651, -10.6664417, -5384.2708089]
+    mod = smf.ols("Literacy ~ Pop1831 * Desertion", dat).fit()
+    hyp = hypotheses(mod, hypothesis=3)
+    assert np.allclose(hyp["statistic"], hypothesis_values)
+    hyp = hypotheses(mod, hypothesis=3.)
+    assert np.allclose(hyp["statistic"], hypothesis_values)