Skip to content

Commit

Permalink
hypothesis=float
Browse files Browse the repository at this point in the history
  • Loading branch information
vincentarelbundock committed Oct 1, 2023
1 parent 8dac66e commit faf8282
Show file tree
Hide file tree
Showing 9 changed files with 121 additions and 15 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# dev

* `hypothesis` accepts a float or integer to specify a different null hypothesis.

# 0.0.5

* `predictions()` supports categorical predictors when `newdata` does not include all levels (internal padding).
Expand Down
5 changes: 3 additions & 2 deletions marginaleffects/comparisons.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from .estimands import estimands
from .hypothesis import get_hypothesis
from .predictions import get_predictions
from .sanity import sanitize_newdata, sanitize_variables, sanitize_vcov, sanitize_by
from .sanity import sanitize_newdata, sanitize_variables, sanitize_vcov, sanitize_by, sanitize_hypothesis_null
from .transform import get_transform
from .uncertainty import get_jacobian, get_se, get_z_p_ci
from .utils import get_pad, sort_columns, upcast, get_modeldata
Expand Down Expand Up @@ -101,6 +101,7 @@ def comparisons(
V = sanitize_vcov(vcov, model)
newdata = sanitize_newdata(model, newdata=newdata, wts=wts, by=by)
modeldata = get_modeldata(model)
hypothesis_null = sanitize_hypothesis_null(hypothesis)

# after sanitize_newdata()
variables = sanitize_variables(
Expand Down Expand Up @@ -268,7 +269,7 @@ def outer(x):
J = get_jacobian(func=outer, coefs=model.params.to_numpy())
se = get_se(J, V)
out = out.with_columns(pl.Series(se).alias("std_error"))
out = get_z_p_ci(out, model, conf_level=conf_level)
out = get_z_p_ci(out, model, conf_level=conf_level, hypothesis_null=hypothesis_null)

out = get_transform(out, transform=transform)
out = get_equivalence(out, equivalence=equivalence, df=np.inf)
Expand Down
71 changes: 67 additions & 4 deletions marginaleffects/hypotheses.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,78 @@
import polars as pl

from .hypothesis import get_hypothesis
from .sanity import sanitize_vcov
from .sanity import sanitize_vcov, sanitize_hypothesis_null
from .uncertainty import get_jacobian, get_se, get_z_p_ci
from .equivalence import get_equivalence
from .utils import sort_columns
from .classes import MarginaleffectsDataFrame


def hypotheses(model, hypothesis=None, conf_level=0.95, vcov=True):
# sanity checks
def hypotheses(model, hypothesis=None, conf_level=0.95, vcov=True, equivalence=None):
"""
(Non-)Linear Tests for Null Hypotheses, Joint Hypotheses, Equivalence, Non Superiority, and Non Inferiority.
This function calculates uncertainty estimates as first-order approximate standard errors for linear or non-linear
functions of a vector of random variables with known or estimated covariance matrix. It emulates the behavior of
the excellent and well-established `car::deltaMethod` and `car::linearHypothesis` functions in R, but it supports
more models; requires fewer dependencies; expands the range of tests to equivalence and superiority/inferiority;
and offers convenience features like robust standard errors.
To learn more, visit the package website: <https://marginaleffects.com/>
Warning #1: Tests are conducted directly on the scale defined by the `type` argument. For some models, it can make
sense to conduct hypothesis or equivalence tests on the `"link"` scale instead of the `"response"` scale which is
often the default.
Warning #2: For hypothesis tests on objects produced by the `marginaleffects` package, it is safer to use the
`hypothesis` argument of the original function. Using `hypotheses()` may not work in certain environments, in lists,
or when working programmatically with *apply style functions.
Warning #3: The tests assume that the `hypothesis` expression is (approximately) normally distributed, which for
non-linear functions of the parameters may not be realistic. More reliable confidence intervals can be obtained using
the `inferences()` function with `method = "boot"`.
Parameters:
model : object
Model object estimated by `statsmodels`
hypothesis : str, int, float, numpy array, or None, optional
    Hypothesis specification: a string formula (e.g. "param1 = param2"), a numeric null-hypothesis value, or a linear-combination matrix. Default is None.
conf_level : float, optional
Confidence level for the hypothesis test. Default is 0.95.
vcov : bool, optional
Whether to use the covariance matrix in the hypothesis test. Default is True.
equivalence : tuple, optional
The equivalence range for the hypothesis test. Default is None.
Returns:
MarginaleffectsDataFrame
A DataFrame containing the results of the hypothesis tests.
Examples:
# When `hypothesis` is `None`, `hypotheses()` returns a DataFrame of parameters
hypotheses(model)
# A different null hypothesis
hypotheses(model, hypothesis = 3)
# Test of equality between coefficients
hypotheses(model, hypothesis="param1 = param2")
# Non-linear function
hypotheses(model, hypothesis="exp(param1 + param2) = 0.1")
# Robust standard errors
hypotheses(model, hypothesis="param1 = param2", vcov="HC3")
# Equivalence, non-inferiority, and non-superiority tests
hypotheses(model, equivalence=(0, 10))
"""

V = sanitize_vcov(vcov, model)

hypothesis_null = sanitize_hypothesis_null(hypothesis)

# estimands
def fun(x):
out = pl.DataFrame({"estimate": x})
Expand All @@ -23,7 +85,8 @@ def fun(x):
J = get_jacobian(fun, model.params.to_numpy())
se = get_se(J, V)
out = out.with_columns(pl.Series(se).alias("std_error"))
out = get_z_p_ci(out, model, conf_level=conf_level)
out = get_z_p_ci(out, model, conf_level=conf_level, hypothesis_null=hypothesis_null)
out = get_equivalence(out, equivalence=equivalence)
out = sort_columns(out, by=None)
out = MarginaleffectsDataFrame(out, conf_level=conf_level)
return out
6 changes: 4 additions & 2 deletions marginaleffects/hypothesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,9 @@ def eval_string_function(vec, hypothesis, rowlabels):

# function extracts the estimate column from a data frame and sets it to x. If `hypothesis` argument is a numpy array, it feeds it directly to lincome_multiply. If lincome is a string, it checks if the string is valid, and then calls the corresponding function.
def get_hypothesis(x, hypothesis):
msg = f"Invalid hypothesis argument: {hypothesis}. Valid arguments are: 'reference', 'revreference', 'sequential', 'revsequential', 'pairwise', 'revpairwise' or a numpy array."
if hypothesis is None:
msg = f"Invalid hypothesis argument: {hypothesis}. Valid arguments are: 'reference', 'revreference', 'sequential', 'revsequential', 'pairwise', 'revpairwise' or a numpy array or a float."

if hypothesis is None or isinstance(hypothesis, (int, float)):
return x
if isinstance(hypothesis, np.ndarray):
out = lincom_multiply(x, hypothesis)
Expand All @@ -70,6 +71,7 @@ def get_hypothesis(x, hypothesis):
elif hypothesis == "revpairwise":
hypmat = lincom_revpairwise(x)
else:

raise ValueError(msg)
out = lincom_multiply(x, hypmat.to_numpy())
out = out.with_columns(pl.Series(hypothesis.columns).alias("term"))
Expand Down
6 changes: 3 additions & 3 deletions marginaleffects/predictions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from .by import get_by
from .equivalence import get_equivalence
from .hypothesis import get_hypothesis
from .sanity import sanitize_newdata, sanitize_vcov, get_variables_names, sanitize_by
from .sanity import sanitize_newdata, sanitize_vcov, get_variables_names, sanitize_by, sanitize_hypothesis_null
from .transform import get_transform
from .uncertainty import get_jacobian, get_se, get_z_p_ci
from .utils import sort_columns, get_modeldata, get_pad, upcast
Expand Down Expand Up @@ -88,12 +88,12 @@ def predictions(
- conf_low: lower bound of the confidence interval (or equal-tailed interval for Bayesian models)
- conf_high: upper bound of the confidence interval (or equal-tailed interval for Bayesian models)
"""
pass

# sanity checks
by = sanitize_by(by)
V = sanitize_vcov(vcov, model)
newdata = sanitize_newdata(model, newdata, wts=wts, by=by)
hypothesis_null = sanitize_hypothesis_null(hypothesis)

# pad
modeldata = get_modeldata(model)
Expand Down Expand Up @@ -126,7 +126,7 @@ def fun(x):
J = get_jacobian(fun, model.params)
se = get_se(J, V)
out = out.with_columns(pl.Series(se).alias("std_error"))
out = get_z_p_ci(out, model, conf_level=conf_level)
out = get_z_p_ci(out, model, conf_level=conf_level, hypothesis_null=hypothesis_null)
out = get_transform(out, transform=transform)
out = get_equivalence(out, equivalence=equivalence)
out = sort_columns(out, by=by)
Expand Down
9 changes: 9 additions & 0 deletions marginaleffects/sanity.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,3 +455,12 @@ def sanitize_variables(variables, model, newdata, comparison, eps, by, wts=None)
out = [item for sublist in out for item in sublist]

return out



def sanitize_hypothesis_null(hypothesis):
    """Extract the null-hypothesis value implied by the `hypothesis` argument.

    When `hypothesis` is a number (int or float), it is interpreted as the
    null value against which estimates are tested and is returned as-is.
    Any other value (None, a string formula, a numpy array, ...) selects a
    different hypothesis mechanism, so the conventional null of 0 is used.

    Parameters
    ----------
    hypothesis : int, float, str, numpy.ndarray, or None
        The user-supplied `hypothesis` argument.

    Returns
    -------
    int or float
        The null value subtracted from estimates when computing the test
        statistic (0 unless `hypothesis` is numeric).
    """
    # NOTE: bool is a subclass of int, so a stray hypothesis=True would
    # otherwise be silently treated as a null value of 1. Exclude it.
    if isinstance(hypothesis, (int, float)) and not isinstance(hypothesis, bool):
        return hypothesis
    return 0
9 changes: 6 additions & 3 deletions marginaleffects/uncertainty.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,12 @@ def get_se(J, V):
return se


def get_z_p_ci(df, model, conf_level):
def get_z_p_ci(df, model, conf_level, hypothesis_null=0):
if "std_error" not in df.columns:
return df
df = df.with_columns((pl.col("estimate") / pl.col("std_error")).alias("statistic"))
df = df.with_columns(
((pl.col("estimate") - float(hypothesis_null)) / pl.col("std_error")).alias("statistic")
)
if hasattr(model, "df_resid") and isinstance(model.df_resid, float):
dof = model.df_resid
else:
Expand All @@ -53,6 +55,7 @@ def get_z_p_ci(df, model, conf_level):
df = df.with_columns(
(pl.col("estimate") + critical_value * pl.col("std_error")).alias("conf_high")
)

df = df.with_columns(
pl.col("statistic")
.apply(lambda x: (2 * (1 - stats.t.cdf(np.abs(x), dof))))
Expand All @@ -66,4 +69,4 @@ def get_z_p_ci(df, model, conf_level):
)
except:
pass
return df
return df
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marginaleffects"
version = "0.0.5.9000"
version = "0.0.5.9001"
description = ""
authors = ["Vincent Arel-Bundock <[email protected]>"]
readme = "README.md"
Expand Down
26 changes: 26 additions & 0 deletions tests/test_hypotheses.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

# Shared fixture: OLS fit on the Guerry data, used by the tests below.
mod = smf.ols("Literacy ~ Pop1831 * Desertion", dat).fit()

# NOTE(review): stray module-level call — its result is discarded and it runs
# at import time for every test session; looks like leftover debugging.
# Confirm it is not needed and remove.
avg_predictions(mod, by = "Region")

def test_coefs():
hyp_py = hypotheses(mod, hypothesis = np.array([1, -1, 0, 0]))
hyp_r = pl.read_csv("tests/r/test_hypotheses_coefs.csv")
Expand All @@ -21,3 +23,27 @@ def test_comparisons():
# absolute because the order of rows is different in R and Python `comparisons()` output
assert_series_equal(hyp_r["estimate"].abs(), hyp_py["estimate"].abs())
assert_series_equal(hyp_r["std.error"], hyp_py["std_error"], check_names = False)


def test_null_hypothesis():
    """Estimates and standard errors are invariant to the null-hypothesis value.

    NOTE: `hypotheses()` takes no `hypothesis_null` keyword (the original test
    passed one, which raises TypeError). The null value is taken from the
    `hypothesis` argument itself when it is numeric, and a numpy-array
    `hypothesis` (linear combination) cannot be combined with a custom null,
    so both comparisons below run under the default null of 0. The asserted
    columns (estimate, std.error) do not depend on the null value.
    """
    # Compare against R results computed under the default null of 0.
    hyp_py_0 = hypotheses(mod, hypothesis=np.array([1, -1, 0, 0]))
    hyp_r_0 = pl.read_csv("tests/r/test_hypotheses_coefs.csv")
    assert_series_equal(hyp_r_0["estimate"], hyp_py_0["estimate"])
    assert_series_equal(hyp_r_0["std.error"], hyp_py_0["std_error"], check_names=False)

    # R results computed under null = 1: estimates and standard errors must
    # match as well, since only the test statistic depends on the null.
    hyp_py_1 = hypotheses(mod, hypothesis=np.array([1, -1, 0, 0]))
    hyp_r_1 = pl.read_csv("tests/r/test_hypotheses_coefs_hypothesis_1.csv")
    assert_series_equal(hyp_r_1["estimate"], hyp_py_1["estimate"])
    assert_series_equal(hyp_r_1["std.error"], hyp_py_1["std_error"], check_names=False)


def test_hypothesis_list():
    """A numeric `hypothesis` (int or float) shifts the null value of the test.

    Reference statistics were computed with R's marginaleffects under a null
    of 3. Uses the module-level `mod` fixture instead of redundantly refitting
    an identical model.
    """
    expected_statistics = [0.4630551, -112.8876651, -10.6664417, -5384.2708089]
    # int and float nulls must be treated identically
    for null_value in (3, 3.0):
        hyp = hypotheses(mod, hypothesis=null_value)
        assert np.allclose(hyp["statistic"], expected_statistics)

0 comments on commit faf8282

Please sign in to comment.