
Scikit-Learn support #145

Merged
37 commits merged on Dec 18, 2024
Commits (37)
4b89f81
scikit-learn support
vincentarelbundock Dec 15, 2024
fcd5e70
abstract default methods
vincentarelbundock Dec 15, 2024
2d0eae7
null V in comparisons
vincentarelbundock Dec 15, 2024
419d791
cleanup scikit class
vincentarelbundock Dec 15, 2024
3516ef0
scikit multiclass
vincentarelbundock Dec 16, 2024
33692c2
minor
vincentarelbundock Dec 16, 2024
b6d365d
fixup
vincentarelbundock Dec 16, 2024
db271c0
gitignore
vincentarelbundock Dec 16, 2024
123eafa
lint
vincentarelbundock Dec 16, 2024
c58255e
comments
vincentarelbundock Dec 16, 2024
3cb9de8
formulaic module
vincentarelbundock Dec 16, 2024
d52339b
type validation
vincentarelbundock Dec 16, 2024
536d43c
init
vincentarelbundock Dec 16, 2024
7619496
Scikit -> Sklearn
vincentarelbundock Dec 16, 2024
96918fb
rename methods
vincentarelbundock Dec 17, 2024
d6886eb
sanitize_model cleanup
vincentarelbundock Dec 17, 2024
76735a6
simplification
vincentarelbundock Dec 17, 2024
f44749a
simplification
vincentarelbundock Dec 17, 2024
8d0753a
lint
vincentarelbundock Dec 17, 2024
4c4d0b3
minor
vincentarelbundock Dec 17, 2024
92d79b5
ingest pandas everywhere
vincentarelbundock Dec 17, 2024
3e7f456
pydantic dependency
vincentarelbundock Dec 17, 2024
44f7c8c
clean ingest()
vincentarelbundock Dec 17, 2024
72e1449
tests pass
vincentarelbundock Dec 18, 2024
51c7958
lint
vincentarelbundock Dec 18, 2024
ed41578
minor
vincentarelbundock Dec 18, 2024
c376e92
simplify
vincentarelbundock Dec 18, 2024
c0a927a
model.modeldata -> model.data
vincentarelbundock Dec 18, 2024
ff3e7cb
remove get_modeldata()
vincentarelbundock Dec 18, 2024
710d816
deprecate get_formula()
vincentarelbundock Dec 18, 2024
cc67765
comment
vincentarelbundock Dec 18, 2024
bfc1955
fit_sklearn fit_statsmodels
vincentarelbundock Dec 18, 2024
4835fc2
fit_statsmodels in the statsmodels file
vincentarelbundock Dec 18, 2024
bdc6c9f
dependencies
vincentarelbundock Dec 18, 2024
d91b810
minor
vincentarelbundock Dec 18, 2024
8374b00
find_variables -> find_predictors
vincentarelbundock Dec 18, 2024
e5c19c3
bump
vincentarelbundock Dec 18, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -7,6 +7,7 @@ __pypackages__/
site/
debug.py
debug.R
build/

# Poetry files
/dist/
2 changes: 1 addition & 1 deletion NEWS.md
@@ -27,7 +27,7 @@ Breaking change:
# 0.0.9

* Issue #90: Informative error on reserved keyword like 'group'.
* Issue #91: get_variables_names() in class ModelStatsmodels does not return all variables which causes errors
* Issue #91: find_variables() in class ModelStatsmodels does not return all variables which causes errors

# 0.0.8

4 changes: 4 additions & 0 deletions marginaleffects/__init__.py
@@ -6,6 +6,8 @@
from .plot_slopes import plot_slopes
from .predictions import avg_predictions, predictions
from .slopes import avg_slopes, slopes
from .model_statsmodels import fit_statsmodels
from .model_sklearn import fit_sklearn

__all__ = [
"avg_comparisons",
@@ -19,4 +21,6 @@
"predictions",
"avg_slopes",
"slopes",
"fit_statsmodels",
"fit_sklearn",
]
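
The `__init__.py` change above exposes two new model-fitting entry points. Below is a minimal usage sketch, assuming `fit_sklearn()` and `fit_statsmodels()` accept a formula string, a data frame, and a fitting engine; the engine arguments shown are assumptions for illustration, not verbatim from this PR.

```python
# Hedged usage sketch of the new entry points exported from __init__.py.
import polars as pl
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from marginaleffects import fit_sklearn, fit_statsmodels, avg_predictions

data = pl.DataFrame({
    "y": [1.0, 3.0, 2.0, 5.0, 4.0, 6.0],
    "x": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
    "z": [1.0, 0.0, 1.0, 0.0, 1.0, 0.0],
})

# Engine arguments are assumptions: a scikit-learn estimator instance and a
# statsmodels model class, respectively.
mod_sk = fit_sklearn("y ~ x + z", data=data, engine=LinearRegression())
mod_sm = fit_statsmodels("y ~ x + z", data=data, engine=sm.OLS)

print(avg_predictions(mod_sk))
print(avg_predictions(mod_sm))
```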
3 changes: 3 additions & 0 deletions marginaleffects/by.py
@@ -1,4 +1,5 @@
import polars as pl
import numpy as np


def get_by(model, estimand, newdata, by=None, wts=None):
@@ -25,6 +26,8 @@ def get_by(model, estimand, newdata, by=None, wts=None):
out = pl.DataFrame({"estimate": estimand["estimate"]})

by = [x for x in by if x in out.columns]
by = np.unique(by)

if isinstance(by, list) and len(by) == 0:
return out

14 changes: 7 additions & 7 deletions marginaleffects/comparisons.py
@@ -19,7 +19,7 @@
)
from .transform import get_transform
from .uncertainty import get_jacobian, get_se, get_z_p_ci
from .utils import get_pad, sort_columns, upcast
from .utils import get_pad, sort_columns, upcast, ingest
from .model_pyfixest import ModelPyfixest


@@ -125,7 +125,7 @@ def comparisons(
by = sanitize_by(by)
V = sanitize_vcov(vcov, model)
newdata = sanitize_newdata(model, newdata=newdata, wts=wts, by=by)
modeldata = model.modeldata
modeldata = model.data
hypothesis_null = sanitize_hypothesis_null(hypothesis)

# For each variable in `variables`, this will return two values that we want
@@ -177,7 +177,7 @@
# Hack: We run into Patsy-related issues unless we "pad" the
# character/categorical variables to include all unique levels. We add them
# here but drop them after creating the design matrices.
vars = model.get_variables_names(variables=None, newdata=modeldata)
vars = model.find_variables()
vars = [re.sub(r"\[.*", "", x) for x in vars]
vars = list(set(vars))
for v in vars:
@@ -219,9 +219,9 @@
lo_X = lo
nd_X = nd
else:
y, hi_X = patsy.dmatrices(model.formula, hi.to_pandas())
y, lo_X = patsy.dmatrices(model.formula, lo.to_pandas())
y, nd_X = patsy.dmatrices(model.formula, nd.to_pandas())
y, hi_X = patsy.dmatrices(model.formula, ingest(hi).to_pandas())
y, lo_X = patsy.dmatrices(model.formula, ingest(lo).to_pandas())
y, nd_X = patsy.dmatrices(model.formula, ingest(nd).to_pandas())

# unpad
if pad.shape[0] > 0:
@@ -328,7 +328,7 @@ def outer(x):
out = outer(model.coef)

# Compute standard errors and confidence intervals
if vcov is not None and vcov is not False:
if vcov is not None and vcov is not False and V is not None:
J = get_jacobian(func=outer, coefs=model.coef, eps_vcov=eps_vcov)
se = get_se(J, V)
out = out.with_columns(pl.Series(se).alias("std_error"))
4 changes: 2 additions & 2 deletions marginaleffects/datagrid.py
@@ -81,7 +81,7 @@ def datagrid(
model = sanitize_model(model)

if newdata is None:
newdata = model.modeldata
newdata = model.data

if grid_type == "counterfactual":
return datagridcf(model=model, newdata=newdata, **kwargs)
@@ -143,7 +143,7 @@ def datagridcf(model=None, newdata=None, **kwargs):
model = sanitize_model(model)

if newdata is None:
newdata = model.modeldata
newdata = model.data

if "rowid" not in newdata.columns:
newdata = newdata.with_columns(
39 changes: 39 additions & 0 deletions marginaleffects/formulaic.py
@@ -0,0 +1,39 @@
import formulaic
import polars as pl
from .utils import validate_types


@validate_types
def variables(formula: str):
tok = formulaic.parser.DefaultFormulaParser().get_tokens(formula)
tok = [t for t in tok if t.kind.value == "name"]
tok = [str(t) for t in tok]
return tok


@validate_types
def listwise_deletion(formula: str, data: pl.DataFrame):
vars = variables(formula)
return data.drop_nulls(subset=vars)


def model_matrices(formula: str, data: pl.DataFrame, formula_engine: str = "formulaic"):
if formula_engine == "formulaic":
endog, exog = formulaic.model_matrix(formula, data.to_pandas())
endog = endog.to_numpy()
exog = exog.to_numpy()
return endog, exog
elif formula_engine == "patsy":
try:
import patsy
except ImportError:
raise ImportError("The patsy package is required to use this feature.")
if isinstance(formula, str):
import re

formula = re.sub(".*~", "", formula)
exog = patsy.dmatrix(formula, data.to_pandas())
return None, exog


__all__ = ["listwise_deletion", "model_matrices"]
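
The new `formulaic.py` helper module above wires the formulaic parser into the package. A short sketch of how the helpers defined in this file compose, using only the functions shown in the diff:

```python
# Sketch composing the helpers from marginaleffects/formulaic.py as defined above.
import polars as pl
from marginaleffects.formulaic import listwise_deletion, model_matrices

data = pl.DataFrame({
    "y": [1.0, 2.0, None, 4.0],
    "x": [0.0, None, 2.0, 3.0],
    "w": [9.0, 9.0, 9.0, 9.0],  # not in the formula, so its values are ignored
})

# Drop rows with nulls in any variable that appears in the formula.
clean = listwise_deletion("y ~ x", data)  # keeps rows 0 and 3

# Build response and design matrices with formulaic (the default engine).
endog, exog = model_matrices("y ~ x", clean)
print(endog.shape, exog.shape)  # (2, 1) and (2, 2): intercept plus x
```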
2 changes: 1 addition & 1 deletion marginaleffects/hypotheses.py
@@ -93,7 +93,7 @@ def hypotheses(

# estimands
def fun(x):
out = pl.DataFrame({"term": model.get_coef_names(), "estimate": x})
out = pl.DataFrame({"term": model.find_coef(), "estimate": x})
out = get_hypothesis(out, hypothesis=hypothesis)
return out

2 changes: 1 addition & 1 deletion marginaleffects/hypotheses_joint.py
@@ -12,7 +12,7 @@ def joint_hypotheses(obj, joint_index=None, joint_test="f", hypothesis=0):
# theta_hat: P x 1 vector of estimated parameters
theta_hat = obj.get_coef()

var_names = obj.get_variables_names()
var_names = obj.find_predictors()

if len(theta_hat) == len(var_names) + 1:
var_names = ["Intercept"] + var_names
64 changes: 35 additions & 29 deletions marginaleffects/model_abstract.py
@@ -2,73 +2,79 @@
import polars as pl
from abc import ABC, abstractmethod
from .utils import get_type_dictionary
from . import formulaic as fml


class ModelAbstract(ABC):
def __init__(self, model):
self.model = model
self.formula_engine = "formulaic"
self.validate_coef()
self.validate_modeldata()
self.validate_response_name()
self.validate_formula()
self.variables_type = get_type_dictionary(self.modeldata)
self.variables_type = get_type_dictionary(self.data)

def validate_coef(self):
coef = self.get_coef()
if not isinstance(coef, np.ndarray):
if not isinstance(coef, np.ndarray) and coef is not None:
raise ValueError("coef must be a numpy array")
self.coef = coef

def validate_modeldata(self):
modeldata = self.get_modeldata()
if not isinstance(modeldata, pl.DataFrame):
raise ValueError("modeldata must be a Polars DataFrame")
self.modeldata = modeldata
if not isinstance(self.data, pl.DataFrame):
raise ValueError("data attribute must be a Polars DataFrame")

def validate_response_name(self):
response_name = self.get_response_name()
response_name = self.find_response()
if not isinstance(response_name, str):
raise ValueError("response_name must be a string")
self.response_name = response_name

def validate_formula(self):
formula = self.get_formula()
formula = self.formula

if not isinstance(formula, str):
raise ValueError("formula must be a string")

if "~" not in formula:
raise ValueError(
"Model formula must contain '~' to separate dependent and independent variables"
)

if "scale(" in formula or "center(" in formula:
raise ValueError(
"The formula cannot include scale( or center(. Please center your variables before fitting the model."
)
self.formula = formula

@abstractmethod
def get_vcov(self):
pass
def get_vcov(self, vcov=False):
return None

@abstractmethod
def get_modeldata(self):
pass
def get_coef(self):
return None

@abstractmethod
def get_response_name(self):
pass
def find_coef(self):
return None

# names of the variables in the original dataset, excluding interactions, intercept, etc.
@abstractmethod
def get_variables_names(self):
pass
def find_variables(self, variables=None, newdata=None):
out = fml.variables(self.formula)
return out

# names of the parameters
@abstractmethod
def get_coef_names(self):
pass
def find_response(self):
vars = self.find_variables()
if vars is None:
return None
else:
return vars[0]

@abstractmethod
def get_predict(self):
pass
def find_predictors(self):
vars = self.find_variables()
if vars is None:
return None
else:
return vars[1:]

@abstractmethod
def get_formula(self):
def get_predict(self):
pass
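
With these defaults, `ModelAbstract` leaves only `get_predict()` abstract; `get_vcov()`, `get_coef()`, and the `find_*()` helpers now have working fallbacks driven by `self.formula` and `self.data`. A hedged sketch of a minimal subclass follows, assuming (as `ModelPyfixest` below does) that the subclass sets `data` and `formula` before calling `super().__init__()`; the class and the wrapped model's attributes are hypothetical, not part of this PR.

```python
# Hypothetical minimal subclass of the refactored ModelAbstract.
import numpy as np
import polars as pl
from marginaleffects.model_abstract import ModelAbstract
from marginaleffects.utils import ingest  # converts input data to a Polars DataFrame


class ModelToy(ModelAbstract):
    def __init__(self, model, data, formula):
        # Set data and formula before super().__init__() so validation passes.
        self.data = ingest(data)
        self.formula = formula
        super().__init__(model)

    def get_coef(self):
        # Assumption: the wrapped model stores its coefficients in `.coef_`.
        return np.asarray(self.model.coef_)

    def get_predict(self, params, newdata: pl.DataFrame):
        # Assumption: delegate to the wrapped model's own predict() and return
        # the usual rowid/estimate frame.
        p = self.model.predict(newdata.to_pandas())
        return pl.DataFrame({"rowid": np.arange(newdata.shape[0]), "estimate": p})
```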
47 changes: 9 additions & 38 deletions marginaleffects/model_pyfixest.py
@@ -1,12 +1,14 @@
import re
import numpy as np
import polars as pl
import warnings
from .model_abstract import ModelAbstract
from .utils import ingest


class ModelPyfixest(ModelAbstract):
def __init__(self, model):
self.data = ingest(model._data)
self.formula = model._fml
super().__init__(model)
if hasattr(self.model, "_fixef"):
if self.model._fixef is not None:
@@ -17,52 +19,21 @@ def __init__(self, model):
def get_coef(self):
return np.array(self.model._beta_hat)

def get_coef_names(self):
def find_coef(self):
return np.array(self.model._coefnames)

def get_modeldata(self):
df = self.model._data
if not isinstance(df, pl.DataFrame):
df = pl.from_pandas(df)
return df

def get_response_name(self):
return self.model._fml.split("~")[0] # the response variable

def get_vcov(self, vcov=True):
V = None
if isinstance(vcov, bool):
if vcov is True:
V = self.model._vcov
return V

def get_formula(self):
return self.model._fml

def get_variables_names(self, variables=None, newdata=None):
if variables is None:
variables = self.model._coefnames
variables = [re.sub(r"\[.*\]", "", x) for x in variables]
variables = [x for x in variables if x in self.modeldata.columns]
variables = pl.Series(variables).unique().to_list()
if isinstance(variables, (str, dict)):
variables = [variables] if isinstance(variables, str) else variables
elif isinstance(variables, list) and all(
isinstance(var, str) for var in variables
):
pass
else:
raise ValueError(
"`variables` must be None, a dict, string, or list of strings"
)
if newdata is not None:
good = [x for x in variables if x in newdata.columns]
bad = [x for x in variables if x not in newdata.columns]
if len(bad) > 0:
bad = ", ".join(bad)
warnings.warn(f"Variable(s) not in newdata: {bad}")
if len(good) == 0:
raise ValueError("There is no valid column name in `variables`.")
def find_predictors(self):
variables = self.model._coefnames
variables = [re.sub(r"\[.*\]", "", x) for x in variables]
variables = [x for x in variables if x in self.data.columns]
variables = pl.Series(variables).unique().to_list()
return variables

def get_predict(self, params, newdata: pl.DataFrame):
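
The simplified `find_predictors()` above reduces the old `get_variables_names()` logic to three steps: strip the bracketed level suffixes that pyfixest appends to categorical coefficient names, keep only names that are columns of the model data, and deduplicate. A self-contained illustration of that logic, with hypothetical coefficient and column names:

```python
# Illustration of the find_predictors() logic shown in model_pyfixest.py.
import re
import polars as pl

# Hypothetical pyfixest coefficient names and model data columns.
coefnames = ["Intercept", "wt", "cyl[T.6]", "cyl[T.8]"]
data_columns = ["mpg", "wt", "cyl"]

variables = [re.sub(r"\[.*\]", "", x) for x in coefnames]  # drop "[T.6]" etc.
variables = [x for x in variables if x in data_columns]    # drop "Intercept"
variables = pl.Series(variables).unique().to_list()        # deduplicate
print(variables)  # ['wt', 'cyl'] (unique() does not guarantee order)
```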