
Scikit-Learn support #145

Merged
37 commits merged on Dec 18, 2024
Commits (37)
4b89f81
scikit-learn support
vincentarelbundock Dec 15, 2024
fcd5e70
abstract default methods
vincentarelbundock Dec 15, 2024
2d0eae7
null V in comparisons
vincentarelbundock Dec 15, 2024
419d791
cleanup scikit class
vincentarelbundock Dec 15, 2024
3516ef0
scikit multiclass
vincentarelbundock Dec 16, 2024
33692c2
minor
vincentarelbundock Dec 16, 2024
b6d365d
fixup
vincentarelbundock Dec 16, 2024
db271c0
gitignore
vincentarelbundock Dec 16, 2024
123eafa
lint
vincentarelbundock Dec 16, 2024
c58255e
comments
vincentarelbundock Dec 16, 2024
3cb9de8
formulaic module
vincentarelbundock Dec 16, 2024
d52339b
type validation
vincentarelbundock Dec 16, 2024
536d43c
init
vincentarelbundock Dec 16, 2024
7619496
Scikit -> Sklearn
vincentarelbundock Dec 16, 2024
96918fb
rename methods
vincentarelbundock Dec 17, 2024
d6886eb
sanitize_model cleanup
vincentarelbundock Dec 17, 2024
76735a6
simplification
vincentarelbundock Dec 17, 2024
f44749a
simplification
vincentarelbundock Dec 17, 2024
8d0753a
lint
vincentarelbundock Dec 17, 2024
4c4d0b3
minor
vincentarelbundock Dec 17, 2024
92d79b5
ingest pandas everywhere
vincentarelbundock Dec 17, 2024
3e7f456
pydantic dependency
vincentarelbundock Dec 17, 2024
44f7c8c
clean ingest()
vincentarelbundock Dec 17, 2024
72e1449
tests pass
vincentarelbundock Dec 18, 2024
51c7958
lint
vincentarelbundock Dec 18, 2024
ed41578
minor
vincentarelbundock Dec 18, 2024
c376e92
simplify
vincentarelbundock Dec 18, 2024
c0a927a
model.modeldata -> model.data
vincentarelbundock Dec 18, 2024
ff3e7cb
remove get_modeldata()
vincentarelbundock Dec 18, 2024
710d816
deprecate get_formula()
vincentarelbundock Dec 18, 2024
cc67765
comment
vincentarelbundock Dec 18, 2024
bfc1955
fit_sklearn fit_statsmodels
vincentarelbundock Dec 18, 2024
4835fc2
fit_statsmodels in the statsmodels file
vincentarelbundock Dec 18, 2024
bdc6c9f
dependencies
vincentarelbundock Dec 18, 2024
d91b810
minor
vincentarelbundock Dec 18, 2024
8374b00
find_variables -> find_predictors
vincentarelbundock Dec 18, 2024
e5c19c3
bump
vincentarelbundock Dec 18, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -7,6 +7,7 @@ __pypackages__/
site/
debug.py
debug.R
build/

# Poetry files
/dist/
2 changes: 1 addition & 1 deletion NEWS.md
@@ -27,7 +27,7 @@ Breaking change:
# 0.0.9

* Issue #90: Informative error on reserved keyword like 'group'.
* Issue #91: get_variables_names() in class ModelStatsmodels does not return all variables which causes errors
* Issue #91: find_variables() in class ModelStatsmodels does not return all variables which causes errors

# 0.0.8

4 changes: 4 additions & 0 deletions marginaleffects/__init__.py
@@ -6,6 +6,8 @@
from .plot_slopes import plot_slopes
from .predictions import avg_predictions, predictions
from .slopes import avg_slopes, slopes
from .model_statsmodels import fit_statsmodels
from .model_sklearn import fit_sklearn

__all__ = [
"avg_comparisons",
@@ -19,4 +21,6 @@
"predictions",
"avg_slopes",
"slopes",
"fit_statsmodels",
"fit_sklearn",
]
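
The `__init__.py` change above exposes two new model-fitting entry points. Below is a minimal usage sketch, assuming `fit_sklearn()` and `fit_statsmodels()` accept a formula string, a data frame, and a fitting engine; the engine arguments shown are assumptions for illustration, not verbatim from this PR.

```python
# Hedged usage sketch of the new entry points exported from __init__.py.
import polars as pl
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from marginaleffects import fit_sklearn, fit_statsmodels, avg_predictions

data = pl.DataFrame({
    "y": [1.0, 3.0, 2.0, 5.0, 4.0, 6.0],
    "x": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
    "z": [1.0, 0.0, 1.0, 0.0, 1.0, 0.0],
})

# Engine arguments are assumptions: a scikit-learn estimator instance and a
# statsmodels model class, respectively.
mod_sk = fit_sklearn("y ~ x + z", data=data, engine=LinearRegression())
mod_sm = fit_statsmodels("y ~ x + z", data=data, engine=sm.OLS)

print(avg_predictions(mod_sk))
print(avg_predictions(mod_sm))
```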
3 changes: 3 additions & 0 deletions marginaleffects/by.py
@@ -1,4 +1,5 @@
import polars as pl
import numpy as np


def get_by(model, estimand, newdata, by=None, wts=None):
@@ -25,6 +26,8 @@ def get_by(model, estimand, newdata, by=None, wts=None):
out = pl.DataFrame({"estimate": estimand["estimate"]})

by = [x for x in by if x in out.columns]
by = np.unique(by)

if isinstance(by, list) and len(by) == 0:
return out

14 changes: 7 additions & 7 deletions marginaleffects/comparisons.py
@@ -19,7 +19,7 @@
)
from .transform import get_transform
from .uncertainty import get_jacobian, get_se, get_z_p_ci
from .utils import get_pad, sort_columns, upcast
from .utils import get_pad, sort_columns, upcast, ingest
from .model_pyfixest import ModelPyfixest


@@ -125,7 +125,7 @@ def comparisons(
by = sanitize_by(by)
V = sanitize_vcov(vcov, model)
newdata = sanitize_newdata(model, newdata=newdata, wts=wts, by=by)
modeldata = model.modeldata
modeldata = model.data
hypothesis_null = sanitize_hypothesis_null(hypothesis)

# For each variable in `variables`, this will return two values that we want
@@ -177,7 +177,7 @@
# Hack: We run into Patsy-related issues unless we "pad" the
# character/categorical variables to include all unique levels. We add them
# here but drop them after creating the design matrices.
vars = model.get_variables_names(variables=None, newdata=modeldata)
vars = model.find_variables()
vars = [re.sub(r"\[.*", "", x) for x in vars]
vars = list(set(vars))
for v in vars:
@@ -219,9 +219,9 @@
lo_X = lo
nd_X = nd
else:
y, hi_X = patsy.dmatrices(model.formula, hi.to_pandas())
y, lo_X = patsy.dmatrices(model.formula, lo.to_pandas())
y, nd_X = patsy.dmatrices(model.formula, nd.to_pandas())
y, hi_X = patsy.dmatrices(model.formula, ingest(hi).to_pandas())
y, lo_X = patsy.dmatrices(model.formula, ingest(lo).to_pandas())
y, nd_X = patsy.dmatrices(model.formula, ingest(nd).to_pandas())

# unpad
if pad.shape[0] > 0:
@@ -328,7 +328,7 @@ def outer(x):
out = outer(model.coef)

# Compute standard errors and confidence intervals
if vcov is not None and vcov is not False:
if vcov is not None and vcov is not False and V is not None:
J = get_jacobian(func=outer, coefs=model.coef, eps_vcov=eps_vcov)
se = get_se(J, V)
out = out.with_columns(pl.Series(se).alias("std_error"))
4 changes: 2 additions & 2 deletions marginaleffects/datagrid.py
@@ -81,7 +81,7 @@ def datagrid(
model = sanitize_model(model)

if newdata is None:
newdata = model.modeldata
newdata = model.data

if grid_type == "counterfactual":
return datagridcf(model=model, newdata=newdata, **kwargs)
@@ -143,7 +143,7 @@ def datagridcf(model=None, newdata=None, **kwargs):
model = sanitize_model(model)

if newdata is None:
newdata = model.modeldata
newdata = model.data

if "rowid" not in newdata.columns:
newdata = newdata.with_columns(
39 changes: 39 additions & 0 deletions marginaleffects/formulaic.py
@@ -0,0 +1,39 @@
import formulaic
import polars as pl
from .utils import validate_types


@validate_types
def variables(formula: str):
tok = formulaic.parser.DefaultFormulaParser().get_tokens(formula)
tok = [t for t in tok if t.kind.value == "name"]
tok = [str(t) for t in tok]
return tok


@validate_types
def listwise_deletion(formula: str, data: pl.DataFrame):
vars = variables(formula)
return data.drop_nulls(subset=vars)


def model_matrices(formula: str, data: pl.DataFrame, formula_engine: str = "formulaic"):
if formula_engine == "formulaic":
endog, exog = formulaic.model_matrix(formula, data.to_pandas())
endog = endog.to_numpy()
exog = exog.to_numpy()
return endog, exog
elif formula_engine == "patsy":
try:
import patsy
except ImportError:
raise ImportError("The patsy package is required to use this feature.")
if isinstance(formula, str):
import re

formula = re.sub(".*~", "", formula)
exog = patsy.dmatrix(formula, data.to_pandas())
return None, exog


__all__ = ["listwise_deletion", "model_matrices"]
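
The new `formulaic.py` helper module above wires the formulaic parser into the package. A short sketch of how the helpers defined in this file compose, using only the functions shown in the diff:

```python
# Sketch composing the helpers from marginaleffects/formulaic.py as defined above.
import polars as pl
from marginaleffects.formulaic import listwise_deletion, model_matrices

data = pl.DataFrame({
    "y": [1.0, 2.0, None, 4.0],
    "x": [0.0, None, 2.0, 3.0],
    "w": [9.0, 9.0, 9.0, 9.0],  # not in the formula, so its values are ignored
})

# Drop rows with nulls in any variable that appears in the formula.
clean = listwise_deletion("y ~ x", data)  # keeps rows 0 and 3

# Build response and design matrices with formulaic (the default engine).
endog, exog = model_matrices("y ~ x", clean)
print(endog.shape, exog.shape)  # (2, 1) and (2, 2): intercept plus x
```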
2 changes: 1 addition & 1 deletion marginaleffects/hypotheses.py
@@ -93,7 +93,7 @@ def hypotheses(

# estimands
def fun(x):
out = pl.DataFrame({"term": model.get_coef_names(), "estimate": x})
out = pl.DataFrame({"term": model.find_coef(), "estimate": x})
out = get_hypothesis(out, hypothesis=hypothesis)
return out

2 changes: 1 addition & 1 deletion marginaleffects/hypotheses_joint.py
@@ -12,7 +12,7 @@ def joint_hypotheses(obj, joint_index=None, joint_test="f", hypothesis=0):
# theta_hat: P x 1 vector of estimated parameters
theta_hat = obj.get_coef()

var_names = obj.get_variables_names()
var_names = obj.find_predictors()

if len(theta_hat) == len(var_names) + 1:
var_names = ["Intercept"] + var_names
64 changes: 35 additions & 29 deletions marginaleffects/model_abstract.py
@@ -2,73 +2,79 @@
import polars as pl
from abc import ABC, abstractmethod
from .utils import get_type_dictionary
from . import formulaic as fml


class ModelAbstract(ABC):
def __init__(self, model):
self.model = model
self.formula_engine = "formulaic"
self.validate_coef()
self.validate_modeldata()
self.validate_response_name()
self.validate_formula()
self.variables_type = get_type_dictionary(self.modeldata)
self.variables_type = get_type_dictionary(self.data)

def validate_coef(self):
coef = self.get_coef()
if not isinstance(coef, np.ndarray):
if not isinstance(coef, np.ndarray) and coef is not None:
raise ValueError("coef must be a numpy array")
self.coef = coef

def validate_modeldata(self):
modeldata = self.get_modeldata()
if not isinstance(modeldata, pl.DataFrame):
raise ValueError("modeldata must be a Polars DataFrame")
self.modeldata = modeldata
if not isinstance(self.data, pl.DataFrame):
raise ValueError("data attribute must be a Polars DataFrame")

def validate_response_name(self):
response_name = self.get_response_name()
response_name = self.find_response()
if not isinstance(response_name, str):
raise ValueError("response_name must be a string")
self.response_name = response_name

def validate_formula(self):
formula = self.get_formula()
formula = self.formula

if not isinstance(formula, str):
raise ValueError("formula must be a string")

if "~" not in formula:
raise ValueError(
"Model formula must contain '~' to separate dependent and independent variables"
)

if "scale(" in formula or "center(" in formula:
raise ValueError(
"The formula cannot include scale( or center(. Please center your variables before fitting the model."
)
self.formula = formula

@abstractmethod
def get_vcov(self):
pass
def get_vcov(self, vcov=False):
return None

@abstractmethod
def get_modeldata(self):
pass
def get_coef(self):
return None

@abstractmethod
def get_response_name(self):
pass
def find_coef(self):
return None

# names of the variables in the original dataset, excluding interactions, intercept, etc.
@abstractmethod
def get_variables_names(self):
pass
def find_variables(self, variables=None, newdata=None):
out = fml.variables(self.formula)
return out

# names of the parameters
@abstractmethod
def get_coef_names(self):
pass
def find_response(self):
vars = self.find_variables()
if vars is None:
return None
else:
return vars[0]

@abstractmethod
def get_predict(self):
pass
def find_predictors(self):
vars = self.find_variables()
if vars is None:
return None
else:
return vars[1:]

@abstractmethod
def get_formula(self):
def get_predict(self):
pass
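
With these defaults, `ModelAbstract` leaves only `get_predict()` abstract; `get_vcov()`, `get_coef()`, and the `find_*()` helpers now have working fallbacks driven by `self.formula` and `self.data`. A hedged sketch of a minimal subclass follows, assuming (as `ModelPyfixest` below does) that the subclass sets `data` and `formula` before calling `super().__init__()`; the class and the wrapped model's attributes are hypothetical, not part of this PR.

```python
# Hypothetical minimal subclass of the refactored ModelAbstract.
import numpy as np
import polars as pl
from marginaleffects.model_abstract import ModelAbstract
from marginaleffects.utils import ingest  # converts input data to a Polars DataFrame


class ModelToy(ModelAbstract):
    def __init__(self, model, data, formula):
        # Set data and formula before super().__init__() so validation passes.
        self.data = ingest(data)
        self.formula = formula
        super().__init__(model)

    def get_coef(self):
        # Assumption: the wrapped model stores its coefficients in `.coef_`.
        return np.asarray(self.model.coef_)

    def get_predict(self, params, newdata: pl.DataFrame):
        # Assumption: delegate to the wrapped model's own predict() and return
        # the usual rowid/estimate frame.
        p = self.model.predict(newdata.to_pandas())
        return pl.DataFrame({"rowid": np.arange(newdata.shape[0]), "estimate": p})
```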
47 changes: 9 additions & 38 deletions marginaleffects/model_pyfixest.py
@@ -1,12 +1,14 @@
import re
import numpy as np
import polars as pl
import warnings
from .model_abstract import ModelAbstract
from .utils import ingest


class ModelPyfixest(ModelAbstract):
def __init__(self, model):
self.data = ingest(model._data)
self.formula = model._fml
super().__init__(model)
if hasattr(self.model, "_fixef"):
if self.model._fixef is not None:
@@ -17,52 +19,21 @@ def __init__(self, model):
def get_coef(self):
return np.array(self.model._beta_hat)

def get_coef_names(self):
def find_coef(self):
return np.array(self.model._coefnames)

def get_modeldata(self):
df = self.model._data
if not isinstance(df, pl.DataFrame):
df = pl.from_pandas(df)
return df

def get_response_name(self):
return self.model._fml.split("~")[0] # the response variable

def get_vcov(self, vcov=True):
V = None
if isinstance(vcov, bool):
if vcov is True:
V = self.model._vcov
return V

def get_formula(self):
return self.model._fml

def get_variables_names(self, variables=None, newdata=None):
if variables is None:
variables = self.model._coefnames
variables = [re.sub(r"\[.*\]", "", x) for x in variables]
variables = [x for x in variables if x in self.modeldata.columns]
variables = pl.Series(variables).unique().to_list()
if isinstance(variables, (str, dict)):
variables = [variables] if isinstance(variables, str) else variables
elif isinstance(variables, list) and all(
isinstance(var, str) for var in variables
):
pass
else:
raise ValueError(
"`variables` must be None, a dict, string, or list of strings"
)
if newdata is not None:
good = [x for x in variables if x in newdata.columns]
bad = [x for x in variables if x not in newdata.columns]
if len(bad) > 0:
bad = ", ".join(bad)
warnings.warn(f"Variable(s) not in newdata: {bad}")
if len(good) == 0:
raise ValueError("There is no valid column name in `variables`.")
def find_predictors(self):
variables = self.model._coefnames
variables = [re.sub(r"\[.*\]", "", x) for x in variables]
variables = [x for x in variables if x in self.data.columns]
variables = pl.Series(variables).unique().to_list()
return variables

def get_predict(self, params, newdata: pl.DataFrame):
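
The simplified `find_predictors()` above reduces the old `get_variables_names()` logic to three steps: strip the bracketed level suffixes that pyfixest appends to categorical coefficient names, keep only names that are columns of the model data, and deduplicate. A self-contained illustration of that logic, with hypothetical coefficient and column names:

```python
# Illustration of the find_predictors() logic shown in model_pyfixest.py.
import re
import polars as pl

# Hypothetical pyfixest coefficient names and model data columns.
coefnames = ["Intercept", "wt", "cyl[T.6]", "cyl[T.8]"]
data_columns = ["mpg", "wt", "cyl"]

variables = [re.sub(r"\[.*\]", "", x) for x in coefnames]  # drop "[T.6]" etc.
variables = [x for x in variables if x in data_columns]    # drop "Intercept"
variables = pl.Series(variables).unique().to_list()        # deduplicate
print(variables)  # ['wt', 'cyl'] (unique() does not guarantee order)
```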