diff --git a/econml/bootstrap.py b/econml/bootstrap.py index 23cdeac50..500b0401c 100644 --- a/econml/bootstrap.py +++ b/econml/bootstrap.py @@ -44,32 +44,62 @@ class BootstrapEstimator: In case a method ending in '_interval' exists on the wrapped object, whether that should be preferred (meaning this wrapper will compute the mean of it). This option only affects behavior if `compute_means` is set to ``True``. + + stratify_treatment: bool, default False + Whether to stratify by treatment when calling fit; this will ensure that each stratum of treatment + is subsampled independently, so that each resample will have the same number of entries with each + treatment as the original sample did. """ - def __init__(self, wrapped, n_bootstrap_samples=1000, n_jobs=None, compute_means=True, prefer_wrapped=False): + def __init__(self, wrapped, n_bootstrap_samples=1000, n_jobs=None, + compute_means=True, prefer_wrapped=False, stratify_treatment=False): self._instances = [clone(wrapped, safe=False) for _ in range(n_bootstrap_samples)] self._n_bootstrap_samples = n_bootstrap_samples self._n_jobs = n_jobs self._compute_means = compute_means self._prefer_wrapped = prefer_wrapped + self._stratify_treatment = stratify_treatment # TODO: Add a __dir__ implementation? + def _stratified_indices(self, Y, T, *args, **kwargs): + assert 1 <= np.ndim(T) <= 2 + unique = np.unique(T, axis=0) + indices = [] + for el in unique: + ind, = np.where(np.all(T == el, axis=1) if np.ndim(T) == 2 else T == el) + indices.append(ind) + return indices + def fit(self, *args, **named_args): """ Fit the model. The full signature of this method is the same as that of the wrapped object's `fit` method. """ - n_samples = np.shape(args[0] if args else named_args[(*named_args,)[0]])[0] - indices = np.random.choice(n_samples, size=(self._n_bootstrap_samples, n_samples), replace=True) + + if self._stratify_treatment: + index_chunks = self._stratified_indices(*args, **named_args) + else: + n_samples = np.shape(args[0] if args else named_args[(*named_args,)[0]])[0] + index_chunks = [np.arange(n_samples)] # one chunk with all indices + + indices = [] + for chunk in index_chunks: + n_samples = len(chunk) + indices.append(chunk[np.random.choice(n_samples, + size=(self._n_bootstrap_samples, n_samples), + replace=True)]) + + indices = np.hstack(indices) def fit(x, *args, **kwargs): x.fit(*args, **kwargs) return x # Explicitly return x in case fit fails to return its target def convertArg(arg, inds): - return arg[inds] if arg is not None else None + return np.asarray(arg)[inds] if arg is not None else None + self._instances = Parallel(n_jobs=self._n_jobs, prefer='threads', verbose=3)( delayed(fit)(obj, *[convertArg(arg, inds) for arg in args], @@ -84,6 +114,11 @@ def __getattr__(self, name): Additionally, the suffix "_interval" is supported for getting an interval instead of a point estimate. """ + + # don't proxy special methods + if name.startswith('__'): + raise AttributeError(name) + def proxy(make_call, name, summary): def summarize_with(f): return summary(np.array(Parallel(n_jobs=self._n_jobs, prefer='threads', verbose=3)( diff --git a/econml/inference.py b/econml/inference.py index 026aa196a..b18dc7c2c 100644 --- a/econml/inference.py +++ b/econml/inference.py @@ -52,7 +52,9 @@ def __init__(self, n_bootstrap_samples=100, n_jobs=-1): self._n_jobs = n_jobs def fit(self, estimator, *args, **kwargs): - est = BootstrapEstimator(estimator, self._n_bootstrap_samples, self._n_jobs, compute_means=False) + discrete_treatment = estimator._discrete_treatment if hasattr(estimator, '_discrete_treatment') else False + est = BootstrapEstimator(estimator, self._n_bootstrap_samples, self._n_jobs, compute_means=False, + stratify_treatment=discrete_treatment) est.fit(*args, **kwargs) self._est = est diff --git a/econml/tests/test_bootstrap.py b/econml/tests/test_bootstrap.py index cf165c83e..7e298554b 100644 --- a/econml/tests/test_bootstrap.py +++ b/econml/tests/test_bootstrap.py @@ -5,7 +5,7 @@ from econml.inference import BootstrapInference from econml.dml import LinearDMLCateEstimator from econml.two_stage_least_squares import NonparametricTwoStageLeastSquares -from sklearn.linear_model import LinearRegression +from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.preprocessing import PolynomialFeatures import numpy as np import unittest @@ -265,3 +265,18 @@ def test_internal_options(self): # TODO: test that the estimated effect is usually within the bounds # and that the true effect is also usually within the bounds + + def test_stratify(self): + """Test that we can properly stratify by treatment""" + T = [1, 0, 1, 2, 0, 2] + Y = [1, 2, 3, 4, 5, 6] + X = np.array([1, 1, 2, 2, 1, 2]).reshape(-1, 1) + est = LinearDMLCateEstimator(model_y=LinearRegression(), model_t=LogisticRegression(), discrete_treatment=True) + est.fit(Y, T, inference='bootstrap') + est.const_marginal_effect_interval() + + est.fit(Y, T, X=X, inference='bootstrap') + est.const_marginal_effect_interval(X) + + est.fit(Y, np.asarray(T).reshape(-1, 1), inference='bootstrap') # test stratifying 2D treatment + est.const_marginal_effect_interval() diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py index 4bc9ad53b..207d1167f 100644 --- a/econml/tests/test_dml.py +++ b/econml/tests/test_dml.py @@ -97,11 +97,7 @@ def make_random(is_discrete, d): model_t = LogisticRegression() if is_discrete else Lasso() - # TODO: add stratification to bootstrap so that we can use it - # even with discrete treatments - all_infs = [None, 'statsmodels'] - if not is_discrete: - all_infs.append(BootstrapInference(1)) + all_infs = [None, 'statsmodels', BootstrapInference(1)] for est, multi, infs in\ [(LinearDMLCateEstimator(model_y=Lasso(),