Skip to content

Commit

Permalink
Enable pivot bootstrap intervals
Browse files Browse the repository at this point in the history
  • Loading branch information
kbattocchi committed May 22, 2020
1 parent ba1446c commit 6495632
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 60 deletions.
29 changes: 23 additions & 6 deletions econml/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
from joblib import Parallel, delayed
from sklearn.base import clone
from scipy.stats import norm


class BootstrapEstimator:
Expand Down Expand Up @@ -44,14 +45,22 @@ class BootstrapEstimator:
In case a method ending in '_interval' exists on the wrapped object, whether
that should be preferred (meaning this wrapper will compute the mean of it).
This option only affects behavior if `compute_means` is set to ``True``.
bootstrap_type: 'percentile' or 'standard', default 'percentile'
Bootstrap method used to compute results. 'percentile' will result in using the empirical CDF of
the replicated computations of the statistics. 'standard' will instead compute a pivot interval
assuming the replicates are normally distributed.
"""

def __init__(self, wrapped, n_bootstrap_samples=1000, n_jobs=None, compute_means=True, prefer_wrapped=False,
             bootstrap_type='percentile'):
    """Store the wrapped estimator and the bootstrap configuration.

    Parameters
    ----------
    wrapped : estimator
        Estimator to bootstrap; it is cloned once per bootstrap replicate.
    n_bootstrap_samples : int, default 1000
        Number of bootstrap replicates to maintain.
    n_jobs : int or None, default None
        Degree of joblib parallelism used when operating on the replicates.
    compute_means : bool, default True
        Whether attribute access should report the mean over replicates.
    prefer_wrapped : bool, default False
        Whether an existing '_interval' method on the wrapped object should be
        preferred when computing means.
    bootstrap_type : 'percentile' or 'standard', default 'percentile'
        'percentile' uses the empirical CDF of the replicated statistics;
        'standard' computes a pivot interval assuming the replicates are
        normally distributed.
    """
    self._instances = [clone(wrapped, safe=False) for _ in range(n_bootstrap_samples)]
    self._n_bootstrap_samples = n_bootstrap_samples
    self._n_jobs = n_jobs
    self._compute_means = compute_means
    self._prefer_wrapped = prefer_wrapped
    # NOTE(review): attribute name misspells "bootstrap"; kept as-is because
    # get_interval reads self._boostrap_type — rename both together if fixing.
    self._boostrap_type = bootstrap_type
    self._wrapped = wrapped

# TODO: Add a __dir__ implementation?

Expand Down Expand Up @@ -87,7 +96,7 @@ def __getattr__(self, name):
def proxy(make_call, name, summary):
def summarize_with(f):
return summary(np.array(Parallel(n_jobs=self._n_jobs, prefer='threads', verbose=3)(
(f, (obj, name), {}) for obj in self._instances)))
(f, (obj, name), {}) for obj in self._instances)), f(self._wrapped, name))
if make_call:
def call(*args, **kwargs):
return summarize_with(lambda obj, name: getattr(obj, name)(*args, **kwargs))
Expand All @@ -97,16 +106,24 @@ def call(*args, **kwargs):

def get_mean():
    """Proxy that summarizes attribute/method `name` as the mean over bootstrap replicates.

    The summary callable now takes two arguments (the array of replicated
    results and the wrapped object's own result); the mean only needs the
    replicate array, so the second argument is ignored.
    """
    # for attributes that exist on the wrapped object, just compute the mean of the wrapped calls
    return proxy(callable(getattr(self._instances[0], name)), name, lambda arr, _: np.mean(arr, axis=0))

def get_interval():
def get_interval(kind=None):
# if the attribute exists on the wrapped object once we remove the suffix,
# then we should be computing a confidence interval for the wrapped calls
prefix = name[: - len("_interval")]

def call_with_bounds(can_call, lower, upper):
return proxy(can_call, prefix,
lambda arr: (np.percentile(arr, lower, axis=0), np.percentile(arr, upper, axis=0)))
def percentile_bootstrap(arr, _):
return np.percentile(arr, lower, axis=0), np.percentile(arr, upper, axis=0)

def pivot_bootstrap(arr, est):
std = np.std(arr, axis=0)
return est - norm.ppf(upper / 100) * std, est - norm.ppf(lower / 100) * std
# TODO: studentized bootstrap? would be more accurate in most cases but can we avoid
# second level bootstrap which would be prohibitive computationally
fn = {'percentile': percentile_bootstrap, 'standard': pivot_bootstrap}[self._boostrap_type]
return proxy(can_call, prefix, fn)

can_call = callable(getattr(self._instances[0], prefix))
if can_call:
Expand Down
109 changes: 55 additions & 54 deletions econml/tests/test_bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,60 +17,61 @@ class TestBootstrap(unittest.TestCase):
def test_with_sklearn(self):
    """Test that we can bootstrap sklearn estimators."""
    for n_jobs in [None, -1]:  # test parallelism
        for kind in ['percentile', 'standard']:  # test both percentile and pivot intervals
            x = np.random.normal(size=(1000, 1))
            y = x * 0.5 + np.random.normal(size=(1000, 1))
            y = y.flatten()

            est = LinearRegression()
            est.fit(x, y)

            bs = BootstrapEstimator(est, 50, n_jobs=n_jobs, bootstrap_type=kind)
            # test that we can fit with the same arguments as the base estimator
            bs.fit(x, y)

            # test that we can get the same attribute for the bootstrap as the original, with the same shape
            self.assertEqual(np.shape(est.coef_), np.shape(bs.coef_))

            # test that we can get an interval for the same attribute for the bootstrap as the original,
            # with the same shape for the lower and upper bounds
            lower, upper = bs.coef__interval()
            for bound in [lower, upper]:
                self.assertEqual(np.shape(est.coef_), np.shape(bound))

            # test that the lower and upper bounds differ
            assert (lower <= upper).all()
            assert (lower < upper).any()

            # test that we can do the same thing once we provide percentile bounds
            lower, upper = bs.coef__interval(lower=10, upper=90)
            for bound in [lower, upper]:
                self.assertEqual(np.shape(est.coef_), np.shape(bound))

            # test that the lower and upper bounds differ
            assert (lower <= upper).all()
            assert (lower < upper).any()

            # test that we can do the same thing with the results of a method, rather than an attribute
            self.assertEqual(np.shape(est.predict(x)), np.shape(bs.predict(x)))

            # test that we can get an interval for the same attribute for the bootstrap as the original,
            # with the same shape for the lower and upper bounds
            lower, upper = bs.predict_interval(x)
            for bound in [lower, upper]:
                self.assertEqual(np.shape(est.predict(x)), np.shape(bound))

            # test that the lower and upper bounds differ
            assert (lower <= upper).all()
            assert (lower < upper).any()

            # test that we can do the same thing once we provide percentile bounds
            lower, upper = bs.predict_interval(x, lower=10, upper=90)
            for bound in [lower, upper]:
                self.assertEqual(np.shape(est.predict(x)), np.shape(bound))

            # test that the lower and upper bounds differ
            assert (lower <= upper).all()
            assert (lower < upper).any()

def test_with_econml(self):
"""Test that we can bootstrap econml estimators."""
Expand Down

0 comments on commit 6495632

Please sign in to comment.