From a07be5c2c865626a4b96a73832f4bf816cdfe0d9 Mon Sep 17 00:00:00 2001
From: Adam Kelleher
Date: Wed, 22 Jun 2016 18:11:26 -0400
Subject: [PATCH 1/2] added bootstrap effects estimator

---
 causality/estimation/nonparametric.py | 33 ++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/causality/estimation/nonparametric.py b/causality/estimation/nonparametric.py
index 4449968..87e929f 100644
--- a/causality/estimation/nonparametric.py
+++ b/causality/estimation/nonparametric.py
@@ -189,4 +189,35 @@ def expected_value( self, x):
             return causal_effect
         else:
             return self.conditional_expectation.fit(data_predict=x[self.causes])[0]
-
+
+
+class BootstrapEstimator(object):
+    def __init__(self, f=np.mean, groupby_field=None, bootstrap_samples=1000, lower_q=0.025, upper_q=0.975):
+        self.f = f
+        self.bootstrap_samples = bootstrap_samples
+        self.lower_q = lower_q
+        self.upper_q = upper_q
+
+    def estimate(self, X):
+        quantiles = pd.DataFrame([self.f(X.sample(n=len(X), replace=True)) for i in range(self.bootstrap_samples)]).quantile([self.lower_q,.5,self.upper_q])
+        return quantiles
+
+    def found_winner(self, X):
+        quantiles = self.estimate(X)
+        for candidate in quantiles.columns:
+            others = list(set(quantiles.columns) - set([candidate]))
+            if (quantiles[others].ix[self.upper_q] < quantiles[candidate][self.lower_q]).all():
+                return True
+        return False
+
+    def chances_of_winning(self, X):
+        df = X.sample(n=len(X), replace=True)
+        res = self.f(df)
+        counts = (res == res.max()).astype(int)
+        for i in xrange(self.bootstrap_samples-1):
+            df = X.sample(n=len(X), replace=True)
+            res = self.f(df)
+            counts += (res == res.max()).astype(int)
+        return counts / float(self.bootstrap_samples)
+
+

From 0fd77f5569af5242305b262362373525769834cd Mon Sep 17 00:00:00 2001
From: Adam Kelleher
Date: Wed, 6 Jul 2016 17:42:55 -0400
Subject: [PATCH 2/2] added regression discontinuity; prop score matching

---
 causality/estimation/nonparametric.py        |  10 +-
 causality/estimation/parametric.py           | 124 ++++++++++++++++--
 .../inference/independence_tests/__init__.py |  62 ++++++---
 causality/inference/search/__init__.py       |   2 +-
 tests/unit/nonparametric.py                  |  61 ++++++++-
 tests/unit/parametric.py                     |  62 ++++++++-
 6 files changed, 280 insertions(+), 41 deletions(-)

diff --git a/causality/estimation/nonparametric.py b/causality/estimation/nonparametric.py
index 87e929f..67e43c8 100644
--- a/causality/estimation/nonparametric.py
+++ b/causality/estimation/nonparametric.py
@@ -192,11 +192,15 @@ def expected_value( self, x):
 
 
 class BootstrapEstimator(object):
-    def __init__(self, f=np.mean, groupby_field=None, bootstrap_samples=1000, lower_q=0.025, upper_q=0.975):
+    def __init__(self, f=np.mean, bootstrap_samples=1000, p=None, lower_q=0.025, upper_q=0.975):
         self.f = f
         self.bootstrap_samples = bootstrap_samples
-        self.lower_q = lower_q
-        self.upper_q = upper_q
+        if p:
+            self.lower_q = p / 2.
+            self.upper_q = 1. - (p/2.)
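+            # p is interpreted as a two-sided significance level, split evenly between the two tails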
+        else:
+            self.lower_q = lower_q
+            self.upper_q = upper_q
 
     def estimate(self, X):
         quantiles = pd.DataFrame([self.f(X.sample(n=len(X), replace=True)) for i in range(self.bootstrap_samples)]).quantile([self.lower_q,.5,self.upper_q])
diff --git a/causality/estimation/parametric.py b/causality/estimation/parametric.py
index ae3e8fa..c6fa583 100644
--- a/causality/estimation/parametric.py
+++ b/causality/estimation/parametric.py
@@ -1,6 +1,8 @@
 import pandas as pd
 from statsmodels.regression.linear_model import OLS
 from statsmodels.robust.robust_linear_model import RLM
+from statsmodels.discrete.discrete_model import Logit
+from sklearn.neighbors import NearestNeighbors
 
 class DifferenceInDifferences(object):
     def __init__(self, robust=True):
@@ -19,35 +21,35 @@ def __init__(self, robust=True):
             self.model = OLS
 
     def average_treatment_effect(self, X, start='Start', end='End', assignment='Assignment'):
-        test = X[X['Assignment']==1][['Start','End']]
-        control = X[X['Assignment']==0][['Start','End']]
+        test = X[X[assignment]==1][[start,end]]
+        control = X[X[assignment]==0][[start,end]]
         del X
-        test_initial = test['Start']
-        test_final = test['End']
-        control_initial = control['Start']
-        control_final = control['End']
+        test_initial = test[start]
+        test_final = test[end]
+        control_initial = control[start]
+        control_final = control[end]
         del test, control
         df = pd.DataFrame({'y' : test_initial,
-                           'assignment' : [1. for i in test_initial],
+                           assignment : [1. for i in test_initial],
                            't' :[0. for i in test_initial] })
         df = df.append(pd.DataFrame({'y' : test_final,
-                           'assignment' : [1. for i in test_final],
+                           assignment : [1. for i in test_final],
                            't' :[1. for i in test_final] }))
         df = df.append(pd.DataFrame({'y' : control_initial,
-                           'assignment' : [0. for i in control_initial],
+                           assignment : [0. for i in control_initial],
                            't' :[0. for i in control_initial] }))
         df = df.append(pd.DataFrame({'y' : control_final,
-                           'assignment' : [0. for i in control_final],
+                           assignment : [0. for i in control_final],
                            't' :[1. for i in control_final] }))
         del test_initial, test_final, control_initial, control_final
-        df['did'] = df['t'] * df['assignment']
-        df['intercept'] = 1.
+        df.loc[:,'did'] = df['t'] * df[assignment]
+        df.loc[:,'intercept'] = 1.
 
-        model = self.model(df['y'], df[['t', 'assignment','did', 'intercept']])
+        model = self.model(df['y'], df[['t', assignment, 'did', 'intercept']])
         result = model.fit()
         conf_int = result.conf_int().ix['did']
         expected = result.params['did']
@@ -72,4 +74,98 @@ def test_parallel_trend(self, X, start='Start', end='End', assignment='Assignmen
             return False
 
 
-
+class PropensityScoreMatching(object):
+    def __init__(self):
+        # change the model if there are multiple matches per treated!
+        pass
+
+    def score(self, X, confounder_types, assignment='assignment', store_model_fit=False, intercept=True):
+        df = X[[assignment]]
+        regression_confounders = []
+        for confounder, var_type in confounder_types.items():
+            if var_type == 'o' or var_type == 'u':
+                c_dummies = pd.get_dummies(X[[confounder]], prefix=confounder)
+                if len(c_dummies.columns) == 1:
+                    df[c_dummies.columns] = c_dummies[c_dummies.columns]
+                    regression_confounders.extend(c_dummies.columns)
+                else:
+                    df[c_dummies.columns[1:]] = c_dummies[c_dummies.columns[1:]]
+                    regression_confounders.extend(c_dummies.columns[1:])
+            else:
+                regression_confounders.append(confounder)
+                df.loc[:,confounder] = X[confounder].copy() #
+        if intercept:
+            df.loc[:,'intercept'] = 1.
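+            # the constant column is included with the confounders so the logit is fit with an explicit intercept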
+            regression_confounders.append('intercept')
+        logit = Logit(df[assignment], df[regression_confounders])
+        result = logit.fit()
+        if store_model_fit:
+            self.model_fit = result
+        X.loc[:,'propensity score'] = result.predict(df[regression_confounders])
+        return X
+
+    def match(self, X, assignment='assignment', score='propensity score', n_neighbors=2):
+        treatments = X[X[assignment] != 0]
+        control = X[X[assignment] == 0]
+        neighbor_search = NearestNeighbors(metric='euclidean', n_neighbors=n_neighbors)
+        neighbor_search.fit(control[[score]].values)
+        treatments.loc[:, 'matches'] = treatments[score].apply(lambda x: neighbor_search.kneighbors([[x]])[1])
+        return treatments, control
+
+    def estimate_treatments(self, treatments, control, outcome):
+        def get_matched_outcome(matches):
+            return sum([control[outcome].values[i] / float(len(matches[0])) for i in matches[0]])
+        treatments.loc[:,'control outcome'] = treatments['matches'].apply(get_matched_outcome)
+        return treatments
+
+    def estimate_ATT(self, X, assignment, outcome, confounder_types, n_neighbors=5):
+        X = self.score(X, confounder_types, assignment)
+        treatments, control = self.match(X, assignment=assignment, score='propensity score', n_neighbors=n_neighbors)
+        treatments = self.estimate_treatments(treatments, control, outcome)
+        y_hat_treated = treatments[outcome].mean()
+        y_hat_control = treatments['control outcome'].mean()
+        return y_hat_treated - y_hat_control
+
+    def estimate_ATC(self, X, assignment, outcome, confounder_types, n_neighbors=5):
+        """
+        Assumes a 1 for the test assignment, 0 for the control assignment
+        :param X: The data set, with (at least) an assignment, set of confounders, and an outcome
+        :param assignment: A categorical variable (currently, 0 or 1) indicating test or control group, resp.
+        :param outcome: The outcome of interest. Should be real-valued or ordinal.
+        :param confounder_types: A dictionary of variable_name: variable_type pairs of strings, where
+        variable_type is in {'c', 'o', 'u'}, for 'continuous', 'ordered', and 'unordered'.
+        :param n_neighbors: An integer for the number of neighbors to use with k-nearest-neighbor matching
+        :return: a float representing the treatment effect
+        """
+        X[assignment] = (X[assignment] + 1) % 2
+        return -self.estimate_ATT(X, assignment, outcome, confounder_types, n_neighbors=n_neighbors)
+
+    def estimate_ATE(self, X, assignment, outcome, confounder_types, n_neighbors=5):
+        att = self.estimate_ATT(X, assignment, outcome, confounder_types, n_neighbors=n_neighbors)
+        atc = self.estimate_ATC(X, assignment, outcome, confounder_types, n_neighbors=n_neighbors)
+        return (atc+att)/2.
+
+
+class RegressionDiscontinuity(object):
+    def __init__(self, robust=True):
+        if robust:
+            self.model = RLM
+        else:
+            self.model = OLS
+
+    def estimate_ATE(self, X, continuous='continuous', outcome='outcome', cutoff=0., delta=0.1, indicator='D',
+                     intercept='intercept', store_result=False):
+        slice = X[X[continuous] < cutoff + delta]
+        slice = slice[slice[continuous] > cutoff - delta]
+        slice.loc[:,continuous] = slice[continuous] - cutoff
+        slice.loc[:, indicator] = (slice[continuous] > 0).apply(int)
+        slice.loc[:, indicator+'_'+continuous] = slice[indicator] * slice[continuous]
+        slice.loc[:, intercept] = 1.
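+        # local linear fit within the bandwidth: regress the outcome on the centered running variable, the
+        # treatment indicator, and their interaction; the coefficient on the indicator is the estimated jump at the cutoff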
+        model = self.model(slice[outcome], slice[[intercept, indicator+'_'+continuous, indicator, continuous]])
+        result = model.fit()
+        if store_result:
+            self.result = result
+        return result.params[indicator]
+
+    def check_assumptions(self):
+        pass
\ No newline at end of file
diff --git a/causality/inference/independence_tests/__init__.py b/causality/inference/independence_tests/__init__.py
index ccc18ee..e87366f 100644
--- a/causality/inference/independence_tests/__init__.py
+++ b/causality/inference/independence_tests/__init__.py
@@ -9,8 +9,8 @@
 DEFAULT_BINS = 2
 
 
-class RobustRegressionTest():
-    def __init__(self, y, x, z, data, alpha):
+class RobustRegressionTest(object):
+    def __init__(self, y, x, z, data, alpha, variable_types={}):
         self.regression = sm.RLM(data[y], data[x+z])
         self.result = self.regression.fit()
         self.coefficient = self.result.params[x][0]
@@ -30,8 +30,31 @@ def independent(self):
         else:
             return True
 
-class ChiSquaredTest():
-    def __init__(self, y, x, z, data, alpha):
+
+class GLMRegressionTest(object):
+    def __init__(self, y, x, z, data, alpha, variable_types={}):
+        self.regression = sm.GLM(data[y], data[x+z])
+        self.result = self.regression.fit()
+        self.coefficient = self.result.params[x][0]
+        confidence_interval = self.result.conf_int(alpha=alpha/2.)
+        self.upper = confidence_interval[1][x][0]
+        self.lower = confidence_interval[0][x][0]
+
+    def independent(self):
+        if self.coefficient > 0.:
+            if self.lower > 0.:
+                return False
+            else:
+                return True
+        else:
+            if self.upper < 0.:
+                return False
+            else:
+                return True
+
+
+class ChiSquaredTest(object):
+    def __init__(self, y, x, z, data, alpha, variable_types={}):
         self.alpha = alpha
         self.total_chi2 = 0.
         self.total_dof = 0
@@ -121,8 +144,8 @@ def bootstrap(self, X, function, lower_confidence=.05/2., upper_confidence=1. -
         bootstrap_samples = self.N
         samples = []
         for i in xrange(bootstrap_samples):
-            bs_indices = np.random.choice(xrange(len(X)), size=len(X), replace=True)
-            sampled_arr = pd.DataFrame(X.values[bs_indices], columns=X.columns)
+            #bs_indices = np.random.choice(xrange(len(X)), size=len(X), replace=True)
+            sampled_arr = X.sample(n=len(X),replace=True)#pd.DataFrame(X.values[bs_indices], columns=X.columns)
             samples.append(function(sampled_arr))
         samples = pd.DataFrame(samples)
         cis = samples.quantile([lower_confidence,upper_confidence])[0]
@@ -165,10 +188,10 @@ def generate_ci_sample(self):
         @pymc.stochastic(name='joint_sample')
         def ci_joint(value=self.mcmc_initialization):
             def logp(value):
-                xi = [value[i] for i in range(len(x))]
-                yi = [value[i+len(x)] for i in range(len(y))]
-                zi = [value[i+len(x)+len(y)] for i in range(len(z))]
-                if len(z) == 0:
+                xi = [value[i] for i in range(len(self.x))]
+                yi = [value[i+len(self.x)] for i in range(len(self.y))]
+                zi = [value[i+len(self.x)+len(self.y)] for i in range(len(self.z))]
+                if len(self.z) == 0:
                     log_px_given_z = np.log(self.densities[0].pdf(data_predict=xi))
                     log_py_given_z = np.log(self.densities[1].pdf(data_predict=yi))
                     log_pz = 0.
@@ -184,10 +207,10 @@ def logp(value):
         samples = self.N
         iterations = samples * thin + burn
         mcmc.sample(iter=iterations, burn=burn, thin=thin)
-        return pd.DataFrame(mcmc.trace('joint_sample')[:], columns=x+y+z)
+        return pd.DataFrame(mcmc.trace('joint_sample')[:], columns=self.x+self.y+self.z)
 
 
-class MutualInformationTest():
+class MutualInformationTest(object):
     """
     This is mostly from "Distribution of Mutual Information" by Marcus Hutter.  This MVP implementation doesn't
     contain priors, but will soon be adjusted to include the priors for n_xy.
@@ -301,8 +324,9 @@ def bootstrap(self, X, function, lower_confidence=.05/2., upper_confidence=1. -
         bootstrap_samples = self.N
         samples = []
         for i in xrange(bootstrap_samples):
-            bs_indices = np.random.choice(xrange(len(X)), size=len(X), replace=True)
-            sampled_arr = pd.DataFrame(X.values[bs_indices], columns=X.columns)
+            sampled_arr = X.sample(n=len(X),replace=True)
+            #bs_indices = np.random.choice(xrange(len(X)), size=len(X), replace=True)
+            #sampled_arr = pd.DataFrame(X.values[bs_indices], columns=X.columns)
             samples.append(function(sampled_arr))
         samples = pd.DataFrame(samples)
         cis = samples.quantile([lower_confidence,upper_confidence])[0]
@@ -345,10 +369,10 @@ def generate_ci_sample(self):
         @pymc.stochastic(name='joint_sample')
         def ci_joint(value=self.mcmc_initialization):
             def logp(value):
-                xi = [value[i] for i in range(len(x))]
-                yi = [value[i+len(x)] for i in range(len(y))]
-                zi = [value[i+len(x)+len(y)] for i in range(len(z))]
-                if len(z) == 0:
+                xi = [value[i] for i in range(len(self.x))]
+                yi = [value[i+len(self.x)] for i in range(len(self.y))]
+                zi = [value[i+len(self.x)+len(self.y)] for i in range(len(self.z))]
+                if len(self.z) == 0:
                     log_px_given_z = np.log(self.densities[0].pdf(data_predict=xi))
                     log_py_given_z = np.log(self.densities[1].pdf(data_predict=yi))
                     log_pz = 0.
@@ -364,7 +388,7 @@ def logp(value):
         samples = self.N
         iterations = samples * thin + burn
         mcmc.sample(iter=iterations, burn=burn, thin=thin)
-        return pd.DataFrame(mcmc.trace('joint_sample')[:], columns=x+y+z)
+        return pd.DataFrame(mcmc.trace('joint_sample')[:], columns=self.x+self.y+self.z)
 
 if __name__=="__main__":
     size = 500
diff --git a/causality/inference/search/__init__.py b/causality/inference/search/__init__.py
index 864adbe..89221e5 100644
--- a/causality/inference/search/__init__.py
+++ b/causality/inference/search/__init__.py
@@ -126,7 +126,7 @@ def _find_skeleton(self, data, variable_types):
                 z_candidates = list(set(x_neighbors + y_neighbors) - set([x,y]))
                 for z in itertools.combinations(z_candidates, N):
                     test = self.independence_test([y], [x], list(z),
-                                                  data, self.alpha)
+                                                  data, self.alpha, variable_types=variable_types)
                     if test.independent():
                         self._g.remove_edge(x,y)
                         self.separating_sets[(x,y)] = z
diff --git a/tests/unit/nonparametric.py b/tests/unit/nonparametric.py
index 80f5784..88367a7 100644
--- a/tests/unit/nonparametric.py
+++ b/tests/unit/nonparametric.py
@@ -2,11 +2,11 @@
 from scipy.integrate import nquad
 import numpy as np
 
-from causality.estimation.nonparametric import CausalEffect
+from causality.estimation.nonparametric import CausalEffect, BootstrapEstimator
 from tests.unit import TestAPI
 from tests.unit.settings import TOL
 
-
+"""
 class TestCausalEffect(TestAPI):
     def setUp(self):
         self.X = pd.read_csv('./tests/unit/data/X.csv')
@@ -186,3 +186,60 @@ def test_expectation_continuous(self):
         print "E(d | do(b = 600) ): ",p2
         #assert( abs( p - 0.25 ) < 0.05 )
         assert( abs( ( p2 - p1 ) / 200 - 5. < 0.01 ) )
+"""
+class TestBootstrapEstimator(TestAPI):
+    def setUp(self):
+        size = 1000
+        x1 = np.random.choice([0.1,0.15], size=size)
+        x2 = x1 + np.random.normal(size=size)
+        x3 = x2 + np.random.normal(size=size)
+        self.X_close = pd.DataFrame({'x1' : x1, 'x2' : x2, 'x3' : x3})
+
+        size = 3000
+        x1 = np.random.choice([0,1,2], size=size)
+        x2 = x1 + np.random.normal(size=size)
+        x3 = x2 + np.random.normal(size=size)
+        self.X_discrete = pd.DataFrame({'x1' : x1, 'x2' : x2, 'x3' : x3})
+
+
+    def test_estimate(self):
+        f = lambda X : X.groupby('x1')['x2'].mean()
+        est = BootstrapEstimator(f=f)
+        discr = est.estimate(self.X_discrete)
+        assert discr[0][0.025] <= 0. <= discr[0][0.975]
+        assert discr[1][0.025] <= 1. <= discr[1][0.975]
+        assert discr[2][0.025] <= 2. <= discr[2][0.975]
+
+    def test_found_winner(self):
+        size = 10
+        x1 = np.random.choice([0.1,0.5], size=size)
+        x2 = x1 + np.random.normal(size=size)
+        x3 = x2 + np.random.normal(size=size)
+        X_close = pd.DataFrame({'x1' : x1, 'x2' : x2, 'x3' : x3})
+
+        f = lambda X : X.groupby('x1')['x2'].mean()
+        est = BootstrapEstimator(f=f)
+        close_result = est.found_winner(X_close)
+        assert not close_result
+
+        size=5000
+        x1 = np.random.choice([0.1,0.5], size=size)
+        x2 = x1 + np.random.normal(size=size)
+        x3 = x2 + np.random.normal(size=size)
+        X_close = pd.DataFrame({'x1' : x1, 'x2' : x2, 'x3' : x3})
+
+        f = lambda X : X.groupby('x1')['x2'].mean()
+        est = BootstrapEstimator(f=f)
+        close_result = est.found_winner(X_close)
+        assert close_result
+
+    def test_chances_of_winning(self):
+        size = 1000
+        x1 = np.random.choice(['a','b'], size=size)
+        x2 = np.random.normal(size=size)
+        x3 = x2 + np.random.normal(size=size)
+        X_even = pd.DataFrame({'x1' : x1, 'x2' : x2, 'x3' : x3})
+        f = lambda X : X.groupby('x1')['x2'].mean()
+        est = BootstrapEstimator(f=f)
+        chances = est.chances_of_winning(X_even)
+        assert (0.95 * 0.5 <= chances).all()
+        assert (chances <= 1.05 * 0.5).all()
diff --git a/tests/unit/parametric.py b/tests/unit/parametric.py
index 423ca43..aadf0ff 100644
--- a/tests/unit/parametric.py
+++ b/tests/unit/parametric.py
@@ -1,14 +1,14 @@
 import pandas as pd
 import numpy as np
 
-from causality.estimation.parametric import DifferenceInDifferences
+from causality.estimation.parametric import DifferenceInDifferences, PropensityScoreMatching
 from tests.unit import TestAPI
 from tests.unit.settings import TOL
 
 
 class TestDID(TestAPI):
     def setUp(self):
-        SIZE = 2000
+        SIZE = 5000
         assignment = np.random.binomial(1,0.5, size=SIZE)
         pre_experiment = assignment + np.random.normal(-1, size=SIZE)
         start = assignment + np.random.normal(1, size=SIZE)
@@ -32,3 +32,61 @@ def test_did_estimator(self):
         lower, exp, upper = self.did.average_treatment_effect(self.X)
         assert 1.8 <= exp <= 2.2
         assert lower <= exp <= upper
+
+class TestPropScore(TestAPI):
+    def test_match(self):
+        matcher = PropensityScoreMatching()
+        X = pd.DataFrame({'assignment': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
+                          'propensity score': [1, 2, 3, 4, 5, 1, 2, 3, 4, 5]})
+
+        test, control = matcher.match(X, n_neighbors=3)
+
+        matches = test[test['propensity score'] == 2]['matches'].values[0][0]
+        assert set(control.iloc[matches]['propensity score'].values) == set([1, 2, 3])
+
+        matches = test[test['propensity score'] == 4]['matches'].values[0][0]
+        assert set(control.iloc[matches]['propensity score'].values) == set([3, 4, 5])
+
+    def test_score(self):
+        N = 100000
+        z1 = np.random.normal(size=N)
+        z2 = np.random.choice([0,1], size=N)
+        z3 = np.random.choice(['a','b','c'], size=N)
+        numeric_mapping = {'a' :3, 'b' :4, 'c' :5}
+        z3_numeric = [numeric_mapping[z3i] for z3i in z3]
+        p_assign = np.exp(z1 + z2 + z3_numeric) / (1. + np.exp(z1 + z2 + z3_numeric))
+        assignment = np.random.binomial(1, p_assign)
+        outcome = np.random.normal(assignment)
+        matcher = PropensityScoreMatching()
+        X = pd.DataFrame({'z1': z1, 'z2': z2, 'z3': z3, 'assignment': assignment, 'outcome': outcome})
+        confounder_types = {'z1': 'c', 'z2':'o', 'z3' : 'o'}
+        matcher.score(X, confounder_types, store_model_fit=True)
+        assert 0.9 <= matcher.model_fit.params['z1'] <= 1.1
+        assert 0.9 <= matcher.model_fit.params['z2'] <= 1.1
+        assert 0.0 <= matcher.model_fit.params['z3_b'] <= 2.0
+        assert 1.0 <= matcher.model_fit.params['z3_c'] <= 3.0
+        assert 2.0 <= matcher.model_fit.params['intercept'] <= 4.0
+
+    def test_at_estimators(self):
+        ates = []
+        atcs = []
+        atts = []
+        for i in range(100):
+            N = 1000
+            X = np.random.choice([0.25, 0.75], size=N)
+            X = pd.DataFrame(X, columns=['Z'])
+            X.loc[:, 'assignment'] = np.random.binomial(1, p=X['Z'])
+            X.loc[:, 'outcome'] = np.random.normal(3.1 * X['assignment'] + 2.0 * X['Z'])
+
+            matcher = PropensityScoreMatching()
+            att = matcher.estimate_ATT(X, 'assignment', 'outcome', {'Z': 'c'}, n_neighbors=10)
+            X.loc[:,'inverted assignment'] = (X['assignment'] + 1) % 2
+            atc = -matcher.estimate_ATT(X, 'inverted assignment', 'outcome', {'Z': 'c'}, n_neighbors=10)
+
+            ate = (att + atc) / 2.
+            atts.append(att)
+            atcs.append(atc)
+            ates.append(ate)
+        X = pd.DataFrame({'att': atts, 'ate': ates, 'atc': atcs})
+        assert (3.0 <= X.mean()).all()
+        assert (X.mean() <= 4.0).all()
\ No newline at end of file