From bf5178f9555b232bc29b981ba6539fb9d6e63e45 Mon Sep 17 00:00:00 2001 From: Stewart Siu Date: Tue, 19 Apr 2016 21:05:13 -0400 Subject: [PATCH] Small changes to allow IC to use MixedChiSquare; also a quick hack to ChiSquared test to allow zero contingency entry, so that it can be used for testing. --- .../inference/independence_tests/__init__.py | 17 +++++++++++++---- causality/inference/search/__init__.py | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/causality/inference/independence_tests/__init__.py b/causality/inference/independence_tests/__init__.py index 009afcc..6475547 100644 --- a/causality/inference/independence_tests/__init__.py +++ b/causality/inference/independence_tests/__init__.py @@ -9,7 +9,7 @@ DEFAULT_BINS = 2 class RobustRegressionTest(): - def __init__(self, y, x, z, data, alpha): + def __init__(self, y, x, z, data, alpha, variable_types={}): self.regression = sm.RLM(data[y], data[x+z]) self.result = self.regression.fit() self.coefficient = self.result.params[x][0] @@ -45,13 +45,13 @@ def __init__(self, y, x, z, data, alpha): y_values = {yi : data.groupby(yi).groups.keys()} contingencies = itertools.product(*[z_values[zi] for zi in z]) - for contingency in contingencies: contingency_table = tables.loc[contingency].values try: chi2, _, dof, _ = scipy.stats.chi2_contingency(contingency_table) except ValueError: - raise Exception("Not enough data or entries with 0 present: Chi^2 Test not applicable.") + print "Potentially not enough data or entries with 0 present: Chi^2 Test not applicable." + chi2, _, dof, _ = scipy.stats.chi2_contingency(contingency_table+1) #Hack that shrinks towards equal distribution self.total_dof += dof self.total_chi2 += chi2 self.total_p = 1. - scipy.stats.chi2.cdf(self.total_chi2, self.total_dof) @@ -81,6 +81,7 @@ def __init__(self, y, x, z, X, alpha, variable_types={}, burn=1000, thin=10, bin self.x = x self.y = y self.z = z + print '\nCreating indep test for x, y, z = ',x,y,z if len(X) > 300 or max(len(x+z),len(y+z)) >= 3: self.defaults=EstimatorSettings(n_jobs=4, efficient=True) else: @@ -110,6 +111,8 @@ def discretize(self, X): self.discretized = [] discretized_X = X.copy() for column, var_type in self.variable_types.items(): + if column not in X: + continue if var_type == 'c': bins = self.bins.get(column,DEFAULT_BINS) discretized_X[column] = pd.qcut(X[column],bins,labels=False) @@ -133,6 +136,8 @@ def bootstrap(self, X, function, lower_confidence=.05/2., upper_confidence=1. - def estimate_densities(self, x, y, z, X): p_x_given_z = self.estimate_cond_pdf(x, z, X) p_y_given_z = self.estimate_cond_pdf(y, z, X) + if len(z)==0: + return p_x_given_z, p_y_given_z p_z = self.estimate_cond_pdf(z, [], X) return p_x_given_z, p_y_given_z, p_z @@ -142,9 +147,10 @@ def estimate_cond_pdf(self, x, z, X): bw = 'cv_ml' else: bw = 'cv_ml'#'normal_reference' - # if conditioning on the empty set, return a pdf instead of cond pdf if len(z) == 0: + if len(x)==0: + raise Exception('x in P(x|z) cannot be null') return KDEMultivariate(X[x], var_type=''.join([self.variable_types[xi] for xi in x]), bw=bw, @@ -158,6 +164,9 @@ def estimate_cond_pdf(self, x, z, X): defaults=self.defaults) def generate_ci_sample(self): + x = self.x + y = self.y + z = self.z @pymc.stochastic(name='joint_sample') def ci_joint(value=self.mcmc_initialization): def logp(value): diff --git a/causality/inference/search/__init__.py b/causality/inference/search/__init__.py index 1968493..53843eb 100644 --- a/causality/inference/search/__init__.py +++ b/causality/inference/search/__init__.py @@ -123,7 +123,7 @@ def _find_skeleton(self): z_candidates = list(set(x_neighbors + y_neighbors) - set([x,y])) for z in itertools.combinations(z_candidates, N): test = self.independence_test([y], [x], list(z), - self.data, self.alpha) + self.data, self.alpha, variable_types=self.variable_types) if test.independent(): self._g.remove_edge(x,y) self.separating_sets[(x,y)] = z