From bf5178f9555b232bc29b981ba6539fb9d6e63e45 Mon Sep 17 00:00:00 2001
From: Stewart Siu <ssiu@kcg.com>
Date: Tue, 19 Apr 2016 21:05:13 -0400
Subject: [PATCH] Small changes to allow IC to use MixedChiSquare; also a quick
 hack to ChiSquared test to allow zero contingency entry, so that it can be
 used for testing.

---
 .../inference/independence_tests/__init__.py    | 17 +++++++++++++----
 causality/inference/search/__init__.py          |  2 +-
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/causality/inference/independence_tests/__init__.py b/causality/inference/independence_tests/__init__.py
index 009afcc..6475547 100644
--- a/causality/inference/independence_tests/__init__.py
+++ b/causality/inference/independence_tests/__init__.py
@@ -9,7 +9,7 @@
 DEFAULT_BINS = 2
 
 class RobustRegressionTest():
-    def __init__(self, y, x, z, data, alpha):
+    def __init__(self, y, x, z, data, alpha, variable_types={}):
         self.regression = sm.RLM(data[y], data[x+z])
         self.result = self.regression.fit()
         self.coefficient = self.result.params[x][0]
@@ -45,13 +45,13 @@ def __init__(self, y, x, z, data, alpha):
             y_values = {yi : data.groupby(yi).groups.keys()}
 
             contingencies = itertools.product(*[z_values[zi] for zi in z])
-
             for contingency in contingencies:
                 contingency_table = tables.loc[contingency].values
                 try:
                     chi2, _, dof, _ = scipy.stats.chi2_contingency(contingency_table)
                 except ValueError:
-                    raise Exception("Not enough data or entries with 0 present: Chi^2 Test not applicable.")
+                    print "Potentially not enough data or entries with 0 present: Chi^2 Test not applicable."
+                    chi2, _, dof, _ = scipy.stats.chi2_contingency(contingency_table+1) #Hack that shrinks towards equal distribution
                 self.total_dof += dof
                 self.total_chi2 += chi2
         self.total_p = 1. - scipy.stats.chi2.cdf(self.total_chi2, self.total_dof)
@@ -81,6 +81,7 @@ def __init__(self, y, x, z, X, alpha, variable_types={}, burn=1000, thin=10, bin
         self.x = x
         self.y = y
         self.z = z
+        print '\nCreating indep test for x, y, z = ',x,y,z
         if len(X) > 300 or max(len(x+z),len(y+z)) >= 3:
             self.defaults=EstimatorSettings(n_jobs=4, efficient=True)
         else:
@@ -110,6 +111,8 @@ def discretize(self, X):
         self.discretized = []
         discretized_X = X.copy()
         for column, var_type in self.variable_types.items():
+            if column not in X:
+                continue
             if var_type == 'c':
                 bins = self.bins.get(column,DEFAULT_BINS)
                 discretized_X[column] = pd.qcut(X[column],bins,labels=False)
@@ -133,6 +136,8 @@ def bootstrap(self, X, function, lower_confidence=.05/2., upper_confidence=1. -
     def estimate_densities(self, x, y, z, X):
         p_x_given_z = self.estimate_cond_pdf(x, z, X)
         p_y_given_z = self.estimate_cond_pdf(y, z, X)
+        if len(z)==0:
+            return p_x_given_z, p_y_given_z
         p_z = self.estimate_cond_pdf(z, [], X)
         return p_x_given_z, p_y_given_z, p_z
 
@@ -142,9 +147,10 @@ def estimate_cond_pdf(self, x, z, X):
             bw = 'cv_ml'
         else:
             bw = 'cv_ml'#'normal_reference'
-
         # if conditioning on the empty set, return a pdf instead of cond pdf
         if len(z) == 0:
+            if len(x)==0:
+                raise Exception('x in P(x|z) cannot be null')
             return KDEMultivariate(X[x],
                                   var_type=''.join([self.variable_types[xi] for xi in x]),
                                   bw=bw,
@@ -158,6 +164,9 @@ def estimate_cond_pdf(self, x, z, X):
                                               defaults=self.defaults)
 
     def generate_ci_sample(self):
+        x = self.x
+        y = self.y
+        z = self.z
         @pymc.stochastic(name='joint_sample')
         def ci_joint(value=self.mcmc_initialization):
             def logp(value):
diff --git a/causality/inference/search/__init__.py b/causality/inference/search/__init__.py
index 1968493..53843eb 100644
--- a/causality/inference/search/__init__.py
+++ b/causality/inference/search/__init__.py
@@ -123,7 +123,7 @@ def _find_skeleton(self):
                 z_candidates = list(set(x_neighbors + y_neighbors) - set([x,y]))
                 for z in itertools.combinations(z_candidates, N):
                     test = self.independence_test([y], [x], list(z), 
-                        self.data, self.alpha)
+                        self.data, self.alpha, variable_types=self.variable_types)
                     if test.independent():
                         self._g.remove_edge(x,y)
                         self.separating_sets[(x,y)] = z