Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Small changes to allow IC to use MixedChiSquare; also a quick hack to… #15

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions causality/inference/independence_tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
DEFAULT_BINS = 2

class RobustRegressionTest():
def __init__(self, y, x, z, data, alpha):
def __init__(self, y, x, z, data, alpha, variable_types={}):
self.regression = sm.RLM(data[y], data[x+z])
self.result = self.regression.fit()
self.coefficient = self.result.params[x][0]
Expand Down Expand Up @@ -45,13 +45,13 @@ def __init__(self, y, x, z, data, alpha):
y_values = {yi : data.groupby(yi).groups.keys()}

contingencies = itertools.product(*[z_values[zi] for zi in z])

for contingency in contingencies:
contingency_table = tables.loc[contingency].values
try:
chi2, _, dof, _ = scipy.stats.chi2_contingency(contingency_table)
except ValueError:
raise Exception("Not enough data or entries with 0 present: Chi^2 Test not applicable.")
print "Potentially not enough data or entries with 0 present: Chi^2 Test not applicable."
chi2, _, dof, _ = scipy.stats.chi2_contingency(contingency_table+1) #Hack that shrinks towards equal distribution
self.total_dof += dof
self.total_chi2 += chi2
self.total_p = 1. - scipy.stats.chi2.cdf(self.total_chi2, self.total_dof)
Expand Down Expand Up @@ -81,6 +81,7 @@ def __init__(self, y, x, z, X, alpha, variable_types={}, burn=1000, thin=10, bin
self.x = x
self.y = y
self.z = z
print '\nCreating indep test for x, y, z = ',x,y,z
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @stewartsiu ! We shouldn't have print statements outside of exceptions!

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, line 32 needs a variable_types={} kwarg!

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@akelleh, Why line 32 (it is a class definition)? If you meant line 33, then wouldn't additional code be needed to handle variable_types?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you're right! line 33. No additional code needs to be added to handle it. It's required since the ICs.search() method passes the CIT a variable_types arg by default now

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@stewartsiu (so the modification on line 33 should be all we need!)

if len(X) > 300 or max(len(x+z),len(y+z)) >= 3:
self.defaults=EstimatorSettings(n_jobs=4, efficient=True)
else:
Expand Down Expand Up @@ -110,6 +111,8 @@ def discretize(self, X):
self.discretized = []
discretized_X = X.copy()
for column, var_type in self.variable_types.items():
if column not in X:
continue
if var_type == 'c':
bins = self.bins.get(column,DEFAULT_BINS)
discretized_X[column] = pd.qcut(X[column],bins,labels=False)
Expand All @@ -133,6 +136,8 @@ def bootstrap(self, X, function, lower_confidence=.05/2., upper_confidence=1. -
def estimate_densities(self, x, y, z, X):
    """Estimate the conditional densities needed for the independence test.

    Returns ``(P(x|z), P(y|z), P(z))`` when the conditioning set ``z`` is
    non-empty, and just ``(P(x|z), P(y|z))`` when ``z`` is empty (there is
    no marginal over an empty conditioning set to estimate).
    """
    px_given_z = self.estimate_cond_pdf(x, z, X)
    py_given_z = self.estimate_cond_pdf(y, z, X)
    if z:
        # Marginal density of z, obtained by conditioning on the empty set.
        pz = self.estimate_cond_pdf(z, [], X)
        return px_given_z, py_given_z, pz
    return px_given_z, py_given_z

Expand All @@ -142,9 +147,10 @@ def estimate_cond_pdf(self, x, z, X):
bw = 'cv_ml'
else:
bw = 'cv_ml'#'normal_reference'

# if conditioning on the empty set, return a pdf instead of cond pdf
if len(z) == 0:
if len(x)==0:
raise Exception('x in P(x|z) cannot be null')
return KDEMultivariate(X[x],
var_type=''.join([self.variable_types[xi] for xi in x]),
bw=bw,
Expand All @@ -158,6 +164,9 @@ def estimate_cond_pdf(self, x, z, X):
defaults=self.defaults)

def generate_ci_sample(self):
x = self.x
y = self.y
z = self.z
@pymc.stochastic(name='joint_sample')
def ci_joint(value=self.mcmc_initialization):
def logp(value):
Expand Down
2 changes: 1 addition & 1 deletion causality/inference/search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def _find_skeleton(self):
z_candidates = list(set(x_neighbors + y_neighbors) - set([x,y]))
for z in itertools.combinations(z_candidates, N):
test = self.independence_test([y], [x], list(z),
self.data, self.alpha)
self.data, self.alpha, variable_types=self.variable_types)
if test.independent():
self._g.remove_edge(x,y)
self.separating_sets[(x,y)] = z
Expand Down