-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Allow users to run one tailed experiments #137
Changes from 3 commits
ad1244b
28a1db6
a064435
92eacb8
a2d7359
bb51bc0
2a85493
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -6,6 +6,7 @@ | |||
import statsmodels.api as sm | ||||
from pandas.api.types import is_numeric_dtype | ||||
from scipy.stats import ttest_ind, ttest_rel | ||||
from utils import HypothesisEntries | ||||
Gabrielcidral1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
|
||||
|
||||
class ExperimentAnalysis(ABC): | ||||
|
@@ -33,12 +34,14 @@ def __init__( | |||
treatment_col: str = "treatment", | ||||
treatment: str = "B", | ||||
covariates: Optional[List[str]] = None, | ||||
hypothesis: HypothesisEntries = HypothesisEntries.TWO_SIDED, | ||||
Gabrielcidral1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
): | ||||
self.target_col = target_col | ||||
self.treatment = treatment | ||||
self.treatment_col = treatment_col | ||||
self.cluster_cols = cluster_cols | ||||
self.covariates = covariates or [] | ||||
self.hypothesis = hypothesis | ||||
Gabrielcidral1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
|
||||
def _get_cluster_column(self, df: pd.DataFrame) -> pd.Series: | ||||
"""Paste all strings of cluster_cols in one single column""" | ||||
|
@@ -111,6 +114,25 @@ def get_point_estimate(self, df: pd.DataFrame) -> float: | |||
self._data_checks(df=df) | ||||
return self.analysis_point_estimate(df) | ||||
|
||||
def pvalue_based_on_hypothesis(self, model_result) -> float: | ||||
Gabrielcidral1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
"""Returns the p-value of the analysis | ||||
Arguments: | ||||
model_result: statsmodels result object | ||||
verbose (Optional): bool, prints the regression summary if True | ||||
|
||||
""" | ||||
treatment_effect = model_result.params[self.treatment_col] | ||||
p_value_half = model_result.pvalues[self.treatment_col] / 2 | ||||
Gabrielcidral1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
|
||||
if self.hypothesis == "less": | ||||
p_value = p_value_half if treatment_effect <= 0 else 1 - p_value_half | ||||
Gabrielcidral1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
elif self.hypothesis == "greater": | ||||
p_value = p_value_half if treatment_effect >= 0 else 1 - p_value_half | ||||
elif self.hypothesis == "two-sided": | ||||
p_value = model_result.pvalues[self.treatment_col] | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if we do this, I understand we are not using enum, right? Then I would remove the Enum code There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also, an else clause is missing here raising an error |
||||
|
||||
return p_value | ||||
|
||||
david26694 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
@classmethod | ||||
def from_config(cls, config): | ||||
"""Creates an ExperimentAnalysis object from a PowerConfig object""" | ||||
|
@@ -160,13 +182,15 @@ def __init__( | |||
treatment_col: str = "treatment", | ||||
treatment: str = "B", | ||||
covariates: Optional[List[str]] = None, | ||||
hypothesis: str = "two-sided", | ||||
): | ||||
super().__init__( | ||||
target_col=target_col, | ||||
treatment_col=treatment_col, | ||||
cluster_cols=cluster_cols, | ||||
treatment=treatment, | ||||
covariates=covariates, | ||||
hypothesis=hypothesis, | ||||
) | ||||
self.regressors = [self.treatment_col] + self.covariates | ||||
self.formula = f"{self.target_col} ~ {' + '.join(self.regressors)}" | ||||
|
@@ -192,7 +216,8 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float: | |||
results_gee = self.fit_gee(df) | ||||
if verbose: | ||||
print(results_gee.summary()) | ||||
return results_gee.pvalues[self.treatment_col] | ||||
p_value = self.pvalue_based_on_hypothesis(results_gee) | ||||
Gabrielcidral1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
return p_value | ||||
|
||||
def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> float: | ||||
"""Returns the point estimate of the analysis | ||||
|
@@ -265,7 +290,9 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float: | |||
) | ||||
if verbose: | ||||
print(results_ols.summary()) | ||||
return results_ols.pvalues[self.treatment_col] | ||||
|
||||
p_value = self.pvalue_based_on_hypothesis(results_ols) | ||||
return p_value | ||||
|
||||
def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> float: | ||||
"""Returns the point estimate of the analysis | ||||
|
@@ -337,7 +364,9 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float: | |||
control_data = df_grouped.query(f"{self.treatment_col} == 0")[self.target_col] | ||||
assert len(treatment_data), "treatment data should have more than 1 cluster" | ||||
assert len(control_data), "control data should have more than 1 cluster" | ||||
t_test_results = ttest_ind(treatment_data, control_data, equal_var=False) | ||||
t_test_results = ttest_ind( | ||||
treatment_data, control_data, equal_var=False, alternative=self.hypothesis | ||||
) | ||||
return t_test_results.pvalue | ||||
|
||||
@classmethod | ||||
|
@@ -446,7 +475,9 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float: | |||
|
||||
df_pivot = self._preprocessing(df=df) | ||||
|
||||
t_test_results = ttest_rel(df_pivot.iloc[:, 0], df_pivot.iloc[:, 1]) | ||||
t_test_results = ttest_rel( | ||||
df_pivot.iloc[:, 0], df_pivot.iloc[:, 1], alternative=self.hypothesis | ||||
) | ||||
|
||||
if verbose: | ||||
print(f"paired t test results: \n {t_test_results} \n") | ||||
|
@@ -498,13 +529,15 @@ def __init__( | |||
treatment_col: str = "treatment", | ||||
treatment: str = "B", | ||||
covariates: Optional[List[str]] = None, | ||||
hypothesis: str = "two-sided", | ||||
): | ||||
self.target_col = target_col | ||||
self.treatment = treatment | ||||
self.treatment_col = treatment_col | ||||
self.covariates = covariates or [] | ||||
self.regressors = [self.treatment_col] + self.covariates | ||||
self.formula = f"{self.target_col} ~ {' + '.join(self.regressors)}" | ||||
self.hypothesis = hypothesis | ||||
|
||||
def fit_ols(self, df: pd.DataFrame) -> sm.GEE: | ||||
"""Returns the fitted OLS model""" | ||||
|
@@ -519,7 +552,9 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float: | |||
results_ols = self.fit_ols(df=df) | ||||
if verbose: | ||||
print(results_ols.summary()) | ||||
return results_ols.pvalues[self.treatment_col] | ||||
|
||||
p_value = self.pvalue_based_on_hypothesis(results_ols) | ||||
return p_value | ||||
|
||||
def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> float: | ||||
"""Returns the point estimate of the analysis | ||||
|
@@ -612,7 +647,9 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float: | |||
if verbose: | ||||
print(results_mlm.summary()) | ||||
|
||||
return results_mlm.pvalues[self.treatment_col] | ||||
p_value = self.pvalue_based_on_hypothesis(results_mlm) | ||||
|
||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
for consistency |
||||
return p_value | ||||
|
||||
def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> float: | ||||
"""Returns the point estimate of the analysis | ||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
import pandas as pd | ||
import pytest | ||
|
||
from cluster_experiments.experiment_analysis import OLSAnalysis | ||
from tests.examples import analysis_df | ||
|
@@ -16,3 +17,10 @@ def test_get_pvalue(): | |
analysis_df_full = pd.concat([analysis_df for _ in range(100)]) | ||
analyser = OLSAnalysis() | ||
assert analyser.get_pvalue(analysis_df_full) >= 0 | ||
|
||
|
||
@pytest.mark.parametrize("hypothesis", ["one_sided", "two_sided"]) | ||
def test_get_pvalue_hypothesis(hypothesis): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would add tests for the pvalue_based_on_hypothesis method too, since it has some logic |
||
analysis_df_full = pd.concat([analysis_df for _ in range(100)]) | ||
analyser = OLSAnalysis(hypothesis=hypothesis) | ||
assert analyser.get_pvalue(analysis_df_full) >= 0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is this for your local developement? or is it because of github actions?