david26694 · david26694 · Jan 12, 2024 · Dec 20, 2023 · Dec 21, 2023 · Dec 21, 2023
diff --git a/cluster_experiments/experiment_analysis.py b/cluster_experiments/experiment_analysis.py
@@ -7,6 +7,8 @@
 from pandas.api.types import is_numeric_dtype
 from scipy.stats import ttest_ind, ttest_rel
 
+from cluster_experiments.utils import HypothesisEntries
+
 
 class ExperimentAnalysis(ABC):
     """
@@ -23,6 +25,7 @@ class ExperimentAnalysis(ABC):
         treatment_col: name of the column containing the treatment variable
         treatment: name of the treatment to use as the treated group
         covariates: list of columns to use as covariates
+        hypothesis: one of "two-sided", "less", "greater" indicating the alternative hypothesis
 
     """
 
@@ -33,12 +36,14 @@ def __init__(
         treatment_col: str = "treatment",
         treatment: str = "B",
         covariates: Optional[List[str]] = None,
+        hypothesis: str = "two-sided",
     ):
         self.target_col = target_col
         self.treatment = treatment
         self.treatment_col = treatment_col
         self.cluster_cols = cluster_cols
         self.covariates = covariates or []
+        self.hypothesis = hypothesis
 
     def _get_cluster_column(self, df: pd.DataFrame) -> pd.Series:
         """Paste all strings of cluster_cols in one single column"""
@@ -111,6 +116,26 @@ def get_point_estimate(self, df: pd.DataFrame) -> float:
         self._data_checks(df=df)
         return self.analysis_point_estimate(df)
 
+    def pvalue_based_on_hypothesis(
+        self, model_result
+    ) -> float:  # TODO: add a type hint for the statsmodels result
+        """Returns the p-value of the treatment coefficient under self.hypothesis
+        Arguments:
+            model_result: statsmodels result object, indexed by self.treatment_col
+
+        "two-sided" keeps the p-value; "less" ("greater") returns p / 2 when the
+        estimate is <= 0 (>= 0) and 1 - p / 2 otherwise. Invalid values raise ValueError
+        """
+        treatment_effect = model_result.params[self.treatment_col]
+        p_value = model_result.pvalues[self.treatment_col]
+        if HypothesisEntries(self.hypothesis) == HypothesisEntries.LESS:
+            return p_value / 2 if treatment_effect <= 0 else 1 - p_value / 2
+        if HypothesisEntries(self.hypothesis) == HypothesisEntries.GREATER:
+            return p_value / 2 if treatment_effect >= 0 else 1 - p_value / 2
+        if HypothesisEntries(self.hypothesis) == HypothesisEntries.TWO_SIDED:
+            return p_value
+        raise ValueError(f"{self.hypothesis} is not a valid HypothesisEntries")
+
     @classmethod
     def from_config(cls, config):
         """Creates an ExperimentAnalysis object from a PowerConfig object"""
@@ -120,6 +145,7 @@ def from_config(cls, config):
             treatment_col=config.treatment_col,
             treatment=config.treatment,
             covariates=config.covariates,
+            hypothesis=config.hypothesis,
         )
 
 
@@ -133,6 +159,7 @@ class GeeExperimentAnalysis(ExperimentAnalysis):
         treatment_col: name of the column containing the treatment variable
         treatment: name of the treatment to use as the treated group
         covariates: list of columns to use as covariates
+        hypothesis: one of "two-sided", "less", "greater" indicating the alternative hypothesis
 
     Usage:
 
@@ -160,13 +187,15 @@ def __init__(
         treatment_col: str = "treatment",
         treatment: str = "B",
         covariates: Optional[List[str]] = None,
+        hypothesis: str = "two-sided",
     ):
         super().__init__(
             target_col=target_col,
             treatment_col=treatment_col,
             cluster_cols=cluster_cols,
             treatment=treatment,
             covariates=covariates,
+            hypothesis=hypothesis,
         )
         self.regressors = [self.treatment_col] + self.covariates
         self.formula = f"{self.target_col} ~ {' + '.join(self.regressors)}"
@@ -192,7 +221,9 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
         results_gee = self.fit_gee(df)
         if verbose:
             print(results_gee.summary())
-        return results_gee.pvalues[self.treatment_col]
+
+        p_value = self.pvalue_based_on_hypothesis(results_gee)
+        return p_value
 
     def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> float:
         """Returns the point estimate of the analysis
@@ -214,6 +245,7 @@ class ClusteredOLSAnalysis(ExperimentAnalysis):
         treatment_col: name of the column containing the treatment variable
         treatment: name of the treatment to use as the treated group
         covariates: list of columns to use as covariates
+        hypothesis: one of "two-sided", "less", "greater" indicating the alternative hypothesis
 
     Usage:
 
@@ -241,13 +273,15 @@ def __init__(
         treatment_col: str = "treatment",
         treatment: str = "B",
         covariates: Optional[List[str]] = None,
+        hypothesis: str = "two-sided",
     ):
         super().__init__(
             target_col=target_col,
             treatment_col=treatment_col,
             cluster_cols=cluster_cols,
             treatment=treatment,
             covariates=covariates,
+            hypothesis=hypothesis,
         )
         self.regressors = [self.treatment_col] + self.covariates
         self.formula = f"{self.target_col} ~ {' + '.join(self.regressors)}"
@@ -265,7 +299,9 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
         )
         if verbose:
             print(results_ols.summary())
-        return results_ols.pvalues[self.treatment_col]
+
+        p_value = self.pvalue_based_on_hypothesis(results_ols)
+        return p_value
 
     def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> float:
         """Returns the point estimate of the analysis
@@ -290,6 +326,7 @@ class TTestClusteredAnalysis(ExperimentAnalysis):
         target_col: name of the column containing the variable to measure
         treatment_col: name of the column containing the treatment variable
         treatment: name of the treatment to use as the treated group
+        hypothesis: one of "two-sided", "less", "greater" indicating the alternative hypothesis
 
     Usage:
 
@@ -316,11 +353,13 @@ def __init__(
         target_col: str = "target",
         treatment_col: str = "treatment",
         treatment: str = "B",
+        hypothesis: str = "two-sided",
     ):
         self.target_col = target_col
         self.treatment = treatment
         self.treatment_col = treatment_col
         self.cluster_cols = cluster_cols
+        self.hypothesis = hypothesis
 
     def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
         """Returns the p-value of the analysis
@@ -337,7 +376,9 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
         control_data = df_grouped.query(f"{self.treatment_col} == 0")[self.target_col]
         assert len(treatment_data), "treatment data should have more than 1 cluster"
         assert len(control_data), "control data should have more than 1 cluster"
-        t_test_results = ttest_ind(treatment_data, control_data, equal_var=False)
+        t_test_results = ttest_ind(
+            treatment_data, control_data, equal_var=False, alternative=self.hypothesis
+        )
         return t_test_results.pvalue
 
     @classmethod
@@ -348,6 +389,7 @@ def from_config(cls, config):
             target_col=config.target_col,
             treatment_col=config.treatment_col,
             treatment=config.treatment,
+            hypothesis=config.hypothesis,
         )
 
 
@@ -361,6 +403,7 @@ class PairedTTestClusteredAnalysis(ExperimentAnalysis):
         treatment_col: name of the column containing the treatment variable
         treatment: name of the treatment to use as the treated group
         strata_cols: list of index columns for paired t test. Should be a subset or equal to cluster_cols
+        hypothesis: one of "two-sided", "less", "greater" indicating the alternative hypothesis
 
     Usage:
 
@@ -389,12 +432,14 @@ def __init__(
         target_col: str = "target",
         treatment_col: str = "treatment",
         treatment: str = "B",
+        hypothesis: str = "two-sided",
     ):
         self.strata_cols = strata_cols
         self.target_col = target_col
         self.treatment = treatment
         self.treatment_col = treatment_col
         self.cluster_cols = cluster_cols
+        self.hypothesis = hypothesis
 
     def _preprocessing(self, df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
         df_grouped = df.groupby(
@@ -446,7 +491,9 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
 
         df_pivot = self._preprocessing(df=df)
 
-        t_test_results = ttest_rel(df_pivot.iloc[:, 0], df_pivot.iloc[:, 1])
+        t_test_results = ttest_rel(
+            df_pivot.iloc[:, 0], df_pivot.iloc[:, 1], alternative=self.hypothesis
+        )
 
         if verbose:
             print(f"paired t test results: \n {t_test_results} \n")
@@ -462,6 +509,7 @@ def from_config(cls, config):
             treatment_col=config.treatment_col,
             treatment=config.treatment,
             strata_cols=config.strata_cols,
+            hypothesis=config.hypothesis,
         )
 
 
@@ -474,6 +522,7 @@ class OLSAnalysis(ExperimentAnalysis):
         treatment_col: name of the column containing the treatment variable
         treatment: name of the treatment to use as the treated group
         covariates: list of columns to use as covariates
+        hypothesis: one of "two-sided", "less", "greater" indicating the alternative hypothesis
 
     Usage:
 
@@ -498,13 +547,15 @@ def __init__(
         treatment_col: str = "treatment",
         treatment: str = "B",
         covariates: Optional[List[str]] = None,
+        hypothesis: str = "two-sided",
     ):
         self.target_col = target_col
         self.treatment = treatment
         self.treatment_col = treatment_col
         self.covariates = covariates or []
         self.regressors = [self.treatment_col] + self.covariates
         self.formula = f"{self.target_col} ~ {' + '.join(self.regressors)}"
+        self.hypothesis = hypothesis
 
     def fit_ols(self, df: pd.DataFrame) -> sm.GEE:
         """Returns the fitted OLS model"""
@@ -519,7 +570,9 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
         results_ols = self.fit_ols(df=df)
         if verbose:
             print(results_ols.summary())
-        return results_ols.pvalues[self.treatment_col]
+
+        p_value = self.pvalue_based_on_hypothesis(results_ols)
+        return p_value
 
     def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> float:
         """Returns the point estimate of the analysis
@@ -538,6 +591,7 @@ def from_config(cls, config):
             treatment_col=config.treatment_col,
             treatment=config.treatment,
             covariates=config.covariates,
+            hypothesis=config.hypothesis,
         )
 
 
@@ -551,6 +605,7 @@ class MLMExperimentAnalysis(ExperimentAnalysis):
         treatment_col: name of the column containing the treatment variable
         treatment: name of the treatment to use as the treated group
         covariates: list of columns to use as covariates
+        hypothesis: one of "two-sided", "less", "greater" indicating the alternative hypothesis
 
     Usage:
 
@@ -578,13 +633,15 @@ def __init__(
         treatment_col: str = "treatment",
         treatment: str = "B",
         covariates: Optional[List[str]] = None,
+        hypothesis: str = "two-sided",
     ):
         super().__init__(
             target_col=target_col,
             treatment_col=treatment_col,
             cluster_cols=cluster_cols,
             treatment=treatment,
             covariates=covariates,
+            hypothesis=hypothesis,
         )
         self.regressors = [self.treatment_col] + self.covariates
         self.formula = f"{self.target_col} ~ {' + '.join(self.regressors)}"
@@ -612,7 +669,8 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
         if verbose:
             print(results_mlm.summary())
 
-        return results_mlm.pvalues[self.treatment_col]
+        p_value = self.pvalue_based_on_hypothesis(results_mlm)
+        return p_value
 
     def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> float:
         """Returns the point estimate of the analysis

diff --git a/cluster_experiments/power_analysis.py b/cluster_experiments/power_analysis.py
@@ -103,6 +103,7 @@ def __init__(
         alpha: float = 0.05,
         features_cupac_model: Optional[List[str]] = None,
         seed: Optional[int] = None,
+        hypothesis: str = "two-sided",
     ):
         self.perturbator = perturbator
         self.splitter = splitter
@@ -113,6 +114,7 @@ def __init__(
         self.control = control
         self.treatment_col = treatment_col
         self.alpha = alpha
+        self.hypothesis = hypothesis
 
         self.cupac_handler = CupacHandler(
             cupac_model=cupac_model,

diff --git a/cluster_experiments/power_config.py b/cluster_experiments/power_config.py
@@ -128,6 +128,7 @@ class PowerConfig:
 
     # Analysis
     covariates: Optional[List[str]] = None
+    hypothesis: str = "two-sided"
 
     # Power analysis
     n_simulations: int = 100

diff --git a/cluster_experiments/utils.py b/cluster_experiments/utils.py
@@ -1,3 +1,6 @@
+from enum import Enum
+
+
 def _original_time_column(time_col: str) -> str:
     """
     Usage:
@@ -17,3 +20,9 @@ def _get_mapping_key(mapping, key):
         raise KeyError(
             f"Could not find {key = } in mapping. All options are the following: {list(mapping.keys())}"
         )
+
+
+class HypothesisEntries(Enum):  # accepted values of the `hypothesis` argument
+    TWO_SIDED = "two-sided"  # alternative: effect differs from zero in either direction
+    LESS = "less"  # alternative: treatment effect is negative
+    GREATER = "greater"  # alternative: treatment effect is positive