david26694 · david26694 · Jan 12, 2024 · Dec 20, 2023 · Dec 21, 2023 · Dec 21, 2023
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -12,7 +12,7 @@ repos:
     rev: 22.12.0
     hooks:
       - id: black
-        language_version: python3.8
+        language_version: python3.9
 - repo: https://github.com/charliermarsh/ruff-pre-commit
   rev: 'v0.0.261'
   hooks:

diff --git a/cluster_experiments/experiment_analysis.py b/cluster_experiments/experiment_analysis.py
@@ -6,6 +6,7 @@
 import statsmodels.api as sm
 from pandas.api.types import is_numeric_dtype
 from scipy.stats import ttest_ind, ttest_rel
+from utils import HypothesisEntries
 
 
 class ExperimentAnalysis(ABC):
@@ -33,12 +34,14 @@ def __init__(
         treatment_col: str = "treatment",
         treatment: str = "B",
         covariates: Optional[List[str]] = None,
+        hypothesis: HypothesisEntries = HypothesisEntries.TWO_SIDED,
     ):
         self.target_col = target_col
         self.treatment = treatment
         self.treatment_col = treatment_col
         self.cluster_cols = cluster_cols
         self.covariates = covariates or []
+        self.hypothesis = hypothesis
 
     def _get_cluster_column(self, df: pd.DataFrame) -> pd.Series:
         """Paste all strings of cluster_cols in one single column"""
@@ -111,6 +114,25 @@ def get_point_estimate(self, df: pd.DataFrame) -> float:
         self._data_checks(df=df)
         return self.analysis_point_estimate(df)
 
+    def pvalue_based_on_hypothesis(self, model_result) -> float:
+        """Returns the p-value of the analysis
+        Arguments:
+            model_result: statsmodels result object
+            verbose (Optional): bool, prints the regression summary if True
+
+        """
+        treatment_effect = model_result.params[self.treatment_col]
+        p_value_half = model_result.pvalues[self.treatment_col] / 2
+
+        if self.hypothesis == "less":
+            p_value = p_value_half if treatment_effect <= 0 else 1 - p_value_half
+        elif self.hypothesis == "greater":
+            p_value = p_value_half if treatment_effect >= 0 else 1 - p_value_half
+        elif self.hypothesis == "two-sided":
+            p_value = model_result.pvalues[self.treatment_col]
+
+        return p_value
+
     @classmethod
     def from_config(cls, config):
         """Creates an ExperimentAnalysis object from a PowerConfig object"""
@@ -160,13 +182,15 @@ def __init__(
         treatment_col: str = "treatment",
         treatment: str = "B",
         covariates: Optional[List[str]] = None,
+        hypothesis: str = "two-sided",
     ):
         super().__init__(
             target_col=target_col,
             treatment_col=treatment_col,
             cluster_cols=cluster_cols,
             treatment=treatment,
             covariates=covariates,
+            hypothesis=hypothesis,
         )
         self.regressors = [self.treatment_col] + self.covariates
         self.formula = f"{self.target_col} ~ {' + '.join(self.regressors)}"
@@ -192,7 +216,8 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
         results_gee = self.fit_gee(df)
         if verbose:
             print(results_gee.summary())
-        return results_gee.pvalues[self.treatment_col]
+        p_value = self.pvalue_based_on_hypothesis(results_gee)
+        return p_value
 
     def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> float:
         """Returns the point estimate of the analysis
@@ -265,7 +290,9 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
         )
         if verbose:
             print(results_ols.summary())
-        return results_ols.pvalues[self.treatment_col]
+
+        p_value = self.pvalue_based_on_hypothesis(results_ols)
+        return p_value
 
     def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> float:
         """Returns the point estimate of the analysis
@@ -337,7 +364,9 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
         control_data = df_grouped.query(f"{self.treatment_col} == 0")[self.target_col]
         assert len(treatment_data), "treatment data should have more than 1 cluster"
         assert len(control_data), "control data should have more than 1 cluster"
-        t_test_results = ttest_ind(treatment_data, control_data, equal_var=False)
+        t_test_results = ttest_ind(
+            treatment_data, control_data, equal_var=False, alternative=self.hypothesis
+        )
         return t_test_results.pvalue
 
     @classmethod
@@ -446,7 +475,9 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
 
         df_pivot = self._preprocessing(df=df)
 
-        t_test_results = ttest_rel(df_pivot.iloc[:, 0], df_pivot.iloc[:, 1])
+        t_test_results = ttest_rel(
+            df_pivot.iloc[:, 0], df_pivot.iloc[:, 1], alternative=self.hypothesis
+        )
 
         if verbose:
             print(f"paired t test results: \n {t_test_results} \n")
@@ -498,13 +529,15 @@ def __init__(
         treatment_col: str = "treatment",
         treatment: str = "B",
         covariates: Optional[List[str]] = None,
+        hypothesis: str = "two-sided",
     ):
         self.target_col = target_col
         self.treatment = treatment
         self.treatment_col = treatment_col
         self.covariates = covariates or []
         self.regressors = [self.treatment_col] + self.covariates
         self.formula = f"{self.target_col} ~ {' + '.join(self.regressors)}"
+        self.hypothesis = hypothesis
 
     def fit_ols(self, df: pd.DataFrame) -> sm.GEE:
         """Returns the fitted OLS model"""
@@ -519,7 +552,9 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
         results_ols = self.fit_ols(df=df)
         if verbose:
             print(results_ols.summary())
-        return results_ols.pvalues[self.treatment_col]
+
+        p_value = self.pvalue_based_on_hypothesis(results_ols)
+        return p_value
 
     def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> float:
         """Returns the point estimate of the analysis
@@ -612,7 +647,9 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
         if verbose:
             print(results_mlm.summary())
 
-        return results_mlm.pvalues[self.treatment_col]
+        p_value = self.pvalue_based_on_hypothesis(results_mlm)
+
-
-
+        return p_value
 
     def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> float:
         """Returns the point estimate of the analysis

diff --git a/cluster_experiments/utils.py b/cluster_experiments/utils.py
@@ -1,3 +1,6 @@
+from enum import Enum
+
+
 def _original_time_column(time_col: str) -> str:
     """
     Usage:
@@ -17,3 +20,9 @@ def _get_mapping_key(mapping, key):
         raise KeyError(
             f"Could not find {key = } in mapping. All options are the following: {list(mapping.keys())}"
         )
+
+
+class HypothesisEntries(Enum):
+    TWO_SIDED = "two-sided"
+    LESS = "less"
+    GREATER = "greater"
diff --git a/tests/analysis/test_ols_analysis.py b/tests/analysis/test_ols_analysis.py
@@ -1,4 +1,5 @@
 import pandas as pd
+import pytest
 
 from cluster_experiments.experiment_analysis import OLSAnalysis
 from tests.examples import analysis_df
@@ -16,3 +17,10 @@ def test_get_pvalue():
     analysis_df_full = pd.concat([analysis_df for _ in range(100)])
     analyser = OLSAnalysis()
     assert analyser.get_pvalue(analysis_df_full) >= 0
+
+
+@pytest.mark.parametrize("hypothesis", ["one_sided", "two_sided"])
+def test_get_pvalue_hypothesis(hypothesis):
+    analysis_df_full = pd.concat([analysis_df for _ in range(100)])
+    analyser = OLSAnalysis(hypothesis=hypothesis)
+    assert analyser.get_pvalue(analysis_df_full) >= 0