feat: docs and minor changes

david26694 · Dec 27, 2023 · 92eacb8 · 92eacb8
1 parent a064435
commit 92eacb8
Show file tree

Hide file tree

Showing 7 changed files with 277 additions and 21 deletions.
diff --git a/cluster_experiments/experiment_analysis.py b/cluster_experiments/experiment_analysis.py
@@ -6,7 +6,8 @@
 import statsmodels.api as sm
 from pandas.api.types import is_numeric_dtype
 from scipy.stats import ttest_ind, ttest_rel
-from utils import HypothesisEntries
+
+from cluster_experiments.utils import HypothesisEntries
 
 
 class ExperimentAnalysis(ABC):
@@ -34,7 +35,7 @@ def __init__(
         treatment_col: str = "treatment",
         treatment: str = "B",
         covariates: Optional[List[str]] = None,
-        hypothesis: HypothesisEntries = HypothesisEntries.TWO_SIDED,
+        hypothesis: str = "two-sided",
     ):
         self.target_col = target_col
         self.treatment = treatment
@@ -122,16 +123,14 @@ def pvalue_based_on_hypothesis(self, model_result) -> float:
 
         """
         treatment_effect = model_result.params[self.treatment_col]
-        p_value_half = model_result.pvalues[self.treatment_col] / 2
+        p_value = model_result.pvalues[self.treatment_col]
 
-        if self.hypothesis == "less":
-            p_value = p_value_half if treatment_effect <= 0 else 1 - p_value_half
-        elif self.hypothesis == "greater":
-            p_value = p_value_half if treatment_effect >= 0 else 1 - p_value_half
-        elif self.hypothesis == "two-sided":
-            p_value = model_result.pvalues[self.treatment_col]
-
-        return p_value
+        if HypothesisEntries(self.hypothesis) == HypothesisEntries.LESS:
+            return p_value / 2 if treatment_effect <= 0 else 1 - p_value / 2
+        elif HypothesisEntries(self.hypothesis) == HypothesisEntries.GREATER:
+            return p_value / 2 if treatment_effect >= 0 else 1 - p_value / 2
+        elif HypothesisEntries(self.hypothesis) == HypothesisEntries.TWO_SIDED:
+            return p_value
 
     @classmethod
     def from_config(cls, config):
@@ -142,6 +141,7 @@ def from_config(cls, config):
             treatment_col=config.treatment_col,
             treatment=config.treatment,
             covariates=config.covariates,
+            hypothesis=config.hypothesis,
         )
 
 
@@ -216,6 +216,7 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
         results_gee = self.fit_gee(df)
         if verbose:
             print(results_gee.summary())
+
         p_value = self.pvalue_based_on_hypothesis(results_gee)
         return p_value
 
@@ -266,13 +267,15 @@ def __init__(
         treatment_col: str = "treatment",
         treatment: str = "B",
         covariates: Optional[List[str]] = None,
+        hypothesis: str = "two-sided",
     ):
         super().__init__(
             target_col=target_col,
             treatment_col=treatment_col,
             cluster_cols=cluster_cols,
             treatment=treatment,
             covariates=covariates,
+            hypothesis=hypothesis,
         )
         self.regressors = [self.treatment_col] + self.covariates
         self.formula = f"{self.target_col} ~ {' + '.join(self.regressors)}"
@@ -343,11 +346,13 @@ def __init__(
         target_col: str = "target",
         treatment_col: str = "treatment",
         treatment: str = "B",
+        hypothesis: str = "two-sided",
     ):
         self.target_col = target_col
         self.treatment = treatment
         self.treatment_col = treatment_col
         self.cluster_cols = cluster_cols
+        self.hypothesis = hypothesis
 
     def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
         """Returns the p-value of the analysis
@@ -377,6 +382,7 @@ def from_config(cls, config):
             target_col=config.target_col,
             treatment_col=config.treatment_col,
             treatment=config.treatment,
+            hypothesis=config.hypothesis,
         )
 
 
@@ -390,6 +396,7 @@ class PairedTTestClusteredAnalysis(ExperimentAnalysis):
         treatment_col: name of the column containing the treatment variable
         treatment: name of the treatment to use as the treated group
         strata_cols: list of index columns for paired t test. Should be a subset or equal to cluster_cols
+        hypothesis: one of "two-sided", "less", "greater"
 
     Usage:
 
@@ -418,12 +425,14 @@ def __init__(
         target_col: str = "target",
         treatment_col: str = "treatment",
         treatment: str = "B",
+        hypothesis: str = "two-sided",
     ):
         self.strata_cols = strata_cols
         self.target_col = target_col
         self.treatment = treatment
         self.treatment_col = treatment_col
         self.cluster_cols = cluster_cols
+        self.hypothesis = hypothesis
 
     def _preprocessing(self, df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
         df_grouped = df.groupby(
@@ -493,6 +502,7 @@ def from_config(cls, config):
             treatment_col=config.treatment_col,
             treatment=config.treatment,
             strata_cols=config.strata_cols,
+            hypothesis=config.hypothesis,
         )
 
 
@@ -573,6 +583,7 @@ def from_config(cls, config):
             treatment_col=config.treatment_col,
             treatment=config.treatment,
             covariates=config.covariates,
+            hypothesis=config.hypothesis,
         )
 
 
@@ -613,13 +624,15 @@ def __init__(
         treatment_col: str = "treatment",
         treatment: str = "B",
         covariates: Optional[List[str]] = None,
+        hypothesis: str = "two-sided",
     ):
         super().__init__(
             target_col=target_col,
             treatment_col=treatment_col,
             cluster_cols=cluster_cols,
             treatment=treatment,
             covariates=covariates,
+            hypothesis=hypothesis,
         )
         self.regressors = [self.treatment_col] + self.covariates
         self.formula = f"{self.target_col} ~ {' + '.join(self.regressors)}"
@@ -648,7 +661,6 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
             print(results_mlm.summary())
 
         p_value = self.pvalue_based_on_hypothesis(results_mlm)
-
         return p_value
 
     def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> float:

diff --git a/cluster_experiments/power_analysis.py b/cluster_experiments/power_analysis.py
@@ -103,6 +103,7 @@ def __init__(
         alpha: float = 0.05,
         features_cupac_model: Optional[List[str]] = None,
         seed: Optional[int] = None,
+        hypothesis: str = "two-sided",
     ):
         self.perturbator = perturbator
         self.splitter = splitter
@@ -113,6 +114,7 @@ def __init__(
         self.control = control
         self.treatment_col = treatment_col
         self.alpha = alpha
+        self.hypothesis = hypothesis
 
         self.cupac_handler = CupacHandler(
             cupac_model=cupac_model,

diff --git a/cluster_experiments/power_config.py b/cluster_experiments/power_config.py
@@ -128,6 +128,7 @@ class PowerConfig:
 
     # Analysis
     covariates: Optional[List[str]] = None
+    hypothesis: str = "two-sided"
 
     # Power analysis
     n_simulations: int = 100

diff --git a/docs/analysis_with_different_hypothesis.ipynb b/docs/analysis_with_different_hypothesis.ipynb
diff --git a/tests/analysis/test_hypothesis.py b/tests/analysis/test_hypothesis.py
@@ -0,0 +1,56 @@
+import pandas as pd
+import pytest
+
+from cluster_experiments.experiment_analysis import (
+    ClusteredOLSAnalysis,
+    GeeExperimentAnalysis,
+    MLMExperimentAnalysis,
+    OLSAnalysis,
+    TTestClusteredAnalysis,
+)
+from tests.examples import analysis_df, generate_clustered_data
+
+
+@pytest.mark.parametrize("hypothesis", ["less", "greater", "two-sided"])
+@pytest.mark.parametrize("analysis_class", [OLSAnalysis])
+def test_get_pvalue_hypothesis(analysis_class, hypothesis):
+    """Each parametrized hypothesis string yields a non-negative p-value."""
+    stacked = pd.concat([analysis_df] * 100)
+    assert analysis_class(hypothesis=hypothesis).get_pvalue(stacked) >= 0
+
+
+@pytest.mark.parametrize("hypothesis", ["less", "greater", "two-sided"])
+@pytest.mark.parametrize(
+    "analysis_class",
+    [
+        ClusteredOLSAnalysis,
+        GeeExperimentAnalysis,
+        TTestClusteredAnalysis,
+        MLMExperimentAnalysis,
+    ],
+)
+def test_get_pvalue_hypothesis_clustered(analysis_class, hypothesis):
+    """Clustered analyses return a non-negative p-value for each hypothesis."""
+    clustered_df = generate_clustered_data()
+    model = analysis_class(hypothesis=hypothesis, cluster_cols=["user_id"])
+    assert model.get_pvalue(clustered_df) >= 0
+
+
+@pytest.mark.parametrize("analysis_class", [OLSAnalysis])
+def test_get_pvalue_hypothesis_default(analysis_class):
+    """Omitting ``hypothesis`` still yields a non-negative p-value."""
+    stacked = pd.concat([analysis_df] * 100)
+    assert analysis_class().get_pvalue(stacked) >= 0
+
+
+@pytest.mark.parametrize("analysis_class", [OLSAnalysis])
+def test_get_pvalue_hypothesis_wrong_input(analysis_class):
+    """An unrecognised hypothesis string must fail with a ValueError."""
+    analysis_df_full = pd.concat([analysis_df for _ in range(100)])
+
+    # Both calls stay inside the block so the test passes whichever one validates
+    with pytest.raises(ValueError) as excinfo:
+        analyser = analysis_class(hypothesis="wrong_input")
+        analyser.get_pvalue(analysis_df_full)  # only the raise matters
+
+    assert "'wrong_input' is not a valid HypothesisEntries" in str(excinfo.value)
diff --git a/tests/analysis/test_ols_analysis.py b/tests/analysis/test_ols_analysis.py
@@ -1,5 +1,4 @@
 import pandas as pd
-import pytest
 
 from cluster_experiments.experiment_analysis import OLSAnalysis
 from tests.examples import analysis_df
@@ -17,10 +16,3 @@ def test_get_pvalue():
     analysis_df_full = pd.concat([analysis_df for _ in range(100)])
     analyser = OLSAnalysis()
     assert analyser.get_pvalue(analysis_df_full) >= 0
-
-
-@pytest.mark.parametrize("hypothesis", ["one_sided", "two_sided"])
-def test_get_pvalue_hypothesis(hypothesis):
-    analysis_df_full = pd.concat([analysis_df for _ in range(100)])
-    analyser = OLSAnalysis(hypothesis=hypothesis)
-    assert analyser.get_pvalue(analysis_df_full) >= 0
diff --git a/tests/examples.py b/tests/examples.py
@@ -59,7 +59,7 @@ def generate_non_clustered_data(N, n_users):
 )
 
 
-def generate_clustered_data():
+def generate_clustered_data() -> pd.DataFrame:
     analysis_df = pd.DataFrame(
         {
             "country_code": ["ES"] * 4 + ["IT"] * 4 + ["PL"] * 4 + ["RO"] * 4,