Skip to content

Commit

Permalink
feat: docs and minor changes
Browse files Browse the repository at this point in the history
  • Loading branch information
Gabrielcidral1 committed Dec 27, 2023
1 parent a064435 commit 92eacb8
Show file tree
Hide file tree
Showing 7 changed files with 277 additions and 21 deletions.
36 changes: 24 additions & 12 deletions cluster_experiments/experiment_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
import statsmodels.api as sm
from pandas.api.types import is_numeric_dtype
from scipy.stats import ttest_ind, ttest_rel
from utils import HypothesisEntries

from cluster_experiments.utils import HypothesisEntries


class ExperimentAnalysis(ABC):
Expand Down Expand Up @@ -34,7 +35,7 @@ def __init__(
treatment_col: str = "treatment",
treatment: str = "B",
covariates: Optional[List[str]] = None,
hypothesis: HypothesisEntries = HypothesisEntries.TWO_SIDED,
hypothesis: str = "two-sided",
):
self.target_col = target_col
self.treatment = treatment
Expand Down Expand Up @@ -122,16 +123,14 @@ def pvalue_based_on_hypothesis(self, model_result) -> float:
"""
treatment_effect = model_result.params[self.treatment_col]
p_value_half = model_result.pvalues[self.treatment_col] / 2
p_value = model_result.pvalues[self.treatment_col]

if self.hypothesis == "less":
p_value = p_value_half if treatment_effect <= 0 else 1 - p_value_half
elif self.hypothesis == "greater":
p_value = p_value_half if treatment_effect >= 0 else 1 - p_value_half
elif self.hypothesis == "two-sided":
p_value = model_result.pvalues[self.treatment_col]

return p_value
if HypothesisEntries(self.hypothesis) == HypothesisEntries.LESS:
return p_value / 2 if treatment_effect <= 0 else 1 - p_value / 2
elif HypothesisEntries(self.hypothesis) == HypothesisEntries.GREATER:
return p_value / 2 if treatment_effect >= 0 else 1 - p_value / 2
elif HypothesisEntries(self.hypothesis) == HypothesisEntries.TWO_SIDED:
return p_value

@classmethod
def from_config(cls, config):
Expand All @@ -142,6 +141,7 @@ def from_config(cls, config):
treatment_col=config.treatment_col,
treatment=config.treatment,
covariates=config.covariates,
hypothesis=config.hypothesis,
)


Expand Down Expand Up @@ -216,6 +216,7 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
results_gee = self.fit_gee(df)
if verbose:
print(results_gee.summary())

p_value = self.pvalue_based_on_hypothesis(results_gee)
return p_value

Expand Down Expand Up @@ -266,13 +267,15 @@ def __init__(
treatment_col: str = "treatment",
treatment: str = "B",
covariates: Optional[List[str]] = None,
hypothesis: str = "two-sided",
):
super().__init__(
target_col=target_col,
treatment_col=treatment_col,
cluster_cols=cluster_cols,
treatment=treatment,
covariates=covariates,
hypothesis=hypothesis,
)
self.regressors = [self.treatment_col] + self.covariates
self.formula = f"{self.target_col} ~ {' + '.join(self.regressors)}"
Expand Down Expand Up @@ -343,11 +346,13 @@ def __init__(
target_col: str = "target",
treatment_col: str = "treatment",
treatment: str = "B",
hypothesis: str = "two-sided",
):
self.target_col = target_col
self.treatment = treatment
self.treatment_col = treatment_col
self.cluster_cols = cluster_cols
self.hypothesis = hypothesis

def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
"""Returns the p-value of the analysis
Expand Down Expand Up @@ -377,6 +382,7 @@ def from_config(cls, config):
target_col=config.target_col,
treatment_col=config.treatment_col,
treatment=config.treatment,
hypothesis=config.hypothesis,
)


Expand All @@ -390,6 +396,7 @@ class PairedTTestClusteredAnalysis(ExperimentAnalysis):
treatment_col: name of the column containing the treatment variable
treatment: name of the treatment to use as the treated group
strata_cols: list of index columns for paired t test. Should be a subset or equal to cluster_cols
hypothesis: one of "two-sided", "less", "greater"
Usage:
Expand Down Expand Up @@ -418,12 +425,14 @@ def __init__(
target_col: str = "target",
treatment_col: str = "treatment",
treatment: str = "B",
hypothesis: str = "two-sided",
):
self.strata_cols = strata_cols
self.target_col = target_col
self.treatment = treatment
self.treatment_col = treatment_col
self.cluster_cols = cluster_cols
self.hypothesis = hypothesis

def _preprocessing(self, df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
df_grouped = df.groupby(
Expand Down Expand Up @@ -493,6 +502,7 @@ def from_config(cls, config):
treatment_col=config.treatment_col,
treatment=config.treatment,
strata_cols=config.strata_cols,
hypothesis=config.hypothesis,
)


Expand Down Expand Up @@ -573,6 +583,7 @@ def from_config(cls, config):
treatment_col=config.treatment_col,
treatment=config.treatment,
covariates=config.covariates,
hypothesis=config.hypothesis,
)


Expand Down Expand Up @@ -613,13 +624,15 @@ def __init__(
treatment_col: str = "treatment",
treatment: str = "B",
covariates: Optional[List[str]] = None,
hypothesis: str = "two-sided",
):
super().__init__(
target_col=target_col,
treatment_col=treatment_col,
cluster_cols=cluster_cols,
treatment=treatment,
covariates=covariates,
hypothesis=hypothesis,
)
self.regressors = [self.treatment_col] + self.covariates
self.formula = f"{self.target_col} ~ {' + '.join(self.regressors)}"
Expand Down Expand Up @@ -648,7 +661,6 @@ def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
print(results_mlm.summary())

p_value = self.pvalue_based_on_hypothesis(results_mlm)

return p_value

def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> float:
Expand Down
2 changes: 2 additions & 0 deletions cluster_experiments/power_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ def __init__(
alpha: float = 0.05,
features_cupac_model: Optional[List[str]] = None,
seed: Optional[int] = None,
hypothesis: str = "two-sided",
):
self.perturbator = perturbator
self.splitter = splitter
Expand All @@ -113,6 +114,7 @@ def __init__(
self.control = control
self.treatment_col = treatment_col
self.alpha = alpha
self.hypothesis = hypothesis

self.cupac_handler = CupacHandler(
cupac_model=cupac_model,
Expand Down
1 change: 1 addition & 0 deletions cluster_experiments/power_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ class PowerConfig:

# Analysis
covariates: Optional[List[str]] = None
hypothesis: str = "two-sided"

# Power analysis
n_simulations: int = 100
Expand Down
193 changes: 193 additions & 0 deletions docs/analysis_with_different_hypothesis.ipynb

Large diffs are not rendered by default.

56 changes: 56 additions & 0 deletions tests/analysis/test_hypothesis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import pandas as pd
import pytest

from cluster_experiments.experiment_analysis import (
ClusteredOLSAnalysis,
GeeExperimentAnalysis,
MLMExperimentAnalysis,
OLSAnalysis,
TTestClusteredAnalysis,
)
from tests.examples import analysis_df, generate_clustered_data


@pytest.mark.parametrize("hypothesis", ["less", "greater", "two-sided"])
@pytest.mark.parametrize("analysis_class", [OLSAnalysis])
def test_get_pvalue_hypothesis(analysis_class, hypothesis):
    """Check that each supported hypothesis yields a non-negative p-value.

    The example frame is stacked 100 times before being analysed, and the
    analysis is built with the parametrized ``hypothesis`` value.
    """
    # Same list of 100 references to the example frame, built by repetition.
    repeated_frames = [analysis_df] * 100
    stacked_df = pd.concat(repeated_frames)

    p_value = analysis_class(hypothesis=hypothesis).get_pvalue(stacked_df)

    assert p_value >= 0


@pytest.mark.parametrize("hypothesis", ["less", "greater", "two-sided"])
@pytest.mark.parametrize(
    "analysis_class",
    [
        ClusteredOLSAnalysis,
        GeeExperimentAnalysis,
        TTestClusteredAnalysis,
        MLMExperimentAnalysis,
    ],
)
def test_get_pvalue_hypothesis_clustered(analysis_class, hypothesis):
    """Check the clustered analyses against every supported hypothesis.

    Each analysis class is built with ``user_id`` as its only cluster column
    and must return a non-negative p-value on the generated clustered data.
    """
    clustered_df = generate_clustered_data()
    analyser = analysis_class(cluster_cols=["user_id"], hypothesis=hypothesis)

    p_value = analyser.get_pvalue(clustered_df)

    assert p_value >= 0


@pytest.mark.parametrize("analysis_class", [OLSAnalysis])
def test_get_pvalue_hypothesis_default(analysis_class):
    """Check that an analysis built without ``hypothesis`` still works.

    No ``hypothesis`` argument is passed, so the class default is used; the
    resulting p-value on the stacked example frame must be non-negative.
    """
    # Same list of 100 references to the example frame, built by repetition.
    repeated_frames = [analysis_df] * 100
    stacked_df = pd.concat(repeated_frames)

    p_value = analysis_class().get_pvalue(stacked_df)

    assert p_value >= 0


@pytest.mark.parametrize("analysis_class", [OLSAnalysis])
def test_get_pvalue_hypothesis_wrong_input(analysis_class):
    """Check that an unsupported hypothesis string raises ``ValueError``.

    The error message must mention that ``'wrong_input'`` is not a valid
    ``HypothesisEntries`` value.
    """
    analysis_df_full = pd.concat([analysis_df for _ in range(100)])

    # ``match`` applies a regex search to the string form of the exception,
    # so the message check lives in the same statement as the type check.
    with pytest.raises(
        ValueError, match="'wrong_input' is not a valid HypothesisEntries"
    ):
        # Construction stays inside the block so the test holds regardless of
        # whether the value is rejected at construction time or only when the
        # p-value is computed.
        analyser = analysis_class(hypothesis="wrong_input")
        # The call is expected to raise, so its result is not compared to
        # anything (the original trailing ``>= 0`` was a discarded expression).
        analyser.get_pvalue(analysis_df_full)
8 changes: 0 additions & 8 deletions tests/analysis/test_ols_analysis.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import pandas as pd
import pytest

from cluster_experiments.experiment_analysis import OLSAnalysis
from tests.examples import analysis_df
Expand All @@ -17,10 +16,3 @@ def test_get_pvalue():
analysis_df_full = pd.concat([analysis_df for _ in range(100)])
analyser = OLSAnalysis()
assert analyser.get_pvalue(analysis_df_full) >= 0


@pytest.mark.parametrize("hypothesis", ["one_sided", "two_sided"])
def test_get_pvalue_hypothesis(hypothesis):
analysis_df_full = pd.concat([analysis_df for _ in range(100)])
analyser = OLSAnalysis(hypothesis=hypothesis)
assert analyser.get_pvalue(analysis_df_full) >= 0
2 changes: 1 addition & 1 deletion tests/examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def generate_non_clustered_data(N, n_users):
)


def generate_clustered_data():
def generate_clustered_data() -> pd.DataFrame:
analysis_df = pd.DataFrame(
{
"country_code": ["ES"] * 4 + ["IT"] * 4 + ["PL"] * 4 + ["RO"] * 4,
Expand Down

0 comments on commit 92eacb8

Please sign in to comment.