Add a temporal validation period to synthetic control and interrupted time series experiments #367

Open
wants to merge 11 commits into base: main
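For orientation, here is a minimal usage sketch of the feature this PR adds. It mirrors the interrupted time-series integration test further down (same dataset, formula, and model class); the validation_time keyword is the new argument, the exact dates are illustrative, and sampler settings are left at their defaults rather than the test's sample_kwargs.

import pandas as pd
import causalpy as cp

# Load the example interrupted time-series dataset shipped with CausalPy
df = (
    cp.load_data("its")
    .assign(date=lambda x: pd.to_datetime(x["date"]))
    .set_index("date")
)

# Fit on data before 2015, hold out 2015-2016 as a validation window,
# and treat everything from 2017 onwards as the post-intervention period.
result = cp.pymc_experiments.InterruptedTimeSeries(
    df,
    treatment_time=pd.to_datetime("2017-01-01"),
    validation_time=pd.to_datetime("2015-01-01"),
    formula="y ~ 1 + t + C(month)",
    model=cp.pymc_models.LinearRegression(),
)
result.summary()  # prints pre-intervention and validation Bayesian R^2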
77 changes: 61 additions & 16 deletions causalpy/pymc_experiments.py
@@ -133,6 +133,8 @@
A pandas dataframe
:param treatment_time:
The time when treatment occurred, should be in reference to the data index
:param validation_time:
Optional time to split the data into training and validation data sets
:param formula:
A statistical model formula
:param model:
@@ -160,6 +162,7 @@
>>> result.summary(round_to=1)
==================================Pre-Post Fit==================================
Formula: actual ~ 0 + a + g
Pre-intervention Bayesian $R^2$: 0.9 (std = 0.01)
Model coefficients:
a 0.6, 94% HDI [0.6, 0.6]
g 0.4, 94% HDI [0.4, 0.4]
@@ -171,17 +174,30 @@
data: pd.DataFrame,
treatment_time: Union[int, float, pd.Timestamp],
formula: str,
validation_time=None,
model=None,
**kwargs,
) -> None:
super().__init__(model=model, **kwargs)
self._input_validation(data, treatment_time)
self.treatment_time = treatment_time
self.validation_time = validation_time
# validate arguments
if self.validation_time is not None:
# check that validation time is less than treatment time
if self.validation_time >= self.treatment_time:
raise ValueError(
"Validation time must be less than the treatment time."
)
# set experiment type - usually done in subclasses
self.expt_type = "Pre-Post Fit"
# split data into pre and post intervention periods
self.datapre = data[data.index < self.treatment_time]
self.datapost = data[data.index >= self.treatment_time]
if self.validation_time is None:
self.datapre = data[data.index < self.treatment_time]
self.datapost = data[data.index >= self.treatment_time]
else:
self.datapre = data[data.index < self.validation_time]
self.datapost = data[data.index >= self.validation_time]

self.formula = formula

Expand All @@ -203,8 +219,22 @@
COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.pre_X.shape[0])}
self.model.fit(X=self.pre_X, y=self.pre_y, coords=COORDS)

# score the goodness of fit to the pre-intervention data
self.score = self.model.score(X=self.pre_X, y=self.pre_y)
if self.validation_time is None:
# We just have pre- and post-intervention data and no validation data, so score the pre-intervention data
self.score = self.model.score(X=self.pre_X, y=self.pre_y)

Review comment:

Do you think we could replace the validation R2 score with a Bayesian tail probability instead? The interpretation here would then be about how much the real mean during the validation period diverges from the posterior mean.

Collaborator Author reply:

Sure. Just looking into it so that I get it right - I just need the high-level algorithm because I've not heard that much about it. A Google search for "bayesian tail probability" shows very few hits. Is this a widely used approach? It doesn't matter if not, as long as it does what we want :)
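For what it's worth, one way such a check could look is to compare the observed validation-period mean against the distribution of posterior-predictive means over the validation window and report a tail probability. This is only a sketch of the idea being discussed, not code from this PR; the helper name bayesian_tail_prob, the array shapes, and the two-sided convention are assumptions.

import numpy as np


def bayesian_tail_prob(post_pred_draws: np.ndarray, y_validation: np.ndarray) -> float:
    """Two-sided tail probability of the observed validation-period mean under the
    posterior predictive; post_pred_draws has shape (n_draws, n_obs)."""
    draw_means = post_pred_draws.mean(axis=1)  # predictive mean for each posterior draw
    observed_mean = y_validation.mean()
    p_upper = np.mean(draw_means >= observed_mean)  # draws at least as large as observed
    p_lower = np.mean(draw_means <= observed_mean)  # draws at least as small as observed
    # small value => the observed validation mean sits in the tail of the predictive distribution
    return float(min(1.0, 2 * min(p_upper, p_lower)))


# Illustrative call with synthetic draws: 4000 draws over a 12-point validation window
rng = np.random.default_rng(0)
fake_draws = rng.normal(loc=10.0, scale=1.0, size=(4000, 12))
fake_y_val = rng.normal(loc=10.3, scale=1.0, size=12)
print(bayesian_tail_prob(fake_draws, fake_y_val))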

else:
# Score on the training data - before the validation time
self.datatrain = data[data.index < self.validation_time]
y, X = dmatrices(formula, self.datatrain)
self.score = self.model.score(X=X, y=y)
# Score on the validation data - after the validation time but
# before the treatment time
self.datavalidate = data[
(data.index >= self.validation_time)
& (data.index < self.treatment_time)
]
y, X = dmatrices(formula, self.datavalidate)
self.score_validation = self.model.score(X=X, y=y)
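# (Illustrative aside, not part of this diff.) With validation_time set, the model is
# fit and scored on index < validation_time, then scored again on the validation
# window validation_time <= index < treatment_time. For example, with an integer
# index 0..99, validation_time=50 and treatment_time=70 (the values used in the
# synthetic control test in this PR), training covers rows 0-49 and validation
# covers rows 50-69.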

# get the model predictions of the observed (pre-intervention) data
self.pre_pred = self.model.predict(X=self.pre_X)
@@ -275,13 +305,6 @@
handles.append(h)
labels.append("Causal impact")

ax[0].set(
title=f"""
Pre-intervention Bayesian $R^2$: {round_num(self.score.r2, round_to)}
(std = {round_num(self.score.r2_std, round_to)})
"""
)

# MIDDLE PLOT -----------------------------------------------
plot_xY(
self.datapre.index,
@@ -303,10 +326,10 @@
alpha=0.25,
label="Causal impact",
)
ax[1].set(title="Causal Impact")
ax[1].set(ylabel="Causal Impact")

# BOTTOM PLOT -----------------------------------------------
ax[2].set(title="Cumulative Causal Impact")
ax[2].set(ylabel="Cumulative Causal Impact")

plot_xY(
self.datapost.index,
self.post_impact_cumulative,
@@ -319,10 +342,17 @@
for i in [0, 1, 2]:
ax[i].axvline(
x=self.treatment_time,
ls="-",
lw=3,
color="r",
ls="--",
# lw=3,
color="k",
)
if self.validation_time is not None:
ax[i].axvline(

x=self.validation_time,
ls="--",
# lw=3,
color="k",
)

ax[0].legend(
handles=(h_tuple for h_tuple in handles),
@@ -342,6 +372,17 @@

print(f"{self.expt_type:=^80}")
print(f"Formula: {self.formula}")
# print goodness of fit scores
if self.validation_time is None:
print(
f"Pre-intervention Bayesian $R^2$: {round_num(self.score.r2, round_to)} (std = {round_num(self.score.r2_std, round_to)})"
)
else:
print(
f"Pre-intervention Bayesian $R^2$: {round_num(self.score.r2, round_to)} (std = {round_num(self.score.r2_std, round_to)})\n"
f"Validation Bayesian $R^2$: {round_num(self.score_validation.r2, round_to)} (std = {round_num(self.score_validation.r2_std, round_to)})"
)
# print coefficients
self.print_coefficients(round_to)


@@ -355,6 +396,8 @@
The time when treatment occurred, should be in reference to the data index
:param formula:
A statistical model formula
:param validation_time:
Optional time to split the data into training and validation data sets
:param model:
A PyMC model

@@ -394,6 +437,8 @@
The time when treatment occurred, should be in reference to the data index
:param formula:
A statistical model formula
:param validation_time:
Optional time to split the data into training and validation data sets
:param model:
A PyMC model

52 changes: 47 additions & 5 deletions causalpy/tests/test_integration_pymc_examples.py
@@ -317,14 +317,15 @@ def test_rkink_bandwidth():
result.summary()


@pytest.mark.parametrize("validation_time", [None, pd.to_datetime("2015-01-01")])
@pytest.mark.integration
def test_its():
def test_its(validation_time):
"""
Test Interrupted Time-Series experiment.

Loads data and checks:
1. data is a dataframe
2. pymc_experiments.SyntheticControl returns correct type
2. pymc_experiments.InterruptedTimeSeries returns correct type
3. the correct number of MCMC chains exists in the posterior inference data
4. the correct number of MCMC draws exists in the posterior inference data
"""
@@ -334,19 +335,41 @@
.set_index("date")
)
treatment_time = pd.to_datetime("2017-01-01")
result = cp.pymc_experiments.SyntheticControl(
result = cp.pymc_experiments.InterruptedTimeSeries(
df,
treatment_time,
validation_time=validation_time,
formula="y ~ 1 + t + C(month)",
model=cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs),
)
assert isinstance(df, pd.DataFrame)
assert isinstance(result, cp.pymc_experiments.SyntheticControl)
assert isinstance(result, cp.pymc_experiments.InterruptedTimeSeries)
assert len(result.idata.posterior.coords["chain"]) == sample_kwargs["chains"]
assert len(result.idata.posterior.coords["draw"]) == sample_kwargs["draws"]
result.summary()


def test_its_with_invalid_validation_time():
"""
Test that we get a ValueError when validation_time is greater than or equal to treatment_time.
"""
df = (
cp.load_data("its")
.assign(date=lambda x: pd.to_datetime(x["date"]))
.set_index("date")
)
treatment_time = pd.to_datetime("2017-01-01")
validation_time = pd.to_datetime("2018-01-01")
with pytest.raises(ValueError):
_ = cp.pymc_experiments.InterruptedTimeSeries(
df,
treatment_time,
validation_time=validation_time,
formula="y ~ 1 + t + C(month)",
model=cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs),
)


@pytest.mark.integration
def test_its_covid():
"""
@@ -379,7 +402,8 @@ def test_its_covid():


@pytest.mark.integration
def test_sc():
@pytest.mark.parametrize("validation_time", [None, 50])
def test_sc(validation_time):
"""
Test Synthetic Control experiment.

Expand All @@ -395,6 +419,7 @@ def test_sc():
result = cp.pymc_experiments.SyntheticControl(
df,
treatment_time,
validation_time=validation_time,
formula="actual ~ 0 + a + b + c + d + e + f + g",
model=cp.pymc_models.WeightedSumFitter(sample_kwargs=sample_kwargs),
)
@@ -405,6 +430,23 @@
result.summary()


def test_sc_with_invalid_validation_time():
"""
Test that we get a ValueError when validation_time is greater than or equal to treatment_time.
"""
df = cp.load_data("sc")
treatment_time = 70
validation_time = 80
with pytest.raises(ValueError):
_ = cp.pymc_experiments.SyntheticControl(
df,
treatment_time,
validation_time=validation_time,
formula="actual ~ 0 + a + b + c + d + e + f + g",
model=cp.pymc_models.WeightedSumFitter(sample_kwargs=sample_kwargs),
)


@pytest.mark.integration
def test_sc_brexit():
"""
Binary file modified docs/source/_static/classes.png
6 changes: 3 additions & 3 deletions docs/source/_static/interrogate_badge.svg
194 changes: 178 additions & 16 deletions docs/source/notebooks/its_pymc.ipynb

Large diffs are not rendered by default.

213 changes: 190 additions & 23 deletions docs/source/notebooks/sc_pymc.ipynb

Large diffs are not rendered by default.
