From 209513b892942d560fabda87fd505478ee66f83d Mon Sep 17 00:00:00 2001 From: dhavalpatel624624 Date: Fri, 22 Nov 2024 11:34:11 -0800 Subject: [PATCH] Added 4 new graphs to Pareto Optimizer + bug fixes (#1159) * added formatting with black-formatter * Added 4 new graphs to Pareto Optimizer + bug fixes * corrected notebooks * need to set cluster = true if you want bootstrap graph * Added Pareto utils call to src notebook --------- Co-authored-by: Dhaval Patel --- .../robyn/data/entities/hyperparameters.py | 4 +- python/src/robyn/data/entities/mmmdata.py | 20 ++ .../modeling/clustering/cluster_builder.py | 2 +- .../src/robyn/modeling/feature_engineering.py | 2 + .../robyn/modeling/pareto/pareto_optimizer.py | 2 +- .../src/robyn/reporting/onepager_reporting.py | 13 +- python/src/robyn/robyn.py | 19 +- .../e2e_test/tutorial3_modeling_compare.ipynb | 2 +- .../e2e_test/tutorial4_pareto_e2e_test.ipynb | 29 +- python/src/robyn/tutorials/tutorial1.ipynb | 8 +- .../src/robyn/tutorials/tutorial1_src.ipynb | 25 +- .../tutorial2_feature_engineering.ipynb | 2 +- .../robyn/tutorials/tutorial3_modeling.ipynb | 2 +- .../tutorials/tutorial5_calibration.ipynb | 2 +- .../tutorials/tutorial7_clustering.ipynb | 5 +- .../robyn/visualization/cluster_visualizer.py | 320 +++++++++--------- .../robyn/visualization/pareto_visualizer.py | 280 ++++++++++++++- 17 files changed, 533 insertions(+), 204 deletions(-) diff --git a/python/src/robyn/data/entities/hyperparameters.py b/python/src/robyn/data/entities/hyperparameters.py index 2bca9881a..a2712dc95 100644 --- a/python/src/robyn/data/entities/hyperparameters.py +++ b/python/src/robyn/data/entities/hyperparameters.py @@ -51,8 +51,8 @@ class Hyperparameters: hyperparameters (Dict[str, Hyperparameter]): A dictionary of hyperparameters where the key is the channel name and the value is a Hyperparameter object. """ - hyperparameters: Dict[str, ChannelHyperparameters] = (None,) - adstock: AdstockType = (None,) # Mandatory. User provides this. 
+ hyperparameters: Dict[str, ChannelHyperparameters] = field(default_factory=dict) + adstock: AdstockType = AdstockType.GEOMETRIC # Mandatory. User provides this. lambda_: float = 0.0 # User does not provide this. Model run calculates it. train_size: List[float] = field(default_factory=lambda: [0.5, 0.8]) hyper_bound_list_updated: Dict[str, List[float]] = field(default_factory=dict) diff --git a/python/src/robyn/data/entities/mmmdata.py b/python/src/robyn/data/entities/mmmdata.py index 6d90b4d96..21c0b757e 100644 --- a/python/src/robyn/data/entities/mmmdata.py +++ b/python/src/robyn/data/entities/mmmdata.py @@ -249,3 +249,23 @@ def calculate_rolling_window_indices(self) -> None: - self.mmmdata_spec.rolling_window_start_which + 1 ) + + def set_default_factor_vars(self) -> None: + """ + Set the default factor variables. + """ + factor_variables = self.mmmdata_spec.factor_vars + selected_columns = self.data[self.mmmdata_spec.context_vars] + non_numeric_columns = ~selected_columns.applymap( + lambda x: isinstance(x, (int, float)) + ).all() + if non_numeric_columns.any(): + non_factor_columns = non_numeric_columns[ + ~non_numeric_columns.index.isin(factor_variables or []) + ] + non_factor_columns = non_factor_columns[non_factor_columns] + if len(non_factor_columns) > 0: + factor_variables = ( + factor_variables or [] + ) + non_factor_columns.index.tolist() + self.mmmdata_spec.factor_vars = factor_variables diff --git a/python/src/robyn/modeling/clustering/cluster_builder.py b/python/src/robyn/modeling/clustering/cluster_builder.py index 0758e8bd6..a0a39e9be 100644 --- a/python/src/robyn/modeling/clustering/cluster_builder.py +++ b/python/src/robyn/modeling/clustering/cluster_builder.py @@ -219,7 +219,7 @@ def _calculate_confidence_intervals( cluster_collect = [] self.logger.debug(f"Processing {config.k_clusters} clusters") - for j in range(1, config.k_clusters + 1): + for j in range(0, config.k_clusters): df_outcome = df_clusters_outcome[df_clusters_outcome["cluster"] 
== j] if len(df_outcome["sol_id"].unique()) < 3: self.logger.warning( diff --git a/python/src/robyn/modeling/feature_engineering.py b/python/src/robyn/modeling/feature_engineering.py index 2df5a256c..e2e5dddb0 100644 --- a/python/src/robyn/modeling/feature_engineering.py +++ b/python/src/robyn/modeling/feature_engineering.py @@ -392,6 +392,8 @@ def _prophet_decomposition(self, dt_mod: pd.DataFrame) -> pd.DataFrame: dt_regressors["ds"] = pd.to_datetime(dt_regressors["ds"]) # Handle factor variables + if self.mmm_data.mmmdata_spec.factor_vars is None: + self.mmm_data.set_default_factor_vars() factor_vars = self.mmm_data.mmmdata_spec.factor_vars if factor_vars: # Create dummy variables but keep original diff --git a/python/src/robyn/modeling/pareto/pareto_optimizer.py b/python/src/robyn/modeling/pareto/pareto_optimizer.py index f4f50199e..8a853f877 100644 --- a/python/src/robyn/modeling/pareto/pareto_optimizer.py +++ b/python/src/robyn/modeling/pareto/pareto_optimizer.py @@ -253,7 +253,7 @@ def optimize( self.logger.info("Pareto optimization completed successfully") return ParetoResult( pareto_solutions=plotting_data["pareto_solutions"], - pareto_fronts=pareto_fronts, + pareto_fronts=max(pareto_data.pareto_fronts), result_hyp_param=aggregated_data["result_hyp_param"], result_calibration=aggregated_data["result_calibration"], x_decomp_agg=pareto_data.x_decomp_agg, diff --git a/python/src/robyn/reporting/onepager_reporting.py b/python/src/robyn/reporting/onepager_reporting.py index 057fe6915..e09ef27e0 100644 --- a/python/src/robyn/reporting/onepager_reporting.py +++ b/python/src/robyn/reporting/onepager_reporting.py @@ -10,7 +10,7 @@ from robyn.modeling.entities.pareto_result import ParetoResult from robyn.modeling.entities.clustering_results import ClusteredResult -from robyn.data.entities.hyperparameters import AdstockType +from robyn.data.entities.hyperparameters import Hyperparameters from robyn.data.entities.mmmdata import MMMData from robyn.data.entities.enums 
import PlotType @@ -27,13 +27,13 @@ def __init__( self, pareto_result: ParetoResult, clustered_result: Optional[ClusteredResult] = None, - adstock: Optional[AdstockType] = None, + hyperparameter: Optional[Hyperparameters] = None, mmm_data: Optional[MMMData] = None, holidays_data: Optional[HolidaysData] = None, ): self.pareto_result = pareto_result self.clustered_result = clustered_result - self.adstock = adstock + self.hyperparameter = hyperparameter self.mmm_data = mmm_data self.holidays_data = holidays_data @@ -270,9 +270,12 @@ def _generate_solution_plots( # Initialize visualizers pareto_viz = ( ParetoVisualizer( - self.pareto_result, self.adstock, self.mmm_data, self.holidays_data + self.pareto_result, + self.mmm_data, + self.holidays_data, + self.hyperparameter, ) - if self.adstock and self.holidays_data + if self.hyperparameter and self.holidays_data else None ) cluster_viz = ( diff --git a/python/src/robyn/robyn.py b/python/src/robyn/robyn.py index cbc9da2d5..b0462dfe5 100644 --- a/python/src/robyn/robyn.py +++ b/python/src/robyn/robyn.py @@ -4,9 +4,7 @@ from logging.handlers import RotatingFileHandler from pathlib import Path from typing import Dict, Optional, List -import numpy as np -from robyn.modeling.entities.clustering_results import ClusteredResult -from robyn.data.entities.enums import AdstockType, PlotType +import copy from robyn.data.entities.mmmdata import MMMData from robyn.data.entities.holidays_data import HolidaysData from robyn.data.entities.hyperparameters import Hyperparameters @@ -196,7 +194,6 @@ def train_models( try: logger.info("Training models") trials_config = trials_config or TrialsConfig(trials=5, iterations=2000) - model_executor = ModelExecutor( mmmdata=self.mmm_data, holidays_data=self.holidays_data, @@ -261,6 +258,7 @@ def evaluate_models( holidays_data=self.holidays_data, ) self.pareto_result = pareto_optimizer.optimize(**pareto_config) + unfiltered_pareto_result = copy.deepcopy(self.pareto_result) # Optional clustering 
is_clustered = False @@ -274,10 +272,13 @@ def evaluate_models( ) if display_plots or export_plots: pareto_visualizer = ParetoVisualizer( - self.pareto_result, - self.hyperparameters.adstock, - self.mmm_data, - self.holidays_data, + pareto_result=self.pareto_result, + mmm_data=self.mmm_data, + holiday_data=self.holidays_data, + hyperparameter=self.hyperparameters, + featurized_mmm_data=self.featurized_mmm_data, + unfiltered_pareto_result=unfiltered_pareto_result, + model_outputs=self.model_outputs, ) pareto_visualizer.plot_all(display_plots, self.working_dir) if self.cluster_result: @@ -412,7 +413,7 @@ def generate_one_pager(self, solution_id: Optional[str] = None) -> None: onepager = OnePager( pareto_result=self.pareto_result, clustered_result=self.cluster_result, - adstock=self.hyperparameters.adstock, + hyperparameter=self.hyperparameters, mmm_data=self.mmm_data, holidays_data=self.holidays_data, ) diff --git a/python/src/robyn/tutorials/e2e_test/tutorial3_modeling_compare.ipynb b/python/src/robyn/tutorials/e2e_test/tutorial3_modeling_compare.ipynb index ab4209143..c63cf53ca 100644 --- a/python/src/robyn/tutorials/e2e_test/tutorial3_modeling_compare.ipynb +++ b/python/src/robyn/tutorials/e2e_test/tutorial3_modeling_compare.ipynb @@ -144,7 +144,7 @@ "outputs": [], "source": [ "hyperparameters = Hyperparameters(\n", - " {\n", + " hyperparameters={\n", " \"facebook_S\": ChannelHyperparameters(\n", " alphas=[0.5, 3],\n", " gammas=[0.3, 1],\n", diff --git a/python/src/robyn/tutorials/e2e_test/tutorial4_pareto_e2e_test.ipynb b/python/src/robyn/tutorials/e2e_test/tutorial4_pareto_e2e_test.ipynb index 333e78362..6c6b0168f 100644 --- a/python/src/robyn/tutorials/e2e_test/tutorial4_pareto_e2e_test.ipynb +++ b/python/src/robyn/tutorials/e2e_test/tutorial4_pareto_e2e_test.ipynb @@ -291,17 +291,32 @@ "source": [ "from robyn.data.entities.enums import AdstockType\n", "from robyn.reporting.onepager_reporting import OnePager\n", + "from robyn.visualization.pareto_visualizer 
import ParetoVisualizer\n", "\n", - "visualizer = OnePager(\n", - " pareto_result=filtered_pareto_results,\n", - " clustered_result=cluster_results,\n", - " adstock=AdstockType.GEOMETRIC,\n", - " mmm_data=mmm_data,\n", - " holidays_data=holidays_data,\n", - ")\n", + "\n", + "visualizer = OnePager(pareto_result=filtered_pareto_results, clustered_result=cluster_results, hyperparameter=hyperparameters, mmm_data=mmm_data, holidays_data=holidays_data)\n", "visualizer.generate_one_pager(top_pareto=True)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "visualizer = ParetoVisualizer(\n", + " pareto_result=filtered_pareto_results, \n", + " hyperparameter=hyperparameters, \n", + " mmm_data=mmm_data, \n", + " holiday_data=holidays_data,\n", + " featurized_mmm_data=featurized_mmm_data,\n", + " unfiltered_pareto_result=pareto_result,\n", + " model_outputs=output_models)\n", + "\n", + "visualizer.plot_all(True)" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/python/src/robyn/tutorials/tutorial1.ipynb b/python/src/robyn/tutorials/tutorial1.ipynb index 2b0e73e57..4291651fc 100644 --- a/python/src/robyn/tutorials/tutorial1.ipynb +++ b/python/src/robyn/tutorials/tutorial1.ipynb @@ -83,7 +83,7 @@ }, { "cell_type": "markdown", - "id": "0a1cce14", + "id": "eefbc5da", "metadata": {}, "source": [ "## 2.2. 
Initialize Robyn\n", @@ -126,7 +126,7 @@ "\n", "# Create Hyperparameters\n", "hyperparameters = Hyperparameters(\n", - " {\n", + " hyperparameters={\n", " \"facebook_S\": ChannelHyperparameters(\n", " alphas=[0.5, 3],\n", " gammas=[0.3, 1],\n", @@ -258,10 +258,8 @@ "outputs": [], "source": [ "%matplotlib inline\n", - "\n", "from robyn.modeling.clustering.clustering_config import ClusterBy, ClusteringConfig\n", "\n", - "\n", "configs = ClusteringConfig(\n", " dep_var_type= DependentVarType(mmm_data.mmmdata_spec.dep_var_type),\n", " cluster_by = ClusterBy.HYPERPARAMETERS,\n", @@ -388,7 +386,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.11.5" } }, "nbformat": 4, diff --git a/python/src/robyn/tutorials/tutorial1_src.ipynb b/python/src/robyn/tutorials/tutorial1_src.ipynb index 9e06a764f..84e7b1db4 100644 --- a/python/src/robyn/tutorials/tutorial1_src.ipynb +++ b/python/src/robyn/tutorials/tutorial1_src.ipynb @@ -76,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -180,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -333,6 +333,25 @@ "print(cluster_results)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reestablish Pareto Results" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "from robyn.modeling.pareto.pareto_utils import ParetoUtils\n", + "\n", + "utils = ParetoUtils()\n", + "pareto_result = utils.process_pareto_clustered_results(pareto_result, clustered_result=cluster_results, ran_cluster=True, ran_calibration= False)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -495,7 +514,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.11.5" } }, "nbformat": 4, 
diff --git a/python/src/robyn/tutorials/tutorial2_feature_engineering.ipynb b/python/src/robyn/tutorials/tutorial2_feature_engineering.ipynb index f210081c1..e50c5e787 100644 --- a/python/src/robyn/tutorials/tutorial2_feature_engineering.ipynb +++ b/python/src/robyn/tutorials/tutorial2_feature_engineering.ipynb @@ -313,7 +313,7 @@ ], "source": [ "hyperparameters = Hyperparameters(\n", - " {\n", + " hyperparameters={\n", " \"facebook_S\": ChannelHyperparameters(\n", " alphas=[0.5, 3],\n", " gammas=[0.3, 1],\n", diff --git a/python/src/robyn/tutorials/tutorial3_modeling.ipynb b/python/src/robyn/tutorials/tutorial3_modeling.ipynb index eec62ee22..9d551c167 100644 --- a/python/src/robyn/tutorials/tutorial3_modeling.ipynb +++ b/python/src/robyn/tutorials/tutorial3_modeling.ipynb @@ -301,7 +301,7 @@ ], "source": [ "hyperparameters = Hyperparameters(\n", - " {\n", + " hyperparameters={\n", " \"facebook_S\": ChannelHyperparameters(\n", " alphas=[0.5, 3],\n", " gammas=[0.3, 1],\n", diff --git a/python/src/robyn/tutorials/tutorial5_calibration.ipynb b/python/src/robyn/tutorials/tutorial5_calibration.ipynb index d7cb09f79..f7d812b75 100644 --- a/python/src/robyn/tutorials/tutorial5_calibration.ipynb +++ b/python/src/robyn/tutorials/tutorial5_calibration.ipynb @@ -403,7 +403,7 @@ ], "source": [ "hyperparameters = Hyperparameters(\n", - " {\n", + " hyperparameters={\n", " \"facebook_S\": ChannelHyperparameters(\n", " alphas=[0.5, 3],\n", " gammas=[0.3, 1],\n", diff --git a/python/src/robyn/tutorials/tutorial7_clustering.ipynb b/python/src/robyn/tutorials/tutorial7_clustering.ipynb index ae50ceaaf..487592e83 100644 --- a/python/src/robyn/tutorials/tutorial7_clustering.ipynb +++ b/python/src/robyn/tutorials/tutorial7_clustering.ipynb @@ -137,7 +137,7 @@ "from robyn.modeling.pareto.pareto_utils import ParetoUtils\n", "\n", "utils = ParetoUtils()\n", - "new_pareto_results = utils.process_pareto_clustered_results(pareto_result, clustered_result=cluster_results, ran_cluster=False, 
ran_calibration= False)" + "new_pareto_results = utils.process_pareto_clustered_results(pareto_result, clustered_result=cluster_results, ran_cluster=True, ran_calibration= False)" ] }, { @@ -147,9 +147,10 @@ "outputs": [], "source": [ "from robyn.data.entities.enums import AdstockType \n", + "from robyn.data.entities.hyperparameters import Hyperparameters\n", "from robyn.reporting.onepager_reporting import OnePager\n", "\n", - "visualizer = OnePager(pareto_result=new_pareto_results, clustered_result=cluster_results, adstock=AdstockType.GEOMETRIC, mmm_data=mmm_data, holidays_data=holidays_data)\n", + "visualizer = OnePager(pareto_result=new_pareto_results, clustered_result=cluster_results, hyperparameter=Hyperparameters(adstock=AdstockType.GEOMETRIC), mmm_data=mmm_data, holidays_data=holidays_data)\n", "visualizer.generate_one_pager(top_pareto=True)" ] } diff --git a/python/src/robyn/visualization/cluster_visualizer.py b/python/src/robyn/visualization/cluster_visualizer.py index 38696659b..66da3dc04 100644 --- a/python/src/robyn/visualization/cluster_visualizer.py +++ b/python/src/robyn/visualization/cluster_visualizer.py @@ -418,165 +418,167 @@ def plot_dimensionality_reduction(self) -> None: raise NotImplementedError def generate_bootstrap_confidence( - self, solution_id: str, ax: Optional[plt.Axes] = None - ) -> Optional[plt.Figure]: - """Generate error bar plot showing bootstrapped ROI/CPA confidence intervals.""" - logger.debug("Starting generation of bootstrap confidence plot") - if not hasattr(self, "pareto_result"): - raise ValueError("Pareto result not initialized") - - if solution_id not in self.pareto_result.plot_data_collect: - raise ValueError(f"Invalid solution ID: {solution_id}") - - x_decomp_agg = self.pareto_result.x_decomp_agg[ - self.pareto_result.x_decomp_agg["sol_id"] == solution_id - ] - - if "ci_low" not in x_decomp_agg.columns: - if ax is None: - fig, ax = plt.subplots(figsize=(10, 6)) - ax.text(0.5, 0.5, "No bootstrap results", ha="center", 
va="center") - return fig - else: - ax.text(0.5, 0.5, "No bootstrap results", ha="center", va="center") - return None - - bootstrap_data = x_decomp_agg[ - (~x_decomp_agg["ci_low"].isna()) - & (~x_decomp_agg["ci_up"].isna()) - & (~x_decomp_agg["boot_mean"].isna()) - & (x_decomp_agg["sol_id"] == solution_id) - ][["rn", "sol_id", "boot_mean", "ci_low", "ci_up"]] - - if bootstrap_data.empty: - if ax is None: - fig, ax = plt.subplots(figsize=(16, 10)) - ax.text( - 0.5, - 0.5, - "No valid bootstrap results after filtering", - ha="center", - va="center", - ) - return fig - else: - ax.text( - 0.5, - 0.5, - "No valid bootstrap results after filtering", - ha="center", - va="center", - ) - return None - - # Sort data alphabetically by rn - bootstrap_data = bootstrap_data.sort_values('rn', ascending=True) - - if ax is None: - fig, ax = plt.subplots(figsize=(12, min(8, 3 + len(bootstrap_data) * 0.3))) - else: - fig = None - - ax.set_facecolor("white") - - metric_type = ( - "ROAS" - if ( - self.mmm_data - and hasattr(self.mmm_data.mmmdata_spec, "dep_var_type") - and self.mmm_data.mmmdata_spec.dep_var_type == DependentVarType.REVENUE - ) - else "CPA" - ) - - y_pos = range(len(bootstrap_data)) - - ax.errorbar( - x=bootstrap_data["boot_mean"], - y=y_pos, - xerr=[ - (bootstrap_data["boot_mean"] - bootstrap_data["ci_low"]), - (bootstrap_data["ci_up"] - bootstrap_data["boot_mean"]), - ], - fmt="o", - color="black", - capsize=3, - markersize=3, - elinewidth=1, - zorder=3, - ) - - for i, row in enumerate(bootstrap_data.itertuples()): - ax.text( - row.boot_mean, - i, - f"{float(f'{row.boot_mean:.2g}')}", - va="bottom", - ha="center", - fontsize=10, - color="black", - ) - - ax.text( - row.ci_low, - i, - f"{float(f'{row.ci_low:.2g}')}", - va="center", - ha="right", - fontsize=9, - color="black", - ) - - ax.text( - row.ci_up, - i, - f"{float(f'{row.ci_up:.2g}')}", - va="center", - ha="left", - fontsize=9, - color="black", - ) - - ax.set_yticks(y_pos) - 
ax.set_yticklabels(bootstrap_data["rn"], fontsize=9) - - ax.spines["right"].set_visible(False) - ax.spines["top"].set_visible(False) - - if metric_type == "ROAS": - ax.axvline(x=1, color="gray", linestyle="--", alpha=0.5, zorder=2) - - cluster_txt = "" - if self.clustered_result is not None: - temp2 = self.clustered_result.cluster_data - - if "n" not in temp2.columns: - temp2 = ( - temp2.groupby("cluster") - .apply(lambda x: x.assign(n=len(x))) - .reset_index(drop=True) - ) - temp2 = temp2[temp2["sol_id"] == solution_id] - if not temp2.empty: - cluster_txt = f" {temp2['cluster'].iloc[0]} ({temp2['n'].iloc[0]} IDs)" - - title = f"In-cluster{cluster_txt} bootstrapped {metric_type} [95% CI & mean]" - - ax.set_title(title, pad=20, fontsize=11) - - x_min = bootstrap_data["ci_low"].min() - x_max = bootstrap_data["ci_up"].max() - margin = (x_max - x_min) * 0.05 - ax.set_xlim(x_min - margin, x_max + margin) - - ax.grid(True, axis="x", color="lightgray", linestyle="-", alpha=0.3, zorder=1) - ax.set_axisbelow(True) - - logger.debug("Successfully generated bootstrap confidence plot") - if fig: - plt.tight_layout() - return fig - + self, solution_id: str, ax: Optional[plt.Axes] = None + ) -> Optional[plt.Figure]: + """Generate error bar plot showing bootstrapped ROI/CPA confidence intervals.""" + logger.debug("Starting generation of bootstrap confidence plot") + if not hasattr(self, "pareto_result"): + raise ValueError("Pareto result not initialized") + + if solution_id not in self.pareto_result.plot_data_collect: + raise ValueError(f"Invalid solution ID: {solution_id}") + + x_decomp_agg = self.pareto_result.x_decomp_agg[ + self.pareto_result.x_decomp_agg["sol_id"] == solution_id + ] + + if "ci_low" not in x_decomp_agg.columns: + if ax is None: + fig, ax = plt.subplots(figsize=(10, 6)) + ax.text(0.5, 0.5, "No bootstrap results", ha="center", va="center") + return fig + else: + ax.text(0.5, 0.5, "No bootstrap results", ha="center", va="center") + return None + + bootstrap_data 
= x_decomp_agg[ + (~x_decomp_agg["ci_low"].isna()) + & (~x_decomp_agg["ci_up"].isna()) + & (~x_decomp_agg["boot_mean"].isna()) + & (x_decomp_agg["sol_id"] == solution_id) + ][["rn", "sol_id", "boot_mean", "ci_low", "ci_up"]] + + if bootstrap_data.empty: + if ax is None: + fig, ax = plt.subplots(figsize=(16, 10)) + ax.text( + 0.5, + 0.5, + "No valid bootstrap results after filtering", + ha="center", + va="center", + ) + return fig + else: + ax.text( + 0.5, + 0.5, + "No valid bootstrap results after filtering", + ha="center", + va="center", + ) + return None + + # Sort data alphabetically by rn + bootstrap_data = bootstrap_data.sort_values("rn", ascending=True) + + if ax is None: + fig, ax = plt.subplots(figsize=(12, min(8, 3 + len(bootstrap_data) * 0.3))) + else: + fig = None + + ax.set_facecolor("white") + + metric_type = ( + "ROAS" + if ( + self.mmm_data + and hasattr(self.mmm_data.mmmdata_spec, "dep_var_type") + and self.mmm_data.mmmdata_spec.dep_var_type == DependentVarType.REVENUE + ) + else "CPA" + ) + + y_pos = range(len(bootstrap_data)) + + ax.errorbar( + x=bootstrap_data["boot_mean"], + y=y_pos, + xerr=[ + (bootstrap_data["boot_mean"] - bootstrap_data["ci_low"]), + (bootstrap_data["ci_up"] - bootstrap_data["boot_mean"]), + ], + fmt="o", + color="black", + capsize=3, + markersize=3, + elinewidth=1, + zorder=3, + ) + + for i, row in enumerate(bootstrap_data.itertuples()): + ax.text( + row.boot_mean, + i, + f"{float(f'{row.boot_mean:.2g}')}", + va="bottom", + ha="center", + fontsize=10, + color="black", + ) + + ax.text( + row.ci_low, + i, + f"{float(f'{row.ci_low:.2g}')}", + va="center", + ha="right", + fontsize=9, + color="black", + ) + + ax.text( + row.ci_up, + i, + f"{float(f'{row.ci_up:.2g}')}", + va="center", + ha="left", + fontsize=9, + color="black", + ) + + ax.set_yticks(y_pos) + ax.set_yticklabels(bootstrap_data["rn"], fontsize=9) + + ax.spines["right"].set_visible(False) + ax.spines["top"].set_visible(False) + + if metric_type == "ROAS": + 
ax.axvline(x=1, color="gray", linestyle="--", alpha=0.5, zorder=2) + + cluster_txt = "" + if self.clustered_result is not None: + temp2 = self.clustered_result.cluster_data + + if "n" not in temp2.columns: + temp2 = ( + temp2.groupby("cluster") + .apply(lambda x: x.assign(n=len(x))) + .reset_index(drop=True) + ) + temp2 = temp2[temp2["sol_id"] == solution_id] + if not temp2.empty: + cluster_txt = f" {temp2['cluster'].iloc[0]} ({temp2['n'].iloc[0]} IDs)" + + title = f"In-cluster{cluster_txt} bootstrapped {metric_type} [95% CI & mean]" + + ax.set_title(title, pad=20, fontsize=11) + + x_min = bootstrap_data["ci_low"].min() + x_max = bootstrap_data["ci_up"].max() + margin = (x_max - x_min) * 0.05 + ax.set_xlim(x_min - margin, x_max + margin) + + ax.grid(True, axis="x", color="lightgray", linestyle="-", alpha=0.3, zorder=1) + ax.set_axisbelow(True) + + logger.debug("Successfully generated bootstrap confidence plot") + if fig: + plt.tight_layout() + fig = plt.gcf() + plt.close(fig) + return fig + def plot_all( self, display_plots: bool = True, export_location: Union[str, Path] = None ) -> None: diff --git a/python/src/robyn/visualization/pareto_visualizer.py b/python/src/robyn/visualization/pareto_visualizer.py index f3ed800b4..ad07e54d7 100644 --- a/python/src/robyn/visualization/pareto_visualizer.py +++ b/python/src/robyn/visualization/pareto_visualizer.py @@ -1,16 +1,21 @@ from pathlib import Path +import re from typing import Dict, List, Optional, Union from matplotlib import ticker, transforms import matplotlib.pyplot as plt import numpy as np import pandas as pd +from robyn.modeling.entities.modeloutputs import ModelOutputs +import seaborn as sns import logging from robyn.data.entities.enums import ProphetVariableType from robyn.data.entities.holidays_data import HolidaysData +from robyn.modeling.entities.featurized_mmm_data import FeaturizedMMMData from robyn.modeling.entities.pareto_result import ParetoResult -from robyn.data.entities.hyperparameters import 
AdstockType +from robyn.data.entities.hyperparameters import AdstockType, Hyperparameters from robyn.data.entities.mmmdata import MMMData from robyn.visualization.base_visualizer import BaseVisualizer +from robyn.data.entities.enums import DependentVarType import math import matplotlib.dates as mdates @@ -21,15 +26,21 @@ class ParetoVisualizer(BaseVisualizer): def __init__( self, pareto_result: ParetoResult, - adstock: AdstockType, mmm_data: MMMData, holiday_data: Optional[HolidaysData] = None, + hyperparameter: Optional[Hyperparameters] = None, + featurized_mmm_data: Optional[FeaturizedMMMData] = None, + unfiltered_pareto_result: Optional[ParetoResult] = None, + model_outputs: Optional[ModelOutputs] = None, ): super().__init__() self.pareto_result = pareto_result - self.adstock = adstock self.mmm_data = mmm_data self.holiday_data = holiday_data + self.hyperparameter = hyperparameter + self.featurized_mmm_data = featurized_mmm_data + self.unfiltered_pareto_result = unfiltered_pareto_result + self.model_outputs = model_outputs def _baseline_vars( self, baseline_level, prophet_vars: List[ProphetVariableType] = [] @@ -217,6 +228,8 @@ def generate_waterfall( # Adjust layout if fig: plt.subplots_adjust(right=0.85, top=0.85) + fig = plt.gcf() + plt.close(fig) return fig return None @@ -376,6 +389,8 @@ def generate_fitted_vs_actual( if fig: plt.tight_layout() plt.subplots_adjust(top=0.85) + fig = plt.gcf() + plt.close(fig) return fig return None @@ -462,6 +477,8 @@ def generate_diagnostic_plot( if fig: plt.tight_layout() + fig = plt.gcf() + plt.close(fig) return fig return None @@ -573,6 +590,8 @@ def generate_immediate_vs_carryover( if fig: plt.tight_layout() plt.subplots_adjust(top=0.85) + fig = plt.gcf() + plt.close(fig) return fig return None @@ -599,7 +618,8 @@ def generate_adstock_rate( else: fig = None - if self.adstock == AdstockType.GEOMETRIC: + # Handle different adstock types + if self.hyperparameter.adstock == AdstockType.GEOMETRIC: dt_geometric = 
adstock_data["dt_geometric"].copy() # Sort data alphabetically by channel @@ -637,7 +657,10 @@ def generate_adstock_rate( ax.set_xlabel(f"Thetas [by {interval_type}]") ax.set_ylabel(None) - elif self.adstock in [AdstockType.WEIBULL_CDF, AdstockType.WEIBULL_PDF]: + elif self.hyperparameter.adstock in [ + AdstockType.WEIBULL_CDF, + AdstockType.WEIBULL_PDF, + ]: # [Weibull code remains the same] weibull_data = adstock_data["weibullCollect"] wb_type = adstock_data["wb_type"] @@ -680,7 +703,8 @@ def generate_adstock_rate( ax_sub.grid(True, alpha=0.2) ax_sub.set_ylim(0, 1) - if self.adstock == AdstockType.GEOMETRIC: + # Customize grid + if self.hyperparameter.adstock == AdstockType.GEOMETRIC: ax.grid(True, axis="x", alpha=0.2) ax.grid(False, axis="y") ax.set_axisbelow(True) @@ -691,9 +715,240 @@ def generate_adstock_rate( if fig: plt.tight_layout() + fig = plt.gcf() + plt.close(fig) return fig return None + def create_prophet_decomposition_plot(self): + """Create Prophet Decomposition Plot.""" + prophet_vars = ( + [ProphetVariableType(var) for var in self.holiday_data.prophet_vars] + if self.holiday_data and self.holiday_data.prophet_vars + else [] + ) + factor_vars = self.mmm_data.mmmdata_spec.factor_vars if self.mmm_data else [] + if not (prophet_vars or factor_vars): + return None + df = self.featurized_mmm_data.dt_mod.copy() + prophet_vars_str = [variable.value for variable in prophet_vars] + prophet_vars_str.sort(reverse=True) + value_variables = ( + [ + ( + "dep_var" + if hasattr(df, "dep_var") + else self.mmm_data.mmmdata_spec.dep_var + ) + ] + + factor_vars + + prophet_vars_str + ) + df_long = df.melt( + id_vars=["ds"], + value_vars=value_variables, + var_name="variable", + value_name="value", + ) + df_long["ds"] = pd.to_datetime(df_long["ds"]) + plt.figure(figsize=(12, 3 * len(df_long["variable"].unique()))) + prophet_decomp_plot = plt.figure( + figsize=(12, 3 * len(df_long["variable"].unique())) + ) + gs = 
prophet_decomp_plot.add_gridspec(len(df_long["variable"].unique()), 1) + for i, var in enumerate(df_long["variable"].unique()): + ax = prophet_decomp_plot.add_subplot(gs[i, 0]) + var_data = df_long[df_long["variable"] == var] + ax.plot(var_data["ds"], var_data["value"], color="steelblue") + ax.set_title(var) + ax.set_xlabel(None) + ax.set_ylabel(None) + plt.suptitle("Prophet decomposition") + plt.tight_layout() + fig = plt.gcf() + plt.close(fig) + return fig + + def create_hyperparameter_sampling_distribution(self): + """Create Hyperparameter Sampling Distribution Plot.""" + unfiltered_pareto_results = self.unfiltered_pareto_result + if unfiltered_pareto_results is None: + return None + result_hyp_param = unfiltered_pareto_results.result_hyp_param + hp_names = list(self.hyperparameter.hyperparameters.keys()) + hp_names = [name.replace("lambda", "lambda_hp") for name in hp_names] + matching_columns = [ + col + for col in result_hyp_param.columns + if any(re.search(pattern, col, re.IGNORECASE) for pattern in hp_names) + ] + matching_columns.sort() + hyp_df = result_hyp_param[matching_columns] + melted_df = hyp_df.melt(var_name="variable", value_name="value") + melted_df["variable"] = melted_df["variable"].replace("lambda_hp", "lambda") + + def parse_variable(variable): + parts = variable.split("_") + return {"type": parts[-1], "channel": "_".join(parts[:-1])} + + parsed_vars = melted_df["variable"].apply(parse_variable).apply(pd.Series) + melted_df[["type", "channel"]] = parsed_vars + melted_df["type"] = pd.Categorical( + melted_df["type"], categories=melted_df["type"].unique() + ) + melted_df["channel"] = pd.Categorical( + melted_df["channel"], categories=melted_df["channel"].unique()[::-1] + ) + plt.figure(figsize=(12, 7)) + g = sns.FacetGrid(melted_df, col="type", sharex=False, height=6, aspect=1) + + def violin_plot(x, y, **kwargs): + sns.violinplot(x=x, y=y, **kwargs, alpha=0.8, linewidth=0) + + g.map_dataframe( + violin_plot, x="value", y="channel", 
def create_pareto_front_plot(self, is_calibrated):
    """Create the 2D Pareto front plot (NRMSE vs. DECOMP.RSSD).

    Every row of ``self.unfiltered_pareto_result.result_hyp_param`` is drawn
    as a point coloured by its ``iterations`` value, and one coral line is
    drawn per Pareto front (``robynPareto`` equal to 1..pareto_fronts) that
    has more than one point.

    Args:
        is_calibrated: When truthy, ``iterations`` is blanked (NaN) for rows
            whose ``robynPareto`` is missing, rows are sorted by
            ``robynPareto`` with missing values first, a second scatter layer
            sized by ``mape * 100`` with opacity ``1 - mape`` is added, and
            " with Calibration" is appended to the title.

    Returns:
        The matplotlib figure; it is closed with ``plt.close`` before being
        returned.
    """
    # Work on a copy: the calibrated branch rewrites the "iterations" column
    # and must not modify the DataFrame stored on the shared Pareto result.
    result_hyp_param = self.unfiltered_pareto_result.result_hyp_param.copy()
    pareto_fronts = self.pareto_result.pareto_fronts

    if is_calibrated:
        result_hyp_param["iterations"] = np.where(
            result_hyp_param["robynPareto"].isna(),
            np.nan,
            result_hyp_param["iterations"],
        )
        # Rows without a Pareto rank come first so ranked rows are drawn last.
        result_hyp_param = result_hyp_param.sort_values(
            by="robynPareto", na_position="first"
        )

    # Hold explicit figure/axes handles instead of relying on pyplot's
    # "current figure" global state.
    fig, ax = plt.subplots(figsize=(12, 8))
    scatter = ax.scatter(
        result_hyp_param["nrmse"],
        result_hyp_param["decomp.rssd"],
        c=result_hyp_param["iterations"],
        cmap="Blues",
        alpha=0.7,
    )
    fig.colorbar(scatter, ax=ax, label="Iterations")

    if is_calibrated:
        mape = result_hyp_param["mape"]
        # matplotlib only accepts per-point alpha values inside [0, 1];
        # a MAPE above 1 or a missing MAPE would otherwise make scatter fail.
        point_alpha = (1 - mape).clip(lower=0, upper=1).fillna(0)
        ax.scatter(
            result_hyp_param["nrmse"],
            result_hyp_param["decomp.rssd"],
            c=result_hyp_param["iterations"],
            cmap="Blues",
            s=mape * 100,
            alpha=point_alpha,
        )

    # range() is simply empty when pareto_fronts < 1, so no front lines are
    # drawn in that case (the previous max() over an empty list raised).
    for front in range(1, pareto_fronts + 1):
        front_rows = result_hyp_param[result_hyp_param["robynPareto"] == front]
        if len(front_rows) > 1:
            front_rows = front_rows.sort_values("nrmse")
            ax.plot(
                front_rows["nrmse"],
                front_rows["decomp.rssd"],
                color="coral",
                linewidth=2,
            )

    ax.set_title(
        "Multi-objective Evolutionary Performance"
        + (" with Calibration" if is_calibrated else "")
    )
    ax.set_xlabel("NRMSE")
    ax.set_ylabel("DECOMP.RSSD")

    # Pluralise "trial" from the number of trials, not the number of fronts.
    n_trials = len(self.model_outputs.trials)
    fig.suptitle(
        f"2D Pareto fronts with {self.model_outputs.nevergrad_algo or 'Unknown'}, "
        f"for {n_trials} trial{'' if n_trials == 1 else 's'} "
        f"with {self.model_outputs.iterations or 1} iterations each"
    )
    fig.tight_layout()
    plt.close(fig)
    return fig


def create_ridgeline_model_convergence(self):
    """Create the ridgeline model convergence plots.

    Rows of ``self.unfiltered_pareto_result.x_decomp_agg`` whose ``rn`` is one
    of the paid media spend variables are bucketed by iteration number, and
    the ``roi_total`` density of each bucket is drawn per variable. Variables
    are laid out six per figure, one subplot row per variable.

    Returns:
        A dict mapping ``"<metric>_convergence_<page>"`` (page numbers start
        at 1) to the matplotlib figure of that page, where ``<metric>`` is
        "ROAS" when the dependent variable type is revenue and "CPA"
        otherwise. The dict is empty when no paid media rows remain. Each
        figure is closed with ``plt.close`` before being stored.
    """
    vars_per_figure = 6
    all_plots = {}

    x_decomp_agg = self.unfiltered_pareto_result.x_decomp_agg
    paid_media_spends = self.mmm_data.mmmdata_spec.paid_media_spends
    dt_ridges = x_decomp_agg[x_decomp_agg["rn"].isin(paid_media_spends)].copy()
    dt_ridges["iteration"] = (
        dt_ridges["iterNG"] - 1
    ) * self.model_outputs.cores + dt_ridges["iterPar"]
    dt_ridges = dt_ridges[["rn", "roi_total", "iteration", "trial"]].sort_values(
        ["iteration", "rn"]
    )

    iterations = self.model_outputs.iterations or 100
    if iterations <= 100:
        qt_len = 1
    elif iterations > 2000:
        qt_len = 20
    else:
        qt_len = int(np.ceil(iterations / 100))
    set_qt = np.floor(np.linspace(1, iterations, qt_len + 1)).astype(int)
    set_bin = set_qt[1:]

    # pd.cut uses right-closed bins by default, so rows whose iteration equals
    # the lowest edge fall outside every bin and are dropped just below.
    dt_ridges["iter_bin"] = pd.cut(
        dt_ridges["iteration"], bins=set_qt, labels=set_bin
    )
    # .copy() keeps the column assignments below on an independent frame.
    dt_ridges = dt_ridges.dropna(subset=["iter_bin"]).copy()
    dt_ridges["iter_bin"] = pd.Categorical(
        dt_ridges["iter_bin"],
        categories=sorted(set_bin, reverse=True),
        ordered=True,
    )
    dt_ridges["trial"] = dt_ridges["trial"].astype("category")

    metric = (
        "ROAS"
        if self.mmm_data.mmmdata_spec.dep_var_type == DependentVarType.REVENUE
        else "CPA"
    )

    plot_vars = dt_ridges["rn"].unique()
    page_starts = range(0, len(plot_vars), vars_per_figure)
    for page, start_idx in enumerate(page_starts, start=1):
        loop_vars = plot_vars[start_idx : start_idx + vars_per_figure]
        dt_ridges_loop = dt_ridges[dt_ridges["rn"].isin(loop_vars)]
        # squeeze=False always yields a 2D axes array, so a single-variable
        # page needs no special casing.
        fig, axes_grid = plt.subplots(
            nrows=len(loop_vars),
            figsize=(12, 3 * len(loop_vars)),
            sharex=False,
            squeeze=False,
        )
        for ax, var in zip(axes_grid[:, 0], loop_vars):
            var_data = dt_ridges_loop[dt_ridges_loop["rn"] == var]
            # Computed once per variable rather than once per bucket.
            bins_desc = sorted(var_data["iter_bin"].unique(), reverse=True)
            n_bins = len(bins_desc)
            for offset, iter_bin in enumerate(bins_desc):
                bin_data = var_data.loc[
                    var_data["iter_bin"] == iter_bin, "roi_total"
                ]
                sns.kdeplot(
                    bin_data,
                    ax=ax,
                    fill=True,
                    alpha=0.6,
                    color=plt.cm.GnBu(offset / n_bins),
                    label=f"Bin {iter_bin}",
                    warn_singular=False,
                )
            ax.set_title(f"{var} {metric}")
            ax.set_ylabel("")
            # Remove a legend only if one exists; calling legend() merely to
            # delete it warns when no labelled artist was drawn.
            legend = ax.get_legend()
            if legend is not None:
                legend.remove()
            ax.spines["right"].set_visible(False)
            ax.spines["top"].set_visible(False)
        fig.suptitle(f"{metric} Distribution over Iteration Buckets", fontsize=16)
        fig.tight_layout()
        plt.close(fig)
        all_plots[f"{metric}_convergence_{page}"] = fig
    return all_plots