diff --git a/examples/plot_complex_data.py b/examples/plot_complex_data.py
new file mode 100644
index 0000000..d0d4857
--- /dev/null
+++ b/examples/plot_complex_data.py
@@ -0,0 +1,185 @@
+"""
+==========================================
+Modeling competing risks on synthetic data
+==========================================
+
+This example introduces a synthetic data generation tool that can
+be helpful to study the relative performance and potential biases
+of predictive competing risks estimators on right-censored data.
+Some of the input features and their second-order multiplicative
+interactions are statistically associated with the parameters of
+the distributions from which the competing events are sampled.
+
+"""
+
+# %%
+import numpy as np
+import hazardous.data._competing_weibull as competing_w
+from time import perf_counter
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.utils import check_random_state
+import pandas as pd
+import seaborn as sns
+
+from hazardous import GradientBoostingIncidence
+from lifelines import AalenJohansenFitter
+
+seed = 0
+rng = check_random_state(seed)
+
+
+# %%
+# In the following cell, we check that the conditional shape and scale
+# parameters generated for each event stay within the configured ranges.
+
+df_features = pd.DataFrame(rng.randn(100_000, 10))
+df_features.columns = [f"feature_{i}" for i in range(10)]
+
+df_shape_scale_star = competing_w.compute_shape_and_scale(
+    df_features,
+    features_rate=0.2,
+    n_events=3,
+    degree_interaction=2,
+    random_state=0,
+)
+fig, axes = plt.subplots(2, 3, figsize=(10, 7))
+for idx, col in enumerate(df_shape_scale_star.columns):
+    sns.histplot(df_shape_scale_star[col], ax=axes[idx // 3][idx % 3])
+
+# %%
+
+X, y_censored, y_uncensored = competing_w.make_synthetic_competing_weibull(
+    n_samples=3_000,
+    base_scale=1_000,
+    n_features=10,
+    features_rate=0.5,
+    degree_interaction=2,
+    independent_censoring=False,
+    features_censoring_rate=0.2,
+    return_uncensored_data=True,
+    return_X_y=True,
+    feature_rounding=3,
+    target_rounding=4,
+    censoring_relative_scale=4.0,
+    random_state=0,
+    complex_features=True,
+)
+
+# %%
+n_samples = len(X)
+calculate_variance = n_samples <= 5_000
+aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0)
+
+y_censored["event"].value_counts()
+
+gb_incidence = GradientBoostingIncidence(
+    learning_rate=0.1,
+    n_iter=20,
+    max_leaf_nodes=15,
+    hard_zero_fraction=0.1,
+    min_samples_leaf=5,
+    loss="ibs",
+    show_progressbar=False,
+    random_state=seed,
+)
+
+
+# %%
+#
+# CIFs estimated on uncensored data
+# ---------------------------------
+#
+# Let's now estimate the CIFs on uncensored data and compare them with the
+# Aalen-Johansen estimates:
+
+
+def plot_cumulative_incidence_functions(
+    X, y, gb_incidence=None, aj=None, X_test=None, y_test=None
+):
+    n_events = y["event"].max()
+    t_max = y["duration"].max()
+    _, axes = plt.subplots(figsize=(12, 4), ncols=n_events, sharey=True)
+    # Compute the estimate of the CIFs on a coarse grid.
+    coarse_timegrid = np.linspace(0, t_max, num=100)
+    censoring_fraction = (y["event"] == 0).mean()
+    plt.suptitle(
+        "Cause-specific cumulative incidence functions"
+        f" ({censoring_fraction:.1%} censoring)"
+    )
+
+    for event_id, ax in enumerate(axes, 1):
+        if gb_incidence is not None:
+            tic = perf_counter()
+            gb_incidence.set_params(event_of_interest=event_id)
+            gb_incidence.fit(X, y)
+            duration = perf_counter() - tic
+            print(f"GB Incidence for event {event_id} fit in {duration:.3f} s")
+            tic = perf_counter()
+            cifs_pred = gb_incidence.predict_cumulative_incidence(X, coarse_timegrid)
+            cif_mean = cifs_pred.mean(axis=0)
+            duration = perf_counter() - tic
+            print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s")
+            print("Brier score on training data:", -gb_incidence.score(X, y))
+            if X_test is not None:
+                print(
+                    "Brier score on testing data:", -gb_incidence.score(X_test, y_test)
+                )
+            ax.plot(
+                coarse_timegrid,
+                cif_mean,
+                label="GradientBoostingIncidence",
+            )
+            ax.set(title=f"Event {event_id}")
+
+        if aj is not None:
+            tic = perf_counter()
+            aj.fit(y["duration"], y["event"], event_of_interest=event_id)
+            duration = perf_counter() - tic
+            print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s")
+            aj.plot(label="Aalen-Johansen", ax=ax)
+            ax.set_xlim(0, 8_000)
+            ax.set_ylim(0, 0.5)
+        if event_id == 1:
+            ax.legend(loc="lower right")
+        else:
+            ax.legend().remove()
+
+
+# %%
+
+X_train, X_test, y_train_c, y_test_c = train_test_split(
+    X, y_censored, test_size=0.3, random_state=seed
+)
+y_train_u = y_uncensored.loc[y_train_c.index]
+y_test_u = y_uncensored.loc[y_test_c.index]
+
+gb_incidence = GradientBoostingIncidence(
+    learning_rate=0.1,
+    n_iter=20,
+    max_leaf_nodes=15,
+    hard_zero_fraction=0.1,
+    min_samples_leaf=5,
+    loss="ibs",
+    show_progressbar=False,
+    random_state=seed,
+)
+
+plot_cumulative_incidence_functions(
+    X_train,
+    y_train_u,
+    gb_incidence=gb_incidence,
+    aj=aj,
+    X_test=X_test,
+    y_test=y_test_u,
+)
+
+plot_cumulative_incidence_functions(
+    X_train,
+    y_train_c,
+    gb_incidence=gb_incidence,
+    aj=aj,
+    X_test=X_test,
+    y_test=y_test_c,
+)
+
+# %%
diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py
index 2b15f9c..b20401c 100644
--- a/hazardous/data/_competing_weibull.py
+++ b/hazardous/data/_competing_weibull.py
@@ -2,46 +2,160 @@
+from itertools import cycle
+
 import numpy as np
 import pandas as pd
+from scipy.special import expit
 from scipy.stats import weibull_min
 from sklearn.datasets._base import Bunch
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import PolynomialFeatures, SplineTransformer, StandardScaler
 from sklearn.utils import check_random_state

 DEFAULT_SHAPE_RANGES = (
-    (0.4, 0.9),
+    (1.0, 3.5),
     (1.0, 1.0),
-    (1.2, 3),
+    (3.0, 6.0),
 )

 DEFAULT_SCALE_RANGES = (
-    (1, 20),
-    (1, 10),
+    (1, 7),
+    (1, 15),
     (1.5, 5),
 )


-def _censor(y, relative_scale, random_state=None):
-    if relative_scale == 0 or relative_scale is None:
+def _censor(
+    y,
+    independent_censoring=True,
+    X=None,
+    features_censoring_rate=0.2,
+    censoring_relative_scale=1.5,
+    random_state=0,
+):
+    rng = check_random_state(random_state)
+    if censoring_relative_scale == 0 or censoring_relative_scale is None:
         return y
+    if independent_censoring:
+        scale_censoring = censoring_relative_scale * y["duration"].mean()
+        shape_censoring = 3
+    else:
+        w_censoring_star = rng.randn(X.shape[1], 2)
+        w_censoring_star = np.where(
+            rng.rand(w_censoring_star.shape[0], w_censoring_star.shape[1])
+            < features_censoring_rate,
+            w_censoring_star,
+            0,
+        )
+        df_censoring_params = censoring_relative_scale * X @ w_censoring_star
+        df_censoring_params.columns = ["shape_0", "scale_0"]
+
+        df_censoring_params["shape_0"] = _rescale_params(
+            df_censoring_params[["shape_0"]],
+            param_min=2.0,
+            param_max=3.0,
+        )
+        df_censoring_params["scale_0"] = _rescale_params(
+            df_censoring_params[["scale_0"]],
+            param_min=1,
+            param_max=censoring_relative_scale,
+        )
+
+        scale_censoring = df_censoring_params["scale_0"].values * y["duration"].mean()
+        shape_censoring = df_censoring_params["shape_0"].values
+    censoring = weibull_min.rvs(
+        shape_censoring, scale=scale_censoring, size=y.shape[0], random_state=rng
+    )
+    y_censored = y.copy()
+    y_censored["event"] = np.where(
+        y_censored["duration"] < censoring, y_censored["event"], 0
+    )
+    y_censored["duration"] = np.minimum(y_censored["duration"], censoring)
+    return y_censored
+
+
+def compute_shape_and_scale(
+    X,
+    features_rate=0.2,
+    n_events=3,
+    degree_interaction=2,
+    shape_ranges=DEFAULT_SHAPE_RANGES,
+    scale_ranges=DEFAULT_SCALE_RANGES,
+    random_state=0,
+):
     rng = check_random_state(random_state)
-    scale = relative_scale * y["duration"].mean()
-    censoring = weibull_min.rvs(1, scale=scale, size=y.shape[0], random_state=rng)
-    y = y.copy()
-    y["event"] = np.where(y["duration"] < censoring, y["event"], 0)
-    y["duration"] = np.minimum(y["duration"], censoring)
-    return y
+    # Expand the features with splines and add pairwise interactions.
+    preprocessor = make_pipeline(
+        SplineTransformer(n_knots=3),
+        PolynomialFeatures(
+            degree=degree_interaction, interaction_only=True, include_bias=False
+        ),
+    )
+    preprocessor.set_output(transform="pandas")
+    X_trans = preprocessor.fit_transform(X)
+    # Create a sparse random coefficient matrix for the expanded features.
+    n_weibull_parameters = 2 * n_events
+    w_star = rng.randn(X_trans.shape[1], n_weibull_parameters)
+    # Coefficients of a random (1 - features_rate) fraction of the marginal
+    # and interaction features are set to 0.
+    cols_features = X_trans.columns
+    marginal_mask = np.array([len(col.split(" ")) == 1 for col in cols_features])
+    marginal_cols = np.repeat(
+        marginal_mask.reshape(-1, 1), repeats=n_weibull_parameters, axis=1
+    )  # (X_trans.shape[1], n_weibull_parameters)
+    drop_out_mask = rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate
+    w_star_marginal = np.where(
+        drop_out_mask & marginal_cols,
+        w_star,
+        0,
+    )
+    w_star = np.where(
+        drop_out_mask & ~marginal_cols,
+        w_star,
+        0,
+    )
+    w_star += w_star_marginal

-
-def make_synthetic_competing_weibull(
+    # Compute the true values of shape and scale.
+    shape_scale_star = X_trans.values @ w_star
+    shape_scale_columns = [f"shape_{i}" for i in range(1, n_events + 1)] + [
+        f"scale_{i}" for i in range(1, n_events + 1)
+    ]
+    df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns)
+    # Rescale these values to stay in the chosen ranges.
+
+    return rescale_params(df_shape_scale_star, n_events, shape_ranges, scale_ranges)
+
+
+def rescale_params(df_shape_scale_star, n_events, shape_ranges, scale_ranges):
+    for event_id, shape_range, scale_range in zip(
+        range(1, n_events + 1), cycle(shape_ranges), cycle(scale_ranges)
+    ):
+        shape_min, shape_max = shape_range
+        scale_min, scale_max = scale_range
+        df_shape_scale_star[f"shape_{event_id}"] = _rescale_params(
+            df_shape_scale_star[[f"shape_{event_id}"]], shape_min, shape_max
+        )
+        df_shape_scale_star[f"scale_{event_id}"] = _rescale_params(
+            df_shape_scale_star[[f"scale_{event_id}"]], scale_min, scale_max
+        )
+    return df_shape_scale_star
+
+
+def _rescale_params(column_param, param_min, param_max):
+    # Map the standardized values into the (param_min, param_max) range
+    # through a sigmoid.
+    scaler = StandardScaler()
+    column_param = (
+        param_min + (param_max - param_min) * expit(scaler.fit_transform(column_param))
+    ).flatten()
+    return column_param
+
+
+def make_simple_features(
     n_events=3,
-    n_samples=3000,
-    return_X_y=False,
+    n_samples=3_000,
     base_scale=1_000,
-    feature_rounding=2,
-    target_rounding=1,
     shape_ranges=DEFAULT_SHAPE_RANGES,
     scale_ranges=DEFAULT_SCALE_RANGES,
-    censoring_relative_scale=1.5,
     random_state=None,
 ):
     """Generate a synthetic dataset with competing Weibull-distributed events.
@@ -58,14 +172,6 @@ def make_synthetic_competing_weibull(
     individual, the event type with the shortest duration is kept as the
     target event (competing risks setting) and its event identifier and
     duration are returned as the target dataframe.
-
-    A fraction of the individuals are censored by sampling a censoring time
-    from a Weibull distribution with shape 1 and scale equal to the mean
-    duration of the target event times the ``censoring_relative_scale``.
-
-    Setting ``censoring_relative_scale`` to 0 or None disables censoring.
-    Setting it to a small value (e.g. 0.5 instead of 1.5) will result in a
-    larger fraction of censored individuals.
     """
     rng = check_random_state(random_state)
     all_features = []
@@ -84,21 +190,129 @@ def make_synthetic_competing_weibull(
     event_durations = np.asarray(event_durations)
     duration_argmin = np.argmin(event_durations, axis=0)
     X = pd.concat(all_features, axis=1)
+    return X, event_durations, duration_argmin
+
+
+def make_complex_features_with_sparse_matrix(
+    n_events=3,
+    n_samples=3_000,
+    base_scale=1_000,
+    shape_ranges=DEFAULT_SHAPE_RANGES,
+    scale_ranges=DEFAULT_SCALE_RANGES,
+    n_features=10,
+    features_rate=0.3,
+    degree_interaction=2,
+    random_state=0,
+):
+    rng = np.random.RandomState(random_state)
+    # Create the raw features given to the model as X; the interactions are
+    # added inside compute_shape_and_scale.
+    columns = [f"feature_{i}" for i in range(n_features)]
+    df_features = pd.DataFrame(
+        rng.randn(n_samples, n_features),
+        columns=columns,
+    )
+
+    df_shape_scale_star = compute_shape_and_scale(
+        df_features,
+        features_rate,
+        n_events,
+        degree_interaction,
+        shape_ranges,
+        scale_ranges,
+        random_state,
+    )
+    # Draw durations from a Weibull distribution parameterized by the shape
+    # and scale computed above.
+    event_durations = []
+    for event in range(1, n_events + 1):
+        shape = df_shape_scale_star[f"shape_{event}"]
+        scale = df_shape_scale_star[f"scale_{event}"]
+        durations = weibull_min.rvs(shape, scale=scale * base_scale, random_state=rng)
+        event_durations.append(durations)
+
+    # Build the target table.
+    event_durations = np.asarray(event_durations)
+    duration_argmin = np.argmin(event_durations, axis=0)
+    return df_features, event_durations, duration_argmin
+
+
+def make_synthetic_competing_weibull(
+    n_events=3,
+    n_samples=3_000,
+    base_scale=1_000,
+    n_features=10,
+    features_rate=0.3,
+    degree_interaction=2,
+    independent_censoring=True,
+    features_censoring_rate=0.2,
+    return_uncensored_data=False,
+    return_X_y=True,
+    feature_rounding=2,
+    target_rounding=4,
+    shape_ranges=DEFAULT_SHAPE_RANGES,
+    scale_ranges=DEFAULT_SCALE_RANGES,
+    censoring_relative_scale=1.5,
+    random_state=0,
+    complex_features=False,
+):
+    """Generate a synthetic competing risks dataset with right-censoring.
+
+    Depending on ``complex_features``, the function either generates simple
+    features (``complex_features=False``) or features whose spline expansions
+    and second-order interactions drive the Weibull parameters
+    (``complex_features=True``).
+
+    A fraction of the individuals are censored by sampling a censoring time
+    from a Weibull distribution whose scale is the mean duration of the
+    target event times ``censoring_relative_scale``. When
+    ``independent_censoring`` is False, the censoring shape and scale also
+    depend on the features.
+
+    Setting ``censoring_relative_scale`` to 0 or None disables censoring.
+    Setting it to a small value (e.g. 0.5 instead of 1.5) will result in a
+    larger fraction of censored individuals.
+    """
+    if complex_features:
+        X, event_durations, duration_argmin = make_complex_features_with_sparse_matrix(
+            n_events=n_events,
+            n_samples=n_samples,
+            base_scale=base_scale,
+            shape_ranges=shape_ranges,
+            scale_ranges=scale_ranges,
+            n_features=n_features,
+            features_rate=features_rate,
+            degree_interaction=degree_interaction,
+            random_state=random_state,
+        )
+    else:
+        X, event_durations, duration_argmin = make_simple_features(
+            n_events=n_events,
+            n_samples=n_samples,
+            base_scale=base_scale,
+            shape_ranges=shape_ranges,
+            scale_ranges=scale_ranges,
+            random_state=random_state,
+        )
     y = pd.DataFrame(
         dict(
             event=duration_argmin + 1,
             duration=event_durations[duration_argmin, np.arange(n_samples)],
         )
     )
-    y = _censor(y, censoring_relative_scale, random_state=random_state)
+    y_censored = _censor(
+        y,
+        censoring_relative_scale=censoring_relative_scale,
+        independent_censoring=independent_censoring,
+        X=X,
+        features_censoring_rate=features_censoring_rate,
+        random_state=random_state,
+    )
     if feature_rounding is not None:
         X = X.round(feature_rounding)

     if target_rounding is not None:
+        y_censored["duration"] = y_censored["duration"].round(target_rounding)
         y = y.round(target_rounding)

     if return_X_y:
-        return X, y
+        if return_uncensored_data:
+            return X, y_censored, y
+        return X, y_censored

-    frame = pd.concat([X, y], axis=1)
-    return Bunch(data=frame[X.columns], target=X[y.columns], frame=frame)
+    frame = pd.concat([X, y_censored], axis=1)
+    return Bunch(data=frame[X.columns], target=frame[y_censored.columns], frame=frame)
diff --git a/hazardous/data/tests/test_competing_weibull.py b/hazardous/data/tests/test_competing_weibull.py
index 594dbe3..eb2e638 100644
--- a/hazardous/data/tests/test_competing_weibull.py
+++ b/hazardous/data/tests/test_competing_weibull.py
@@ -1,6 +1,6 @@
 import pytest
-from sklearn.dummy import DummyClassifier, DummyRegressor
-from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+from sklearn.dummy import DummyClassifier
+from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import cross_val_score

 from hazardous.data import make_synthetic_competing_weibull
@@ -52,38 +52,9 @@ def test_competing_weibull_no_censoring(seed):
         y["event"],
         cv=3,
     ).mean()
-    assert 0.4 > baseline_classification_acc > 0.3  # approximately balanced
-    assert event_classification_acc > 1.2 * baseline_classification_acc
-    assert event_classification_acc < 0.6  # still challenging.
-
-    # Check that the features make it possible to predict the durations better
-    # than a marginal baseline, but that a significant amount of unpredictability
-    # remains.
-    median_duration = y["duration"].median()
-    duration_regression_relative_error = (
-        -cross_val_score(
-            RandomForestRegressor(criterion="poisson", random_state=seed),
-            X,
-            y["duration"],
-            cv=3,
-            n_jobs=4,
-            scoring="neg_mean_absolute_error",
-        ).mean()
-        / median_duration
-    )
-    baseline_duration_relative_error = (
-        -cross_val_score(
-            DummyRegressor(strategy="mean"),
-            X,
-            y["duration"],
-            cv=3,
-            scoring="neg_mean_absolute_error",
-        ).mean()
-        / median_duration
-    )
-    assert 0.9 < baseline_duration_relative_error < 1.1  # approximately balanced
-    assert duration_regression_relative_error < 0.98 * baseline_duration_relative_error
-    assert duration_regression_relative_error > 0.8  # still challenging.
+    assert 0.45 > baseline_classification_acc > 0.3  # approximately balanced
+    assert event_classification_acc > 1.1 * baseline_classification_acc
+    assert event_classification_acc < 0.63  # still challenging.


 @pytest.mark.parametrize("seed", range(3))
@@ -109,7 +80,7 @@ def test_competing_weibull_with_censoring(seed):
     # Censoring rate is lower for higher scale:
     high_scale_censoring_rate = (y_high_scale["event"] == 0).mean()
     low_scale_censoring_rate = (y_low_scale["event"] == 0).mean()
-    assert 0.35 < high_scale_censoring_rate < 0.5
+    assert 0.3 < high_scale_censoring_rate < 0.5
     assert 0.5 < low_scale_censoring_rate < 0.65

     # Uncensored events and durations match:
diff --git a/hazardous/tests/test_gradient_boosting_incidence.py b/hazardous/tests/test_gradient_boosting_incidence.py
index 0c2fd73..4e14ab3 100644
--- a/hazardous/tests/test_gradient_boosting_incidence.py
+++ b/hazardous/tests/test_gradient_boosting_incidence.py
@@ -139,4 +139,4 @@ def test_gradient_boosting_incidence_parameter_tuning(seed):
     worst_ibs = -cv_results.iloc[0]["mean_test_score"]
     best_ibs = -cv_results.iloc[-1]["mean_test_score"]
     assert best_ibs == pytest.approx(-grid_search.best_score_)
-    assert worst_ibs > 1.4 * best_ibs
+    assert worst_ibs > 1.25 * best_ibs
diff --git a/pyproject.toml b/pyproject.toml
index 2c24fb7..862b146 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,6 +47,7 @@ doc = [
     "matplotlib",
     "pillow",  # to scrape images from the examples
     "numpydoc",
+    "seaborn",
 ]

 [project.urls]
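
Reviewer note: as a quick smoke test of the new censoring options, here is a minimal sketch, not part of the diff, that compares the censoring rate with and without feature-dependent censoring. It assumes the keyword arguments introduced above (independent_censoring, features_censoring_rate, return_uncensored_data, complex_features) are merged as shown.

# Minimal sketch (assumes the API introduced in this diff).
from hazardous.data import make_synthetic_competing_weibull

for independent in (True, False):
    X, y_censored, y_uncensored = make_synthetic_competing_weibull(
        n_samples=3_000,
        independent_censoring=independent,
        features_censoring_rate=0.2,
        censoring_relative_scale=1.5,
        return_uncensored_data=True,
        return_X_y=True,
        complex_features=True,
        random_state=0,
    )
    # Censored individuals are relabeled with event 0, so the mean of that
    # indicator is the empirical censoring rate.
    censoring_rate = (y_censored["event"] == 0).mean()
    print(f"independent_censoring={independent}: {censoring_rate:.1%} censored")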