BlueCast allows easy customization. Users can adjust the configuration objects and pass them to the BlueCast class. Here is an example:
from bluecast.blueprints.cast import BlueCast
from bluecast.config.training_config import TrainingConfig, XgboostTuneParamsConfig

# Create a custom tuning config and adjust hyperparameter search space
xgboost_param_config = XgboostTuneParamsConfig()
xgboost_param_config.steps_max = 100
xgboost_param_config.max_leaves_max = 16

# Create a custom training config and adjust general training parameters
train_config = TrainingConfig()
train_config.hyperparameter_tuning_rounds = 10
train_config.autotune_model = False  # we want to run just normal training, no hyperparameter tuning

# We could even just overwrite the final Xgboost params using the XgboostFinalParamConfig class

# Pass the custom configs to the BlueCast class
automl = BlueCast(
    class_problem="binary",
    conf_training=train_config,
    conf_xgboost=xgboost_param_config,
)

automl.fit(df_train, target_col="target")
y_probs, y_classes = automl.predict(df_val)
The BlueCast class also allows custom preprocessing. This is done via an abstract class that can be subclassed and passed into the BlueCast class. BlueCast provides two entry points to inject custom preprocessing: the custom_preprocessor attribute is called right after the train_test_split, while custom_last_mile_computation is called just before model training or prediction starts (when only numerical features remain) and allows users to execute final computations (e.g. subsampling or last calculations).
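Before the full example below, here is a minimal sketch of the contract such a class has to fulfil. It is a simple pass-through implementation (the class name NoOpPreprocessing is just illustrative); the method signatures mirror the ones used in the example that follows:

from typing import Optional, Tuple

import pandas as pd

from bluecast.preprocessing.custom import CustomPreprocessing


class NoOpPreprocessing(CustomPreprocessing):
    """Illustrative pass-through preprocessor showing the required methods."""

    def fit_transform(
        self, df: pd.DataFrame, target: pd.Series
    ) -> Tuple[pd.DataFrame, pd.Series]:
        # learn any statistics on the training data here and store them on self
        return df, target

    def transform(
        self,
        df: pd.DataFrame,
        target: Optional[pd.Series] = None,
        predicton_mode: bool = False,
    ) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        # apply the learned statistics; target is None during prediction
        return df, target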
from typing import Optional, Tuple

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

from bluecast.blueprints.cast import BlueCast
from bluecast.config.training_config import TrainingConfig, XgboostTuneParamsConfig
from bluecast.preprocessing.custom import CustomPreprocessing

# Create a custom tuning config and adjust hyperparameter search space
xgboost_param_config = XgboostTuneParamsConfig()
xgboost_param_config.steps_max = 100
xgboost_param_config.max_leaves_max = 16

# Create a custom training config and adjust general training parameters
train_config = TrainingConfig()
train_config.hyperparameter_tuning_rounds = 10
train_config.autotune_model = False  # we want to run just normal training, no hyperparameter tuning

# We could even just overwrite the final Xgboost params using the XgboostFinalParamConfig class
class MyCustomPreprocessing(CustomPreprocessing):
    def __init__(self):
        self.trained_patterns = {}

    def fit_transform(
        self, df: pd.DataFrame, target: pd.Series
    ) -> Tuple[pd.DataFrame, pd.Series]:
        num_columns = df.drop(['Beta', 'Gamma', 'Delta', 'Alpha', 'EJ'], axis=1).columns
        cat_df = df[['Beta', 'Gamma', 'Delta', 'Alpha', 'EJ']].copy()

        # Zscores is a user-defined helper class for z-score scaling (not part of BlueCast)
        zscores = Zscores()
        zscores.fit_all(df, ['Beta', 'Gamma', 'Delta', 'Alpha', 'EJ'])
        df = zscores.transform_all(df, ['Beta', 'Gamma', 'Delta', 'Alpha', 'EJ'])
        self.trained_patterns["zscores"] = zscores

        imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
        num_columns = df.drop(['Beta', 'Gamma', 'Delta', 'Alpha', 'EJ'], axis=1).columns
        imp_mean.fit(df.loc[:, num_columns])
        df = imp_mean.transform(df.loc[:, num_columns])
        self.trained_patterns["imputation"] = imp_mean

        df = pd.DataFrame(df, columns=num_columns).merge(cat_df, left_index=True, right_index=True, how="left")
        df = df.drop(['Beta', 'Gamma', 'Delta', 'Alpha'], axis=1)
        return df, target

    def transform(
        self,
        df: pd.DataFrame,
        target: Optional[pd.Series] = None,
        predicton_mode: bool = False,
    ) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        num_columns = df.drop(['Beta', 'Gamma', 'Delta', 'Alpha', 'EJ'], axis=1).columns
        cat_df = df[['Beta', 'Gamma', 'Delta', 'Alpha', 'EJ']].copy()

        df = self.trained_patterns["zscores"].transform_all(df, ['Beta', 'Gamma', 'Delta', 'Alpha', 'EJ'])

        imp_mean = self.trained_patterns["imputation"]
        num_columns = df.drop(['Beta', 'Gamma', 'Delta', 'Alpha', 'EJ'], axis=1).columns
        df.loc[:, num_columns] = df.loc[:, num_columns].replace([np.inf, -np.inf], np.nan)
        df = imp_mean.transform(df.loc[:, num_columns])

        df = pd.DataFrame(df, columns=num_columns).merge(cat_df, left_index=True, right_index=True, how="left")
        df = df.drop(['Beta', 'Gamma', 'Delta', 'Alpha'], axis=1)
        return df, target
# add custom last mile computation
class MyCustomLastMilePreprocessing(CustomPreprocessing):
    def custom_function(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df / 2
        df["custom_col"] = 5
        return df

    # Please note: The base class enforces that the fit_transform method is implemented
    def fit_transform(
        self, df: pd.DataFrame, target: pd.Series
    ) -> Tuple[pd.DataFrame, pd.Series]:
        df = self.custom_function(df)
        df = df.head(1000)
        target = target.head(1000)
        return df, target

    # Please note: The base class enforces that the transform method is implemented
    def transform(
        self,
        df: pd.DataFrame,
        target: Optional[pd.Series] = None,
        predicton_mode: bool = False,
    ) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        df = self.custom_function(df)
        if not predicton_mode and isinstance(target, pd.Series):
            df = df.head(100)
            target = target.head(100)
        return df, target
custom_last_mile_computation = MyCustomLastMilePreprocessing()
custom_preprocessor = MyCustomPreprocessing()

# Pass the custom configs to the BlueCast class
automl = BlueCast(
    class_problem="binary",
    conf_training=train_config,
    conf_xgboost=xgboost_param_config,
    custom_preprocessor=custom_preprocessor,  # this takes place right after the train_test_split
    custom_last_mile_computation=custom_last_mile_computation,  # last step before model training/prediction
)

automl.fit(df_train, target_col="target")
y_probs, y_classes = automl.predict(df_val)
BlueCast offers automated feature selection. By default, feature selection is disabled, but BlueCast raises a warning to inform the user about this option. The behaviour can be controlled via the TrainingConfig.
from bluecast.blueprints.cast import BlueCast
from bluecast.preprocessing.custom import CustomPreprocessing
from bluecast.config.training_config import TrainingConfig

# Create a custom training config and adjust general training parameters
train_config = TrainingConfig()
train_config.hyperparameter_tuning_rounds = 10
train_config.autotune_model = False  # we want to run just normal training, no hyperparameter tuning
train_config.enable_feature_selection = True

# Pass the custom configs to the BlueCast class
automl = BlueCast(
    class_problem="binary",
    conf_training=train_config,
)

automl.fit(df_train, target_col="target")
y_probs, y_classes = automl.predict(df_val)
This step can also be customized. The following example shows how:
from typing import Optional, Tuple

import pandas as pd
import xgboost as xgb
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold

from bluecast.blueprints.cast import BlueCast
from bluecast.config.training_config import TrainingConfig
from bluecast.preprocessing.custom import CustomPreprocessing

# Create a custom training config and adjust general training parameters
train_config = TrainingConfig()
train_config.enable_feature_selection = True

# add custom feature selection
class RFECVSelector(CustomPreprocessing):
    def __init__(self, random_state: int = 0):
        super().__init__()
        self.selected_features = None
        self.random_state = random_state
        self.selection_strategy: RFECV = RFECV(
            estimator=xgb.XGBClassifier(),
            step=1,
            cv=StratifiedKFold(5, random_state=random_state, shuffle=True),
            min_features_to_select=1,
            scoring=make_scorer(matthews_corrcoef),
            n_jobs=2,
        )

    def fit_transform(self, df: pd.DataFrame, target: pd.Series) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        self.selection_strategy.fit(df, target)
        self.selected_features = self.selection_strategy.support_
        df = df.loc[:, self.selected_features]
        return df, target

    def transform(self,
                  df: pd.DataFrame,
                  target: Optional[pd.Series] = None,
                  predicton_mode: bool = False) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        df = df.loc[:, self.selected_features]
        return df, target

custom_feature_selector = RFECVSelector()

# Create an instance of the BlueCast class with the custom feature selector
bluecast = BlueCast(
    class_problem="binary",
    conf_training=train_config,
    custom_feature_selector=custom_feature_selector,
)

# Create some sample data for testing
x_train = pd.DataFrame(
    {"feature1": [i for i in range(10)], "feature2": [i for i in range(10)]}
)
y_train = pd.Series([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
x_test = pd.DataFrame(
    {"feature1": [i for i in range(10)], "feature2": [i for i in range(10)]}
)
x_train["target"] = y_train

# Fit the BlueCast model using the custom feature selector
bluecast.fit(x_train, target_col="target")

# Predict on the test data
predicted_probas, predicted_classes = bluecast.predict(x_test)
For some users it might be convenient to use the BlueCast class for its convenience features (see details below) while using a custom ML model. This is possible by passing a custom model to the BlueCast class. The required interface is defined via the BaseClassMlModel class. Here is an example:
import numpy as np
import optuna
import pandas as pd
from catboost import CatBoostRegressor, Pool

from bluecast.blueprints.cast_regression import BlueCastRegression
from bluecast.config.training_config import TrainingConfig
from bluecast.ml_modelling.base_classes import (
    BaseClassMlModel,
    PredictedClasses,  # just for linting checks
    PredictedProbas,  # just for linting checks
)


class CustomModel(BaseClassMlModel):
    def __init__(self):
        self.model = None

    def autotune(
        self,
        x_train: pd.DataFrame,
        x_test: pd.DataFrame,
        y_train: pd.Series,
        y_test: pd.Series,
    ):
        eval_dataset = Pool(x_test, y_test, cat_features=["DMAR", "LD_INDL", "RF_CESAR", "SEX"])

        alpha = 0.05
        quantile_levels = [alpha, 1 - alpha]
        quantile_str = str(quantile_levels).replace('[', '').replace(']', '')

        def objective(trial):
            # this part is taken from: https://www.kaggle.com/code/syerramilli/catboost-multi-quantile-regression
            param = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 2000, log=True),
                "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 100),
                "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
                'depth': trial.suggest_int('depth', 2, 10, log=True),
                "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 1e6, log=True),
                'colsample_bylevel': trial.suggest_float("colsample_bylevel", 0.1, 1),
                'subsample': trial.suggest_float("subsample", 0.3, 1),
            }

            model = CatBoostRegressor(
                loss_function=f'MultiQuantile:alpha={quantile_str}',
                thread_count=4,
                cat_features=["DMAR", "LD_INDL", "RF_CESAR", "SEX"],
                bootstrap_type="Bernoulli",
                sampling_frequency='PerTree',
                **param
            )

            # train model
            model.fit(x_train, y_train, verbose=0)

            # get predictions
            preds = model.predict(x_test)

            # get performance metrics
            # MWIS_metric is a user-defined helper (mean Winkler interval score); not part of BlueCast
            MWIS, coverage = MWIS_metric.score(
                y_test, preds[:, 0], preds[:, 1], alpha=0.1
            )
            if coverage < 0.9:
                raise optuna.exceptions.TrialPruned()

            return MWIS

        sampler = optuna.samplers.TPESampler(
            multivariate=True, seed=1000
        )
        study = optuna.create_study(
            direction="minimize", sampler=sampler, study_name="catboost"
        )
        study.optimize(
            objective,
            n_trials=500,
            timeout=60 * 60 * 2,
            gc_after_trial=True,
            show_progress_bar=True,
        )
        best_parameters = study.best_trial.params
        self.model = CatBoostRegressor(
            loss_function=f'MultiQuantile:alpha={quantile_str}',
            thread_count=4,
            cat_features=["DMAR", "LD_INDL", "RF_CESAR", "SEX"],
            bootstrap_type="Bernoulli",
            sampling_frequency='PerTree',
            **best_parameters
        ).fit(
            x_train,
            y_train,
            eval_set=eval_dataset,
            use_best_model=True,
            early_stopping_rounds=20,
            plot=True,
        )

    def fit(
        self,
        x_train: pd.DataFrame,
        x_test: pd.DataFrame,
        y_train: pd.Series,
        y_test: pd.Series,
    ) -> None:
        self.autotune(x_train, x_test, y_train, y_test)

    def predict(self, df: pd.DataFrame) -> np.ndarray:
        # predict with the tuned CatBoost model
        preds = self.model.predict(df)
        return preds


train_config = TrainingConfig()
train_config.global_random_state = 200  # e.g. vary this value when training several models in a loop
train_config.calculate_shap_values = False
train_config.train_size = 0.8
train_config.enable_feature_selection = False
train_config.cat_encoding_via_ml_algorithm = True  # use CatBoost's own categorical encoding

catboost_model = CustomModel()

automl = BlueCastRegression(  # BlueCastCVRegression is not possible here, because the quantile regression predictions have an incompatible shape
    class_problem="regression",
    conf_training=train_config,
    ml_model=catboost_model,
)

# train and target are the user's training DataFrame and target column name
automl.fit(train.copy(), target_col=target)  # fit_eval is not possible here, because the predictions have an incompatible shape
ml_models.append(automl)  # ml_models is a user-defined list collecting trained pipelines
Please note that custom ML models require user-defined hyperparameter tuning; pre-defined configurations are not available for custom models. Also note that the calculation of SHAP values only works with tree-based models by default. For other model architectures, disable SHAP values in the TrainingConfig via:
train_config.calculate_shap_values = False
Instantiate a new TrainingConfig, update the parameter as above and pass the config as an argument to the BlueCast instance during instantiation. Feature importance can be added in the custom model definition.
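As a minimal sketch of that wiring (MyCustomModel is a hypothetical class implementing BaseClassMlModel as shown above, and df_train is the user's training DataFrame):

from bluecast.blueprints.cast import BlueCast
from bluecast.config.training_config import TrainingConfig

train_config = TrainingConfig()
train_config.calculate_shap_values = False  # SHAP is only supported for tree-based models by default

automl = BlueCast(
    class_problem="binary",
    conf_training=train_config,
    ml_model=MyCustomModel(),  # hypothetical custom model implementing BaseClassMlModel
)
automl.fit(df_train, target_col="target")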