diff --git a/advanced-python/10Basics.ipynb b/advanced-python/10Basics.ipynb index 87d8d778..93be8a95 100644 --- a/advanced-python/10Basics.ipynb +++ b/advanced-python/10Basics.ipynb @@ -481,7 +481,7 @@ }, "outputs": [], "source": [ - "{'a': 'b'}.get?" + "{'a': 'b'}.get?" ] }, { diff --git a/advanced-python/33ModelTuning.ipynb b/advanced-python/33ModelTuning.ipynb new file mode 100644 index 00000000..0ea4d465 --- /dev/null +++ b/advanced-python/33ModelTuning.ipynb @@ -0,0 +1,681 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model tuning setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#%store -r training_data\n", + "#%store -r training_columns\n", + "#%store -r bkg_df\n", + "#%store -r mc_df\n", + "#%store -r data_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#@title\n", + "#!pip install uproot\n", + "#!pip install sklearn\n", + "\n", + "import time\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import uproot\n", + "import xgboost as xgb\n", + "from matplotlib import pyplot as plt\n", + "from sklearn import metrics\n", + "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier\n", + "from sklearn.metrics import auc, roc_curve\n", + "from sklearn.model_selection import (GridSearchCV, KFold, cross_val_score,\n", + " cross_validate, train_test_split)\n", + "from xgboost.sklearn import XGBClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Time and processing check for the lesson\n", + "stt = time.time()\n", + "stc = time.process_time()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_mass(df, label=\"\", norm=True):\n", + " counts, bins, _ = plt.hist(df['Jpsi_M'], label=label, bins=100, range=[2.75, 3.5], histtype='step', density=norm)\n", + " # You can also use LaTeX in the axis label\n", + " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", + " plt.xlim(bins[0], bins[-1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_comparision(var, mc_df, bkg_df):\n", + " _, bins, _ = plt.hist(mc_df[var], bins=100, histtype='step', label='MC', density=1)\n", + " _, bins, _ = plt.hist(bkg_df[var], bins=bins, histtype='step', label='Background', density=1)\n", + " plt.xlabel(var)\n", + " plt.xlim(bins[0], bins[-1])\n", + " plt.legend(loc='best')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_roc(bdt, training_data, training_columns, label=None):\n", + " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", + " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", + " area = auc(fpr, tpr)\n", + "\n", + " plt.plot([0, 1], [0, 1], color='grey', linestyle='--')\n", + " if label:\n", + " plt.plot(fpr, tpr, label=f'{label} (area = {area:.2f})')\n", + " else:\n", + " plt.plot(fpr, tpr, label=f'ROC curve (area = {area:.2f})')\n", + " plt.xlim(0.0, 1.0)\n", + " plt.ylim(0.0, 1.0)\n", + " plt.xlabel('False Positive Rate')\n", + " plt.ylabel('True Positive Rate')\n", + " plt.legend(loc='lower right')\n", + " # We can make the plot look nicer by forcing the grid to be square\n", + " plt.gca().set_aspect('equal', adjustable='box')" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_significance(bdt, training_data, training_columns, label):\n", + " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", + " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", + "\n", + " n_sig = 1200\n", + " n_bkg = 23000\n", + " S = n_sig*tpr + (n_sig*tpr==0)*1\n", + " B = n_bkg*fpr + (n_bkg*tpr==0)*1\n", + " metric = S/np.sqrt(S+B)\n", + "\n", + " plt.plot(thresholds, metric, label=label)\n", + " plt.xlabel('BDT cut value')\n", + " plt.ylabel('$\\\\frac{S}{\\\\sqrt{S+B}}$')\n", + " plt.xlim(0, 1.0)\n", + "\n", + " optimum = np.max(metric)\n", + " optimal_cut = thresholds[np.argmax(metric)]\n", + " print(label, \": S/sqrt(S+B) =\", optimum, \" at x =\", optimal_cut)\n", + " plt.axvline(x=optimal_cut, color='black', linewidth=1.0, linestyle='--')\n", + "\n", + " return optimal_cut" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#max_entries = 1000 # try running with low stats for bug fixing your changes quickly\n", + "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", + " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", + " )['DecayTree'].arrays(library='pd')#,entry_stop=max_entries)\n", + "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root',\n", + " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", + " )['DecayTree'].arrays(library='pd')#,entry_stop=max_entries)\n", + "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')\n", + "\n", + "for df in [mc_df, data_df, bkg_df]:\n", + " df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)\n", + " df.eval('mup_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + " df.eval('mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + "\n", + "bkg_df['catagory'] = 0 # Use 0 for background\n", + "mc_df['catagory'] = 1 # Use 1 for signal\n", + "training_data = pd.concat([bkg_df, mc_df], copy=True, ignore_index=True)\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['IPdiff'] = np.abs(df['mum_PT'] - df['mup_PT'])\n", + "\n", + "training_columns = [\n", + " 'Jpsi_PT',\n", + " 'mup_PT', 'mup_eta', 'mup_ProbNNmu', 'mup_IP',\n", + " 'mum_PT', 'mum_eta', 'mum_ProbNNmu', 'mum_IP',\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Previously we trained an XGBClassifier with the default settings, with learning rate = 0.3 and maximum iterations = 100. This cut off to the training process may be limiting the performance of our model. We can monitor the performance of our model as a function of training iteration and stop the training when the gradient approximates zero. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X1, y1 = training_data[training_columns], training_data['catagory']\n", + "X_train, X_test, y_train, y_test = train_test_split(X1, y1)\n", + "# default train_size = 0.25, this can be varied to suit your data\n", + "\n", + "LR = 0.3 # the coefficient of step size decay, eta, has alias 'learning_rate' with default 0.3\n", + "\n", + "stime = time.time()\n", + "bdt = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", + "bdt.fit(training_data[training_columns], training_data['catagory'])\n", + "print(\"XGBoost --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGB'] = bdt.predict_proba(df[training_columns])[:,1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cross-validation\n", + "\n", + "Splitting the data into randomised subsets for training allows you to monitor your model's performance on the fly using the statistically independant remainder of your sample - this is called cross-validation (CV). We can see below that at the 100th iteration the metrics still show a trend of improvement." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def training_monitor(alg):\n", + "\n", + " # A model trained with eval_set and eval_metric will return evals_result\n", + " results = alg.evals_result()\n", + " epochs = len(results['validation_0']['logloss'])\n", + " x_axis = range(0, epochs)\n", + "\n", + " # Plotting logLoss as a function of training iteration\n", + " fig, ax = plt.subplots()\n", + " ax.plot(x_axis, results['validation_0']['logloss'], label='Train') # for each eval_set\n", + " if results['validation_1']: ax.plot(x_axis, results['validation_1']['logloss'], label='Test')\n", + " ax.legend()\n", + " plt.ylabel('LogLoss')\n", + " plt.title('LogLoss')\n", + " plt.show()\n", + "\n", + " # Plotting classification error as a function of training iteration\n", + " fig, ax = plt.subplots()\n", + " ax.plot(x_axis, results['validation_0']['error'], label='Train') # for each eval_set\n", + " if results['validation_1']: ax.plot(x_axis, results['validation_1']['error'], label='Test')\n", + " ax.legend()\n", + " plt.ylabel('Error')\n", + " plt.title('Error')\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This involves training on less data but allows us to monitor progress to check if the model is becoming over-specific to our training sample. The minimisation of loss and classification error are common metrics for model assessment. As shown below, the cost to performance is negligible. If the test sample gradient were to invert this would be considered overtraining and is why monitoring performance without CV can be a time costly pitfall." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Defining a model with multi-threading set to maximum\n", + "bdt_cv = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", + "\n", + "# Model fitting with CV and printing out processing time\n", + "stime = time.time()\n", + "bdt_cv.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", + " eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)\n", + "print(\"\\nXGBoost cross-validation --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Writing model predictions out for data\n", + "training_monitor(bdt_cv)\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBcv'] = bdt_cv.predict_proba(df[training_columns])[:,1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drawing plot of model respone for signal and background classes\n", + "plt.figure()\n", + "plot_comparision('XGB', mc_df, bkg_df)\n", + "plot_comparision('XGBcv', mc_df, bkg_df)\n", + "\n", + "# Drawing the signal efficiency vs background rejection curve (ROC)\n", + "plt.figure()\n", + "plot_roc(bdt, training_data, training_columns)\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "\n", + "# Drawing signal significance comparison as a function of minimum cut on model response\n", + "plt.figure()\n", + "bdt_cut = plot_significance(bdt, training_data, training_columns, \"bdt\")\n", + "bdt_cv_cut = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### $k$-folding & early stopping\n", + "\n", + "Performing CV on each of a number, k, of ways to split your data gives you k models to choose from. Some choose to average the performance across the models from each fold as any instability might imply the model will not be reliable. The results below seem stable; each fold provides a consistant performance across multiple metrics, so we'll just choose the best one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Defining the folds with a seed to test consistently\n", + "splits = 4 # to match 0.25 value of test_train_split default though this may not be optimal\n", + "kf = KFold(n_splits=splits, shuffle=True, random_state=123)\n", + "\n", + "# Printing processing time of the kfold cross-validation\n", + "stime = time.time()\n", + "for train, test in kf.split(X1):\n", + " X_train, X_test = X1.iloc[train], X1.iloc[test]\n", + " y_train, y_test = y1.iloc[train], y1.iloc[test]\n", + " bdt.fit(X_train,y_train)\n", + "print(\"\\nXGBoost k-folding --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Calculating scores of each fold using variety of CV-metrics\n", + "cv_acc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"accuracy\", n_jobs=-1)\n", + "cv_los = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"neg_log_loss\", n_jobs=-1)\n", + "cv_auc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"roc_auc\", n_jobs=-1)\n", + "\n", + "# Printing results and indicating best fold\n", + "print(\"accuracy: \",cv_acc, \" -> best fold =\", np.argmax(cv_acc) )\n", + "print(\"-logloss: \",cv_los, \" -> best fold =\", np.argmax(cv_los) )\n", + "print(\"roc_auc: \",cv_auc, \" -> best fold =\", np.argmax(cv_auc) )\n", + "bestfold = np.argmax(cv_acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Early stopping defines a maximum number of rounds the cross-validation metric (we'll use 'error'=1-accuracy) is allowed to not improve before training is terminated. As is standard, we will be reverting back to a 'previous best' model based on test sample score, this helps avoid overtraining. Early stopping prevents us training too many of extra models thus saving time. Set the limit too small though and your training might be cut off prematurely." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def modelfit(alg, metric, params, label, predictors, kfold, fbest, early_stop=10):\n", + "\n", + " # Loading data split inputs providing best fold result\n", + " for k, (train, test) in enumerate(kf.split(params)):\n", + " if (k==fbest):\n", + " X_train, X_test = params.iloc[train], params.iloc[test]\n", + " y_train, y_test = label.iloc[train], label.iloc[test]\n", + "\n", + " # Defining data in terms of training variables and class label\n", + " xgb_param = alg.get_xgb_params()\n", + " data = xgb.DMatrix(params, label=label, feature_names=predictors, nthread=-1)\n", + "\n", + " # Runs timed CV on our model using early stopping based on our metric\n", + " stime = time.time()\n", + " cvresult = xgb.cv(xgb_param,\n", + " data,\n", + " num_boost_round=alg.get_params()['n_estimators'],\n", + " #nfold=cv_folds, # to use in build folding\n", + " folds=kfold, # use -> ignores nfold\n", + " metrics=metric,\n", + " early_stopping_rounds=early_stop)\n", + " alg.set_params(n_estimators=cvresult.shape[0])\n", + " print(\"\\nXGBoost early-stop folding --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Fitting the algorithm on the data with CV evaluation early stopping\n", + " stime = time.time()\n", + " alg.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", + " eval_set=[(X_train, y_train), (X_test, y_test)],\n", + " verbose=False, early_stopping_rounds=early_stop)\n", + " training_monitor(alg)\n", + " print(\"XGBoost early-stop limit --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Predicting training set:\n", + " train_predictions = alg.predict(X_train)\n", + " test_predictions = alg.predict(X_test)\n", + "\n", + " # Printing model report:\n", + " print(\"\\nModel Report : best iteration \"+str(cvresult.shape[0]))\n", + " print(\"Train Accuracy : \"+str(metrics.accuracy_score(y_train, train_predictions)))\n", + " print(\"Test Accuracy : \"+str(metrics.accuracy_score(y_test, test_predictions)))\n", + " return cvresult.shape[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This function incorporates the k-folding CV and early stopping, saving not only the optimal model but also the index of its training iteration. This means, in our subsequent steps, we can apply an upper limit on training for models based on the convergence of the default hyperparameters, saving us some time. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Defining model with high maximum estimators for use with early stopping\n", + "bdt_es = XGBClassifier(learning_rate = LR, n_estimators=1000,\n", + " # Default values of other hyperparamters\n", + " #max_depth=6, min_child_weight=1,\n", + " #gamma=0, subsample=0.8,\n", + " #colsample_bytree=0.8, scale_pos_weight=1,\n", + " #objective='binary:logistic', # default for binary classification\n", + " #objective='mutli:softprob', num_class=3, # for multiclassifiers\n", + " seed=123, n_threads=-1)\n", + "\n", + "# Timing the CV using early stopping\n", + "stime = time.time()\n", + "estimators = modelfit(bdt_es, \"error\", X1, y1, training_columns, kf, bestfold)\n", + "print(\"\\nmodelfit(bdt_es) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Saving model predictions\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBes'] = bdt_es.predict_proba(df[training_columns])[:,1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This provides us with an improved model as well as a benchmark to test against in both performance and training efficiency. When training using new combinations of hyperparameters, the maximum number of estimators from our model report will cut off any new models improving more slowly than our default, while, for more efficient models, the early stopping will kick in." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drawing plot to compare model response for signal and background classes\n", + "plt.figure()\n", + "plot_comparision('XGBcv', mc_df, bkg_df)\n", + "plot_comparision('XGBes', mc_df, bkg_df)\n", + "\n", + "# Drawing comaprison of the signal efficiency vs background rejection curve (ROC)\n", + "plt.figure()\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "plot_roc(bdt_es, training_data, training_columns)\n", + "\n", + "# Drawing signal significance comparison as a function of minimum cut on model response\n", + "plt.figure()\n", + "bdt_cut_cv = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")\n", + "bdt_cut_es = plot_significance(bdt_es, training_data, training_columns, \"bdt_es\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hyperameter optimisation\n", + "\n", + "Below we provide a \"grid\" of hyperparameters, defining the structure of the trees and constraints on the learning, but there are many more values to choose from and a larger parameter space to be explored. These optimsations are very problem specific and their impact will have to be weighed against the computing resources and timeframe you have at your disposal. For the sake of expedient demonstration we are comparing the default parameters to only one predetermined variation in 2 parameters. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define a function that performs a gridscan of HPs\n", + "\n", + "\n", + "def hpgridscan(alg, metric, params, label, kfold, fbest, early_stop=10):\n", + "\n", + " # Load data fold with best performance\n", + " for k, (train, test) in enumerate(kf.split(params)):\n", + " if (k==fbest):\n", + " X_train, X_test = params.iloc[train], params.iloc[test]\n", + " y_train, y_test = label.iloc[train], label.iloc[test]\n", + "\n", + " # Define a dictionary of numpy arrays for our HPs\n", + " params = {\n", + " 'max_depth':np.array([7]),\n", + " 'min_child_weight':np.array([3]),\n", + " #'max_depth':np.arange( 5, 9, 1 ),\n", + " #'min_child_weight':np.arange( 1, 5, 1 ),\n", + " ##'gamma':np.arange( 0.0, 1.0, 0.1 ),\n", + " ##'colsample_bytree':np.arange( 0.4, 1.0, 0.1 ),\n", + " ##'subsample':np.arange( 0.4, 1.0, 0.1 ),\n", + " ##'scale_pos_weight':np.arange( 0.4, 1.6, 0.1 )\n", + " }\n", + "\n", + " # Perform timed grid scan with established n_estimator cutoff and early stopping\n", + " stime = time.time()\n", + " gs = GridSearchCV(estimator=alg,\n", + " param_grid=params,\n", + " scoring=metric,\n", + " #iid=False,\n", + " cv=kf,\n", + " n_jobs=-1)\n", + " gs.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", + " eval_set=[(X_train, y_train), (X_test, y_test)],\n", + " verbose=False, early_stopping_rounds=early_stop)\n", + " print(\"XGBoost grid-scan --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Return suggested parameters, performance and best model\n", + " training_monitor(gs.best_estimator_)\n", + " print(\"Suggestion:\", gs.best_params_)\n", + " print(\"Accuracy:\" ,gs.best_score_)\n", + " return gs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Running with estimators maximum for shortened training\n", + "bdt_st = XGBClassifier( learning_rate = LR, n_estimators=estimators,\n", + " seed=123, n_threads=-1)\n", + "\n", + "# Running timed hyperparameter gridscan\n", + "stime = time.time()\n", + "gs = hpgridscan(bdt_st, \"accuracy\", X1, y1, kf, bestfold)\n", + "bdt_gs = gs.best_estimator_\n", + "print(\"\\nhpgridscan(bdt_st) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Get model predictions\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBgs'] = bdt_gs.predict_proba(df[training_columns])[:,1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Even this naive grid scan, using the same fold as before for fair comparison, can provide significant improvements as demonstrated above. These may be pushed further by including more hyperparameters for a trade off with processing time. However, even with parrallisation these tasks can take hours or longer and might only provide improvement of O(>1%)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## We could define a model using optimal hyperparameters from our grid scan\n", + "#bdt_opt = XGBClassifier( learning_rate = LR, n_estimators=1000,\n", + "# max_depth=gs.best_params_['max_depth'],\n", + "# min_child_weight=gs.best_params_['min_child_weight'],\n", + "# seed=123, n_threads=-1 )\n", + "\n", + "## Run with CV early stopping\n", + "#stime = time.time()\n", + "#estimators = modelfit(bdt_opt, 'error', X1, y1, training_columns, kf, bestfold)\n", + "#print(\"\\nmodelfit(bdt_opt) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "## Get model predictions\n", + "#for df in [mc_df, bkg_df, data_df, training_data]:\n", + "# df['XGBopt'] = bdt_opt.predict_proba(df[training_columns])[:,1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Comapring model response from the end of last session to the end of this one\n", + "plt.figure()\n", + "plot_comparision('XGB', mc_df, bkg_df)\n", + "plot_comparision('XGBgs', mc_df, bkg_df)\n", + "\n", + "# Comparing model performance for each level of tuning\n", + "plt.figure()\n", + "plot_roc(bdt, training_data, training_columns)\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "plot_roc(bdt_es, training_data, training_columns)\n", + "plot_roc(bdt_gs, training_data, training_columns)\n", + "#plot_roc(bdt_opt, training_data, training_columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Comparing the impact on projected performance at each stage of the tutorial\n", + "plt.figure()\n", + "bdt_cut = plot_significance(bdt, training_data, training_columns, \"bdt\")\n", + "bdt_cv_cut = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")\n", + "bdt_es_cut = plot_significance(bdt_es, training_data, training_columns, \"bdt_es\")\n", + "bdt_gs_cut = plot_significance(bdt_gs, training_data, training_columns, \"bdt_gs\")\n", + "#bdt_opt_cut = plot_significance(bdt_opt, training_data, training_columns, \"bdt_opt\")\n", + "\n", + "# Comparing best cuts impact on mass for original and tuned model\n", + "plt.figure()\n", + "data_bdt_cut = data_df.query('XGB > %f' %bdt_cut )\n", + "plot_mass(data_bdt_cut, label='XGB default', norm=True)\n", + "data_gs_cut = data_df.query('XGBgs > %f' %bdt_gs_cut )\n", + "plot_mass(data_gs_cut, label='XGB tuned', norm=True)\n", + "plt.legend(loc='best')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Comparing our data sample's mass plot having applied the cut optimised for $\\sigma=\\frac{S}{\\sqrt{S+B}}$ from each BDT output, we can see how the improved model reduces relative background. However, while we define our signal training sample from MC you'll remember we defined our background training sample from the data !(3.0 < JPsi_M < 3.2).\n", + "\n", + "We can see shoulders at the edges of the regions where we define our background training sample in our data's mass spectrum now. 
Our training and validation samples include a subset of our data sample, so there is a risk that our model has learned the difference between MC and data and is exploiting it, or that it is overtrained on the 'previously seen' data (remember that our train and test samples began to diverge in our validation metrics with more iterations).\n", + "\n", + "Below you can see that replotting the normalised mass distribution using only the data not included in training shows no significant improvement. This is not ideal and might be addressed by setting up our training more carefully. For example, we could take our background training sample from same-sign muon combinations across the full mass range (a common data-driven practice in LHC experiments), or use other libraries, such as the uBoost implementation in hep_ml, to penalise the training for introducing a dependence of the efficiency on mass." + ] }, { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sig_df = data_df.query('(3.0 < Jpsi_M < 3.2)')\n", + "sig_bdt_cut = sig_df.query('XGB > %f' %bdt_cut )\n", + "plot_mass(sig_bdt_cut, label='XGB default', norm=True)\n", + "sig_gs_cut = sig_df.query('XGBgs > %f' %bdt_gs_cut )\n", + "plot_mass(sig_gs_cut, label='XGB tuned', norm=True)\n", + "plt.legend(loc='best')" + ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also choose a higher learning rate to perform coarse scans of your space and then decrease it again to retrain your final model. If you can afford to, it might be best to include the learning rate itself as a parameter in your grid. With some libraries you can also specify your choice of kernel. Both of these choices will impact your optimal maximum number of iterations, so setting it sufficiently high and using early stopping might be a good strategy.\n", + "\n", + "For less exhaustive and non-discretised methods, try smart combinations of the following to perform adaptive scans, or build your own:\n", + "* sklearn.model_selection.RandomizedSearchCV\n", + "* sklearn.model_selection.GridSearchCV\n", + "\n", + "Moving to higher-dimensional optimisation problems may require more sophisticated solutions:\n", + "* skopt.BayesSearchCV\n", + "* hyperopt.tpe" + ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Full stats plots saved here: bit.ly/LHCb_XGB_Tuning\n", + "Run with full stats by removing `entry_stop` at `max_entries` in cell 8." + ] }, { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Final lesson time and processing time check\n", + "print(\"Notebook real time --- %s seconds ---\" % (time.time() - stt))\n", + "print(\"Notebook CPU time --- %s seconds ---\" % (time.process_time() - stc))" + ] } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/advanced-python/README.md b/advanced-python/README.md index 986a9bdd..2dde2d2d 100644 --- a/advanced-python/README.md +++ b/advanced-python/README.md @@ -17,6 +17,7 @@ a knowledge base that one can always come back to lock up things. 
30Classification.ipynb 31ClassificationExtension.ipynb 32BoostingToUniformity.ipynb + 33ModelTuning.ipynb 40Histograms.ipynb 45DemoReweighting.ipynb 50LikelihoodInference.ipynb