-
Notifications
You must be signed in to change notification settings - Fork 0
/
wrapper.xgb.cv.logistic.r
93 lines (77 loc) · 3.55 KB
/
wrapper.xgb.cv.logistic.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
library(caret)
library(ggplot2)
library(pdp)
library(xgboost)
library(boot)
library(yardstick)
source(paste0(ScriptDir,"xgb.cv.importance.plot.R"))
source(paste0(ScriptDir,"xgb.cv.partial.r"))
source(paste0(ScriptDir,"xgb.cv.fit.boxplot.r"))
source(paste0(ScriptDir,"xgb.cv.interaction.r"))
source(paste0(ScriptDir,"xgb.cv.makefolds.R"))
xgb.cv.logistic = function(Data,Predictors,Response,Objective = "binary:logistic",Metric = "logloss",path,Nfolds = 10,Nrounds = 10000,LearningRate=0.001
,Nthread = 2,MaxDepth=3,save = TRUE,Folds = NULL, Monotone = NULL,DoInteraction = TRUE)
{
CVtrain_x = as.matrix(Data[, colnames(Data) %in% Predictors])
CVtrain_y = Data[,colnames(Data) == Response]
if(is.null(Monotone)==TRUE)
Monotone = rep(0,times = ncol(CVtrain_x))
###Convert fold vector (if supplied) to list of obsrvations in each fold
###Assumes length of fold vector = nrow(Data)
K = Nfolds
FoldList = NULL
if(is.null(Folds)==FALSE)
{
K = min(Nfolds,length(unique(Folds)))
FoldList <- xgb.cv.makefolds(as.factor(Folds), K)
}
Nfolds = K
cv <- xgb.cv(tree_method = "exact",data = CVtrain_x, stratified = TRUE,label = CVtrain_y,nrounds = Nrounds, nthread = Nthread, nfold = Nfolds,folds = FoldList,monotone_constraints =Monotone,
max_depth = MaxDepth, eta = min(50,Nrounds), objective = Objective,metric = Metric,prediction = TRUE,print_every_n = 50,learning_rate = LearningRate,
save_models = TRUE,early_stopping_rounds = 50,callbacks = list(cb.cv.predict(save_models = TRUE)))
Nfolds = length(cv$models)
if(save==TRUE)
saveRDS(cv,paste0(path,"xgb.cv.logistic.rds"))
PredClass = ifelse(cv$pred >0.5,1,0)
###Test accuracy of predictions
Confusion = confusionMatrix(as.factor(PredClass),as.factor(CVtrain_y))
###Calculate ROC
Pred = cv$pred[order(CVtrain_y)]
Truth = CVtrain_y[order(CVtrain_y)]
ROC = roc_auc_vec(
estimate = Pred,
truth = as.factor(Truth),event_level="second")
###Print box plots of predicted probabilities against observed occurrences for each class
xgbm.cv.fit.boxplot.logistic(cv$pred,Data[, colnames(Data) == Response],ROC,path)
####Use custom function to generate predictor importance bar plots
Filename = paste0(path,"PredictorImportance.png")
Names = colnames(CVtrain_x)
Filename = paste0(path,"PredictorImportance.png")
Importance <- xgb.cv.importance.plot(cv, #ouput from xgb.cv. Be sure to use callback to save cv models
Nfolds, #number of fold models used in cross-validaton
Predictors= Names[Names%in% Predictors],#names of predictor variables
#this ensures names in right order
#for importance function
Filename)#location to print bar plot
####Use custom function to generate partial dependency plots
PartialDir = paste0(path,"PartialDependencePlots/")
dir.create(PartialDir,showWarnings = FALSE)
for(var in 1:length(Predictors))
xgbm.cv.partial(cv,Nfolds = Nfolds,na.omit(CVtrain_x),var,path = PartialDir,CVtrain_y=CVtrain_y,ResponseName = Response)
###Do interaction last as hstats changes model predictions somehow in partial plots
if(DoInteraction == TRUE)
Interaction = xgb.cv.interaction(cv,na.omit(CVtrain_x),Predictors,Nfolds)
OutList = list()
Key = "Model"
OutList[[Key]] = cv
Key = "ROC"
OutList[[Key]] = ROC
Key = "ConfusionMatrix"
OutList[[Key]] = Confusion
Key = "Predictor importance"
OutList[[Key]]= Importance
Key = "Interaction"
if(DoInteraction == TRUE)
OutList[[Key]] = Interaction
return(c(OutList))
}