-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathessemble
99 lines (76 loc) · 2.86 KB
/
essemble
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
library(tidyverse)
library(recipes)
# Load the heart data ----
# Folder that holds heart.csv. The file path is built with file.path() instead
# of calling setwd(), so the script no longer changes the session's working
# directory as a side effect.
data_dir <- 'C:/users/gusta/Dropbox/estatistica/heart'
heart <- read.csv(file.path(data_dir, 'heart.csv'), stringsAsFactors = FALSE)
str(heart)
dim(heart)

# Column names ----
# Overwrite the header produced by read.csv() with clean, fixed names.
nomes <- c(
  'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
  'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'
)
# Fail early if the file does not have the expected number of columns; a
# shorter name vector would otherwise leave NA column names behind.
stopifnot(length(nomes) == ncol(heart))
names(heart) <- nomes
# Encode categorical columns as factors ----
# Columns whose codes are kept as-is for the factor levels.
heart[c('cp', 'restecg', 'ca', 'thal')] <-
  lapply(heart[c('cp', 'restecg', 'ca', 'thal')], factor)

# Two-level columns get readable labels. factor() attaches the labels to the
# sorted distinct values of each column, in that order.
heart$sex <- factor(heart$sex, labels = c('female', 'male'))
heart$fbs <- factor(heart$fbs, labels = c('<120', '>120'))
heart$exang <- factor(heart$exang, labels = c('no', 'yes'))
heart$target <- factor(heart$target, labels = c('no', 'yes'))
# Preprocessing with recipes ----
# Build the recipe one step at a time; `target` is the outcome and every other
# column is a predictor.
# NOTE(review): the recipe is prepared on the full data set, before the
# train/test split made later in the script, so the test rows influence the
# estimated transformations. Confirm whether that is intended.
rec <- recipe(target ~ ., data = heart)
rec <- step_BoxCox(rec, all_numeric())                 # Box-Cox transform
rec <- step_center(rec, all_numeric())                 # centre numeric columns
rec <- step_scale(rec, all_numeric())                  # scale numeric columns
rec <- step_corr(rec, all_numeric(), threshold = 0.9)  # correlation filter
rec <- step_dummy(rec, all_nominal(), -all_outcomes()) # dummy-code predictors
rec <- step_nzv(rec, all_predictors())                 # near-zero-variance filter

# Estimate the steps on `heart`, then apply them to the same data.
preparo <- prep(rec, heart)
heart_prep <- bake(preparo, heart)
# H2O ----
library(h2o)
# Start (or connect to) a local H2O instance; nthreads = -1 is the value the
# h2o.init() documentation describes as "use all available CPUs".
h2o.init(nthreads = -1)
# Upload the preprocessed data to the H2O cluster.
heart.h2o <- as.h2o(heart_prep, destination_frame = 'heart.h2o')

# Train/test split ----
# 80% train, 20% test. The seed makes the split reproducible, matching the
# seed = 123 used by the models trained on it. The intermediate list is named
# `splits` so that it does not mask base::split().
splits <- h2o.splitFrame(heart.h2o, ratios = 0.8, seed = 123)
heart.train <- splits[[1]]
heart.test <- splits[[2]]

# Predictor and response names ----
x <- setdiff(names(heart.h2o), 'target')
y <- 'target'
# Model: GLM with lasso ----
# Required so this model can be stacked in an ensemble:
#   fold_assignment = 'Modulo'
#   keep_cross_validation_predictions = TRUE
#
# alpha = 1 selects the pure L1 (lasso) penalty. Without it H2O uses its
# default elastic-net mix, which is not the lasso this section asks for.
# lambda_search = TRUE lets H2O search over the penalty strength.
#
# NOTE: the fitted model is stored under the name `h2o.glm`, the same name as
# the h2o function. Calls such as h2o.glm(...) still resolve to the function,
# because R skips non-function bindings when looking up a call. The name is
# kept because later code refers to this object.
h2o.glm <- h2o.glm(
  x = x, y = y, training_frame = heart.train,
  family = 'binomial',
  alpha = 1, lambda_search = TRUE,
  nfolds = 10, seed = 123,
  fold_assignment = 'Modulo',
  keep_cross_validation_predictions = TRUE
)
# Print the model summary, evaluate on the hold-out frame, plot importances.
h2o.glm
h2o.performance(h2o.glm, newdata = heart.test)
h2o.varimp_plot(h2o.glm)
# Model: random forest ----
# Uses the same 10-fold 'Modulo' fold assignment as the GLM and keeps the
# cross-validation predictions.
h2o.rf <- h2o.randomForest(
  x = x,
  y = y,
  training_frame = heart.train,
  # forest shape
  ntrees = 100,
  max_depth = 10,
  mtries = 7,
  min_rows = 1,
  # cross-validation set-up
  nfolds = 10,
  fold_assignment = 'Modulo',
  keep_cross_validation_predictions = TRUE,
  seed = 123
)
# Hold-out performance and variable importance.
h2o.performance(h2o.rf, newdata = heart.test)
h2o.varimp_plot(h2o.rf)
# Random forest: hyper-parameter grid ----
# Cartesian grid over 3 x 3 x 3 x 2 = 54 combinations, each evaluated with
# 10-fold cross-validation. seed = 123 is passed through to every model so the
# grid is reproducible, in line with the other models in this script.
# The result `g` is not used by the rest of this script.
g <- h2o.grid(
  'randomForest',
  hyper_params = list(
    ntrees = c(50, 100, 120),
    max_depth = c(10, 20, 40),
    mtries = c(5, 7, 10),
    min_rows = c(1, 2)
  ),
  x = x, y = y, training_frame = heart.train,
  nfolds = 10,
  seed = 123
)
# Stacked ensemble ----
# Base learners: the GLM and the random forest fitted earlier in this script,
# both trained with 10-fold 'Modulo' cross-validation and with their
# cross-validation predictions kept.
models <- list(h2o.glm, h2o.rf)
m.stack <- h2o.stackedEnsemble(
  x = x,
  y = y,
  training_frame = heart.train,
  base_models = models
)
# Print the ensemble, then score it on the hold-out frame.
m.stack
h2o.performance(m.stack, newdata = heart.test)