-
Notifications
You must be signed in to change notification settings - Fork 0
/
prediction.Rmd
136 lines (102 loc) · 4.38 KB
/
prediction.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
---
title: "Prediction - Comparing Trees"
author: "Christina Huang"
output: html_document
---
```{r}
library(dplyr)
library(tidyr)
library(ggplot2)
library(caret)
# One-time setup: run these from the console, not from this document, so that
# knitting never triggers a package download.
# install.packages("caret", dependencies = TRUE)
# install.packages(c("RWeka", "C50"))

# Load the drop-out data; read.csv() already returns a data frame.
DF1 <- read.csv("drop-out.csv")

# Fix the RNG seed so the train/test split is reproducible.
set.seed(1000000)
# Sample the row indices for the training set. `y` is the outcome column
# (`complete`, the variable modelled later in this file) so the split is
# stratified on the outcome rather than on the student identifier.
trainData <- createDataPartition(
  y = DF1$complete, # the outcome data are needed
  p = 0.75,         # the proportion of data in the training set
  list = FALSE      # return the indices as a matrix rather than a list
)
# Split the rows using the sampled index numbers.
training <- DF1[trainData, ]
testing <- DF1[-trainData, ]
summary(training)
summary(testing)
# Scatterplot matrix of the columns left after dropping `complete`,
# `international` and `online`.
# Build the data before opening the graphics device so that a failure in
# select() cannot leave a PDF device open.
DF2 <- select(DF1, -complete, -international, -online)
# Preview the first rows instead of printing the entire data frame.
head(DF2)
# Draw the pairs plot into scatterplot_matrix.pdf, then close the device.
pdf(file = "scatterplot_matrix.pdf")
pairs(DF2)
dev.off()
library(rpart)
# Keep columns 2 through 10 of the training rows; this drops column 1
# (student_id, per the original note), which should not be a model input.
training2 <- training[, 2:10]
# caret does not report every metric we want by default, so this summary
# function concatenates the results of three caret summary functions.
#
# Arguments (forwarded unchanged to each summary function):
#   data  - the resampling data handed over by caret
#   lev   - outcome class levels (default NULL)
#   model - model identifier (default NULL)
# Returns: one vector holding the defaultSummary(), twoClassSummary() and
#   prSummary() results, in that order.
MySummary <- function(data, lev = NULL, model = NULL) {
  c(
    defaultSummary(data, lev, model),
    twoClassSummary(data, lev, model),
    prSummary(data, lev, model)
  )
}
# Define the resampling scheme used while tuning the model.
ctrl <- trainControl(
  method = "repeatedcv",       # repeated k-fold cross-validation
  repeats = 3,                 # run the complete k-fold procedure three times
  classProbs = TRUE,           # calculate class probabilities
  summaryFunction = MySummary  # report the combined metric set
)
# Fit a CART tree (rpart) that predicts `complete` from every other column
# of training2.
cartFit <- train(
  complete ~ .,                      # variable to predict
  data = training2,                  # data set to train the model on
  trControl = ctrl,                  # resampling settings defined above
  method = "rpart",                  # model type
  metric = "Accuracy",               # final model is chosen by accuracy
  preProcess = c("center", "scale")  # center and scale the predictors
)
# Check the results
cartFit
plot(cartFit)
# Keep columns 2 through 10 of the test rows; this drops column 1
# (student_id, per the original note), which should not be a model input.
testing2 <- testing[, 2:10]
# Generate predictions using the previously trained model
cartClasses <- predict(cartFit, newdata = testing2)
# Generate model statistics. The reference factor is built with the same
# levels as the predictions so the two always line up, even if one class
# happens to be missing from the test rows.
confusionMatrix(
  data = cartClasses,
  reference = factor(testing2$complete, levels = levels(cartClasses))
)
library(party)
# Fit a conditional inference tree (ctree) that predicts `complete` from every
# other column of training2. The resampling settings are the `ctrl` object
# already defined earlier in this chunk; an identical trainControl() call used
# to be repeated here.
condFit <- train(
  complete ~ .,                      # variable to predict
  data = training2,                  # data set to train the model on
  trControl = ctrl,                  # resampling settings
  method = "ctree",                  # model type
  metric = "ROC",                    # final model is chosen by ROC
  preProcess = c("center", "scale")  # center and scale the predictors
)
# Check the results
condFit
# Show the fitted tree. The element is `finalModel`; the earlier misspelling
# `fitnalModel` silently evaluated to NULL.
condFit$finalModel
plot(condFit)
# Keep columns 2 through 10 of the test rows; this drops column 1
# (student_id, per the original note), which should not be a model input.
testing3 <- testing[, 2:10]
# Generate predictions using the previously trained model
condClasses <- predict(condFit, newdata = testing3)
# Generate model statistics. The reference factor is built with the same
# levels as the predictions so the two always line up.
confusionMatrix(
  data = condClasses,
  reference = factor(testing3$complete, levels = levels(condClasses))
)
# library() stops with an error if C50 is missing; require() would only warn
# and return FALSE.
library(C50)
# Fit a C5.0 model that predicts `complete` from every other column of
# training2, using the `ctrl` resampling settings defined earlier.
c50Fit <- train(
  complete ~ .,                      # variable to predict
  data = training2,                  # data set to train the model on
  trControl = ctrl,                  # resampling settings
  method = "C5.0",                   # model type
  metric = "Accuracy",               # final model is chosen by accuracy
  preProcess = c("center", "scale")  # center and scale the predictors
)
# Check the results
c50Fit
plot(c50Fit)
# Generate predictions using the previously trained model
c50Classes <- predict(c50Fit, newdata = testing3)
# Generate model statistics. The reference factor is built with the same
# levels as the predictions so the two always line up.
confusionMatrix(
  data = c50Classes,
  reference = factor(testing3$complete, levels = levels(c50Classes))
)
# Collect the resampling results of the three fitted models and summarise
# them side by side.
# NOTE(review): resamples() compares models resample-by-resample, but the
# train() calls in this file do not reset the seed, so each model was likely
# tuned on different folds. Consider calling set.seed() with the same value
# before every train(), or passing a shared `index` to trainControl().
resamps <- resamples(list(cart = cartFit, condinf = condFit, cfiveo = c50Fit))
summary(resamps)
```