prediction.Rmd

---
title: "Prediction - Comparing Trees"
author: "Christina Huang"
output: html_document
---

```{r}

library (dplyr)
library (tidyr)
library (ggplot2)
#install.packages('caret')
#install.packages("RWeka")
#install.packages("C50")

# Load the drop-out data set. read.csv() already returns a data frame, so no
# extra data.frame() wrapper is needed.
DF1 <- read.csv("drop-out.csv")

# Install caret only when it is missing, so re-running or knitting this chunk
# does not download and reinstall the package every time.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret", dependencies = TRUE)
}

library(caret)

# Fix the random seed so the train/test split is reproducible.
set.seed(1000000)

# Row indices for the training set. createDataPartition() balances the split
# on `y`, so it is given the outcome that the models below predict
# (`complete`) rather than the student identifier.
trainData <- createDataPartition(
  y = DF1$complete, # the outcome to stratify on
  p = 0.75, # the proportion of rows placed in the training set
  list = FALSE # return the indices as a matrix instead of a list
)

# Split the rows: the sampled indices form the training set and every
# remaining row forms the testing set.
training <- DF1[trainData, ]
testing <- DF1[-trainData, ]

summary(training)
summary(testing)

# GGally is not loaded or called anywhere in the visible code, so its install
# call is left disabled like the other optional installs at the top.
#install.packages("GGally")

# Columns for the scatterplot matrix: everything except the three excluded
# variables.
DF2 <- select(DF1, -complete, -international, -online)
head(DF2) # preview only; printing every row floods the knitted report

# Write the scatterplot matrix to a PDF file. The device is opened only
# around the plotting call.
pdf(file = "scatterplot_matrix.pdf")
pairs(DF2)
dev.off()

library(rpart)

# Keep columns 2 through 10 only, which removes the student_id variable that
# should not be used in the model.
# NOTE(review): this presumes student_id is column 1 and that the data has
# ten columns - confirm against drop-out.csv.
training2 <- training[, 2:10]

# Custom summary function for trainControl(). caret does not report all the
# metrics wanted here by default, so this concatenates the results of
# defaultSummary(), twoClassSummary() and prSummary(), in that order.
#
# data, lev, model: forwarded unchanged to each of the three summary
#   functions (lev and model default to NULL).
# Returns the combined vector produced by c() over the three results.
MySummary <- function(data, lev = NULL, model = NULL) {
  c(
    defaultSummary(data, lev, model),
    twoClassSummary(data, lev, model),
    prSummary(data, lev, model)
  )
}

# Define the control elements used for resampling: repeated k-fold
# cross-validation, scored with the combined MySummary metrics. `number` is
# not set, so caret's default number of folds applies.
ctrl <- trainControl(
  method = "repeatedcv", # repeated k-fold cross-validation
  repeats = 3, # repeat the whole k-fold procedure three times
  classProbs = TRUE, # calculate class probabilities
  summaryFunction = MySummary
)

# Define the CART model: predict `complete` from every other column of
# training2.
cartFit <- train(
  complete ~ ., # the variable to predict
  data = training2, # the data set to train the model on
  trControl = ctrl, # the control elements defined above
  method = "rpart", # the model type
  metric = "Accuracy", # the final model is chosen by accuracy
  preProcess = c("center", "scale") # center and scale the predictors
)

# Check the results
cartFit
plot(cartFit)

# Test-set copy restricted to columns 2 through 10, which removes the
# student_id variable that should not be used in the model.
testing2 <- testing[, 2:10]

# Generate predictions using the previously trained model
cartClasses <- predict(cartFit, newdata = testing2)

# Generate model statistics. The reference factor reuses the levels of the
# predictions so both factors share one level set, even when a class happens
# to be absent from the test rows.
confusionMatrix(
  data = cartClasses,
  reference = factor(testing2$complete, levels = levels(cartClasses))
)

library(party)

# Define the conditional inference tree model. It reuses the `ctrl`
# resampling setup defined earlier in this file; the identical second
# definition that used to sit here was redundant.
condFit <- train(
  complete ~ ., # the variable to predict
  data = training2, # the data set to train the model on
  trControl = ctrl, # the control elements
  method = "ctree", # the model type
  metric = "ROC", # the final model is chosen by ROC
  preProcess = c("center", "scale") # center and scale the predictors
)

# Check the results
condFit
condFit$finalModel # was misspelled `fitnalModel`, which silently gave NULL
plot(condFit)
  
  
# Test-set copy restricted to columns 2 through 10, which removes the
# student_id variable that should not be used in the model.
testing3 <- testing[, 2:10]

# Generate predictions using the previously trained model
condClasses <- predict(condFit, newdata = testing3)

# Generate model statistics. The reference factor reuses the levels of the
# predictions so both factors share one level set, even when a class happens
# to be absent from the test rows.
confusionMatrix(
  data = condClasses,
  reference = factor(testing3$complete, levels = levels(condClasses))
)

# library() stops with an error if C50 is missing, whereas require() only
# returns FALSE and lets the script carry on.
library(C50)

# Define the C5.0 model: predict `complete` from every other column of
# training2.
c50Fit <- train(
  complete ~ ., # the variable to predict
  data = training2, # the data set to train the model on
  trControl = ctrl, # the control elements
  method = "C5.0", # the model type
  metric = "Accuracy", # the final model is chosen by accuracy
  preProcess = c("center", "scale") # center and scale the predictors
)

# Check the results
c50Fit
plot(c50Fit)
  
# Generate predictions using the previously trained model
c50Classes <- predict(c50Fit, newdata = testing3)

# Generate model statistics. The reference factor reuses the levels of the
# predictions so both factors share one level set, even when a class happens
# to be absent from the test rows.
confusionMatrix(
  data = c50Classes,
  reference = factor(testing3$complete, levels = levels(c50Classes))
)

# Collect the resampling results of the three fitted models under short
# labels and summarise them side by side.
# NOTE(review): the seed is set only once, before the data split, so the
# three train() calls may not have used the same folds - confirm that this
# comparison is intended.
resamps <- resamples(
  list(
    cart = cartFit,
    condinf = condFit,
    cfiveo = c50Fit
  )
)
summary(resamps)
```