core-methods-in-edm · ericylc23 · Dec 17, 2019
diff --git a/Assignment7.Rmd b/Assignment7.Rmd
@@ -11,71 +11,106 @@ In the following assignment you will be looking at data from an one level of an
 
 #Upload data
 ```{r}
+library(ggplot2)
+library(dplyr)
+library(tidyr)
+library(rpart)
+library(party)
+D1 <- read.csv("online.data.csv") # raw course data; the relative path is resolved against the current working directory
 
 ```
 
 #Visualization 
 ```{r}
-#Start by creating histograms of the distributions for all variables (#HINT: look up "facet" in the ggplot documentation)
 
+#Start by creating histograms of the distributions for all variables (#HINT: look up "facet" in the ggplot documentation)
+D2 <- gather(mutate(D1, level.up = ifelse(level.up == "no", 0, 1)), "variable", "value", 2:7) # recode level.up to 0/1 (same coding as D3) so the gathered value column stays numeric if 2:7 includes it
+ggplot(D2, mapping = aes(x = value)) + # x = value: bin the observations themselves instead of drawing one bar per id
+  geom_histogram(bins = 30) + # default count stat, so each panel is a real distribution
+  facet_wrap(~variable, scales = "free") # each variable has its own range, so both axes are free per panel
 #Then visualize the relationships between variables
-
+D3 <- D1 %>%
+      mutate(level.up = ifelse(level.up == "no",0,1)) # numeric 0/1 copy built first: pairs() and cor() both need numeric columns
+pairs(D3) # scatterplot matrix on the all-numeric copy; pairs() stops on a character level.up column
 #Try to capture an intution about the data and the relationships
-
+library(corrplot)
+COR <- cor(D3) # pairwise correlations, level.up included as 0/1
+corrplot(COR, order="AOE", method="color", tl.pos="lt", type="upper")
 ```
 #Classification tree
 ```{r}
 #Create a classification tree that predicts whether a student "levels up" in the online course using three variables of your choice (As we did last time, set all controls to their minimums)
-
+c.tree1 <- rpart(as.factor(level.up) ~ post.test.score + messages + av.assignment.score, method = "class", data = D1) # NOTE(review): no control= is passed, so rpart's default controls apply rather than the minimums the prompt asks for - confirm intent
 #Plot and generate a CP table for your tree 
-
+printcp(c.tree1) # complexity-parameter (CP) table for the fitted tree
+post(c.tree1, file = "tree1.ps") # writes the tree plot to the file tree1.ps
 #Generate a probability value that represents the probability that a student levels up based your classification tree 
-
-D1$pred <- predict(rp, type = "prob")[,2]#Last class we used type = "class" which predicted the classification for us, this time we are using type = "prob" to see the probability that our classififcation is based on.
+D3$pred <- predict(c.tree1, type = "prob")[,2] # second column of the class-probability matrix; presumably P(level.up = "yes") - confirm the factor level order
+#Last class we used type = "class" which predicted the classification for us, this time we are using type = "prob" to see the probability that our classification is based on.
 ```
 ## Part II
 #Now you can generate the ROC curve for your model. You will need to install the package ROCR to do this.
 ```{r}
 library(ROCR)
 
 #Plot the curve
-pred.detail <- prediction(D1$pred, D1$level.up) 
+pred.detail <- prediction(D3$pred, D3$level.up) # ROCR prediction object: model-1 probabilities against the 0/1 level.up labels
 plot(performance(pred.detail, "tpr", "fpr"))
 abline(0, 1, lty = 2)
-
 #Calculate the Area Under the Curve
-unlist(slot(performance(Pred2,"auc"), "y.values"))#Unlist liberates the AUC value from the "performance" object created by ROCR
+unlist(slot(performance(pred.detail, "auc"), "y.values")) #Unlist liberates the AUC value from the "performance" object created by ROCR
 
 #Now repeat this process, but using the variables you did not use for the previous model and compare the plots & results of your two models. Which one do you think was the better model? Why?
+c.tree2 <- rpart(as.factor(level.up) ~ forum.posts + id + pre.test.score, method = "class", data = D1) # second tree: the three predictors not used in c.tree1; NOTE(review): id looks like a row identifier rather than a behavioural measure - confirm it belongs in the model
+printcp(c.tree2) # complexity-parameter (CP) table for the second tree
+post(c.tree2, file = "tree2.ps", title = "CP Table2") # tree plot written to tree2.ps; NOTE(review): the title labels a tree plot as a CP table - confirm
+D3$pred2 <- predict(c.tree2, type = "prob")[,2] # second probability column, presumably P(level.up = "yes") - confirm the factor level order
+pred.detail2 <- prediction(D3$pred2, D3$level.up) 
+plot(performance(pred.detail2, "tpr", "fpr")) # ROC curve: true-positive rate against false-positive rate
+abline(0, 1, lty = 2) # dashed diagonal for reference
+unlist(slot(performance(pred.detail2, "auc"), "y.values")) # area under the ROC curve for the second model
 ```
+##Comparing the area under the ROC curve (AUC), the first model is better than the second model: its AUC equals 1.
 ## Part III
 #Thresholds
 ```{r}
 #Look at the ROC plot for your first model. Based on this plot choose a probability threshold that balances capturing the most correct predictions against false positives. Then generate a new variable in your data set that classifies each student according to your chosen threshold.
 
-threshold.pred1 <- 
+cut <- 0.5
+# Binarise the model-1 probabilities in one vectorised step:
+# 1 when pred is at or above the cut-off, 0 when it is below.
+D3$threshold.pred1 <- ifelse(D3$pred >= cut, 1, 0)
 
 #Now generate three diagnostics:
 
-D1$accuracy.model1 <-
-
-D1$precision.model1 <- 
-
-D1$recall.model1 <- 
-
-#Finally, calculate Kappa for your model according to:
-
+# Per-row flags (1 = hit) are stored in D3; the scalar diagnostics are derived from those flags.
+D3$accuracy.model1 <- ifelse(D3$level.up == D3$threshold.pred1, 1, 0)
+accuracy1 <- mean(D3$accuracy.model1) # share of students classified correctly (no integer truncation of the mean)
+D3$precision.model1 <- ifelse(D3$level.up == 1 & D3$threshold.pred1 == 1, 1, 0)
+precision1 <- sum(D3$precision.model1) / sum(D3$threshold.pred1) # true positives / predicted positives
+D3$recall.model1 <- ifelse(D3$level.up == 1 & D3$threshold.pred1 == 1, 1, 0)
+recall1 <- sum(D3$recall.model1) / sum(D3$level.up) # true positives / actual positives
 #First generate the table of comparisons
-table1 <- table(D1$level.up, D1$threshold.pred1)
+table1 <- table(factor(D3$level.up, levels = 0:1), factor(D3$threshold.pred1, levels = 0:1)) # fixed 0/1 levels keep the table 2x2 even if one class is never predicted
 
 #Convert to matrix
 matrix1 <- as.matrix(table1)
-
+matrix1
 #Calculate kappa
-kappa(matrix1, exact = TRUE)/kappa(matrix1)
-
+agree.exp1 <- sum(rowSums(matrix1) * colSums(matrix1)) / sum(matrix1)^2 # agreement expected by chance, from the row and column totals
+(sum(diag(matrix1)) / sum(matrix1) - agree.exp1) / (1 - agree.exp1) # Cohen's kappa = (observed - expected) / (1 - expected); base kappa() returns a matrix condition number instead
 #Now choose a different threshold value and repeat these diagnostics. What conclusions can you draw about your two thresholds?
-
+# Second threshold applied to the same model-1 probabilities (D3$pred).
+cut2 <- 1
+# Only students whose predicted probability reaches cut2 are classified as levelling up (>= avoids an exact float match).
+D3$threshold.pred2 <- ifelse(D3$pred >= cut2, 1, 0)
+# Per-row flags (1 = hit) are stored in D3; the scalar diagnostics are derived from those flags.
+D3$accuracy.model2 <- ifelse(D3$level.up == D3$threshold.pred2, 1, 0)
+accuracy2 <- mean(D3$accuracy.model2) # share of students classified correctly (no integer truncation of the mean)
+D3$precision.model2 <- ifelse(D3$level.up == 1 & D3$threshold.pred2 == 1, 1, 0)
+precision2 <- sum(D3$precision.model2) / sum(D3$threshold.pred2) # true positives / predicted positives; NaN if nobody is predicted positive
+D3$recall.model2 <- ifelse(D3$level.up == 1 & D3$threshold.pred2 == 1, 1, 0)
+recall2 <- sum(D3$recall.model2) / sum(D3$level.up) # true positives / actual positives
 ```
 
 ### To Submit Your Assignment