#Import required libraries
library(tidyverse)
library(stopwords)
library(text2vec)
library(data.table)
library(magrittr)
library(glmnet)
library(xgboost)
library(randomForest)
#Read in data
reviews <- read_csv("AppReview.csv")
#Keep only the required columns, create a sentiment label by rounding the reviewer rating,
#add an id index and convert to a data.table.
reviews_cleaned <- reviews %>% mutate(sentiment = round(reviewerRating)) %>%
select(reviewText, reviewerRating, sentiment)
reviews_cleaned$id <- 1:nrow(reviews_cleaned)
setDT(reviews_cleaned)
setkey(reviews_cleaned, id)
rm(reviews)
#Split into training (80%, itself split 80/20 into tuning and validation) and test (20%) sets
all_ids <- reviews_cleaned$id
train_ids <- sample(all_ids, nrow(reviews_cleaned)*0.8)
tune_ids <- sample(train_ids, length(train_ids)*0.8)
valid_ids <- setdiff(train_ids, tune_ids)
test_ids <- setdiff(all_ids, train_ids)
train <- reviews_cleaned[J(train_ids)]
tune <- reviews_cleaned[J(tune_ids)]
valid <- reviews_cleaned[J(valid_ids)]
test <- reviews_cleaned[J(test_ids)]
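#reviews_cleaned is keyed on id, so reviews_cleaned[J(ids)] performs a keyed join that
#returns the rows whose id values are in the given split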
#Generate stopwords set
words_to_include <- c("no", "not", "cannot")
words_to_remove <- c("full", "review")
sw <- stopwords::stopwords("en")[!(stopwords::stopwords("en") %in% words_to_include)]
sw <- append(sw, words_to_remove)
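#Negation words are kept out of the stopword list because they can flip a review's
#sentiment; "full" and "review" are appended as extra terms to filter out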
# # --- Automatic Tuning ---
#
# #Combinations of hyperparameters
# n_comb <- 32
# vocab_term_maxs <- c(1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
# 2500, 2500, 2500, 2500, 2500, 2500, 2500, 2500,
# 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000,
# 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000)
# tf_idfs <- c(0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
# 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L)
# max_depths <- c(3L, 7L, 13L, 19L, 3L, 7L, 13L, 19L, 3L, 7L, 13L, 19L, 3L, 7L, 13L, 19L,
# 3L, 7L, 13L, 19L, 3L, 7L, 13L, 19L, 3L, 7L, 13L, 19L, 3L, 7L, 13L, 19L)
# n_rounds <- c(1000, 400, 200, 150, 1000, 400, 200, 150,
# 1000, 400, 200, 150, 1000, 400, 200, 150,
# 1000, 400, 200, 150, 1000, 400, 200, 150,
# 1000, 400, 200, 150, 1000, 400, 200, 150)
# errors <- numeric(n_comb)
#
# #Find best combination
# for(i in 1:n_comb){
#
# #Tuning iterator
# it_tune <- itoken(tune$reviewText, preprocessor = tolower, tokenizer = word_tokenizer,
# ids = tune$id, progressbar = FALSE)
#
# #Create vocab
# vocab = create_vocabulary(it_tune, stopwords = sw, ngram = c(1L, 3L))
# vocab = prune_vocabulary(vocab, term_count_min = 10,
# doc_proportion_max = 0.65, vocab_term_max = vocab_term_maxs[i])
# vectorizer <- vocab_vectorizer(vocab)
#
# #Create tuning dtm
# dtm_tune <- create_dtm(it_tune, vectorizer)
# if(tf_idfs[i] == 1){
# tfidf = TfIdf$new()
# dtm_tune <- fit_transform(dtm_tune, tfidf)
# }
#
# #Train xgboost model
# boost_model <- xgboost(data = dtm_tune, label = tune$sentiment, max_depth = max_depths[i],
# eta = 0.3, nthread = 8, nrounds = n_rounds[i], objective = "binary:hinge",
# print_every_n = 50)
#
# #Build validation dtm
# it_valid = itoken(valid$reviewText, preprocessor = tolower, tokenizer = word_tokenizer,
# ids = valid$id, progressbar = FALSE)
# dtm_valid = create_dtm(it_valid, vectorizer)
# if(tf_idfs[i] == 1){
# dtm_valid <- transform(dtm_valid, tfidf)
# }
#
# #Make predictions and calculate error
# preds <- predict(boost_model, dtm_valid, type = 'class')
# errors[i] <- mean(preds != valid$sentiment)
# }
#
# # --- Manual Tuning ---
#
# #Manually tuning best combo from automatic tuning
#
# #Create vocab
# vocab = create_vocabulary(it_tune, stopwords = sw, ngram = c(1L, 3L))
# vocab = prune_vocabulary(vocab, term_count_min = 10,
# doc_proportion_max = 0.65, vocab_term_max = 10000)
# vectorizer <- vocab_vectorizer(vocab)
#
# #Create tuning dtm
# dtm_tune <- create_dtm(it_tune, vectorizer)
#
# #Train xgboost model
# boost_model <- xgboost(data = dtm_tune, label = tune$sentiment, max_depth = 13,
# eta = 0.2, nthread = 8, nrounds = 300, objective = "binary:hinge")
#
# #Create valid DTM
# it_valid = itoken(valid$reviewText, preprocessor = tolower, tokenizer = word_tokenizer,
# ids = valid$id, progressbar = FALSE)
# dtm_valid = create_dtm(it_valid, vectorizer)
#
# #Make predictions and calculate error
# preds <- predict(boost_model, dtm_valid, type = 'class')
# mean(preds == valid$sentiment)
# --- Final Model ---
#Preprocessing and tokenization of whole training set
it_train <- itoken(train$reviewText, preprocessor = tolower, tokenizer = word_tokenizer,
ids = train$id, progressbar = FALSE)
#Vocab from entire training set
vocab = create_vocabulary(it_train, stopwords = sw, ngram = c(1L, 3L))
vocab = prune_vocabulary(vocab, term_count_min = 10,
doc_proportion_max = 0.65,
vocab_term_max = 10000)
#Vectorizer from entire training set
vectorizer <- vocab_vectorizer(vocab)
#Create training DTM
dtm_train <- create_dtm(it_train, vectorizer)
#Train XGBoost model
boost_model <- xgboost(data = dtm_train, label = train$sentiment, max_depth = 13,
eta = 0.3, nthread = 8, nrounds = 300, objective = "binary:logistic")
#Create test DTM
it_test <- itoken(test$reviewText, preprocessor = tolower, tokenizer = word_tokenizer,
ids = test$id, progressbar = FALSE)
dtm_test <- create_dtm(it_test, vectorizer)
#Make predictions
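#binary:logistic returns probabilities, so round() thresholds them at 0.5 to get 0/1 labels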
preds_sent <- round(predict(boost_model, dtm_test))
#Calculate test accuracy
mean(preds_sent == test$sentiment)
#Import required libraries
library(tidyverse)
library(stopwords)
library(text2vec)
library(data.table)
library(magrittr)
library(glmnet)
library(xgboost)
library(randomForest)
#Load libraries
library(MASS)
library(caret)
library(corrplot)
library(KernSmooth)
#Let's include a stratified holdout function for future use
stratified_holdout <- function(y, epsilon){
n <- length(y)
labels <- unique(y)
ind_train <- NULL
ind_test <- NULL
for(label in labels){ #Go through each label
labels_i <- which(y == label)
n_i <- length(labels_i)
ind_train <- c(ind_train, sample(labels_i,
round((1 - epsilon)*n_i), replace = FALSE)) #Select (1 - epsilon)% for train
}
ind_test <- (1:n)[-ind_train] #Everything not in training set is in test set
return(list(ind_train, ind_test))
}
setwd("C:\\Users\\micha\\Documents\\edX Capstone\\Part 2")
#Load the data
xy <- read.csv('bananaData.csv')
#Store the data dimensions, scale the predictors and convert the response to a factor
n <- nrow(xy) #Number of examples
p <- ncol(xy) - 1 #Number of predictors
pos_resp <- p + 1 #Position of response
x <- scale(xy[,-pos_resp]) #Scaled predictor variables
xy[,pos_resp] <- as.factor(xy[,pos_resp]) #Turn response to factor
y <- xy[,pos_resp] #Response variable
#Explore the data
str(xy) #Data Types
head(xy) #Get an idea of the data
which(is.na(xy)) #Check for empties
plot(xy[,-pos_resp], col = as.integer(y) + 1L, pch = as.integer(y) + 1L, main = "Banana Dataset")
#Check correlation between predictors
corrplot(cor(x))
#Plot the data with color representing class, y = 1 is in green, y = 0 is in red
plot(x, col = as.integer(y) + 1L, pch = as.integer(y) + 1L, main = "Banana Dataset")
#Cross-validation control
cv_control <- trainControl(method = "cv", number = 10)
#Optimal k in kNN
mod_knn <- train(x, y, method = "knn",
tuneGrid = expand.grid(k = 1:25),
trControl = cv_control) #kNN, CV for k = 1 ... 25
k_knn_opt <- mod_knn$bestTune$k
#Optimal c in linear SVM
mod_lsvm <- train(x, y, method = "svmLinear",
tuneGrid = expand.grid(C = 2^(-5:15)),
trControl = cv_control)
c_lsvm_opt <- mod_lsvm$bestTune$C
#Optimal C and sigma in RBF SVM
mod_rsvm <- train(x, y, method = "svmRadial",
tuneGrid = expand.grid(C = 2^(-5:15), sigma = 2^(-15:3)),
trControl = cv_control)
c_rsvm_opt <- mod_rsvm$bestTune$C
s_rsvm_opt <- mod_rsvm$bestTune$sigma
#Optimal depth in classification tree
mod_tree <- train(x, y, method = "rpart2",
tuneGrid = expand.grid(maxdepth = 1:12),
trControl = cv_control)
d_tree_opt <- mod_tree$bestTune$maxdepth
#Optimal mtry in RF (can only be 1 or 2 for us)
mod_rf <- train(x, y, method = "rf",
tuneGrid = expand.grid(mtry = 1:2),
trControl = cv_control)
mtry_rf_opt <- mod_rf$bestTune$mtry
#Optimal training rounds, depth, and learning rate in XGBoost
mod_xgb <- train(x, y, method = "xgbTree",
tuneGrid = expand.grid(nrounds = c(50, 100, 200, 400),
max_depth = 1:6,
eta = c(0.2, 0.3, 0.5, 0.8),
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1),
trControl = cv_control)
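#gamma, colsample_bytree, min_child_weight and subsample are held at the xgboost defaults,
#so only nrounds, max_depth and eta are actually tuned here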
n_xgb_opt <- mod_xgb$bestTune$nrounds
d_xgb_opt <- mod_xgb$bestTune$max_depth
e_xgb_opt <- mod_xgb$bestTune$eta
R <- 25 #Number of runs
error_sh <- matrix(0, nrow = R, ncol = 10) #Matrix for storing errors
no_control <- trainControl(method = 'none') #No trainControl since hyperparameters are known
for(r in 1:R){
#Generate random stratified split
indices <- stratified_holdout(y, 1/3)
ind_train <- indices[[1]]
ind_test <- indices[[2]]
#Train models
mod_lda <- train(x[ind_train,], y[ind_train], method = "lda") #LDA
mod_qda <- train(x[ind_train,], y[ind_train], method = "qda") #QDA
mod_nb <- train(x[ind_train,], y[ind_train], method = "nb") #Naive Bayes
mod_knn <- train(x[ind_train,], y[ind_train], method = "knn", trControl = no_control,
tuneGrid = expand.grid(k = k_knn_opt)) #kNN
mod_lr <- train(x[ind_train,], y[ind_train], method = "glm", family = binomial(link = "logit")) #LR
mod_lsvm <- train(x[ind_train,], y[ind_train], method = "svmLinear", trControl = no_control,
tuneGrid = expand.grid(C = c_lsvm_opt)) #Linear SVM
mod_rsvm <- train(x[ind_train,], y[ind_train], method = "svmRadial", trControl = no_control,
tuneGrid = expand.grid(C = c_rsvm_opt, sigma = s_rsvm_opt)) #RBF SVM
mod_tree <- train(x[ind_train,], y[ind_train], method = "rpart2", trControl = no_control,
tuneGrid = expand.grid(maxdepth = d_tree_opt)) #Tree
mod_rf <- train(x[ind_train,], y[ind_train], method = "rf", trControl = no_control,
tuneGrid = expand.grid(mtry = mtry_rf_opt)) #RF
mod_xgb <- train(x[ind_train,], y[ind_train], method = "xgbTree", trControl = no_control,
tuneGrid = expand.grid(nrounds = n_xgb_opt,
max_depth = d_xgb_opt,
eta = e_xgb_opt,
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1)) #XGBoost
#Make predictions
y_hat_lda <- predict(mod_lda, x[ind_test,])
y_hat_qda <- predict(mod_qda, x[ind_test,])
y_hat_nb <- predict(mod_nb, x[ind_test,])
y_hat_knn <- predict(mod_knn, x[ind_test,])
y_hat_lr <- predict(mod_lr, x[ind_test,])
y_hat_lsvm <- predict(mod_lsvm, x[ind_test,])
y_hat_rsvm <- predict(mod_rsvm, x[ind_test,])
y_hat_tree <- predict(mod_tree, x[ind_test,])
y_hat_rf <- predict(mod_rf, x[ind_test,])
y_hat_xgb <- predict(mod_xgb, x[ind_test,])
#Store errors
error_sh[r, 1] <- mean(y_hat_lda != y[ind_test])
error_sh[r, 2] <- mean(y_hat_qda != y[ind_test])
error_sh[r, 3] <- mean(y_hat_nb != y[ind_test])
error_sh[r, 4] <- mean(y_hat_knn != y[ind_test])
error_sh[r, 5] <- mean(y_hat_lr != y[ind_test])
error_sh[r, 6] <- mean(y_hat_lsvm != y[ind_test])
error_sh[r, 7] <- mean(y_hat_rsvm != y[ind_test])
error_sh[r, 8] <- mean(y_hat_tree != y[ind_test])
error_sh[r, 9] <- mean(y_hat_rf != y[ind_test])
error_sh[r, 10] <- mean(y_hat_xgb != y[ind_test])
}
#Names of models in order
mod_names <- c("LDA", "QDA", "NB", "kNN", "LR", "Linear SVM", "RBF SVM", "Tree", "RF", "XGB")
#Error means
err_means <- data.frame("Model" = mod_names, "Mean Error" = colMeans(error_sh))
err_means
#Generate a boxplot of the errors
boxplot(error_sh, main = "Boxplots of Performance on Banana Dataset by Various Models over 25 Runs",
names = mod_names,
ylab = "Avg Error")
#Import required libraries
library(tidyverse)
library(stopwords)
library(text2vec)
library(data.table)
library(magrittr)
library(glmnet)
library(xgboost)
library(randomForest)
#--- Final Confirmation: Only Run for Generating a Boxplot of the Error Distribution ---
#Let's run stochastic holdout 25 times to see what the error distribution is like
#Let's include a stratified holdout function for future use
stratified_holdout <- function(y, epsilon){
n <- length(y)
labels <- unique(y)
ind_train <- NULL
ind_test <- NULL
for(label in labels){ #Go through each label
labels_i <- which(y == label)
n_i <- length(labels_i)
ind_train <- c(ind_train, sample(labels_i,
round((1 - epsilon)*n_i), replace = FALSE)) #Select (1 - epsilon)% for train
}
ind_test <- (1:n)[-ind_train] #Everything not in training set is in test set
return(list(ind_train, ind_test))
}
n_run <- 25
errors <- numeric(n_run)
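#Each run draws a fresh stratified 80/20 split and refits the vocabulary, DTM and model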
for(i in 1:n_run){
print(i)
#Split into training and test sets
ind <- stratified_holdout(reviews_cleaned$sentiment, 0.2)
ind_train <- ind[[1]]
ind_test <- ind[[2]]
train <- reviews_cleaned[ind_train]
test <- reviews_cleaned[ind_test]
#Preprocessing and tokenization of whole training set
it_train <- itoken(train$reviewText, preprocessor = tolower, tokenizer = word_tokenizer,
ids = train$id, progressbar = FALSE)
#Vocab from entire training set
vocab = create_vocabulary(it_train, stopwords = sw, ngram = c(1L, 3L))
vocab = prune_vocabulary(vocab, term_count_min = 10,
doc_proportion_max = 0.65,
vocab_term_max = 10000)
#Vectorizer from entire training set
vectorizer <- vocab_vectorizer(vocab)
#Create training DTM
dtm_train <- create_dtm(it_train, vectorizer)
#Train XGBoost model
boost_model <- xgboost(data = dtm_train, label = train$sentiment, max_depth = 13,
eta = 0.2, nthread = 8, nrounds = 300, objective = "binary:logistic",
print_every_n = 100)
#Create test DTM
it_test <- itoken(test$reviewText, preprocessor = tolower, tokenizer = word_tokenizer,
ids = test$id, progressbar = FALSE)
dtm_test <- create_dtm(it_test, vectorizer)
#Make predictions
preds_xgb <- round(predict(boost_model, dtm_test))
#Calculate test error (0 - 1 loss)
errors[i] <- mean(preds_xgb != test$sentiment)
}
boxplot(errors, main = "Boxplot of Boosted Tree Errors", ylab = "0 - 1 Loss")