-
Notifications
You must be signed in to change notification settings - Fork 0
/
machineLearning
113 lines (88 loc) · 2.76 KB
/
machineLearning
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Install Packages ----
# Install caret only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library(caret)
# Load Data ----
# Make the iris data set available in the workspace.
data(iris)
# Work on a copy under a generic name.
dataset <- iris

# Validate Dataset ----
# Seed the RNG before drawing the split so the train/validation partition is
# reproducible, matching the set.seed(7) used before each model fit below.
set.seed(7)
# Row indices selected by createDataPartition() on Species with p = 0.80;
# these rows are kept for training.
validation_index <- createDataPartition(
  dataset$Species,
  p = 0.80,
  list = FALSE
)
# Hold out the rows that were NOT selected as the validation set.
validation <- dataset[-validation_index, ]
# From here on `dataset` holds only the selected rows, used to train and
# test the models.
dataset <- dataset[validation_index, ]
# Summarize Dataset ----
# Number of rows and columns in the training data.
dim(dataset)

# Types of Attributes
# Class of every column.
sapply(dataset, class)

# Review the Data
# Peek at the leading rows (head() shows six rows by default).
head(dataset)

# Levels of Class
# Distinct labels of the class column.
levels(dataset[["Species"]])

# Class Distribution
# Count and percentage share of each class.
percentage <- prop.table(table(dataset[["Species"]])) * 100
cbind(freq = table(dataset[["Species"]]), percentage = percentage)

# Statistical Summary
# Per-column summary of the attribute distributions.
summary(dataset)
# Visualize Dataset ----
# Univariate plots: one plot per individual variable.
# Split the inputs (columns 1-4) from the output (column 5).
x <- dataset[, 1:4]
y <- dataset[, 5]
# Box plot for each attribute, side by side on one image.
# mfrow expects c(rows, cols); c(1:4) is a length-4 vector, which par()
# rejects with an error, so request a 1 x 4 grid explicitly.
old_par <- par(mfrow = c(1, 4))
for (i in seq_along(x)) {
  # Title each panel with the name of the column actually being plotted.
  boxplot(x[, i], main = names(x)[i])
}
# Restore the previous layout so the bar plot is not squeezed into one cell
# of the 1 x 4 grid.
par(old_par)
# Bar plot of the class breakdown.
plot(y)
# Multivariate Plots ----
# Interactions between variables.
# Scatterplot matrix of the inputs, grouped by class.
# NOTE(review): the "ellipse" plot type may need an extra package installed;
# confirm.
featurePlot(x = x, y = y, plot = "ellipse")
# Box-and-whisker plots of each attribute.
featurePlot(x = x, y = y, plot = "box")
# Density plots of each attribute by class value; both axes get a "free"
# relation.
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
featurePlot(x = x, y = y, plot = "density", scales = scales)
# EVALUATE ALGORITHMS ----
# Test harness: estimate accuracy with 10-fold cross-validation.
# Resampling settings shared by every train() call below.
control <- trainControl(method = "cv", number = 10)
# Metric passed to every train() call below.
metric <- "Accuracy"
# Build Models ----
# The RNG is reset to the same seed before every fit; presumably so that each
# model is resampled on the same cross-validation folds.

# 1. Linear algorithm: LDA
set.seed(7)
fit.lda <- train(
  Species ~ .,
  data = dataset, method = "lda", metric = metric, trControl = control
)

# 2. Nonlinear algorithms
# CART
set.seed(7)
fit.cart <- train(
  Species ~ .,
  data = dataset, method = "rpart", metric = metric, trControl = control
)
# kNN
set.seed(7)
fit.knn <- train(
  Species ~ .,
  data = dataset, method = "knn", metric = metric, trControl = control
)

# 3. Advanced algorithms
# SVM with a radial kernel
set.seed(7)
fit.svm <- train(
  Species ~ .,
  data = dataset, method = "svmRadial", metric = metric, trControl = control
)
# Random forest
set.seed(7)
fit.rf <- train(
  Species ~ .,
  data = dataset, method = "rf", metric = metric, trControl = control
)
# SELECT BEST MODEL ----
# Collect the resampling results of all five fitted models.
results <- resamples(
  list(
    lda = fit.lda,
    cart = fit.cart,
    knn = fit.knn,
    svm = fit.svm,
    rf = fit.rf
  )
)
# Tabular summary of the models' accuracy.
summary(results)
# Visual comparison of the models' accuracy.
dotplot(results)
# Summarize the model treated as best; LDA is hard-coded here rather than
# picked from `results`.
print(fit.lda)
# MAKE PREDICTIONS ----
# Estimate the skill of the LDA model on the held-out validation rows.
predictions <- predict(fit.lda, newdata = validation)
# Cross-tabulate predicted against actual species.
confusionMatrix(data = predictions, reference = validation$Species)