-
Notifications
You must be signed in to change notification settings - Fork 10
/
ml1.R
139 lines (111 loc) · 3.39 KB
/
ml1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
library(caret)
library(FNN)
library(fastNaiveBayes)
library(tidyverse)
library(doParallel)
library(foreach)
library(functional)
## reading data
# Headerless CSV from the UCI repository; class label is in the first column.
# NOTE(review): columns arrive unnamed (X1..X14) until fix_cols() renames them.
dataurl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
wine <- read_csv(dataurl, col_names = F)
## Actual columns
## Human-readable names for the 14 columns of the wine data
## (first is the class label, the rest are chemical measurements).
good_cols <- c(
  "class", "alcohol", "malic_acid", "ash", "alkalinity",
  "magnesium", "total_phenols", "flavanoids", "nonflavonoids_phenols",
  "proanthocyanins", "color_intensity", "hue", "dilution", "proline"
)
## Column rename (data processing)
## Rename the raw columns and coerce the class label to a factor.
##
## df:   data frame as read from the UCI wine file (14 unnamed columns).
## cols: character vector of replacement column names; defaults to the
##       script-level `good_cols`, so existing callers are unchanged.
## Returns the renamed data frame with `class` as a factor.
fix_cols <- function(df, cols = good_cols) {
  colnames(df) <- cols
  ## BUG FIX: the original line was `df$class <- (df$class)`, a no-op; the
  ## stray parentheses indicate as.factor() was intended, and downstream code
  ## (stratified splits, confusion matrices) treats `class` as categorical.
  df$class <- as.factor(df$class)
  df
}
# Apply the column renaming to the freshly downloaded data.
wine <- fix_cols(wine)
## Train test split
## Stratified p/(1-p) partition on the first column. Side-effecting by
## design: writes `training` and `test` into the global environment with
## `<<-`, which the model/metric helpers below rely on.
## NOTE(review): this also masks base::split for the rest of the session.
split <- function(df, p = 0.75, list = FALSE, ...) {
  idx <- createDataPartition(df[[1]], p = p, list = list)
  cat("creating training dataset...\n")
  training <<- df[idx, ]
  cat("completed training dataset, creating test set\n")
  test <<- df[-idx, ]
  cat("done")
}
# Populates the globals `training` and `test` as a side effect.
split(wine)
## Making Model
## Predict test-set labels with k-NN. Reads the script-level
## `training`/`test` globals written by split(); column 1 (the label)
## is dropped from the feature matrices.
train_knn <- function(k) {
  labels <- as.factor(training$class)
  FNN::knn(train = training[-1], test = test[-1], cl = labels, k = k)
}
## Making metrics
## Confusion matrix: rows = actual classes, columns = predicted classes.
## `expected` is a data frame carrying the truth in its `class` column.
conmat <- function(predicted, expected) {
  actual <- as.factor(expected$class)
  as.matrix(table(Actual = actual, Predicted = predicted))
}
## Per-class F1 from the confusion matrix; returns the positive-class F1
## for binary problems and the macro-averaged F1 otherwise.
##
## predicted:      factor of predicted labels.
## expected:       data frame whose `class` column holds the truth.
## positive.class: label whose F1 is reported in the binary case.
f1_score <- function(predicted, expected, positive.class = "1") {
  cm <- conmat(predicted, expected)
  precision <- diag(cm) / colSums(cm)
  recall <- diag(cm) / rowSums(cm)
  f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall))
  # Assuming that F1 is zero when it's not possible compute it (0/0 cases).
  f1[is.na(f1)] <- 0
  ## BUG FIX: `expected` is a data frame, so nlevels(expected) was always 0
  ## and the binary branch was unreachable; count levels of the label itself.
  n_classes <- nlevels(as.factor(expected$class))
  # Binary F1 or Multi-class macro-averaged F1 (scalar if/else, not ifelse).
  if (n_classes == 2) {
    f1[[positive.class]]
  } else {
    mean(f1)
  }
}
## Overall accuracy = correct predictions / total predictions.
## BUG FIX: the original divided by length(test$class), silently depending
## on the global `test` even when called with a different `expected`; the
## total cell count of the confusion matrix is the same quantity, computed
## from the arguments alone.
accuracy <- function(predicted, expected) {
  cm <- conmat(predicted, expected)
  sum(diag(cm)) / sum(cm)
}
## Testing
# Quick smoke tests at k = 3; each relies on the `training`/`test` globals.
3 %>% train_knn %>% conmat(test)
3 %>% train_knn %>% accuracy(test)
3 %>% train_knn %>% f1_score(test)
## Accuracy and F1 for one choice of k, as a named numeric vector
## c(accuracy = ..., f1 = ...). Uses the global `test` set.
get_scores <- function(k) {
  preds <- train_knn(k)
  c(accuracy = accuracy(preds, test), f1 = f1_score(preds, test))
}
# Sanity check of the combined metric helper.
get_scores(3)
## Tuning K without a grid search
# Use all but one core for the parallel backend.
registerDoParallel(detectCores() -1)
# parallel KNN for k 1:33
# see how it is literally instantaneous
# NOTE(review): %dopar% and %>% are both user-defined infix operators and
# associate left to right, so this parses as
# (foreach(...) %dopar% get_scores(i)) %>% as_tibble, with the right-assign
# `->` storing the resulting tibble in `scores`.
foreach(i = 1:33, .combine = "rbind", .multicombine = T) %dopar% get_scores(i) %>% as_tibble -> scores
# Attach k itself so it can serve as the x-axis.
scores$index <- 1:33
library(ggplot2)
library(reshape2)
# Melt to long format (index, metric, value) so each metric gets its own line.
ggplot(data = melt(scores, id.vars = "index", variable.name = "metric"), aes(index, value)) + geom_line(aes(color = metric))
# making a pipeline for knn predictions
## Thin wrapper: download a headerless CSV into a tibble.
readfile <- function(url) {
  read_csv(url, col_names = FALSE)
}
## Pure variant of split(): no global side effects; returns both
## partitions in a named list(training = ..., test = ...).
split2 <- function(df, p = 0.75, list = FALSE, ...) {
  idx <- createDataPartition(df[[1]], p = p, list = list)
  list(training = df[idx, ], test = df[-idx, ])
}
# NOTE(review): `res` appears unused below — the pipeline calls split2
# directly; verify before removing.
res <- split2(wine)
## Factory: returns a one-argument predictor that runs k-NN on a
## {training, test} list as produced by split2().
##
## BUG FIX: the original assigned lst$training / lst$test to *locals* and
## then called train_knn(k); but train_knn resolves its free variables in
## the environment where it was defined (the global env), so the pipeline's
## own split was silently ignored and the old global split was used instead.
## The k-NN call is now made directly on the list that was passed in.
make_knn <- function(k) {
  function(lst) {
    FNN::knn(train = lst$training[-1], test = lst$test[-1],
             cl = as.factor(lst$training$class), k = k)
  }
}
# Full workflow: download -> rename columns -> split -> predict with k = 11.
knn11 <- make_knn(11)
knn_pipeline <- Compose(readfile, fix_cols, split2,knn11)
knn_pipeline(dataurl)
## Same prediction via caret's knn3Train, for comparison with FNN.
## Reads the script-level `training`/`test` globals, like train_knn().
caret_knn <- function(k) {
  labels <- as.factor(training$class)
  knn3Train(train = training[-1], test = test[-1], cl = labels, k = k, prob = F)
}
# Side-by-side predictions from the two k-NN implementations at k = 11.
caret_knn(11)
train_knn(11)