-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy path001_LinearRegression.R
61 lines (38 loc) · 1.13 KB
/
001_LinearRegression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
library(dplyr)
library(ggplot2)
library(caTools)
library(corrgram)
# 1. READ IN THE DATA
# Load data/Fish.csv (path relative to the working directory) into `df`.
df <- read.csv(file = 'data/Fish.csv')
# Print the first rows as a quick look at what was read.
head(x = df)
# 2. DATA ANALYSIS
# check for missing values: prints TRUE if any cell of df is NA
any(is.na(df))
# Weight vs Height plot, coloured by Species.
# size and alpha are fixed constants, so they are set on the layer, outside
# aes(). Inside aes() they are treated as data mappings: ggplot2 then draws
# meaningless "10" and "0.7" legends and lets the size/alpha scales choose
# the rendered values instead of using the numbers written here.
# Tune the two constants if the points overlap too much.
ggplot(data = df, mapping = aes(x = Weight, y = Height)) +
  geom_point(mapping = aes(color = Species), size = 10, alpha = 0.7)
# correlation check: panel.shade below the diagonal, panel.cor above it
corrgram(df, lower.panel = panel.shade, upper.panel = panel.cor)
# 3. TRAIN/TEST SPLIT
# Fix the RNG seed so the same rows are drawn on every run.
set.seed(42)
# One flag per row of df, drawn with a split ratio of 0.7;
# TRUE marks a row that goes to the training set.
sampleSplit <- sample.split(Y = df$Weight, SplitRatio = 0.7)
# Flagged rows form the training set, all remaining rows the test set.
trainSet <- df[sampleSplit, ]
testSet <- df[!sampleSplit, ]
# 4. TRAIN THE MODEL
# Linear model of Weight on every other column of trainSet.
model <- lm(formula = Weight ~ ., data = trainSet)
summary(model)
# visualize residuals
# Store the residuals in a properly named column and map that column in
# aes(), so the histogram is drawn from the data frame given to ggplot()
# instead of from a second residuals(model) call evaluated outside of it
# (which also left the raw expression as the axis label).
modelResiduals <- data.frame(Residual = residuals(model))
# bins = 30 is the value geom_histogram() falls back to anyway; stating it
# explicitly avoids the "pick better value" message on every run.
ggplot(data = modelResiduals, mapping = aes(x = Residual)) +
  geom_histogram(bins = 30, fill = 'deepskyblue', color = 'black')
# 5. MAKE PREDICTIONS
# Predict Weight for every row of the test set with the fitted model.
preds <- predict(object = model, newdata = testSet)
# 6. EVALUATE PREDICTIONS
# Pair each observed Weight with its prediction in a two-column data frame.
modelEval <- data.frame(Actual = testSet$Weight, Predicted = preds)
head(modelEval)
# Mean squared error: average of the squared prediction errors.
mse <- with(modelEval, mean((Actual - Predicted)^2))
mse
# Root mean squared error: square root of the MSE.
rmse <- sqrt(mse)
rmse