-
Notifications
You must be signed in to change notification settings - Fork 0
/
Multiple linear regression: par(mfrow=c(2,2))
120 lines (93 loc) · 5.12 KB
/
Multiple linear regression: par(mfrow=c(2,2))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
library(MASS)
library(ISLR)
### Simple linear regression
names(Boston)
?Boston
plot(medv~lstat,Boston)
fit1=lm(medv~lstat,data=Boston)
fit1
summary(fit1)
abline(fit1,col="red")
names(fit1)
confint(fit1)
predict(fit1,data.frame(lstat=c(5,10,15)),interval="confidence") #Provide CIs
### Multiple linear regression
fit2=lm(medv~lstat+age,data=Boston) # R^2 = percentage of variance explained
# F-stat = f stat that will obtain if dropped out both of the predictors
summary(fit2)
fit3=lm(medv~.,Boston) # ~. = use all other variables
summary(fit3)
par(mfrow=c(2,2))
plot(fit3) # Non-linarity observed in R. vs. F. plot
fit4=update(fit3,~.-age-indus) # get rid of age and indus (less significant)
summary(fit4)
plot(fit4)
# Residuals vs. Fitted = fitted value vs. distance of each point to regression line.
# The plot is used to detect non-linearity, unequal error variances, and outliers.
# the residual = 0 line corresponds to the estimated regression line.
# Here are the characteristics of a well-behaved residual vs. fits plot and what they suggest
# about the appropriateness of the simple linear regression model:
# The residuals "bounce randomly" around the 0 line. This suggests that the assumption that the relationship is
# linear is reasonable.
# The residuals roughly form a "horizontal band" around the 0 line. This suggests that the variances of the error terms
# are equal.
# No one residual "stands out" from the basic random pattern of residuals. This suggests that there are no outliers.
# Source = https://onlinecourses.science.psu.edu/stat501/node/36
# Q(uantile) - Q(uantile) plot = graphical tool to help us assess if a set of data plausibly came
# from some theoretical distribution such as a Normal or exponential
# A Q-Q plot is a scatterplot created by plotting two sets of quantiles against one another.
# If both sets of quantiles came from the same distribution, we should see the points forming a line that’s roughly
# straight.
# i.e. if want to test if data is coming from a normal dist., compare quantiles from theorical normal dist.
# and compare them to the quantiles from your dist. to see if correspond.
#Q-Q plots take your sample data, sort it in ascending order, and then plot them versus quantiles calculated
# from a theoretical distribution. The number of quantiles is selected to match the size of your sample data.
# Source: http://data.library.virginia.edu/understanding-q-q-plots/
# Scale-Location (also called Spread-Location plot.)
# This plot shows if residuals are spread equally along the ranges of predictors.
# This is how you can check the assumption of equal variance (homoscedasticity).
# It’s good if you see a horizontal line with equally (randomly) spread points.
# Residuals vs. Levarage
# This plot helps us to find influential cases (i.e., subjects) if any.
# Not all outliers are influential in linear regression analysis (whatever outliers mean).
# We watch out for outlying values at the upper right corner or at the lower right corner.
# Those spots are the places where cases can be influential against a regression line.
# Look for cases outside of a dashed line, Cook’s distance.
# Source: http://data.library.virginia.edu/diagnostic-plots/
### Nonlinear terms and Interactions
fit5=lm(medv~lstat*age,Boston) # interaction between age and lstat
summary(fit5)
fit6=lm(medv~lstat +I(lstat^2),Boston); summary(fit6) # Add quadratic term
# I(x): Change the class of an object to indicate that it should be treated ‘as is’.
# used to inhibit the interpretation of operators such as "+", "-", "*" and "^" as formula operators,
# so they are used as arithmetical operators
attach(Boston) # Names variables in Boston are available in our data space
par(mfrow=c(1,1))
plot(medv~lstat)
points(lstat,fitted(fit6),col="red",pch=20) # add fitted points (from quadratic model 6)
fit7=lm(medv~poly(lstat,4)) # Easier way of fitting polynomials (here of degree four)
points(lstat,fitted(fit7),col="blue",pch=20) # Polynomial is a bit too wiggly (overfitting the data a little bit)
plot(1:20,1:20,pch=1:20,cex=2) # list of pch ; cex = 2 means enlarged characters by two
###Qualitative predictors
# carseats = another dataset
fix(Carseats) # fix = way of throwing up an editor in R
names(Carseats)
summary(Carseats)
fit1=lm(Sales~.+Income:Advertising+Age:Price,Carseats) #Add interactions between income and ad/ between age and price
summary(fit1) # Income/ad significant but not age/price
contrasts(Carseats$ShelveLoc) # shows how R will code qualitative variable ShelveLoc when put in linear model
# in this case, 3 level factor i.e put in two dummy variables.
###Writing R functions (to fit a regression model and make a plot)
regplot=function(x,y){ # regplot is function of x and y (arguments)
fit=lm(y~x) # fit linear model between x and y
plot(x,y) # plot xy
abline(fit,col="red") # add red regression line
}
attach(Carseats)
regplot(Price,Sales)
regplot=function(x,y,...){ # ... = unnamed arguments. You are allowed to add extra arguments.
fit=lm(y~x)
plot(x,y,...)
abline(fit,col="red")
}
regplot(Price,Sales,xlab="Price",ylab="Sales",col="blue",pch=20)