-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path.Rhistory
237 lines (237 loc) · 8.52 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# ---- Session: correlation test on the pseudo-Facebook data ----
# This is recorded console history; failed attempts are kept as typed.

# Fetch a web page as a character vector of lines and print it.
z = readLines('http://www.google.com/search?q=introduction+to+r')
z
ls()
# Read the tab-delimited data set (path is relative to the working directory)
# and restore a previously saved workspace.
pf = read.delim("pseudo_facebook.tsv")
load("~/Documents/Learning/Online/Udacity/Exploratory/Lecture4.RData")
?with
# Searching for a correlation test. `corr.test` and `corr` are not provided by
# base R or ggplot2; the final `cor.test()` call is the stats-package function.
?corr.test
??corr.test
corr.test(pf$www_likes_received, pf$likes_received)
# The package is named "ggplot2", so the first library() call is a misspelling.
library(ggplot)
library("ggplot2")
corr.test(pf$www_likes_received, pf$likes_received)
corr(pf$www_likes_received, pf$likes_received)
cor.test(pf$www_likes_received, pf$likes_received)
# `install.package` is a misspelling of `install.packages`.
install.package('alr3')
install.packages('alr3')
library(alr3)
data(Mitchell)
?Mitchell
save.image("~/Documents/Learning/Online/Udacity/Exploratory/Lecture4.1.RData")
# NOTE(review): `clear()` is not a base R function - presumably an attempt to
# clear the console or workspace; confirm before reusing.
clear()
# ---- Session: scatterplots and correlations on ggplot2's diamonds data ----
# This is recorded console history; failed attempts are kept as typed.

install.packages(c("BH", "boot", "car", "cluster", "codetools", "dplyr", "KernSmooth", "labeling", "Lahman", "MASS", "mgcv", "nlme", "Rcpp"))
library(ggplot2)
data(diamonds)
summary(diamonds)
# price against the x column: line geom, then no geom at all, then points.
ggplot(data=diamonds, aes(x=x, y=price)) + geom_line()
ggplot(data=diamonds, aes(x=x, y=price))
ggplot(data=diamonds, aes(x=x, y=price)) + geom_point()
# `corr` is a misspelling here; `cor()` computes the correlation of price with
# each of the x, y and z columns.
corr(diamonds$price, diamonds$x)
cor(diamonds$price, diamonds$x)
cor(diamonds$price, diamonds$y)
cor(diamonds$price, diamonds$z)
# price against depth, then with 1/100 point transparency and custom x breaks.
ggplot(data=diamonds, aes(x=depth, y=price)) + geom_point()
# `scale_x_continously` is a misspelling of `scale_x_continuous`.
ggplot(data = diamonds, aes(x = depth, y = price)) +
geom_point(alpha=1/100) + scale_x_continously(breaks=2)
# breaks=2 places a single break at x = 2, not a break every 2 units.
ggplot(data = diamonds, aes(x = depth, y = price)) +
geom_point(alpha=1/100) + scale_x_continuous(breaks=2)
help(scale_x_continuous)
ggplot(data = diamonds, aes(x = depth, y = price)) +
geom_point(alpha=1/100) + scale_x_continuous(breaks=2:10)
# `deptch` and `detch` are misspellings of the `depth` column.
summary(diamonds$deptch)
summary(diamonds$detch)
summary(diamonds$depth)
ggplot(data = diamonds, aes(x = depth, y = price)) +
geom_point(alpha=1/100) + scale_x_continuous(breaks=2:80)
# Experiments towards a vector of even break positions. `:` binds tighter than
# `*`, so 2*1:40 is 2 * (1:40), i.e. 2, 4, ..., 80.
c(2,80,2)
2:80
2:80:2
ggplot(data = diamonds, aes(x = depth, y = price)) +
geom_point(alpha=1/100) + scale_x_continuous(breaks=2*2:10)
2*1:80
2*1:40
ggplot(data = diamonds, aes(x = depth, y = price)) +
geom_point(alpha=1/100) + scale_x_continuous(breaks=2*1:40)
cor(diamonds$deptch, diamonds$price)
cor(diamonds$depth, diamonds$price)
# price against carat (the x breaks are carried over from the depth plots).
ggplot(data = diamonds, aes(x = carat, y = price)) +
geom_point(alpha=1/100) + scale_x_continuous(breaks=2*1:40)
# Attempts to keep only rows priced below the 99th percentile. The first two
# use `d` before it has been assigned in this history; the next ones omit the
# trailing comma needed for row subsetting, or use a bare `price` that is not
# qualified with `diamonds$`. The last form below is the working one.
ggplot(data = d[diamonds$price < quantile(diamonds$price, 0.99)], aes(x = carat, y = price)) +
geom_point(alpha=1/100) + scale_x_continuous(breaks=2*1:40)
d[diamonds$price < quantile(diamonds$price, 0.99)]
d = diamonds[diamonds$price < quantile(diamonds$price, 0.99)]
d = diamonds[price < quantile(diamonds$price, 0.99)]
d = diamonds[price < quantile(diamonds$price, 0.99), ]
d = diamonds[diamonds$price < quantile(diamonds$price, 0.99), ]
d
ggplot(data = d, aes(x = carat, y = price)) +
geom_point()
d = diamonds[diamonds$price < quantile(diamonds$price, 0.99), ]
# Add a volume column as the product of the x, y and z columns.
diamonds$volume = diamonds$x * diamonds$y * diamonds$z
# `dimonds` and `volue` are misspellings; the third call is the corrected one.
ggplot(data=dimonds, aes(x= volue, y = price)) + geom_point()
ggplot(data=diamonds, aes(x= volue, y = price)) + geom_point()
ggplot(data=diamonds, aes(x= volume, y = price)) + geom_point()
# Keep rows with 0 < volume < 800, then correlate volume with price.
d = diamonds[diamonds$volume > 0, ]
d = d[d$volume < 800, ]
cor(d$volue, d$price)
cor(d$volume, d$price)
require(ggplot2)
# Transparent points overlaid with a linear-model smoother (y ~ x).
ggplot(data=d, aes(x= volume, y = price)) + geom_point(alpha=1/100) + geom_smooth(method="lm", formula = y~x, size = 1)
summary(d)
ggplot(data=d, aes(x= volume, y = price)) + geom_point(alpha=1/100) + geom_smooth(method="lm", formula = y~x, size = 1)
# ---- Session: mean diamond price by clarity and by color (dplyr) ----
# This is recorded console history; failed attempts are kept as typed.

suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
data(diamonds)
head(diamonds)
# NOTE(review): select() picks existing columns; mean_price, median_price,
# min_price, max_price and n are presumably intended summarise() outputs
# rather than columns of diamonds - confirm.
diamondsByClarity = select(diamonds, mean_price, median_price, min_price, max_price,n)
group_by(diamonds, clarity)
suppressMessages(library(ggplot2))
library(ggplot2)
# NOTE(review): `package` and `package.load` are not base R names - these look
# like guesses while trying to get ggplot2 loaded.
package
package.load(ggplot2)
install.packages("ggplot2")
install.packages("dplyr")
data(diamonds)
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
data(diamonds)
library(dplyr)
# The unquoted form passes an object named dplyr instead of the package name
# as a string; the quoted form is the usual call.
install.packages(dplyr)
install.packages("dplyr")
install.packages(dplyr)
library(dplyr)
# Typing `library` without parentheses evaluates the function object itself
# rather than calling it.
library
library(ggplot2)
install.packages("ggplot2")
library(ggplot2)
# One row per clarity level / color level, holding the mean price.
diamonds_by_clarity <- group_by(diamonds, clarity)
diamonds_mp_by_clarity <- summarise(diamonds_by_clarity, mean_price = mean(price))
diamonds_by_color <- group_by(diamonds, color)
diamonds_mp_by_color <- summarise(diamonds_by_color, mean_price = mean(price))
# NOTE(review): geom_bar() is given a y aesthetic with its default stat; with
# pre-summarised data this usually needs stat = "identity" - confirm against
# the installed ggplot2 version.
p1 = ggplot(diamonds_mp_by_clarity, aes(x = clarity , y = mean_price)) + geom_bar()
p2 = ggplot(diamonds_mp_by_color, aes(x = color, y = mean_price)) + geom_bar()
# "gridExtras" is a misspelling; the package loaded below is "gridExtra".
install.packages("gridExtras")
install.packages("gridExtras")
install.packages("gridExtra")
library(gridExtra)
# Draw both plots together.
grid.arrange(p1, p2)
p1 = ggplot(diamonds_mp_by_clarity, aes(x = clarity , y = mean_price)) + geom_bar()
p1
head(diamonds_mp_by_clarity)
# ---- Session: load 2008.csv and probe the DepTime column ----
# This is recorded console history; failed attempts are kept as typed.

setwd("~/Documents/Learning/Online/Udacity/Nanodegree/Visualization")
data = read.csv("data/original/2008.csv")
# data["col"] is a one-column data frame, so in [i, j] only j = 1 is valid;
# the [1,10], [1,2] and [1,3] probes below ask for columns that do not exist.
data['Time'][1,10]
names(data)
data["DepTime"][1,10]
data["DepTime"]
data["DepTime"][1]
data["DepTime"][1,1]
data["DepTime"][1,2]
data["DepTime"][1,3]
# Walk down the first few rows of DepTime.
data["DepTime"][2,1]
data["DepTime"][3,1]
data["DepTime"][4,1]
data["DepTime"][5,1]
# Sample four-digit and three-digit values for substr() experiments.
three = 1829
four = 1892
three = 129
# Goal: drop the last two characters. The stop position must be derived from
# nchar(); `four-2` uses the numeric value instead of the string length, and
# the call with an empty third argument is incomplete.
substr(three, 0, 1)
substr(four, 0, )
substr(four, 0, 2)
substr(four, 0, four-2)
substr(four, 0, nchar(four)-2)
substr(three, 0, nchar(three)-2)
# Drop the last two characters of each value and return what is left.
#
# Used on DepTime-style clock values (presumably HHMM, so the remainder is the
# hour - confirm against the data dictionary): 1233 -> "12", 123 -> "1".
#
# x: a vector of values, numeric or character; substr() coerces non-character
#    input to character.
# Returns: a character vector the same length as x. Values with fewer than
#    three characters yield "" because the stop position falls below 1.
hour <- function(x) {
  # The stop position is computed per element, so inputs of mixed width work.
  substr(x, 1L, nchar(x) - 2L)
}
# ---- Session: derive Time/DateTime keys and a count-weighted mean delay ----
# This is recorded console history; failed attempts are kept as typed.

# Try the helper on a three-digit and a four-digit value.
hour(123)
hour(1233)
# data["DepTime"] is a one-column data frame, so lapply() calls hour() once on
# the whole column; the result is stored as the new Time column.
data["Time"] = lapply(data["DepTime"], hour)
# `data.Time` is read by R as a single object name, not as column access; the
# following lines use data["Time"] instead.
data.Time[1,1]
data["Time"][1,1]
data["Time"][2,1]
data["Time"][3,1]
data["Time"][4,1]
names(data)
# The first mutate() call is split across two lines and comes before the
# library(dplyr) call recorded in this part of the history.
mutate(data, DateTime = paste(Year, Month, DayofMonth, Time, sep="-")
)
library(dplyr)
# mutate() returns a new data frame; the column only persists once the result
# is assigned back to `data`.
mutate(data, DateTime = paste(Year, Month, DayofMonth, Time, sep="-"))
names(data)
data = mutate(data, DateTime = paste(Year, Month, DayofMonth, Time, sep="-"))
names(data)
grouped = group_by(data, DateTime)
# Summarising the ungrouped `data` collapses everything into one overall
# summary; the per-DateTime rows come from summarising `grouped` further down.
summed = summarise(data, count=n(), DepDelay=mean(DepDelay))
names(grouped)
names(summed)
summed = summarise(data, count=n(), DepDelay=mean(DepDelay), DateTime=DateTime)
summed = summarise(data, count=n(), DepDelay=mean(DepDelay), DateTime=toString(DateTime))
names(summed)
summed
summed[1]
summed[2]
summed = summarise(data, count=n(), DepDelay=mean(DepDelay))
summed = summarise(grouped, count=n(), DepDelay=mean(DepDelay))
summed = summarise(grouped, count=n(), DepDelay=mean(DepDelay), DateTime=toString(DateTime))
summed = summarise(grouped, count=n(), DepDelay=mean(DepDelay))
summed = summarise(grouped, count=n(), DepDelay=mean(DepDelay), DateTime=toString(DateTime))
summed = summarise(grouped, count=n(), DepDelay=mean(DepDelay))
names(summed)
summed[1]
summed[2]
summary(summed)
# W = count * mean delay per group, so sum(W) / sum(count) is the
# count-weighted mean of the per-group mean delays.
summed["W"] = summed["count"] * summed["DepDelay"]
sum(summed["W"])/sum(summed["count"])
summed["W"][1]
summed["W"][3,1]
summed["W"][4,1]
summed["count"][4,1]
sum(summed["W"])
# mean() without na.rm = TRUE propagates NA. `True` is not R's logical
# constant; the next line uses TRUE.
summed = summarise(grouped, count=n(), DepDelay=mean(DepDelay, na.rm=True))
summed = summarise(grouped, count=n(), DepDelay=mean(DepDelay, na.rm=TRUE))
summed["W"] = summed["count"] * summed["DepDelay"]
sum(summed["W"])
summed["W"]
sum(summed["count"])
summed["W"]
# n() takes no arguments, so the na.rm attempt is dropped on the next line.
summed = summarise(grouped, count=n(na.rm=TRUE), DepDelay=mean(DepDelay, na.rm=TRUE))
summed = summarise(grouped, count=n(), DepDelay=mean(DepDelay, na.rm=TRUE))
class(summed)
class(summed["W"])
# Counting missing values in W. is.na() and is.nan() are element-wise, while
# is.null() returns one TRUE/FALSE for the whole object, so it cannot count
# or select individual entries.
sum( is.na(summed["W"]))
sum( !is.na(summed["W"]))
is.na(summed["W"])
sum(is.na(summed$W))
sum(is.null(summed$W))
sum(is.nan(summed$W))
sum(is.na(summed$W))
summed[is.null(summed$W)]
names(summed)
# A bare W refers to an object named W, not to the column; the quoted form
# selects the column.
summed[W]
summed["W"]
# Rebuild W with `$`, which yields plain vectors, then sum while skipping NAs.
summed$W = summed$count * summed$DepDelay
sum(summed$W)
sum(summed$W, na.rm=TRUE)
sum(summed$W, na.rm=TRUE) / sum(summed$count, na.rm=TRUE)
# The first call passes the path as the object to write; the second passes the
# data first and the output file second.
write.csv("data/2008-DateTime.csv")
write.csv(summed, "data/2008-DateTime.csv")
# ---- Session: flight count and mean departure delay per Time value ----
# This is recorded console history; failed attempts are kept as typed.

# One row per distinct Time value: number of rows and mean DepDelay (NAs
# skipped).
summedByHour = summarise(group_by(data, Time), count=n(), DepDelay=mean(DepDelay, na.rm=TRUE))
library(ggplot2)
# `Count` is the wrong case; the column created above is `count`.
ggplot(summedByHour, aes(x=log(Count), y=DepDelay)) + geom_point()
# Delay against count under different scalings of the count axis.
ggplot(summedByHour, aes(x=log(count), y=DepDelay)) + geom_point()
ggplot(summedByHour, aes(x=log2(count), y=DepDelay)) + geom_point()
ggplot(summedByHour, aes(x=log10(count), y=DepDelay)) + geom_point()
ggplot(summedByHour, aes(x=count, y=DepDelay)) + geom_point()
# Count per Time value, with point size mapped to mean delay.
ggplot(summedByHour, aes(x=Time, y=count, size=DepDelay)) + geom_point()
summary(summedByHour$Time)
summedByHour$Time
# Looking for unusable Time values. NOTE(review): `is.empty` is not a base R
# function - confirm where it was expected to come from.
sum(is.null(summedByHour$Time))
sum(is.empty(summedByHour$Time))
sum(is.na(summedByHour$Time))
# Drop rows whose Time is NA, then rows whose Time is the empty string.
summedByHour = summedByHour[complete.cases(summedByHour$Time),]
summedByHour = summedByHour[summedByHour$Time != "",]
summedByHour
ggplot(summedByHour, aes(x=Time, y=count, size=DepDelay)) + geom_point()
# Convert Time to numeric for the x axis in the remaining plots.
ggplot(summedByHour, aes(x=as.numeric(Time), y=count, size=DepDelay)) + geom_point()
ggplot(summedByHour, aes(x=as.numeric(Time), y=log10(count), size=DepDelay)) + geom_point()
ggplot(summedByHour, aes(x=as.numeric(DepDelay), y=log10(count))) + geom_point()