-
Notifications
You must be signed in to change notification settings - Fork 1
/
Dplyr Tutorial.R
236 lines (168 loc) · 6.88 KB
/
Dplyr Tutorial.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
library(gapminder)
library(dplyr)
library(magrittr)
library(ggplot2)
# Piping: the motivation ----
# A small numeric vector to experiment with.
x <- c(0.109, 0.359, 0.63, 0.996, 0.515, 0.142, 0.017, 0.829, 0.907)

# Option 1: apply several functions to the vector one at a time, storing
# every intermediate result in its own object.
# Natural log of each element.
y <- log(x)
# Differences between consecutive logs.
z <- diff(y)
# Back-transform: the ratio of each element to the one before it.
a <- exp(z)
# Round to one decimal place.
b <- round(a, digits = 1)

# Option 2: nest the calls, which have to be read from the inside out.
round(exp(diff(log(x))), digits = 1)
# Nested calls are hard to interpret for someone following your code, and
# they are also hard to debug.
# A pipe is more straightforward: the steps are written in the order in which
# they are applied.
x %>%
  log() %>%
  diff() %>%
  exp() %>%
  round(digits = 1)

# %>% is the standard pipe, but magrittr has other pipes for other uses.
# For example, the chain below prints its result but does not assign the
# resulting values to anything, so x is left unchanged.
x %>%
  log() %>%
  diff() %>%
  exp() %>%
  round(digits = 1)

# The compound assignment operator %<>% runs the same chain and reassigns the
# new values to x.
x %<>%
  log() %>%
  diff() %>%
  exp() %>%
  round(digits = 1)
x
# The tee pipe %T>% is for branching operations: run one step for its side
# effect (here, drawing a plot) while the original value keeps flowing on.
#
# With only the regular pipe it is not possible to plot the matrix AND get
# its column sums: plot() returns NULL, so colSums() receives NULL instead of
# the matrix and raises an error after the plot has been drawn.
# The failure is deliberate; try() reports it without aborting the script when
# the file is run non-interactively (e.g. with source()).
try(
  rnorm(200) %>%
    matrix(ncol = 2) %>%
    plot() %>%
    colSums()
)

# %T>% hands its left-hand side (the matrix) to the following step instead of
# the return value of plot(), so the plot is drawn and colSums() still gets
# the matrix.
rnorm(200) %>%
  matrix(ncol = 2) %T>%
  plot() %>%
  colSums()
# Exposition pipe operator %$%
#
# The first chain does not work: %>% passes the whole data frame as the first
# argument of cor(), while Sepal.Length and Sepal.Width are looked up as
# ordinary variables rather than as columns of iris, so they are not found.
# The failure is deliberate; try() reports it without aborting the script when
# the file is run non-interactively (e.g. with source()).
try(
  iris %>%
    subset(Sepal.Length > mean(Sepal.Length)) %>%
    cor(Sepal.Length, Sepal.Width)
)

# To reference columns from inside the piped object, use the exposition pipe
# %$%: it makes the names in its left-hand side available to the expression
# on its right-hand side.
iris %>%
  subset(Sepal.Length > mean(Sepal.Length)) %$%
  cor(Sepal.Length, Sepal.Width)
# From here on only the main pipe operator %>% is used, to look at the
# functionality of dplyr.
# Functions covered: select, filter, group_by, summarize, count, mutate.

# Print the full dataset before subsetting it.
gapminder

# Subset gapminder to include only year, country and GDP per capita.
# select() can do this on its own ...
year_country_gdp <- select(gapminder, year, country, gdpPercap)
year_country_gdp

# ... or as a step in a pipe.
year_country_gdp <- gapminder %>%
  select(year, country, gdpPercap)
year_country_gdp

# Piping becomes more clearly useful once steps are chained: keep only the
# European countries first, then the same three columns.
year_country_gdp_euro <- gapminder %>%
  filter(continent == "Europe") %>%
  select(year, country, gdpPercap)
# group_by ----
# Inspect the structure of the ungrouped data: the class reported by str()
# includes 'data.frame'.
str(gapminder)

# After group_by() the class reported by str() includes 'grouped_df'.
gapminder %>%
  group_by(continent) %>%
  str()

# A useful mental model: a grouped_df behaves like a collection of separate
# data frames, each containing only the rows that share one value of the
# grouping variable.
# Show fig. 1
# summarize ----
# This is the first place where a new variable is created: mean_gdpPercap is
# the new column stored in the gdp_bycontinents data frame, with one row per
# continent.
gdp_bycontinents <- gapminder %>%
  group_by(continent) %>%
  summarize(mean_gdpPercap = mean(gdpPercap))
# Show Fig 2

# Grouping by multiple variables gives one row per continent/year combination.
# .groups = "drop" returns an ungrouped result; by default summarize() removes
# only the last grouping variable, so the output would silently stay grouped
# by continent (and dplyr prints a message about it).
gdp_bycontinents_byyear <- gapminder %>%
  group_by(continent, year) %>%
  summarize(mean_gdpPercap = mean(gdpPercap), .groups = "drop")
# Think about fig 1/2 in this multi-variable context

# Several summary statistics can be created in the same summarize() call,
# each becoming its own column.
gdp_pop_bycontinents_byyear <- gapminder %>%
  group_by(continent, year) %>%
  summarize(
    mean_gdpPercap = mean(gdpPercap),
    sd_gdpPercap = sd(gdpPercap),
    mean_pop = mean(pop),
    sd_pop = sd(pop),
    .groups = "drop"
  )
# count ----
# count() is useful for finding out how many observations in a data frame
# fall into each category, optionally after restricting them to a condition.
# Here: the number of observations per continent in 2002, with the largest
# counts listed first.
gapminder %>%
  filter(year == 2002) %>%
  count(continent, sort = TRUE)
# Notice that count() reports its tally in a column called n.
# The same group size is available inside summarize() through n(), so it can
# be used in the very pipe where the summary is created.
# Standard error of life expectancy for each continent; se_le is the name
# given to the new standard-error variable.
gapminder %>%
  group_by(continent) %>%
  summarize(se_le = sd(lifeExp) / sqrt(n()))

# Several summary statistics, including one based on n(), chained together in
# a single summarize() call.
gapminder %>%
  group_by(continent) %>%
  summarize(
    mean_le = mean(lifeExp),
    min_le = min(lifeExp),
    max_le = max(lifeExp),
    se_le = sd(lifeExp) / sqrt(n())
  )
# mutate ----
# mutate() creates new variables, which can then be summarized further down
# the same pipe.
gdp_pop_bycontinents_byyear <- gapminder %>%
  # New column: total GDP in billions (GDP per capita times population).
  mutate(gdp_billion = gdpPercap * pop / 10^9) %>%
  group_by(continent, year) %>%
  # Summarize the original columns together with the newly created one.
  # .groups = "drop" returns an ungrouped result; by default the output would
  # stay grouped by continent.
  summarize(
    mean_gdpPercap = mean(gdpPercap),
    sd_gdpPercap = sd(gdpPercap),
    mean_pop = mean(pop),
    sd_pop = sd(pop),
    mean_gdp_billion = mean(gdp_billion),
    sd_gdp_billion = sd(gdp_billion),
    .groups = "drop"
  )
# mutate combined with ifelse ----
# Keep every row of the data but "filter" a single variable on a condition:
# total GDP (in billions) is calculated only for observations whose life
# expectancy is above 25; all other observations get NA.
gdp_pop_bycontinents_byyear_above25 <- gapminder %>%
  mutate(gdp_billion = ifelse(lifeExp > 25, gdpPercap * pop / 10^9, NA)) %>%
  group_by(continent, year) %>%
  # na.rm = TRUE is required for the gdp_billion summaries: without it, a
  # single NA introduced above would turn the whole group's mean and sd into
  # NA instead of computing them over the observations that were kept.
  # .groups = "drop" returns an ungrouped result; by default the output would
  # stay grouped by continent.
  summarize(
    mean_gdpPercap = mean(gdpPercap),
    sd_gdpPercap = sd(gdpPercap),
    mean_pop = mean(pop),
    sd_pop = sd(pop),
    mean_gdp_billion = mean(gdp_billion, na.rm = TRUE),
    sd_gdp_billion = sd(gdp_billion, na.rm = TRUE),
    .groups = "drop"
  )
# dplyr and ggplot2 ----
# dplyr can be linked to ggplot. First, the version with intermediate steps.
# First letter of each country name.
starts.with <- substr(gapminder$country, 1, 1)
# Rows for the countries whose name starts with "A" or "Z".
az.countries <- gapminder[starts.with %in% c("A", "Z"), ]
# Life expectancy over time, coloured by continent, one panel per country.
ggplot(az.countries, aes(x = year, y = lifeExp, color = continent)) +
  geom_line() +
  facet_wrap(~country)
# The same plot built in one dplyr pipeline, with no intermediate objects.
# The piped steps run first and their result becomes the data argument of
# ggplot(); the plot layers are then added with +.
gapminder %>%
  # First letter of each country name, stored in a helper column.
  mutate(startsWith = substr(country, 1, 1)) %>%
  # Keep the countries whose name starts with "A" or "Z".
  filter(startsWith %in% c("A", "Z")) %>%
  # Life expectancy over time, coloured by continent, one panel per country.
  ggplot(aes(x = year, y = lifeExp, color = continent)) +
  geom_line() +
  facet_wrap(~country)
# One more step can be removed by computing the first letter directly inside
# filter(), so no helper column is needed.
gapminder %>%
  # Keep the countries whose name starts with "A" or "Z".
  filter(substr(country, 1, 1) %in% c("A", "Z")) %>%
  # Life expectancy over time, coloured by continent, one panel per country.
  ggplot(aes(x = year, y = lifeExp, color = continent)) +
  geom_line() +
  facet_wrap(~country)