-
Notifications
You must be signed in to change notification settings - Fork 0
/
summarising_viz_expends.R
518 lines (401 loc) · 18.9 KB
/
summarising_viz_expends.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
#######
### Summarising Visitation and Expenditures by MAR, country, and MPA
###
###
### Requires:
### Total visitors to the AOI in each country to be distributed out based on PUD and TUD
### (proportioned_viz_2015.csv from downscaling.r)
### Outputs from preparing_socmed.R
### Table of "short" MPA names
###
### Creates:
### aoi_viz_exp.shp
### country_summary.csv
### mar_summary.csv
### mpa_summary.csv
### plots of mpa estimates by country
library(sf)
library(tidyverse)
setwd("~/Documents/MAR/ModelRuns/baseline_20200715/")
# read in total visitation numbers by country (proportioned from overall country numbers into the AOI in
# `downscaling.R`)
# NOTE: decided 7/15/20 not to bother rerunning downscaling with the AOIv4. The country boundary changes are too minimal to have an effect I think
viz_2017 <- read_csv("~/Documents/MAR/Data/proportioned_viz_2017_AOIv3.csv")
# also, average expenditures (for now, rough numbers). Check out the google sheet 'expenditures' for more options
# Note that Honduras is from 2016, others are all 2017
# Average expenditures are averaged between day trippers and overnight guests
avg_exp <- tibble(Country = c("Mexico", "Honduras", "Guatemala", "Belize"),
AvgExp = c(130, 60, 81, 148),
AvgExpTrip = c(809, 572, 614, 854))
# Read in AOI and smud info
aoi_smud <- read_sf("aoi_smud.shp")
# Read in list of short MPA names
mpas <- read_csv("listofMPAs_v2.csv")
## First, let's summarise by country (since we're almost already there)
country_summaries <- viz_2017 %>%
left_join(avg_exp, by = "Country") %>%
mutate(Expenditures = vizinAOI*AvgExp) %>%
dplyr::select(Country, Year, vizinAOI, AvgExp, Expenditures)
country_summaries
#write_csv(country_summaries, "viz_and_expends/country_summaries.csv")
## And, for the entire AOI?
mar_summary <- country_summaries %>%
summarise(Visitors = sum(vizinAOI),
Expenditures = sum(Expenditures),
Year = mean(Year))
mar_summary
#write_csv(mar_summary, "viz_and_expends/mar_summary.csv")
############# And... how about by those MPAs? ###################
### As of 10/9/19, this should be much easier since my new AOI is intersected with the MPA boundaries
### See below for old code (used for August workshop summaries), that calculated the proportion
### of each hex that fell inside the MPA and then proportioned the viz accordingly.
### The new AOI is an improvement since it shouldn't include adjacent cities in MPAs
aoi_smud <- aoi_smud %>%
left_join(mpas %>% select(-Country), by = c("name_2" = "NAME"))
mpa_smud <- aoi_smud %>%
group_by(Name_short, Country = CNTRY_N) %>%
summarise(smud_prop = sum(smd_prp),
tud_prop = sum(tud_prp),
pud_prop = sum(pud_prp)) %>%
filter(!is.na(Name_short))
ggplot(mpa_smud) + geom_sf(aes(fill = smud_prop))
# great. So now... how to get from smud to number of visitors and expenditures?
## Need to find a SMUD to visitor ratio for each country
## Note: this is still necessary, since my proportions above represent the proportion of ALL tourism to the
## MAR region which is represented by that cell. But, I want to proportion out the COUNTRY-level tourism
## numbers
# So, summarise by country
countries_smud <- aoi_smud %>%
group_by(Country = CNTRY_N) %>%
summarise(smud_prop = sum(smd_prp),
tud_prop = sum(tud_prp),
pud_prop = sum(pud_prp)) %>%
filter(!is.na(Country))
countries_smud
# let's put it together to plot
countries_ud_emp <- countries_smud %>%
left_join(country_summaries, by = "Country") %>%
gather(key = "socmed", value = "prop_ud", c(smud_prop, tud_prop, pud_prop))
# and, calculate the vis to prop_ud ratio for each
countries_ratios <- countries_ud_emp %>%
st_set_geometry(NULL) %>%
mutate(ud_to_vis = vizinAOI/prop_ud) %>% #,
# socmed = str_extract(socmed, "(?<=_)[:alnum:]*$")) %>%
dplyr::select(Country, AvgExp, socmed, ud_to_vis)
countries_ratios
# Oof. Those are VERY different by country
##### creating a shapefile that includes estimated visitation per grid cell using those viz ratios ####
## First, just using smud_prop
vis_per_cell <- aoi_smud %>%
select(pid, smud_prop = smd_prp, Country = CNTRY_N, MPA = name_2, MPA_short = Name_short, geometry) %>%
left_join(countries_ratios %>% filter(socmed == "smud_prop"), by = "Country") %>%
mutate(est_vis = smud_prop*ud_to_vis,
est_exp = round(est_vis*AvgExp))
ggplot(vis_per_cell) + geom_sf(aes(fill = log1p(est_vis)))
## write it out
#write_sf(vis_per_cell, "aoi_viz_exp.shp")
## Let's also do this with tud and pud seperately to see if one of them better matches empirical numbers
vis_per_cell_sep <- aoi_smud %>%
select(pid, smud_prop = smd_prp, tud_prop = tud_prp, pud_prop = pud_prp, Country = CNTRY_N, MPA = name_2, MPA_short = Name_short, geometry) %>%
gather(key = "socmed", value = "prop", tud_prop, pud_prop, smud_prop) %>%
left_join(countries_ratios, by = c("Country", "socmed")) %>%
mutate(est_vis = prop*ud_to_vis,
est_exp = round(est_vis*AvgExp))
## so... let's calculate visitors and expenditures by mpa
mpa_summaries <- vis_per_cell_sep %>%
filter(!is.na(MPA), !is.na(Country)) %>%
group_by(Country, MPA, MPA_short, socmed) %>%
summarise(prop = sum(prop),
visitors = sum(est_vis),
expenditures = sum(est_exp))
mpa_summaries
# write it out
#write_csv(mpa_summaries, "viz_and_expends/mpa_summaries.csv")
# let's write out a cleaner version too, which only includes the smud_prop
mpa_summaries_clean <- mpa_summaries %>%
st_drop_geometry() %>%
filter(socmed == "smud_prop") %>%
select(-prop, -socmed) %>%
mutate(visitors = round(visitors))
mpa_summaries_clean
#write_csv(mpa_summaries_clean, "viz_and_expends/mpa_summaries_clean.csv")
##### Let's pull in whatever empirical numbers I have for MPA visitors and see how my estimates compare
MPA_emp_viz <- read_csv("~/Documents/MAR/Data/MPA_Emp_Visitation_10_09_19.csv")
# I have multiple estimates/years for Hol Chan and Cayos Cochinos and South Water Caye. Let's drop some
MPA_emp_viz_sub <- MPA_emp_viz %>%
filter(!(MPA == "Hol Chan" & Year == 2016),
!(MPA_short == "Cayos Cochinos" & Year == 2016),
!(MPA_short == "South Water Caye" & Visitation == 4600))
# bind on to my estimates to plot
mpa_comps <- mpa_summaries %>%
rename(est_visitors = visitors) %>%
left_join(MPA_emp_viz_sub)
ggplot(mpa_comps %>% filter(socmed == "smud_prop")) +
geom_label(aes(x = Visitation, y = est_visitors, label = MPA_short)) +
# geom_point(aes(x = Visitation, y = est_visitors), col = "blue") +
geom_abline(slope = 1) +
ylab("Estimated Annual Visitors (from social media)") +
xlab("Annual Visitors (from in-country sources)")# +
#facet_grid(rows = vars(socmed))
# write it out
#ggsave("figs/mpas_smud_v_emp_scatterplot.png", width = 8, height = 6, units = "in")
# slightly cleaner, with and without Hol Chan
ggplot(mpa_comps %>% filter(socmed == "smud_prop"))+#, MPA_short != "Hol Chan")) +
#geom_label(aes(x = Visitation, y = est_visitors, label = MPA_short)) +
geom_point(aes(y = Visitation, x = est_visitors, col = Country)) +
geom_abline(slope = 1) +
xlim(c(0, 40000)) +
xlab("Estimated Annual Visitors (from social media)") +
ylab("Annual Visitors (from in-country sources)") +
labs(title = "MPA Visitation - Survey-based vs. social media-based estimates") +
theme_classic()
# write it out
#ggsave("figs/mpas_smud_v_emp_scatterplot_2.png", width = 6, height = 4, units = "in")
ggplot(mpa_comps %>% filter(Country == "Belize")) +
geom_abline(slope = 1) +
geom_point(aes(x = Visitation, y = est_visitors))
## Hol Chan is underestimated by all three socmed options
## correlation I think isn't actually what I want... how about RMSE?
mpa_comps %>%
filter(!is.na(Visitation)) %>%
mutate(sq_error = (est_visitors - Visitation)^2) %>%
group_by(socmed) %>%
summarise(RMSE = mean(sq_error))
# great, smud is actually the lowest, and therefore the best
### Not sure what to do about Hol Chan being low. See notes in MAR notes and thoughts (pg 16)
## for a few things I investigated but rejected.
############ Plotting MPA Summaries #########
### Note that I checked on 6/21/21 that this creates the same results as reading in aoi_viz_exp.R directly and summarising by MPA
countrytp <- "Honduras"
ggplot(mpa_summaries %>% filter(Country == countrytp, socmed == "smud_prop")) +
geom_col(aes(x = reorder(MPA_short, visitors), y = visitors), fill = "darkred", width = .7) +
coord_flip() +
xlab(NULL) +
ylab("Annual Visitors") +
labs(title = paste0("Estimated 2017 Visitation by MPA - ", countrytp),
subtitle = paste("Updated", Sys.Date())) +
theme_bw()
#ggsave(paste0("figs/mpas_", countrytp, ".png"),
# width = 9, height = 9, units = "in", scale = .6)
# let's see if we can add expenditure information in text at the end of the bars
mpa_summaries %>% filter(Country == countrytp, socmed == "smud_prop") %>% arrange(desc(visitors))
## plot all MPAs together
ggplot(mpa_summaries %>% filter(socmed %in% c("smud_prop"))) +
geom_col(aes(x = reorder(MPA_short, visitors), y = visitors), fill = "darkred", width = .7) +
coord_flip() +
xlab(NULL) +
ylab("Annual Visitors") +
labs(title = paste0("Estimated 2017 Visitation by MPA")) +
theme_bw()
# write it out
#ggsave(paste0("figs/mpas_all.png"), width = 8, height = 8, units = "in")
## Let's do the all MPAs plot, but add in the empirical vis numbers
mpas_comps_tall <- mpa_comps %>%
filter(socmed == "smud_prop") %>%
gather(key = "source", value = "visitation", est_visitors, Visitation)
ggplot(mpas_comps_tall) +
geom_col(aes(x = reorder(MPA_short, prop), y = visitation, color = Country, alpha = source),
position = "dodge", width = .8) +
coord_flip() +
scale_alpha_discrete(breaks = c("est_visitors", "Visitation"),
labels = c("Model estimate", "Empirical Visitation")) +
labs(title = "Annual Visitation by MPA. Model estimates vs. Empirical viz numbers",
subtitle = paste("Updated", Sys.Date()))
#ggsave("figs/mpas_smud_v_empirical.png", height = 8, width = 8, unit = "in")
## plot all MPAs together, showing alternative social media combos
ggplot(mpa_summaries_tall %>% filter(socmed %in% c("smud2", "smud", "pud", "tud"),
Country == "Belize")) +
geom_col(aes(x = reorder(Name_short, visitors), y = visitors, fill = socmed), position = "dodge", width = .7) +
coord_flip() +
xlab(NULL) +
ylab("Annual Visitors") +
labs(title = paste0("Estimated 2017 Visitation by MPA")) +
theme_bw()
####################################################
######### Old MPA intersection, proportioning, and summary code ####
# pulling in gridded AOI & MPA shapefiles
aoi <- read_sf("~/Documents/MAR/ModelRuns/baseline_5k/T_aoi_v3_wgs_5k_pid.shp")
mpas <- read_sf("~/Documents/MAR/GIS/BordersandProtectedAreas/MPA/MPA_Network_WGS84_4326.shp")
# let's first try just a simple intersection and see what happens
st_crs(aoi)
st_crs(mpas)
grid_mpas <- st_intersection(aoi, mpas)
plot(grid_mpas, max.plot = 1)
# export
#write_sf(grid_mpas, "~/Downloads/test.shp")
# Let's calculate the proportionn of each grid which falls inside the MPA, then use this as a multplier times the
# avg_ann_ud for that grid cell to get number of posts from "within" the MPA
# what's the area of a single grid cell?
grid_area <- aoi %>%
mutate(area = unclass(st_area(aoi))) %>%
st_set_geometry(NULL)
# and, MPA grid areas? Plus calculate the prop_grid (proportion of the grid cell which is inside the MPA)
mpas_props <- grid_mpas %>%
mutate(area_mpa = unclass(st_area(grid_mpas))) %>%
left_join(grid_area, by = "pid") %>%
mutate(prop_grid = area_mpa/area)
## ok. So... pull in our social media data (by pid)
avg_ann_smud <- read_csv("~/Documents/MAR/ModelRuns/baseline_5k/avg_ann_smud_2005plus_19_101_SMUD.csv")
# and... let's bind it on to mpas_props
mpa_grid_sm <- mpas_props %>%
left_join(avg_ann_smud, by = "pid") %>%
mutate(smud_in_mpa = avg_ann_smud*prop_grid,
tud_in_mpa = avg_ann_tud*prop_grid,
pud_in_mpa = avg_ann_pud*prop_grid,
smud_prop_in_mpa = smud_prop*prop_grid)
mpa_grid_sm
# and, summarise by mpa
mpa_smud <- mpa_grid_sm %>%
group_by(Name_short, Country) %>%
summarise(smud_prop = sum(smud_in_mpa),
#smud_tot = sum(avg_ann_smud),
#pud_tot = sum(avg_ann_pud),
pud_prop = sum(pud_in_mpa),
#tud_tot = sum(avg_ann_tud),
tud_prop = sum(tud_in_mpa),
smud2_prop = sum(smud_prop_in_mpa)) %>%
ungroup()
# let's have a look and see if these seems reasonable
mpa_smud_tp <- st_set_geometry(mpa_smud, NULL)
ggplot(mpa_smud_tp)+
geom_col(aes(x = reorder(Name_short, smud_prop), y = smud_prop)) +
coord_flip()
# how about PUD on its own?
ggplot(mpa_smud_tp)+
geom_col(aes(x = reorder(Name_short, smud_prop), y = pud_prop)) +
coord_flip()
# and TUD?
ggplot(mpa_smud_tp)+
geom_col(aes(x = reorder(Name_short, smud_prop), y = tud_prop)) +
coord_flip()
## how about alternative smud made with proportions?
ggplot(mpa_smud_tp)+
geom_col(aes(x = reorder(Name_short, smud_prop), y = smud2_prop)) +
coord_flip()
# plotting together
mpa_smud_tall <- mpa_smud %>%
st_set_geometry(NULL) %>%
gather(key = "source", value = "UD", -Name_short, -Country)
ggplot(mpa_smud_tall %>% filter(source %in% c("tud_prop", "pud_prop"))) +
geom_col(aes(x = reorder(Name_short, UD), y = UD, fill = source)) +
coord_flip()
# ok, I need to know which country each pid falls into
countries_aoi <- read_sf("~/Documents/MAR/GIS/Downscaling/CountriesplusTAOI_v3.shp")
countries_aoi
c_aoi_valid <- lwgeom::st_make_valid(countries_aoi)
countries_pid_int <- st_intersection(aoi, c_aoi_valid)
# note that this splits some pids. Below I drop those that are split into two pieces within the same
# country, but don't worry for now about pids which straddle countries
countries_pid <- countries_pid_int %>%
st_set_geometry(NULL) %>%
dplyr::select(pid, country = CNTRY_NAME) %>%
distinct()
# combine with socmed and summarise by country
countries_ud <- avg_ann_smud %>%
left_join(countries_pid, by = "pid") %>%
group_by(country) %>%
summarise_at(vars(avg_ann_pud, avg_ann_tud, avg_ann_smud, smud_prop), sum) %>%
filter(!is.na(country))
vis_per_cell <- avg_ann_smud %>%
left_join(countries_pid, by = "pid") %>%
select(pid, smud2 = smud_prop, country) %>%
left_join(countries_ratios %>% filter(socmed == "smud2"), by = "country") %>%
mutate(est_vis = smud2*ud_to_vis,
est_exp = round(est_vis*AvgExp)) %>%
left_join(aoi, by = "pid")
for(source in c("pud", "tud", "smud", "smud2")){
plots <- ggplot(countries_ratios %>% filter(socmed == source)) +
geom_col(aes(x = Country, y = ud_to_vis)) +
labs(title = source)
print(plots)
}
plots
################ Old ####
# let's spread this data
countries_ratios %>% select(country, socmed, ud_to_vis) %>%
spread(key = "socmed", value = "ud_to_vis") %>%
rename(pud_to_vis = avg_ann_pud,
smud_to_vis = avg_ann_smud,
tud_to_vis = avg_ann_tud)
# let's pull out smud ratios alone
smud_ratios <- countries_ratios %>%
filter(socmed == "avg_ann_smud") %>%
rename(smud_to_vis = ud_to_vis) %>%
select(country, smud_to_vis)
##############
# Let's figure out how many SMUD I have across the entire MAR
#total_sm <- avg_ann_smud %>%
# summarise_at(vars(avg_ann_pud, avg_ann_tud, avg_ann_smud), sum)
# and... figure out the total visitor to avg_ann_smud ratio
#(viz_to_smud <- mar_summary$Visitors/total_sm$avg_ann_smud)
# ok. each avg_ann_smud is equivalent to ~25 visitors
#(viz_to_pud <- mar_summary$Visitors/total_sm$avg_ann_pud)
# and each avg_ann_pud is ~1000 visitors
mpa_summaries_tall <- mpa_smud_tall %>%
rename(socmed = source) %>%
mutate(socmed = str_extract(socmed, ".*(?=_)")) %>%
left_join(countries_ratios, by = c("Country" = "country", "socmed")) %>%
mutate(visitors = UD*ud_to_vis,
expenditures = visitors*AvgExp)
mpa_summaries_tall
#mpa_summaries #<-
mpa_smud %>%
left_join(avg_exp, by = "Country") %>%
left_join(smud_ratios, by = c("Country" = "country"))
mutate(visitors_smud = smud_prop*smud_to_vis,
expenditures_smud = visitors_smud*AvgExp,
visitors_pud = pud_prop*viz_to_pud)
mpa_summaries
## write it out
## NOTE: this doesn't work now since I haven't recoded mpa_summaries to wide format with geometries
#write_sf(mpa_summaries, "~/Documents/MAR/ModelRuns/baseline_5k/viz_and_expends/mpa_summaries.shp")
################ Old and other, supporting code #########
###############################
##### Let's check out whether proportions or total UD is better correlated with PUD pulled from the MPA shapefile ####
pud_avg_ann_shp <- read_csv("~/Documents/MAR/ModelRuns/baseline_mpas/pud/avg_ann_pud_pid.csv")
PUD_comps <- pud_avg_ann_shp %>%
rename(PUD_shape = avg_ann_ud) %>%
left_join(mpa_smud)
ggplot(PUD_comps, aes(x = PUD_shape)) +
#geom_point(aes(y = pud_tot)) +
geom_point(aes(y = pud_prop), col = "green") +
geom_abline(slope = 1)
# PUD_prop looks much better. Generally good, but there's one point that seems pretty off
PUD_comps %>% arrange(desc(pud_prop))
# Caye Caulker has more viz when calculated as a proportion of grid cells than when calculated using the MPA
# shapefile. I think this is becasue the Caye Caulker MPA doesn't actually include the popular Caye landmass
## Great. Proportions seems pretty good
##################################################################
################### Comparing viz numbers by country to PUD vs TUD and trying to figure out which one is better####
ggplot(countries_ud_emp) +
geom_point(aes(y = log1p(vizinAOI), x = log1p(avg_ann_ud))) +
geom_abline(slope = 1) +
facet_wrap(~socmed)
# hmm, they all have the appropriate relationship
# though the two middle countries swap for twitter...
countries_ud_emp %>% arrange(desc(vizinAOI))
# belize and honduras swap. twitter thinks honduras has more viz than belize, which isn't born out by the other data
# does a linear model help at all?
mod1 <- lm(log1p(country_summaries$vizinAOI) ~ log1p(avg_ann_pud) + log1p(avg_ann_tud), data = countries_ud)
summary(mod1)
plot(mod1)
mod2 <- lm(country_summaries$vizinAOI ~ -1 + avg_ann_pud + avg_ann_tud, data = countries_ud)
summary(mod2)
plot(mod2)
#####################################################
### Now let's make a plot - Using the mpa_summaries wide version. Now OLD
### plot of annual viz according to PUD proportions
ggplot(mpa_summaries) +
geom_col(aes(x = reorder(NAME, visitors_pud), y = visitors_pud)) +
coord_flip() +
xlab(NULL) +
labs(title = "Annual Visitors to MPAs (based on PUD Proportion)")
### plot of annual viz according to SMUD proportions
ggplot(mpa_summaries) +
geom_col(aes(x = reorder(NAME, visitors_smud), y = visitors_smud)) +
coord_flip() +
xlab(NULL) +
labs(title = "Annual Visitors to MPAs (based on TUD+PUD Proportion)")
## and, try to put them on a single plot
mpa_sum_tall <- mpa_summaries %>%
gather(key = "socmed", value = "visitors", c("visitors_smud", "visitors_pud"))