-
Notifications
You must be signed in to change notification settings - Fork 12
/
BEAData.R
457 lines (437 loc) · 21.5 KB
/
BEAData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
#' Get BEA state GDP data (including GVA, Employment Compensation, Tax, and GOS)
#' for a specified year.
#' @param dataname A text indicating what state data to get.
#' Can be GVA, employee compensation, taxes, and gross operating surplus.
#' @param year A numeric value specifying year of interest.
#' @return A data frame of BEA state data for the specified year.
getBEAStateGDPData <- function(dataname, year) {
# Create the placeholder file
StateGVAzip <- "inst/extdata/SAGDP.zip"
dir <- "inst/extdata/SAGDP"
# Download all BEA IO tables into the placeholder file
if (!file.exists(StateGVAzip)) {
utils::download.file("https://apps.bea.gov/regional/zip/SAGDP.zip",
StateGVAzip, mode = "wb")
# Get the name of all files in the zip archive
tmp <- unzip(StateGVAzip, list = TRUE)
fname <- tmp[tmp$Length > 0, ]$Name
# Unzip the file to the designated directory
unzip(StateGVAzip, files = fname, exdir = dir, overwrite = TRUE)
}
# Determine data filename
if (dataname == "GVA") {
FileName <- list.files(dir, pattern = "SAGDP2N__ALL_AREAS.*\\.csv")
} else if (dataname == "Tax") {
FileName <- list.files(dir, pattern = "SAGDP3N__ALL_AREAS.*\\.csv")
} else if (dataname == "Compensation") {
FileName <- list.files(dir, pattern = "SAGDP4N__ALL_AREAS.*\\.csv")
} else if (dataname == "GOS") {
FileName <- list.files(dir, pattern = "SAGDP7N__ALL_AREAS.*\\.csv")
}
FullFileName <- file.path(dir, FileName)
# Get date_accessed
date_accessed <- as.character(as.Date(file.mtime(StateGVAzip)))
# Find latest data year
file_split <- unlist(stringr::str_split(FileName, pattern = "_"))
endyear <- sub(".csv", "", file_split[length(file_split)])
# Load state data
if (year <= endyear) {
StateData <- utils::read.table(FullFileName, sep = ",",
header = TRUE, stringsAsFactors = FALSE,
check.names = FALSE, fill = TRUE)
# Get date_last_modified
date_last_modified <- stringr::str_match(StateData[grep("Last updated: ",
StateData$GeoFIPS),
"GeoFIPS"],
"Last updated: (.*?)--")[2]
StateData <- StateData[!is.na(StateData$LineCode), ]
# Assign NaN to (L) Less than $50,000.
StateData[StateData == "(L)"] <- NaN
# Assign NA to (D) Not shown to avoid disclosure of confidential information;
# estimates are included in higher-level totals.
StateData[StateData == "(D)"] <- NA
# Replace (NA) with NA
StateData[StateData == "(NA)"] <- NA
# Convert all the other values to numeric
year_col <- as.character(year)
StateData[, year_col] <- sapply(StateData[, year_col], as.numeric)
# Convert values to current US $
if (unique(StateData$Unit) == "Millions of current dollars") {
StateData[, year_col] <- StateData[, year_col]*1E6
} else if (unique(StateData$Unit) == "Thousands of dollars") {
StateData[, year_col] <- StateData[, year_col]*1E3
}
# Keep state-level data
geo_names <- c(state.name, "District of Columbia", "United States *")
# Save data
df <- StateData[StateData$GeoName %in% geo_names,
c("GeoName", "LineCode", "Description", year)]
# Write data to .rds
data_name <- paste("State", dataname, year,
utils::packageDescription("stateior", fields = "Version"),
sep = "_")
saveRDS(object = df,
file = paste0(file.path("data", data_name), ".rds"))
# Write metadata to JSON
useeior:::writeMetadatatoJSON(package = "stateior",
name = data_name,
year = year,
source = "US Bureau of Economic Analysis",
url = "https://apps.bea.gov/regional/zip/SAGDP.zip",
date_last_modified = date_last_modified,
date_accessed = date_accessed)
} else {
logging::logwarn(paste(year, "state", dataname, "data is not avaliable from BEA.",
"Nothing is returned."))
}
}
# Download, save and document BEA state economic data
for (dataname in c("GVA", "Tax", "Compensation", "GOS")) {
getBEAStateGDPData(dataname, year)
}
#' Get BEA state PCE (personal consumption expenditures) data for a specified year.
#' @param year A numeric value specifying year of interest.
#' @return A data frame of BEA state PCE data for the specified year.
#' available.
getBEAStatePCE <- function(year) {
# Create the placeholder file
StatePCEzip <- "inst/extdata/SAPCE.zip"
dir <- "inst/extdata/SAPCE"
# Download all BEA IO tables into the placeholder file
if (!file.exists(StatePCEzip)) {
utils::download.file("https://apps.bea.gov/regional/zip/SAPCE.zip",
StatePCEzip, mode = "wb")
# Get the name of all files in the zip archive
tmp <- unzip(StatePCEzip, list = TRUE)
fname <- tmp[tmp$Length > 0, ]$Name
# Unzip the file to the designated directory
unzip(StatePCEzip, files = fname, exdir = dir, overwrite = TRUE)
}
# Define FileName and FullFileName
FileName <- list.files(dir, pattern = "SAPCE1__ALL_AREAS")
FullFileName <- file.path(dir, FileName)
# Get date_accessed
date_accessed <- as.character(as.Date(file.mtime(StatePCEzip)))
# Find latest data year
file_split <- unlist(stringr::str_split(FileName, pattern = "_"))
endyear <- sub(".csv", "", file_split[length(file_split)])
# Load state PCE data
if (year <= endyear) {
StatePCE <- utils::read.table(FullFileName, sep = ",",
header = TRUE, stringsAsFactors = FALSE,
check.names = FALSE, fill = TRUE)
# Get date_last_modified
date_last_modified <- stringr::str_match(StatePCE[grep("Last updated: ",
StatePCE$GeoFIPS),
"GeoFIPS"],
"Last updated: (.*?)--")[2]
StatePCE <- StatePCE[!is.na(StatePCE$Line), ]
# Replace NA with zero
StatePCE[is.na(StatePCE)] <- 0
# Convert values to current US $
year_col <- as.character(year)
StatePCE[, year_col] <- StatePCE[, year_col]*1E6
# Keep state-level data
geo_names <- c(state.name, "District of Columbia", "United States")
# Save data
df <- StatePCE[StatePCE$GeoName %in% geo_names,
c("GeoName", "LineCode", "Description", year)]
# Write data to .rds
data_name <- paste("State_PCE", year,
utils::packageDescription("stateior", fields = "Version"),
sep = "_")
saveRDS(object = df,
file = paste0(file.path("data", data_name), ".rds"))
# Write metadata to JSON
useeior:::writeMetadatatoJSON(package = "stateior",
name = data_name,
year = year,
source = "US Bureau of Economic Analysis",
url = "https://apps.bea.gov/regional/zip/SAPCE.zip",
date_last_modified = date_last_modified,
date_accessed = date_accessed)
} else {
logging::logwarn(paste(year, "state PCE data is not avaliable from BEA.",
"Nothing is returned."))
}
}
# Download, save and document BEA state PCE from 2010 to the latest year available.
getBEAStatePCE(year)
#' Get BEA state foreign travel and expenditures by U.S. residents and
#' expenditures in the U.S. by nonresidents data from 2010 to the latest year
#' available.
#' @return A data frame of BEA state foreign travel and expenditures by U.S.
#' residents and expenditures in the U.S. by nonresidents data from 2010 to
#' the latest year available.
getBEAStateResidentAndNonresidentSpending <- function() {
# Create the placeholder file
StatePCEzip <- "inst/extdata/SAPCE.zip"
dir <- "inst/extdata/SAPCE"
# Download all BEA IO tables into the placeholder file
if (!file.exists(StatePCEzip)) {
utils::download.file("https://apps.bea.gov/regional/zip/SAPCE.zip",
StatePCEzip, mode = "wb")
# Get the name of all files in the zip archive
tmp <- unzip(StatePCEzip, list = TRUE)
fname <- tmp[tmp$Length > 0, ]$Name
# Unzip the file to the designated directory
unzip(StatePCEzip, files = fname, exdir = dir, overwrite = TRUE)
}
# Define FileName and FullFileName
FileName <- list.files(dir, pattern = "SAPCE4__ALL_AREAS")
FullFileName <- file.path(dir, FileName)
# Get date_accessed
date_accessed <- as.character(as.Date(file.mtime(StatePCEzip)))
# Find latest data year
file_split <- unlist(stringr::str_split(FileName, pattern = "_"))
end_year <- sub(".csv", "", file_split[length(file_split)])
# Create year_cols
year_cols <- as.character(2010:end_year)
# Load state PCE data
StatePCE <- utils::read.table(FullFileName, sep = ",",
header = TRUE, stringsAsFactors = FALSE,
check.names = FALSE, fill = TRUE)
# Get date_last_modified
date_last_modified <- stringr::str_match(StatePCE[grep("Last updated: ",
StatePCE$GeoFIPS),
"GeoFIPS"],
"Last updated: (.*?)--")[2]
# Get resident and nonresident spending
Spending <- StatePCE[StatePCE$LineCode %in% c(129, 130), ]
# Replace NA with zero
Spending[is.na(Spending)] <- 0
# Convert values to current US $
Spending[, year_cols] <- sapply(Spending[, year_cols], as.numeric)*1E6
# Keep state-level data
geo_names <- c(state.name, "District of Columbia", "United States")
# Save data
for (year in year_cols) {
for (linecode in c(129:130)) {
df <- Spending[Spending$GeoName %in% geo_names & Spending$LineCode == linecode,
c("GeoName", "LineCode", "Description", year)]
# Write data to .rds
data_name <- paste("State",
ifelse(linecode == 129,
"ForeignExpenditureByResident",
"DomesticExpenditureByNonresident"),
year,
utils::packageDescription("stateior", fields = "Version"),
sep = "_")
saveRDS(object = df,
file = paste0(file.path("data", data_name), ".rds"))
# Write metadata to JSON
useeior:::writeMetadatatoJSON(package = "stateior",
name = data_name,
year = year,
source = "US Bureau of Economic Analysis",
url = "https://apps.bea.gov/regional/zip/SAPCE.zip",
date_last_modified = date_last_modified,
date_accessed = date_accessed)
}
}
}
# Download, save and document BEA state foreign travel and expenditures by U.S.
# residents and expenditures in the U.S. by nonresidents data from 2010 to
# the latest year available.
getBEAStateResidentAndNonresidentSpending()
#' Download BEA US Gov Expenditure data (NIPA table).
#' @return A data frame of BEA US Gov Expenditure data (NIPA table).
downloadBEAGovExpenditure <- function() {
TableName <- "Section3All_xls.xlsx"
dir <- "inst/extdata/StateLocalGovFinances"
if (!dir.exists(dir)) {
dir.create(dir)
}
FullFileName <- file.path(dir, TableName)
url <- "https://apps.bea.gov/national/Release/XLS/Survey/Section3All_xls.xlsx"
# Download NIPA table file
if (!file.exists(FullFileName)) {
utils::download.file(url, FullFileName, mode = "wb")
}
# Create output
ls <- list("url" = url,
"date_accessed" = as.character(as.Date(file.mtime(FullFileName))))
return(ls)
}
#' Get BEA US Gov Investment data by state (Table 3.9.5 annual) for a specified
#' year.
#' @param year A numeric value specifying year of interest.
#' @return A data frame of BEA US Gov Investment data for the specified year.
getBEAGovInvestment <- function(year) {
# Download US Gov Expenditure (NIPA table) from BEA
url <- downloadBEAGovExpenditure()[["url"]]
date_accessed <- downloadBEAGovExpenditure()[["date_accessed"]]
# Load Gov Investment table
TableName <- "Section3All_xls.xlsx"
FullFileName <- file.path("inst/extdata/StateLocalGovFinances", TableName)
# Get date_last_modified
notes <- as.data.frame(readxl::read_excel(FullFileName,
sheet = "T30905-A",
range = "A1:A7"))
date_last_modified <- stringr::str_match(notes[grep("File created ",
notes[, 1]),
1],
"File created (.*?) ")[2]
# Load data
GovInvestment <- as.data.frame(readxl::read_excel(FullFileName,
sheet = "T30905-A",
col_names = TRUE,
skip = 7))
# Assign column name to the description column
colnames(GovInvestment)[2] <- "Description"
# Create year_col
year_col <- as.character(year)
# Save data
if (year %in% colnames(GovInvestment)) {
df <- GovInvestment[complete.cases(GovInvestment),
c("Line", "Description", year)]
# Convert values from million $ to $
df[, year_col] <- df[, year_col]*1E6
# Write data to .rds
data_name <- paste("GovInvestment", year,
utils::packageDescription("stateior", fields = "Version"),
sep = "_")
saveRDS(object = df,
file = paste0(file.path("data", data_name), ".rds"))
# Write metadata to JSON
useeior:::writeMetadatatoJSON(package = "stateior",
name = data_name,
year = year,
source = "US Bureau of Economic Analysis",
url = url,
date_last_modified = date_last_modified,
date_accessed = date_accessed)
} else {
logging::logwarn(paste(year, "state-level US government investment data is not",
"avaliable from BEA. Nothing is returned."))
}
}
# Download, save and document BEA US government investment data.
getBEAGovInvestment(year)
#' Get BEA US Gov Consumption data by state (Table 3.10.5 annual) for a specified
#' year.
#' @param year A numeric value specifying year of interest.
#' @return A data frame of BEA US Gov Consumption by state data for the specified
#' year.
getBEAGovConsumption <- function(year) {
# Download US Gov Expenditure (NIPA table) from BEA
url <- downloadBEAGovExpenditure()[["url"]]
date_accessed <- downloadBEAGovExpenditure()[["date_accessed"]]
# Load Gov Consumption table
TableName <- "Section3All_xls.xlsx"
FullFileName <- paste0("inst/extdata/StateLocalGovFinances/", TableName)
# Get date_last_modified
notes <- as.data.frame(readxl::read_excel(FullFileName,
sheet = "T31005-A",
range = "A1:A7"))
date_last_modified <- stringr::str_match(notes[grep("File created ",
notes[, 1]),
1],
"File created (.*?) ")[2]
# Load data
GovConsumption <- as.data.frame(readxl::read_excel(FullFileName,
sheet = "T31005-A",
col_names = TRUE,
skip = 7))
# Assign column name to the description column
colnames(GovConsumption)[2] <- "Description"
# Create year_col
year_col <- as.character(year)
# Save data
if (year %in% colnames(GovConsumption)) {
df <- GovConsumption[complete.cases(GovConsumption),
c("Line", "Description", year)]
# Convert values from million $ to $
GovConsumption[, year_col] <- GovConsumption[, year_col]*1E6
# Write data to .rds
data_name <- paste("GovConsumption", year,
utils::packageDescription("stateior", fields = "Version"),
sep = "_")
saveRDS(object = df,
file = paste0(file.path("data", data_name), ".rds"))
# Write metadata to JSON
useeior:::writeMetadatatoJSON(package = "stateior",
name = data_name,
year = year,
source = "US Bureau of Economic Analysis",
url = url,
date_last_modified = date_last_modified,
date_accessed = date_accessed)
} else {
logging::logwarn(paste(year, "state-level US government consumption data is not",
"avaliable from BEA. Nothing is returned."))
}
}
# Download, save and document BEA US government consumption by state data
getBEAGovConsumption(year)
#####################################################################################
################## BELOW ARE COUNTY MODEL DATA PROCESSING FUNCTIONS #################
#####################################################################################
#' getBEACountySectorGDP
#'
#' It returns the original county GDP of one specified state at BEA-sector(Line Code)
#' level with NAs.
#'
#' @param year A numeric value between 2001 and 2018 specifying the year of interest. If 0 ,return a dataframe with data from all years available
#' @param state A string character specifying the state of interest, 'GA'
#' @param axis A numeric value, 0,1. if 0, each county will be a col, if 1, row, default 0. This parameter only works when you specify one year
#' @return A data frame contains selected county GDP by BEA sector industries at a specific year.
getBEACountySectorGDP = function(year, state, axis = 0) {
# Create the placeholder file
CountyGDPzip = "inst/extdata/CAGDP2.zip"
# Download all BEA IO tables into the placeholder file
if(!file.exists(CountyGDPzip)) {
utils::download.file("https://apps.bea.gov/regional/zip/CAGDP2.zip", CountyGDPzip, mode = "wb")
# Get the name of all files in the zip archive
fname = unzip(CountyGDPzip, list = TRUE)[unzip(CountyGDPzip, list = TRUE)$Length > 0, ]$Name
# Unzip the file to the designated directory
unzip(CountyGDPzip, files = fname, exdir = "inst/extdata/CAGDP2", overwrite = TRUE)
}
# filter for specified state
fileName = paste0('inst/extdata/CAGDP2/CAGDP2_', paste0(getStateAbbreviation(state),'_2001_2018.csv'))
# read data
countyData = utils::read.table(fileName, sep = ",", header = TRUE,
stringsAsFactors = FALSE, check.names = FALSE,
fill = TRUE)
# drop last four rows (notes in original file)
countyData = countyData[!is.na(countyData$Region), ]
# select only BEA-sector-level rows
sectorLevelLineCode = c(1,3,6,10,11,12,34,35,36,45,50,59,68,75,82,83) #######TODO: have a crosswalk file in extdata and read from it, instead of hardcoding?
countyData = countyData[countyData$LineCode %in% sectorLevelLineCode, ]
# convert data type (string to numeric) Note: NAs introduced by coercion, which is ok
year_range = seq(2001,2018,1)
countyData[, as.character(year_range)] = sapply(countyData[, as.character(year_range)],
as.numeric)
# Decision1: return all-year table or one-year table
if (year == 0) {
# retain all year columns
countyDataAllYear = countyData[, c('GeoFIPS', 'GeoName', 'LineCode', as.character(year_range))]
# convert unit
for (yr in as.character(year_range)) {
countyDataAllYear[[yr]] = 1000 * countyDataAllYear[[yr]]
}
return(countyDataAllYear)
} else {
# retain specified year only
countyDataOneYear = countyData[, c('GeoName', 'LineCode', year)]
countyDataOneYear[[as.character(year)]] = 1000 * countyDataOneYear[[as.character(year)]]
}
# Decision2: transpose the table? (county as column or as row)
if (axis == 0) {
countyDataOneYearColumn = countyDataOneYear
colnames(countyDataOneYearColumn)[ncol(countyDataOneYearColumn)] = 'GDP'
# drop State Total Rows (you can still find it in row table)
dropStateFilter = sapply(attributes(countyDataOneYearColumn)$row.names, as.numeric) >= 36 # because state total always takes the first 35 rows(indices) of each table
countyDataOneYearColumn = countyDataOneYearColumn[dropStateFilter, ]
# WHY ARRANGE? It is because there is a discrepancy between rank by FIPS (default) and Name.
# We decide to stick with character string ranking (For instance: in Georgia, we arrange 'Macon' before 'McIntosh' while Macon in fact has a larger fips than McIntosh)
countyDataOneYearColumn = countyDataOneYearColumn %>%
dplyr::arrange(GeoName) %>%
dplyr::spread(GeoName, GDP)
return(countyDataOneYearColumn)
} else {
return(countyDataOneYear)
}
}
# CountyGA_BEASectorGDP_2001_2018 = getBEACountySectorGDP(year = 0, state = 'Georgia', axis = 1)
# usethis::use_data(CountyGA_BEASectorGDP_2001_2018, overwrite = TRUE)