-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #43 from dss-hmi/dev-kyle
Merge to update greeter scripts
- Loading branch information
Showing
3 changed files
with
324 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
# Lines before the first chunk are invisible to Rmd/Rnw callers | ||
# Run to stitch a tech report of this script (used only in RStudio) | ||
# knitr::stitch_rmd(script = "./manipulation/0-greeter.R", output = "./stitched-output/manipulation/0-greeter.md") | ||
# knitr::stitch_rmd(script = "./manipulation/0-greeter.R", output = "./manipulation/stitched-output/0-greeter.md", ) | ||
# this command is typically executed by the ./manipulation/governor.R | ||
|
||
rm(list=ls(all=TRUE)) #Clear the memory of variables from previous run. | ||
# This is not called by knitr, because it's above the first chunk. | ||
cat("\f") # clear console when working in RStudio | ||
|
||
# ---- load-sources ------------------------------------------------------------ | ||
# Call `base::source()` on any repo file that defines functions needed below. | ||
base::source("./scripts/common-functions.R") | ||
# ---- load-packages ----------------------------------------------------------- | ||
# Attach these packages so their functions don't need to be qualified | ||
# see http://r-pkgs.had.co.nz/namespace.html#search-path | ||
library(magrittr) #Pipes | ||
library(dplyr) # disable when temp lines are removed | ||
library(ggplot2) | ||
library(ggpubr) | ||
library(readxl) | ||
# ---- declare-globals --------------------------------------------------------- | ||
path_input <- "./data-unshared/raw/cause113-county/" | ||
|
||
# ---- load-data --------------------------------------------------------------- | ||
# | ||
input_files <- list.files(path_input,pattern = ".xlsx$", full.names = T) | ||
# when downloading the tables the interface couldn't handle all years at once | ||
|
||
ls_input <- list() | ||
|
||
for(i in seq_along(input_files)){ | ||
element_name <- gsub(pattern = ".xlsx", replacement = "",input_files[i]) | ||
year_i <- stringr::str_sub(basename(element_name),-4) | ||
ls_input[[year_i]] <- readxl::read_excel(input_files[i], col_names = TRUE, skip = 4) | ||
} | ||
|
||
|
||
# ---- tweak-data ---- | ||
|
||
for(i in names(ls_input)){ | ||
names_all <- names(ls_input[[i]]) | ||
names_part2 <- names_all[8:length(names_all)] | ||
custom_names_part1 <- c("county" | ||
,"locus1" | ||
,"locus2" | ||
,"cause" | ||
,"gender" | ||
,"race" | ||
,"ethnicity" | ||
) | ||
names(ls_input[[i]]) <- c(custom_names_part1,names_part2) | ||
|
||
d <- ls_input[[i]] %>% tidyr::fill(custom_names_part1) %>% | ||
filter_at(vars(custom_names_part1),all_vars(!stringr::str_detect(.,"Total"))) %>% | ||
select(-locus1, -locus2, -starts_with("Total"), -starts_with("...")) | ||
|
||
|
||
names_stem <- setdiff(custom_names_part1,c("locus1","locus2")) | ||
names_body <- setdiff(names(d), names_stem) | ||
|
||
d_out <- d %>% | ||
tidyr::pivot_longer(names_body | ||
,names_to = "age" | ||
,values_to = "n_suicides" | ||
) %>% | ||
mutate_at("age", as.integer) | ||
|
||
ls_input[[i]] <- d_out | ||
|
||
} | ||
|
||
|
||
|
||
# ---- combine-data ---- | ||
|
||
ds0 <- ls_input %>% bind_rows(.id = "year") %>% | ||
mutate_at("year", as.integer) | ||
|
||
|
||
# ---- tweek-data-1 ---- | ||
# tweek data to allow factors to match other data sets | ||
|
||
ds1 <- ds0 %>% | ||
mutate( | ||
race_f = forcats::fct_recode(race, | ||
"Black & Other" = "Black" | ||
,"Black & Other" = "Other") | ||
,ethnicity_f = forcats::fct_recode(ethnicity, | ||
"Hispanic" = "Hispanic" | ||
,"Non-Hispanic" = "Non-Hispanic") | ||
) | ||
|
||
|
||
|
||
|
||
# ---- save-to-disk ---------------------------- | ||
|
||
ds1 %>% readr::write_rds("./data-unshared/derived/2-greeted-suicide-2.rds" | ||
,compress = 'gz') | ||
# for read-only inspection | ||
ds1 %>% readr::write_csv("./data-unshared/derived/2-greeted-suicide-2.csv") | ||
|
||
|
||
# ---- publish --------------------------------- | ||
rmarkdown::render( | ||
input = "./analysis/2-greeter/2-greeter-suicide-2.Rmd" | ||
,output_format = c( | ||
"html_document" | ||
# ,"pdf_document" | ||
# ,"md_document" | ||
# "word_document" | ||
) | ||
,clean=TRUE | ||
) | ||
|
||
|
||
|
||
|
||
# ---- single-file-testing ---- | ||
|
||
# used for testing script, unneeded for final production script | ||
|
||
# input_file <- "./data-unshared/raw/cause113-county/cause113-county-2018.xlsx" | ||
# | ||
# ds0 <- readxl::read_excel(input_file, col_names = TRUE, skip = 4) | ||
# | ||
# names_all <- names(ds0) | ||
# names_part2 <- names_all[8:length(names_all)] | ||
# custom_names_part1 <- c("county" | ||
# ,"locus1" | ||
# ,"locus2" | ||
# ,"cause" | ||
# ,"sex" | ||
# ,"race" | ||
# ,"ethnicity" | ||
# ) | ||
# names(ds0) <- c(custom_names_part1,names_part2) | ||
# | ||
# #GOT IT!!!! | ||
# | ||
# ds1 <- ds0 %>% tidyr::fill(custom_names_part1) %>% | ||
# filter_at(vars(custom_names_part1),all_vars(!stringr::str_detect(.,"Total"))) %>% | ||
# # filter(locus1 != "Total") %>% | ||
# # filter(locus2 != "Total") %>% | ||
# # filter(cause != "Total") %>% | ||
# # filter(sex != "Total") %>% | ||
# # filter(race != "Total") %>% | ||
# # filter(ethnicity != "Total") %>% | ||
# select(-locus1, -locus2) %>% | ||
# select(-starts_with("Total"), -starts_with("...")) | ||
# | ||
# names_stem <- setdiff(custom_names_part1,c("locus1", "locus2")) | ||
# names_body <- setdiff(names(ds1), names_stem) | ||
# | ||
# | ||
# ds2 <- ds1 %>% tidyr::pivot_longer(names_body | ||
# ,names_to = "age" | ||
# ,values_to = "n_suicides") %>% | ||
# mutate( | ||
# year = "2018" | ||
# ) | ||
# | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
rm(list=ls(all=TRUE)) #Clear the memory of variables from previous run. | ||
cat("\f") # clear console when working in RStudio | ||
|
||
# ---- load-sources ------------------------------------------------------------ | ||
|
||
# ---- load-packages ----------------------------------------------------------- | ||
# Attach these packages so their functions don't need to be qualified | ||
# see http://r-pkgs.had.co.nz/namespace.html#search-path | ||
library(magrittr) #Pipes | ||
library(dplyr) # disable when temp lines are removed | ||
library(ggplot2) | ||
|
||
# ---- declare-globals --------------------------------------------------------- | ||
path_file_input_0 <- "./data-unshared/derived/1-greeted-population-3-cdc.rds" | ||
path_file_input_1 <- "./data-unshared/derived/2-greeted-suicide-2.rds" | ||
|
||
|
||
# ---- load-data --------------------------------------------------------------- | ||
|
||
ds_population <- readr::read_rds(path_file_input_0) | ||
ds_suicide <- readr::read_rds(path_file_input_1) | ||
|
||
|
||
# ---- tweak-data ------------------------------ | ||
|
||
ds_population <- ds_population %>% | ||
select(-race,-ethnicity) %>% | ||
mutate_at( | ||
"county", ~stringr::str_replace_all(. | ||
,c( | ||
"St. Johns" = "Saint Johns" | ||
,"St. Lucie" = "Saint Lucie" | ||
,"DeSoto" = "Desoto" | ||
)) | ||
) | ||
|
||
|
||
ds_suicide <- ds_suicide %>% | ||
select(-race,-ethnicity) %>% | ||
mutate( | ||
age = if_else(age < 85,age,as.integer(86)) | ||
,cause = if_else(stringr::str_detect(cause,"Firearms"),"Firearms","Other") | ||
) | ||
|
||
|
||
# --- join-data | ||
|
||
ds0 <- ds_population %>% | ||
left_join(ds_suicide, by = c("year" | ||
,"county" | ||
,"gender" | ||
,"age" | ||
,"race_f" | ||
,"ethnicity_f")) %>% | ||
mutate_at("cause", ~stringr::str_replace_na(.,"None")) %>% | ||
mutate_at("n_suicides", ~stringr::str_replace_na(.,0)) %>% | ||
mutate_at("n_suicides", as.integer) %>% | ||
mutate( | ||
race_ethnicity = paste0(race_f, " + ", ethnicity_f) | ||
,race_ethnicity = forcats::as_factor(race_ethnicity) | ||
,cause = forcats::as_factor(cause) | ||
,gender = forcats::as_factor(gender) | ||
) | ||
|
||
# ---- compute-rate-function ---- | ||
|
||
compute_rate <- function( | ||
d | ||
,grouping_frame | ||
,wide = FALSE | ||
){ | ||
#test variables | ||
# d <- ds0 | ||
# grouping_frame = c("county", "year") | ||
#end test | ||
|
||
d_wide <- d %>% | ||
group_by_at(c(grouping_frame, "cause")) %>% | ||
summarise( | ||
n_population = sum(population, na.rm = TRUE) | ||
,n_suicides = sum(n_suicides, na.rm = TRUE) | ||
) %>% | ||
ungroup() %>% | ||
mutate( | ||
rate_suicide = ((n_suicides/n_population)*100000) | ||
) | ||
|
||
d_long <- d_wide %>% | ||
tidyr::pivot_longer(c(n_suicides,rate_suicide) | ||
,names_to = "metric" | ||
,values_to = "value" | ||
) | ||
|
||
if(wide){ | ||
return(d_wide) | ||
} | ||
|
||
return(d_long) | ||
|
||
} | ||
|
||
#how to use | ||
ds_example <- ds0 %>% compute_rate(grouping_frame = c("county", "year")) | ||
ds_example_w <- ds0 %>% compute_rate(grouping_frame = c("county", "year"), wide = TRUE) | ||
|
||
|
||
# ---- store-data ---- | ||
|
||
ls_grouping_frame <- list( | ||
c("county","year" ) | ||
,c("county","year","gender" ) | ||
,c("county","year" ,"race_ethnicity" ) | ||
,c("county","year" ,"age") | ||
,c("county","year","gender","race_ethnicity" ) | ||
,c("county","year" ,"race_ethnicity","age") | ||
,c("county","year","gender" ,"age") | ||
,c("county","year","gender","race_ethnicity","age") | ||
,c( "year" ) | ||
,c( "year","gender" ) | ||
,c( "year" ,"race_ethnicity" ) | ||
,c( "year" ,"age") | ||
,c( "year","gender","race_ethnicity" ) | ||
,c( "year" ,"race_ethnicity","age") | ||
,c( "year","gender" ,"age") | ||
,c( "year","gender","race_ethnicity","age") | ||
) | ||
|
||
|
||
|
||
#loop through all combos of grouping frame to store data | ||
|
||
for(i in seq_along(ls_grouping_frame)){ | ||
path_to_folder <- "./data-unshared/derived/rate/" | ||
frame_i <- ls_grouping_frame[[i]] | ||
file_name <- paste0(path_to_folder, paste0(frame_i, collapse = "-"),".rds") | ||
d_computed <- ds0 %>% compute_rate(grouping_frame = frame_i) | ||
|
||
d_computed %>% readr::write_rds(file_name, compress = "gz") | ||
} | ||
|
||
|
||
# store youth data | ||
|
||
|
||
ds_youth <- ds0 %>% filter(age %in% 10:24) | ||
|
||
|
||
for(i in seq_along(ls_grouping_frame)){ | ||
path_to_folder <- "./data-unshared/derived/rate/youth/" | ||
frame_i <- ls_grouping_frame[[i]] | ||
file_name <- paste0(path_to_folder | ||
,paste0(frame_i, collapse = "-") | ||
,"-10-24" | ||
,".rds") | ||
d_computed <- ds_youth %>% compute_rate(grouping_frame = frame_i) | ||
|
||
d_computed %>%readr::write_rds(file_name, compress = "gz") | ||
} |