diff --git a/manipulation/1-greeter-population-3-cdc-bridged.R b/manipulation/1-greeter-population-3-cdc-bridged.R index 78cb259..22fe5ea 100644 --- a/manipulation/1-greeter-population-3-cdc-bridged.R +++ b/manipulation/1-greeter-population-3-cdc-bridged.R @@ -71,7 +71,8 @@ for(i in names(ls_input)){ "age", ~stringr::str_replace_all(., c("< 1" = "0" ,"85\\+" = "86"))) %>% mutate_at( - "age", as.integer) + "age", as.integer) %>% + tidyr::drop_na() } ds0 <- ls_input %>% bind_rows(.id = "year") %>% diff --git a/manipulation/2-greeter-suicide-2.R b/manipulation/2-greeter-suicide-2.R new file mode 100644 index 0000000..09a833b --- /dev/null +++ b/manipulation/2-greeter-suicide-2.R @@ -0,0 +1,164 @@ +# Lines before the first chunk are invisible to Rmd/Rnw callers +# Run to stitch a tech report of this script (used only in RStudio) +# knitr::stitch_rmd(script = "./manipulation/0-greeter.R", output = "./stitched-output/manipulation/0-greeter.md") +# knitr::stitch_rmd(script = "./manipulation/0-greeter.R", output = "./manipulation/stitched-output/0-greeter.md", ) +# this command is typically executed by the ./manipulation/governor.R + +rm(list=ls(all=TRUE)) #Clear the memory of variables from previous run. +# This is not called by knitr, because it's above the first chunk. +cat("\f") # clear console when working in RStudio + +# ---- load-sources ------------------------------------------------------------ +# Call `base::source()` on any repo file that defines functions needed below. +base::source("./scripts/common-functions.R") +# ---- load-packages ----------------------------------------------------------- +# Attach these packages so their functions don't need to be qualified +# see http://r-pkgs.had.co.nz/namespace.html#search-path +library(magrittr) #Pipes +library(dplyr) # disable when temp lines are removed +library(ggplot2) +library(ggpubr) +library(readxl) +# ---- declare-globals --------------------------------------------------------- +path_input <- "./data-unshared/raw/cause113-county/" + +# ---- load-data --------------------------------------------------------------- +# +input_files <- list.files(path_input,pattern = ".xlsx$", full.names = T) +# when downloading the tables the interface couldn't handle all years at once + +ls_input <- list() + +for(i in seq_along(input_files)){ + element_name <- gsub(pattern = ".xlsx", replacement = "",input_files[i]) + year_i <- stringr::str_sub(basename(element_name),-4) + ls_input[[year_i]] <- readxl::read_excel(input_files[i], col_names = TRUE, skip = 4) +} + + +# ---- tweak-data ---- + +for(i in names(ls_input)){ + names_all <- names(ls_input[[i]]) + names_part2 <- names_all[8:length(names_all)] + custom_names_part1 <- c("county" + ,"locus1" + ,"locus2" + ,"cause" + ,"gender" + ,"race" + ,"ethnicity" + ) + names(ls_input[[i]]) <- c(custom_names_part1,names_part2) + + d <- ls_input[[i]] %>% tidyr::fill(custom_names_part1) %>% + filter_at(vars(custom_names_part1),all_vars(!stringr::str_detect(.,"Total"))) %>% + select(-locus1, -locus2, -starts_with("Total"), -starts_with("...")) + + + names_stem <- setdiff(custom_names_part1,c("locus1","locus2")) + names_body <- setdiff(names(d), names_stem) + + d_out <- d %>% + tidyr::pivot_longer(names_body + ,names_to = "age" + ,values_to = "n_suicides" + ) %>% + mutate_at("age", as.integer) + + ls_input[[i]] <- d_out + +} + + + +# ---- combine-data ---- + +ds0 <- ls_input %>% bind_rows(.id = "year") %>% + mutate_at("year", as.integer) + + +# ---- tweek-data-1 ---- +# tweek data to allow factors to match other data sets + +ds1 <- ds0 %>% + mutate( + race_f = forcats::fct_recode(race, + "Black & Other" = "Black" + ,"Black & Other" = "Other") + ,ethnicity_f = forcats::fct_recode(ethnicity, + "Hispanic" = "Hispanic" + ,"Non-Hispanic" = "Non-Hispanic") + ) + + + + +# ---- save-to-disk ---------------------------- + +ds1 %>% readr::write_rds("./data-unshared/derived/2-greeted-suicide-2.rds" + ,compress = 'gz') +# for read-only inspection +ds1 %>% readr::write_csv("./data-unshared/derived/2-greeted-suicide-2.csv") + + +# ---- publish --------------------------------- +rmarkdown::render( + input = "./analysis/2-greeter/2-greeter-suicide-2.Rmd" + ,output_format = c( + "html_document" + # ,"pdf_document" + # ,"md_document" + # "word_document" + ) + ,clean=TRUE +) + + + + +# ---- single-file-testing ---- + +# used for testing script, unneeded for final production script + +# input_file <- "./data-unshared/raw/cause113-county/cause113-county-2018.xlsx" +# +# ds0 <- readxl::read_excel(input_file, col_names = TRUE, skip = 4) +# +# names_all <- names(ds0) +# names_part2 <- names_all[8:length(names_all)] +# custom_names_part1 <- c("county" +# ,"locus1" +# ,"locus2" +# ,"cause" +# ,"sex" +# ,"race" +# ,"ethnicity" +# ) +# names(ds0) <- c(custom_names_part1,names_part2) +# +# #GOT IT!!!! +# +# ds1 <- ds0 %>% tidyr::fill(custom_names_part1) %>% +# filter_at(vars(custom_names_part1),all_vars(!stringr::str_detect(.,"Total"))) %>% +# # filter(locus1 != "Total") %>% +# # filter(locus2 != "Total") %>% +# # filter(cause != "Total") %>% +# # filter(sex != "Total") %>% +# # filter(race != "Total") %>% +# # filter(ethnicity != "Total") %>% +# select(-locus1, -locus2) %>% +# select(-starts_with("Total"), -starts_with("...")) +# +# names_stem <- setdiff(custom_names_part1,c("locus1", "locus2")) +# names_body <- setdiff(names(ds1), names_stem) +# +# +# ds2 <- ds1 %>% tidyr::pivot_longer(names_body +# ,names_to = "age" +# ,values_to = "n_suicides") %>% +# mutate( +# year = "2018" +# ) +# + diff --git a/manipulation/9-aggregator-2.R b/manipulation/9-aggregator-2.R new file mode 100644 index 0000000..0a145a5 --- /dev/null +++ b/manipulation/9-aggregator-2.R @@ -0,0 +1,158 @@ +rm(list=ls(all=TRUE)) #Clear the memory of variables from previous run. +cat("\f") # clear console when working in RStudio + +# ---- load-sources ------------------------------------------------------------ + +# ---- load-packages ----------------------------------------------------------- +# Attach these packages so their functions don't need to be qualified +# see http://r-pkgs.had.co.nz/namespace.html#search-path +library(magrittr) #Pipes +library(dplyr) # disable when temp lines are removed +library(ggplot2) + +# ---- declare-globals --------------------------------------------------------- +path_file_input_0 <- "./data-unshared/derived/1-greeted-population-3-cdc.rds" +path_file_input_1 <- "./data-unshared/derived/2-greeted-suicide-2.rds" + + +# ---- load-data --------------------------------------------------------------- + +ds_population <- readr::read_rds(path_file_input_0) +ds_suicide <- readr::read_rds(path_file_input_1) + + +# ---- tweak-data ------------------------------ + +ds_population <- ds_population %>% + select(-race,-ethnicity) %>% + mutate_at( + "county", ~stringr::str_replace_all(. + ,c( + "St. Johns" = "Saint Johns" + ,"St. Lucie" = "Saint Lucie" + ,"DeSoto" = "Desoto" + )) + ) + + +ds_suicide <- ds_suicide %>% + select(-race,-ethnicity) %>% + mutate( + age = if_else(age < 85,age,as.integer(86)) + ,cause = if_else(stringr::str_detect(cause,"Firearms"),"Firearms","Other") + ) + + +# --- join-data + +ds0 <- ds_population %>% + left_join(ds_suicide, by = c("year" + ,"county" + ,"gender" + ,"age" + ,"race_f" + ,"ethnicity_f")) %>% + mutate_at("cause", ~stringr::str_replace_na(.,"None")) %>% + mutate_at("n_suicides", ~stringr::str_replace_na(.,0)) %>% + mutate_at("n_suicides", as.integer) %>% + mutate( + race_ethnicity = paste0(race_f, " + ", ethnicity_f) + ,race_ethnicity = forcats::as_factor(race_ethnicity) + ,cause = forcats::as_factor(cause) + ,gender = forcats::as_factor(gender) + ) + +# ---- compute-rate-function ---- + +compute_rate <- function( + d + ,grouping_frame + ,wide = FALSE +){ + #test variables + # d <- ds0 + # grouping_frame = c("county", "year") + #end test + + d_wide <- d %>% + group_by_at(c(grouping_frame, "cause")) %>% + summarise( + n_population = sum(population, na.rm = TRUE) + ,n_suicides = sum(n_suicides, na.rm = TRUE) + ) %>% + ungroup() %>% + mutate( + rate_suicide = ((n_suicides/n_population)*100000) + ) + + d_long <- d_wide %>% + tidyr::pivot_longer(c(n_suicides,rate_suicide) + ,names_to = "metric" + ,values_to = "value" + ) + + if(wide){ + return(d_wide) + } + + return(d_long) + +} + +#how to use +ds_example <- ds0 %>% compute_rate(grouping_frame = c("county", "year")) +ds_example_w <- ds0 %>% compute_rate(grouping_frame = c("county", "year"), wide = TRUE) + + +# ---- store-data ---- + +ls_grouping_frame <- list( + c("county","year" ) + ,c("county","year","gender" ) + ,c("county","year" ,"race_ethnicity" ) + ,c("county","year" ,"age") + ,c("county","year","gender","race_ethnicity" ) + ,c("county","year" ,"race_ethnicity","age") + ,c("county","year","gender" ,"age") + ,c("county","year","gender","race_ethnicity","age") + ,c( "year" ) + ,c( "year","gender" ) + ,c( "year" ,"race_ethnicity" ) + ,c( "year" ,"age") + ,c( "year","gender","race_ethnicity" ) + ,c( "year" ,"race_ethnicity","age") + ,c( "year","gender" ,"age") + ,c( "year","gender","race_ethnicity","age") +) + + + +#loop through all combos of grouping frame to store data + +for(i in seq_along(ls_grouping_frame)){ + path_to_folder <- "./data-unshared/derived/rate/" + frame_i <- ls_grouping_frame[[i]] + file_name <- paste0(path_to_folder, paste0(frame_i, collapse = "-"),".rds") + d_computed <- ds0 %>% compute_rate(grouping_frame = frame_i) + + d_computed %>% readr::write_rds(file_name, compress = "gz") +} + + +# store youth data + + +ds_youth <- ds0 %>% filter(age %in% 10:24) + + +for(i in seq_along(ls_grouping_frame)){ + path_to_folder <- "./data-unshared/derived/rate/youth/" + frame_i <- ls_grouping_frame[[i]] + file_name <- paste0(path_to_folder + ,paste0(frame_i, collapse = "-") + ,"-10-24" + ,".rds") + d_computed <- ds_youth %>% compute_rate(grouping_frame = frame_i) + + d_computed %>%readr::write_rds(file_name, compress = "gz") +}