Skip to content

Commit

Permalink
Merge pull request #43 from dss-hmi/dev-kyle
Browse files Browse the repository at this point in the history
Merge to update greeter scripts
  • Loading branch information
andkov authored May 28, 2020
2 parents 065612f + 332c970 commit 3e6ecd6
Show file tree
Hide file tree
Showing 3 changed files with 324 additions and 1 deletion.
3 changes: 2 additions & 1 deletion manipulation/1-greeter-population-3-cdc-bridged.R
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ for(i in names(ls_input)){
"age", ~stringr::str_replace_all(., c("< 1" = "0"
,"85\\+" = "86"))) %>%
mutate_at(
"age", as.integer)
"age", as.integer) %>%
tidyr::drop_na()
}

ds0 <- ls_input %>% bind_rows(.id = "year") %>%
Expand Down
164 changes: 164 additions & 0 deletions manipulation/2-greeter-suicide-2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# Lines before the first chunk are invisible to Rmd/Rnw callers
# Run to stitch a tech report of this script (used only in RStudio)
# knitr::stitch_rmd(script = "./manipulation/0-greeter.R", output = "./stitched-output/manipulation/0-greeter.md")
# knitr::stitch_rmd(script = "./manipulation/0-greeter.R", output = "./manipulation/stitched-output/0-greeter.md", )
# this command is typically executed by the ./manipulation/governor.R

rm(list=ls(all=TRUE)) #Clear the memory of variables from previous run.
# This is not called by knitr, because it's above the first chunk.
cat("\f") # clear console when working in RStudio

# ---- load-sources ------------------------------------------------------------
# Call `base::source()` on any repo file that defines functions needed below.
base::source("./scripts/common-functions.R")
# ---- load-packages -----------------------------------------------------------
# Attach these packages so their functions don't need to be qualified
# see http://r-pkgs.had.co.nz/namespace.html#search-path
library(magrittr) #Pipes
library(dplyr) # disable when temp lines are removed
library(ggplot2)
library(ggpubr)
library(readxl)
# ---- declare-globals ---------------------------------------------------------
path_input <- "./data-unshared/raw/cause113-county/"

# ---- load-data ---------------------------------------------------------------
#
input_files <- list.files(path_input,pattern = ".xlsx$", full.names = T)
# when downloading the tables the interface couldn't handle all years at once

ls_input <- list()

for(i in seq_along(input_files)){
element_name <- gsub(pattern = ".xlsx", replacement = "",input_files[i])
year_i <- stringr::str_sub(basename(element_name),-4)
ls_input[[year_i]] <- readxl::read_excel(input_files[i], col_names = TRUE, skip = 4)
}


# ---- tweak-data ----

for(i in names(ls_input)){
names_all <- names(ls_input[[i]])
names_part2 <- names_all[8:length(names_all)]
custom_names_part1 <- c("county"
,"locus1"
,"locus2"
,"cause"
,"gender"
,"race"
,"ethnicity"
)
names(ls_input[[i]]) <- c(custom_names_part1,names_part2)

d <- ls_input[[i]] %>% tidyr::fill(custom_names_part1) %>%
filter_at(vars(custom_names_part1),all_vars(!stringr::str_detect(.,"Total"))) %>%
select(-locus1, -locus2, -starts_with("Total"), -starts_with("..."))


names_stem <- setdiff(custom_names_part1,c("locus1","locus2"))
names_body <- setdiff(names(d), names_stem)

d_out <- d %>%
tidyr::pivot_longer(names_body
,names_to = "age"
,values_to = "n_suicides"
) %>%
mutate_at("age", as.integer)

ls_input[[i]] <- d_out

}



# ---- combine-data ----

ds0 <- ls_input %>% bind_rows(.id = "year") %>%
mutate_at("year", as.integer)


# ---- tweek-data-1 ----
# tweek data to allow factors to match other data sets

ds1 <- ds0 %>%
mutate(
race_f = forcats::fct_recode(race,
"Black & Other" = "Black"
,"Black & Other" = "Other")
,ethnicity_f = forcats::fct_recode(ethnicity,
"Hispanic" = "Hispanic"
,"Non-Hispanic" = "Non-Hispanic")
)




# ---- save-to-disk ----------------------------

ds1 %>% readr::write_rds("./data-unshared/derived/2-greeted-suicide-2.rds"
,compress = 'gz')
# for read-only inspection
ds1 %>% readr::write_csv("./data-unshared/derived/2-greeted-suicide-2.csv")


# ---- publish ---------------------------------
rmarkdown::render(
input = "./analysis/2-greeter/2-greeter-suicide-2.Rmd"
,output_format = c(
"html_document"
# ,"pdf_document"
# ,"md_document"
# "word_document"
)
,clean=TRUE
)




# ---- single-file-testing ----

# used for testing script, unneeded for final production script

# input_file <- "./data-unshared/raw/cause113-county/cause113-county-2018.xlsx"
#
# ds0 <- readxl::read_excel(input_file, col_names = TRUE, skip = 4)
#
# names_all <- names(ds0)
# names_part2 <- names_all[8:length(names_all)]
# custom_names_part1 <- c("county"
# ,"locus1"
# ,"locus2"
# ,"cause"
# ,"sex"
# ,"race"
# ,"ethnicity"
# )
# names(ds0) <- c(custom_names_part1,names_part2)
#
# #GOT IT!!!!
#
# ds1 <- ds0 %>% tidyr::fill(custom_names_part1) %>%
# filter_at(vars(custom_names_part1),all_vars(!stringr::str_detect(.,"Total"))) %>%
# # filter(locus1 != "Total") %>%
# # filter(locus2 != "Total") %>%
# # filter(cause != "Total") %>%
# # filter(sex != "Total") %>%
# # filter(race != "Total") %>%
# # filter(ethnicity != "Total") %>%
# select(-locus1, -locus2) %>%
# select(-starts_with("Total"), -starts_with("..."))
#
# names_stem <- setdiff(custom_names_part1,c("locus1", "locus2"))
# names_body <- setdiff(names(ds1), names_stem)
#
#
# ds2 <- ds1 %>% tidyr::pivot_longer(names_body
# ,names_to = "age"
# ,values_to = "n_suicides") %>%
# mutate(
# year = "2018"
# )
#

158 changes: 158 additions & 0 deletions manipulation/9-aggregator-2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
rm(list=ls(all=TRUE)) #Clear the memory of variables from previous run.
cat("\f") # clear console when working in RStudio

# ---- load-sources ------------------------------------------------------------

# ---- load-packages -----------------------------------------------------------
# Attach these packages so their functions don't need to be qualified
# see http://r-pkgs.had.co.nz/namespace.html#search-path
library(magrittr) #Pipes
library(dplyr) # disable when temp lines are removed
library(ggplot2)

# ---- declare-globals ---------------------------------------------------------
path_file_input_0 <- "./data-unshared/derived/1-greeted-population-3-cdc.rds"
path_file_input_1 <- "./data-unshared/derived/2-greeted-suicide-2.rds"


# ---- load-data ---------------------------------------------------------------

ds_population <- readr::read_rds(path_file_input_0)
ds_suicide <- readr::read_rds(path_file_input_1)


# ---- tweak-data ------------------------------

ds_population <- ds_population %>%
select(-race,-ethnicity) %>%
mutate_at(
"county", ~stringr::str_replace_all(.
,c(
"St. Johns" = "Saint Johns"
,"St. Lucie" = "Saint Lucie"
,"DeSoto" = "Desoto"
))
)


ds_suicide <- ds_suicide %>%
select(-race,-ethnicity) %>%
mutate(
age = if_else(age < 85,age,as.integer(86))
,cause = if_else(stringr::str_detect(cause,"Firearms"),"Firearms","Other")
)


# --- join-data

ds0 <- ds_population %>%
left_join(ds_suicide, by = c("year"
,"county"
,"gender"
,"age"
,"race_f"
,"ethnicity_f")) %>%
mutate_at("cause", ~stringr::str_replace_na(.,"None")) %>%
mutate_at("n_suicides", ~stringr::str_replace_na(.,0)) %>%
mutate_at("n_suicides", as.integer) %>%
mutate(
race_ethnicity = paste0(race_f, " + ", ethnicity_f)
,race_ethnicity = forcats::as_factor(race_ethnicity)
,cause = forcats::as_factor(cause)
,gender = forcats::as_factor(gender)
)

# ---- compute-rate-function ----

compute_rate <- function(
d
,grouping_frame
,wide = FALSE
){
#test variables
# d <- ds0
# grouping_frame = c("county", "year")
#end test

d_wide <- d %>%
group_by_at(c(grouping_frame, "cause")) %>%
summarise(
n_population = sum(population, na.rm = TRUE)
,n_suicides = sum(n_suicides, na.rm = TRUE)
) %>%
ungroup() %>%
mutate(
rate_suicide = ((n_suicides/n_population)*100000)
)

d_long <- d_wide %>%
tidyr::pivot_longer(c(n_suicides,rate_suicide)
,names_to = "metric"
,values_to = "value"
)

if(wide){
return(d_wide)
}

return(d_long)

}

#how to use
ds_example <- ds0 %>% compute_rate(grouping_frame = c("county", "year"))
ds_example_w <- ds0 %>% compute_rate(grouping_frame = c("county", "year"), wide = TRUE)


# ---- store-data ----

ls_grouping_frame <- list(
c("county","year" )
,c("county","year","gender" )
,c("county","year" ,"race_ethnicity" )
,c("county","year" ,"age")
,c("county","year","gender","race_ethnicity" )
,c("county","year" ,"race_ethnicity","age")
,c("county","year","gender" ,"age")
,c("county","year","gender","race_ethnicity","age")
,c( "year" )
,c( "year","gender" )
,c( "year" ,"race_ethnicity" )
,c( "year" ,"age")
,c( "year","gender","race_ethnicity" )
,c( "year" ,"race_ethnicity","age")
,c( "year","gender" ,"age")
,c( "year","gender","race_ethnicity","age")
)



#loop through all combos of grouping frame to store data

for(i in seq_along(ls_grouping_frame)){
path_to_folder <- "./data-unshared/derived/rate/"
frame_i <- ls_grouping_frame[[i]]
file_name <- paste0(path_to_folder, paste0(frame_i, collapse = "-"),".rds")
d_computed <- ds0 %>% compute_rate(grouping_frame = frame_i)

d_computed %>% readr::write_rds(file_name, compress = "gz")
}


# store youth data


ds_youth <- ds0 %>% filter(age %in% 10:24)


for(i in seq_along(ls_grouping_frame)){
path_to_folder <- "./data-unshared/derived/rate/youth/"
frame_i <- ls_grouping_frame[[i]]
file_name <- paste0(path_to_folder
,paste0(frame_i, collapse = "-")
,"-10-24"
,".rds")
d_computed <- ds_youth %>% compute_rate(grouping_frame = frame_i)

d_computed %>%readr::write_rds(file_name, compress = "gz")
}

0 comments on commit 3e6ecd6

Please sign in to comment.