Merge pull request #43 from dss-hmi/dev-kyle

Merge to update greeter scripts
dss-hmi · May 28, 2020 · 3e6ecd6 · 3e6ecd6
2 parents 065612f + 332c970
commit 3e6ecd6
Show file tree

Hide file tree

Showing 3 changed files with 324 additions and 1 deletion.
diff --git a/manipulation/1-greeter-population-3-cdc-bridged.R b/manipulation/1-greeter-population-3-cdc-bridged.R
@@ -71,7 +71,8 @@ for(i in names(ls_input)){
       "age", ~stringr::str_replace_all(., c("< 1"  = "0"
                                             ,"85\\+" = "86"))) %>%
     mutate_at(
-      "age", as.integer)
+      "age", as.integer) %>% 
+    tidyr::drop_na()
 }
 
 ds0 <- ls_input %>% bind_rows(.id = "year") %>% 

diff --git a/manipulation/2-greeter-suicide-2.R b/manipulation/2-greeter-suicide-2.R
@@ -0,0 +1,164 @@
+# Lines before the first chunk are invisible to Rmd/Rnw callers
+# Run to stitch a tech report of this script (used only in RStudio)
+# knitr::stitch_rmd(script = "./manipulation/0-greeter.R", output = "./stitched-output/manipulation/0-greeter.md")
+# knitr::stitch_rmd(script = "./manipulation/0-greeter.R", output = "./manipulation/stitched-output/0-greeter.md", )
+# this command is typically executed by the ./manipulation/governor.R
+
+rm(list=ls(all=TRUE)) #Clear the memory of variables from previous run. 
+# This is not called by knitr, because it's above the first chunk.
+cat("\f") # clear console when working in RStudio
+
+# ---- load-sources ------------------------------------------------------------
+# Call `base::source()` on any repo file that defines functions needed below.  
+base::source("./scripts/common-functions.R")
+# ---- load-packages -----------------------------------------------------------
+# Attach these packages so their functions don't need to be qualified
+# see http://r-pkgs.had.co.nz/namespace.html#search-path
+library(magrittr) #Pipes
+library(dplyr) # disable when temp lines are removed
+library(ggplot2)
+library(ggpubr)
+library(readxl)
+# ---- declare-globals ---------------------------------------------------------
+path_input       <- "./data-unshared/raw/cause113-county/"
+
+# ---- load-data ---------------------------------------------------------------
+#
+input_files <- list.files(path_input,pattern = ".xlsx$", full.names = T)
+# when downloading the tables the interface couldn't handle all years at once
+
+ls_input <- list()
+
+for(i in seq_along(input_files)){
+  element_name <- gsub(pattern = ".xlsx", replacement = "",input_files[i])
+  year_i <- stringr::str_sub(basename(element_name),-4)
+  ls_input[[year_i]] <- readxl::read_excel(input_files[i], col_names = TRUE, skip = 4)
+}
+
+
+# ---- tweak-data ----
+
+for(i in names(ls_input)){
+  names_all           <- names(ls_input[[i]])
+  names_part2         <- names_all[8:length(names_all)]
+  custom_names_part1  <- c("county"
+                           ,"locus1"
+                           ,"locus2"
+                           ,"cause"
+                           ,"gender"
+                           ,"race"
+                           ,"ethnicity" 
+                          )
+  names(ls_input[[i]]) <- c(custom_names_part1,names_part2)
+
+  d <- ls_input[[i]] %>% tidyr::fill(custom_names_part1) %>% 
+    filter_at(vars(custom_names_part1),all_vars(!stringr::str_detect(.,"Total"))) %>% 
+    select(-locus1, -locus2, -starts_with("Total"), -starts_with("..."))
+
+
+  names_stem <- setdiff(custom_names_part1,c("locus1","locus2"))
+  names_body <- setdiff(names(d), names_stem)
+
+  d_out <- d %>% 
+    tidyr::pivot_longer(names_body
+                        ,names_to  = "age"
+                        ,values_to = "n_suicides"
+                        ) %>% 
+    mutate_at("age", as.integer)
+
+  ls_input[[i]] <- d_out
+
+}
+
+
+
+# ---- combine-data ----
+
+ds0 <- ls_input %>% bind_rows(.id = "year") %>% 
+  mutate_at("year", as.integer)
+
+
+# ---- tweek-data-1 ----
+# tweek data to allow factors to match other data sets
+
+ds1 <- ds0 %>% 
+  mutate(
+    race_f = forcats::fct_recode(race,
+                                 "Black & Other"  =  "Black" 
+                                 ,"Black & Other" =  "Other")
+    ,ethnicity_f = forcats::fct_recode(ethnicity,
+                                        "Hispanic"      = "Hispanic"
+                                        ,"Non-Hispanic" = "Non-Hispanic")
+  )
+
+
+
+
+# ---- save-to-disk ----------------------------
+
+ds1 %>% readr::write_rds("./data-unshared/derived/2-greeted-suicide-2.rds"
+                         ,compress = 'gz')
+# for read-only inspection
+ds1 %>% readr::write_csv("./data-unshared/derived/2-greeted-suicide-2.csv") 
+
+
+# ---- publish ---------------------------------
+rmarkdown::render(
+  input = "./analysis/2-greeter/2-greeter-suicide-2.Rmd"
+  ,output_format = c(
+    "html_document" 
+    # ,"pdf_document"
+    # ,"md_document"
+    # "word_document" 
+  )
+  ,clean=TRUE
+)
+
+
+
+
+# ---- single-file-testing ----
+
+# used for testing script, unneeded for final production script
+
+# input_file <- "./data-unshared/raw/cause113-county/cause113-county-2018.xlsx"
+# 
+# ds0 <- readxl::read_excel(input_file, col_names = TRUE, skip = 4)
+# 
+# names_all <- names(ds0)
+# names_part2 <- names_all[8:length(names_all)]
+# custom_names_part1 <- c("county"
+#                         ,"locus1"
+#                         ,"locus2"
+#                         ,"cause"
+#                         ,"sex"
+#                         ,"race"
+#                         ,"ethnicity"
+# )
+# names(ds0) <- c(custom_names_part1,names_part2)
+# 
+# #GOT IT!!!!
+# 
+# ds1 <-  ds0 %>% tidyr::fill(custom_names_part1) %>%
+#   filter_at(vars(custom_names_part1),all_vars(!stringr::str_detect(.,"Total"))) %>% 
+# # filter(locus1 != "Total") %>%
+# # filter(locus2 != "Total") %>%
+# # filter(cause != "Total") %>%
+# # filter(sex != "Total") %>%
+# # filter(race != "Total") %>%
+# # filter(ethnicity != "Total") %>%
+#  select(-locus1, -locus2) %>%
+#   select(-starts_with("Total"), -starts_with("..."))
+# 
+# names_stem <- setdiff(custom_names_part1,c("locus1", "locus2"))
+# names_body <- setdiff(names(ds1), names_stem)
+# 
+# 
+# ds2 <- ds1 %>% tidyr::pivot_longer(names_body
+#                                    ,names_to  = "age"
+#                                    ,values_to = "n_suicides") %>%
+#   mutate(
+#     year = "2018"
+#   )
+# 
+
diff --git a/manipulation/9-aggregator-2.R b/manipulation/9-aggregator-2.R
@@ -0,0 +1,158 @@
+rm(list=ls(all=TRUE)) #Clear the memory of variables from previous run. 
+cat("\f") # clear console when working in RStudio
+
+# ---- load-sources ------------------------------------------------------------
+
+# ---- load-packages -----------------------------------------------------------
+# Attach these packages so their functions don't need to be qualified
+# see http://r-pkgs.had.co.nz/namespace.html#search-path
+library(magrittr) #Pipes
+library(dplyr) # disable when temp lines are removed
+library(ggplot2)
+
+# ---- declare-globals ---------------------------------------------------------
+path_file_input_0 <- "./data-unshared/derived/1-greeted-population-3-cdc.rds"
+path_file_input_1 <- "./data-unshared/derived/2-greeted-suicide-2.rds"
+
+
+# ---- load-data ---------------------------------------------------------------
+
+ds_population <- readr::read_rds(path_file_input_0)
+ds_suicide    <- readr::read_rds(path_file_input_1)
+
+
+# ---- tweak-data ------------------------------
+
+ds_population <- ds_population %>% 
+  select(-race,-ethnicity) %>% 
+  mutate_at(
+    "county", ~stringr::str_replace_all(.
+                                        ,c(
+                                          "St. Johns"  = "Saint Johns"
+                                          ,"St. Lucie" = "Saint Lucie"
+                                          ,"DeSoto"    = "Desoto"
+                                        ))
+  )
+
+
+ds_suicide <- ds_suicide %>%
+  select(-race,-ethnicity) %>% 
+  mutate(
+    age    = if_else(age < 85,age,as.integer(86))
+    ,cause = if_else(stringr::str_detect(cause,"Firearms"),"Firearms","Other")
+  ) 
+
+
+# --- join-data
+
+ds0 <- ds_population %>% 
+  left_join(ds_suicide, by = c("year"
+                               ,"county"
+                               ,"gender"
+                               ,"age"
+                               ,"race_f"
+                               ,"ethnicity_f")) %>% 
+  mutate_at("cause", ~stringr::str_replace_na(.,"None")) %>% 
+  mutate_at("n_suicides", ~stringr::str_replace_na(.,0)) %>% 
+  mutate_at("n_suicides", as.integer) %>% 
+  mutate(
+    race_ethnicity  = paste0(race_f, " + ", ethnicity_f)
+    ,race_ethnicity = forcats::as_factor(race_ethnicity) 
+    ,cause          = forcats::as_factor(cause)
+    ,gender         = forcats::as_factor(gender)
+  )
+
+# ---- compute-rate-function ----
+
+compute_rate <- function(
+  d
+  ,grouping_frame
+  ,wide = FALSE
+){
+  #test variables
+  # d <- ds0
+  # grouping_frame = c("county", "year")
+  #end test
+
+  d_wide <- d %>% 
+    group_by_at(c(grouping_frame, "cause")) %>% 
+    summarise(
+      n_population = sum(population, na.rm = TRUE)
+      ,n_suicides  = sum(n_suicides, na.rm = TRUE)
+    ) %>% 
+    ungroup() %>% 
+    mutate(
+      rate_suicide = ((n_suicides/n_population)*100000)
+    )
+
+  d_long <- d_wide %>% 
+    tidyr::pivot_longer(c(n_suicides,rate_suicide)
+                        ,names_to = "metric" 
+                        ,values_to = "value" 
+                        )
+
+  if(wide){
+    return(d_wide)
+  }
+
+  return(d_long)
+
+}
+
+#how to use
+ds_example   <- ds0 %>% compute_rate(grouping_frame = c("county", "year"))
+ds_example_w <- ds0 %>% compute_rate(grouping_frame = c("county", "year"), wide = TRUE)
+
+
+# ---- store-data ----
+
+ls_grouping_frame <- list(
+   c("county","year"                                )
+  ,c("county","year","gender"                       )
+  ,c("county","year"         ,"race_ethnicity"      )
+  ,c("county","year"                          ,"age")
+  ,c("county","year","gender","race_ethnicity"      )
+  ,c("county","year"         ,"race_ethnicity","age")
+  ,c("county","year","gender"                 ,"age")
+  ,c("county","year","gender","race_ethnicity","age")
+  ,c(         "year"                                )
+  ,c(         "year","gender"                       )
+  ,c(         "year"         ,"race_ethnicity"      )
+  ,c(         "year"                          ,"age")
+  ,c(         "year","gender","race_ethnicity"      )
+  ,c(         "year"         ,"race_ethnicity","age")
+  ,c(         "year","gender"                 ,"age")
+  ,c(         "year","gender","race_ethnicity","age")
+)
+
+
+
+#loop through all combos of grouping frame to store data
+
+for(i in seq_along(ls_grouping_frame)){
+  path_to_folder <- "./data-unshared/derived/rate/"
+  frame_i        <- ls_grouping_frame[[i]]
+  file_name      <- paste0(path_to_folder, paste0(frame_i, collapse = "-"),".rds")
+  d_computed     <- ds0 %>% compute_rate(grouping_frame = frame_i)
+
+  d_computed %>% readr::write_rds(file_name, compress = "gz")
+}
+
+
+# store youth data 
+
+
+ds_youth <- ds0 %>% filter(age %in% 10:24)
+
+
+for(i in seq_along(ls_grouping_frame)){
+  path_to_folder <- "./data-unshared/derived/rate/youth/"
+  frame_i        <- ls_grouping_frame[[i]]
+  file_name      <- paste0(path_to_folder
+                           ,paste0(frame_i, collapse = "-")
+                           ,"-10-24"
+                           ,".rds")
+  d_computed     <- ds_youth %>% compute_rate(grouping_frame = frame_i)
+
+  d_computed %>%readr::write_rds(file_name, compress = "gz")
+}