forked from UW-GAC/primed_data_models
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sheets_to_tsv_gsr.R
56 lines (47 loc) · 1.68 KB
/
sheets_to_tsv_gsr.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
library(googlesheets4)
library(dplyr)
library(tidyr)
library(stringr)
# Google Sheets document that is read below, one worksheet per table.
url <- "https://docs.google.com/spreadsheets/d/1xfSQqRQIq6pGkJ5jzzv2QhetmX5boaEZoNECpDwXe5I"

# table metadata: one "meta" row per table, each flagged as required
table_names <- c("analysis", "gsr_file")
meta_tsv <- tibble(
  entity = "meta",
  required = "TRUE",
  table = table_names
)

# Read each worksheet into a named list keyed by table name. The first row of
# every sheet is skipped and all columns are read as character.
tables <- setNames(
  lapply(table_names, function(sheet_name) {
    read_sheet(url, sheet = sheet_name, skip = 1, col_types = "c")
  }),
  table_names
)
# Build the "Table" rows of the output for one sheet.
#
# t: name of a table; used to look up its data frame in the global `tables`
#    list and recorded in the `table` column of the result.
#
# Rows without a `Data type` are dropped. Returns a tibble with the columns
# entity, table, column, type, required, pk, ref and note.
tsv_format <- function(t) {
  id_column <- paste0(t, "_id")
  typed_rows <- filter(tables[[t]], !is.na(`Data type`))

  typed_rows %>%
    mutate(
      entity = "Table",
      table = t,
      column = Column,
      # enumerated columns use their own name as the type
      type = ifelse(`Data type` == "enumeration", Column, `Data type`),
      required = Required,
      # the column named "<table>_id" is flagged as primary key
      pk = ifelse(id_column == Column, TRUE, NA),
      # remove external table reference
      ref = ifelse(grepl("omop_concept", References), NA, References),
      # replace double with single quote, then newline with space
      note = gsub("\n", " ", gsub('"', "'", Description))
    ) %>%
    select(entity, table, column, type, required, pk, ref, note)
}
# Format every table and stack the results into one data frame.
out <- bind_rows(lapply(table_names, tsv_format))
# enumerated values
# Build the "enum" rows of the output for one sheet: one row per allowed value
# of every column whose Enumerations cell is filled in.
#
# t: name of a table; used to look up its data frame in the global `tables`
#    list.
#
# Returns a tibble with the columns entity (always "enum"), table (the name of
# the enumerated column) and column (a single allowed value).
enum_format <- function(t) {
  tables[[t]] %>%
    filter(!is.na(Enumerations)) %>%
    mutate(entity = "enum") %>%
    select(entity, table = Column, column = Enumerations) %>%
    # the cell lists one allowed value per line
    separate_rows(column, sep = "\n") %>%
    mutate(column = str_trim(column)) %>%
    # a blank line or trailing newline in the cell would otherwise produce an
    # enum entry with an empty value
    filter(column != "")
}
# Collect the enumerated values of every table, dropping exact duplicates.
enum_tsv <- distinct(bind_rows(lapply(table_names, enum_format)))

# Final layout: table rows first, then enum rows, then meta rows.
out <- bind_rows(out, enum_tsv, meta_tsv)

# Missing values are written as empty fields and nothing is quoted or escaped.
readr::write_tsv(
  out,
  file = "PRIMED_GSR_data_model.tsv",
  na = "",
  escape = "none"
)