forked from UW-GAC/primed_data_models
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sheets_to_JSON_gsr_dd.R
127 lines (80 loc) · 3.28 KB
/
sheets_to_JSON_gsr_dd.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
library(googlesheets4)
library(dplyr)
library(tidyr)
library(stringr)
# link to the data
url <- "https://docs.google.com/spreadsheets/d/1xfSQqRQIq6pGkJ5jzzv2QhetmX5boaEZoNECpDwXe5I"
# table metadata
tables <- list(read_sheet(url, sheet="GSR_files_DD", skip=1, col_types="c"))
meta_tsv <- tibble(
entity="meta",
required="TRUE",
table=c("gsr_files_dd")
)
table_names <- meta_tsv$table
names(tables) <- table_names
rm(list = c("table_names", "url"))
# keep only non-empty rows
for (i in 1:length(tables)) {
tables[[i]] <- tables[[i]][apply(tables[[i]], 1, function(x){sum(!is.na(x))}) != 0, ]
}
# assign new names to the columns in the Google Sheets file
"categories" -> "Categories"
"required" -> "Required"
"column" -> "Column"
"description" -> "Description"
"references" -> "References"
"data_type" -> "Data type"
"enumerations" -> "Enumerations"
"multi_value_delimeter" -> "Multi-value delimeter"
"examples" -> "Examples"
"notes_comments" -> "Notes/comments"
# manually check that names are in the correct order
expected_names <- c("Categories", "Column", "Description", "Data type", "Required", "References", "Enumerations", "Multi-value delimeter", "Examples", "Notes/comments", "CC", "CAPE", "CARDINAL", "D-PRISM", "EPIC-PRS", "FFAIRR-PRS", "PREVENT", "PRIMED-Cancer", "...8")
for (i in 1:length(tables)) {
# check if all table names are in the expected list
if (all(names(tables[[i]]) %in% expected_names)) {
# if so, re-order as according to the preferred ordering
tables[[i]] <- tables[[i]][, intersect(names(tables[[i]]), expected_names)]
# re-label the columns to the preferred notation
colnames(tables[[i]]) <- sapply(intersect(names(tables[[i]]), expected_names),
function(x){ifelse(exists(x), get(x), x)})
} else {
# otherwise, identify which columns were unexpected
unexpected_names <- names(tables[[i]])[(!(names(tables[[i]]) %in% expected_names))]
stop(paste0("the following columns in the Google Sheets file were not expected:\n ",
paste0(1:length(unexpected_names), ". ", unexpected_names, collapse = "\n ")))
}
}
rm(list = c("i", "expected_names", expected_names[sapply(expected_names, exists)]))
# call in the sheets_to_list function that accepts two arguments:
# 1) the list describing which tables are in the Google Sheets file
# 2) the list of data tables corresponding to the first argument
source("sheets_to_list.R")
tab_list <- sheets_to_list(apply(meta_tsv[, -1], 1, as.list), tables)
rm(list = c("meta_tsv", "tables", "sheets_to_list"))
# initialize leading text
master <- list(
list(
# Overall File Details
name = "PRIMED GSR Data Dictionary",
description = "Insert description here...",
version = "1.3",
# Data Table Details
tables = tab_list
)
)
rm(list = c("tab_list"))
# compile master file in JSON format
out <- toJSON(x = master,
pretty = TRUE,
auto_unbox = TRUE,
unbox = TRUE)
rm(list = c("master"))
# unquote the logical parameters TRUE and FALSE
out <- gsub(pattern = ': \"TRUE\"', replacement = ': true', x = out)
out <- gsub(pattern = ': \"FALSE\"', replacement = ': false', x = out)
# view the final version
out
# save the final version
write(out, "PRIMED_GSR_data_dictionary.json")