-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathexpand_model_out_grid.R
474 lines (443 loc) · 15.5 KB
/
expand_model_out_grid.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
#' Create expanded grid of valid task ID and output type value combinations
#'
#' @param config_tasks a list version of the content's of a hub's `tasks.json`
#' config file, accessed through the `"config_tasks"` attribute of a `<hub_connection>`
#' object or function [hubUtils::read_config()].
#' @param round_id Character string. Round identifier. If the round is set to
#' `round_id_from_variable: true`, IDs are values of the task ID defined in the round's
#' `round_id` property of `config_tasks`.
#' Otherwise should match round's `round_id` value in config. Ignored if hub
#' contains only a single round.
#' @param required_vals_only Logical. Whether to return only combinations of
#' Task ID and related output type ID required values.
#' @param all_character Logical. Whether to return all character column.
#' @param bind_model_tasks Logical. Whether to bind expanded grids of
#' values from multiple modeling tasks into a single tibble/arrow table or
#' return a list.
#' @param include_sample_ids Logical. Whether to include sample identifiers in
#' the `output_type_id` column.
#' @param compound_taskid_set List of character vectors, one for each modeling task
#' in the round. Can be used to override the compound task ID set defined in the
#' config. If `NULL` is provided for a given modeling task, a compound task ID set of
#' all task IDs is used.
#'
#' @return If `bind_model_tasks = TRUE` (default) a tibble or arrow table
#' containing all possible task ID and related output type ID
#' value combinations. If `bind_model_tasks = FALSE`, a list containing a
#' tibble or arrow table for each round modeling task.
#'
#' Columns are coerced to data types according to the hub schema,
#' unless `all_character = TRUE`. If `all_character = TRUE`, all columns are returned as
#' character which can be faster when large expanded grids are expected.
#' If `required_vals_only = TRUE`, values are limited to the combinations of required
#' values only.
#' @inheritParams hubData::coerce_to_hub_schema
#' @details
#' When a round is set to `round_id_from_variable: true`,
#' the value of the task ID from which round IDs are derived (i.e. the task ID
#' specified in `round_id` property of `config_tasks`) is set to the value of the
#' `round_id` argument in the returned output.
#'
#' When sample output types are included in the output and `include_sample_ids = TRUE`,
#' the `output_type_id` column contains example sample indexes which are useful
#' for identifying the compound task ID structure of multivariate sampling
#' distributions in particular, i.e. which combinations of task ID values
#' represent individual samples.
#' @export
#'
#' @examples
#' hub_con <- hubData::connect_hub(
#' system.file("testhubs/flusight", package = "hubUtils")
#' )
#' config_tasks <- attr(hub_con, "config_tasks")
#' expand_model_out_grid(config_tasks, round_id = "2023-01-02")
#' expand_model_out_grid(
#' config_tasks,
#' round_id = "2023-01-02",
#' required_vals_only = TRUE
#' )
#' # Specifying a round in a hub with multiple round configurations.
#' hub_con <- hubData::connect_hub(
#' system.file("testhubs/simple", package = "hubUtils")
#' )
#' config_tasks <- attr(hub_con, "config_tasks")
#' expand_model_out_grid(config_tasks, round_id = "2022-10-01")
#' # Later round_id maps to round config that includes additional task ID 'age_group'.
#' expand_model_out_grid(config_tasks, round_id = "2022-10-29")
#' # Coerce all columns to character
#' expand_model_out_grid(config_tasks,
#' round_id = "2022-10-29",
#' all_character = TRUE
#' )
#' # Return arrow table
#' expand_model_out_grid(config_tasks,
#' round_id = "2022-10-29",
#' all_character = TRUE,
#' as_arrow_table = TRUE
#' )
#' # Hub with sample output type
#' config_tasks <- hubUtils::read_config_file(system.file("config", "tasks.json",
#' package = "hubValidations"
#' ))
#' expand_model_out_grid(config_tasks,
#' round_id = "2022-12-26"
#' )
#' # Include sample IDS
#' expand_model_out_grid(config_tasks,
#' round_id = "2022-12-26",
#' include_sample_ids = TRUE
#' )
#' # Hub with sample output type and compound task ID structure
#' config_tasks <- hubUtils::read_config_file(system.file("config", "tasks-comp-tid.json",
#' package = "hubValidations"
#' ))
#' expand_model_out_grid(config_tasks,
#' round_id = "2022-12-26",
#' include_sample_ids = TRUE
#' )
#' # Override config compound task ID set
#' # Create coarser compound task ID set for the first modeling task which contains
#' # samples
#' expand_model_out_grid(config_tasks,
#' round_id = "2022-12-26",
#' include_sample_ids = TRUE,
#' compound_taskid_set = list(
#' c("forecast_date", "target"),
#' NULL
#' )
#' )
#' expand_model_out_grid(config_tasks,
#' round_id = "2022-12-26",
#' include_sample_ids = TRUE,
#' compound_taskid_set = list(
#' NULL,
#' NULL
#' )
#' )
expand_model_out_grid <- function(config_tasks,
round_id,
required_vals_only = FALSE,
all_character = FALSE,
output_type_id_datatype = c(
"from_config", "auto", "character",
"double", "integer",
"logical", "Date"
),
as_arrow_table = FALSE,
bind_model_tasks = TRUE,
include_sample_ids = FALSE,
compound_taskid_set = NULL) {
round_idx <- hubUtils::get_round_idx(config_tasks, round_id)
checkmate::assert_list(compound_taskid_set, null.ok = TRUE)
output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)
round_config <- purrr::pluck(
config_tasks,
"rounds",
round_idx
)
task_id_l <- purrr::map(
round_config[["model_tasks"]],
~ .x[["task_ids"]] %>%
null_taskids_to_na()
) %>%
# Fix round_id value to current round_id in round_id variable column
fix_round_id(
round_id = round_id,
round_config = round_config,
round_ids = hubUtils::get_round_ids(config_tasks)
) %>%
process_grid_inputs(required_vals_only = required_vals_only)
# Get output type id property according to config schema version
# TODO: remove back-compatibility with schema versions < v2.0.0 when support
# retired
config_tid <- hubUtils::get_config_tid(config_tasks = config_tasks)
output_type_l <- purrr::map(
round_config[["model_tasks"]],
function(.x) {
.x[["output_type"]]
}
) %>%
purrr::map(
~ extract_mt_output_type_ids(.x, config_tid)
) %>%
process_grid_inputs(required_vals_only = required_vals_only) %>%
purrr::map(function(.x) {
purrr::compact(.x)
})
# Expand output grid individually for each modeling task and output type.
grid <- purrr::map2(
task_id_l, output_type_l,
~ expand_output_type_grid(
task_id_values = .x,
output_type_values = .y
)
)
if (include_sample_ids) {
if (is.null(compound_taskid_set)) {
compound_taskid_set <- get_round_compound_task_ids(config_tasks, round_id)
}
grid <- add_sample_idx(grid, round_config, config_tid, compound_taskid_set)
}
process_mt_grid_outputs(
grid,
config_tasks,
all_character = all_character,
as_arrow_table = as_arrow_table,
bind_model_tasks = bind_model_tasks,
output_type_id_datatype = output_type_id_datatype
)
}
# Extracts/collapses individual task ID values depending on whether all or just required
# values are needed.
process_grid_inputs <- function(x, required_vals_only = FALSE) {
if (required_vals_only) {
purrr::map(x, ~ .x %>% purrr::map(~ .x[["required"]]))
} else {
purrr::modify_depth(x, .depth = 2, ~ unlist(.x, use.names = FALSE))
}
}
# Function that expands modeling task level lists of task IDs and output type
# values into a grid and combines them into a single tibble.
expand_output_type_grid <- function(task_id_values,
output_type_values) {
purrr::imap(
output_type_values,
~ c(task_id_values, list(
output_type = .y,
output_type_id = .x
)) %>%
purrr::compact() %>%
expand.grid(stringsAsFactors = FALSE)
) %>%
purrr::list_rbind()
}
# Given expanded grids are constructed for specific rounds, this functions fixes
# the round_id in the any round_id variable column (if round_id_from_variable = TRUE)
fix_round_id <- function(x, round_id, round_config, round_ids) {
if (round_config[["round_id_from_variable"]] && !is.null(round_id)) {
round_id <- rlang::arg_match(round_id,
values = round_ids
)
round_id_var <- round_config[["round_id"]]
purrr::map(
x,
function(.x) {
purrr::imap(
.x,
function(.x, .y) {
if (.y == round_id_var) {
list(required = round_id, optional = NULL)
} else {
.x
}
}
)
}
)
} else {
x
}
}
# Function that processes lists of modeling tasks grids of output type values
# and task IDs by (depending on settings):
# - padding with NA columns.
# - applying the required schema and converting to arrow tables.
# - binding multiple modeling task grids together.
process_mt_grid_outputs <- function(x, config_tasks, all_character,
as_arrow_table = TRUE,
bind_model_tasks = TRUE,
output_type_id_datatype = output_type_id_datatype) {
if (bind_model_tasks) {
# To bind multiple modeling task grids together, we need to ensure they contain
# the same columns. Any missing columns are padded with NAs.
all_cols <- purrr::map(x, ~ names(.x)) %>%
unlist() %>%
unique()
schema_cols <- names(
hubData::create_hub_schema(
config_tasks,
partitions = NULL,
output_type_id_datatype = output_type_id_datatype
)
)
all_cols <- schema_cols[schema_cols %in% all_cols]
x <- purrr::map(x, ~ pad_missing_cols(.x, all_cols))
}
if (all_character) {
x <- purrr::map(
x, ~ hubData::coerce_to_character(
.x,
as_arrow_table = as_arrow_table
)
)
} else {
x <- purrr::map(
x,
~ hubData::coerce_to_hub_schema(
.x,
config_tasks,
as_arrow_table = as_arrow_table,
output_type_id_datatype = output_type_id_datatype
)
)
}
if (bind_model_tasks) {
return(do.call(rbind, x))
} else {
return(x)
}
}
# Pad any columns in all_cols missing in x of with new NA columns
pad_missing_cols <- function(x, all_cols) {
if (inherits(x, "data.frame")) {
x[, all_cols[!all_cols %in% names(x)]] <- NA
return(x[, all_cols])
}
if (inherits(x, "ArrowTabular")) {
missing_colnames <- setdiff(all_cols, names(x))
if (length(missing_colnames) == 0L) {
return(x)
}
missing_cols <- as.list(rep(NA, length(missing_colnames))) %>%
stats::setNames(missing_colnames) %>%
as.data.frame() %>%
arrow::arrow_table()
return(cbind(x, missing_cols)[, all_cols])
}
x
}
# Convert required value to NA in task IDs where both required and optional
# are as NA.
null_taskids_to_na <- function(model_task) {
to_na <- purrr::map_lgl(
model_task, ~ all(purrr::map_lgl(.x, is.null))
)
purrr::modify_if(model_task,
.p = to_na,
~ list(
required = NA,
optional = NULL
)
)
}
# Adds example sample ids to the output type id column which are unique
# across multiple modeling task groups. Only apply to v3 and above sample output
# type configurations.
add_sample_idx <- function(x, round_config, config_tid, compound_taskid_set = NULL) {
if (!is.null(compound_taskid_set) && length(compound_taskid_set) != length(x)) {
cli::cli_abort(
c("x" = "The length of {.var compound_taskid_set}
({.val {length(compound_taskid_set)}})
must match the number of modeling tasks ({.val {length(x)}})
in the round."),
call = rlang::caller_call()
)
}
spl_idx_0 <- 0L
for (i in seq_along(x)) {
# Check that the modeling task config has a v3 sample configuration
config_has_v3_spl <- purrr::pluck(
round_config[["model_tasks"]][[i]],
"output_type", "sample", "output_type_id_params"
) %>%
is.null() %>%
isFALSE()
# Check that x (the output df) has a sample output type (e.g. samples could be
# missing where only required values are requested but samples are optional)
x_has_spl <- "sample" %in% x[[i]][["output_type"]]
if (all(config_has_v3_spl, x_has_spl)) {
x[[i]] <- add_mt_sample_idx(
x = x[[i]],
config = round_config[["model_tasks"]][[i]],
start_idx = spl_idx_0,
config_tid,
comp_tids = compound_taskid_set[[i]]
)
spl_idx_0 <- spl_idx_0 + get_sample_n(x[[i]], config_tid)
}
}
x
}
# Add sample index to output type data frame of a single modeling task group
# according the the compound task ID set.
add_mt_sample_idx <- function(x, config, start_idx = 0L, config_tid, comp_tids = NULL,
call = rlang::caller_call(2)) {
x_names <- names(x)
task_ids <- setdiff(names(x), hubUtils::std_colnames)
spl <- x[
x[["output_type"]] == "sample",
task_ids
]
if (is.null(comp_tids)) {
# If the comp_tids are still NULL, then we assume that all compound task IDs
# are being set as compound task ids.
comp_tids <- task_ids
} else {
if (isFALSE(all(comp_tids %in% names(config$task_ids)))) {
cli::cli_abort(
c(
"x" = "{.val {setdiff(comp_tids, names(config$task_ids))}} {?is/are} not valid task ID{?s}.",
"i" = "The {.var compound_taskid_set} must be a subset of {.val {names(config$task_ids)}}."
),
call = call
)
}
}
type <- purrr::pluck(
config,
"output_type",
"sample",
"output_type_id_params",
"type"
)
if (is.null(comp_tids)) {
comp_tids <- names(spl)
} else {
# Check whether some compound task IDs have only optional values
# (i.e. the columns are missing in spl) and warn.
# Only do so though if a specific compound task ID set is provided in the config.
opt_comp_tids <- setdiff(comp_tids, names(spl))
if (length(opt_comp_tids) > 0) {
cli::cli_warn(
"The compound task ID{?s} {.field {opt_comp_tids}} ha{?s/ve} all optional values.
Representation of compound sample modeling tasks is not fully specified."
)
}
# subset to compound task IDs that are present in spl
comp_tids <- intersect(comp_tids, names(spl))
}
spl <- unique(spl[, comp_tids, drop = FALSE]) %>%
dplyr::mutate(
output_type = "sample",
output_type_id = seq_len(nrow(.)) + start_idx
) %>%
dplyr::left_join(spl, by = comp_tids)
if (!is.null(type) && type == "character") {
spl[[config_tid]] <- sprintf("s%s", spl[[config_tid]])
}
x[x[["output_type"]] != "sample", ] %>%
rbind(spl[, x_names, drop = FALSE])
}
get_sample_n <- function(x, config_tid) {
x[x[["output_type"]] == "sample", config_tid, drop = TRUE] %>%
unique() %>%
length()
}
extract_mt_output_type_ids <- function(x, config_tid) {
purrr::map(
x,
function(.x) {
if (config_tid %in% names(.x)) {
.x[[config_tid]]
} else if ("output_type_id_params" %in% names(.x)) {
if (.x[["output_type_id_params"]][["is_required"]]) {
list(required = NA, optional = NULL)
} else {
list(required = NULL, optional = NA)
}
} else {
NULL
}
}
)
}