hubverse-org · zkamvar · Jan 8, 2025 · Aug 12, 2024 · Aug 12, 2024 · Aug 12, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # hubValidations (development version)
 
+* `check_tbl_value_col_ascending()` will now use the order of the
+  `output_type_id` values as defined in the schema. This ensures that the
+  `output_type_id`s for `cdf` output types are always sorted in the correct
+  order (#78).
+
 # hubValidations 0.10.0
 
 * Added `check_tbl_derived_task_id_vals()` check to `validate_model_data()`

diff --git a/R/check_tbl_value_col_ascending.R b/R/check_tbl_value_col_ascending.R
@@ -11,8 +11,11 @@
 #' @inherit check_tbl_colnames params
 #' @inherit check_tbl_col_types return
 #' @export
-check_tbl_value_col_ascending <- function(tbl, file_path) {
-  if (all(!c("cdf", "quantile") %in% tbl[["output_type"]])) {
+check_tbl_value_col_ascending <- function(tbl, file_path, hub_path, round_id) {
+
+  # Exit early if there are no values to check
+  no_values_to_check <- all(!c("cdf", "quantile") %in% tbl[["output_type"]])
+  if (no_values_to_check) {
     return(
       capture_check_info(
         file_path,
@@ -22,8 +25,26 @@ check_tbl_value_col_ascending <- function(tbl, file_path) {
     )
   }
 
-  output_type_tbl <- split(tbl, tbl[["output_type"]])[c("cdf", "quantile")] %>%
-    purrr::compact()
+  # create a reference table restricted to the cdf and/or quantile output
+  # types, regardless of whether they are optional or required
+  config_tasks <- hubUtils::read_config(hub_path, "tasks")
+  round_output_types <- get_round_output_type_names(config_tasks, round_id)
+  only_cdf_or_quantile <- intersect(c("cdf", "quantile"), round_output_types)
+  reference_tbl <- expand_model_out_grid(
+    config_tasks = config_tasks,
+    round_id = round_id,
+    all_character = FALSE,
+    force_output_types = TRUE,
+    output_types = only_cdf_or_quantile
+  )
+
+  # FIX for <https://github.com/hubverse-org/hubValidations/issues/78>
+  # sort the table into config order by joining onto the reference table ----
+  tbl_sorted <- order_output_type_ids(tbl, reference_tbl)
+  # TODO: return an informative error or message if the table has no rows
+  # If this is the case, this likely means that there are invalid combinations
+  # of values.
+  output_type_tbl <- split_cdf_quantile(tbl_sorted)
 
   error_tbl <- purrr::map(
     output_type_tbl,
@@ -57,8 +78,8 @@ check_values_ascending <- function(tbl) {
   group_cols <- names(tbl)[!names(tbl) %in% hubUtils::std_colnames]
   tbl[["value"]] <- as.numeric(tbl[["value"]])
 
+  # group by all non-standard columns (i.e. the task ID columns)
   check_tbl <- dplyr::group_by(tbl, dplyr::across(dplyr::all_of(group_cols))) %>%
-    dplyr::arrange(.data$output_type_id, .by_group = TRUE) %>%
     dplyr::summarise(non_asc = any(diff(.data[["value"]]) < 0))
 
   if (!any(check_tbl$non_asc)) {
@@ -72,3 +93,45 @@ check_values_ascending <- function(tbl) {
     dplyr::ungroup() %>%
     dplyr::mutate(.env$output_type)
 }
+
+split_cdf_quantile <- function(tbl) {
+  split(tbl, tbl[["output_type"]])[c("cdf", "quantile")] %>%
+    purrr::compact()
+}
+
+#' Order the output type ids in the order of the config
+#'
+#' This function uses the output from [expand_model_out_grid()] to create
+#' a lookup table that contains the correct ordering for all of the output type
+#' IDs. Performing an inner join with this lookup table as the reference will
+#' auto sort the model output by the output type ID.
+#'
+#' @param tbl a model output table
+#' @param reference_tbl output from [expand_model_out_grid()]
+#'
+#' @note
+#' 1. this assumes that the output_type_id values in `tbl` are complete,
+#'    which is explicitly checked by [check_tbl_values_required()]
+#' 2. this assumes that both `tbl` and `reference_tbl` have the same column
+#'    types
+#' @noRd
+#' @examples
+#' reference_tbl <- data.frame(
+#'   target = c(rep("a", 3), rep("b", 5)),
+#'   output_type = rep("quantile", 8),
+#'   output_type_id = c("0", "0.5", "1", "0", "0.25", "0.5", "0.75", "1")
+#' )
+#' tbl <- reference_tbl
+#' tbl$value <- c(
+#'   seq(from = 0, to = 1, length.out = 3),
+#'   seq(from = 0, to = 1, length.out = 5)
+#' )
+#' order_output_type_ids(tbl[sample(nrow(tbl)), ], reference_tbl)
+order_output_type_ids <- function(tbl, reference_tbl) {
+  group_cols <- names(tbl)[!names(tbl) %in% hubUtils::std_colnames]
+  join_by <- c(group_cols, "output_type", "output_type_id")
+  lookup <- unique(reference_tbl[join_by])
+  tbl$output_type_id <- as.character(tbl$output_type_id)
+  lookup$output_type_id <- as.character(lookup$output_type_id)
+  dplyr::inner_join(lookup, tbl, by = join_by)
+}
diff --git a/R/validate_model_data.R b/R/validate_model_data.R
@@ -212,7 +212,9 @@ validate_model_data <- function(hub_path, file_path, round_id_col = NULL,
   checks$value_col_non_desc <- try_check(
     check_tbl_value_col_ascending(
       tbl,
-      file_path = file_path
+      file_path = file_path,
+      hub_path = hub_path,
+      round_id = round_id
     ), file_path
   )
 

diff --git a/man/check_tbl_value_col_ascending.Rd b/man/check_tbl_value_col_ascending.Rd
diff --git a/tests/testthat/_snaps/check_tbl_value_col_ascending.md b/tests/testthat/_snaps/check_tbl_value_col_ascending.md
@@ -1,7 +1,7 @@
 # check_tbl_value_col_ascending works
 
     Code
-      check_tbl_value_col_ascending(tbl, file_path)
+      check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
     Output
       <message/check_success>
       Message:
@@ -10,7 +10,7 @@
 ---
 
     Code
-      check_tbl_value_col_ascending(tbl, file_path)
+      check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
     Output
       <message/check_success>
       Message:
@@ -19,7 +19,7 @@
 # check_tbl_value_col_ascending works when output type IDs not ordered
 
     Code
-      check_tbl_value_col_ascending(tbl, file_path)
+      check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
     Output
       <message/check_success>
       Message:
@@ -28,7 +28,7 @@
 # check_tbl_value_col_ascending errors correctly
 
     Code
-      str(check_tbl_value_col_ascending(tbl, file_path))
+      str(check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id))
     Output
       List of 7
        $ message       : chr "Values in `value` column are not non-decreasing as output_type_ids increase for all unique task ID\n    value/o"| __truncated__
@@ -48,7 +48,8 @@
 ---
 
     Code
-      str(check_tbl_value_col_ascending(tbl_error, file_path))
+      str(check_tbl_value_col_ascending(tbl_error, file_path, hub_path, file_meta$
+        round_id))
     Output
       List of 7
        $ message       : chr "Values in `value` column are not non-decreasing as output_type_ids increase for all unique task ID\n    value/o"| __truncated__
@@ -58,7 +59,7 @@
        $ error_tbl     : tibble [1 x 5] (S3: tbl_df/tbl/data.frame)
         ..$ forecast_date: Date[1:1], format: "2023-05-08"
         ..$ horizon      : int 1
-        ..$ target       : chr "wk ahead inc covid hosp"
+        ..$ target       : chr "wk ahead inc flu hosp"
         ..$ location     : chr "US"
         ..$ output_type  : chr "quantile"
        $ call          : chr "check_tbl_value_col_ascending"
@@ -68,7 +69,8 @@
 ---
 
     Code
-      str(check_tbl_value_col_ascending(rbind(tbl, tbl_error), file_path))
+      str(check_tbl_value_col_ascending(rbind(tbl, tbl_error), file_path, hub_path,
+      file_meta$round_id))
     Output
       List of 7
        $ message       : chr "Values in `value` column are not non-decreasing as output_type_ids increase for all unique task ID\n    value/o"| __truncated__
@@ -78,7 +80,7 @@
        $ error_tbl     : tibble [1 x 5] (S3: tbl_df/tbl/data.frame)
         ..$ forecast_date: Date[1:1], format: "2023-05-08"
         ..$ horizon      : int 1
-        ..$ target       : chr "wk ahead inc covid hosp"
+        ..$ target       : chr "wk ahead inc flu hosp"
         ..$ location     : chr "US"
         ..$ output_type  : chr "quantile"
        $ call          : chr "check_tbl_value_col_ascending"
@@ -88,7 +90,7 @@
 # check_tbl_value_col_ascending skips correctly
 
     Code
-      check_tbl_value_col_ascending(tbl, file_path)
+      check_tbl_value_col_ascending(tbl, file_path, hub_path, file_meta$round_id)
     Output
       <message/check_info>
       Message: