Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

read and write updates #46

Open
wants to merge 18 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions R/column_metadata.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#' Verify that the item metadata supplied is the appropriate format
#'
#' This function does the following checks and consolidates to a single error message:
#' - Columns missing that must be present
#' - Columns present that are not permissible
#' - Columns with NAs that must be fully populated
#' - Columns that should be character or integer but aren't
#' - Within the dataType column, if the values are within the permissible list per
#'   the schema
#' - Within the targetDataType column, if the values are within the permissible list per
#'   the schema
#'
#' @param items A data.frame of variable-level (item) metadata, one row per
#'   variable, with columns such as itemOID, name, label, dataType,
#'   targetDataType, length, displayFormat, keySequence
#'
#' @return NULL, invisibly. Called for its side effect of raising a single
#'   consolidated error when any check fails.
#' @noRd
validate_dataset_columns <- function(items) {
  required_cols <- c("itemOID", "name", "label", "dataType")
  all_cols <- c("itemOID", "name", "label", "dataType", "targetDataType", "length", "displayFormat", "keySequence")

  # Check for missing or extraneous columns
  missing_cols <- setdiff(required_cols, names(items))
  err_missing_cols <- sprintf("Column `%s` is missing and must be present", missing_cols)
  additional_cols <- setdiff(names(items), all_cols)
  err_additional_cols <- sprintf("Column `%s` is not a permissible column", additional_cols)

  # Check for NAs in required columns
  any_nas <- vapply(items[intersect(required_cols, names(items))], function(X) any(is.na(X)), FUN.VALUE = TRUE)
  has_nas <- names(any_nas)[any_nas]
  err_nas <- sprintf("Column `%s` must not have NA values", has_nas)

  # Check columns that should be character
  char_cols <- intersect(c("itemOID", "name", "label", "dataType", "targetDataType", "displayFormat"), names(items))
  are_char_cols <- vapply(items[char_cols], is.character, FUN.VALUE = TRUE)
  not_char_cols <- names(are_char_cols)[!are_char_cols]
  err_char_cols <- sprintf("Column `%s` must be of type character", not_char_cols)

  # Check columns that should be integers
  int_cols <- intersect(c("length", "keySequence"), names(items))
  are_int_cols <- vapply(items[int_cols], is.integer, FUN.VALUE = TRUE)
  not_int_cols <- names(are_int_cols)[!are_int_cols]
  err_int_cols <- sprintf("Column `%s` must be of type integer", not_int_cols)

  # Check that dataType values are within the permissible list
  err_dataType_vars <- character()
  if ("dataType" %in% names(items)) {
    valid_dataTypes <- c("string", "integer", "float", "double", "decimal", "boolean",
                         "datetime", "date", "time", "URI")
    # Bug fix: this previously read `items$type`, which does not exist, making
    # the check vacuous (NULL %in% ... is logical(0)). Read `dataType` instead.
    bad_dataType <- !(items$dataType %in% valid_dataTypes)
    bad_dataType_vars <- items$name[bad_dataType]
    bad_dataType_vals <- items$dataType[bad_dataType]
    err_dataType_vars <- sprintf(
      paste("Variable %s has an invalid dataType value of %s.",
            "Must be one of string, integer, float, double, decimal, boolean, datetime, date, time, URI"),
      bad_dataType_vars, bad_dataType_vals
    )
  }

  # Check that targetDataType values are within the permissible list
  err_targetDataType_vars <- character()
  if ("targetDataType" %in% names(items)) {
    # Bug fix: previously read `items$type` (vacuous check). Also allow NA,
    # since targetDataType is optional and only populated for select variables.
    bad_targetDataType <- !(items$targetDataType %in% c("integer", "decimal")) &
      !is.na(items$targetDataType)
    bad_targetDataType_vars <- items$name[bad_targetDataType]
    bad_targetDataType_vals <- items$targetDataType[bad_targetDataType]
    err_targetDataType_vars <- sprintf(
      paste("Variable %s has an invalid targetDataType value of %s.",
            "Must be integer or decimal"),
      bad_targetDataType_vars, bad_targetDataType_vals
    )
  }

  all_errs <- c(err_missing_cols, err_additional_cols, err_nas, err_char_cols,
                err_int_cols, err_dataType_vars, err_targetDataType_vars)

  if (length(all_errs) > 0) {
    # Each issue is placed on its own indented line beneath a single header
    msg_prep <- paste0("\n\t", all_errs)
    err_msg <- paste0(c("Error: Issues found in items data:", msg_prep))
    stop(err_msg, call. = FALSE)
  }
}


#' Prepare variable-level metadata for a Dataset JSON object
#'
#' Validates the supplied column metadata, prepends the mandatory
#' ITEMGROUPDATASEQ record-identifier row when it is not already present, and
#' converts the resulting data.frame into a list of rows via df_to_list_rows().
#'
#' @param columns A data.frame of variable-level metadata (see
#'   validate_dataset_columns() for the expected columns)
#'
#' @return A list with one element per metadata row, as produced by
#'   df_to_list_rows()
#' @noRd
set_column_metadata <- function(columns) {

  # Check items before moving any further
  validate_dataset_columns(columns)

  # Attach in the variable metadata
  if (!("ITEMGROUPDATASEQ" %in% columns$itemOID)) {
    igds_row <- data.frame(
      itemOID = "ITEMGROUPDATASEQ",
      name = "ITEMGROUPDATASEQ",
      label = "Record Identifier",
      dataType = "integer"
    )

    # Match up columns and fill so rbind() sees identical column sets
    igds_row[setdiff(names(columns), names(igds_row))] <- NA
    columns[setdiff(names(igds_row), names(columns))] <- NA

    columns <- rbind(igds_row, columns)
  }

  # Return the converted rows directly: the previous trailing assignment
  # (`columns_converted <- ...`) returned the value invisibly for no benefit
  df_to_list_rows(columns)
}
61 changes: 34 additions & 27 deletions R/datasetjson.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
#' Create the base object used to write a Dataset JSON file.
#'
#' @details
#'
#' Note that DatasetJSON is on version 1.1.0. Based off findings from the pilot,
#' version 1.1.0 reflects feedback from the user community. Support for 1.0.0 has
#'
#' Note that DatasetJSON is on version 1.1.0. Based off findings from the pilot,
#' version 1.1.0 reflects feedback from the user community. Support for 1.0.0 has
#' been deprecated.
#'
#'
#' @param .data Input data to contain within the Dataset JSON file. Written to
#' the itemData parameter.
#' @param sys sourceSystem parameter, defined as "The computer system or
Expand All @@ -25,9 +25,10 @@
#' @param item_oid ID used to label dataset with the itemGroupData parameter.
#' Defined as "Object of Datasets. Key value is a unique identifier for
#' Dataset, corresponding to ItemGroupDef/@OID in Define-XML."
#' @param ref_data Boolean value that is set to "true" when the dataset contains
#' reference data (not subject data). The default value is "false".
#' @param ref_data Boolean value that is set to "true" when the dataset contains
#' reference data (not subject data). The default value is "false".
#' @param version The DatasetJSON version to use. Currently only 1.1.0 is supported.
#' @param columns Variable level metadata for the Dataset JSON object
#'
#' @return dataset_json object pertaining to the specific Dataset JSON version
#' specific
Expand All @@ -36,18 +37,19 @@
#' @examples
#' # Create a basic object
#' ds_json <- dataset_json(
#' iris,
#' iris,
#' file_oid = "/some/path",
#' last_modified = "2023-02-15T10:23:15",
#' originator = "Some Org",
#' sys = "source system",
#' sys_version = "1.0",
#' originator = "Some Org",
#' sys = "source system",
#' sys_version = "1.0",
#' study = "SOMESTUDY",
#' metadata_version = "MDV.MSGv2.0.SDTMIG.3.3.SDTM.1.7",
#' metadata_ref = "some/define.xml",
#' item_oid = "IG.IRIS",
#' name = "IRIS",
#' dataset_label = "Iris"
#' dataset_label = "Iris",
#' columns = iris_items
#' )
#'
#' # Attach attributes directly
Expand All @@ -62,14 +64,15 @@
#' ds_json_updated <- set_item_oid(ds_json_updated, "IG.IRIS")
#' ds_json_updated <- set_dataset_name(ds_json_updated, "IRIS")
#' ds_json_updated <- set_dataset_label(ds_json_updated, "Iris")
dataset_json <- function(.data, file_oid = NULL, last_modified=NULL,
originator=NULL, sys=NULL, sys_version = NULL,
study=NULL, metadata_version=NULL,metadata_ref=NULL,
item_oid=NULL, name=NULL, dataset_label=NULL, ref_data=FALSE,
version="1.1.0") {
new_dataset_json(.data, file_oid, last_modified, originator, sys, sys_version, study,
metadata_version, metadata_ref, item_oid, name, dataset_label, ref_data,
version)
#' ds_json_updates <- set_columns(ds_json_updated, iris_items)
dataset_json <- function(.data, file_oid=NULL, last_modified=NULL,
originator=NULL, sys=NULL, sys_version = NULL,
study=NULL, metadata_version=NULL,metadata_ref=NULL,
item_oid=NULL, name=NULL, dataset_label=NULL, ref_data=FALSE,
columns=NULL, version="1.1.0") {
new_dataset_json(.data, file_oid, last_modified, originator, sys, sys_version, study,
metadata_version, metadata_ref, item_oid, name, dataset_label, ref_data,
columns, version)
}

#' Create a base Dataset JSON Container
Expand All @@ -81,9 +84,9 @@ dataset_json <- function(.data, file_oid = NULL, last_modified=NULL,
#' @return datasetjson object
#'
#' @noRd
new_dataset_json <- function(.data, file_oid, last_modified, originator, sys, sys_version, study,
new_dataset_json <- function(.data, file_oid, last_modified, originator, sys, sys_version, study,
metadata_version, metadata_ref, item_oid, name, dataset_label,
ref_data, version) {
ref_data, columns, version) {

if (!(version %in% c("1.1.0"))) {
stop("Unsupported version specified - currently only version 1.1.0 is supported", call.=FALSE)
Expand All @@ -95,18 +98,18 @@ new_dataset_json <- function(.data, file_oid, last_modified, originator, sys, sy
)

# Extract the function and call it to return the base structure
funcs[[version]](.data, file_oid, last_modified, originator, sys, sys_version, study,
funcs[[version]](.data, file_oid, last_modified, originator, sys, sys_version, study,
metadata_version, metadata_ref, item_oid, name, dataset_label,
ref_data)
ref_data, columns)
}

#' Dataset JSON v1.1.0 Generator
#'
#' @return datasetjson_v1_1_0 object
#' @noRd
new_dataset_json_v1_1_0 <- function(.data, file_oid, last_modified, originator, sys, sys_version,
study, metadata_version, metadata_ref, item_oid, name,
dataset_label, ref_data) {
new_dataset_json_v1_1_0 <- function(.data, file_oid, last_modified, originator, sys, sys_version,
study, metadata_version, metadata_ref, item_oid, name,
dataset_label, ref_data, columns) {

if (!inherits(.data, 'data.frame')) {
stop("datasetjson objects must inherit from a data.frame", call.=FALSE)
Expand All @@ -130,7 +133,11 @@ new_dataset_json_v1_1_0 <- function(.data, file_oid, last_modified, originator,
attr(.data, 'name') <- name
attr(.data, 'label') <- dataset_label
attr(.data, 'isReferenceData') <- ref_data

if (!is.null(columns)) {
validate_dataset_columns(columns)
}
attr(.data, 'columns') <- set_column_metadata(columns)

structure(
.data,
class = c("datasetjson_v1_1_0", "datasetjson", "data.frame")
Expand Down
68 changes: 33 additions & 35 deletions R/read_dataset_json.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,59 +48,57 @@ read_dataset_json <- function(file) {
)
}



# Pull the object out with a lot of assumptions because the format has already
# been validated
dtype <- ifelse("clinicalData" %in% names(ds_json), "clinicalData", "referenceData")
d <- as.data.frame(ds_json[[dtype]]$itemGroupData[[1]]$itemData)
items <- ds_json[[dtype]]$itemGroupData[[1]]$items
# Pull the data and items
d <- as.data.frame(ds_json$rows)
items <- ds_json$columns

# Start setting attributes
colnames(d) <- items$name

# Process type conversions
tt <- items$type
tt <- items$dataType
tdt <- items$targetDataType
int_cols <- tt == "integer"
dbl_cols <- tt %in% c("float", "double", "decimal")
bool_cols <- tt == "boolean"
d[int_cols] <- lapply(d[int_cols], as.integer)
d[dbl_cols] <- lapply(d[dbl_cols], as.double)
d[bool_cols] <- lapply(d[bool_cols], as.logical)

# Grab date and datetime column info
fmts <- items$displayFormat
date_cols <- fmts %in% sas_date_formats
datetime_cols <- fmts %in% sas_datetime_formats
d[date_cols] <- lapply(d[date_cols], as.Date, origin="1960-01-01")
d[datetime_cols] <- lapply(d[datetime_cols], as.POSIXct, origin="1960-01-01")
date_cols <- tt %in% c("date") & tdt %in% "integer"
datetime_cols <- tt %in% c("datetime", "time") & tdt %in% "integer"
d[date_cols] <- lapply(d[date_cols], as.Date)
d[datetime_cols] <- lapply(d[datetime_cols], as.POSIXct)

# Apply variable labels
d[names(d)] <- lapply(items$name, set_col_attr, d, 'label', items)
d[names(d)] <- lapply(items$name, set_col_attr, d, 'OID', items)
d[names(d)] <- lapply(items$name, set_col_attr, d, 'length', items)
d[names(d)] <- lapply(items$name, set_col_attr, d, 'type', items)
d[names(d)] <- lapply(items$name, set_col_attr, d, 'keySequence', items)
d[names(d)] <- lapply(items$name, set_col_attr, d, 'displayFormat', items)

d <- d[,-1] # get rid of ITEMGROUPDATASEQ column

# Apply file and data level attributes
attr(d, 'creationDateTime') <- ds_json$creationDateTime
attr(d, 'datasetJSONVersion') <- ds_json$datasetJSONVersion
attr(d, 'fileOID') <- ds_json$fileOID
attr(d, 'asOfDateTime') <- ds_json$asOfDateTime
attr(d, 'originator') <- ds_json$originator
attr(d, 'sourceSystem') <- ds_json$sourceSystem
attr(d, 'sourceSystemVersion') <- ds_json$sourceSystemVersion
attr(d, 'name') <-ds_json[[dtype]]$itemGroupData[[1]]$name
attr(d, 'records') <-ds_json[[dtype]]$itemGroupData[[1]]$records
attr(d, 'label') <-ds_json[[dtype]]$itemGroupData[[1]]$label
ds_attr <- dataset_json(
d,
file_oid = ds_json$fileOID,
originator = ds_json$originator,
sys = ds_json$sourceSystem$name,
sys_version = ds_json$sourceSystem$version,
study = ds_json$studyOID,
metadata_version = ds_json$metaDataVersionOID,
metadata_ref = ds_json$metaDataRef,
item_oid = ds_json$itemGroupOID,
name = ds_json$name,
dataset_label = ds_json$label,
ref_data = ds_json$isReferenceData,
last_modified = ds_json$dbLastModifiedDateTime,
version = ds_json$datasetJSONVersion,
columns = ds_json$columns
)

# Apply records and column attribute
if(ds_json$records != nrow(d)) {
warning("The number of rows in the data does not match the number of records recorded in the metadata.")
}

# Still save the name of the element storing the dataset metadata
ds_json[[dtype]]$itemGroupData <- names(ds_json[[dtype]]$itemGroupData)
attr(ds_attr, 'records') <- ds_json$records

# Store the data metadata still within it's own list
attr(d, dtype) <- ds_json[[dtype]]
d
ds_attr
}
Loading
Loading