Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

read and write updates #46

Open
wants to merge 18 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions R/column_metadata.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#' Verify that the item metadata supplied is the appropriate format
#'
#' This function does the following checks and consolidates to a single error message:
#' - Columns missing that must be present
#' - Columns present that are not permissible
#' - Columns with NAs that must be fully populated
#' - Columns that should be character or integer but aren't
#' - Within the dataType column, if the values are within the permissible list per
#'   the schema
#' - Within the targetDataType column, if the values are within the permissible list per
#'   the schema
#'
#' @param items A data.frame of variable-level (item) metadata, one row per
#'   variable, with columns such as itemOID, name, label, dataType,
#'   targetDataType, length, displayFormat, keySequence
#'
#' @return NULL, invisibly. Called for its side effect of raising a single
#'   consolidated error when any check fails.
#' @noRd
validate_dataset_columns <- function(items) {
  required_cols <- c("itemOID", "name", "label", "dataType")
  all_cols <- c("itemOID", "name", "label", "dataType", "targetDataType", "length", "displayFormat", "keySequence")

  # Check for missing or extraneous columns
  missing_cols <- setdiff(required_cols, names(items))
  err_missing_cols <- sprintf("Column `%s` is missing and must be present", missing_cols)
  additional_cols <- setdiff(names(items), all_cols)
  err_additional_cols <- sprintf("Column `%s` is not a permissible column", additional_cols)

  # Check for NAs in required columns
  any_nas <- vapply(items[intersect(required_cols, names(items))], function(X) any(is.na(X)), FUN.VALUE = TRUE)
  has_nas <- names(any_nas)[any_nas]
  err_nas <- sprintf("Column `%s` must not have NA values", has_nas)

  # Check columns that should be character
  char_cols <- intersect(c("itemOID", "name", "label", "dataType", "targetDataType", "displayFormat"), names(items))
  are_char_cols <- vapply(items[char_cols], is.character, FUN.VALUE = TRUE)
  not_char_cols <- names(are_char_cols)[!are_char_cols]
  err_char_cols <- sprintf("Column `%s` must be of type character", not_char_cols)

  # Check columns that should be integers
  int_cols <- intersect(c("length", "keySequence"), names(items))
  are_int_cols <- vapply(items[int_cols], is.integer, FUN.VALUE = TRUE)
  not_int_cols <- names(are_int_cols)[!are_int_cols]
  err_int_cols <- sprintf("Column `%s` must be of type integer", not_int_cols)

  # Check that dataType values are within the permissible list
  err_dataType_vars <- character()
  if ("dataType" %in% names(items)) {
    valid_dataTypes <- c("string", "integer", "float", "double", "decimal", "boolean",
                         "datetime", "date", "time", "URI")
    # Bug fix: this previously read `items$type`, which does not exist, making
    # the check vacuous (NULL %in% ... is logical(0)). Read `dataType` instead.
    bad_dataType <- !(items$dataType %in% valid_dataTypes)
    bad_dataType_vars <- items$name[bad_dataType]
    bad_dataType_vals <- items$dataType[bad_dataType]
    err_dataType_vars <- sprintf(
      paste("Variable %s has an invalid dataType value of %s.",
            "Must be one of string, integer, float, double, decimal, boolean, datetime, date, time, URI"),
      bad_dataType_vars, bad_dataType_vals
    )
  }

  # Check that targetDataType values are within the permissible list
  err_targetDataType_vars <- character()
  if ("targetDataType" %in% names(items)) {
    # Bug fix: previously read `items$type` (vacuous check). Also allow NA,
    # since targetDataType is optional and only populated for select variables.
    bad_targetDataType <- !(items$targetDataType %in% c("integer", "decimal")) &
      !is.na(items$targetDataType)
    bad_targetDataType_vars <- items$name[bad_targetDataType]
    bad_targetDataType_vals <- items$targetDataType[bad_targetDataType]
    err_targetDataType_vars <- sprintf(
      paste("Variable %s has an invalid targetDataType value of %s.",
            "Must be integer or decimal"),
      bad_targetDataType_vars, bad_targetDataType_vals
    )
  }

  all_errs <- c(err_missing_cols, err_additional_cols, err_nas, err_char_cols,
                err_int_cols, err_dataType_vars, err_targetDataType_vars)

  if (length(all_errs) > 0) {
    # Each issue is placed on its own indented line beneath a single header
    msg_prep <- paste0("\n\t", all_errs)
    err_msg <- paste0(c("Error: Issues found in items data:", msg_prep))
    stop(err_msg, call. = FALSE)
  }
}


#' Prepare variable-level metadata for a Dataset JSON object
#'
#' Validates the supplied column metadata, prepends the mandatory
#' ITEMGROUPDATASEQ record-identifier row when it is not already present, and
#' converts the resulting data.frame into a list of rows via df_to_list_rows().
#'
#' @param columns A data.frame of variable-level metadata (see
#'   validate_dataset_columns() for the expected columns)
#'
#' @return A list with one element per metadata row, as produced by
#'   df_to_list_rows()
#' @noRd
set_column_metadata <- function(columns) {

  # Check items before moving any further
  validate_dataset_columns(columns)

  # Attach in the variable metadata
  if (!("ITEMGROUPDATASEQ" %in% columns$itemOID)) {
    igds_row <- data.frame(
      itemOID = "ITEMGROUPDATASEQ",
      name = "ITEMGROUPDATASEQ",
      label = "Record Identifier",
      dataType = "integer"
    )

    # Match up columns and fill so rbind() sees identical column sets
    igds_row[setdiff(names(columns), names(igds_row))] <- NA
    columns[setdiff(names(igds_row), names(columns))] <- NA

    columns <- rbind(igds_row, columns)
  }

  # Return the converted rows directly: the previous trailing assignment
  # (`columns_converted <- ...`) returned the value invisibly for no benefit
  df_to_list_rows(columns)
}
61 changes: 34 additions & 27 deletions R/datasetjson.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
#' Create the base object used to write a Dataset JSON file.
#'
#' @details
#'
#' Note that DatasetJSON is on version 1.1.0. Based off findings from the pilot,
#' version 1.1.0 reflects feedback from the user community. Support for 1.0.0 has
#'
#' Note that DatasetJSON is on version 1.1.0. Based off findings from the pilot,
#' version 1.1.0 reflects feedback from the user community. Support for 1.0.0 has
#' been deprecated.
#'
#'
#' @param .data Input data to contain within the Dataset JSON file. Written to
#' the itemData parameter.
#' @param sys sourceSystem parameter, defined as "The computer system or
Expand All @@ -25,9 +25,10 @@
#' @param item_oid ID used to label dataset with the itemGroupData parameter.
#' Defined as "Object of Datasets. Key value is a unique identifier for
#' Dataset, corresponding to ItemGroupDef/@OID in Define-XML."
#' @param ref_data Boolean value that is set to "true" when the dataset contains
#' reference data (not subject data). The default value is "false".
#' @param ref_data Boolean value that is set to "true" when the dataset contains
#' reference data (not subject data). The default value is "false".
#' @param version The DatasetJSON version to use. Currently only 1.1.0 is supported.
#' @param columns Variable level metadata for the Dataset JSON object
#'
#' @return dataset_json object pertaining to the specific Dataset JSON version
#' specific
Expand All @@ -36,18 +37,19 @@
#' @examples
#' # Create a basic object
#' ds_json <- dataset_json(
#' iris,
#' iris,
#' file_oid = "/some/path",
#' last_modified = "2023-02-15T10:23:15",
#' originator = "Some Org",
#' sys = "source system",
#' sys_version = "1.0",
#' originator = "Some Org",
#' sys = "source system",
#' sys_version = "1.0",
#' study = "SOMESTUDY",
#' metadata_version = "MDV.MSGv2.0.SDTMIG.3.3.SDTM.1.7",
#' metadata_ref = "some/define.xml",
#' item_oid = "IG.IRIS",
#' name = "IRIS",
#' dataset_label = "Iris"
#' dataset_label = "Iris",
#' columns = iris_items
#' )
#'
#' # Attach attributes directly
Expand All @@ -62,14 +64,15 @@
#' ds_json_updated <- set_item_oid(ds_json_updated, "IG.IRIS")
#' ds_json_updated <- set_dataset_name(ds_json_updated, "IRIS")
#' ds_json_updated <- set_dataset_label(ds_json_updated, "Iris")
dataset_json <- function(.data, file_oid = NULL, last_modified=NULL,
originator=NULL, sys=NULL, sys_version = NULL,
study=NULL, metadata_version=NULL,metadata_ref=NULL,
item_oid=NULL, name=NULL, dataset_label=NULL, ref_data=FALSE,
version="1.1.0") {
new_dataset_json(.data, file_oid, last_modified, originator, sys, sys_version, study,
metadata_version, metadata_ref, item_oid, name, dataset_label, ref_data,
version)
#' ds_json_updates <- set_columns(ds_json_updated, iris_items)
dataset_json <- function(.data, file_oid=NULL, last_modified=NULL,
originator=NULL, sys=NULL, sys_version = NULL,
study=NULL, metadata_version=NULL,metadata_ref=NULL,
item_oid=NULL, name=NULL, dataset_label=NULL, ref_data=FALSE,
columns=NULL, version="1.1.0") {
new_dataset_json(.data, file_oid, last_modified, originator, sys, sys_version, study,
metadata_version, metadata_ref, item_oid, name, dataset_label, ref_data,
columns, version)
}

#' Create a base Dataset JSON Container
Expand All @@ -81,9 +84,9 @@ dataset_json <- function(.data, file_oid = NULL, last_modified=NULL,
#' @return datasetjson object
#'
#' @noRd
new_dataset_json <- function(.data, file_oid, last_modified, originator, sys, sys_version, study,
new_dataset_json <- function(.data, file_oid, last_modified, originator, sys, sys_version, study,
metadata_version, metadata_ref, item_oid, name, dataset_label,
ref_data, version) {
ref_data, columns, version) {

if (!(version %in% c("1.1.0"))) {
stop("Unsupported version specified - currently only version 1.1.0 is supported", call.=FALSE)
Expand All @@ -95,18 +98,18 @@ new_dataset_json <- function(.data, file_oid, last_modified, originator, sys, sy
)

# Extract the function and call it to return the base structure
funcs[[version]](.data, file_oid, last_modified, originator, sys, sys_version, study,
funcs[[version]](.data, file_oid, last_modified, originator, sys, sys_version, study,
metadata_version, metadata_ref, item_oid, name, dataset_label,
ref_data)
ref_data, columns)
}

#' Dataset JSON v1.1.0 Generator
#'
#' @return datasetjson_v1_1_0 object
#' @noRd
new_dataset_json_v1_1_0 <- function(.data, file_oid, last_modified, originator, sys, sys_version,
study, metadata_version, metadata_ref, item_oid, name,
dataset_label, ref_data) {
new_dataset_json_v1_1_0 <- function(.data, file_oid, last_modified, originator, sys, sys_version,
study, metadata_version, metadata_ref, item_oid, name,
dataset_label, ref_data, columns) {

if (!inherits(.data, 'data.frame')) {
stop("datasetjson objects must inherit from a data.frame", call.=FALSE)
Expand All @@ -130,7 +133,11 @@ new_dataset_json_v1_1_0 <- function(.data, file_oid, last_modified, originator,
attr(.data, 'name') <- name
attr(.data, 'label') <- dataset_label
attr(.data, 'isReferenceData') <- ref_data

if (!is.null(columns)) {
validate_dataset_columns(columns)
}
attr(.data, 'columns') <- set_column_metadata(columns)

structure(
.data,
class = c("datasetjson_v1_1_0", "datasetjson", "data.frame")
Expand Down
68 changes: 33 additions & 35 deletions R/read_dataset_json.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,59 +48,57 @@ read_dataset_json <- function(file) {
)
}



# Pull the object out with a lot of assumptions because the format has already
# been validated
dtype <- ifelse("clinicalData" %in% names(ds_json), "clinicalData", "referenceData")
d <- as.data.frame(ds_json[[dtype]]$itemGroupData[[1]]$itemData)
items <- ds_json[[dtype]]$itemGroupData[[1]]$items
# Pull the data and items
d <- as.data.frame(ds_json$rows)
items <- ds_json$columns

# Start setting attributes
colnames(d) <- items$name

# Process type conversions
tt <- items$type
tt <- items$dataType
tdt <- items$targetDataType
int_cols <- tt == "integer"
dbl_cols <- tt %in% c("float", "double", "decimal")
bool_cols <- tt == "boolean"
d[int_cols] <- lapply(d[int_cols], as.integer)
d[dbl_cols] <- lapply(d[dbl_cols], as.double)
d[bool_cols] <- lapply(d[bool_cols], as.logical)

# Grab date and datetime column info
fmts <- items$displayFormat
date_cols <- fmts %in% sas_date_formats
datetime_cols <- fmts %in% sas_datetime_formats
d[date_cols] <- lapply(d[date_cols], as.Date, origin="1960-01-01")
d[datetime_cols] <- lapply(d[datetime_cols], as.POSIXct, origin="1960-01-01")
date_cols <- tt %in% c("date") & tdt %in% "integer"
datetime_cols <- tt %in% c("datetime", "time") & tdt %in% "integer"
d[date_cols] <- lapply(d[date_cols], as.Date)
d[datetime_cols] <- lapply(d[datetime_cols], as.POSIXct)

# Apply variable labels
d[names(d)] <- lapply(items$name, set_col_attr, d, 'label', items)
d[names(d)] <- lapply(items$name, set_col_attr, d, 'OID', items)
d[names(d)] <- lapply(items$name, set_col_attr, d, 'length', items)
d[names(d)] <- lapply(items$name, set_col_attr, d, 'type', items)
d[names(d)] <- lapply(items$name, set_col_attr, d, 'keySequence', items)
d[names(d)] <- lapply(items$name, set_col_attr, d, 'displayFormat', items)

d <- d[,-1] # get rid of ITEMGROUPDATASEQ column

# Apply file and data level attributes
attr(d, 'creationDateTime') <- ds_json$creationDateTime
attr(d, 'datasetJSONVersion') <- ds_json$datasetJSONVersion
attr(d, 'fileOID') <- ds_json$fileOID
attr(d, 'asOfDateTime') <- ds_json$asOfDateTime
attr(d, 'originator') <- ds_json$originator
attr(d, 'sourceSystem') <- ds_json$sourceSystem
attr(d, 'sourceSystemVersion') <- ds_json$sourceSystemVersion
attr(d, 'name') <-ds_json[[dtype]]$itemGroupData[[1]]$name
attr(d, 'records') <-ds_json[[dtype]]$itemGroupData[[1]]$records
attr(d, 'label') <-ds_json[[dtype]]$itemGroupData[[1]]$label
ds_attr <- dataset_json(
d,
file_oid = ds_json$fileOID,
originator = ds_json$originator,
sys = ds_json$sourceSystem$name,
sys_version = ds_json$sourceSystem$version,
study = ds_json$studyOID,
metadata_version = ds_json$metaDataVersionOID,
metadata_ref = ds_json$metaDataRef,
item_oid = ds_json$itemGroupOID,
name = ds_json$name,
dataset_label = ds_json$label,
ref_data = ds_json$isReferenceData,
last_modified = ds_json$dbLastModifiedDateTime,
version = ds_json$datasetJSONVersion,
columns = ds_json$columns
)

# Apply records and column attribute
if(ds_json$records != nrow(d)) {
warning("The number of rows in the data does not match the number of records recorded in the metadata.")
}

# Still save the name of the element storing the dataset metadata
ds_json[[dtype]]$itemGroupData <- names(ds_json[[dtype]]$itemGroupData)
attr(ds_attr, 'records') <- ds_json$records

# Store the data metadata still within it's own list
attr(d, dtype) <- ds_json[[dtype]]
d
ds_attr
}
Loading
Loading