From 16c1eec8bcf49f061fd8f3c39536be6b5c631b94 Mon Sep 17 00:00:00 2001 From: Unknown Date: Tue, 25 Oct 2022 09:06:23 -0400 Subject: [PATCH] Fixes #881 selection applied at SimulationSet level Deprecate evalDataFilter in favor of functions relying on dplyr::filter --- DESCRIPTION | 3 +- NAMESPACE | 3 + R/utilities-observed-data.R | 131 ++++++++++++++++++++++++---- man/getSelectedData.Rd | 45 ++++++++++ man/getSelectedRows.Rd | 45 ++++++++++ tests/testthat/test-observed-data.R | 80 +++++++++-------- tests/testthat/test-output.R | 20 ++--- 7 files changed, 265 insertions(+), 62 deletions(-) create mode 100644 man/getSelectedData.Rd create mode 100644 man/getSelectedRows.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 9254a7ff..bdf22ab8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,7 +36,8 @@ Imports: R6, ggplot2, jsonlite, - ospsuite.utils (>= 1.3.0) + ospsuite.utils (>= 1.3.0), + dplyr Suggests: testthat (>= 2.1.0), knitr, diff --git a/NAMESPACE b/NAMESPACE index 8a956aae..83171012 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -75,6 +75,8 @@ export(getPkAnalysisResultsFileNames) export(getPopulationSensitivityAnalysisResultsFileNames) export(getReportingEngineInfo) export(getResiduals) +export(getSelectedData) +export(getSelectedRows) export(getSimulationDescriptor) export(getSimulationParameterDisplayPaths) export(getSimulationResultFileNames) @@ -160,6 +162,7 @@ export(startQualificationRunner) export(trimFileName) export(updateSimulationIndividualParameters) export(vpcParameterPlot) +import(dplyr) import(ggplot2) import(jsonlite) import(ospsuite) diff --git a/R/utilities-observed-data.R b/R/utilities-observed-data.R index 6ccdb4e6..0d4ba803 100644 --- a/R/utilities-observed-data.R +++ b/R/utilities-observed-data.R @@ -85,6 +85,106 @@ readObservedDataFile <- function(fileName, return(observedData) } +#' @title getSelectedData +#' @description +#' Get selected data +#' The function leverage `dplyr::filter` to select the data +#' @param data A data.frame +#' @param dataSelection Character string or expression evaluated to select data +#' The enum helper `DataSelectionKeys` provides keys for selected all or none of the data +#' @return A data.frame of selected data +#' @export +#' @import dplyr +#' @seealso DataSelectionKeys +#' @examples +#' data <- data.frame( +#' x = seq(0,9), +#' y = seq(10,19), +#' mdv = c(1,1, rep(0, 8)), +#' groups = rep(c("A", "B"), 5) +#' ) +#' +#' # Select all the data +#' getSelectedData(data, DataSelectionKeys$ALL) +#' +#' # Select no data +#' getSelectedData(data, DataSelectionKeys$NONE) +#' +#' # Select data from group A +#' getSelectedData(data, "groups %in% 'A'") +#' +#' # Remove missing dependent variable (mdv) +#' getSelectedData(data, "mdv == 0") +#' +getSelectedData <- function(data, dataSelection) { + if(isEmpty(dataSelection)){ + return(data[FALSE,]) + } + if(isIncluded(dataSelection, DataSelectionKeys$ALL)){ + return(data) + } + if(isIncluded(dataSelection, DataSelectionKeys$NONE)){ + return(data[FALSE,]) + } + if(isOfType(dataSelection, "expression")){ + return(data %>% dplyr::filter(eval(dataSelection))) + } + return(data %>% dplyr::filter(eval(parse(text = dataSelection)))) +} + +#' @title getSelectedRows +#' @description +#' Get selected rows from data and its selection +#' The function leverage `dplyr::filter` to select the rows +#' @param data A data.frame +#' @param dataSelection Character string or expression evaluated to select data +#' The enum helper `DataSelectionKeys` provides keys for selected all or none of the data +#' @return A data.frame of selected data +#' @export +#' @import dplyr +#' @seealso DataSelectionKeys +#' @examples +#' data <- data.frame( +#' x = seq(0,9), +#' y = seq(10,19), +#' mdv = c(1,1, rep(0, 8)), +#' groups = rep(c("A", "B"), 5) +#' ) +#' +#' # Select all the rows +#' getSelectedRows(data, DataSelectionKeys$ALL) +#' +#' # Select no row +#' getSelectedRows(data, DataSelectionKeys$NONE) +#' +#' # Select rows from group A +#' getSelectedData(data, "groups %in% 'A'") +#' +#' # Get rows of missing dependent variable (mdv) +#' getSelectedRows(data, "mdv == 0") +#' +getSelectedRows <- function(data, dataSelection) { + if(isEmpty(dataSelection)){ + return(FALSE) + } + if(isIncluded(dataSelection, DataSelectionKeys$ALL)){ + return(TRUE) + } + if(isIncluded(dataSelection, DataSelectionKeys$NONE)){ + return(FALSE) + } + if(isOfType(dataSelection, "expression")){ + selectedData <- data %>% + dplyr::mutate(rows = 1:n()) %>% + dplyr::filter(eval(dataSelection)) + return(selectedData$rows) + } + selectedData <- data %>% + dplyr::mutate(rows = 1:n()) %>% + dplyr::filter(eval(parse(text = dataSelection))) + return(selectedData$rows) +} + #' @title evalDataFilter #' @description #' Evaluate a data filter by converting the variable names of the data.frame @@ -95,6 +195,7 @@ readObservedDataFile <- function(fileName, #' @return vector of logicals corresponding to the evaluation of the filter #' @export evalDataFilter <- function(data, filterExpression) { + .Deprecated("getSelectedRows") variableNames <- names(data) expressionList <- lapply( variableNames, @@ -146,6 +247,7 @@ loadObservedDataFromSimulationSet <- function(simulationSet) { re.tStoreFileMetadata(access = "read", filePath = simulationSet$observedDataFile) observedDataset <- readObservedDataFile(simulationSet$observedDataFile) + observedDataset <- getSelectedData(observedDataset, simulationSet$dataSelection) re.tStoreFileMetadata(access = "read", filePath = simulationSet$observedMetaDataFile) dictionary <- readObservedDataFile(simulationSet$observedMetaDataFile) @@ -182,10 +284,10 @@ loadObservedDataFromSimulationSet <- function(simulationSet) { # If unit was actually defined using output objects, overwrite current dvUnit for (output in simulationSet$outputs) { - if (isOfLength(output$dataUnit, 0)) { + if (isEmpty(output$dataUnit)) { next } - selectedRows <- evalDataFilter(observedDataset, output$dataSelection) + selectedRows <- getSelectedRows(observedDataset, output$dataSelection) observedDataset[selectedRows, dvUnitColumn] <- output$dataUnit } @@ -260,26 +362,21 @@ getObservedDataFromOutput <- function(output, data, dataMapping, molWeight, stru if (isEmpty(data)) { return() } - if (isEmpty(output$dataSelection)) { - return() - } - - selectedRows <- evalDataFilter(data, output$dataSelection) - logDebug(messages$selectedObservedDataForPath(output$path, sum(selectedRows))) - # If filter did not select any data, return empty dataset - if (sum(selectedRows) == 0) { + selectedData <- getSelectedData(data, output$dataSelection) + logDebug(messages$selectedObservedDataForPath(output$path, nrow(selectedData))) + if (isEmpty(selectedData)) { return() } # Get dimensions of observed data - dvDimensions <- unique(as.character(data[selectedRows, dataMapping$dimension])) - outputConcentration <- data[selectedRows, dataMapping$dv] + dvDimensions <- unique(as.character(selectedData[, dataMapping$dimension])) + outputConcentration <- selectedData[, dataMapping$dv] if (!isEmpty(output$displayUnit)) { for (dvDimension in dvDimensions) { if (is.na(dvDimension)) { next } - dvSelectedRows <- data[selectedRows, dataMapping$dimension] %in% dvDimension + dvSelectedRows <- selectedData[, dataMapping$dimension] %in% dvDimension outputConcentration[dvSelectedRows] <- ospsuite::toUnit( dvDimension, outputConcentration[dvSelectedRows], @@ -291,7 +388,7 @@ getObservedDataFromOutput <- function(output, data, dataMapping, molWeight, stru outputData <- data.frame( "Time" = ospsuite::toUnit( "Time", - data[selectedRows, dataMapping$time], + selectedData[, dataMapping$time], structureSet$simulationSet$timeUnit ), "Concentration" = outputConcentration, @@ -306,13 +403,13 @@ getObservedDataFromOutput <- function(output, data, dataMapping, molWeight, stru return(list(data = outputData, lloq = NULL)) } - lloqConcentration <- data[selectedRows, dataMapping$lloq] + lloqConcentration <- selectedData[, dataMapping$lloq] if (!isEmpty(output$displayUnit)) { for (dvDimension in dvDimensions) { if (is.na(dvDimension)) { next } - dvSelectedRows <- data[selectedRows, dataMapping$dimension] %in% dvDimension + dvSelectedRows <- selectedData[, dataMapping$dimension] %in% dvDimension lloqConcentration[dvSelectedRows] <- ospsuite::toUnit( dvDimension, lloqConcentration[dvSelectedRows], @@ -324,7 +421,7 @@ getObservedDataFromOutput <- function(output, data, dataMapping, molWeight, stru lloqOutput <- data.frame( "Time" = ospsuite::toUnit( "Time", - data[selectedRows, dataMapping$time], + selectedData[, dataMapping$time], structureSet$simulationSet$timeUnit ), "Concentration" = lloqConcentration, diff --git a/man/getSelectedData.Rd b/man/getSelectedData.Rd new file mode 100644 index 00000000..ac1c68d4 --- /dev/null +++ b/man/getSelectedData.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities-observed-data.R +\name{getSelectedData} +\alias{getSelectedData} +\title{getSelectedData} +\usage{ +getSelectedData(data, dataSelection) +} +\arguments{ +\item{data}{A data.frame} + +\item{dataSelection}{Character string or expression evaluated to select data +The enum helper `DataSelectionKeys` provides keys for selected all or none of the data} +} +\value{ +A data.frame of selected data +} +\description{ +Get selected data +The function leverage `dplyr::filter` to select the data +} +\examples{ +data <- data.frame( +x = seq(0,9), +y = seq(10,19), +mdv = c(1,1, rep(0, 8)), +groups = rep(c("A", "B"), 5) +) + +# Select all the data +getSelectedData(data, DataSelectionKeys$ALL) + +# Select no data +getSelectedData(data, DataSelectionKeys$NONE) + +# Select data from group A +getSelectedData(data, "groups \%in\% 'A'") + +# Remove missing dependent variable (mdv) +getSelectedData(data, "mdv == 0") + +} +\seealso{ +DataSelectionKeys +} diff --git a/man/getSelectedRows.Rd b/man/getSelectedRows.Rd new file mode 100644 index 00000000..ef83e636 --- /dev/null +++ b/man/getSelectedRows.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities-observed-data.R +\name{getSelectedRows} +\alias{getSelectedRows} +\title{getSelectedRows} +\usage{ +getSelectedRows(data, dataSelection) +} +\arguments{ +\item{data}{A data.frame} + +\item{dataSelection}{Character string or expression evaluated to select data +The enum helper `DataSelectionKeys` provides keys for selected all or none of the data} +} +\value{ +A data.frame of selected data +} +\description{ +Get selected rows from data and its selection +The function leverage `dplyr::filter` to select the rows +} +\examples{ +data <- data.frame( +x = seq(0,9), +y = seq(10,19), +mdv = c(1,1, rep(0, 8)), +groups = rep(c("A", "B"), 5) +) + +# Select all the rows +getSelectedRows(data, DataSelectionKeys$ALL) + +# Select no row +getSelectedRows(data, DataSelectionKeys$NONE) + +# Select rows from group A +getSelectedData(data, "groups \%in\% 'A'") + +# Get rows of missing dependent variable (mdv) +getSelectedRows(data, "mdv == 0") + +} +\seealso{ +DataSelectionKeys +} diff --git a/tests/testthat/test-observed-data.R b/tests/testthat/test-observed-data.R index 7e193751..96e55aa5 100644 --- a/tests/testthat/test-observed-data.R +++ b/tests/testthat/test-observed-data.R @@ -1,10 +1,10 @@ library(ospsuite.reportingengine) # Test data frame used as reference testDataFrame <- data.frame( - "ID" = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3), - "Time" = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4), + "ID" = rep(seq(1,3), each = 4), + "Time" = rep(seq(1,4), 3), "DV" = c(1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 4, 4), - "Group" = c("A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B"), + "Group" = rep(c("A", "B"), each = 6), stringsAsFactors = FALSE ) @@ -29,6 +29,15 @@ write.table(testDataFrame, row.names = FALSE ) +# Needs to update expect_equal due to mismatch in attribute "row.names" +# due to new dplyr package method +expect_dataframe <- function(x, y){ + row.names(x) <- 1:nrow(x) + row.names(y) <- 1:nrow(y) + expect_equal(x, y) +} + + context("Reading of Observed Data") test_that("readObservedDataFile can correctly guess separator and read csv and txt format for observed data", { @@ -53,42 +62,45 @@ test_that("readObservedDataFile throw an error if columns are inconsistent", { context("Data selection process") -test_that("'evalDataFilter' gets data.frame variable as 'data' and is consequently independent of input data.frame", { - testDataFrameA <- testDataFrame - testDataFrameB <- testDataFrame - filterExpression <- parse(text = "data") - expect_equal(testDataFrame, evalDataFilter(testDataFrameA, filterExpression)) - expect_equal(testDataFrame, evalDataFilter(testDataFrameB, filterExpression)) -}) - -test_that("'filterExpression' uses data.frame variable names as actual variable", { - for (variableName in names(testDataFrame)) { - filterExpression <- parse(text = variableName) - filterVariable <- evalDataFilter(testDataFrame, filterExpression) - expect_equal(testDataFrame[, variableName], filterVariable) - } +test_that("Selection Keys are well understood", { + expect_equal(testDataFrame, getSelectedData(testDataFrame, DataSelectionKeys$ALL)) + expect_true(ospsuite.utils::isEmpty(getSelectedData(testDataFrame, DataSelectionKeys$NONE))) + + expect_true(getSelectedRows(testDataFrame, DataSelectionKeys$ALL)) + expect_false(getSelectedRows(testDataFrame, DataSelectionKeys$NONE)) }) -test_that("'evalDataFilter' throw an error if variable name does not exist in data.frame", { - filterExpression <- parse(text = "wrongName") - expect_error(evalDataFilter(testDataFrame, filterExpression)) +test_that("'getSelectedData' and 'getSelectedRows' throw an error if variable name does not exist in data.frame", { + expect_error(getSelectedData(testDataFrame, "wrongName")) + expect_error(getSelectedData(testDataFrame, "wrongName")) }) -test_that("Correct expressions work the way they should", { - filterExpression <- parse(text = "ID == 1") - expect_equal( - c(TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE), - evalDataFilter(testDataFrame, filterExpression) - ) - filterExpression <- parse(text = "Time %in% 1") - expect_equal( - c(TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE), - evalDataFilter(testDataFrame, filterExpression) +test_that("Correct expressions work as expected and both methods can be used to select data", { + testSelection <- "ID == 1" + selectedRows <- which(testDataFrame$ID == 1) + expect_equal(selectedRows, getSelectedRows(testDataFrame, testSelection)) + expect_dataframe(testDataFrame[selectedRows,], getSelectedData(testDataFrame, testSelection)) + expect_dataframe( + testDataFrame[getSelectedRows(testDataFrame, testSelection),], + getSelectedData(testDataFrame, testSelection) + ) + + testSelection <- "Time %in% 1" + selectedRows <- which(testDataFrame$Time %in% 1) + expect_equal(selectedRows, getSelectedRows(testDataFrame, testSelection)) + expect_dataframe(testDataFrame[selectedRows,], getSelectedData(testDataFrame, testSelection)) + expect_dataframe( + testDataFrame[getSelectedRows(testDataFrame, testSelection),], + getSelectedData(testDataFrame, testSelection) ) - filterExpression <- parse(text = '!DV %in% 1 & Group %in% "A"') - expect_equal( - c(FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE), - evalDataFilter(testDataFrame, filterExpression) + + testSelection <- '!DV %in% 1 & Group %in% "A"' + selectedRows <- which(!(testDataFrame$DV %in% 1) & testDataFrame$Group %in% "A") + expect_equal(selectedRows, getSelectedRows(testDataFrame, testSelection)) + expect_dataframe(testDataFrame[selectedRows,], getSelectedData(testDataFrame, testSelection)) + expect_dataframe( + testDataFrame[getSelectedRows(testDataFrame, testSelection),], + getSelectedData(testDataFrame, testSelection) ) }) diff --git a/tests/testthat/test-output.R b/tests/testthat/test-output.R index 7f10d12b..83d3aac5 100644 --- a/tests/testthat/test-output.R +++ b/tests/testthat/test-output.R @@ -59,18 +59,18 @@ test_that("Output 'dataSelection' is checked and set properly", { expect_silent(Output$new(path = testPath, dataSelection = testDataSelection)) outputPath <- Output$new(path = testPath) - expect_null(outputPath$dataSelection) + expect_s3_class(outputPath, "Output") + # By default, no selection is used + expect_false(outputPath$dataSelection) outputFilter <- Output$new(path = testPath, dataSelection = testDataSelection) - expect_is(outputFilter$dataSelection, "expression") - expect_equivalent(deparse(outputFilter$dataSelection), deparse(parse(text = testDataSelection))) - - # Expressions can be input - outputFilter <- Output$new(path = testPath, dataSelection = parse(text = testDataSelection)) - expect_is(outputFilter$dataSelection, "expression") - expect_equivalent(deparse(outputFilter$dataSelection), deparse(parse(text = testDataSelection))) - - expect_error(Output$new(path = testPath, dataSelection = c(testDataSelection, testDataSelection))) + expect_s3_class(outputFilter, "Output") + + # Vectors are concatenated using "&" + outputVector <- Output$new(path = testPath, dataSelection = c(testDataSelection, testDataSelection)) + expect_length(outputVector$dataSelection, 1) + expect_true(grepl(pattern = "\\&", outputVector$dataSelection)) + expect_true(grepl(pattern = testDataSelection, outputVector$dataSelection)) }) myTestAUCName <- "AUC_inf"