diff --git a/DESCRIPTION b/DESCRIPTION index a45cfa47..0f9c01cc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,6 +50,7 @@ Suggests: BigKnn (>= 1.0.0), devtools, Eunomia, + glmnet, IterativeHardThresholding, knitr, markdown, diff --git a/NAMESPACE b/NAMESPACE index ad98ff62..173f8d29 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,14 +25,18 @@ export(createExecuteSettings) export(createExistingSplitSettings) export(createFeatureEngineeringSettings) export(createGlmModel) +export(createIterativeImputer) export(createLearningCurve) export(createLogSettings) export(createModelDesign) +export(createNormalization) export(createPlpResultTables) export(createPreprocessSettings) export(createRandomForestFeatureSelection) +export(createRareFeatureRemover) export(createRestrictPlpDataSettings) export(createSampleSettings) +export(createSimpleImputer) export(createSplineSettings) export(createStratifiedImputationSettings) export(createStudyPopulation) diff --git a/R/CyclopsModels.R b/R/CyclopsModels.R index 760f151d..2bb144fa 100644 --- a/R/CyclopsModels.R +++ b/R/CyclopsModels.R @@ -101,9 +101,9 @@ fitCyclopsModel <- function( noiseLevel = "silent", threads = settings$threads, maxIterations = settings$maxIterations, - seed = settings$seed + seed = settings$seed, + useKKTSwindle = FALSE ) - fit <- tryCatch({ ParallelLogger::logInfo('Running Cyclops') Cyclops::fitCyclopsModel( diff --git a/R/Imputation.R b/R/Imputation.R new file mode 100644 index 00000000..43c58da4 --- /dev/null +++ b/R/Imputation.R @@ -0,0 +1,774 @@ +# @file Imputation.R +# Copyright 2025 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
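The Cyclops change earlier in this diff pins useKKTSwindle = FALSE when building the fitting control. A minimal standalone sketch of the resulting control object, assuming the surrounding call is Cyclops::createControl() as the argument names suggest (the values here are placeholders for the corresponding settings):

library(Cyclops)
control <- createControl(
  noiseLevel = "silent",   # suppress optimizer chatter
  threads = 1,             # placeholder for settings$threads
  maxIterations = 3000,    # placeholder for settings$maxIterations
  seed = 42,               # placeholder for settings$seed
  useKKTSwindle = FALSE    # explicitly disable the KKT-swindle speedup
)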
+ +#' @title Create Iterative Imputer settings +#' @description This function creates the settings for an iterative imputer +#' which first removes features with more than `missingThreshold` missing values +#' and then imputes the missing values iteratively using chained equations +#' @param missingThreshold The threshold for missing values to remove a feature +#' @param method The method to use for imputation, currently only "pmm" is supported +#' @return The settings for the iterative imputer of class `featureEngineeringSettings` +#' @export +createIterativeImputer <- function(missingThreshold = 0.3, + method = "pmm") { + featureEngineeringSettings <- list( + missingThreshold = missingThreshold, + method = method + ) + if (method == "pmm") { + # at the moment this requires glmnet + rlang::check_installed("glmnet") + } + attr(featureEngineeringSettings, "fun") <- "iterativeImpute" + + class(featureEngineeringSettings) <- "featureEngineeringSettings" + return(featureEngineeringSettings) +} + +#' @title Create Simple Imputer settings +#' @description This function creates the settings for a simple imputer +#' which imputes missing values with the mean or median +#' @param method The method to use for imputation, either "mean" or "median" +#' @param missingThreshold The threshold for missing values to remove a feature +#' @return The settings for the simple imputer of class `featureEngineeringSettings` +#' @export +createSimpleImputer <- function(method = "mean", + missingThreshold = 0.3) { + checkIsClass(method, "character") + checkInStringVector(method, c("mean", "median")) + featureEngineeringSettings <- list( + method = method, + missingThreshold = missingThreshold + ) + attr(featureEngineeringSettings, "fun") <- "simpleImpute" + + class(featureEngineeringSettings) <- "featureEngineeringSettings" + return(featureEngineeringSettings) +} + +#' @title Simple Imputation +#' @description This function does single imputation with the mean or median +#' @param trainData The data to be imputed +#' @param featureEngineeringSettings The settings for the imputation +#' @param done Whether the imputation has already been done (bool) +#' @return The imputed data +simpleImpute <- function(trainData, featureEngineeringSettings, done = FALSE) { + if (!done) { + missingInfo <- extractMissingInfo(trainData) + trainData$covariateData$missingInfo <- missingInfo$missingInfo + continuousFeatures <- missingInfo$continuousFeatures + on.exit(trainData$covariateData$missingInfo <- NULL, add = TRUE) + + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(trainData$covariateData$missingInfo, by = "covariateId") %>% + dplyr::filter(is.na(.data$missing) || + .data$missing <= featureEngineeringSettings$missingThreshold) %>% + dplyr::select(-"missing") + + # separate the continuous and binary features + featureData <- separateFeatures(trainData, continuousFeatures) + numericData <- featureData[[1]] + on.exit(numericData <- NULL, add = TRUE) + + allRowIds <- numericData$covariates %>% + dplyr::pull(.data$rowId) %>% + unique() %>% + sort() + allColumnIds <- numericData$covariates %>% + dplyr::pull(.data$covariateId) %>% + unique() %>% + sort() + completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds) + numericData$covariates <- merge(completeIds, numericData$covariates, + all.x = TRUE + ) + + if (featureEngineeringSettings$method == "mean") { + numericData$imputedValues <- numericData$covariates %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::summarise(imputedValues = mean(.data$covariateValue, na.rm = TRUE)) + } else if
(featureEngineeringSettings$method == "median") { + numericData$imputedValues <- numericData$covariates %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::collect() %>% # median not possible in sql + dplyr::summarise(imputedValues = median(.data$covariateValue, na.rm = TRUE)) + } + + + numericData$imputedCovariates <- numericData$covariates %>% + dplyr::left_join(numericData$imputedValues, by = "covariateId") %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::mutate(imputedValue = ifelse(is.na(.data$covariateValue), + .data$imputedValues, + .data$covariateValue + )) %>% + dplyr::select(-c("imputedValues")) + Andromeda::appendToTable( + trainData$covariateData$covariates, + numericData$imputedCovariates %>% + dplyr::filter(is.na(.data$covariateValue)) %>% + dplyr::mutate(covariateValue = .data$imputedValue) %>% + dplyr::select(-c("imputedValue")) + ) + attr(featureEngineeringSettings, "missingInfo") <- + trainData$covariateData$missingInfo %>% + dplyr::collect() + attr(featureEngineeringSettings, "imputer") <- + numericData$imputedValues %>% dplyr::collect() + done <- TRUE + } else { + trainData$covariateData$missingInfo <- attr( + featureEngineeringSettings, + "missingInfo" + ) + on.exit(trainData$covariateData$missingInfo <- NULL, add = TRUE) + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(trainData$covariateData$missingInfo, by = "covariateId") %>% + dplyr::filter(is.na(.data$missing) || + .data$missing <= featureEngineeringSettings$missingThreshold) %>% + dplyr::select(-"missing") + + continuousFeatures <- trainData$covariateData$analysisRef %>% + dplyr::filter(.data$isBinary == "N") %>% + dplyr::select("analysisId") %>% + dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>% + dplyr::pull(.data$covariateId) + featureData <- separateFeatures(trainData, continuousFeatures) + numericData <- featureData[[1]] + on.exit(numericData <- NULL, add = TRUE) + # impute missing values + allRowIds <- numericData$covariates %>% + dplyr::pull(.data$rowId) %>% + unique() %>% + sort() + allColumnIds <- numericData$covariates %>% + dplyr::pull(.data$covariateId) %>% + unique() %>% + sort() + completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds) + numericData$covariates <- merge(completeIds, numericData$covariates, + all.x = TRUE + ) + numericData$imputedValues <- attr(featureEngineeringSettings, "imputer") + numericData$imputedCovariates <- numericData$covariates %>% + dplyr::left_join(numericData$imputedValues, by = "covariateId") %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::mutate(imputedValue = ifelse(is.na(.data$covariateValue), + .data$imputedValues, + .data$covariateValue + )) %>% + dplyr::select(-c("imputedValues")) + Andromeda::appendToTable( + trainData$covariateData$covariates, + numericData$imputedCovariates %>% + dplyr::filter(is.na(.data$covariateValue)) %>% + dplyr::mutate(covariateValue = .data$imputedValue) %>% + dplyr::select(-c("imputedValue")) + ) + + } + featureEngineering <- list( + funct = "simpleImpute", + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + done = done + ) + ) + attr(trainData, "metaData")$featureEngineering[["simpleImputer"]] <- + featureEngineering + return(trainData) +} + + +#' @title Iterative Imputation +#' @description This function imputes missing values iteratively using chained +#' equations with predictive mean matching +#' @param trainData The data to be imputed +#' @param featureEngineeringSettings The settings for the imputation +#' @param done Whether the imputation 
has already been done (bool) +#' @return The imputed data +iterativeImpute <- function(trainData, featureEngineeringSettings, done = FALSE) { + if (!done) { + missingInfo <- extractMissingInfo(trainData) + trainData$covariateData$missingInfo <- missingInfo$missingInfo + continuousFeatures <- missingInfo$continuousFeatures + on.exit(trainData$covariateData$missingInfo <- NULL, add = TRUE) + + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(trainData$covariateData$missingInfo, by = "covariateId") %>% + dplyr::filter(is.na(.data$missing) || + .data$missing <= featureEngineeringSettings$missingThreshold) %>% + dplyr::select(-"missing") + + # separate the continuous and binary features + featureData <- separateFeatures(trainData, continuousFeatures) + numericData <- featureData[[1]] + binary <- featureData[[2]] + on.exit(numericData <- NULL, add = TRUE) + on.exit(binary <- NULL, add = TRUE) + + numericData <- initializeImputation(numericData, "mean") + # add imputed values in data + iterativeImputeResults <- iterativeChainedImpute(numericData, + binary, + trainData, + featureEngineeringSettings, + direction = "ascending", + iterations = 5 + ) + + Andromeda::appendToTable( + trainData$covariateData$covariates, + iterativeImputeResults$numericData$imputedCovariates %>% + dplyr::filter(is.na(.data$covariateValue)) %>% + dplyr::mutate(covariateValue = .data$imputedValue) %>% + dplyr::select(-c("imputedValue")) + ) + + + attr(featureEngineeringSettings, "missingInfo") <- + trainData$covariateData$missingInfo %>% + dplyr::collect() + attr(featureEngineeringSettings, "imputer") <- iterativeImputeResults$modelInfo + attr(featureEngineeringSettings, "kdeEstimates") <- iterativeImputeResults$kdeEstimates + done <- TRUE + } else { + # remove data with more than missingThreshold + trainData$covariateData$missingInfo <- attr( + featureEngineeringSettings, + "missingInfo" + ) + on.exit(trainData$covariateData$missingInfo <- NULL, add = TRUE) + trainData$covariateData$covariateIsBinary <- trainData$covariateData$covariateRef %>% + dplyr::select("covariateId", "analysisId") %>% + dplyr::inner_join( + trainData$covariateData$analysisRef %>% + dplyr::select("analysisId", "isBinary"), + by = "analysisId" + ) %>% + dplyr::mutate(isBinary = .data$isBinary == "Y") %>% + dplyr::select(.data$covariateId, .data$isBinary) %>% + dplyr::compute() + on.exit(trainData$covariateData$covariateIsBinary <- NULL, add = TRUE) + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(trainData$covariateData$missingInfo, by = "covariateId") %>% + dplyr::left_join(trainData$covariateData$covariateIsBinary, by = "covariateId") %>% + dplyr::filter( + (!is.na(.data$missing) && .data$missing <= featureEngineeringSettings$missingThreshold) || + (is.na(.data$missing) && .data$isBinary) + ) %>% + dplyr::select(-"missing", -"isBinary") + + continuousFeatures <- trainData$covariateData$analysisRef %>% + dplyr::filter(.data$isBinary == "N") %>% + dplyr::select("analysisId") %>% + dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>% + dplyr::pull(.data$covariateId) + featureData <- separateFeatures(trainData, continuousFeatures) + numericData <- featureData[[1]] + binary <- featureData[[2]] + on.exit(numericData <- NULL, add = TRUE) + on.exit(binary <- NULL, add = TRUE) + # impute missing values + allRowIds <- numericData$covariates %>% + dplyr::pull(.data$rowId) %>% + unique() %>% + sort() + allColumnIds <- numericData$covariates 
%>% + dplyr::pull(.data$covariateId) %>% + unique() %>% + sort() + completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds) + # now we have NAs for missing combinations + numericData$covariates <- merge(completeIds, numericData$covariates, + all.x = TRUE + ) + + # get index of NAs for every feature to be imputed + numericData$missingIndex <- numericData$covariates %>% + dplyr::filter(is.na(.data$covariateValue)) %>% + dplyr::select(-c("covariateValue")) + on.exit(numericData$missingIndex <- NULL, add = TRUE) + + numericData$imputedCovariates <- numericData$covariates %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::mutate(imputedValue = .data$covariateValue) + on.exit(numericData$imputedCovariates <- NULL, add = TRUE) + + + varsToImpute <- numericData$missingIndex %>% + dplyr::pull(.data$covariateId) %>% + unique() + for (varId in varsToImpute) { + varName <- trainData$covariateData$covariateRef %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::pull(.data$covariateName) + ParallelLogger::logInfo("Imputing variable: ", varName) + numericData$y <- numericData$covariates %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::mutate(y = .data$covariateValue) %>% + dplyr::select("y", "rowId") + on.exit(numericData$y <- NULL, add = TRUE) + missIdx <- which(is.na(numericData$y %>% dplyr::pull(.data$y))) + numericData$X <- numericData$covariates %>% + dplyr::filter(.data$covariateId != varId) + on.exit(numericData$X <- NULL, add = TRUE) + Andromeda::appendToTable(numericData$X, binary$covariates) + numericData$xMiss <- numericData$X %>% + dplyr::filter(.data$rowId %in% !!allRowIds[missIdx]) + on.exit(numericData$xMiss <- NULL, add = TRUE) + + imputer <- attr(featureEngineeringSettings, "imputer")[[as.character(varId)]] + pmmResults <- pmmPredict(numericData, k = 5, imputer) + + # update imputations in data + numericData$imputedValues <- pmmResults$imputedValues + on.exit(numericData$imputedValues <- NULL, add = TRUE) + numericData$imputedCovariates <- numericData$imputedCovariates %>% + dplyr::left_join(numericData$imputedValues, + by = "rowId", + suffix = c("", ".new") + ) %>% + dplyr::mutate( + imputedValue = + dplyr::if_else(.data$covariateId == varId && + !is.na(.data$imputedValue.new), + .data$imputedValue.new, + .data$imputedValue + ) + ) %>% + dplyr::select(-"imputedValue.new") + } + # add imputed values in data + Andromeda::appendToTable( + trainData$covariateData$covariates, + numericData$imputedCovariates %>% + dplyr::filter(is.na(.data$covariateValue)) %>% + dplyr::mutate(covariateValue = .data$imputedValue) %>% + dplyr::select(-c("imputedValue")) + ) + } + featureEngineering <- list( + funct = "iterativeImpute", + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + done = done + ) + ) + attr(trainData, "metaData")$featureEngineering[["iterativeImputer"]] <- + featureEngineering + return(trainData) +} + +#' @title Predictive mean matching using lasso +#' @param numericData An andromeda object with the following fields: +#' xObs: covariates table for observed data +#' xMiss: covariates table for missing data +#' yObs: outcome variable that we want to impute +#' @param k The number of donors to use for matching (default 5) +#' @keywords internal +pmmFit <- function(data, k = 5) { + rlang::check_installed("glmnet") + data$rowMap <- data$xObs %>% + dplyr::group_by(.data$rowId) %>% + dplyr::summarise() %>% + dplyr::mutate( + oldRowId = .data$rowId, + newRowId = dplyr::row_number() + ) %>% + dplyr::select(c("newRowId", 
"oldRowId")) %>% + dplyr::compute() + on.exit(data$rowMap <- NULL, add = TRUE) + data$colMap <- data$xObs %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::summarise() %>% + dplyr::mutate( + oldCovariateId = .data$covariateId, + newCovariateId = dplyr::row_number() + ) %>% + dplyr::select(c("newCovariateId", "oldCovariateId")) + on.exit(data$colMap <- NULL, add = TRUE) + + data$xObs <- data$xObs %>% + dplyr::left_join(data$rowMap, by = c("rowId" = "oldRowId")) %>% + dplyr::left_join(data$colMap, by = c("covariateId" = "oldCovariateId")) %>% + dplyr::select( + rowId = "newRowId", + covariateId = "newCovariateId", + covariateValue = "covariateValue" + ) + + xObs <- Matrix::sparseMatrix( + i = data$xObs %>% dplyr::pull(.data$rowId), + j = data$xObs %>% dplyr::pull(.data$covariateId), + x = data$xObs %>% dplyr::pull(.data$covariateValue), + dims = c( + data$rowMap %>% dplyr::pull(.data$newRowId) %>% max(), + data$colMap %>% dplyr::pull(.data$newCovariateId) %>% max() + ) + ) + + fit <- glmnet::cv.glmnet(xObs, data$yObs %>% + dplyr::pull(.data$y), alpha = 1, nfolds = 3) + + # predict on both XObs and XMiss + predsObs <- predict(fit, xObs, fit$lambda.min) + data$xMiss <- data$xMiss %>% + dplyr::left_join( + data$xMiss %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::summarise( + n_unique = dplyr::n_distinct(.data$covariateValue), + max = max(.data$covariateValue, na.rm = TRUE), + min = min(.data$covariateValue, na.rm = TRUE), + ), + by = "covariateId" + ) %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::mutate( + covariateValue = ifelse(.data$n_unique > 2 & (.data$max - .data$max) > 0, + (.data$covariateValue - .data$min) / (.data$max - .data$min), + .data$covariateValue + ) + ) %>% + dplyr::ungroup() %>% + dplyr::select(-c("n_unique", "min", "max")) + data$rowMapMiss <- data$xMiss %>% + dplyr::group_by(.data$rowId) %>% + dplyr::summarise() %>% + dplyr::mutate( + oldRowId = .data$rowId, + newRowId = dplyr::row_number() + ) %>% + dplyr::select(c("newRowId", "oldRowId")) %>% + dplyr::compute() + on.exit(data$rowMapMiss <- NULL, add = TRUE) + data$xMiss <- data$xMiss %>% + dplyr::left_join(data$rowMapMiss, by = c("rowId" = "oldRowId")) %>% + dplyr::left_join(data$colMap, by = c("covariateId" = "oldCovariateId")) %>% + dplyr::select( + rowId = "newRowId", + covariateId = "newCovariateId", + covariateValue = "covariateValue" + ) + + xMiss <- Matrix::sparseMatrix( + i = data$xMiss %>% dplyr::pull(.data$rowId), + j = data$xMiss %>% dplyr::pull(.data$covariateId), + x = data$xMiss %>% dplyr::pull(.data$covariateValue), + dims = c( + data$xMiss %>% dplyr::pull(.data$rowId) %>% max(), + data$xMiss %>% dplyr::pull(.data$covariateId) %>% max() + ) + ) + + predsMiss <- predict(fit, xMiss, fit$lambda.min) + + # precompute mapping to use - straight from xId (row index) to + # covariateValue of donor + donorMapping <- data$rowMap %>% + dplyr::inner_join(data$yObs, by = c("oldRowId" = "rowId"), copy = TRUE) %>% + dplyr::pull(.data$y) + # for each missing value, find the k closest observed values + imputedValues <- numeric(nrow(xMiss)) + for (j in 1:nrow(xMiss)) { + distances <- abs(predsObs - predsMiss[j]) + donorIndices <- order(distances)[1:k] + donorValues <- donorMapping[donorIndices] + imputedValues[j] <- sample(donorValues, 1) + } + + results <- list() + results$imputedValues <- data.frame( + rowId = data$rowMapMiss %>% + dplyr::pull(.data$oldRowId), + imputedValue = imputedValues + ) + bestIndex <- which(fit$lambda == fit$lambda.min) + nonZero <- which(fit$glmnet.fit$beta[, bestIndex] != 0) 
+ nonZeroCovariateIds <- data$colMap %>% + dplyr::filter(.data$newCovariateId %in% nonZero) %>% + dplyr::pull(.data$oldCovariateId) + results$model <- list( + intercept = as.numeric(fit$glmnet.fit$a0[bestIndex]), + coefficients = data.frame( + covariateId = nonZeroCovariateIds, + values = as.numeric(fit$glmnet.fit$beta[nonZero, bestIndex]) + ), + predictions = data.frame( + rowId = data$rowMap %>% + dplyr::pull(.data$oldRowId), + prediction = as.numeric(predsObs) + ) + ) + return(results) +} + +pmmPredict <- function(data, k = 5, imputer) { + data$coefficients <- imputer$coefficients + predictionMissing <- data$xMiss %>% + dplyr::inner_join(data$coefficients, by = "covariateId") %>% + dplyr::mutate(values = .data$covariateValue * .data$values) %>% + dplyr::group_by(.data$rowId) %>% + dplyr::summarise(value = sum(.data$values, na.rm = TRUE)) %>% + dplyr::select("rowId", "value") + predictionMissing <- as.data.frame(predictionMissing) + predictionMissing$value <- predictionMissing$value + imputer$intercept + + + # precompute mapping to use - straight from xId (row index) to + # covariateValue of donor + donorMapping <- imputer$predictions %>% dplyr::pull(.data$prediction) + + # for each missing value, find the k closest observed values + nRows <- data$xMiss %>% + dplyr::pull(.data$rowId) %>% + dplyr::n_distinct() + imputedValues <- numeric(nRows) + predsObs <- imputer$predictions$prediction + for (j in 1:nRows) { + distances <- abs(predsObs - predictionMissing$value[j]) + donorIndices <- order(distances)[1:k] + donorValues <- donorMapping[donorIndices] + imputedValues[j] <- sample(donorValues, 1) + } + + results <- list() + results$imputedValues <- data.frame( + rowId = predictionMissing %>% + dplyr::pull(.data$rowId), + imputedValue = imputedValues + ) + return(results) +} + +extractMissingInfo <- function(trainData) { + total <- trainData$covariateData$covariates %>% + dplyr::summarise(total = dplyr::n_distinct(.data$rowId)) %>% + dplyr::pull() + continuousFeatures <- trainData$covariateData$analysisRef %>% + dplyr::filter(.data$isBinary == "N") %>% + + dplyr::select("analysisId") %>% + dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>% + dplyr::pull(.data$covariateId) + + missingInfo <- trainData$covariateData$covariates %>% + dplyr::filter(.data$covariateId %in% continuousFeatures) %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::summarise(counts = dplyr::n()) %>% + dplyr::collect() %>% # necessary because of integer division in sqlite + dplyr::mutate(missing = 1 - .data$counts / total) %>% + dplyr::select(c("covariateId", "missing")) + results <- list( + "missingInfo" = missingInfo, + "continuousFeatures" = continuousFeatures + ) + return(results) +} + +separateFeatures <- function(trainData, continuousFeatures) { + numericData <- Andromeda::andromeda() + numericData$covariates <- trainData$covariateData$covariates %>% + dplyr::filter(.data$covariateId %in% continuousFeatures) + numericData$covariateRef <- trainData$covariateData$covariateRef %>% + dplyr::filter(.data$covariateId %in% continuousFeatures) + + binaryData <- Andromeda::andromeda() + binaryData$covariates <- trainData$covariateData$covariates %>% + dplyr::filter(!.data$covariateId %in% !!continuousFeatures) + binaryData$covariateRef <- trainData$covariateData$covariateRef %>% + dplyr::filter(!.data$covariateId %in% !!continuousFeatures) + return(list(numericData, binaryData)) +} + +initializeImputation <- function(numericData, method = "mean") { + allRowIds <- numericData$covariates %>% + 
dplyr::pull(.data$rowId) %>% + unique() %>% + sort() + allColumnIds <- numericData$covariates %>% + dplyr::pull(.data$covariateId) %>% + unique() %>% + sort() + completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds) + numericData$covariates <- merge(completeIds, numericData$covariates, + all.x = TRUE + ) + + # get index of NAs for every feature to be imputed + numericData$missingIndex <- numericData$covariates %>% + dplyr::filter(is.na(.data$covariateValue)) %>% + dplyr::select(-c("covariateValue")) + + if (method == "mean") { + numericData$imputedCovariates <- numericData$covariates %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::mutate(imputedValue = ifelse(is.na(.data$covariateValue), + mean(.data$covariateValue, na.rm = TRUE), + .data$covariateValue + )) + } else { + stop("Unknown initialization method: ", method) + } + return(numericData) +} + +# Main (M)ICE algorithm - iterative imputation with chained equations +iterativeChainedImpute <- function(numericData, + binaryData, + originalData, + featureEngineeringSettings, + direction = "ascending", + iterations = 5) { + prevImputations <- list() + allRowIds <- numericData$covariates %>% + dplyr::pull(.data$rowId) %>% + unique() %>% + sort() + maxIter <- iterations# TODO check + varsToImpute <- numericData$missingIndex %>% + dplyr::pull(.data$covariateId) %>% + unique() + convergenceParameters <- list() + modelInfo <- list() + + for (iter in 1:maxIter) { + ParallelLogger::logInfo("Imputation iteration: ", iter) + currentImputations <- list() + + # TODO do in order from least missing to most missing + for (varId in varsToImpute) { + varName <- originalData$covariateData$covariateRef %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::pull(.data$covariateName) + ParallelLogger::logInfo("Imputing variable: ", varName) + numericData$y <- numericData$covariates %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::mutate(y = .data$covariateValue) %>% + dplyr::select("y", "rowId") + on.exit(numericData$y <- NULL, add = TRUE) + obsIdx <- which(!is.na(numericData$y %>% dplyr::pull(.data$y))) + missIdx <- which(is.na(numericData$y %>% dplyr::pull(.data$y))) + numericData$yObs <- numericData$y %>% + dplyr::filter(.data$rowId %in% !!allRowIds[obsIdx]) + on.exit(numericData$yObs <- NULL, add = TRUE) + + numericData$X <- numericData$imputedCovariates %>% + dplyr::filter(.data$covariateId != varId) %>% + dplyr::mutate(covariateValue = .data$imputedValue) %>% + dplyr::select(-c("imputedValue")) + on.exit(numericData$X <- NULL, add = TRUE) + Andromeda::appendToTable(numericData$X, binaryData$covariates) + numericData$xObs <- numericData$X %>% dplyr::filter(.data$rowId %in% !!allRowIds[obsIdx]) + on.exit(numericData$xObs <- NULL, add = TRUE) + numericData$xMiss <- numericData$X %>% dplyr::filter(.data$rowId %in% !!allRowIds[missIdx]) + on.exit(numericData$xMiss <- NULL, add = TRUE) + + pmmResults <- pmmFit(numericData, k = 5) + + # update imputations in data + numericData$imputedValues <- pmmResults$imputedValues + on.exit(numericData$imputedValues <- NULL, add = TRUE) + numericData$imputedCovariates <- numericData$imputedCovariates %>% + dplyr::left_join(numericData$imputedValues, + by = "rowId", + suffix = c("", ".new") + ) %>% + dplyr::mutate( + imputedValue = + dplyr::if_else(.data$covariateId == varId && + !is.na(.data$imputedValue.new), + .data$imputedValue.new, + .data$imputedValue + ) + ) %>% + dplyr::select(-"imputedValue.new") + + # store current imputations for convergence check + 
currentImputations[[as.character(varId)]] <- pmmResults$imputedValues$imputedValue + + # store pmm info for each variable + modelInfo[[as.character(varId)]] <- pmmResults$model + } + + # save values for convergence checking afterwards + # store mean and variance of imputed values for each variable + # as well as average change from previous iteration + meanVector <- numeric(length(varsToImpute)) + varVector <- numeric(length(varsToImpute)) + idx <- 1 + for (varId in varsToImpute) { + currentImputation <- currentImputations[[as.character(varId)]] + meanVector[idx] <- mean(currentImputation) + varVector[idx] <- var(currentImputation) + idx <- idx + 1 + } + convergenceInfo <- list( + meanVector = meanVector, + varVector = varVector + ) + if (iter > 1) { + meanVarChange <- numeric(0) + for (varId in varsToImpute) { + prevImputation <- prevImputations[[as.character(varId)]] + currentImputation <- currentImputations[[as.character(varId)]] + meanVarChange <- c( + meanVarChange, + mean(abs(currentImputation - prevImputation)) + ) + } + convergenceInfo$meanVarChange <- meanVarChange + } + convergenceParameters[[iter]] <- convergenceInfo + + prevImputations <- currentImputations + } + + # calculate kde estimates of imputed and observed distributions per imputed variable + # and store in featureEngineeringSettings + kdeEstimates <- list() + for (varId in varsToImpute) { + varName <- originalData$covariateData$covariateRef %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::pull(.data$covariateName) + rows <- numericData$missingIndex %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::pull(.data$rowId) + imputedValues <- numericData$imputedCovariates %>% + dplyr::filter( + .data$covariateId == varId, + .data$rowId %in% rows + ) %>% + dplyr::pull(.data$imputedValue) + observedValues <- numericData$covariates %>% + dplyr::filter( + .data$covariateId == varId, + !is.na(.data$covariateValue) + ) %>% + dplyr::pull(.data$covariateValue) + kdeEstimates[[as.character(varId)]] <- list( + imputed = density(imputedValues), + observed = density(observedValues) + ) + } + results <- list( + "numericData" = numericData, + "convergenceParameters" = convergenceParameters, + "modelInfo" = modelInfo, + "kdeEstimates" = kdeEstimates + ) + return(results) +} diff --git a/R/PreprocessingData.R b/R/PreprocessingData.R index 834d27a8..f7e53576 100644 --- a/R/PreprocessingData.R +++ b/R/PreprocessingData.R @@ -2,13 +2,13 @@ # Copyright 2021 Observational Health Data Sciences and Informatics # # This file is part of PatientLevelPrediction -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
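The donor-matching step shared by pmmFit() and pmmPredict() above can be illustrated in isolation. A minimal in-memory sketch, assuming plain numeric vectors instead of the Andromeda tables used in the package (pmmMatch() and the toy data are illustrative names, not part of this diff):

set.seed(42)
pmmMatch <- function(predsObs, yObs, predsMiss, k = 5) {
  # for each predicted value on a missing row, take the k observed rows with
  # the closest predictions and sample one of their observed values as donor
  vapply(predsMiss, function(p) {
    donorIndices <- order(abs(predsObs - p))[seq_len(k)]
    sample(yObs[donorIndices], 1)
  }, numeric(1))
}
yObs <- c(118, 134, 121, 145, 110, 126, 139, 115)     # observed values of the target covariate
predsObs <- c(120, 131, 123, 141, 112, 128, 137, 117) # lasso predictions on observed rows
predsMiss <- c(125, 140)                              # lasso predictions on rows needing imputation
pmmMatch(predsObs, yObs, predsMiss)                   # two imputed values drawn from nearby donors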
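As a usage sketch for the two imputer settings defined in Imputation.R above (hedged: the plumbing into a model design is an assumption based on how other featureEngineeringSettings objects are consumed, e.g. via createModelDesign()):

library(PatientLevelPrediction)
# mean/median imputation, dropping features with more than 30% missingness
simpleImputer <- createSimpleImputer(method = "median", missingThreshold = 0.3)
# chained-equations imputation with predictive mean matching
iterativeImputer <- createIterativeImputer(missingThreshold = 0.3, method = "pmm")
# assumed wiring into a model design:
# modelDesign <- createModelDesign(..., featureEngineeringSettings = list(iterativeImputer))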
@@ -20,34 +20,31 @@ #' @details #' Returns an object of class \code{preprocessingSettings} that specifies how to preprocess the training data #' -#' @param minFraction The minimum fraction of target population who must have a covariate for it to be included in the model training +#' @param minFraction The minimum fraction of target population who must have a covariate for it to be included in the model training #' @param normalize Whether to normalise the covariates before training (Default: TRUE) #' @param removeRedundancy Whether to remove redundant features (Default: TRUE) #' @return #' An object of class \code{preprocessingSettings} #' @export createPreprocessSettings <- function( - minFraction = 0.001, - normalize = TRUE, - removeRedundancy = TRUE - ){ - - checkIsClass(minFraction, c('numeric','integer')) - checkHigherEqual(minFraction,0) - - checkIsClass(normalize, c("logical")) + minFraction = 0.001, + normalize = TRUE, + removeRedundancy = TRUE) { + checkIsClass(minFraction, c("numeric", "integer")) + checkHigherEqual(minFraction, 0) + + checkIsClass(normalize, c("logical")) + + checkIsClass(removeRedundancy, c("logical")) + + preprocessingSettings <- list( + minFraction = minFraction, + normalize = normalize, + removeRedundancy = removeRedundancy + ) - checkIsClass(removeRedundancy, c("logical")) - - preprocessingSettings <- list( - minFraction = minFraction, - normalize = normalize, - removeRedundancy = removeRedundancy - ) - class(preprocessingSettings) <- "preprocessSettings" return(preprocessingSettings) - } @@ -57,39 +54,277 @@ createPreprocessSettings <- function( #' @details #' Returns an object of class \code{covariateData} that has been processed #' -#' @param covariateData The covariate part of the training data created by \code{splitData} after being sampled and having -#' any required feature engineering -#' @param preprocessSettings The settings for the preprocessing created by \code{createPreprocessSettings} +#' @param covariateData The covariate part of the training data created by \code{splitData} after being sampled and having +#' any required feature engineering +#' @param preprocessSettings The settings for the preprocessing created by \code{createPreprocessSettings} #' @return -#' The data processed -preprocessData <- function (covariateData, - preprocessSettings){ - +#' The data processed +preprocessData <- function(covariateData, + preprocessSettings) { metaData <- attr(covariateData, "metaData") preprocessSettingsInput <- preprocessSettings # saving this before adding covariateData - + checkIsClass(covariateData, c("CovariateData")) checkIsClass(preprocessSettings, c("preprocessSettings")) - - ParallelLogger::logDebug(paste0('minFraction: ', preprocessSettings$minFraction)) - ParallelLogger::logDebug(paste0('normalize: ', preprocessSettings$normalize)) - ParallelLogger::logDebug(paste0('removeRedundancy: ', preprocessSettings$removeRedundancy)) - + + ParallelLogger::logDebug(paste0("minFraction: ", preprocessSettings$minFraction)) + ParallelLogger::logDebug(paste0("normalize: ", preprocessSettings$normalize)) + ParallelLogger::logDebug(paste0("removeRedundancy: ", preprocessSettings$removeRedundancy)) + preprocessSettings$covariateData <- covariateData covariateData <- do.call(FeatureExtraction::tidyCovariateData, preprocessSettings) - - #update covariateRef + + # update covariateRef removed <- unique(c( attr(covariateData, "metaData")$deletedInfrequentCovariateIds, attr(covariateData, "metaData")$deletedRedundantCovariateIds - ) - ) - 
covariateData$covariateRef <- covariateData$covariateRef %>% - dplyr::filter(!.data$covariateId %in% removed) - + )) + covariateData$covariateRef <- covariateData$covariateRef %>% + dplyr::filter(!.data$covariateId %in% removed) + metaData$tidyCovariateDataSettings <- attr(covariateData, "metaData") metaData$preprocessSettings <- preprocessSettingsInput attr(covariateData, "metaData") <- metaData - + return(covariateData) } + +#' A function that normalizes continuous features to have values between 0 and 1 +#' @details uses (value - min) / (max - min) to normalize the data +#' @param trainData The training data to be normalized +#' @param featureEngineeringSettings The settings for the normalization +#' @param normalized Whether the data has already been normalized (bool) +#' @return The normalized data +minMaxNormalize <- function(trainData, featureEngineeringSettings, normalized = FALSE) { + if (!normalized) { + # fit the normalization + # find continuous features from trainData$covariateData$analysisRef + continousFeatures <- trainData$covariateData$analysisRef %>% + dplyr::filter(.data$isBinary == "N") %>% + dplyr::select("analysisId") %>% + dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>% + dplyr::pull(.data$covariateId) + + # get max of each feature + trainData$covariateData$minMaxs <- trainData$covariateData$covariates %>% + dplyr::filter(.data$covariateId %in% continousFeatures) %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::summarise( + max = max(.data$covariateValue, na.rm = TRUE), + min = min(.data$covariateValue, na.rm = TRUE) + ) %>% + dplyr::collect() + on.exit(trainData$covariateData$minMaxs <- NULL, add = TRUE) + + # save the normalization + attr(featureEngineeringSettings, "minMaxs") <- + trainData$covariateData$minMaxs %>% dplyr::collect() + + # apply the normalization to trainData + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(trainData$covariateData$minMaxs, by = "covariateId") %>% + # use ifelse to only normalize if min and max are not NA as is the case + # for continuous features, else return original value + dplyr::mutate(covariateValue = ifelse(!is.na(min) & !is.na(max), + (.data$covariateValue - min) / (max - min), + .data$covariateValue + )) %>% + dplyr::select(-c("max", "min")) + trainData$covariateData$minMaxs <- NULL + normalized <- TRUE + } else { + # apply the normalization to test data by using saved normalization values + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(attr(featureEngineeringSettings, "minMaxs"), + by = "covariateId", copy = TRUE + ) %>% + dplyr::mutate(covariateValue = ifelse(!is.na(min) & !is.na(max), + (.data$covariateValue - min) / (max - min), + .data$covariateValue + )) %>% + dplyr::select(-c("max", "min")) + } + featureEngineering <- list( + funct = "minMaxNormalize", + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + normalized = normalized + ) + ) + + attr(trainData, "metaData")$featureEngineering[["minMaxNormalize"]] <- + featureEngineering + + return(trainData) +} + +#' A function that normalizes continuous features by the interquartile range and forces +#' the resulting values to be between -3 and 3 with f(x) = x / sqrt(1 + (x/3)^2) +#' @details uses (value - median) / iqr to normalize the data and then +#' applies the function f(x) = x / sqrt(1 + (x/3)^2) to the normalized values.
+#' This forces the values to be between -3 and 3 while preserving the relative +#' ordering of the values. +#' based on https://arxiv.org/abs/2407.04491 for more details +#' @param trainData The training data to be normalized +#' @param featureEngineeringSettings The settings for the normalization +#' @param normalized Whether the data has already been normalized (bool) +#' @return The normalized data +robustNormalize <- function(trainData, featureEngineeringSettings, normalized = FALSE) { + if (!normalized) { + # find continuous features from trainData$covariateData$analysisRef + continousFeatures <- trainData$covariateData$analysisRef %>% + dplyr::filter(.data$isBinary == "N") %>% + dplyr::select("analysisId") %>% + dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>% + dplyr::pull(.data$covariateId) + + # get (25, 75)% quantiles of each feature + # sqlite (used by Andromeda) doesn't have quantile function, so we need to load the extension + # to get the upper_quartile and lower_quartile functions + RSQLite::initExtension(trainData$covariateData, "math") + + trainData$covariateData$quantiles <- trainData$covariateData$covariates %>% + dplyr::filter(.data$covariateId %in% continousFeatures) %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::summarise( + q25 = dplyr::sql("lower_quartile(covariateValue)"), + q75 = dplyr::sql("upper_quartile(covariateValue)"), + median = median(.data$covariateValue, na.rm = TRUE) + ) %>% + dplyr::mutate(iqr = .data$q75 - .data$q25) %>% + dplyr::select(-c("q75", "q25")) %>% + dplyr::collect() + on.exit(trainData$covariateData$quantiles <- NULL, add = TRUE) + + # save the normalization + attr(featureEngineeringSettings, "quantiles") <- + trainData$covariateData$quantiles %>% dplyr::collect() + + # apply the normalization to trainData + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(trainData$covariateData$quantiles, by = "covariateId") %>% + # use ifelse to only normalize continuous features + dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median), + (.data$covariateValue - .data$median) / .data$iqr, + .data$covariateValue + )) %>% + # smoothly clip the range to [-3, 3] with x / sqrt(1 + (x/3)^2) + # ref: https://arxiv.org/abs/2407.04491 + dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median), + .data$covariateValue / sqrt(1 + (.data$covariateValue / 3)^2), + .data$covariateValue + )) %>% + dplyr::select(-c("median", "iqr")) + normalized <- TRUE + } else { + # apply the normalization to test data by using saved normalization values + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(attr(featureEngineeringSettings, "quantiles"), + by = "covariateId", copy = TRUE + ) %>% + dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median), + (.data$covariateValue - .data$median) / .data$iqr, + .data$covariateValue + )) %>% + dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median), + .data$covariateValue / sqrt(1 + (.data$covariateValue / 3)^2), + .data$covariateValue + )) %>% + dplyr::select(-c("median", "iqr")) + } + featureEngineering <- list( + funct = "robustNormalize", + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + normalized = normalized + ) + ) + + attr(trainData, "metaData")$featureEngineering[["robustNormalize"]] <- + featureEngineering + return(trainData) +} + +#' Create the settings for normalizing the data +#' @param type The type of normalization to 
use, either "minmax" or "robust" +#' @return An object of class \code{featureEngineeringSettings} +#' @export +createNormalization <- function(type = "minmax") { + featureEngineeringSettings <- list( + type = type + ) + if (type == "minmax") { + attr(featureEngineeringSettings, "fun") <- "minMaxNormalize" + } else if (type == "robust") { + attr(featureEngineeringSettings, "fun") <- "robustNormalize" + } + + class(featureEngineeringSettings) <- "featureEngineeringSettings" + return(featureEngineeringSettings) +} + +#' A function that removes rare features from the data +#' @details removes features that are present in less than a certain fraction of the population +#' @param trainData The data to be normalized +#' @param featureEngineeringSettings The settings for the normalization +#' @param findRare Whether to find and remove rare features or remove them only (bool) +removeRareFeatures <- function(trainData, featureEngineeringSettings, findRare = FALSE) { + if (!findRare) { + rareFeatures <- trainData$covariateData$covariates %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::summarise(count = dplyr::n()) %>% + dplyr::collect() + rareFeatures <- rareFeatures %>% + dplyr::mutate(ratio = .data$count / ( + trainData$covariateData$covariates %>% + dplyr::summarise(popSize = dplyr::n_distinct(.data$rowId)) %>% + dplyr::pull() + )) %>% + dplyr::filter(.data$ratio <= featureEngineeringSettings$ratio) %>% + dplyr::pull(c("covariateId")) + + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::filter(!.data$covariateId %in% rareFeatures) + trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% + dplyr::filter(!.data$covariateId %in% rareFeatures) + + attr(featureEngineeringSettings, "rareFeatures") <- rareFeatures + + findRare <- TRUE + } else { + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::filter( + !.data$covariateId %in% !!attr(featureEngineeringSettings, "rareFeatures") + ) + trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% + dplyr::filter( + !.data$covariateId %in% !!attr(featureEngineeringSettings, "rareFeatures") + ) + } + featureEngineering <- list( + funct = "removeRareFeatures", + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + findRare = findRare + ) + ) + attr(trainData, "metaData")$featureEngineering[["removeRare"]] <- + featureEngineering + return(trainData) +} + +#' Create the settings for removing rare features +#' @param ratio The minimum fraction of the training data that must have a +#' feature for it to be included +#' @return An object of class \code{featureEngineeringSettings} +#' @export +createRareFeatureRemover <- function(ratio = 0.001) { + checkIsClass(ratio, c("numeric")) + checkHigherEqual(ratio, 0) + checkLower(ratio, 1) + featureEngineeringSettings <- list( + ratio = ratio + ) + attr(featureEngineeringSettings, "fun") <- "removeRareFeatures" + + class(featureEngineeringSettings) <- "featureEngineeringSettings" + return(featureEngineeringSettings) +} diff --git a/R/RunPlp.R b/R/RunPlp.R index 44b32b98..58842898 100644 --- a/R/RunPlp.R +++ b/R/RunPlp.R @@ -213,6 +213,7 @@ runPlp <- function( executeSettings = createDefaultExecuteSettings(), saveDirectory = getwd() ){ + start <- Sys.time() # start log analysisPath <- file.path(saveDirectory, analysisId) @@ -509,7 +510,8 @@ runPlp <- function( class(results) <- c('runPlp') ParallelLogger::logInfo("Run finished successfully.") - + end <- Sys.time() + 
ParallelLogger::logInfo(paste0('Total time taken: ', format(end - start))) # save the results ParallelLogger::logInfo(paste0('Saving PlpResult')) tryCatch(savePlpResult(results, file.path(analysisPath,'plpResult')), diff --git a/man/createDefaultSplitSetting.Rd b/man/createDefaultSplitSetting.Rd index e63e7437..20e679bc 100644 --- a/man/createDefaultSplitSetting.Rd +++ b/man/createDefaultSplitSetting.Rd @@ -9,7 +9,7 @@ test/validation/train sets using default splitting functions createDefaultSplitSetting( testFraction = 0.25, trainFraction = 0.75, - splitSeed = sample(1e+05, 1), + splitSeed = sample(100000, 1), nfold = 3, type = "stratified" ) diff --git a/man/createIterativeImputer.Rd b/man/createIterativeImputer.Rd new file mode 100644 index 00000000..f979bfa2 --- /dev/null +++ b/man/createIterativeImputer.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Imputation.R +\name{createIterativeImputer} +\alias{createIterativeImputer} +\title{Create Iterative Imputer settings} +\usage{ +createIterativeImputer(missingThreshold = 0.3, method = "pmm") +} +\arguments{ +\item{missingThreshold}{The threshold for missing values to remove a feature} + +\item{method}{The method to use for imputation, currently only "pmm" is supported} +} +\value{ +The settings for the iterative imputer of class `featureEngineeringSettings` +} +\description{ +This function creates the settings for an iterative imputer +which first removes features with more than `missingThreshold` missing values +and then imputes the missing values iteratively using chained equations +} diff --git a/man/createNormalization.Rd b/man/createNormalization.Rd new file mode 100644 index 00000000..6a73cfb3 --- /dev/null +++ b/man/createNormalization.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PreprocessingData.R +\name{createNormalization} +\alias{createNormalization} +\title{Create the settings for normalizing the data} +\usage{ +createNormalization(type = "minmax") +} +\arguments{ +\item{type}{The type of normalization to use, either "minmax" or "robust"} +} +\value{ +An object of class \code{featureEngineeringSettings} +} +\description{ +Create the settings for normalizing the data +} diff --git a/man/createRareFeatureRemover.Rd b/man/createRareFeatureRemover.Rd new file mode 100644 index 00000000..c1e70b87 --- /dev/null +++ b/man/createRareFeatureRemover.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PreprocessingData.R +\name{createRareFeatureRemover} +\alias{createRareFeatureRemover} +\title{Create the settings for removing rare features} +\usage{ +createRareFeatureRemover(ratio = 0.001) +} +\arguments{ +\item{ratio}{The minimum fraction of the training data that must have a +feature for it to be included} +} +\value{ +An object of class \code{featureEngineeringSettings} +} +\description{ +Create the settings for removing rare features +} diff --git a/man/createSimpleImputer.Rd b/man/createSimpleImputer.Rd new file mode 100644 index 00000000..46b2ef5c --- /dev/null +++ b/man/createSimpleImputer.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Imputation.R +\name{createSimpleImputer} +\alias{createSimpleImputer} +\title{Create Simple Imputer settings} +\usage{ +createSimpleImputer(method = "mean", missingThreshold = 0.3) +} +\arguments{ +\item{method}{The method to use for imputation, 
either "mean" or "median"} +} +\value{ +The settings for the single imputer of class `featureEngineeringSettings` +} +\description{ +This function creates the settings for a simple imputer +which imputes missing values with the mean or median +} diff --git a/man/deDuplicateCovariateData.Rd b/man/deDuplicateCovariateData.Rd new file mode 100644 index 00000000..e1f94213 --- /dev/null +++ b/man/deDuplicateCovariateData.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ExternalValidatePlp.R +\name{deDuplicateCovariateData} +\alias{deDuplicateCovariateData} +\title{deplucateCovariateData - Remove duplicate covariate data +when downloading data with multiple different covariateSettings sometimes +there will be duplicated analysisIds which need to be removed} +\usage{ +deDuplicateCovariateData(covariateData) +} +\arguments{ +\item{covariateData}{The covariate data Andromeda object} +} +\value{ +The deduplicated covariate data +} +\description{ +deplucateCovariateData - Remove duplicate covariate data +when downloading data with multiple different covariateSettings sometimes +there will be duplicated analysisIds which need to be removed +} +\keyword{internal} diff --git a/man/iterativeImpute.Rd b/man/iterativeImpute.Rd new file mode 100644 index 00000000..f2f9d375 --- /dev/null +++ b/man/iterativeImpute.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Imputation.R +\name{iterativeImpute} +\alias{iterativeImpute} +\title{Imputation} +\usage{ +iterativeImpute(trainData, featureEngineeringSettings, done = FALSE) +} +\arguments{ +\item{trainData}{The data to be imputed} + +\item{featureEngineeringSettings}{The settings for the imputation} + +\item{done}{Whether the imputation has already been done (bool)} +} +\value{ +The imputed data +} +\description{ +This function does single imputation with predictive mean matchin +} diff --git a/man/minMaxNormalize.Rd b/man/minMaxNormalize.Rd new file mode 100644 index 00000000..c4624293 --- /dev/null +++ b/man/minMaxNormalize.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PreprocessingData.R +\name{minMaxNormalize} +\alias{minMaxNormalize} +\title{A function that normalizes continous features to have values between 0 and 1} +\usage{ +minMaxNormalize(trainData, featureEngineeringSettings, normalized = FALSE) +} +\arguments{ +\item{trainData}{The training data to be normalized} + +\item{featureEngineeringSettings}{The settings for the normalization} + +\item{normalized}{Whether the data has already been normalized (bool)} +} +\value{ +The normalized data +} +\description{ +A function that normalizes continous features to have values between 0 and 1 +} +\details{ +uses value - min / (max - min) to normalize the data +} diff --git a/man/pmmFit.Rd b/man/pmmFit.Rd new file mode 100644 index 00000000..3cadc1df --- /dev/null +++ b/man/pmmFit.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Imputation.R +\name{pmmFit} +\alias{pmmFit} +\title{Predictive mean matching using lasso} +\usage{ +pmmFit(data, k = 5) +} +\arguments{ +\item{k}{The number of donors to use for matching (default 5)} + +\item{numericData}{An andromeda object with the following fields: +xObs: covariates table for observed data +xMiss: covariates table for missing data +yObs: outcome variable that we want to impute} +} +\description{ +Predictive mean matching using lasso +} +\keyword{internal} diff 
--git a/man/preprocessData.Rd b/man/preprocessData.Rd index 2ace5220..4f9b18eb 100644 --- a/man/preprocessData.Rd +++ b/man/preprocessData.Rd @@ -8,7 +8,7 @@ and remove rare or redundant features} preprocessData(covariateData, preprocessSettings) } \arguments{ -\item{covariateData}{The covariate part of the training data created by \code{splitData} after being sampled and having +\item{covariateData}{The covariate part of the training data created by \code{splitData} after being sampled and having any required feature engineering} \item{preprocessSettings}{The settings for the preprocessing created by \code{createPreprocessSettings}} diff --git a/man/removeRareFeatures.Rd b/man/removeRareFeatures.Rd new file mode 100644 index 00000000..0a62ea54 --- /dev/null +++ b/man/removeRareFeatures.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PreprocessingData.R +\name{removeRareFeatures} +\alias{removeRareFeatures} +\title{A function that removes rare features from the data} +\usage{ +removeRareFeatures(trainData, featureEngineeringSettings, findRare = FALSE) +} +\arguments{ +\item{trainData}{The data to remove rare features from} + +\item{featureEngineeringSettings}{The settings for removing rare features} + +\item{findRare}{Whether the rare features have already been found (bool); +if FALSE they are found first and then removed} +} +\value{ +The data with rare features removed +} +\description{ +A function that removes rare features from the data +} +\details{ +removes features that are present in less than a certain fraction of the population +} diff --git a/man/robustNormalize.Rd b/man/robustNormalize.Rd new file mode 100644 index 00000000..cc3d558e --- /dev/null +++ b/man/robustNormalize.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PreprocessingData.R +\name{robustNormalize} +\alias{robustNormalize} +\title{A function that normalizes continuous features by the interquartile range and forces +the resulting values to be between -3 and 3 with f(x) = x / sqrt(1 + (x/3)^2)} +\usage{ +robustNormalize(trainData, featureEngineeringSettings, normalized = FALSE) +} +\arguments{ +\item{trainData}{The training data to be normalized} + +\item{featureEngineeringSettings}{The settings for the normalization} + +\item{normalized}{Whether the data has already been normalized (bool)} +} +\value{ +The normalized data +} +\description{ +A function that normalizes continuous features by the interquartile range and forces +the resulting values to be between -3 and 3 with f(x) = x / sqrt(1 + (x/3)^2) +} +\details{ +uses (value - median) / iqr to normalize the data and then +applies the function f(x) = x / sqrt(1 + (x/3)^2) to the normalized values. +This forces the values to be between -3 and 3 while preserving the relative +ordering of the values. 
+based on https://arxiv.org/abs/2407.04491 for more details +} diff --git a/man/setAdaBoost.Rd b/man/setAdaBoost.Rd index e4af2568..a13762b2 100644 --- a/man/setAdaBoost.Rd +++ b/man/setAdaBoost.Rd @@ -8,7 +8,7 @@ setAdaBoost( nEstimators = list(10, 50, 200), learningRate = list(1, 0.5, 0.1), algorithm = list("SAMME"), - seed = sample(1e+06, 1) + seed = sample(1000000, 1) ) } \arguments{ diff --git a/man/setCoxModel.Rd b/man/setCoxModel.Rd index 2d1a6cc7..a0cf90de 100644 --- a/man/setCoxModel.Rd +++ b/man/setCoxModel.Rd @@ -12,7 +12,7 @@ setCoxModel( threads = -1, upperLimit = 20, lowerLimit = 0.01, - tolerance = 2e-07, + tolerance = 0.0000002, maxIterations = 3000 ) } diff --git a/man/setDecisionTree.Rd b/man/setDecisionTree.Rd index 1e6b994c..b5633772 100644 --- a/man/setDecisionTree.Rd +++ b/man/setDecisionTree.Rd @@ -15,7 +15,7 @@ setDecisionTree( maxLeafNodes = list(NULL), minImpurityDecrease = list(10^-7), classWeight = list(NULL), - seed = sample(1e+06, 1) + seed = sample(1000000, 1) ) } \arguments{ diff --git a/man/setGradientBoostingMachine.Rd b/man/setGradientBoostingMachine.Rd index a42e2b71..5f201ea2 100644 --- a/man/setGradientBoostingMachine.Rd +++ b/man/setGradientBoostingMachine.Rd @@ -14,7 +14,7 @@ setGradientBoostingMachine( scalePosWeight = 1, lambda = 1, alpha = 0, - seed = sample(1e+07, 1) + seed = sample(10000000, 1) ) } \arguments{ diff --git a/man/setIterativeHardThresholding.Rd b/man/setIterativeHardThresholding.Rd index 8cac5a62..c8eba9d5 100644 --- a/man/setIterativeHardThresholding.Rd +++ b/man/setIterativeHardThresholding.Rd @@ -7,14 +7,14 @@ setIterativeHardThresholding( K = 10, penalty = "bic", - seed = sample(1e+05, 1), + seed = sample(100000, 1), exclude = c(), forceIntercept = FALSE, fitBestSubset = FALSE, initialRidgeVariance = 0.1, - tolerance = 1e-08, + tolerance = 0.00000001, maxIterations = 10000, - threshold = 1e-06, + threshold = 0.000001, delta = 0 ) } diff --git a/man/setLassoLogisticRegression.Rd b/man/setLassoLogisticRegression.Rd index 533aa53b..f8988101 100644 --- a/man/setLassoLogisticRegression.Rd +++ b/man/setLassoLogisticRegression.Rd @@ -13,7 +13,7 @@ setLassoLogisticRegression( forceIntercept = F, upperLimit = 20, lowerLimit = 0.01, - tolerance = 2e-06, + tolerance = 0.000002, maxIterations = 3000, priorCoefs = NULL ) diff --git a/man/setLightGBM.Rd b/man/setLightGBM.Rd index 6380df30..41225ce0 100644 --- a/man/setLightGBM.Rd +++ b/man/setLightGBM.Rd @@ -16,7 +16,7 @@ setLightGBM( lambdaL2 = c(0), scalePosWeight = 1, isUnbalance = FALSE, - seed = sample(1e+07, 1) + seed = sample(10000000, 1) ) } \arguments{ diff --git a/man/setMLP.Rd b/man/setMLP.Rd index d6dea625..7c72e5d7 100644 --- a/man/setMLP.Rd +++ b/man/setMLP.Rd @@ -8,14 +8,14 @@ setMLP( hiddenLayerSizes = list(c(100), c(20)), activation = list("relu"), solver = list("adam"), - alpha = list(0.3, 0.01, 1e-04, 1e-06), + alpha = list(0.3, 0.01, 0.0001, 0.000001), batchSize = list("auto"), learningRate = list("constant"), learningRateInit = list(0.001), powerT = list(0.5), maxIter = list(200, 100), shuffle = list(TRUE), - tol = list(1e-04), + tol = list(0.0001), warmStart = list(TRUE), momentum = list(0.9), nesterovsMomentum = list(TRUE), @@ -23,9 +23,9 @@ setMLP( validationFraction = list(0.1), beta1 = list(0.9), beta2 = list(0.999), - epsilon = list(1e-08), + epsilon = list(0.00000001), nIterNoChange = list(10), - seed = sample(1e+05, 1) + seed = sample(100000, 1) ) } \arguments{ diff --git a/man/setRandomForest.Rd b/man/setRandomForest.Rd index 15ee62bb..7812e522 100644 --- 
a/man/setRandomForest.Rd +++ b/man/setRandomForest.Rd @@ -19,7 +19,7 @@ setRandomForest( oobScore = list(FALSE), nJobs = list(NULL), classWeight = list(NULL), - seed = sample(1e+05, 1) + seed = sample(100000, 1) ) } \arguments{ diff --git a/man/setSVM.Rd b/man/setSVM.Rd index f2d273f2..0b7af491 100644 --- a/man/setSVM.Rd +++ b/man/setSVM.Rd @@ -8,13 +8,13 @@ setSVM( C = list(1, 0.9, 2, 0.1), kernel = list("rbf"), degree = list(1, 3, 5), - gamma = list("scale", 1e-04, 3e-05, 0.001, 0.01, 0.25), + gamma = list("scale", 0.0001, 0.00003, 0.001, 0.01, 0.25), coef0 = list(0), shrinking = list(TRUE), tol = list(0.001), classWeight = list(NULL), cacheSize = 500, - seed = sample(1e+05, 1) + seed = sample(100000, 1) ) } \arguments{ diff --git a/man/simpleImpute.Rd b/man/simpleImpute.Rd new file mode 100644 index 00000000..619cb6bf --- /dev/null +++ b/man/simpleImpute.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Imputation.R +\name{simpleImpute} +\alias{simpleImpute} +\title{Simple Imputation} +\usage{ +simpleImpute(trainData, featureEngineeringSettings, done = FALSE) +} +\arguments{ +\item{trainData}{The data to be imputed} + +\item{featureEngineeringSettings}{The settings for the imputation} + +\item{done}{Whether the imputation has already been done (bool)} +} +\value{ +The imputed data +} +\description{ +This function does single imputation with the mean or median +}
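Finally, a hedged usage sketch for the other two new preprocessing helpers (the combination with a model design is an assumption mirroring the imputer example above, not something this diff shows):

library(PatientLevelPrediction)
# drop features present in at most 0.1% of subjects
rareRemover <- createRareFeatureRemover(ratio = 0.001)
# IQR-based scaling, smoothly clipped to [-3, 3]
robustNorm <- createNormalization(type = "robust")
# assumed wiring, applied in list order:
# modelDesign <- createModelDesign(..., featureEngineeringSettings = list(rareRemover, robustNorm))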