From 4e834fcbdcd3ad6e917b11b9b5e512372a7b6560 Mon Sep 17 00:00:00 2001 From: egillax Date: Wed, 4 Dec 2024 10:12:50 +0100 Subject: [PATCH 1/8] more work on imputations --- NAMESPACE | 4 + R/CyclopsModels.R | 4 +- R/PreprocessingData.R | 320 ++++++++++++++++++++++++++++++++++++------ R/RunPlp.R | 4 +- man/preprocessData.Rd | 2 +- 5 files changed, 286 insertions(+), 48 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index ad98ff62d..b80da52d4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,11 +28,15 @@ export(createGlmModel) export(createLearningCurve) export(createLogSettings) export(createModelDesign) +export(createNormalization) export(createPlpResultTables) export(createPreprocessSettings) export(createRandomForestFeatureSelection) +export(createRareFeatureRemover) export(createRestrictPlpDataSettings) export(createSampleSettings) +export(createSingleImputer) +export(createSklearnImputer) export(createSplineSettings) export(createStratifiedImputationSettings) export(createStudyPopulation) diff --git a/R/CyclopsModels.R b/R/CyclopsModels.R index 16f035164..e97cf0f74 100644 --- a/R/CyclopsModels.R +++ b/R/CyclopsModels.R @@ -101,9 +101,9 @@ fitCyclopsModel <- function( noiseLevel = "silent", threads = settings$threads, maxIterations = settings$maxIterations, - seed = settings$seed + seed = settings$seed, + useKKTSwindle = FALSE ) - fit <- tryCatch({ ParallelLogger::logInfo('Running Cyclops') Cyclops::fitCyclopsModel( diff --git a/R/PreprocessingData.R b/R/PreprocessingData.R index 834d27a85..bd065ae45 100644 --- a/R/PreprocessingData.R +++ b/R/PreprocessingData.R @@ -2,13 +2,13 @@ # Copyright 2021 Observational Health Data Sciences and Informatics # # This file is part of PatientLevelPrediction -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
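The NAMESPACE hunk above exports the new feature-engineering constructors this series introduces. A minimal sketch of how they might be combined, assuming the featureEngineeringSettings argument that the package's existing createModelDesign/runPlp interface accepts (the constructor names come from the exports above; the surrounding wiring is illustrative, not part of this patch):

    # sketch: each constructor returns a featureEngineeringSettings object
    # carrying a "fun" attribute naming the function that transforms the data
    featureEngineering <- list(
      createRareFeatureRemover(ratio = 0.001), # drop features seen in < 0.1% of subjects
      createNormalization(type = "robust")     # IQR-scale, then smooth-clip to [-3, 3]
    )
    # modelDesign <- createModelDesign(..., featureEngineeringSettings = featureEngineering)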
@@ -20,34 +20,31 @@ #' @details #' Returns an object of class \code{preprocessingSettings} that specifies how to preprocess the training data #' -#' @param minFraction The minimum fraction of target population who must have a covariate for it to be included in the model training +#' @param minFraction The minimum fraction of target population who must have a covariate for it to be included in the model training #' @param normalize Whether to normalise the covariates before training (Default: TRUE) #' @param removeRedundancy Whether to remove redundant features (Default: TRUE) #' @return #' An object of class \code{preprocessingSettings} #' @export createPreprocessSettings <- function( - minFraction = 0.001, - normalize = TRUE, - removeRedundancy = TRUE - ){ - - checkIsClass(minFraction, c('numeric','integer')) - checkHigherEqual(minFraction,0) - - checkIsClass(normalize, c("logical")) - - checkIsClass(removeRedundancy, c("logical")) - - preprocessingSettings <- list( - minFraction = minFraction, - normalize = normalize, - removeRedundancy = removeRedundancy - ) - + minFraction = 0.001, + normalize = TRUE, + removeRedundancy = TRUE) { + checkIsClass(minFraction, c("numeric", "integer")) + checkHigherEqual(minFraction, 0) + + checkIsClass(normalize, c("logical")) + + checkIsClass(removeRedundancy, c("logical")) + + preprocessingSettings <- list( + minFraction = minFraction, + normalize = normalize, + removeRedundancy = removeRedundancy + ) + class(preprocessingSettings) <- "preprocessSettings" return(preprocessingSettings) - } @@ -57,39 +54,274 @@ createPreprocessSettings <- function( #' @details #' Returns an object of class \code{covariateData} that has been processed #' -#' @param covariateData The covariate part of the training data created by \code{splitData} after being sampled and having -#' any required feature engineering -#' @param preprocessSettings The settings for the preprocessing created by \code{createPreprocessSettings} +#' @param covariateData The covariate part of the training data created by \code{splitData} after being sampled and having +#' any required feature engineering +#' @param preprocessSettings The settings for the preprocessing created by \code{createPreprocessSettings} #' @return -#' The data processed -preprocessData <- function (covariateData, - preprocessSettings){ - +#' The data processed +preprocessData <- function(covariateData, + preprocessSettings) { metaData <- attr(covariateData, "metaData") preprocessSettingsInput <- preprocessSettings # saving this before adding covariateData - + checkIsClass(covariateData, c("CovariateData")) checkIsClass(preprocessSettings, c("preprocessSettings")) - - ParallelLogger::logDebug(paste0('minFraction: ', preprocessSettings$minFraction)) - ParallelLogger::logDebug(paste0('normalize: ', preprocessSettings$normalize)) - ParallelLogger::logDebug(paste0('removeRedundancy: ', preprocessSettings$removeRedundancy)) - + + ParallelLogger::logDebug(paste0("minFraction: ", preprocessSettings$minFraction)) + ParallelLogger::logDebug(paste0("normalize: ", preprocessSettings$normalize)) + ParallelLogger::logDebug(paste0("removeRedundancy: ", preprocessSettings$removeRedundancy)) + preprocessSettings$covariateData <- covariateData covariateData <- do.call(FeatureExtraction::tidyCovariateData, preprocessSettings) - - #update covariateRef + + # update covariateRef removed <- unique(c( attr(covariateData, "metaData")$deletedInfrequentCovariateIds, attr(covariateData, "metaData")$deletedRedundantCovariateIds - ) - ) - 
covariateData$covariateRef <- covariateData$covariateRef %>%
-    dplyr::filter(!.data$covariateId %in% removed)
-  
+  ))
+  covariateData$covariateRef <- covariateData$covariateRef %>%
+    dplyr::filter(!.data$covariateId %in% removed)
+
   metaData$tidyCovariateDataSettings <- attr(covariateData, "metaData")
   metaData$preprocessSettings <- preprocessSettingsInput
   attr(covariateData, "metaData") <- metaData
-  
+
   return(covariateData)
 }
+
+#' A function that normalizes continuous features to have values between 0 and 1
+#' @details uses (value - min) / (max - min) to normalize the data
+#' @param trainData The training data to be normalized
+#' @param featureEngineeringSettings The settings for the normalization
+#' @param normalized Whether the data has already been normalized (bool)
+#' @return The normalized data
+minMaxNormalize <- function(trainData, featureEngineeringSettings, normalized = FALSE) {
+  if (!normalized) {
+    # fit the normalization
+    # find continuous features from trainData$covariateData$analysisRef
+    continuousFeatures <- trainData$covariateData$analysisRef %>%
+      dplyr::filter(.data$isBinary == "N") %>%
+      dplyr::select("analysisId") %>%
+      dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>%
+      dplyr::pull(.data$covariateId)
+
+    # get max of each feature
+    trainData$covariateData$minMaxs <- trainData$covariateData$covariates %>%
+      dplyr::filter(.data$covariateId %in% continuousFeatures) %>%
+      dplyr::group_by(.data$covariateId) %>%
+      dplyr::summarise(
+        max = max(.data$covariateValue, na.rm = TRUE),
+        min = min(.data$covariateValue, na.rm = TRUE)
+      ) %>%
+      dplyr::collect()
+
+    # save the normalization
+    attr(featureEngineeringSettings, "minMaxs") <-
+      trainData$covariateData$minMaxs %>% dplyr::collect()
+
+    # apply the normalization to trainData
+    trainData$covariateData$covariates <- trainData$covariateData$covariates %>%
+      dplyr::left_join(trainData$covariateData$minMaxs, by = "covariateId") %>%
+      # use ifelse to only normalize if min and max are not NA as is the case
+      # for continuous features, else return original value
+      dplyr::mutate(covariateValue = ifelse(!is.na(min) & !is.na(max),
+        (.data$covariateValue - min) / (max - min),
+        .data$covariateValue
+      )) %>%
+      dplyr::select(-c("max", "min"))
+    trainData$covariateData$minMaxs <- NULL
+    normalized <- TRUE
+  } else {
+    # apply the normalization to test data by using saved normalization values
+    trainData$covariateData$covariates <- trainData$covariateData$covariates %>%
+      dplyr::left_join(attr(featureEngineeringSettings, "minMaxs"),
+        by = "covariateId", copy = TRUE
+      ) %>%
+      dplyr::mutate(covariateValue = ifelse(!is.na(min) & !is.na(max),
+        (.data$covariateValue - min) / (max - min),
+        .data$covariateValue
+      )) %>%
+      dplyr::select(-c("max", "min"))
+  }
+  featureEngineering <- list(
+    funct = "minMaxNormalize",
+    settings = list(
+      featureEngineeringSettings = featureEngineeringSettings,
+      normalized = normalized
+    )
+  )
+
+  attr(trainData, "metaData")$featureEngineering <- listAppend(
+    attr(trainData, "metaData")$featureEngineering,
+    featureEngineering
+  )
+  return(trainData)
+}
+
+#' A function that normalizes continuous features by the interquartile range and forces
+#' the resulting values to be between -3 and 3 with f(x) = x / sqrt(1 + (x/3)^2)
+#' @details uses (value - median) / iqr to normalize the data and then
+#' applies the function f(x) = x / sqrt(1 + (x/3)^2) to the normalized values.
+#' This forces the values to be between -3 and 3 while preserving the relative
+#' ordering of the values.
+#' See https://arxiv.org/abs/2407.04491 for more details
+#' @param trainData The training data to be normalized
+#' @param featureEngineeringSettings The settings for the normalization
+#' @param normalized Whether the data has already been normalized (bool)
+#' @return The normalized data
+robustNormalize <- function(trainData, featureEngineeringSettings, normalized = FALSE) {
+  if (!normalized) {
+    # find continuous features from trainData$covariateData$analysisRef
+    continuousFeatures <- trainData$covariateData$analysisRef %>%
+      dplyr::filter(.data$isBinary == "N") %>%
+      dplyr::select("analysisId") %>%
+      dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>%
+      dplyr::pull(.data$covariateId)
+
+    # get (25, 75)% quantiles of each feature
+    # sqlite (used by Andromeda) doesn't have a quantile function, so we need to load the extension
+    # to get the upper_quartile and lower_quartile functions
+    con <- trainData$covariateData$covariates %>% dbplyr::remote_con()
+    RSQLite::initExtension(con)
+
+    trainData$covariateData$quantiles <- trainData$covariateData$covariates %>%
+      dplyr::filter(.data$covariateId %in% continuousFeatures) %>%
+      dplyr::group_by(.data$covariateId) %>%
+      dplyr::summarise(
+        q25 = dplyr::sql("lower_quartile(covariateValue)"),
+        q75 = dplyr::sql("upper_quartile(covariateValue)"),
+        median = median(.data$covariateValue, na.rm = TRUE)
+      ) %>%
+      dplyr::mutate(iqr = .data$q75 - .data$q25) %>%
+      dplyr::select(-c("q75", "q25")) %>%
+      dplyr::collect()
+
+    # save the normalization
+    attr(featureEngineeringSettings, "quantiles") <-
+      trainData$covariateData$quantiles %>% dplyr::collect()
+
+    # apply the normalization to trainData
+    trainData$covariateData$covariates <- trainData$covariateData$covariates %>%
+      dplyr::left_join(trainData$covariateData$quantiles, by = "covariateId") %>%
+      # use ifelse to only normalize continuous features
+      dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median),
+        (.data$covariateValue - .data$median) / .data$iqr,
+        .data$covariateValue
+      )) %>%
+      # smoothly clip the range to [-3, 3] with x / sqrt(1 + (x/3)^2)
+      # ref: https://arxiv.org/abs/2407.04491
+      dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median),
+        .data$covariateValue / sqrt(1 + (.data$covariateValue / 3)^2),
+        .data$covariateValue
+      )) %>%
+      dplyr::select(-c("median", "iqr"))
+    trainData$covariateData$quantiles <- NULL
+    normalized <- TRUE
+  } else {
+    # apply the normalization to test data by using saved normalization values
+    trainData$covariateData$covariates <- trainData$covariateData$covariates %>%
+      dplyr::left_join(attr(featureEngineeringSettings, "quantiles"),
+        by = "covariateId", copy = TRUE
+      ) %>%
+      dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median),
+        (.data$covariateValue - .data$median) / .data$iqr,
+        .data$covariateValue
+      )) %>%
+      dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median),
+        .data$covariateValue / sqrt(1 + (.data$covariateValue / 3)^2),
+        .data$covariateValue
+      )) %>%
+      dplyr::select(-c("median", "iqr"))
+  }
+  featureEngineering <- list(
+    funct = "robustNormalize",
+    settings = list(
+      featureEngineeringSettings = featureEngineeringSettings,
+      normalized = normalized
+    )
+  )
+
+  attr(trainData, "metaData")$featureEngineering <- listAppend(
+    attr(trainData, "metaData")$featureEngineering,
+    featureEngineering
+  )
+  return(trainData)
+}
+
+#' Create the settings for normalizing the data
+#' @param type The type of normalization to use, either "minmax" or "robust"
+#' 
@return An object of class \code{featureEngineeringSettings}
+#' @export
+createNormalization <- function(type = "minmax") {
+  featureEngineeringSettings <- list(
+    type = type
+  )
+  if (type == "minmax") {
+    attr(featureEngineeringSettings, "fun") <- "minMaxNormalize"
+  } else if (type == "robust") {
+    attr(featureEngineeringSettings, "fun") <- "robustNormalize"
+  } else {
+    stop("Unknown normalization type: ", type)
+  }
+
+  class(featureEngineeringSettings) <- "featureEngineeringSettings"
+  return(featureEngineeringSettings)
+}
+
+#' A function that removes rare features from the data
+#' @details removes features that are present in less than a certain fraction of the population
+#' @param trainData The data from which to remove rare features
+#' @param featureEngineeringSettings The settings for removing rare features
+#' @param findRare Whether to first find the rare features or only remove previously found ones (bool)
+#' @return The data with rare features removed
+removeRareFeatures <- function(trainData, featureEngineeringSettings, findRare = FALSE) {
+  if (!findRare) {
+    rareFeatures <- trainData$covariateData$covariates %>%
+      dplyr::group_by(.data$covariateId) %>%
+      dplyr::summarise(count = dplyr::n()) %>%
+      dplyr::collect()
+    rareFeatures <- rareFeatures %>%
+      dplyr::mutate(ratio = .data$count / (
+        trainData$covariateData$covariates %>%
+          dplyr::summarise(popSize = dplyr::n_distinct(.data$rowId)) %>%
+          dplyr::pull()
+      )) %>%
+      dplyr::filter(.data$ratio <= featureEngineeringSettings$ratio) %>%
+      dplyr::pull(c("covariateId"))
+
+    trainData$covariateData$covariates <- trainData$covariateData$covariates %>%
+      dplyr::filter(!.data$covariateId %in% rareFeatures)
+
+    attr(featureEngineeringSettings, "rareFeatures") <- rareFeatures
+
+    findRare <- TRUE
+  } else {
+    trainData$covariateData$covariates <- trainData$covariateData$covariates %>%
+      dplyr::filter(
+        !.data$covariateId %in% !!attr(featureEngineeringSettings, "rareFeatures")
+      )
+  }
+  featureEngineering <- list(
+    funct = "removeRareFeatures",
+    settings = list(
+      featureEngineeringSettings = featureEngineeringSettings,
+      findRare = findRare
+    )
+  )
+  attr(trainData, "metaData")$featureEngineering <- listAppend(
+    attr(trainData, "metaData")$featureEngineering,
+    featureEngineering
+  )
+  return(trainData)
+}
+
+#' Create the settings for removing rare features
+#' @param ratio The minimum fraction of the training data that must have a
+#' feature for it to be included
+#' @return An object of class \code{featureEngineeringSettings}
+#' @export
+createRareFeatureRemover <- function(ratio = 0.001) {
+  featureEngineeringSettings <- list(
+    ratio = ratio
+  )
+  attr(featureEngineeringSettings, "fun") <- "removeRareFeatures"
+
+  class(featureEngineeringSettings) <- "featureEngineeringSettings"
+  return(featureEngineeringSettings)
+}
diff --git a/R/RunPlp.R b/R/RunPlp.R
index 2d7119388..7bae85020 100644
--- a/R/RunPlp.R
+++ b/R/RunPlp.R
@@ -213,6 +213,7 @@ runPlp <- function(
   executeSettings = createDefaultExecuteSettings(),
   saveDirectory = getwd()
 ){
+  start <- Sys.time()
   
   # start log
   analysisPath <- file.path(saveDirectory, analysisId)
@@ -509,7 +510,8 @@ runPlp <- function(
   class(results) <- c('runPlp')
   
   ParallelLogger::logInfo("Run finished successfully.")
-  
+  end <- Sys.time()
+  ParallelLogger::logInfo(paste0('Total time taken: ', format(end - start)))
   # save the results
   ParallelLogger::logInfo(paste0('Saving PlpResult'))
   tryCatch(savePlpResult(results, file.path(analysisPath,'plpResult')),
diff --git a/man/preprocessData.Rd b/man/preprocessData.Rd
index 2ace52208..4f9b18eb2 100644
--- a/man/preprocessData.Rd
+++ b/man/preprocessData.Rd
@@ -8,7 +8,7 @@ and remove rare or
redundant features}
 preprocessData(covariateData, preprocessSettings)
 }
 \arguments{
-\item{covariateData}{The covariate part of the training data created by \code{splitData} after being sampled and having 
+\item{covariateData}{The covariate part of the training data created by \code{splitData} after being sampled and having
 any required feature engineering}
 
 \item{preprocessSettings}{The settings for the preprocessing created by \code{createPreprocessSettings}}
 
From d17223c5b27ed16c1c863db80feaea8dcc2823b3 Mon Sep 17 00:00:00 2001
From: egillax
Date: Thu, 5 Dec 2024 17:50:47 +0100
Subject: [PATCH 2/8] more imputation work

---
 NAMESPACE             |  2 +-
 R/PreprocessingData.R | 13 ++++---------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index b80da52d4..3bbf6aaae 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -25,6 +25,7 @@ export(createExecuteSettings)
 export(createExistingSplitSettings)
 export(createFeatureEngineeringSettings)
 export(createGlmModel)
+export(createIterativeImputer)
 export(createLearningCurve)
 export(createLogSettings)
 export(createModelDesign)
@@ -35,7 +36,6 @@ export(createRandomForestFeatureSelection)
 export(createRareFeatureRemover)
 export(createRestrictPlpDataSettings)
 export(createSampleSettings)
-export(createSingleImputer)
 export(createSklearnImputer)
 export(createSplineSettings)
 export(createStratifiedImputationSettings)
diff --git a/R/PreprocessingData.R b/R/PreprocessingData.R
index bd065ae45..fba55fc35 100644
--- a/R/PreprocessingData.R
+++ b/R/PreprocessingData.R
@@ -151,10 +151,9 @@ minMaxNormalize <- function(trainData, featureEngineeringSettings, normalized =
     )
   )
 
-  attr(trainData, "metaData")$featureEngineering <- listAppend(
-    attr(trainData, "metaData")$featureEngineering,
+  attr(trainData, "metaData")$featureEngineering[["minMaxNormalize"]] <-
     featureEngineering
-  )
+
   return(trainData)
 }
 
@@ -240,10 +239,8 @@ robustNormalize <- function(trainData, featureEngineeringSettings, normalized =
     )
   )
 
-  attr(trainData, "metaData")$featureEngineering <- listAppend(
-    attr(trainData, "metaData")$featureEngineering,
+  attr(trainData, "metaData")$featureEngineering[['robustNormalize']] <-
     featureEngineering
-  )
   return(trainData)
 }
 
@@ -304,10 +301,8 @@ removeRareFeatures <- function(trainData, featureEngineeringSettings, findRare =
       findRare = findRare
     )
   )
-  attr(trainData, "metaData")$featureEngineering <- listAppend(
-    attr(trainData, "metaData")$featureEngineering,
+  attr(trainData, "metaData")$featureEngineering[['removeRare']] <-
     featureEngineering
-  )
   return(trainData)
 }
 
From 098a44aac06aa26c59209b9aa1e0b8da1db87292 Mon Sep 17 00:00:00 2001
From: egillax
Date: Mon, 9 Dec 2024 10:19:30 +0100
Subject: [PATCH 3/8] more imputation work

---
 NAMESPACE             | 1 -
 R/PreprocessingData.R | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 3bbf6aaae..a3a8f7e7a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -36,7 +36,6 @@ export(createRandomForestFeatureSelection)
 export(createRareFeatureRemover)
 export(createRestrictPlpDataSettings)
 export(createSampleSettings)
-export(createSklearnImputer)
 export(createSplineSettings)
 export(createStratifiedImputationSettings)
 export(createStudyPopulation)
diff --git a/R/PreprocessingData.R b/R/PreprocessingData.R
index fba55fc35..e5ff95468 100644
--- a/R/PreprocessingData.R
+++ b/R/PreprocessingData.R
@@ -114,6 +114,7 @@ minMaxNormalize <- function(trainData, featureEngineeringSettings, normalized =
         min = min(.data$covariateValue, na.rm = TRUE)
       ) %>%
       dplyr::collect()
+
on.exit(trainData$covariateData$minMaxs <- NULL, add = TRUE) # save the normalization attr(featureEngineeringSettings, "minMaxs") <- @@ -193,6 +194,7 @@ robustNormalize <- function(trainData, featureEngineeringSettings, normalized = dplyr::mutate(iqr = .data$q75 - .data$q25) %>% dplyr::select(-c("q75", "q25")) %>% dplyr::collect() + on.exit(trainData$covariateData$quantiles <- NULL, add = TRUE) # save the normalization attr(featureEngineeringSettings, "quantiles") <- @@ -213,7 +215,6 @@ robustNormalize <- function(trainData, featureEngineeringSettings, normalized = .data$covariateValue )) %>% dplyr::select(-c("median", "iqr")) - trainData$covariateData$quantiles <- NULL normalized <- TRUE } else { # apply the normalization to test data by using saved normalization values From 6b27f8e733a0a27e790bb02efb07d382d4b0dbc7 Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 9 Dec 2024 10:38:24 +0100 Subject: [PATCH 4/8] add files --- R/Imputation.R | 551 ++++++++++++++++++++++++++++++++++ man/createIterativeImputer.Rd | 21 ++ man/iterativeImpute.Rd | 21 ++ 3 files changed, 593 insertions(+) create mode 100644 R/Imputation.R create mode 100644 man/createIterativeImputer.Rd create mode 100644 man/iterativeImpute.Rd diff --git a/R/Imputation.R b/R/Imputation.R new file mode 100644 index 000000000..5bfa23380 --- /dev/null +++ b/R/Imputation.R @@ -0,0 +1,551 @@ +# @file Imputation.R +# Copyright 2025 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
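The new Imputation.R that follows implements predictive mean matching (PMM) over Andromeda tables. The core idea is easier to see on plain vectors; a self-contained toy sketch (illustrative only, not the package implementation):

    # toy PMM: fill missing y values with observed values from "donor" rows
    # whose model predictions are closest to the missing row's prediction
    set.seed(42)
    x <- rnorm(100)
    y <- 2 * x + rnorm(100)
    y[sample(100, 20)] <- NA                   # introduce missingness
    obs <- !is.na(y)
    fit <- lm(y ~ x, subset = obs)             # model fit on observed rows only
    predAll <- predict(fit, newdata = data.frame(x = x))
    y[!obs] <- sapply(predAll[!obs], function(p) {
      donors <- order(abs(predAll[obs] - p))[1:5] # k = 5 closest observed predictions
      sample(y[obs][donors], 1)                # draw one donor's observed value
    })

The package version below swaps the linear model for a lasso (glmnet) on the sparse covariate matrix and repeats this per variable inside a chained-equations loop.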
+
+#' @title Imputation
+#' @description This function does single imputation with predictive mean matching
+#' @param trainData The data to be imputed
+#' @param featureEngineeringSettings The settings for the imputation
+#' @param done Whether the imputation has already been done (bool)
+#' @return The imputed data
+iterativeImpute <- function(trainData, featureEngineeringSettings, done = FALSE) {
+  if (!done) {
+    total <- trainData$covariateData$covariates %>%
+      dplyr::summarise(total = dplyr::n_distinct(.data$rowId)) %>%
+      dplyr::pull()
+    continuousFeatures <- trainData$covariateData$analysisRef %>%
+      dplyr::filter(.data$isBinary == "N") %>%
+      dplyr::select("analysisId") %>%
+      dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>%
+      dplyr::pull(.data$covariateId)
+
+    trainData$covariateData$missingInfo <- trainData$covariateData$covariates %>%
+      dplyr::filter(.data$covariateId %in% continuousFeatures) %>%
+      dplyr::group_by(.data$covariateId) %>%
+      dplyr::summarise(counts = dplyr::n()) %>%
+      dplyr::collect() %>% # necessary because of integer division in sqlite
+      dplyr::mutate(missing = 1 - .data$counts / total) %>%
+      dplyr::select(c("covariateId", "missing"))
+    on.exit(trainData$covariateData$missingInfo <- NULL, add = TRUE)
+
+    trainData$covariateData$covariates <- trainData$covariateData$covariates %>%
+      dplyr::left_join(trainData$covariateData$missingInfo, by = "covariateId") %>%
+      dplyr::filter(is.na(.data$missing) ||
+        .data$missing <= featureEngineeringSettings$missingThreshold) %>%
+      dplyr::select(-"missing")
+
+    # now I want to do imputation using predictive mean matching and chained equations
+    # separate the continuous and binary features
+    numericData <- Andromeda::andromeda()
+    numericData$covariates <- trainData$covariateData$covariates %>%
+      dplyr::filter(.data$covariateId %in% continuousFeatures)
+    on.exit(numericData$covariates <- NULL, add = TRUE)
+    numericData$covariateRef <- trainData$covariateData$covariateRef %>%
+      dplyr::filter(.data$covariateId %in% continuousFeatures)
+    on.exit(numericData$covariateRef <- NULL, add = TRUE)
+
+    binary <- Andromeda::andromeda()
+    binary$covariates <- trainData$covariateData$covariates %>%
+      dplyr::filter(!.data$covariateId %in% !!continuousFeatures)
+    on.exit(binary$covariates <- NULL, add = TRUE)
+    binary$covariateRef <- trainData$covariateData$covariateRef %>%
+      dplyr::filter(!.data$covariateId %in% !!continuousFeatures)
+    on.exit(binary$covariateRef <- NULL, add = TRUE)
+
+    # initialize imputed values
+    allRowIds <- numericData$covariates %>%
+      dplyr::pull(.data$rowId) %>%
+      unique() %>%
+      sort()
+    allColumnIds <- numericData$covariates %>%
+      dplyr::pull(.data$covariateId) %>%
+      unique() %>%
+      sort()
+    completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds)
+    # now we have NAs for missing combinations
+    numericData$covariates <- merge(completeIds, numericData$covariates,
+      all.x = TRUE)
+
+    # get index of NAs for every feature to be imputed
+    numericData$missingIndex <- numericData$covariates %>%
+      dplyr::filter(is.na(.data$covariateValue)) %>%
+      dplyr::select(-c("covariateValue"))
+    on.exit(numericData$missingIndex <- NULL, add = TRUE)
+
+    # TODO choose initialization method
+    # let's impute by the mean initially
+    numericData$imputedCovariates <- numericData$covariates %>%
+      dplyr::group_by(.data$covariateId) %>%
+      dplyr::mutate(imputedValue = ifelse(is.na(.data$covariateValue),
+        mean(.data$covariateValue, na.rm = TRUE),
+        .data$covariateValue
+      ))
+
on.exit(numericData$imputedCovariates <- NULL, add = TRUE) + + # Main MICE algorithm + prevImputations <- list() + maxIter <- 5 # TODO check + varsToImpute <- numericData$missingIndex %>% + dplyr::pull(.data$covariateId) %>% + unique() + convergenceParameters <- list() + modelInfo <- list() + + for (iter in 1:maxIter) { + ParallelLogger::logInfo("Imputation iteration: ", iter) + currentImputations <- list() + + # TODO do in order from least missing to most missing + for (varId in varsToImpute) { + varName <- trainData$covariateData$covariateRef %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::pull(.data$covariateName) + ParallelLogger::logInfo("Imputing variable: ", varName) + numericData$y <- numericData$covariates %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::mutate(y = .data$covariateValue) %>% + dplyr::select("y", "rowId") + on.exit(numericData$y <- NULL, add = TRUE) + obsIdx <- which(!is.na(numericData$y %>% dplyr::pull(.data$y))) + missIdx <- which(is.na(numericData$y %>% dplyr::pull(.data$y))) + numericData$yObs <- numericData$y %>% + dplyr::filter(.data$rowId %in% !!allRowIds[obsIdx]) + on.exit(numericData$yObs <- NULL, add = TRUE) + + numericData$X <- numericData$imputedCovariates %>% + dplyr::filter(.data$covariateId != varId) %>% + dplyr::mutate(covariateValue = .data$imputedValue) %>% + dplyr::select(-c("imputedValue")) + on.exit(numericData$X <- NULL, add = TRUE) + Andromeda::appendToTable(numericData$X, binary$covariates) + numericData$xObs <- numericData$X %>% dplyr::filter(.data$rowId %in% !!allRowIds[obsIdx]) + on.exit(numericData$xObs <- NULL, add = TRUE) + numericData$xMiss <- numericData$X %>% dplyr::filter(.data$rowId %in% !!allRowIds[missIdx]) + on.exit(numericData$xMiss <- NULL, add = TRUE) + + pmmResults <- pmmFit(numericData, k = 5) + + # update imputations in data + numericData$imputedValues <- pmmResults$imputedValues + on.exit(numericData$imputedValues <- NULL, add = TRUE) + numericData$imputedCovariates <- numericData$imputedCovariates %>% + dplyr::left_join(numericData$imputedValues, + by = "rowId", + suffix = c("", ".new") + ) %>% + dplyr::mutate( + imputedValue = + dplyr::if_else(.data$covariateId == varId && + !is.na(.data$imputedValue.new), + .data$imputedValue.new, + .data$imputedValue + ) + ) %>% + dplyr::select(-"imputedValue.new") + + # store current imputations for convergence check + currentImputations[[as.character(varId)]] <- pmmResults$imputedValues$imputedValue + + # store pmm info for each variable + modelInfo[[as.character(varId)]] <- pmmResults$model + } + + # save values for convergence checking afterwards + + # store mean and variance of imputed values for each variable + # as well as average change from previous iteration + meanVector <- numeric(length(varsToImpute)) + varVector <- numeric(length(varsToImpute)) + idx <- 1 + for (varId in varsToImpute) { + currentImputation <- currentImputations[[as.character(varId)]] + meanVector[idx] <- mean(currentImputation) + varVector[idx] <- var(currentImputation) + idx <- idx + 1 + } + convergenceInfo <- list( + meanVector = meanVector, + varVector = varVector + ) + if (iter > 1) { + meanVarChange <- numeric(length(varsToImpute)) + for (varId in varsToImpute) { + prevImputation <- prevImputations[[as.character(varId)]] + currentImputation <- currentImputations[[as.character(varId)]] + meanVarChange <- c( + meanVarChange, + mean(abs(currentImputation - prevImputation)) + ) + } + convergenceInfo$meanVarChange <- meanVarChange + } + convergenceParameters[[iter]] <- 
convergenceInfo + + prevImputations <- currentImputations + } + + # calculate kde estimates of imputed and observed distributions per imputed variable + # and store in featureEngineeringSettings + kdeEstimates <- list() + for (varId in varsToImpute) { + varName <- trainData$covariateData$covariateRef %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::pull(.data$covariateName) + rows <- numericData$missingIndex %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::pull(.data$rowId) + imputedValues <- numericData$imputedCovariates %>% + dplyr::filter(.data$covariateId == varId, + .data$rowId %in% rows) %>% + dplyr::pull(.data$imputedValue) + observedValues <- numericData$covariates %>% + dplyr::filter(.data$covariateId == varId, + !is.na(.data$covariateValue)) %>% + dplyr::pull(.data$covariateValue) + kdeEstimates[[as.character(varId)]] <- list( + imputed = density(imputedValues), + observed = density(observedValues) + ) + } + + # add imputed values in data + Andromeda::appendToTable( + trainData$covariateData$covariates, + numericData$imputedCovariates %>% + dplyr::filter(is.na(.data$covariateValue)) %>% + dplyr::mutate(covariateValue = .data$imputedValue) %>% + dplyr::select(-c("imputedValue"))) + + + attr(featureEngineeringSettings, "missingInfo") <- + trainData$covariateData$missingInfo %>% + dplyr::collect() + attr(featureEngineeringSettings, "imputer") <- modelInfo + attr(featureEngineeringSettings, "kdeEstimates") <- kdeEstimates + done <- TRUE + } else { + # remove data with more than missingThreshold + trainData$covariateData$missingInfo <- attr(featureEngineeringSettings, + "missingInfo") + on.exit(trainData$covariateData$missingInfo <- NULL, add = TRUE) + trainData$covariateData$covariateIsBinary <- trainData$covariateData$covariateRef %>% + dplyr::select(.data$covariateId, analysisId) %>% + dplyr::inner_join( + trainData$covariateData$analysisRef %>% + dplyr::select(analysisId, .data$isBinary), + by = "analysisId" + ) %>% + dplyr::mutate(isBinary = .data$isBinary == 'Y') %>% + dplyr::select(.data$covariateId, .data$isBinary) %>% + dplyr::compute() + on.exit(trainData$covariateData$covariateIsBinary <- NULL, add = TRUE) + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(trainData$covariateData$missingInfo, by = "covariateId") %>% + dplyr::left_join(trainData$covariateData$covariateIsBinary, by = "covariateId") %>% + dplyr::filter( + (!is.na(.data$missing) && .data$missing <= featureEngineeringSettings$missingThreshold) || + (is.na(.data$missing) && .data$isBinary)) %>% + dplyr::select(-"missing", -"isBinary") + + continuousFeatures <- trainData$covariateData$analysisRef %>% + dplyr::filter(.data$isBinary == "N") %>% + dplyr::select("analysisId") %>% + dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>% + dplyr::pull(.data$covariateId) + numericData <- Andromeda::andromeda() + numericData$covariates <- trainData$covariateData$covariates %>% + dplyr::filter(.data$covariateId %in% continuousFeatures) + on.exit(numericData$covariates <- NULL, add = TRUE) + numericData$covariateRef <- trainData$covariateData$covariateRef %>% + dplyr::filter(.data$covariateId %in% continuousFeatures) + on.exit(numericData$covariateRef <- NULL, add = TRUE) + + binary <- Andromeda::andromeda() + binary$covariates <- trainData$covariateData$covariates %>% + dplyr::filter(!.data$covariateId %in% !!continuousFeatures) + on.exit(binary$covariates <- NULL, add = TRUE) + binary$covariateRef <- 
trainData$covariateData$covariateRef %>% + dplyr::filter(!.data$covariateId %in% !!continuousFeatures) + on.exit(binary$covariateRef <- NULL, add = TRUE) + # impute missing values + allRowIds <- numericData$covariates %>% + dplyr::pull(.data$rowId) %>% + unique() %>% + sort() + allColumnIds <- numericData$covariates %>% + dplyr::pull(.data$covariateId) %>% + unique() %>% + sort() + completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds) + # now we have NAs for missing combinations + numericData$covariates <- merge(completeIds, numericData$covariates, + all.x = TRUE) + + # get index of NAs for every feature to be imputed + numericData$missingIndex <- numericData$covariates %>% + dplyr::filter(is.na(.data$covariateValue)) %>% + dplyr::select(-c("covariateValue")) + on.exit(numericData$missingIndex <- NULL, add = TRUE) + + numericData$imputedCovariates <- numericData$covariates %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::mutate(imputedValue = .data$covariateValue) + on.exit(numericData$imputedCovariates <- NULL, add = TRUE) + + + varsToImpute <- numericData$missingIndex %>% + dplyr::pull(.data$covariateId) %>% + unique() + for (varId in varsToImpute) { + varName <- trainData$covariateData$covariateRef %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::pull(.data$covariateName) + ParallelLogger::logInfo("Imputing variable: ", varName) + numericData$y <- numericData$covariates %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::mutate(y = .data$covariateValue) %>% + dplyr::select("y", "rowId") + on.exit(numericData$y <- NULL, add = TRUE) + missIdx <- which(is.na(numericData$y %>% dplyr::pull(.data$y))) + numericData$X <- numericData$covariates %>% + dplyr::filter(.data$covariateId != varId) + on.exit(numericData$X <- NULL, add = TRUE) + Andromeda::appendToTable(numericData$X, binary$covariates) + numericData$xMiss <- numericData$X %>% dplyr::filter(.data$rowId %in% !!allRowIds[missIdx]) + on.exit(numericData$xMiss <- NULL, add = TRUE) + + imputer <- attr(featureEngineeringSettings, "imputer")[[as.character(varId)]] + pmmResults <- pmmPredict(numericData, k = 5, imputer) + + # update imputations in data + numericData$imputedValues <- pmmResults$imputedValues + on.exit(numericData$imputedValues <- NULL, add = TRUE) + numericData$imputedCovariates <- numericData$imputedCovariates %>% + dplyr::left_join(numericData$imputedValues, + by = "rowId", + suffix = c("", ".new") + ) %>% + dplyr::mutate( + imputedValue = + dplyr::if_else(.data$covariateId == varId && + !is.na(.data$imputedValue.new), + .data$imputedValue.new, + .data$imputedValue + ) + ) %>% + dplyr::select(-"imputedValue.new") + } + # add imputed values in data + Andromeda::appendToTable( + trainData$covariateData$covariates, + numericData$imputedCovariates %>% + dplyr::filter(is.na(.data$covariateValue)) %>% + dplyr::mutate(covariateValue = .data$imputedValue) %>% + dplyr::select(-c("imputedValue"))) + } + featureEngineering <- list( + funct = "iterativeImpute", + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + done = done + ) + ) + attr(trainData, "metaData")$featureEngineering[["iterativeImputer"]] <- + featureEngineering + return(trainData) +} + +#' @title Create Iterative Imputer settings +#' @description This function creates the settings for an iterative imputer +#' which first removes features with more than `missingThreshold` missing values +#' and then imputes the missing values iteratively using chained equations +#' @param missingThreshold The threshold 
for missing values to remove a feature
+#' @param method The method to use for imputation, currently only "pmm" is supported
+#' @return The settings for the iterative imputer of class `featureEngineeringSettings`
+#' @export
+createIterativeImputer <- function(missingThreshold = 0.3,
+                                   method = "pmm") {
+  featureEngineeringSettings <- list(
+    missingThreshold = missingThreshold,
+    method = method
+  )
+  attr(featureEngineeringSettings, "fun") <- "iterativeImpute"
+
+  class(featureEngineeringSettings) <- "featureEngineeringSettings"
+  return(featureEngineeringSettings)
+}
+
+#' @title Predictive mean matching using lasso
+#' @param data An andromeda object with the following fields:
+#' xObs: covariates table for observed data
+#' xMiss: covariates table for missing data
+#' yObs: outcome variable that we want to impute
+#' @param k The number of donors to use for matching (default 5)
+#' @keywords internal
+pmmFit <- function(data, k = 5) {
+  data$rowMap <- data$xObs %>%
+    dplyr::group_by(.data$rowId) %>%
+    dplyr::summarise() %>%
+    dplyr::mutate(oldRowId = .data$rowId,
+                  newRowId = dplyr::row_number()) %>%
+    dplyr::select(c("newRowId", "oldRowId")) %>%
+    dplyr::compute()
+  on.exit(data$rowMap <- NULL, add = TRUE)
+  data$colMap <- data$xObs %>%
+    dplyr::group_by(.data$covariateId) %>%
+    dplyr::summarise() %>%
+    dplyr::mutate(oldCovariateId = .data$covariateId,
+                  newCovariateId = dplyr::row_number()) %>%
+    dplyr::select(c("newCovariateId", "oldCovariateId"))
+  on.exit(data$colMap <- NULL, add = TRUE)
+
+  data$xObs <- data$xObs %>%
+    dplyr::left_join(data$rowMap, by = c("rowId" = "oldRowId")) %>%
+    dplyr::left_join(data$colMap, by = c("covariateId" = "oldCovariateId")) %>%
+    dplyr::select(rowId = "newRowId",
+                  covariateId = "newCovariateId",
+                  covariateValue = "covariateValue")
+
+  xObs <- Matrix::sparseMatrix(
+    i = data$xObs %>% dplyr::pull(.data$rowId),
+    j = data$xObs %>% dplyr::pull(.data$covariateId),
+    x = data$xObs %>% dplyr::pull(.data$covariateValue),
+    dims = c(data$rowMap %>% dplyr::pull(.data$newRowId) %>% max(),
+             data$colMap %>% dplyr::pull(.data$newCovariateId) %>% max())
+  )
+
+  fit <- glmnet::cv.glmnet(xObs, data$yObs %>%
+                             dplyr::pull(.data$y), alpha = 1, nfolds = 3)
+
+  # predict on both XObs and XMiss
+  predsObs <- predict(fit, xObs, fit$lambda.min)
+  data$xMiss <- data$xMiss %>%
+    dplyr::left_join(
+      data$xMiss %>%
+        dplyr::group_by(.data$covariateId) %>%
+        dplyr::summarise(n_unique = dplyr::n_distinct(.data$covariateValue),
+                         max = max(.data$covariateValue, na.rm = TRUE),
+                         min = min(.data$covariateValue, na.rm = TRUE),
+        ),
+      by = "covariateId"
+    ) %>%
+    dplyr::group_by(.data$covariateId) %>%
+    dplyr::mutate(
+      covariateValue = ifelse(.data$n_unique > 2 & (.data$max - .data$min) > 0,
+                              (.data$covariateValue - .data$min) / (.data$max - .data$min),
+                              .data$covariateValue)) %>%
+    dplyr::ungroup() %>%
+    dplyr::select(-c("n_unique", "min", "max"))
+  data$rowMapMiss <- data$xMiss %>%
+    dplyr::group_by(.data$rowId) %>%
+    dplyr::summarise() %>%
+    dplyr::mutate(oldRowId = .data$rowId,
+                  newRowId = dplyr::row_number()) %>%
+    dplyr::select(c("newRowId", "oldRowId")) %>%
+    dplyr::compute()
+  on.exit(data$rowMapMiss <- NULL, add = TRUE)
+  data$xMiss <- data$xMiss %>%
+    dplyr::left_join(data$rowMapMiss, by = c("rowId" = "oldRowId")) %>%
+    dplyr::left_join(data$colMap, by = c("covariateId" = "oldCovariateId")) %>%
+    dplyr::select(rowId = "newRowId",
+                  covariateId = "newCovariateId",
+                  covariateValue = "covariateValue")
+
+  xMiss <- Matrix::sparseMatrix(
+    i = data$xMiss %>%
dplyr::pull(.data$rowId),
+    j = data$xMiss %>% dplyr::pull(.data$covariateId),
+    x = data$xMiss %>% dplyr::pull(.data$covariateValue),
+    dims = c(
+      # use the row and column maps so xMiss gets the same column count as xObs
+      data$rowMapMiss %>% dplyr::pull(.data$newRowId) %>% max(),
+      data$colMap %>% dplyr::pull(.data$newCovariateId) %>% max())
+  )
+
+  predsMiss <- predict(fit, xMiss, fit$lambda.min)
+
+  # precompute mapping to use - straight from xId (row index) to
+  # covariateValue of donor
+  donorMapping <- data$rowMap %>%
+    dplyr::inner_join(data$yObs, by = c("oldRowId" = "rowId"), copy = TRUE) %>%
+    dplyr::pull(.data$y)
+  # for each missing value, find the k closest observed values
+  imputedValues <- numeric(nrow(xMiss))
+  for (j in 1:nrow(xMiss)) {
+    distances <- abs(predsObs - predsMiss[j])
+    donorIndices <- order(distances)[1:k]
+    donorValues <- donorMapping[donorIndices]
+    imputedValues[j] <- sample(donorValues, 1)
+  }
+
+  results <- list()
+  results$imputedValues <- data.frame(
+    rowId = data$rowMapMiss %>%
+      dplyr::pull(.data$oldRowId),
+    imputedValue = imputedValues
+  )
+  bestIndex <- which(fit$lambda == fit$lambda.min)
+  nonZero <- which(fit$glmnet.fit$beta[, bestIndex] != 0)
+  nonZeroCovariateIds <- data$colMap %>%
+    dplyr::filter(.data$newCovariateId %in% nonZero) %>%
+    dplyr::pull(.data$oldCovariateId)
+  results$model <- list(
+    intercept = as.numeric(fit$glmnet.fit$a0[bestIndex]),
+    coefficients = data.frame(
+      covariateId = nonZeroCovariateIds,
+      values = as.numeric(fit$glmnet.fit$beta[nonZero, bestIndex])),
+    predictions = data.frame(
+      rowId = data$rowMap %>%
+        dplyr::pull(.data$oldRowId),
+      prediction = as.numeric(predsObs)
+    )
+  )
+  return(results)
+}
+
+pmmPredict <- function(data, k = 5, imputer) {
+  data$coefficients <- imputer$coefficients
+  predictionMissing <- data$xMiss %>%
+    dplyr::inner_join(data$coefficients, by = "covariateId") %>%
+    dplyr::mutate(values = .data$covariateValue * .data$values) %>%
+    dplyr::group_by(.data$rowId) %>%
+    dplyr::summarise(value = sum(.data$values, na.rm = TRUE)) %>%
+    dplyr::select("rowId", "value")
+  predictionMissing <- as.data.frame(predictionMissing)
+  predictionMissing$value <- predictionMissing$value + imputer$intercept
+
+
+  # precompute mapping to use - straight from xId (row index) to
+  # covariateValue of donor
+  donorMapping <- imputer$predictions %>% dplyr::pull(.data$prediction)
+
+  # for each missing value, find the k closest observed values
+  nRows <- data$xMiss %>% dplyr::pull(.data$rowId) %>% dplyr::n_distinct()
+  imputedValues <- numeric(nRows)
+  predsObs <- imputer$predictions$prediction
+  for (j in 1:nRows) {
+    distances <- abs(predsObs - predictionMissing$value[j])
+    donorIndices <- order(distances)[1:k]
+    donorValues <- donorMapping[donorIndices]
+    imputedValues[j] <- sample(donorValues, 1)
+  }
+
+  results <- list()
+  results$imputedValues <- data.frame(
+    rowId = predictionMissing %>%
+      dplyr::pull(.data$rowId),
+    imputedValue = imputedValues
+  )
+  return(results)
+}
diff --git a/man/createIterativeImputer.Rd b/man/createIterativeImputer.Rd
new file mode 100644
index 000000000..f979bfa22
--- /dev/null
+++ b/man/createIterativeImputer.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/Imputation.R
+\name{createIterativeImputer}
+\alias{createIterativeImputer}
+\title{Create Iterative Imputer settings}
+\usage{
+createIterativeImputer(missingThreshold = 0.3, method = "pmm")
+}
+\arguments{
+\item{missingThreshold}{The threshold for missing values to remove a feature}
+
+\item{method}{The method to use for imputation, currently only "pmm" is
supported}
+}
+\value{
+The settings for the iterative imputer of class `featureEngineeringSettings`
+}
+\description{
+This function creates the settings for an iterative imputer
+which first removes features with more than `missingThreshold` missing values
+and then imputes the missing values iteratively using chained equations
+}
diff --git a/man/iterativeImpute.Rd b/man/iterativeImpute.Rd
new file mode 100644
index 000000000..f2f9d3750
--- /dev/null
+++ b/man/iterativeImpute.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/Imputation.R
+\name{iterativeImpute}
+\alias{iterativeImpute}
+\title{Imputation}
+\usage{
+iterativeImpute(trainData, featureEngineeringSettings, done = FALSE)
+}
+\arguments{
+\item{trainData}{The data to be imputed}
+
+\item{featureEngineeringSettings}{The settings for the imputation}
+
+\item{done}{Whether the imputation has already been done (bool)}
+}
+\value{
+The imputed data
+}
+\description{
+This function does single imputation with predictive mean matching
+}
From 05065378fa4d9f83ae97e13e2cfa1398712faf70 Mon Sep 17 00:00:00 2001
From: egillax
Date: Mon, 9 Dec 2024 10:39:35 +0100
Subject: [PATCH 5/8] docs

---
 man/createDefaultSplitSetting.Rd    | 2 +-
 man/setAdaBoost.Rd                  | 2 +-
 man/setCoxModel.Rd                  | 2 +-
 man/setDecisionTree.Rd              | 2 +-
 man/setGradientBoostingMachine.Rd   | 2 +-
 man/setIterativeHardThresholding.Rd | 6 +++---
 man/setLassoLogisticRegression.Rd   | 2 +-
 man/setLightGBM.Rd                  | 2 +-
 man/setMLP.Rd                       | 8 ++++----
 man/setRandomForest.Rd              | 2 +-
 man/setSVM.Rd                       | 4 ++--
 11 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/man/createDefaultSplitSetting.Rd b/man/createDefaultSplitSetting.Rd
index e63e74375..20e679bc6 100644
--- a/man/createDefaultSplitSetting.Rd
+++ b/man/createDefaultSplitSetting.Rd
@@ -9,7 +9,7 @@ test/validation/train sets using default splitting functions
 createDefaultSplitSetting(
   testFraction = 0.25,
   trainFraction = 0.75,
-  splitSeed = sample(1e+05, 1),
+  splitSeed = sample(100000, 1),
   nfold = 3,
   type = "stratified"
 )
diff --git a/man/setAdaBoost.Rd b/man/setAdaBoost.Rd
index 971948d00..adcc4f6f9 100644
--- a/man/setAdaBoost.Rd
+++ b/man/setAdaBoost.Rd
@@ -8,7 +8,7 @@ setAdaBoost(
   nEstimators = list(10, 50, 200),
   learningRate = list(1, 0.5, 0.1),
   algorithm = list("SAMME.R"),
-  seed = sample(1e+06, 1)
+  seed = sample(1000000, 1)
 )
 }
 \arguments{
diff --git a/man/setCoxModel.Rd b/man/setCoxModel.Rd
index 2d1a6cc77..a0cf90ded 100644
--- a/man/setCoxModel.Rd
+++ b/man/setCoxModel.Rd
@@ -12,7 +12,7 @@ setCoxModel(
   threads = -1,
   upperLimit = 20,
   lowerLimit = 0.01,
-  tolerance = 2e-07,
+  tolerance = 0.0000002,
   maxIterations = 3000
 )
 }
diff --git a/man/setDecisionTree.Rd b/man/setDecisionTree.Rd
index d977d0ee9..96f56e749 100644
--- a/man/setDecisionTree.Rd
+++ b/man/setDecisionTree.Rd
@@ -15,7 +15,7 @@ setDecisionTree(
   maxLeafNodes = list(NULL),
   minImpurityDecrease = list(10^-7),
   classWeight = list(NULL),
-  seed = sample(1e+06, 1)
+  seed = sample(1000000, 1)
 )
 }
 \arguments{
diff --git a/man/setGradientBoostingMachine.Rd b/man/setGradientBoostingMachine.Rd
index a42e2b71c..5f201ea24 100644
--- a/man/setGradientBoostingMachine.Rd
+++ b/man/setGradientBoostingMachine.Rd
@@ -14,7 +14,7 @@ setGradientBoostingMachine(
   scalePosWeight = 1,
   lambda = 1,
   alpha = 0,
-  seed = sample(1e+07, 1)
+  seed = sample(10000000, 1)
 )
 }
 \arguments{
diff --git a/man/setIterativeHardThresholding.Rd b/man/setIterativeHardThresholding.Rd
index 63d7fd257..f9d782081 100644
--- a/man/setIterativeHardThresholding.Rd
+++ 
b/man/setIterativeHardThresholding.Rd @@ -7,14 +7,14 @@ setIterativeHardThresholding( K = 10, penalty = "bic", - seed = sample(1e+05, 1), + seed = sample(100000, 1), exclude = c(), forceIntercept = F, fitBestSubset = FALSE, initialRidgeVariance = 10000, - tolerance = 1e-08, + tolerance = 0.00000001, maxIterations = 10000, - threshold = 1e-06, + threshold = 0.000001, delta = 0 ) } diff --git a/man/setLassoLogisticRegression.Rd b/man/setLassoLogisticRegression.Rd index 533aa53b3..f89881019 100644 --- a/man/setLassoLogisticRegression.Rd +++ b/man/setLassoLogisticRegression.Rd @@ -13,7 +13,7 @@ setLassoLogisticRegression( forceIntercept = F, upperLimit = 20, lowerLimit = 0.01, - tolerance = 2e-06, + tolerance = 0.000002, maxIterations = 3000, priorCoefs = NULL ) diff --git a/man/setLightGBM.Rd b/man/setLightGBM.Rd index 6380df304..41225ce01 100644 --- a/man/setLightGBM.Rd +++ b/man/setLightGBM.Rd @@ -16,7 +16,7 @@ setLightGBM( lambdaL2 = c(0), scalePosWeight = 1, isUnbalance = FALSE, - seed = sample(1e+07, 1) + seed = sample(10000000, 1) ) } \arguments{ diff --git a/man/setMLP.Rd b/man/setMLP.Rd index d6dea6254..7c72e5d73 100644 --- a/man/setMLP.Rd +++ b/man/setMLP.Rd @@ -8,14 +8,14 @@ setMLP( hiddenLayerSizes = list(c(100), c(20)), activation = list("relu"), solver = list("adam"), - alpha = list(0.3, 0.01, 1e-04, 1e-06), + alpha = list(0.3, 0.01, 0.0001, 0.000001), batchSize = list("auto"), learningRate = list("constant"), learningRateInit = list(0.001), powerT = list(0.5), maxIter = list(200, 100), shuffle = list(TRUE), - tol = list(1e-04), + tol = list(0.0001), warmStart = list(TRUE), momentum = list(0.9), nesterovsMomentum = list(TRUE), @@ -23,9 +23,9 @@ setMLP( validationFraction = list(0.1), beta1 = list(0.9), beta2 = list(0.999), - epsilon = list(1e-08), + epsilon = list(0.00000001), nIterNoChange = list(10), - seed = sample(1e+05, 1) + seed = sample(100000, 1) ) } \arguments{ diff --git a/man/setRandomForest.Rd b/man/setRandomForest.Rd index 360b532ae..dbe7ad0f5 100644 --- a/man/setRandomForest.Rd +++ b/man/setRandomForest.Rd @@ -19,7 +19,7 @@ setRandomForest( oobScore = list(FALSE), nJobs = list(NULL), classWeight = list(NULL), - seed = sample(1e+05, 1) + seed = sample(100000, 1) ) } \arguments{ diff --git a/man/setSVM.Rd b/man/setSVM.Rd index 2def0720b..f21a202e4 100644 --- a/man/setSVM.Rd +++ b/man/setSVM.Rd @@ -8,13 +8,13 @@ setSVM( C = list(1, 0.9, 2, 0.1), kernel = list("rbf"), degree = list(1, 3, 5), - gamma = list("scale", 1e-04, 3e-05, 0.001, 0.01, 0.25), + gamma = list("scale", 0.0001, 0.00003, 0.001, 0.01, 0.25), coef0 = list(0), shrinking = list(TRUE), tol = list(0.001), classWeight = list(NULL), cacheSize = 500, - seed = sample(1e+05, 1) + seed = sample(100000, 1) ) } \arguments{ From c17ae5eb4f10cca2be48b868ed0c31b785b42c91 Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 12 Dec 2024 11:59:03 +0100 Subject: [PATCH 6/8] new docs --- man/createNormalization.Rd | 17 +++++++++++++++++ man/createRareFeatureRemover.Rd | 18 ++++++++++++++++++ man/deDuplicateCovariateData.Rd | 22 ++++++++++++++++++++++ man/minMaxNormalize.Rd | 24 ++++++++++++++++++++++++ man/pmmFit.Rd | 20 ++++++++++++++++++++ man/removeRareFeatures.Rd | 21 +++++++++++++++++++++ man/robustNormalize.Rd | 27 +++++++++++++++++++++++++++ 7 files changed, 149 insertions(+) create mode 100644 man/createNormalization.Rd create mode 100644 man/createRareFeatureRemover.Rd create mode 100644 man/deDuplicateCovariateData.Rd create mode 100644 man/minMaxNormalize.Rd create mode 100644 man/pmmFit.Rd create mode 100644 
man/removeRareFeatures.Rd
 create mode 100644 man/robustNormalize.Rd

diff --git a/man/createNormalization.Rd b/man/createNormalization.Rd
new file mode 100644
index 000000000..fa20c6df0
--- /dev/null
+++ b/man/createNormalization.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/PreprocessingData.R
+\name{createNormalization}
+\alias{createNormalization}
+\title{Create the settings for normalizing the data}
+\usage{
+createNormalization(type = "minmax")
+}
+\arguments{
+\item{type}{The type of normalization to use, either "minmax" or "robust"}
+}
+\value{
+An object of class \code{featureEngineeringSettings}
+}
+\description{
+Create the settings for normalizing the data
+}
diff --git a/man/createRareFeatureRemover.Rd b/man/createRareFeatureRemover.Rd
new file mode 100644
index 000000000..c1e70b87d
--- /dev/null
+++ b/man/createRareFeatureRemover.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/PreprocessingData.R
+\name{createRareFeatureRemover}
+\alias{createRareFeatureRemover}
+\title{Create the settings for removing rare features}
+\usage{
+createRareFeatureRemover(ratio = 0.001)
+}
+\arguments{
+\item{ratio}{The minimum fraction of the training data that must have a
+feature for it to be included}
+}
+\value{
+An object of class \code{featureEngineeringSettings}
+}
+\description{
+Create the settings for removing rare features
+}
diff --git a/man/deDuplicateCovariateData.Rd b/man/deDuplicateCovariateData.Rd
new file mode 100644
index 000000000..e1f94213d
--- /dev/null
+++ b/man/deDuplicateCovariateData.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ExternalValidatePlp.R
+\name{deDuplicateCovariateData}
+\alias{deDuplicateCovariateData}
+\title{deDuplicateCovariateData - Remove duplicate covariate data.
+When downloading data with multiple different covariateSettings, sometimes
+there will be duplicated analysisIds which need to be removed}
+\usage{
+deDuplicateCovariateData(covariateData)
+}
+\arguments{
+\item{covariateData}{The covariate data Andromeda object}
+}
+\value{
+The deduplicated covariate data
+}
+\description{
+deDuplicateCovariateData - Remove duplicate covariate data.
+When downloading data with multiple different covariateSettings, sometimes
+there will be duplicated analysisIds which need to be removed
+}
+\keyword{internal}
diff --git a/man/minMaxNormalize.Rd b/man/minMaxNormalize.Rd
new file mode 100644
index 000000000..c46242937
--- /dev/null
+++ b/man/minMaxNormalize.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/PreprocessingData.R
+\name{minMaxNormalize}
+\alias{minMaxNormalize}
+\title{A function that normalizes continuous features to have values between 0 and 1}
+\usage{
+minMaxNormalize(trainData, featureEngineeringSettings, normalized = FALSE)
+}
+\arguments{
+\item{trainData}{The training data to be normalized}
+
+\item{featureEngineeringSettings}{The settings for the normalization}
+
+\item{normalized}{Whether the data has already been normalized (bool)}
+}
+\value{
+The normalized data
+}
+\description{
+A function that normalizes continuous features to have values between 0 and 1
+}
+\details{
+uses (value - min) / (max - min) to normalize the data
+}
diff --git a/man/pmmFit.Rd b/man/pmmFit.Rd
new file mode 100644
index 000000000..3cadc1df1
--- /dev/null
+++ b/man/pmmFit.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit 
documentation in R/Imputation.R
+\name{pmmFit}
+\alias{pmmFit}
+\title{Predictive mean matching using lasso}
+\usage{
+pmmFit(data, k = 5)
+}
+\arguments{
+\item{data}{An andromeda object with the following fields:
+xObs: covariates table for observed data
+xMiss: covariates table for missing data
+yObs: outcome variable that we want to impute}
+
+\item{k}{The number of donors to use for matching (default 5)}
+}
+\description{
+Predictive mean matching using lasso
+}
+\keyword{internal}
diff --git a/man/removeRareFeatures.Rd b/man/removeRareFeatures.Rd
new file mode 100644
index 000000000..0a62ea546
--- /dev/null
+++ b/man/removeRareFeatures.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/PreprocessingData.R
+\name{removeRareFeatures}
+\alias{removeRareFeatures}
+\title{A function that removes rare features from the data}
+\usage{
+removeRareFeatures(trainData, featureEngineeringSettings, findRare = FALSE)
+}
+\arguments{
+\item{trainData}{The data from which to remove rare features}
+
+\item{featureEngineeringSettings}{The settings for removing rare features}
+
+\item{findRare}{Whether to first find the rare features or only remove previously found ones (bool)}
+}
+\value{
+The data with rare features removed
+}
+\description{
+A function that removes rare features from the data
+}
+\details{
+removes features that are present in less than a certain fraction of the population
+}
diff --git a/man/robustNormalize.Rd b/man/robustNormalize.Rd
new file mode 100644
index 000000000..cc3d558e8
--- /dev/null
+++ b/man/robustNormalize.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/PreprocessingData.R
+\name{robustNormalize}
+\alias{robustNormalize}
+\title{A function that normalizes continuous features by the interquartile range and forces
+the resulting values to be between -3 and 3 with f(x) = x / sqrt(1 + (x/3)^2)}
+\usage{
+robustNormalize(trainData, featureEngineeringSettings, normalized = FALSE)
+}
+\arguments{
+\item{trainData}{The training data to be normalized}
+
+\item{featureEngineeringSettings}{The settings for the normalization}
+
+\item{normalized}{Whether the data has already been normalized (bool)}
+}
+\value{
+The normalized data
+}
+\description{
+A function that normalizes continuous features by the interquartile range and forces
+the resulting values to be between -3 and 3 with f(x) = x / sqrt(1 + (x/3)^2)
+}
+\details{
+uses (value - median) / iqr to normalize the data and then
+applies the function f(x) = x / sqrt(1 + (x/3)^2) to the normalized values.
+This forces the values to be between -3 and 3 while preserving the relative
+ordering of the values.
+based on https://arxiv.org/abs/2407.04491 for more details +} From 362898c9d40349e87d1bbd3c078ecc8c950e9e2c Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 12 Dec 2024 14:07:28 +0100 Subject: [PATCH 7/8] refactoring and fix dependancies --- DESCRIPTION | 1 + R/Imputation.R | 605 +++++++++++++++++++++++------------------- R/PreprocessingData.R | 14 +- 3 files changed, 347 insertions(+), 273 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a45cfa471..0f9c01cc7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,6 +50,7 @@ Suggests: BigKnn (>= 1.0.0), devtools, Eunomia, + glmnet, IterativeHardThresholding, knitr, markdown, diff --git a/R/Imputation.R b/R/Imputation.R index 5bfa23380..227526c1b 100644 --- a/R/Imputation.R +++ b/R/Imputation.R @@ -23,263 +23,86 @@ #' @return The imputed data iterativeImpute <- function(trainData, featureEngineeringSettings, done = FALSE) { if (!done) { - total <- trainData$covariateData$covariates %>% - dplyr::summarise(total = dplyr::n_distinct(.data$rowId)) %>% - dplyr::pull() - continuousFeatures <- trainData$covariateData$analysisRef %>% - dplyr::filter(.data$isBinary == "N") %>% - dplyr::select("analysisId") %>% - dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>% - dplyr::pull(.data$covariateId) - - trainData$covariateData$missingInfo <- trainData$covariateData$covariates %>% - dplyr::filter(.data$covariateId %in% continuousFeatures) %>% - dplyr::group_by(.data$covariateId) %>% - dplyr::summarise(counts = dplyr::n()) %>% - dplyr::collect() %>% # necessary because of integer division in sqlite - dplyr::mutate(missing = 1 - .data$counts / total) %>% - dplyr::select(c("covariateId", "missing")) + missingInfo <- extractMissingInfo(trainData) + trainData$covariateData$missingInfo <- missingInfo$missingInfo + continuousFeatures <- missingInfo$continuousFeatures on.exit(trainData$covariateData$missingInfo <- NULL, add = TRUE) trainData$covariateData$covariates <- trainData$covariateData$covariates %>% dplyr::left_join(trainData$covariateData$missingInfo, by = "covariateId") %>% - dplyr::filter(is.na(.data$missing) || + dplyr::filter(is.na(.data$missing) || .data$missing <= featureEngineeringSettings$missingThreshold) %>% dplyr::select(-"missing") - # now I want to do imputation using predictive mean matching and chained equations # separate the continuous and binary features - numericData <- Andromeda::andromeda() - numericData$covariates <- trainData$covariateData$covariates %>% - dplyr::filter(.data$covariateId %in% continuousFeatures) - on.exit(numericData$covariates <- NULL, add = TRUE) - numericData$covariateRef <- trainData$covariateData$covariateRef %>% - dplyr::filter(.data$covariateId %in% continuousFeatures) - on.exit(numericData$covariateRef <- NULL, add = TRUE) - - binary <- Andromeda::andromeda() - binary$covariates <- trainData$covariateData$covariates %>% - dplyr::filter(!.data$covariateId %in% !!continuousFeatures) - on.exit(binary$covariates <- NULL, add = TRUE) - binary$covariateRef <- trainData$covariateData$covariateRef %>% - dplyr::filter(!.data$covariateId %in% !!continuousFeatures) - on.exit(binary$covariateRef <- NULL, add = TRUE) - - # initialize imputed values - allRowIds <- numericData$covariates %>% - dplyr::pull(.data$rowId) %>% - unique() %>% - sort() - allColumnIds <- numericData$covariates %>% - dplyr::pull(.data$covariateId) %>% - unique() %>% - sort() - completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds) - # now we have NAs for missing combinations - 
numericData$covariates <- merge(completeIds, numericData$covariates, - all.x = TRUE) - - # get index of NAs for every feature to be imputed - numericData$missingIndex <- numericData$covariates %>% - dplyr::filter(is.na(.data$covariateValue)) %>% - dplyr::select(-c("covariateValue")) - on.exit(numericData$missingIndex <- NULL, add = TRUE) - - # TODO choose initialization method - # let's impute by the mean initially - numericData$imputedCovariates <- numericData$covariates %>% - dplyr::group_by(.data$covariateId) %>% - dplyr::mutate(imputedValue = ifelse(is.na(.data$covariateValue), - mean(.data$covariateValue, na.rm = TRUE), - .data$covariateValue - )) - on.exit(numericData$imputedCovariates <- NULL, add = TRUE) - - # Main MICE algorithm - prevImputations <- list() - maxIter <- 5 # TODO check - varsToImpute <- numericData$missingIndex %>% - dplyr::pull(.data$covariateId) %>% - unique() - convergenceParameters <- list() - modelInfo <- list() - - for (iter in 1:maxIter) { - ParallelLogger::logInfo("Imputation iteration: ", iter) - currentImputations <- list() - - # TODO do in order from least missing to most missing - for (varId in varsToImpute) { - varName <- trainData$covariateData$covariateRef %>% - dplyr::filter(.data$covariateId == varId) %>% - dplyr::pull(.data$covariateName) - ParallelLogger::logInfo("Imputing variable: ", varName) - numericData$y <- numericData$covariates %>% - dplyr::filter(.data$covariateId == varId) %>% - dplyr::mutate(y = .data$covariateValue) %>% - dplyr::select("y", "rowId") - on.exit(numericData$y <- NULL, add = TRUE) - obsIdx <- which(!is.na(numericData$y %>% dplyr::pull(.data$y))) - missIdx <- which(is.na(numericData$y %>% dplyr::pull(.data$y))) - numericData$yObs <- numericData$y %>% - dplyr::filter(.data$rowId %in% !!allRowIds[obsIdx]) - on.exit(numericData$yObs <- NULL, add = TRUE) - - numericData$X <- numericData$imputedCovariates %>% - dplyr::filter(.data$covariateId != varId) %>% - dplyr::mutate(covariateValue = .data$imputedValue) %>% - dplyr::select(-c("imputedValue")) - on.exit(numericData$X <- NULL, add = TRUE) - Andromeda::appendToTable(numericData$X, binary$covariates) - numericData$xObs <- numericData$X %>% dplyr::filter(.data$rowId %in% !!allRowIds[obsIdx]) - on.exit(numericData$xObs <- NULL, add = TRUE) - numericData$xMiss <- numericData$X %>% dplyr::filter(.data$rowId %in% !!allRowIds[missIdx]) - on.exit(numericData$xMiss <- NULL, add = TRUE) - - pmmResults <- pmmFit(numericData, k = 5) - - # update imputations in data - numericData$imputedValues <- pmmResults$imputedValues - on.exit(numericData$imputedValues <- NULL, add = TRUE) - numericData$imputedCovariates <- numericData$imputedCovariates %>% - dplyr::left_join(numericData$imputedValues, - by = "rowId", - suffix = c("", ".new") - ) %>% - dplyr::mutate( - imputedValue = - dplyr::if_else(.data$covariateId == varId && - !is.na(.data$imputedValue.new), - .data$imputedValue.new, - .data$imputedValue - ) - ) %>% - dplyr::select(-"imputedValue.new") - - # store current imputations for convergence check - currentImputations[[as.character(varId)]] <- pmmResults$imputedValues$imputedValue - - # store pmm info for each variable - modelInfo[[as.character(varId)]] <- pmmResults$model - } - - # save values for convergence checking afterwards - - # store mean and variance of imputed values for each variable - # as well as average change from previous iteration - meanVector <- numeric(length(varsToImpute)) - varVector <- numeric(length(varsToImpute)) - idx <- 1 - for (varId in varsToImpute) { - 
currentImputation <- currentImputations[[as.character(varId)]] - meanVector[idx] <- mean(currentImputation) - varVector[idx] <- var(currentImputation) - idx <- idx + 1 - } - convergenceInfo <- list( - meanVector = meanVector, - varVector = varVector - ) - if (iter > 1) { - meanVarChange <- numeric(length(varsToImpute)) - for (varId in varsToImpute) { - prevImputation <- prevImputations[[as.character(varId)]] - currentImputation <- currentImputations[[as.character(varId)]] - meanVarChange <- c( - meanVarChange, - mean(abs(currentImputation - prevImputation)) - ) - } - convergenceInfo$meanVarChange <- meanVarChange - } - convergenceParameters[[iter]] <- convergenceInfo - - prevImputations <- currentImputations - } - - # calculate kde estimates of imputed and observed distributions per imputed variable - # and store in featureEngineeringSettings - kdeEstimates <- list() - for (varId in varsToImpute) { - varName <- trainData$covariateData$covariateRef %>% - dplyr::filter(.data$covariateId == varId) %>% - dplyr::pull(.data$covariateName) - rows <- numericData$missingIndex %>% - dplyr::filter(.data$covariateId == varId) %>% - dplyr::pull(.data$rowId) - imputedValues <- numericData$imputedCovariates %>% - dplyr::filter(.data$covariateId == varId, - .data$rowId %in% rows) %>% - dplyr::pull(.data$imputedValue) - observedValues <- numericData$covariates %>% - dplyr::filter(.data$covariateId == varId, - !is.na(.data$covariateValue)) %>% - dplyr::pull(.data$covariateValue) - kdeEstimates[[as.character(varId)]] <- list( - imputed = density(imputedValues), - observed = density(observedValues) - ) - } - + featureData <- separateFeatures(trainData, continuousFeatures) + numericData <- featureData[[1]] + binary <- featureData[[2]] + on.exit(numericData <- NULL, add = TRUE) + on.exit(binary <- NULL, add = TRUE) + + numericData <- initializeImputation(numericData, "mean") # add imputed values in data + iterativeImputeResults <- iterativeChainedImpute(numericData, + binary, + trainData, + featureEngineeringSettings, + direction = "ascending", + iterations = 5 + ) + Andromeda::appendToTable( trainData$covariateData$covariates, - numericData$imputedCovariates %>% - dplyr::filter(is.na(.data$covariateValue)) %>% + iterativeImputeResults$numericData$imputedCovariates %>% + dplyr::filter(is.na(.data$covariateValue)) %>% dplyr::mutate(covariateValue = .data$imputedValue) %>% - dplyr::select(-c("imputedValue"))) - - - attr(featureEngineeringSettings, "missingInfo") <- + dplyr::select(-c("imputedValue")) + ) + + + attr(featureEngineeringSettings, "missingInfo") <- trainData$covariateData$missingInfo %>% dplyr::collect() - attr(featureEngineeringSettings, "imputer") <- modelInfo - attr(featureEngineeringSettings, "kdeEstimates") <- kdeEstimates + attr(featureEngineeringSettings, "imputer") <- iterativeImputeResults$modelInfo + attr(featureEngineeringSettings, "kdeEstimates") <- iterativeImputeResults$kdeEstimates done <- TRUE } else { # remove data with more than missingThreshold - trainData$covariateData$missingInfo <- attr(featureEngineeringSettings, - "missingInfo") + trainData$covariateData$missingInfo <- attr( + featureEngineeringSettings, + "missingInfo" + ) on.exit(trainData$covariateData$missingInfo <- NULL, add = TRUE) trainData$covariateData$covariateIsBinary <- trainData$covariateData$covariateRef %>% - dplyr::select(.data$covariateId, analysisId) %>% + dplyr::select("covariateId", "analysisId") %>% dplyr::inner_join( trainData$covariateData$analysisRef %>% - dplyr::select(analysisId, .data$isBinary), + 
dplyr::select("analysisId", "isBinary"), by = "analysisId" ) %>% - dplyr::mutate(isBinary = .data$isBinary == 'Y') %>% + dplyr::mutate(isBinary = .data$isBinary == "Y") %>% dplyr::select(.data$covariateId, .data$isBinary) %>% dplyr::compute() on.exit(trainData$covariateData$covariateIsBinary <- NULL, add = TRUE) trainData$covariateData$covariates <- trainData$covariateData$covariates %>% dplyr::left_join(trainData$covariateData$missingInfo, by = "covariateId") %>% - dplyr::left_join(trainData$covariateData$covariateIsBinary, by = "covariateId") %>% + dplyr::left_join(trainData$covariateData$covariateIsBinary, by = "covariateId") %>% dplyr::filter( (!is.na(.data$missing) && .data$missing <= featureEngineeringSettings$missingThreshold) || - (is.na(.data$missing) && .data$isBinary)) %>% + (is.na(.data$missing) && .data$isBinary) + ) %>% dplyr::select(-"missing", -"isBinary") - + continuousFeatures <- trainData$covariateData$analysisRef %>% dplyr::filter(.data$isBinary == "N") %>% dplyr::select("analysisId") %>% dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>% dplyr::pull(.data$covariateId) - numericData <- Andromeda::andromeda() - numericData$covariates <- trainData$covariateData$covariates %>% - dplyr::filter(.data$covariateId %in% continuousFeatures) - on.exit(numericData$covariates <- NULL, add = TRUE) - numericData$covariateRef <- trainData$covariateData$covariateRef %>% - dplyr::filter(.data$covariateId %in% continuousFeatures) - on.exit(numericData$covariateRef <- NULL, add = TRUE) - - binary <- Andromeda::andromeda() - binary$covariates <- trainData$covariateData$covariates %>% - dplyr::filter(!.data$covariateId %in% !!continuousFeatures) - on.exit(binary$covariates <- NULL, add = TRUE) - binary$covariateRef <- trainData$covariateData$covariateRef %>% - dplyr::filter(!.data$covariateId %in% !!continuousFeatures) - on.exit(binary$covariateRef <- NULL, add = TRUE) + featureData <- separateFeatures(trainData, continuousFeatures) + numericData <- featureData[[1]] + binary <- featureData[[2]] + on.exit(numericData <- NULL, add = TRUE) + on.exit(binary <- NULL, add = TRUE) # impute missing values allRowIds <- numericData$covariates %>% dplyr::pull(.data$rowId) %>% @@ -292,20 +115,21 @@ iterativeImpute <- function(trainData, featureEngineeringSettings, done = FALSE) completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds) # now we have NAs for missing combinations numericData$covariates <- merge(completeIds, numericData$covariates, - all.x = TRUE) - + all.x = TRUE + ) + # get index of NAs for every feature to be imputed numericData$missingIndex <- numericData$covariates %>% dplyr::filter(is.na(.data$covariateValue)) %>% dplyr::select(-c("covariateValue")) on.exit(numericData$missingIndex <- NULL, add = TRUE) - + numericData$imputedCovariates <- numericData$covariates %>% - dplyr::group_by(.data$covariateId) %>% + dplyr::group_by(.data$covariateId) %>% dplyr::mutate(imputedValue = .data$covariateValue) on.exit(numericData$imputedCovariates <- NULL, add = TRUE) - - + + varsToImpute <- numericData$missingIndex %>% dplyr::pull(.data$covariateId) %>% unique() @@ -324,12 +148,13 @@ iterativeImpute <- function(trainData, featureEngineeringSettings, done = FALSE) dplyr::filter(.data$covariateId != varId) on.exit(numericData$X <- NULL, add = TRUE) Andromeda::appendToTable(numericData$X, binary$covariates) - numericData$xMiss <- numericData$X %>% dplyr::filter(.data$rowId %in% !!allRowIds[missIdx]) + numericData$xMiss <- numericData$X %>% + 
dplyr::filter(.data$rowId %in% !!allRowIds[missIdx]) on.exit(numericData$xMiss <- NULL, add = TRUE) - + imputer <- attr(featureEngineeringSettings, "imputer")[[as.character(varId)]] pmmResults <- pmmPredict(numericData, k = 5, imputer) - + # update imputations in data numericData$imputedValues <- pmmResults$imputedValues on.exit(numericData$imputedValues <- NULL, add = TRUE) @@ -351,19 +176,20 @@ iterativeImpute <- function(trainData, featureEngineeringSettings, done = FALSE) # add imputed values in data Andromeda::appendToTable( trainData$covariateData$covariates, - numericData$imputedCovariates %>% - dplyr::filter(is.na(.data$covariateValue)) %>% + numericData$imputedCovariates %>% + dplyr::filter(is.na(.data$covariateValue)) %>% dplyr::mutate(covariateValue = .data$imputedValue) %>% - dplyr::select(-c("imputedValue"))) + dplyr::select(-c("imputedValue")) + ) } featureEngineering <- list( funct = "iterativeImpute", settings = list( featureEngineeringSettings = featureEngineeringSettings, done = done + ) ) - ) - attr(trainData, "metaData")$featureEngineering[["iterativeImputer"]] <- + attr(trainData, "metaData")$featureEngineering[["iterativeImputer"]] <- featureEngineering return(trainData) } @@ -377,7 +203,7 @@ iterativeImpute <- function(trainData, featureEngineeringSettings, done = FALSE) #' @return The settings for the single imputer of class `featureEngineeringSettings` #' @export createIterativeImputer <- function(missingThreshold = 0.3, - method = "pmm") { + method = "pmm") { featureEngineeringSettings <- list( missingThreshold = missingThreshold, method = method @@ -396,38 +222,47 @@ createIterativeImputer <- function(missingThreshold = 0.3, #' @param k The number of donors to use for matching (default 5) #' @keywords internal pmmFit <- function(data, k = 5) { + rlang::check_installed("glmnet") data$rowMap <- data$xObs %>% dplyr::group_by(.data$rowId) %>% dplyr::summarise() %>% - dplyr::mutate(oldRowId = .data$rowId, - newRowId = dplyr::row_number()) %>% + dplyr::mutate( + oldRowId = .data$rowId, + newRowId = dplyr::row_number() + ) %>% dplyr::select(c("newRowId", "oldRowId")) %>% dplyr::compute() on.exit(data$rowMap <- NULL, add = TRUE) data$colMap <- data$xObs %>% dplyr::group_by(.data$covariateId) %>% dplyr::summarise() %>% - dplyr::mutate(oldCovariateId = .data$covariateId, - newCovariateId = dplyr::row_number()) %>% + dplyr::mutate( + oldCovariateId = .data$covariateId, + newCovariateId = dplyr::row_number() + ) %>% dplyr::select(c("newCovariateId", "oldCovariateId")) on.exit(data$colMap <- NULL, add = TRUE) - + data$xObs <- data$xObs %>% dplyr::left_join(data$rowMap, by = c("rowId" = "oldRowId")) %>% dplyr::left_join(data$colMap, by = c("covariateId" = "oldCovariateId")) %>% - dplyr::select(rowId = "newRowId", - covariateId = "newCovariateId", - covariateValue = "covariateValue") - + dplyr::select( + rowId = "newRowId", + covariateId = "newCovariateId", + covariateValue = "covariateValue" + ) + xObs <- Matrix::sparseMatrix( i = data$xObs %>% dplyr::pull(.data$rowId), j = data$xObs %>% dplyr::pull(.data$covariateId), x = data$xObs %>% dplyr::pull(.data$covariateValue), - dims = c(data$rowMap %>% dplyr::pull(.data$newRowId) %>% max(), - data$colMap %>% dplyr::pull(.data$newCovariateId) %>% max()) + dims = c( + data$rowMap %>% dplyr::pull(.data$newRowId) %>% max(), + data$colMap %>% dplyr::pull(.data$newCovariateId) %>% max() ) - - fit <- glmnet::cv.glmnet(xObs, data$yObs %>% + ) + + fit <- glmnet::cv.glmnet(xObs, data$yObs %>% dplyr::pull(.data$y), alpha = 1, nfolds = 3) # 
predict on both XObs and XMiss
@@ -436,42 +271,50 @@ pmmFit <- function(data, k = 5) {
     dplyr::left_join(
       data$xMiss %>%
         dplyr::group_by(.data$covariateId) %>%
-        dplyr::summarise(n_unique = dplyr::n_distinct(.data$covariateValue),
-          max = max(.data$covariateValue, na.rm = TRUE),
-          min = min(.data$covariateValue, na.rm = TRUE),
-        ),
+        dplyr::summarise(
+          n_unique = dplyr::n_distinct(.data$covariateValue),
+          max = max(.data$covariateValue, na.rm = TRUE),
+          min = min(.data$covariateValue, na.rm = TRUE),
+        ),
       by = "covariateId"
-    ) %>%
+    ) %>%
     dplyr::group_by(.data$covariateId) %>%
     dplyr::mutate(
       covariateValue = ifelse(.data$n_unique > 2 & (.data$max - .data$min) > 0,
-        (.data$covariateValue - .data$min) / (.data$max - .data$min),
-        .data$covariateValue)) %>%
+        (.data$covariateValue - .data$min) / (.data$max - .data$min),
+        .data$covariateValue
+      )
+    ) %>%
     dplyr::ungroup() %>%
     dplyr::select(-c("n_unique", "min", "max"))
   data$rowMapMiss <- data$xMiss %>%
     dplyr::group_by(.data$rowId) %>%
     dplyr::summarise() %>%
-    dplyr::mutate(oldRowId = .data$rowId,
-      newRowId = dplyr::row_number()) %>%
+    dplyr::mutate(
+      oldRowId = .data$rowId,
+      newRowId = dplyr::row_number()
+    ) %>%
     dplyr::select(c("newRowId", "oldRowId")) %>%
     dplyr::compute()
   on.exit(data$rowMapMiss <- NULL, add = TRUE)
   data$xMiss <- data$xMiss %>%
     dplyr::left_join(data$rowMapMiss, by = c("rowId" = "oldRowId")) %>%
     dplyr::left_join(data$colMap, by = c("covariateId" = "oldCovariateId")) %>%
-    dplyr::select(rowId = "newRowId",
-      covariateId = "newCovariateId",
-      covariateValue = "covariateValue")
-
+    dplyr::select(
+      rowId = "newRowId",
+      covariateId = "newCovariateId",
+      covariateValue = "covariateValue"
+    )
+
   xMiss <- Matrix::sparseMatrix(
     i = data$xMiss %>% dplyr::pull(.data$rowId),
     j = data$xMiss %>% dplyr::pull(.data$covariateId),
     x = data$xMiss %>% dplyr::pull(.data$covariateValue),
     dims = c(
       data$xMiss %>% dplyr::pull(.data$rowId) %>% max(),
-      data$xMiss %>% dplyr::pull(.data$covariateId) %>% max())
+      data$xMiss %>% dplyr::pull(.data$covariateId) %>% max()
     )
+  )
 
   predsMiss <- predict(fit, xMiss, fit$lambda.min)
 
@@ -488,7 +331,7 @@ pmmFit <- function(data, k = 5) {
     donorValues <- donorMapping[donorIndices]
     imputedValues[j] <- sample(donorValues, 1)
   }
-  
+
   results <- list()
   results$imputedValues <- data.frame(
     rowId = data$rowMapMiss %>%
@@ -504,7 +347,8 @@
     intercept = as.numeric(fit$glmnet.fit$a0[bestIndex]),
     coefficients = data.frame(
       covariateId = nonZeroCovariateIds,
-      values = as.numeric(fit$glmnet.fit$beta[nonZero, bestIndex])),
+      values = as.numeric(fit$glmnet.fit$beta[nonZero, bestIndex])
+    ),
     predictions = data.frame(
       rowId = data$rowMap %>%
        dplyr::pull(.data$oldRowId),
@@ -516,11 +360,11 @@
 pmmPredict <- function(data, k = 5, imputer) {
   data$coefficients <- imputer$coefficients
-  predictionMissing <- data$xMiss %>%
+  predictionMissing <- data$xMiss %>%
     dplyr::inner_join(data$coefficients, by = "covariateId") %>%
     dplyr::mutate(values = .data$covariateValue * .data$values) %>%
     dplyr::group_by(.data$rowId) %>%
-    dplyr::summarise(value = sum(.data$values, na.rm = TRUE)) %>%
+    dplyr::summarise(value = sum(.data$values, na.rm = TRUE)) %>%
     dplyr::select("rowId", "value")
   predictionMissing <- as.data.frame(predictionMissing)
   predictionMissing$value <- predictionMissing$value + imputer$intercept
 
   # precompute mapping to use - straight from xId (row index) to
   # covariateValue of donor
-  donorMapping <- 
imputer$predictions %>% dplyr::pull(.data$prediction) + donorMapping <- imputer$predictions %>% dplyr::pull(.data$prediction) # for each missing value, find the k closest observed values - nRows <- data$xMiss %>% dplyr::pull(.data$rowId) %>% dplyr::n_distinct() + nRows <- data$xMiss %>% + dplyr::pull(.data$rowId) %>% + dplyr::n_distinct() imputedValues <- numeric(nRows) predsObs <- imputer$predictions$prediction for (j in 1:nRows) { @@ -540,7 +386,7 @@ pmmPredict <- function(data, k = 5, imputer) { donorValues <- donorMapping[donorIndices] imputedValues[j] <- sample(donorValues, 1) } - + results <- list() results$imputedValues <- data.frame( rowId = predictionMissing %>% @@ -549,3 +395,222 @@ pmmPredict <- function(data, k = 5, imputer) { ) return(results) } + +extractMissingInfo <- function(trainData) { + total <- trainData$covariateData$covariates %>% + dplyr::summarise(total = dplyr::n_distinct(.data$rowId)) %>% + dplyr::pull() + continuousFeatures <- trainData$covariateData$analysisRef %>% + dplyr::filter(.data$isBinary == "N") %>% + + dplyr::select("analysisId") %>% + dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>% + dplyr::pull(.data$covariateId) + + missingInfo <- trainData$covariateData$covariates %>% + dplyr::filter(.data$covariateId %in% continuousFeatures) %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::summarise(counts = dplyr::n()) %>% + dplyr::collect() %>% # necessary because of integer division in sqlite + dplyr::mutate(missing = 1 - .data$counts / total) %>% + dplyr::select(c("covariateId", "missing")) + results <- list( + "missingInfo" = missingInfo, + "continuousFeatures" = continuousFeatures + ) + return(results) +} + +separateFeatures <- function(trainData, continuousFeatures) { + numericData <- Andromeda::andromeda() + numericData$covariates <- trainData$covariateData$covariates %>% + dplyr::filter(.data$covariateId %in% continuousFeatures) + numericData$covariateRef <- trainData$covariateData$covariateRef %>% + dplyr::filter(.data$covariateId %in% continuousFeatures) + + binaryData <- Andromeda::andromeda() + binaryData$covariates <- trainData$covariateData$covariates %>% + dplyr::filter(!.data$covariateId %in% !!continuousFeatures) + binaryData$covariateRef <- trainData$covariateData$covariateRef %>% + dplyr::filter(!.data$covariateId %in% !!continuousFeatures) + return(list(numericData, binaryData)) +} + +initializeImputation <- function(numericData, method = "mean") { + allRowIds <- numericData$covariates %>% + dplyr::pull(.data$rowId) %>% + unique() %>% + sort() + allColumnIds <- numericData$covariates %>% + dplyr::pull(.data$covariateId) %>% + unique() %>% + sort() + completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds) + numericData$covariates <- merge(completeIds, numericData$covariates, + all.x = TRUE + ) + + # get index of NAs for every feature to be imputed + numericData$missingIndex <- numericData$covariates %>% + dplyr::filter(is.na(.data$covariateValue)) %>% + dplyr::select(-c("covariateValue")) + + if (method == "mean") { + numericData$imputedCovariates <- numericData$covariates %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::mutate(imputedValue = ifelse(is.na(.data$covariateValue), + mean(.data$covariateValue, na.rm = TRUE), + .data$covariateValue + )) + } else { + stop("Unknown initialization method: ", method) + } + return(numericData) +} + +# Main (M)ICE algorithm - iterative imputation with chained equations +iterativeChainedImpute <- function(numericData, + binaryData, + originalData, 
+ featureEngineeringSettings, + direction = "ascending", + iterations = 5) { + prevImputations <- list() + allRowIds <- numericData$covariates %>% + dplyr::pull(.data$rowId) %>% + unique() %>% + sort() + maxIter <- iterations# TODO check + varsToImpute <- numericData$missingIndex %>% + dplyr::pull(.data$covariateId) %>% + unique() + convergenceParameters <- list() + modelInfo <- list() + + for (iter in 1:maxIter) { + ParallelLogger::logInfo("Imputation iteration: ", iter) + currentImputations <- list() + + # TODO do in order from least missing to most missing + for (varId in varsToImpute) { + varName <- originalData$covariateData$covariateRef %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::pull(.data$covariateName) + ParallelLogger::logInfo("Imputing variable: ", varName) + numericData$y <- numericData$covariates %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::mutate(y = .data$covariateValue) %>% + dplyr::select("y", "rowId") + on.exit(numericData$y <- NULL, add = TRUE) + obsIdx <- which(!is.na(numericData$y %>% dplyr::pull(.data$y))) + missIdx <- which(is.na(numericData$y %>% dplyr::pull(.data$y))) + numericData$yObs <- numericData$y %>% + dplyr::filter(.data$rowId %in% !!allRowIds[obsIdx]) + on.exit(numericData$yObs <- NULL, add = TRUE) + + numericData$X <- numericData$imputedCovariates %>% + dplyr::filter(.data$covariateId != varId) %>% + dplyr::mutate(covariateValue = .data$imputedValue) %>% + dplyr::select(-c("imputedValue")) + on.exit(numericData$X <- NULL, add = TRUE) + Andromeda::appendToTable(numericData$X, binaryData$covariates) + numericData$xObs <- numericData$X %>% dplyr::filter(.data$rowId %in% !!allRowIds[obsIdx]) + on.exit(numericData$xObs <- NULL, add = TRUE) + numericData$xMiss <- numericData$X %>% dplyr::filter(.data$rowId %in% !!allRowIds[missIdx]) + on.exit(numericData$xMiss <- NULL, add = TRUE) + + pmmResults <- pmmFit(numericData, k = 5) + + # update imputations in data + numericData$imputedValues <- pmmResults$imputedValues + on.exit(numericData$imputedValues <- NULL, add = TRUE) + numericData$imputedCovariates <- numericData$imputedCovariates %>% + dplyr::left_join(numericData$imputedValues, + by = "rowId", + suffix = c("", ".new") + ) %>% + dplyr::mutate( + imputedValue = + dplyr::if_else(.data$covariateId == varId && + !is.na(.data$imputedValue.new), + .data$imputedValue.new, + .data$imputedValue + ) + ) %>% + dplyr::select(-"imputedValue.new") + + # store current imputations for convergence check + currentImputations[[as.character(varId)]] <- pmmResults$imputedValues$imputedValue + + # store pmm info for each variable + modelInfo[[as.character(varId)]] <- pmmResults$model + } + + # save values for convergence checking afterwards + # store mean and variance of imputed values for each variable + # as well as average change from previous iteration + meanVector <- numeric(length(varsToImpute)) + varVector <- numeric(length(varsToImpute)) + idx <- 1 + for (varId in varsToImpute) { + currentImputation <- currentImputations[[as.character(varId)]] + meanVector[idx] <- mean(currentImputation) + varVector[idx] <- var(currentImputation) + idx <- idx + 1 + } + convergenceInfo <- list( + meanVector = meanVector, + varVector = varVector + ) + if (iter > 1) { + meanVarChange <- numeric(length(varsToImpute)) + for (varId in varsToImpute) { + prevImputation <- prevImputations[[as.character(varId)]] + currentImputation <- currentImputations[[as.character(varId)]] + meanVarChange <- c( + meanVarChange, + mean(abs(currentImputation - prevImputation)) + ) + 
} + convergenceInfo$meanVarChange <- meanVarChange + } + convergenceParameters[[iter]] <- convergenceInfo + + prevImputations <- currentImputations + } + + # calculate kde estimates of imputed and observed distributions per imputed variable + # and store in featureEngineeringSettings + kdeEstimates <- list() + for (varId in varsToImpute) { + varName <- originalData$covariateData$covariateRef %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::pull(.data$covariateName) + rows <- numericData$missingIndex %>% + dplyr::filter(.data$covariateId == varId) %>% + dplyr::pull(.data$rowId) + imputedValues <- numericData$imputedCovariates %>% + dplyr::filter( + .data$covariateId == varId, + .data$rowId %in% rows + ) %>% + dplyr::pull(.data$imputedValue) + observedValues <- numericData$covariates %>% + dplyr::filter( + .data$covariateId == varId, + !is.na(.data$covariateValue) + ) %>% + dplyr::pull(.data$covariateValue) + kdeEstimates[[as.character(varId)]] <- list( + imputed = density(imputedValues), + observed = density(observedValues) + ) + } + results <- list( + "numericData" = numericData, + "convergenceParameters" = convergenceParameters, + "modelInfo" = modelInfo, + "kdeEstimates" = kdeEstimates + ) + return(results) +} diff --git a/R/PreprocessingData.R b/R/PreprocessingData.R index e5ff95468..8ea17f45f 100644 --- a/R/PreprocessingData.R +++ b/R/PreprocessingData.R @@ -180,8 +180,7 @@ robustNormalize <- function(trainData, featureEngineeringSettings, normalized = # get (25, 75)% quantiles of each feature # sqlite (used by Andromeda) doesn't have quantile function, so we need to load the extension # to get upper_quartile and lower_quartile_functions - con <- trainData$covariateData$covariates %>% dbplyr::remote_con() - RSQLite::initExtension(con) + RSQLite::initExtension(trainData$covariateData, "math") trainData$covariateData$quantiles <- trainData$covariateData$covariates %>% dplyr::filter(.data$covariateId %in% continousFeatures) %>% @@ -285,6 +284,8 @@ removeRareFeatures <- function(trainData, featureEngineeringSettings, findRare = trainData$covariateData$covariates <- trainData$covariateData$covariates %>% dplyr::filter(!.data$covariateId %in% rareFeatures) + trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% + dplyr::filter(!.data$covariateId %in% rareFeatures) attr(featureEngineeringSettings, "rareFeatures") <- rareFeatures @@ -294,6 +295,10 @@ removeRareFeatures <- function(trainData, featureEngineeringSettings, findRare = dplyr::filter( !.data$covariateId %in% !!attr(featureEngineeringSettings, "rareFeatures") ) + trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% + dplyr::filter( + !.data$covariateId %in% !!attr(featureEngineeringSettings, "rareFeatures") + ) } featureEngineering <- list( funct = "removeRareFeatures", @@ -302,7 +307,7 @@ removeRareFeatures <- function(trainData, featureEngineeringSettings, findRare = findRare = findRare ) ) - attr(trainData, "metaData")$featureEngineering[['removeRare']] <- + attr(trainData, "metaData")$featureEngineering[["removeRare"]] <- featureEngineering return(trainData) } @@ -313,6 +318,9 @@ removeRareFeatures <- function(trainData, featureEngineeringSettings, findRare = #' @return An object of class \code{featureEngineeringSettings} #' @export createRareFeatureRemover <- function(ratio = 0.001) { + checkIsClass(ratio, c("numeric")) + checkHigherEqual(ratio, 0) + checkLower(ratio, 1) featureEngineeringSettings <- list( ratio = ratio ) From 
43874329ce0742526806f67ef97a9412525f1482 Mon Sep 17 00:00:00 2001
From: egillax
Date: Thu, 12 Dec 2024 14:55:24 +0100
Subject: [PATCH 8/8] add simple imputer for mean and median imputation

---
 NAMESPACE                  |   1 +
 R/Imputation.R             | 199 +++++++++++++++++++++++++++++++++----
 R/PreprocessingData.R      |   2 +-
 man/createSimpleImputer.Rd |  20 ++++
 man/simpleImpute.Rd        |  21 ++++
 5 files changed, 222 insertions(+), 21 deletions(-)
 create mode 100644 man/createSimpleImputer.Rd
 create mode 100644 man/simpleImpute.Rd

diff --git a/NAMESPACE b/NAMESPACE
index a3a8f7e7a..173f8d291 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -36,6 +36,7 @@ export(createRandomForestFeatureSelection)
 export(createRareFeatureRemover)
 export(createRestrictPlpDataSettings)
 export(createSampleSettings)
+export(createSimpleImputer)
 export(createSplineSettings)
 export(createStratifiedImputationSettings)
 export(createStudyPopulation)
diff --git a/R/Imputation.R b/R/Imputation.R
index 227526c1b..43c58da41 100644
--- a/R/Imputation.R
+++ b/R/Imputation.R
@@ -15,6 +15,185 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+#' @title Create Iterative Imputer settings
+#' @description This function creates the settings for an iterative imputer
+#' which first removes features with more than `missingThreshold` missing values
+#' and then imputes the missing values iteratively using chained equations
+#' @param missingThreshold The threshold for missing values to remove a feature
+#' @param method The method to use for imputation, currently only "pmm" is supported
+#' @return The settings for the single imputer of class `featureEngineeringSettings`
+#' @export
+createIterativeImputer <- function(missingThreshold = 0.3,
+                                   method = "pmm") {
+  featureEngineeringSettings <- list(
+    missingThreshold = missingThreshold,
+    method = method
+  )
+  if (method == "pmm") {
+    # at the moment this requires glmnet
+    rlang::check_installed("glmnet")
+  }
+  attr(featureEngineeringSettings, "fun") <- "iterativeImpute"
+
+  class(featureEngineeringSettings) <- "featureEngineeringSettings"
+  return(featureEngineeringSettings)
+}
+
+#' @title Create Simple Imputer settings
+#' @description This function creates the settings for a simple imputer
+#' which imputes missing values with the mean or median
+#' @param method The method to use for imputation, either "mean" or "median"
+#' @param missingThreshold The threshold of missingness above which a feature is removed
+#' @return The settings for the simple imputer of class `featureEngineeringSettings`
+#' @export
+createSimpleImputer <- function(method = "mean",
+                                missingThreshold = 0.3) {
+  checkIsClass(method, "character")
+  checkInStringVector(method, c("mean", "median"))
+  featureEngineeringSettings <- list(
+    method = method,
+    missingThreshold = missingThreshold
+  )
+  attr(featureEngineeringSettings, "fun") <- "simpleImpute"
+
+  class(featureEngineeringSettings) <- "featureEngineeringSettings"
+  return(featureEngineeringSettings)
+}
+
+#' @title Simple Imputation
+#' @description This function does single imputation with the mean or median
+#' @param trainData The data to be imputed
+#' @param featureEngineeringSettings The settings for the imputation
+#' @param done Whether the imputation has already been done (bool)
+#' @return The imputed data
+simpleImpute <- function(trainData, featureEngineeringSettings, done = FALSE) {
+  if (!done) {
+    missingInfo <- extractMissingInfo(trainData)
+    trainData$covariateData$missingInfo <- missingInfo$missingInfo
+    continuousFeatures <- missingInfo$continuousFeatures
+    
on.exit(trainData$covariateData$missingInfo <- NULL, add = TRUE) + + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(trainData$covariateData$missingInfo, by = "covariateId") %>% + dplyr::filter(is.na(.data$missing) || + .data$missing <= featureEngineeringSettings$missingThreshold) %>% + dplyr::select(-"missing") + + # separate the continuous and binary features + featureData <- separateFeatures(trainData, continuousFeatures) + numericData <- featureData[[1]] + on.exit(numericData <- NULL, add = TRUE) + + allRowIds <- numericData$covariates %>% + dplyr::pull(.data$rowId) %>% + unique() %>% + sort() + allColumnIds <- numericData$covariates %>% + dplyr::pull(.data$covariateId) %>% + unique() %>% + sort() + completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds) + numericData$covariates <- merge(completeIds, numericData$covariates, + all.x = TRUE + ) + + if (featureEngineeringSettings$method == "mean") { + numericData$imputedValues <- numericData$covariates %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::summarise(imputedValues = mean(.data$covariateValue, na.rm = TRUE)) + } else if (featureEngineeringSettings$method == "median") { + numericData$imputedValues <- numericData$covariates %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::collect() %>% # median not possible in sql + dplyr::summarise(imputedValues = median(.data$covariateValue, na.rm = TRUE)) + } + + + numericData$imputedCovariates <- numericData$covariates %>% + dplyr::left_join(numericData$imputedValues, by = "covariateId") %>% + dplyr::group_by(.data$covariateId) %>% + dplyr::mutate(imputedValue = ifelse(is.na(.data$covariateValue), + .data$imputedValues, + .data$covariateValue + )) %>% + dplyr::select(-c("imputedValues")) + Andromeda::appendToTable( + trainData$covariateData$covariates, + numericData$imputedCovariates %>% + dplyr::filter(is.na(.data$covariateValue)) %>% + dplyr::mutate(covariateValue = .data$imputedValue) %>% + dplyr::select(-c("imputedValue")) + ) + attr(featureEngineeringSettings, "missingInfo") <- + trainData$covariateData$missingInfo %>% + dplyr::collect() + attr(featureEngineeringSettings, "imputer") <- + numericData$imputedValues %>% dplyr::collect() + done <- TRUE + } else { + trainData$covariateData$missingInfo <- attr( + featureEngineeringSettings, + "missingInfo" + ) + on.exit(trainData$covariateData$missingInfo <- NULL, add = TRUE) + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::left_join(trainData$covariateData$missingInfo, by = "covariateId") %>% + dplyr::filter(is.na(.data$missing) || + .data$missing <= featureEngineeringSettings$missingThreshold) %>% + dplyr::select(-"missing") + + continuousFeatures <- trainData$covariateData$analysisRef %>% + dplyr::filter(.data$isBinary == "N") %>% + dplyr::select("analysisId") %>% + dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>% + dplyr::pull(.data$covariateId) + featureData <- separateFeatures(trainData, continuousFeatures) + numericData <- featureData[[1]] + on.exit(numericData <- NULL, add = TRUE) + # impute missing values + allRowIds <- numericData$covariates %>% + dplyr::pull(.data$rowId) %>% + unique() %>% + sort() + allColumnIds <- numericData$covariates %>% + dplyr::pull(.data$covariateId) %>% + unique() %>% + sort() + completeIds <- expand.grid(rowId = allRowIds, covariateId = allColumnIds) + numericData$covariates <- merge(completeIds, numericData$covariates, + all.x = TRUE + ) + numericData$imputedValues 
<- attr(featureEngineeringSettings, "imputer")
+    numericData$imputedCovariates <- numericData$covariates %>%
+      dplyr::left_join(numericData$imputedValues, by = "covariateId") %>%
+      dplyr::group_by(.data$covariateId) %>%
+      dplyr::mutate(imputedValue = ifelse(is.na(.data$covariateValue),
+        .data$imputedValues,
+        .data$covariateValue
+      )) %>%
+      dplyr::select(-c("imputedValues"))
+    Andromeda::appendToTable(
+      trainData$covariateData$covariates,
+      numericData$imputedCovariates %>%
+        dplyr::filter(is.na(.data$covariateValue)) %>%
+        dplyr::mutate(covariateValue = .data$imputedValue) %>%
+        dplyr::select(-c("imputedValue"))
+    )
+
+  }
+  featureEngineering <- list(
+    funct = "simpleImpute",
+    settings = list(
+      featureEngineeringSettings = featureEngineeringSettings,
+      done = done
+    )
+  )
+  attr(trainData, "metaData")$featureEngineering[["simpleImputer"]] <-
+    featureEngineering
+  return(trainData)
+}
+
+
 #' @title Imputation
 #' @description This function does single imputation with predictive mean matching
 #' @param trainData The data to be imputed
@@ -194,26 +372,6 @@ iterativeImpute <- function(trainData, featureEngineeringSettings, done = FALSE)
   return(trainData)
 }
 
-#' @title Create Iterative Imputer settings
-#' @description This function creates the settings for an iterative imputer
-#' which first removes features with more than `missingThreshold` missing values
-#' and then imputes the missing values iteratively using chained equations
-#' @param missingThreshold The threshold for missing values to remove a feature
-#' @param method The method to use for imputation, currently only "pmm" is supported
-#' @return The settings for the single imputer of class `featureEngineeringSettings`
-#' @export
-createIterativeImputer <- function(missingThreshold = 0.3,
-                                   method = "pmm") {
-  featureEngineeringSettings <- list(
-    missingThreshold = missingThreshold,
-    method = method
-  )
-  attr(featureEngineeringSettings, "fun") <- "iterativeImpute"
-
-  class(featureEngineeringSettings) <- "featureEngineeringSettings"
-  return(featureEngineeringSettings)
-}
-
 #' @title Predictive mean matching using lasso
 #' @param data An andromeda object with the following fields:
 #' xObs: covariates table for observed data
diff --git a/R/PreprocessingData.R b/R/PreprocessingData.R
index 8ea17f45f..f7e53576e 100644
--- a/R/PreprocessingData.R
+++ b/R/PreprocessingData.R
@@ -239,13 +239,13 @@ robustNormalize <- function(trainData, featureEngineeringSettings, normalized =
       )
     )
 
-  attr(trainData, "metaData")$featureEngineering[['robustNormalize']] <-
+  attr(trainData, "metaData")$featureEngineering[["robustNormalize"]] <-
     featureEngineering
   return(trainData)
 }
 
 #' Create the settings for normalizing the data
 #' @param type The type of normalization to use, either "minmax" or "robust"
 #' @return An object of class \code{featureEngineeringSettings}
 #' @export
 createNormalization <- function(type = "minmax") {
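Since createNormalization is touched just above, a brief usage sketch may help. The two settings constructors are real exports from this patch series, but combining them into a list of feature engineering steps is an assumption about the surrounding pipeline rather than something this diff shows:

    # hypothetical wiring: remove very rare features, then normalize robustly
    rareRemover <- createRareFeatureRemover(ratio = 0.001)
    normSettings <- createNormalization(type = "robust")
    featureEngineeringSettings <- list(rareRemover, normSettings)

The "robust" type can be preferable for heavy-tailed values, since min-max scaling lets a single training outlier compress every other value towards zero.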
diff --git a/man/createSimpleImputer.Rd b/man/createSimpleImputer.Rd
new file mode 100644
index 000000000..46b2ef5c4
--- /dev/null
+++ b/man/createSimpleImputer.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/Imputation.R
+\name{createSimpleImputer}
+\alias{createSimpleImputer}
+\title{Create Simple Imputer settings}
+\usage{
+createSimpleImputer(method = "mean", missingThreshold = 0.3)
+}
+\arguments{
+\item{method}{The method to use for imputation, either "mean" or "median"}
+
+\item{missingThreshold}{The threshold of missingness above which a feature is removed}
+}
+\value{
+The settings for the simple imputer of class `featureEngineeringSettings`
+}
+\description{
+This function creates the settings for a simple imputer
+which imputes missing values with the mean or median
+}
diff --git a/man/simpleImpute.Rd b/man/simpleImpute.Rd
new file mode 100644
index 000000000..619cb6bf2
--- /dev/null
+++ b/man/simpleImpute.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/Imputation.R
+\name{simpleImpute}
+\alias{simpleImpute}
+\title{Simple Imputation}
+\usage{
+simpleImpute(trainData, featureEngineeringSettings, done = FALSE)
+}
+\arguments{
+\item{trainData}{The data to be imputed}
+
+\item{featureEngineeringSettings}{The settings for the imputation}
+
+\item{done}{Whether the imputation has already been done (bool)}
+}
+\value{
+The imputed data
+}
+\description{
+This function does single imputation with the mean or median
+}
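Taken together, the two imputers added in this series follow the same fit/transform pattern: called with done = FALSE they learn the imputation values (or PMM models) from the training data and store them on the settings object and in the trainData metaData, and called with done = TRUE they reuse that stored state. A hedged usage sketch follows; the attribute path mirrors the featureEngineering metaData structure visible in the patch, but the exact call sequence around splitData is an assumption:

    # simple baseline: median imputation after dropping features
    # with more than 30% missingness
    imputerSettings <- createSimpleImputer(method = "median", missingThreshold = 0.3)
    trainData <- simpleImpute(trainData, imputerSettings, done = FALSE)

    # at prediction time, reuse the fitted state recorded during training
    fitted <- attr(trainData, "metaData")$featureEngineering$simpleImputer$settings
    testData <- simpleImpute(testData, fitted$featureEngineeringSettings, done = TRUE)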