From eca8bcef07b514182e7edcef7dfe8396b5d1b224 Mon Sep 17 00:00:00 2001 From: Dario Strbenac Date: Sat, 21 Dec 2024 14:00:03 +1100 Subject: [PATCH] - Features used can be extracted from penalised Cox model. - Final model can be used with predict function. - prepareData can filter correlated features. - easyHard helps to associate variables with per-sample prediction accuracy. --- DESCRIPTION | 8 ++-- NAMESPACE | 4 ++ R/calcPerformance.R | 96 ++++++++++++++++++++++++++++++++++++++- R/crissCrossValidate.R | 6 +-- R/crossValidate.R | 15 ++++-- R/interfaceCoxnet.R | 4 +- R/interfacePenalisedGLM.R | 16 +++++-- R/precisionPathways.R | 2 +- R/prepareData.R | 49 +++++++++++++------- R/simpleParams.R | 2 +- man/calcPerformance.Rd | 24 +++++++++- man/prepareData.Rd | 16 +++---- 12 files changed, 193 insertions(+), 49 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b69db13..9a3dcc6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,8 +3,8 @@ Type: Package Title: A framework for cross-validated classification problems, with applications to differential variability and differential distribution testing -Version: 3.10.4 -Date: 2024-12-13 +Version: 3.10.5 +Date: 2024-12-20 Authors@R: c( person(given = "Dario", family = "Strbenac", email = "dario.strbenac@sydney.edu.au", role = c("aut", "cre")), @@ -20,9 +20,9 @@ VignetteBuilder: knitr Encoding: UTF-8 biocViews: Classification, Survival Depends: R (>= 4.1.0), generics, methods, S4Vectors, MultiAssayExperiment, BiocParallel, survival -Imports: grid, genefilter, utils, dplyr, tidyr, rlang, ranger, ggplot2 (>= 3.0.0), ggpubr, reshape2, ggupset +Imports: grid, genefilter, utils, dplyr, tidyr, rlang, ranger, ggplot2 (>= 3.0.0), ggpubr, reshape2, ggupset, broom, dcanr Suggests: limma, edgeR, car, Rmixmod, gridExtra (>= 2.0.0), cowplot, - BiocStyle, pamr, PoiClaClu, parathyroidSE, knitr, htmltools, gtable, + BiocStyle, pamr, PoiClaClu, knitr, htmltools, gtable, scales, e1071, rmarkdown, IRanges, robustbase, glmnet, class, randomForestSRC, MatrixModels, xgboost, data.tree, ggnewscale Description: The software formalises a framework for classification and survival model evaluation diff --git a/NAMESPACE b/NAMESPACE index 36ec951..11dbc00 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,6 +33,7 @@ export(crissCrossPlot) export(crissCrossValidate) export(crossValidate) export(distribution) +export(easyHard) export(edgesToHubNetworks) export(featureSetSummary) export(finalModel) @@ -80,6 +81,7 @@ exportMethods(calcExternalPerformance) exportMethods(chosenFeatureNames) exportMethods(crossValidate) exportMethods(distribution) +exportMethods(easyHard) exportMethods(featureSetSummary) exportMethods(finalModel) exportMethods(interactorDifferences) @@ -111,6 +113,8 @@ import(reshape2) importFrom(S4Vectors,as.data.frame) importFrom(S4Vectors,do.call) importFrom(S4Vectors,mcols) +importFrom(broom,tidy) +importFrom(dcanr,cor.pairs) importFrom(dplyr,mutate) importFrom(dplyr,n) importFrom(generics,train) diff --git a/R/calcPerformance.R b/R/calcPerformance.R index 42db346..28c2a47 100755 --- a/R/calcPerformance.R +++ b/R/calcPerformance.R @@ -32,7 +32,8 @@ #' @param result An object of class \code{\link{ClassifyResult}}. #' @param resultsList A list of modelling results. Each element must be of type \code{\link{ClassifyResult}}. #' @param performanceTypes Default: \code{"auto"} A character vector. If \code{"auto"}, Balanced Accuracy will be used -#' for a classification task and C-index for a time-to-event task. 
+#' for a classification task and C-index for a time-to-event task. If using \code{easyHard}, the default is
+#' \code{"Sample Accuracy"} for a classification task and \code{"Sample C-index"} for a time-to-event task.
 #' Must be one of the following options:
 #' \itemize{
 #' \item{\code{"Error"}: Ordinary error rate.}
@@ -130,6 +131,11 @@ setMethod("calcExternalPerformance", c("factor", "tabular"), # table has class p
     )
 })
 
+calcCVperformance <- function(results, ...)
+{
+  lapply(results, calcCVperformance, ...)
+}
+
 #' @rdname calcPerformance
 #' @usage NULL
 #' @export
@@ -427,4 +433,90 @@ performanceTable <- function(resultsList, performanceTypes = "auto", aggregate =
     })
     DataFrame(tidyr::pivot_wider(as.data.frame(result@characteristics), names_from = characteristic, values_from = value), performances, check.names = FALSE)
   }))
-}
\ No newline at end of file
+}
+
+#' @rdname calcPerformance
+#' @usage NULL
+#' @export
+setGeneric("easyHard", function(measurements, result, assay, performanceType, ...)
+           standardGeneric("easyHard"))
+
+#' @rdname calcPerformance
+#' @exportMethod easyHard
+#' @importFrom broom tidy
+#' @param assay For \code{easyHard} only. The assay in which to look for associations with the per-sample performance metric.
+#' @param useFeatures For \code{easyHard} only. Default: \code{NULL} (i.e. use all provided features). A vector of features of the specified assay to consider.
+#' This allows for the avoidance of variables such as spike-in RNAs, sample IDs, sample acquisition dates, etc. which are not relevant for outcome prediction.
+#' @param fitMode For \code{easyHard} only. Default: \code{"single"}. Either \code{"single"} or \code{"full"}. If \code{"single"},
+#' an ordinary GLM is fitted for each covariate separately. If \code{"full"}, elastic net is used to automatically tune the non-zero model coefficients.
+#' @return For \code{easyHard}, a \code{\link{DataFrame}} of logistic regression model summaries.
+
+setMethod("easyHard", "MultiAssayExperimentOrList",
+          function(measurements, result, assay = "clinical", useFeatures = NULL, performanceType = "auto",
+                   fitMode = c("single", "full"))
+{
+    if(!requireNamespace("glmnet", quietly = TRUE))
+      stop("The package 'glmnet' could not be found. Please install it.")
+
+    if(!assay %in% c(names(measurements), "clinical")) stop("'assay' is not one of the names of 'measurements'.")
+    fitMode <- match.arg(fitMode)
+
+    if(is(measurements, "MultiAssayExperiment"))
+    {
+      if(assay == "clinical")
+        assay <- colData(measurements)
+      else assay <- t(measurements[, , assay]) # Ensure that features are in columns.
+    } else {assay <- measurements[[assay]]}
+    if(!is.null(useFeatures)) assay <- assay[, useFeatures]
+    if(performanceType == "auto")
+    {
+      if("risk" %in% colnames(predictions(result)))
+      {
+        performanceType <- "Sample C-index"
+      } else {performanceType <- "Sample Accuracy"}
+    }
+    if(!performanceType %in% names(performance(result)))
+    {
+      warning(paste(performanceType, "not found in result. Calculating it now."))
+      result <- calcCVperformance(result, performanceType)
+    }
+    samplePerformance <- performance(result)[[performanceType]]
+    if(any(is.na(samplePerformance)))
+    {
+      keep <- !is.na(samplePerformance)
+      assay <- assay[keep, ]
+      samplePerformance <- samplePerformance[keep]
+    }
+    assay <- assay[names(samplePerformance), ] # Just in case.
+    assayOHE <- MatrixModels::model.Matrix(~ 0 + ., data = assay)
+
+    if(fitMode == "single")
+    {
+      as(do.call(rbind, lapply(colnames(assay), function(featureID)
+      {
+        covariate <- assay[, featureID]
+        fitted <- glm(samplePerformance ~ covariate, family = binomial, weights = rep(100, length(samplePerformance)))
+        summaryDF <- broom::tidy(fitted)
+        if(is.factor(covariate))
+        {
+          summaryDF[2:nrow(summaryDF), "term"] <- paste(featureID, levels(covariate)[2:length(levels(covariate))], sep = ": ")
+
+        } else {summaryDF[, "term"] <- featureID}
+        summaryDF[2:nrow(summaryDF), ]
+      })), "DataFrame")
+    } else { # Penalised regression.
+      samplePerformanceM <- matrix(c(1 - samplePerformance, samplePerformance), ncol = 2)
+      fitted <- glmnet::glmnet(assayOHE, samplePerformanceM, family = "binomial")
+      lambdaConsider <- colSums(as.matrix(fitted[["beta"]])) != 0
+      bestLambda <- fitted[["lambda"]][lambdaConsider][which.min(sapply(fitted[["lambda"]][lambdaConsider], function(lambda) # Largest lambda with the minimum total absolute error.
+      {
+        predictions <- predict(fitted, assayOHE, s = lambda, type = "response")
+        sum(abs(samplePerformanceM[, 2] - predictions))
+      }))[1]]
+      useVariables <- abs(fitted[["beta"]][, fitted[["lambda"]] == bestLambda]) > 0.00001
+      useVariables <- colnames(assay)[unique(assayOHE@assign[useVariables])]
+      dataForModel <- data.frame(assay[, useVariables, drop = FALSE], performance = samplePerformanceM[, 2])
+      fitted <- glm(performance ~ . + 0, data = dataForModel, family = binomial(), weights = rep(100, nrow(dataForModel)))
+      broom::tidy(fitted)
+    }
+})
\ No newline at end of file
diff --git a/R/crissCrossValidate.R b/R/crissCrossValidate.R
index 301ae31..f2d872c 100644
--- a/R/crissCrossValidate.R
+++ b/R/crissCrossValidate.R
@@ -190,7 +190,7 @@ crissCrossPlot <- function(crissCrossResult, includeValues = FALSE){
 
     ggheatmap <- ggplot(melted_cormat, aes(Var1, Var2, fill = value)) +
       geom_tile(color = "white") +
-      scale_fill_gradient2(high = "red", mid = "white", low = "blue",
+      scale_fill_gradient2(high = "#e25563ff", mid = "white", low = "#094bacff",
         midpoint = 0.5, limit = c(0,1), space = "Lab",
         name=as.character(scalebar_title)) +
       theme_bw() + xlab("Training Dataset") + ylab("Testing Dataset") +
@@ -205,7 +205,7 @@ crissCrossPlot <- function(crissCrossResult, includeValues = FALSE){
       melted_cormat_1 <- melt(real, na.rm = TRUE)
       ggheatmap_1 <- ggplot(melted_cormat_1, aes(Var1, Var2, fill = value)) +
         geom_tile(color = "white") +
-        scale_fill_gradient2(high = "red", mid = "white", low = "blue",
+        scale_fill_gradient2(high = "#e25563ff", mid = "white", low = "#094bacff",
           midpoint = 0.5, limit = c(0,1), space = "Lab",
           name=as.character(scalebar_title)) +
         theme_bw() + xlab("Features Extracted") + ylab("Dataset Tested") +
@@ -218,7 +218,7 @@ crissCrossPlot <- function(crissCrossResult, includeValues = FALSE){
       melted_cormat_2 <- melt(random, na.rm = TRUE)
       ggheatmap_2 <- ggplot(melted_cormat_2, aes(Var1, Var2, fill = value)) +
         geom_tile(color = "white") +
-        scale_fill_gradient2(high = "red", mid = "white", low = "blue",
+        scale_fill_gradient2(high = "#e25563ff", mid = "white", low = "#094bacff",
           midpoint = 0.5, limit = c(0,1), space = "Lab",
           name=as.character(scalebar_title)) +
         theme_bw() + xlab("Features Extracted") + ylab("Dataset Tested") +
diff --git a/R/crossValidate.R b/R/crossValidate.R
index 9a4f542..38638de 100644
--- a/R/crossValidate.R
+++ b/R/crossValidate.R
@@ -747,6 +747,8 @@ CV <- function(measurements, outcome, x, outcomeTrain, measurementsTest, outcome
         if(is.character(classifyResults)) stop(classifyResults)
 fullResult <- runTest(measurements, outcome, measurements, outcome, crossValParams = crossValParams, modellingParams = modellingParams, characteristics = characteristics, .iteration = 1)
         classifyResults@finalModel <- fullResult$models
+        class(classifyResults@finalModel) <- c("trainedByClassifyR", class(classifyResults@finalModel))
+        attr(classifyResults@finalModel, "predictFunction") <- modellingParams@trainParams@classifier
     }
 
     classifyResults
@@ -797,7 +799,7 @@ train.DataFrame <- function(x, outcomeTrain, selectionMethod = "auto", nFeatures
   if(isTuneCross && !extraParams[["tuneCross"]][["performanceType"]] %in% c("auto", .ClassifyRenvir[["performanceTypes"]]))
     stop(paste("performanceType for tuning must be one of", paste(c("auto", .ClassifyRenvir[["performanceTypes"]]), collapse = ", "), "but is", extraParams[["tuneCross"]][["performanceType"]]))
 
-  isCategorical <- is.character(outcome) && (length(outcome) == 1 || length(outcome) == nrow(measurements)) || is.factor(outcome)
+  isCategorical <- is.character(outcomeTrain) && (length(outcomeTrain) == 1 || length(outcomeTrain) == nrow(measurements)) || is.factor(outcomeTrain)
   if(isTuneCross && extraParams[["tuneCross"]][["performanceType"]] == "auto")
     if(isCategorical) extraParams[["tuneCross"]][["performanceType"]] <- "Balanced Accuracy" else extraParams[["tuneCross"]][["performanceType"]] <- "C-index"
   if(length(selectionMethod) == 1 && selectionMethod == "auto")
@@ -838,7 +840,7 @@ train.DataFrame <- function(x, outcomeTrain, selectionMethod = "auto", nFeatures
                   measurementsUse <- measurements
               }
 
-              classifierParams <- .classifierKeywordToParams(keyword = classifierForAssay)
+              classifierParams <- .classifierKeywordToParams(classifierForAssay, extraParams[["train"]][["tuneParams"]])
               if(!is.null(extraParams) && "train" %in% names(extraParams))
               {
                 for(paramIndex in seq_along(extraParams[["train"]]))
@@ -922,6 +924,7 @@ train.DataFrame <- function(x, outcomeTrain, selectionMethod = "auto", nFeatures
     if(multiViewMethod == "merge"){
       measurementsUse <- measurements[, S4Vectors::mcols(measurements)[["assay"]] %in% assayIDs, drop = FALSE]
       model <- .doTrain(measurementsUse, outcomeTrain, NULL, NULL, crossValParams, modellingParams, verbose = verbose)[["model"]]
+      attr(model, "predictFunction") <- modellingParams@trainParams@classifier
       class(model) <- c("trainedByClassifyR", class(model))
     }
 
@@ -948,6 +951,7 @@ train.DataFrame <- function(x, outcomeTrain, selectionMethod = "auto", nFeatures
                                             getFeatures = prevalFeatures), predictParams = PredictParams(prevalPredictInterface, characteristics = paramsAssays$clinical@predictParams@characteristics))
       model <- .doTrain(measurementsUse, outcomeTrain, NULL, NULL, crossValParams, modellingParams, verbose = verbose)[["model"]]
+      attr(model, "predictFunction") <- modellingParams@trainParams@classifier
       class(model) <- c("trainedByClassifyR", class(model))
     }
 
@@ -965,6 +969,7 @@ train.DataFrame <- function(x, outcomeTrain, selectionMethod = "auto", nFeatures
                                             getFeatures = PCAfeatures), predictParams = PredictParams(pcaPredictInterface, characteristics = paramsClinical$clinical@predictParams@characteristics))
     model <- .doTrain(measurementsUse, outcomeTrain, NULL, NULL, crossValParams, modellingParams, verbose = verbose)[["model"]]
+    attr(model, "predictFunction") <- modellingParams@trainParams@classifier
     class(model) <- c("trainedByClassifyR", class(model))
   }
   if(missing(models) || is.null(models)) return(model) else return(models)
@@ -1050,12 +1055,12 @@ predict.trainedByClassifyR <- function(object, newData, outcome, ...)
 }, newData, names(newData))
     newData <- do.call(cbind, newData)
   } else if(is(newData, "MultiAssayExperiment"))
-  {
-    newData <- prepareData(newData, outcome)
+  {
+    newData <- prepareData(newData, outcome)[["measurements"]]
   }
 
   predictFunctionUse <- attr(object, "predictFunction")
-  class(object) <- rev(class(object)) # Now want the predict method of the specific model to be picked, so put model class first.
+  class(object) <- class(object)[-1] # Remove "trainedByClassifyR" so that the specific model's class is first and its own predict method is dispatched.
   if (is(object, "listOfModels")) 
     mapply(function(model, assay) predictFunctionUse(model, assay), object, newData, MoreArgs = list(...), SIMPLIFY = FALSE)
   else do.call(predictFunctionUse, list(object, newData, ...)) # Object is itself a trained model and it is assumed that a predict method is defined for it.
diff --git a/R/interfaceCoxnet.R b/R/interfaceCoxnet.R
index c23d18b..31c8091 100644
--- a/R/interfaceCoxnet.R
+++ b/R/interfaceCoxnet.R
@@ -8,7 +8,7 @@ coxnetTrainInterface <- function(measurementsTrain, survivalTrain, lambda = NULL
   message(Sys.time(), ": Fitting coxnet model to data.")
   measurementsTrain <- data.frame(measurementsTrain, check.names = FALSE)
 
-  measurementsMatrix <- glmnet::makeX(as(measurementsTrain, "data.frame"))
+  measurementsMatrix <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTrain)
 
   # The response variable is a Surv class of object.
   fit <- glmnet::cv.glmnet(measurementsMatrix, survivalTrain, family = "cox", type = "C", ...)
@@ -16,6 +16,8 @@ coxnetTrainInterface <- function(measurementsTrain, survivalTrain, lambda = NULL
 
   offset <- -mean(predict(fitted, measurementsMatrix, s = fit$lambda.min, type = "link"))
   attr(fitted, "tune") <- list(lambda = fit$lambda.min, offset = offset)
+  attr(fitted, "featureNames") <- colnames(measurementsMatrix)
+  attr(fitted, "featureGroups") <- measurementsMatrix@assign
   class(fitted) <- class(fitted)[1] # Get rid of glmnet which messes with dispatch.
 
   fitted
diff --git a/R/interfacePenalisedGLM.R b/R/interfacePenalisedGLM.R
index 99548ec..bcf4f5a 100644
--- a/R/interfacePenalisedGLM.R
+++ b/R/interfacePenalisedGLM.R
@@ -32,9 +32,10 @@ attr(penalisedGLMtrainInterface, "name") <- "penalisedGLMtrainInterface"
 
 # model is of class multnet
 penalisedGLMpredictInterface <- function(model, measurementsTest, lambda, ..., returnType = c("both", "class", "score"), verbose = 3)
-{ # ... just consumes emitted tuning variables from .doTrain which are unused.
+{
+  # ... just consumes emitted tuning variables from .doTrain which are unused.
   returnType <- match.arg(returnType)
-  # One-hot encoding needed.
+  # One-hot encoding needed.
   measurementsTest <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTest)
 
   # Ensure that testing data has same columns names in same order as training data.
@@ -51,7 +52,7 @@ penalisedGLMpredictInterface <- function(model, measurementsTest, lambda, ..., r
 
   measurementsTest <- measurementsTest[, rownames(model[["beta"]][[1]])]
-  
+
   classPredictions <- factor(as.character(predict(model, measurementsTest, s = lambda, type = "class")), levels = model[["classnames"]])
   classScores <- predict(model, measurementsTest, s = lambda, type = "response")[, , 1]
@@ -78,8 +79,13 @@ penalisedFeatures <- function(model)
 {
   # Floating point numbers test for equality.
 whichCoefficientColumn <- which(abs(model[["lambda"]] - attr(model, "tune")[["lambda"]]) < 0.00001)[1]
-  coefficientsUsed <- sapply(model[["beta"]], function(classCoefficients) classCoefficients[, whichCoefficientColumn])
-  featureScores <- rowSums(abs(coefficientsUsed))
+  if(is.list(model[["beta"]])) # Categorical data
+  {
+    coefficientsUsed <- sapply(model[["beta"]], function(classCoefficients) classCoefficients[, whichCoefficientColumn])
+    featureScores <- rowSums(abs(coefficientsUsed))
+  } else { # Survival data
+    featureScores <- model[["beta"]][, whichCoefficientColumn]
+  }
   featureGroups <- attr(model, "featureGroups")[match(names(featureScores), attr(model, "featureNames"))]
   groupScores <- unname(by(featureScores, featureGroups, max))
   rankedFeaturesIndices <- order(groupScores, decreasing = TRUE)
diff --git a/R/precisionPathways.R b/R/precisionPathways.R
index 1381ad4..8c5e016 100644
--- a/R/precisionPathways.R
+++ b/R/precisionPathways.R
@@ -345,7 +345,7 @@ bubblePlot.PrecisionPathways <- function(precisionPathways, pathwayColours = NUL
   performance <- precisionPathways[["performance"]]
   performance <- cbind(Sequence = rownames(performance), performance)
   ggplot2::ggplot(performance, aes(x = accuracy, y = cost, colour = Sequence, size = 4)) + ggplot2::geom_point() +
-    ggplot2::scale_color_manual(values = pathwayColours) + ggplot2::labs(x = "Balanced Accuracy", y = "Total Cost") + ggplot2::guides(size = FALSE)
+    ggplot2::scale_color_manual(values = pathwayColours) + ggplot2::labs(x = "Balanced Accuracy", y = "Total Cost") + ggplot2::guides(size = "none")
 }
 
 #' @export
diff --git a/R/prepareData.R b/R/prepareData.R
index 26a9d65..9091851 100644
--- a/R/prepareData.R
+++ b/R/prepareData.R
@@ -15,7 +15,6 @@
 #' a character string, or vector of such strings, containing column name(s) of column(s)
 #' containing either classes or time and event information about survival. If column names
 #' of survival information, time must be in first column and event status in the second.
-#' @param group Not user-specified. Only passed by \code{runTests} for subsetting.
 #' @param outcomeColumns If \code{measurements} is a \code{MultiAssayExperiment}, the
 #' names of the column (class) or columns (survival) in the table extracted by \code{colData(data)}
 #' that contain(s) the each individual's outcome to use for prediction.
@@ -25,6 +24,9 @@
 #' This allows for the avoidance of variables such spike-in RNAs, sample IDs, sample acquisition dates, etc. which are not relevant for outcome prediction.
 #' @param maxMissingProp Default: 0.0. A proportion less than 1 which is the maximum
 #' tolerated proportion of missingness for a feature to be retained for modelling.
+#' @param maxSimilarity Default: 1 (i.e. no filtering). A number between 0 and 1 which is the maximum similarity permitted between a pair of
+#' variables for both of them to be kept in the data set. A pair of numeric variables is too similar if its Pearson correlation exceeds this value; a pair of
+#' categorical variables is too similar if the p-value of Fisher's exact test of association between them is below this value. The second variable of such a pair is excluded from the data set.
 #' @param topNvariance Default: NULL. If \code{measurements} is a \code{MultiAssayExperiment} or list of tabular data, a named integer vector of most variable
 #' features per assay to subset to. If the input data is a single table, then simply a single integer.
 #' If an assays has less features, it won't be reduced in size but stay as-is.
@@ -56,10 +58,11 @@
     prepareData(S4Vectors::DataFrame(measurements, check.names = FALSE), outcome, ...)
 })
 
+#' @importFrom dcanr cor.pairs
 #' @rdname prepareData
 #' @export
 setMethod("prepareData", "DataFrame",
-          function(measurements, outcome, group = NULL, useFeatures = NULL, maxMissingProp = 0.0, topNvariance = NULL)
+          function(measurements, outcome, useFeatures = NULL, maxMissingProp = 0.0, maxSimilarity = 1, topNvariance = NULL)
 {
   if(is.null(rownames(measurements)))
   {
@@ -184,7 +187,6 @@ setMethod("prepareData", "DataFrame",
   {
     measurements <- measurements[-dropSamples, ]
     outcome <- outcome[-dropSamples]
-    if(!is.null(group)) group <- group[-dropSamples]
   }
   
   # Remove features or samples to ensure all features have less missingness than allowed.
@@ -222,7 +224,6 @@ setMethod("prepareData", "DataFrame",
     dropIndices <- match(dropSamples, rownames(measurements))
     measurements <- measurements[-dropIndices, ]
     outcome <- outcome[-dropIndices]
-    if(!is.null(group)) group <- group[-dropIndices]
   }
   
   # Use only the most N variable features per assay.
@@ -241,20 +242,41 @@ setMethod("prepareData", "DataFrame",
     }))
   }
 
+    if(maxSimilarity < 1)
+    {
+      dropFeatures <- integer()
+      categoricalFeatures <- sapply(measurements, class) %in% c("factor", "character")
+      if(any(categoricalFeatures))
+      {
+        checkPairs <- combn(which(categoricalFeatures), 2)
+        pValues <- apply(checkPairs, 2, function(checkPair)
+        {
+          fisher.test(table(measurements[, checkPair[1]], measurements[, checkPair[2]]))$p.value
+        })
+        if(any(pValues < maxSimilarity)) dropFeatures <- checkPairs[2, which(pValues < maxSimilarity)]
+      }
+      numericFeatures <- sapply(measurements, class) == "numeric"
+      if(any(numericFeatures))
+      {
+        correlations <- dcanr::cor.pairs(as.matrix(measurements[, numericFeatures]))
+        diag(correlations) <- 0
+        dropFeatures <- c(dropFeatures, which(numericFeatures)[unique(which(correlations > maxSimilarity, arr.ind = TRUE)[, 2])])
+      }
+      if(length(dropFeatures) > 0) measurements <- measurements[, setdiff(1:ncol(measurements), dropFeatures)]
+    }
+
+
+    # Names are on both the covariates and outcome. Ensure consistent ordering.
     if(!is.null(rownames(measurements)) && !is.null(names(outcome)))
-    {
       measurements <- measurements[match(names(outcome), rownames(measurements)), ]
-      group <- group[match(names(outcome), names(group))]
-    }
-    list(measurements = measurements, outcome = outcome, group = group)
+    list(measurements = measurements, outcome = outcome)
 })
 
 #' @rdname prepareData
 #' @export
 setMethod("prepareData", "MultiAssayExperiment",
-          function(measurements, outcomeColumns = NULL, group = NULL, useFeatures = NULL, ...)
+          function(measurements, outcomeColumns = NULL, useFeatures = NULL, ...)
 {
   if(is.null(useFeatures) || !"clinical" %in% names(useFeatures))
   {
@@ -274,12 +296,7 @@ setMethod("prepareData", "MultiAssayExperiment",
   # Get all desired measurements tables and clinical columns.
   # These form the independent variables to be used for making predictions with.
   # Variable names will have names like RNA_BRAF for traceability.
-  dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(useFeatures[["clinical"]], group, outcomeColumns))
-  if(!is.null(group))
-  {
-    group <- dataTable[, "group"]
-    dataTable <- dataTable[, -match("group", colnames(dataTable))]
-  }
+  dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(useFeatures[["clinical"]], outcomeColumns), check.names = FALSE)
   rownames(dataTable) <- dataTable[, "primary"]
   S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"])
   dataTable <- dataTable[, -match("primary", colnames(dataTable))]
@@ -295,7 +312,7 @@ setMethod("prepareData", "MultiAssayExperiment",
   S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")]
 
   # Do other filtering and preparation in DataFrame function.
-  prepareData(dataTable, outcomeColumns, group = group, useFeatures = NULL, ...)
+  prepareData(dataTable, outcomeColumns, useFeatures = NULL, ...)
 })
 
 #' @rdname prepareData
diff --git a/R/simpleParams.R b/R/simpleParams.R
index 19dc1a4..d92e204 100644
--- a/R/simpleParams.R
+++ b/R/simpleParams.R
@@ -118,7 +118,7 @@ coxphParams <- function() {
 # Cox Proportional Hazards Model with Elastic Net for Survival
 coxnetParams <- function() {
 
-  trainParams <- TrainParams(coxnetTrainInterface)
+  trainParams <- TrainParams(coxnetTrainInterface, getFeatures = penalisedFeatures)
   predictParams <- PredictParams(coxnetPredictInterface)
   
   return(list(trainParams = trainParams, predictParams = predictParams))
diff --git a/man/calcPerformance.Rd b/man/calcPerformance.Rd
index 1e06672..fac9ef0 100644
--- a/man/calcPerformance.Rd
+++ b/man/calcPerformance.Rd
@@ -9,6 +9,8 @@
 \alias{calcCVperformance,ClassifyResult-method}
 \alias{calcExternalPerformance,factor,tabular-method}
 \alias{performanceTable}
+\alias{easyHard}
+\alias{easyHard,MultiAssayExperimentOrList-method}
 \title{Add Performance Calculations to a ClassifyResult Object or Calculate
 for a Pair of Factor Vectors}
 \usage{
@@ -37,6 +39,15 @@ performanceTable(
   performanceTypes = "auto",
   aggregate = c("median", "mean")
 )
+
+\S4method{easyHard}{MultiAssayExperimentOrList}(
+  measurements,
+  result,
+  assay = "clinical",
+  useFeatures = NULL,
+  performanceType = "auto",
+  fitMode = c("single", "full")
+)
 }
 \arguments{
 \item{actualOutcome}{A factor vector or survival information specifying each sample's known outcome.}
 
 \item{predictedOutcome}{A factor vector or survival information of the same length as \code{actualOutcome} specifying each sample's predicted outcome.}
 
 \item{performanceTypes}{Default: \code{"auto"} A character vector. If \code{"auto"}, Balanced Accuracy will be used
-for a classification task and C-index for a time-to-event task.
+for a classification task and C-index for a time-to-event task. If using \code{easyHard}, the default is
+\code{"Sample Accuracy"} for a classification task and \code{"Sample C-index"} for a time-to-event task.
 Must be one of the following options:
 \itemize{
 \item{\code{"Error"}: Ordinary error rate.}
@@ -80,12 +92,22 @@ there are two classes.}
 
 \item{aggregate}{Default: \code{"median"}. Can also be \code{"mean"}. If there are multiple values, such as for repeated cross-validation, then they are summarised to a single number using either mean or median.}
+
+\item{assay}{For \code{easyHard} only. The assay in which to look for associations with the per-sample performance metric.}
+
+\item{useFeatures}{For \code{easyHard} only. Default: \code{NULL} (i.e.
use all provided features). A vector of features of the specified assay to consider.
+This allows for the avoidance of variables such as spike-in RNAs, sample IDs, sample acquisition dates, etc. which are not relevant for outcome prediction.}
+
+\item{fitMode}{For \code{easyHard} only. Default: \code{"single"}. Either \code{"single"} or \code{"full"}. If \code{"single"},
+an ordinary GLM is fitted for each covariate separately. If \code{"full"}, elastic net is used to automatically tune the non-zero model coefficients.}
 }
 \value{
 If \code{calcCVperformance} was run, an updated
 \code{\linkS4class{ClassifyResult}} object, with new metric values in the
 \code{performance} slot. If \code{calcExternalPerformance} was run, the
 performance metric value itself.
+
+For \code{easyHard}, a \code{\link{DataFrame}} of logistic regression model summaries.
 }
 \description{
 If \code{calcExternalPerformance} is used, such as when having a vector of
diff --git a/man/prepareData.Rd b/man/prepareData.Rd
index bc7e210..ba3a340 100644
--- a/man/prepareData.Rd
+++ b/man/prepareData.Rd
@@ -16,19 +16,13 @@
 \S4method{prepareData}{DataFrame}(
   measurements,
   outcome,
-  group = NULL,
   useFeatures = NULL,
   maxMissingProp = 0,
+  maxSimilarity = 1,
   topNvariance = NULL
 )
 
-\S4method{prepareData}{MultiAssayExperiment}(
-  measurements,
-  outcomeColumns = NULL,
-  group = NULL,
-  useFeatures = NULL,
-  ...
-)
+\S4method{prepareData}{MultiAssayExperiment}(measurements, outcomeColumns = NULL, useFeatures = NULL, ...)
 
 \S4method{prepareData}{list}(measurements, outcome = NULL, useFeatures = NULL, ...)
 }
@@ -47,8 +41,6 @@
 a character string, or vector of such strings, containing column name(s) of column(s)
 containing either classes or time and event information about survival. If column names
 of survival information, time must be in first column and event status in the second.}
 
-\item{group}{Not user-specified. Only passed by \code{runTests} for subsetting.}
-
 \item{useFeatures}{Default: \code{NULL} (i.e. use all provided features). If \code{measurements} is a \code{MultiAssayExperiment} or list of tabular data, a named list of features to use. Otherwise, the input data is a single table and this can just be a vector of feature names. For any assays not in the named list, all of their features are used. \code{"clinical"} is also a valid assay name and refers to the clinical data table.
 This allows for the avoidance of variables such spike-in RNAs, sample IDs, sample acquisition dates, etc. which are not relevant for outcome prediction.}
 
 \item{maxMissingProp}{Default: 0.0. A proportion less than 1 which is the maximum
 tolerated proportion of missingness for a feature to be retained for modelling.}
 
+\item{maxSimilarity}{Default: 1 (i.e. no filtering). A number between 0 and 1 which is the maximum similarity permitted between a pair of
+variables for both of them to be kept in the data set. A pair of numeric variables is too similar if its Pearson correlation exceeds this value; a pair of
+categorical variables is too similar if the p-value of Fisher's exact test of association between them is below this value. The second variable of such a pair is excluded from the data set.}
+
 \item{topNvariance}{Default: NULL. If \code{measurements} is a \code{MultiAssayExperiment} or list of tabular data, a named integer vector of most variable
 features per assay to subset to. If the input data is a single table, then simply a single integer.
 If an assays has less features, it won't be reduced in size but stay as-is.}
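
Usage sketch (editorial addition, not part of the patch): the R code below illustrates how the features named in the commit message are intended to fit together. The input object mae, its "class" clinical column, and the exact shape of some return values are hypothetical assumptions; the function names and arguments are the ones added or modified above.

library(ClassifyR)

# Assumed input: a MultiAssayExperiment named mae with a "class" column in its
# clinical table (colData).

# prepareData can now filter highly similar variables: the second variable of a
# numeric pair with Pearson correlation above 0.9, or of a categorical pair
# whose Fisher's exact test p-value is below the cut-off, is dropped.
prepared <- prepareData(mae, outcomeColumns = "class", maxSimilarity = 0.9)

# Cross-validation also stores a final model trained on all samples. It is
# classed "trainedByClassifyR" and carries a "predictFunction" attribute, so
# the predict function can dispatch to the matching prediction interface.
result <- crossValidate(prepared[["measurements"]], prepared[["outcome"]])
predictions <- predict(finalModel(result), prepared[["measurements"]])

# easyHard looks for clinical variables associated with per-sample prediction
# accuracy; by default, one logistic regression is fitted per covariate.
easyHardSummary <- easyHard(mae, result, assay = "clinical", fitMode = "single")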