From eca8bcef07b514182e7edcef7dfe8396b5d1b224 Mon Sep 17 00:00:00 2001 From: Dario Strbenac Date: Sat, 21 Dec 2024 14:00:03 +1100 Subject: [PATCH] - Features used can be extracted from penalised Cox model. - Final model can be used with predict function. - prepareData can filter correlated features. - easyHard helps to associate variables with per-sample prediction accuracy. --- DESCRIPTION | 8 ++-- NAMESPACE | 4 ++ R/calcPerformance.R | 96 ++++++++++++++++++++++++++++++++++++++- R/crissCrossValidate.R | 6 +-- R/crossValidate.R | 15 ++++-- R/interfaceCoxnet.R | 4 +- R/interfacePenalisedGLM.R | 16 +++++-- R/precisionPathways.R | 2 +- R/prepareData.R | 49 +++++++++++++------- R/simpleParams.R | 2 +- man/calcPerformance.Rd | 24 +++++++++- man/prepareData.Rd | 16 +++---- 12 files changed, 193 insertions(+), 49 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b69db13..9a3dcc6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,8 +3,8 @@ Type: Package Title: A framework for cross-validated classification problems, with applications to differential variability and differential distribution testing -Version: 3.10.4 -Date: 2024-12-13 +Version: 3.10.5 +Date: 2024-12-20 Authors@R: c( person(given = "Dario", family = "Strbenac", email = "dario.strbenac@sydney.edu.au", role = c("aut", "cre")), @@ -20,9 +20,9 @@ VignetteBuilder: knitr Encoding: UTF-8 biocViews: Classification, Survival Depends: R (>= 4.1.0), generics, methods, S4Vectors, MultiAssayExperiment, BiocParallel, survival -Imports: grid, genefilter, utils, dplyr, tidyr, rlang, ranger, ggplot2 (>= 3.0.0), ggpubr, reshape2, ggupset +Imports: grid, genefilter, utils, dplyr, tidyr, rlang, ranger, ggplot2 (>= 3.0.0), ggpubr, reshape2, ggupset, broom, dcanr Suggests: limma, edgeR, car, Rmixmod, gridExtra (>= 2.0.0), cowplot, - BiocStyle, pamr, PoiClaClu, parathyroidSE, knitr, htmltools, gtable, + BiocStyle, pamr, PoiClaClu, knitr, htmltools, gtable, scales, e1071, rmarkdown, IRanges, robustbase, glmnet, class, randomForestSRC, MatrixModels, xgboost, data.tree, ggnewscale Description: The software formalises a framework for classification and survival model evaluation diff --git a/NAMESPACE b/NAMESPACE index 36ec951..11dbc00 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,6 +33,7 @@ export(crissCrossPlot) export(crissCrossValidate) export(crossValidate) export(distribution) +export(easyHard) export(edgesToHubNetworks) export(featureSetSummary) export(finalModel) @@ -80,6 +81,7 @@ exportMethods(calcExternalPerformance) exportMethods(chosenFeatureNames) exportMethods(crossValidate) exportMethods(distribution) +exportMethods(easyHard) exportMethods(featureSetSummary) exportMethods(finalModel) exportMethods(interactorDifferences) @@ -111,6 +113,8 @@ import(reshape2) importFrom(S4Vectors,as.data.frame) importFrom(S4Vectors,do.call) importFrom(S4Vectors,mcols) +importFrom(broom,tidy) +importFrom(dcanr,cor.pairs) importFrom(dplyr,mutate) importFrom(dplyr,n) importFrom(generics,train) diff --git a/R/calcPerformance.R b/R/calcPerformance.R index 42db346..28c2a47 100755 --- a/R/calcPerformance.R +++ b/R/calcPerformance.R @@ -32,7 +32,8 @@ #' @param result An object of class \code{\link{ClassifyResult}}. #' @param resultsList A list of modelling results. Each element must be of type \code{\link{ClassifyResult}}. #' @param performanceTypes Default: \code{"auto"} A character vector. If \code{"auto"}, Balanced Accuracy will be used -#' for a classification task and C-index for a time-to-event task. 
+#' for a classification task and C-index for a time-to-event task. If using \code{easyHard}, the default is
+#' \code{"Sample Accuracy"} for a classification task and \code{"Sample C-index"} for a time-to-event task.
 #' Must be one of the following options:
 #' \itemize{
 #' \item{\code{"Error"}: Ordinary error rate.}
@@ -130,6 +131,11 @@ setMethod("calcExternalPerformance", c("factor", "tabular"), # table has class p
     )
 })
 
+calcCVperformance <- function(results, ...)
+{
+  lapply(results, calcCVperformance, ...)
+}
+
 #' @rdname calcPerformance
 #' @usage NULL
 #' @export
@@ -427,4 +433,90 @@ performanceTable <- function(resultsList, performanceTypes = "auto", aggregate =
     })
     DataFrame(tidyr::pivot_wider(as.data.frame(result@characteristics), names_from = characteristic, values_from = value), performances, check.names = FALSE)
   }))
-}
\ No newline at end of file
+}
+
+#' @rdname calcPerformance
+#' @usage NULL
+#' @export
+setGeneric("easyHard", function(measurements, result, assay, performanceType, ...)
+           standardGeneric("easyHard"))
+
+#' @rdname calcPerformance
+#' @exportMethod easyHard
+#' @importFrom broom tidy
+#' @param assay For \code{easyHard} only. The assay in which to look for associations with the per-sample performance metric.
+#' @param useFeatures For \code{easyHard} only. Default: \code{NULL} (i.e. use all provided features). A vector of features of the specified assay to consider.
+#' This allows for the avoidance of variables such as spike-in RNAs, sample IDs, sample acquisition dates, etc. which are not relevant for outcome prediction.
+#' @param fitMode For \code{easyHard} only. Default: \code{"single"}. Either \code{"single"} or \code{"full"}. If \code{"single"},
+#' an ordinary GLM is fitted for each covariate separately. If \code{"full"}, elastic net is used to automatically tune the non-zero model coefficients.
+#' @return For \code{easyHard}, a \code{\link{DataFrame}} of logistic regression model summaries.
+
+setMethod("easyHard", "MultiAssayExperimentOrList",
+          function(measurements, result, assay = "clinical", useFeatures = NULL, performanceType = "auto",
+                   fitMode = c("single", "full"))
+{
+    if(!requireNamespace("glmnet", quietly = TRUE))
+      stop("The package 'glmnet' could not be found. Please install it.")
+
+    if(!assay %in% c(names(measurements), "clinical")) stop("'assay' is not one of the names of 'measurements'.")
+    fitMode <- match.arg(fitMode)
+
+    if(is(measurements, "MultiAssayExperiment"))
+    {
+      if(assay == "clinical")
+        assay <- colData(measurements)
+      else assay <- t(measurements[, , assay]) # Ensure that features are in columns.
+    } else {assay <- measurements[[assay]]}
+    if(!is.null(useFeatures)) assay <- assay[, useFeatures]
+    if(performanceType == "auto")
+    {
+      if("risk" %in% colnames(predictions(result)))
+      {
+        performanceType <- "Sample C-index"
+      } else {performanceType <- "Sample Accuracy"}
+    }
+    if(!performanceType %in% names(performance(result)))
+    {
+      warning(paste(performanceType, "not found in result. Calculating it now."))
+      result <- calcCVperformance(result, performanceType)
+    }
+    samplePerformance <- performance(result)[[performanceType]]
+    if(any(is.na(samplePerformance)))
+    {
+      keep <- !is.na(samplePerformance)
+      assay <- assay[keep, ]
+      samplePerformance <- samplePerformance[keep]
+    }
+    assay <- assay[names(samplePerformance), ] # Just in case.
+    assayOHE <- MatrixModels::model.Matrix(~ 0 + ., data = assay)
+
+    if(fitMode == "single")
+    {
+      as(do.call(rbind, lapply(colnames(assay), function(featureID)
+      {
+        covariate <- assay[, featureID]
+        fitted <- glm(samplePerformance ~ covariate, family = binomial, weights = rep(100, length(samplePerformance)))
+        summaryDF <- broom::tidy(fitted)
+        if(is.factor(covariate))
+        {
+          summaryDF[2:nrow(summaryDF), "term"] <- paste(featureID, levels(covariate)[2:length(levels(covariate))], sep = ": ")
+
+        } else {summaryDF[, "term"] <- featureID}
+        summaryDF[2:nrow(summaryDF), ]
+      })), "DataFrame")
+    } else { # Penalised regression.
+      samplePerformanceM <- matrix(c(1 - samplePerformance, samplePerformance), ncol = 2)
+      fitted <- glmnet::glmnet(assayOHE, samplePerformanceM, family = "binomial")
+      lambdaConsider <- colSums(as.matrix(fitted[["beta"]])) != 0
+      bestLambda <- fitted[["lambda"]][lambdaConsider][which.min(sapply(fitted[["lambda"]][lambdaConsider], function(lambda) # Largest lambda with the minimum total absolute error.
+      {
+        predictions <- predict(fitted, assayOHE, s = lambda, type = "response")
+        sum(abs(samplePerformanceM[, 2] - predictions))
+      }))[1]]
+      useVariables <- abs(fitted[["beta"]][, fitted[["lambda"]] == bestLambda]) > 0.00001
+      useVariables <- colnames(assay)[unique(assayOHE@assign[useVariables])]
+      dataForModel <- data.frame(assay[, useVariables, drop = FALSE], performance = samplePerformanceM[, 2])
+      fitted <- glm(performance ~ . + 0, data = dataForModel, family = binomial(), weights = rep(100, nrow(dataForModel)))
+      broom::tidy(fitted)
+    }
+})
\ No newline at end of file
diff --git a/R/crissCrossValidate.R b/R/crissCrossValidate.R
index 301ae31..f2d872c 100644
--- a/R/crissCrossValidate.R
+++ b/R/crissCrossValidate.R
@@ -190,7 +190,7 @@ crissCrossPlot <- function(crissCrossResult, includeValues = FALSE){
 
     ggheatmap <- ggplot(melted_cormat, aes(Var1, Var2, fill = value)) +
       geom_tile(color = "white") +
-      scale_fill_gradient2(high = "red", mid = "white", low = "blue",
+      scale_fill_gradient2(high = "#e25563ff", mid = "white", low = "#094bacff",
         midpoint = 0.5, limit = c(0,1), space = "Lab",
         name=as.character(scalebar_title)) +
       theme_bw() + xlab("Training Dataset") + ylab("Testing Dataset") +
@@ -205,7 +205,7 @@ crissCrossPlot <- function(crissCrossResult, includeValues = FALSE){
       melted_cormat_1 <- melt(real, na.rm = TRUE)
       ggheatmap_1 <- ggplot(melted_cormat_1, aes(Var1, Var2, fill = value)) +
         geom_tile(color = "white") +
-        scale_fill_gradient2(high = "red", mid = "white", low = "blue",
+        scale_fill_gradient2(high = "#e25563ff", mid = "white", low = "#094bacff",
           midpoint = 0.5, limit = c(0,1), space = "Lab",
           name=as.character(scalebar_title)) +
         theme_bw() + xlab("Features Extracted") + ylab("Dataset Tested") +
@@ -218,7 +218,7 @@ crissCrossPlot <- function(crissCrossResult, includeValues = FALSE){
       melted_cormat_2 <- melt(random, na.rm = TRUE)
       ggheatmap_2 <- ggplot(melted_cormat_2, aes(Var1, Var2, fill = value)) +
         geom_tile(color = "white") +
-        scale_fill_gradient2(high = "red", mid = "white", low = "blue",
+        scale_fill_gradient2(high = "#e25563ff", mid = "white", low = "#094bacff",
           midpoint = 0.5, limit = c(0,1), space = "Lab",
           name=as.character(scalebar_title)) +
         theme_bw() + xlab("Features Extracted") + ylab("Dataset Tested") +
diff --git a/R/crossValidate.R b/R/crossValidate.R
index 9a4f542..38638de 100644
--- a/R/crossValidate.R
+++ b/R/crossValidate.R
@@ -747,6 +747,8 @@ CV <- function(measurements, outcome, x, outcomeTrain, measurementsTest, outcome
         if(is.character(classifyResults)) stop(classifyResults)
 fullResult <- runTest(measurements, outcome, measurements, outcome, crossValParams = crossValParams, modellingParams = modellingParams, characteristics = characteristics, .iteration = 1)
         classifyResults@finalModel <- fullResult$models
+        class(classifyResults@finalModel) <- c("trainedByClassifyR", class(classifyResults@finalModel))
+        attr(classifyResults@finalModel, "predictFunction") <- modellingParams@trainParams@classifier
     }
 
     classifyResults
@@ -797,7 +799,7 @@ train.DataFrame <- function(x, outcomeTrain, selectionMethod = "auto", nFeatures
   if(isTuneCross && !extraParams[["tuneCross"]][["performanceType"]] %in% c("auto", .ClassifyRenvir[["performanceTypes"]]))
     stop(paste("performanceType for tuning must be one of", paste(c("auto", .ClassifyRenvir[["performanceTypes"]]), collapse = ", "), "but is", extraParams[["tuneCross"]][["performanceType"]]))
 
-  isCategorical <- is.character(outcome) && (length(outcome) == 1 || length(outcome) == nrow(measurements)) || is.factor(outcome)
+  isCategorical <- is.character(outcomeTrain) && (length(outcomeTrain) == 1 || length(outcomeTrain) == nrow(measurements)) || is.factor(outcomeTrain)
   if(isTuneCross && extraParams[["tuneCross"]][["performanceType"]] == "auto")
     if(isCategorical) extraParams[["tuneCross"]][["performanceType"]] <- "Balanced Accuracy" else extraParams[["tuneCross"]][["performanceType"]] <- "C-index"
   if(length(selectionMethod) == 1 && selectionMethod == "auto")
@@ -838,7 +840,7 @@ train.DataFrame <- function(x, outcomeTrain, selectionMethod = "auto", nFeatures
                   measurementsUse <- measurements
               }
 
-              classifierParams <- .classifierKeywordToParams(keyword = classifierForAssay)
+              classifierParams <- .classifierKeywordToParams(classifierForAssay, extraParams[["train"]][["tuneParams"]])
               if(!is.null(extraParams) && "train" %in% names(extraParams))
               {
                 for(paramIndex in seq_along(extraParams[["train"]]))
@@ -922,6 +924,7 @@ train.DataFrame <- function(x, outcomeTrain, selectionMethod = "auto", nFeatures
     if(multiViewMethod == "merge"){
       measurementsUse <- measurements[, S4Vectors::mcols(measurements)[["assay"]] %in% assayIDs, drop = FALSE]
       model <- .doTrain(measurementsUse, outcomeTrain, NULL, NULL, crossValParams, modellingParams, verbose = verbose)[["model"]]
+      attr(model, "predictFunction") <- modellingParams@trainParams@classifier
       class(model) <- c("trainedByClassifyR", class(model))
     }
 
@@ -948,6 +951,7 @@ train.DataFrame <- function(x, outcomeTrain, selectionMethod = "auto", nFeatures
                                             getFeatures = prevalFeatures), predictParams = PredictParams(prevalPredictInterface, characteristics = paramsAssays$clinical@predictParams@characteristics))
       model <- .doTrain(measurementsUse, outcomeTrain, NULL, NULL, crossValParams, modellingParams, verbose = verbose)[["model"]]
+      attr(model, "predictFunction") <- modellingParams@trainParams@classifier
       class(model) <- c("trainedByClassifyR", class(model))
     }
 
@@ -965,6 +969,7 @@ train.DataFrame <- function(x, outcomeTrain, selectionMethod = "auto", nFeatures
                                             getFeatures = PCAfeatures), predictParams = PredictParams(pcaPredictInterface, characteristics = paramsClinical$clinical@predictParams@characteristics))
     model <- .doTrain(measurementsUse, outcomeTrain, NULL, NULL, crossValParams, modellingParams, verbose = verbose)[["model"]]
+    attr(model, "predictFunction") <- modellingParams@trainParams@classifier
     class(model) <- c("trainedByClassifyR", class(model))
   }
   if(missing(models) || is.null(models)) return(model) else return(models)
@@ -1050,12 +1055,12 @@ predict.trainedByClassifyR <- function(object, newData, outcome, ...)
 }, newData, names(newData))
     newData <- do.call(cbind, newData)
   } else if(is(newData, "MultiAssayExperiment"))
-  {
-    newData <- prepareData(newData, outcome)
+  {
+    newData <- prepareData(newData, outcome)[["measurements"]]
   }
 
   predictFunctionUse <- attr(object, "predictFunction")
-  class(object) <- rev(class(object)) # Now want the predict method of the specific model to be picked, so put model class first.
+  class(object) <- class(object)[-1] # Remove "trainedByClassifyR" so that the specific model's class is first and its own predict method is dispatched.
   if (is(object, "listOfModels")) 
     mapply(function(model, assay) predictFunctionUse(model, assay), object, newData, MoreArgs = list(...), SIMPLIFY = FALSE)
   else do.call(predictFunctionUse, list(object, newData, ...)) # Object is itself a trained model and it is assumed that a predict method is defined for it.
diff --git a/R/interfaceCoxnet.R b/R/interfaceCoxnet.R
index c23d18b..31c8091 100644
--- a/R/interfaceCoxnet.R
+++ b/R/interfaceCoxnet.R
@@ -8,7 +8,7 @@ coxnetTrainInterface <- function(measurementsTrain, survivalTrain, lambda = NULL
   message(Sys.time(), ": Fitting coxnet model to data.")
   measurementsTrain <- data.frame(measurementsTrain, check.names = FALSE)
 
-  measurementsMatrix <- glmnet::makeX(as(measurementsTrain, "data.frame"))
+  measurementsMatrix <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTrain)
 
   # The response variable is a Surv class of object.
   fit <- glmnet::cv.glmnet(measurementsMatrix, survivalTrain, family = "cox", type = "C", ...)
@@ -16,6 +16,8 @@ coxnetTrainInterface <- function(measurementsTrain, survivalTrain, lambda = NULL
 
   offset <- -mean(predict(fitted, measurementsMatrix, s = fit$lambda.min, type = "link"))
   attr(fitted, "tune") <- list(lambda = fit$lambda.min, offset = offset)
+  attr(fitted, "featureNames") <- colnames(measurementsMatrix)
+  attr(fitted, "featureGroups") <- measurementsMatrix@assign
   class(fitted) <- class(fitted)[1] # Get rid of glmnet which messes with dispatch.
 
   fitted
diff --git a/R/interfacePenalisedGLM.R b/R/interfacePenalisedGLM.R
index 99548ec..bcf4f5a 100644
--- a/R/interfacePenalisedGLM.R
+++ b/R/interfacePenalisedGLM.R
@@ -32,9 +32,10 @@ attr(penalisedGLMtrainInterface, "name") <- "penalisedGLMtrainInterface"
 
 # model is of class multnet
 penalisedGLMpredictInterface <- function(model, measurementsTest, lambda, ..., returnType = c("both", "class", "score"), verbose = 3)
-{ # ... just consumes emitted tuning variables from .doTrain which are unused.
+{
+  # ... just consumes emitted tuning variables from .doTrain which are unused.
   returnType <- match.arg(returnType)
-  # One-hot encoding needed.
+  # One-hot encoding needed.
   measurementsTest <- MatrixModels::model.Matrix(~ 0 + ., data = measurementsTest)
 
   # Ensure that testing data has same columns names in same order as training data.
@@ -51,7 +52,7 @@ penalisedGLMpredictInterface <- function(model, measurementsTest, lambda, ..., r
 
   measurementsTest <- measurementsTest[, rownames(model[["beta"]][[1]])]
-  
+
   classPredictions <- factor(as.character(predict(model, measurementsTest, s = lambda, type = "class")), levels = model[["classnames"]])
   classScores <- predict(model, measurementsTest, s = lambda, type = "response")[, , 1]
@@ -78,8 +79,13 @@ penalisedFeatures <- function(model)
 {
   # Floating point numbers test for equality.
 whichCoefficientColumn <- which(abs(model[["lambda"]] - attr(model, "tune")[["lambda"]]) < 0.00001)[1]
-  coefficientsUsed <- sapply(model[["beta"]], function(classCoefficients) classCoefficients[, whichCoefficientColumn])
-  featureScores <- rowSums(abs(coefficientsUsed))
+  if(is.list(model[["beta"]])) # Categorical data
+  {
+    coefficientsUsed <- sapply(model[["beta"]], function(classCoefficients) classCoefficients[, whichCoefficientColumn])
+    featureScores <- rowSums(abs(coefficientsUsed))
+  } else { # Survival data
+    featureScores <- model[["beta"]][, whichCoefficientColumn]
+  }
   featureGroups <- attr(model, "featureGroups")[match(names(featureScores), attr(model, "featureNames"))]
   groupScores <- unname(by(featureScores, featureGroups, max))
   rankedFeaturesIndices <- order(groupScores, decreasing = TRUE)
diff --git a/R/precisionPathways.R b/R/precisionPathways.R
index 1381ad4..8c5e016 100644
--- a/R/precisionPathways.R
+++ b/R/precisionPathways.R
@@ -345,7 +345,7 @@ bubblePlot.PrecisionPathways <- function(precisionPathways, pathwayColours = NUL
   performance <- precisionPathways[["performance"]]
   performance <- cbind(Sequence = rownames(performance), performance)
   ggplot2::ggplot(performance, aes(x = accuracy, y = cost, colour = Sequence, size = 4)) + ggplot2::geom_point() +
-    ggplot2::scale_color_manual(values = pathwayColours) + ggplot2::labs(x = "Balanced Accuracy", y = "Total Cost") + ggplot2::guides(size = FALSE)
+    ggplot2::scale_color_manual(values = pathwayColours) + ggplot2::labs(x = "Balanced Accuracy", y = "Total Cost") + ggplot2::guides(size = "none")
 }
 
 #' @export
diff --git a/R/prepareData.R b/R/prepareData.R
index 26a9d65..9091851 100644
--- a/R/prepareData.R
+++ b/R/prepareData.R
@@ -15,7 +15,6 @@
 #' a character string, or vector of such strings, containing column name(s) of column(s)
 #' containing either classes or time and event information about survival. If column names
 #' of survival information, time must be in first column and event status in the second.
-#' @param group Not user-specified. Only passed by \code{runTests} for subsetting.
 #' @param outcomeColumns If \code{measurements} is a \code{MultiAssayExperiment}, the
 #' names of the column (class) or columns (survival) in the table extracted by \code{colData(data)}
 #' that contain(s) the each individual's outcome to use for prediction.
@@ -25,6 +24,9 @@
 #' This allows for the avoidance of variables such spike-in RNAs, sample IDs, sample acquisition dates, etc. which are not relevant for outcome prediction.
 #' @param maxMissingProp Default: 0.0. A proportion less than 1 which is the maximum
 #' tolerated proportion of missingness for a feature to be retained for modelling.
+#' @param maxSimilarity Default: 1 (i.e. no filtering). A number between 0 and 1 which is the maximum similarity permitted between a pair of
+#' variables for both of them to be kept in the data set. A pair of numeric variables is too similar if its Pearson correlation exceeds this value; a pair of
+#' categorical variables is too similar if the p-value of Fisher's exact test of association between them is below this value. The second variable of such a pair is excluded from the data set.
 #' @param topNvariance Default: NULL. If \code{measurements} is a \code{MultiAssayExperiment} or list of tabular data, a named integer vector of most variable
 #' features per assay to subset to. If the input data is a single table, then simply a single integer.
 #' If an assays has less features, it won't be reduced in size but stay as-is.
@@ -56,10 +58,11 @@
     prepareData(S4Vectors::DataFrame(measurements, check.names = FALSE), outcome, ...)
 })
 
+#' @importFrom dcanr cor.pairs
 #' @rdname prepareData
 #' @export
 setMethod("prepareData", "DataFrame",
-          function(measurements, outcome, group = NULL, useFeatures = NULL, maxMissingProp = 0.0, topNvariance = NULL)
+          function(measurements, outcome, useFeatures = NULL, maxMissingProp = 0.0, maxSimilarity = 1, topNvariance = NULL)
 {
   if(is.null(rownames(measurements)))
   {
@@ -184,7 +187,6 @@ setMethod("prepareData", "DataFrame",
   {
     measurements <- measurements[-dropSamples, ]
     outcome <- outcome[-dropSamples]
-    if(!is.null(group)) group <- group[-dropSamples]
   }
   
   # Remove features or samples to ensure all features have less missingness than allowed.
@@ -222,7 +224,6 @@ setMethod("prepareData", "DataFrame",
     dropIndices <- match(dropSamples, rownames(measurements))
     measurements <- measurements[-dropIndices, ]
     outcome <- outcome[-dropIndices]
-    if(!is.null(group)) group <- group[-dropIndices]
   }
   
   # Use only the most N variable features per assay.
@@ -241,20 +242,41 @@ setMethod("prepareData", "DataFrame",
     }))
   }
 
+    if(maxSimilarity < 1)
+    {
+      dropFeatures <- integer()
+      categoricalFeatures <- sapply(measurements, class) %in% c("factor", "character")
+      if(any(categoricalFeatures))
+      {
+        checkPairs <- combn(which(categoricalFeatures), 2)
+        pValues <- apply(checkPairs, 2, function(checkPair)
+        {
+          fisher.test(table(measurements[, checkPair[1]], measurements[, checkPair[2]]))$p.value
+        })
+        if(any(pValues < maxSimilarity)) dropFeatures <- checkPairs[2, which(pValues < maxSimilarity)]
+      }
+      numericFeatures <- sapply(measurements, class) == "numeric"
+      if(any(numericFeatures))
+      {
+        correlations <- dcanr::cor.pairs(as.matrix(measurements[, numericFeatures]))
+        diag(correlations) <- 0
+        dropFeatures <- c(dropFeatures, which(numericFeatures)[unique(which(correlations > maxSimilarity, arr.ind = TRUE)[, 2])])
+      }
+      if(length(dropFeatures) > 0) measurements <- measurements[, setdiff(1:ncol(measurements), dropFeatures)]
+    }
+
+
+    # Names are on both the covariates and outcome. Ensure consistent ordering.
     if(!is.null(rownames(measurements)) && !is.null(names(outcome)))
-    {
       measurements <- measurements[match(names(outcome), rownames(measurements)), ]
-      group <- group[match(names(outcome), names(group))]
-    }
-    list(measurements = measurements, outcome = outcome, group = group)
+    list(measurements = measurements, outcome = outcome)
 })
 
 #' @rdname prepareData
 #' @export
 setMethod("prepareData", "MultiAssayExperiment",
-          function(measurements, outcomeColumns = NULL, group = NULL, useFeatures = NULL, ...)
+          function(measurements, outcomeColumns = NULL, useFeatures = NULL, ...)
 {
   if(is.null(useFeatures) || !"clinical" %in% names(useFeatures))
   {
@@ -274,12 +296,7 @@ setMethod("prepareData", "MultiAssayExperiment",
   # Get all desired measurements tables and clinical columns.
   # These form the independent variables to be used for making predictions with.
   # Variable names will have names like RNA_BRAF for traceability.
-  dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(useFeatures[["clinical"]], group, outcomeColumns))
-  if(!is.null(group))
-  {
-    group <- dataTable[, "group"]
-    dataTable <- dataTable[, -match("group", colnames(dataTable))]
-  }
+  dataTable <- MultiAssayExperiment::wideFormat(measurements, colDataCols = union(useFeatures[["clinical"]], outcomeColumns), check.names = FALSE)
   rownames(dataTable) <- dataTable[, "primary"]
   S4Vectors::mcols(dataTable)[, "sourceName"] <- gsub("colDataCols", "clinical", S4Vectors::mcols(dataTable)[, "sourceName"])
   dataTable <- dataTable[, -match("primary", colnames(dataTable))]
@@ -295,7 +312,7 @@ setMethod("prepareData", "MultiAssayExperiment",
   S4Vectors::mcols(dataTable) <- S4Vectors::mcols(dataTable)[, c("assay", "feature")]
 
   # Do other filtering and preparation in DataFrame function.
-  prepareData(dataTable, outcomeColumns, group = group, useFeatures = NULL, ...)
+  prepareData(dataTable, outcomeColumns, useFeatures = NULL, ...)
 })
 
 #' @rdname prepareData
diff --git a/R/simpleParams.R b/R/simpleParams.R
index 19dc1a4..d92e204 100644
--- a/R/simpleParams.R
+++ b/R/simpleParams.R
@@ -118,7 +118,7 @@ coxphParams <- function() {
 # Cox Proportional Hazards Model with Elastic Net for Survival
 coxnetParams <- function() {
 
-  trainParams <- TrainParams(coxnetTrainInterface)
+  trainParams <- TrainParams(coxnetTrainInterface, getFeatures = penalisedFeatures)
   predictParams <- PredictParams(coxnetPredictInterface)
   
   return(list(trainParams = trainParams, predictParams = predictParams))
diff --git a/man/calcPerformance.Rd b/man/calcPerformance.Rd
index 1e06672..fac9ef0 100644
--- a/man/calcPerformance.Rd
+++ b/man/calcPerformance.Rd
@@ -9,6 +9,8 @@
 \alias{calcCVperformance,ClassifyResult-method}
 \alias{calcExternalPerformance,factor,tabular-method}
 \alias{performanceTable}
+\alias{easyHard}
+\alias{easyHard,MultiAssayExperimentOrList-method}
 \title{Add Performance Calculations to a ClassifyResult Object or Calculate
 for a Pair of Factor Vectors}
 \usage{
@@ -37,6 +39,15 @@ performanceTable(
   performanceTypes = "auto",
   aggregate = c("median", "mean")
 )
+
+\S4method{easyHard}{MultiAssayExperimentOrList}(
+  measurements,
+  result,
+  assay = "clinical",
+  useFeatures = NULL,
+  performanceType = "auto",
+  fitMode = c("single", "full")
+)
 }
 \arguments{
 \item{actualOutcome}{A factor vector or survival information specifying each sample's known outcome.}
 
 \item{predictedOutcome}{A factor vector or survival information of the same length as \code{actualOutcome} specifying each sample's predicted outcome.}
 
 \item{performanceTypes}{Default: \code{"auto"} A character vector. If \code{"auto"}, Balanced Accuracy will be used
-for a classification task and C-index for a time-to-event task.
+for a classification task and C-index for a time-to-event task. If using \code{easyHard}, the default is
+\code{"Sample Accuracy"} for a classification task and \code{"Sample C-index"} for a time-to-event task.
 Must be one of the following options:
 \itemize{
 \item{\code{"Error"}: Ordinary error rate.}
@@ -80,12 +92,22 @@ there are two classes.}
 
 \item{aggregate}{Default: \code{"median"}. Can also be \code{"mean"}. If there are multiple values, such as for repeated cross-validation, then they are summarised to a single number using either mean or median.}
+
+\item{assay}{For \code{easyHard} only. The assay in which to look for associations with the per-sample performance metric.}
+
+\item{useFeatures}{For \code{easyHard} only. Default: \code{NULL} (i.e.
use all provided features). A vector of features of the specified assay to consider.
+This allows for the avoidance of variables such as spike-in RNAs, sample IDs, sample acquisition dates, etc. which are not relevant for outcome prediction.}
+
+\item{fitMode}{For \code{easyHard} only. Default: \code{"single"}. Either \code{"single"} or \code{"full"}. If \code{"single"},
+an ordinary GLM is fitted for each covariate separately. If \code{"full"}, elastic net is used to automatically tune the non-zero model coefficients.}
 }
 \value{
 If \code{calcCVperformance} was run, an updated
 \code{\linkS4class{ClassifyResult}} object, with new metric values in the
 \code{performance} slot. If \code{calcExternalPerformance} was run, the
 performance metric value itself.
+
+For \code{easyHard}, a \code{\link{DataFrame}} of logistic regression model summaries.
 }
 \description{
 If \code{calcExternalPerformance} is used, such as when having a vector of
diff --git a/man/prepareData.Rd b/man/prepareData.Rd
index bc7e210..ba3a340 100644
--- a/man/prepareData.Rd
+++ b/man/prepareData.Rd
@@ -16,19 +16,13 @@
 \S4method{prepareData}{DataFrame}(
   measurements,
   outcome,
-  group = NULL,
   useFeatures = NULL,
   maxMissingProp = 0,
+  maxSimilarity = 1,
   topNvariance = NULL
 )
 
-\S4method{prepareData}{MultiAssayExperiment}(
-  measurements,
-  outcomeColumns = NULL,
-  group = NULL,
-  useFeatures = NULL,
-  ...
-)
+\S4method{prepareData}{MultiAssayExperiment}(measurements, outcomeColumns = NULL, useFeatures = NULL, ...)
 
 \S4method{prepareData}{list}(measurements, outcome = NULL, useFeatures = NULL, ...)
 }
@@ -47,8 +41,6 @@
 a character string, or vector of such strings, containing column name(s) of column(s)
 containing either classes or time and event information about survival. If column names
 of survival information, time must be in first column and event status in the second.}
 
-\item{group}{Not user-specified. Only passed by \code{runTests} for subsetting.}
-
 \item{useFeatures}{Default: \code{NULL} (i.e. use all provided features). If \code{measurements} is a \code{MultiAssayExperiment} or list of tabular data, a named list of features to use. Otherwise, the input data is a single table and this can just be a vector of feature names. For any assays not in the named list, all of their features are used. \code{"clinical"} is also a valid assay name and refers to the clinical data table.
 This allows for the avoidance of variables such spike-in RNAs, sample IDs, sample acquisition dates, etc. which are not relevant for outcome prediction.}
 
 \item{maxMissingProp}{Default: 0.0. A proportion less than 1 which is the maximum
 tolerated proportion of missingness for a feature to be retained for modelling.}
 
+\item{maxSimilarity}{Default: 1 (i.e. no filtering). A number between 0 and 1 which is the maximum similarity permitted between a pair of
+variables for both of them to be kept in the data set. A pair of numeric variables is too similar if its Pearson correlation exceeds this value; a pair of
+categorical variables is too similar if the p-value of Fisher's exact test of association between them is below this value. The second variable of such a pair is excluded from the data set.}
+
 \item{topNvariance}{Default: NULL. If \code{measurements} is a \code{MultiAssayExperiment} or list of tabular data, a named integer vector of most variable
 features per assay to subset to. If the input data is a single table, then simply a single integer.
 If an assays has less features, it won't be reduced in size but stay as-is.}
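
Usage sketch (editorial addition, not part of the patch): the R code below illustrates how the features named in the commit message are intended to fit together. The input object mae, its "class" clinical column, and the exact shape of some return values are hypothetical assumptions; the function names and arguments are the ones added or modified above.

library(ClassifyR)

# Assumed input: a MultiAssayExperiment named mae with a "class" column in its
# clinical table (colData).

# prepareData can now filter highly similar variables: the second variable of a
# numeric pair with Pearson correlation above 0.9, or of a categorical pair
# whose Fisher's exact test p-value is below the cut-off, is dropped.
prepared <- prepareData(mae, outcomeColumns = "class", maxSimilarity = 0.9)

# Cross-validation also stores a final model trained on all samples. It is
# classed "trainedByClassifyR" and carries a "predictFunction" attribute, so
# the predict function can dispatch to the matching prediction interface.
result <- crossValidate(prepared[["measurements"]], prepared[["outcome"]])
predictions <- predict(finalModel(result), prepared[["measurements"]])

# easyHard looks for clinical variables associated with per-sample prediction
# accuracy; by default, one logistic regression is fitted per covariate.
easyHardSummary <- easyHard(mae, result, assay = "clinical", fitMode = "single")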