feat: allow exclusion of samples/groups from feature definition

- Support excluding samples/groups from the definition of new features with the *PeakDensity* method by setting their value in the `sampleGroups` parameter to `NA` (issue #742).
sneumann · Apr 24, 2024 · ea35751 · ea35751
1 parent faba48d
commit ea35751
Show file tree

Hide file tree

Showing 11 changed files with 111 additions and 40 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: xcms
-Version: 4.1.13
+Version: 4.1.14
 Title: LC-MS and GC-MS Data Analysis
 Description: Framework for processing and visualization of chromatographically
     separated and single-spectra mass spectral data. Imports from AIA/ANDI NetCDF,

diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,8 @@
 
 ## Changes in version 4.1.14
 
+- Support excluding samples or sample groups from defining features with
+  *PeakDensity* correspondence analysis (issue #742).
 - Add `plotPrecursorIons()` function.
 
 ## Changes in version 4.1.13

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
@@ -1319,7 +1319,15 @@ setGeneric("group", function(object, ...) standardGeneric("group"))
 #'   representing the m/z dependent measurement error of some MS instruments).
 #'   All peaks (from the same or from different samples) with their apex
 #'   position being close on the retention time axis are grouped into a LC-MS
-#'   feature. See in addition [do_groupChromPeaks_density()] for the core API
+#'   feature. Only samples with a non-missing sample group assignment (i.e.
+#'   for which the value provided with parameter `sampleGroups` is not `NA`)
+#'   are considered and counted for the feature definition. This allows
+#'   excluding certain samples or groups (e.g. blanks) from the feature
+#'   definition, thus avoiding features with peaks detected only in these.
+#'   Note that this affects only the **definition** of **new** features.
+#'   Chromatographic peaks in these samples will still be assigned to features
+#'   which were defined based on the other samples.
+#'   See in addition [do_groupChromPeaks_density()] for the core API
 #'   function.
 #'
 #' - `NearestPeaksParam`: performs peak grouping based on the proximity of
@@ -1399,11 +1407,13 @@ setGeneric("group", function(object, ...) standardGeneric("group"))
 #'
 #' @param sampleGroups For `PeakDensityParam`: A vector of the same length than
 #'     samples defining the sample group assignments (i.e. which samples
-#'     belong to which sample
-#'     group). This parameter is mandatory for the `PeakDensityParam`
-#'     and has to be provided also if there is no sample grouping in the
-#'     experiment (in which case all samples should be assigned to the
-#'     same group).
+#'     belong to which sample group). This parameter is mandatory for
+#'     `PeakDensityParam` and has to be defined also if there is no sample
+#'     grouping in the experiment (in which case all samples should be
+#'     assigned to the same group). Samples for which `NA` is provided will
+#'     not be considered in the feature definition step. Providing `NA` for
+#'     all blanks in an experiment will, for example, prevent features from
+#'     being defined for signals (chrom peaks) present only in blank samples.
 #'
 #' @param value Replacement value for `<-` methods.
 #'

diff --git a/R/XcmsExperiment-plotting.R b/R/XcmsExperiment-plotting.R
@@ -450,6 +450,11 @@ setMethod(
 #' pest_dda <- readMsExperiment(fl)
 #'
 #' plotPrecursorIons(pest_dda)
+#' grid()
+#'
+#' ## Subset the data object to plot the data specifically for one or
+#' ## more selected files/samples:
+#' plotPrecursorIons(pest_dda[1L])
 plotPrecursorIons <- function(x, pch = 21, col = "#00000080",
                               bg = "#00000020", xlab = "retention time",
                               ylab = "m/z", main = character(), ...) {
@@ -468,6 +473,5 @@ plotPrecursorIons <- function(x, pch = 21, col = "#00000080",
             main <- basename(dataOrigin(spectra(x_sub)[1L]))
         plot(prt, pmz, xlim = rtr, ylim = mzr, pch = pch, col = col, bg = bg,
              xlab = xlab, ylab = ylab, main = main[1L], ...)
-        grid()
     }
 }
diff --git a/R/do_groupChromPeaks-functions.R b/R/do_groupChromPeaks-functions.R
@@ -113,8 +113,11 @@ do_groupChromPeaks_density <- function(peaks, sampleGroups,
              paste0("'", .reqCols[!.reqCols %in% colnames(peaks)],"'",
                     collapse = ", "), " not found in 'peaks' parameter")
 
-    sampleGroups <- as.character(sampleGroups)
-    sampleGroupNames <- unique(sampleGroups)
+    ## A `factor` drops `NA` from `levels()` and `table()`, so samples with an
+    ## `NA` group are excluded from (not counted in) the feature definition.
+    if (!is.factor(sampleGroups))
+        sampleGroups <- factor(sampleGroups)
+    sampleGroupNames <- levels(sampleGroups)
     sampleGroupTable <- table(sampleGroups)
     nSampleGroups <- length(sampleGroupTable)
 
@@ -160,15 +163,12 @@ do_groupChromPeaks_density <- function(peaks, sampleGroups,
             pb$tick()
         if (endIdx - startIdx < 0)
             next
-        resL[[i]] <- .group_peaks_density(peaks[startIdx:endIdx, , drop = FALSE],
-                                          bw = bw, densFrom = densFrom,
-                                          densTo = densTo, densN = densN,
-                                          sampleGroups = sampleGroups,
-                                          sampleGroupTable = sampleGroupTable,
-                                          minFraction = minFraction,
-                                          minSamples = minSamples,
-                                          maxFeatures = maxFeatures,
-                                          sleep = sleep)
+        resL[[i]] <- .group_peaks_density(
+            peaks[startIdx:endIdx, , drop = FALSE], bw = bw,
+            densFrom = densFrom, densTo = densTo, densN = densN,
+            sampleGroups = sampleGroups, sampleGroupTable = sampleGroupTable,
+            minFraction = minFraction, minSamples = minSamples,
+            maxFeatures = maxFeatures, sleep = sleep)
     }
     res <- do.call(rbind, resL)
     if (nrow(res)) {

diff --git a/man/do_groupChromPeaks_density.Rd b/man/do_groupChromPeaks_density.Rd
diff --git a/man/do_groupChromPeaks_nearest.Rd b/man/do_groupChromPeaks_nearest.Rd
diff --git a/man/do_groupPeaks_mzClust.Rd b/man/do_groupPeaks_mzClust.Rd
diff --git a/man/groupChromPeaks.Rd b/man/groupChromPeaks.Rd
diff --git a/man/plotPrecursorIons.Rd b/man/plotPrecursorIons.Rd
diff --git a/tests/testthat/test_do_groupChromPeaks-functions.R b/tests/testthat/test_do_groupChromPeaks-functions.R
@@ -85,3 +85,37 @@ test_that(".group_peaks_density works", {
     expect_true(nrow(res) == 0)
     expect_true(is(res, "data.frame"))
 })
+
+test_that("do_groupChromPeaks_density works with skipping samples", {
+    x <- loadXcmsData("xmse")
+    pks <- chromPeaks(x)
+    ## Errors
+    expect_error(do_groupChromPeaks_density(pks), "sampleGroups")
+    expect_error(do_groupChromPeaks_density(3, sampleGroups = 3), "matrix")
+    expect_error(do_groupChromPeaks_density(pks[, 1:3], sampleGroups = 1),
+                 "not found")
+    expect_error(do_groupChromPeaks_density(pks, sampleGroups = 1:3),
+                 "Sample indices")
+
+    ## groups for all samples.
+    grps <- sampleData(x)$sample_group
+    res <- do_groupChromPeaks_density(pks, sampleGroups = grps,
+                                      minFraction = 1, bw = 30)
+    expect_true(all(res$WT == 4 | res$KO == 4))
+    expect_true(all(res$WT <= 4))
+    expect_true(all(res$KO <= 4))
+
+    res_2 <- do_groupChromPeaks_density(
+        pks, sampleGroups = rep(1, length(grps)), minFraction = 1)
+    expect_true(nrow(res_2) < nrow(res))
+    expect_true(all(res_2$`1` == 8))
+
+    ## using only one sample group
+    grps[grps == "KO"] <- NA
+    res_3 <- do_groupChromPeaks_density(pks, sampleGroups = grps,
+                                        minFraction = 1)
+    expect_true(nrow(res_3) < nrow(res))
+    expect_true(all(res_3$WT == 4))
+    expect_equal(nrow(res_3), sum(res$WT == 4))
+    tmp <- res[res$WT == 4, ]
+})