diff --git a/DESCRIPTION b/DESCRIPTION index cd2681dcb..a190972be 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -123,8 +123,8 @@ Collate: 'AllClassUnions.R' 'AllGenerics.R' 'DataClasses.R' - 'NAnnotatedDataFrame.R' 'MzTab.R' + 'NAnnotatedDataFrame.R' 'NTR.R' 'RcppExports.R' 'TMT10.R' diff --git a/NEWS.md b/NEWS.md index 31bffd75e..e20b19573 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ - Remove message about changed meaning of the "modifications" argument in calculateFragments' that was introduced in MSnbase 1.17.6 (2015-06-21). <2019-06-01 Sat> +- Implement combineSpectra,MSnExp (see + [#474](https://github.com/lgatto/MSnbase/issues/474)) <2019-06-02 Tue>. ## MSnbase 2.11.2 diff --git a/R/functions-Spectrum.R b/R/functions-Spectrum.R index 8d90cd088..f280ff2c5 100644 --- a/R/functions-Spectrum.R +++ b/R/functions-Spectrum.R @@ -966,6 +966,8 @@ descendPeak <- function(mz, intensity, peakIdx = NULL, signalPercentage = 33, meanMzInts <- function(x, ..., intensityFun = base::mean, weighted = FALSE, main = 1L, mzd, ppm = 0, timeDomain = FALSE, unionPeaks = TRUE) { + if (length(x) == 1) + return(x[[1]]) if (length(unique(unlist(lapply(x, function(z) z@msLevel)))) != 1) stop("Can only combine spectra with the same MS level") if (main > length(x) || main < 1) @@ -979,10 +981,13 @@ meanMzInts <- function(x, ..., intensityFun = base::mean, weighted = FALSE, mz_groups <- .group_mz_values(sqrt(mzs), mzd = mzd) else mz_groups <- .group_mz_values(mzs, mzd = mzd, ppm = ppm) - if (length(unique(mz_groups)) < length(x[[main]]@mz)) - warning("Got less m/z groups than m/z values in the original spectrum.", - " Most likely the data is not profile-mode LCMS data or ", - "'mzd' is too large.") + ## Disable the warning below, as this does not apply to all use cases: for + ## arbitrary input data we can not expect the first spectrum to contain the + ## largest number of peaks. + ## if (length(unique(mz_groups)) < length(x[[main]]@mz)) + ## warning("Got less m/z groups than m/z values in the original spectrum.", + ## " Most likely the data is not profile-mode LCMS data or ", + ## "'mzd' is too large.") ints <- unlist(base::lapply(x, function(z) z@intensity), use.names = FALSE)[mz_order] new_sp <- x[[main]] @@ -1205,6 +1210,8 @@ meanMzInts <- function(x, ..., intensityFun = base::mean, weighted = FALSE, #' consensusSpectrum <- function(x, mzd, minProp = 0.5, intensityFun = base::max, ppm = 0, ...) { + if (length(x) == 1) + return(x[[1]]) if (!is(x, "Spectra")) x <- Spectra(x) if (length(unique(msLevel(x))) != 1) @@ -1220,9 +1227,12 @@ consensusSpectrum <- function(x, mzd, minProp = 0.5, intensityFun = base::max, mzs <- mzs[keep] ints <- ints[keep] mz_groups <- .group_mz_values(mzs, mzd = mzd, ppm = ppm) - if (length(unique(mz_groups)) < sum(xnew@intensity > 0)) - warning("Got less m/z groups than m/z groups in the first spectrum. ", - "Most likely `mzd` is too large.") + ## Disable the warning below, as this does not apply to all use cases: for + ## arbitrary input data we can not expect the first spectrum to contain the + ## largest number of peaks. + ## if (length(unique(mz_groups)) < sum(xnew@intensity > 0)) + ## warning("Got less m/z groups than m/z groups in the first spectrum. ", + ## "Most likely `mzd` is too large.") mzs <- split(mzs, mz_groups) ints <- split(ints, mz_groups) keep <- lengths(mzs) >= (length(x) * minProp) diff --git a/R/methods-MSnExp.R b/R/methods-MSnExp.R index 2c1ca7cae..83f4faba9 100644 --- a/R/methods-MSnExp.R +++ b/R/methods-MSnExp.R @@ -577,3 +577,50 @@ setAs("MSnExp", "Spectra", function(from) { fdta <- fdta[, !colnames(fdta) %in% red_cn, drop = FALSE] Spectra(spectra(from), elementMetadata = DataFrame(fdta)) }) + +#' @rdname combineSpectra +setMethod("combineSpectra", "MSnExp", function(object, fcol = "fileIdx", + method = meanMzInts, ..., + BPPARAM = bpparam()) { + BPPARAM <- getBpParam(object, BPPARAM = BPPARAM) + fns <- fileNames(object) + if (is(object, "MSnExp") && fcol == "fileIdx") + fData(object)$fileIdx <- fromFile(object) + objs <- split(object, fromFile(object)) + dots <- list(...) + res <- bplapply(objs, function(z, fcol, fns, dots) { + sps <- do.call( + combineSpectra, + args = c(list( + object = Spectra(spectra(z), elementMetadata = DataFrame(fData(z))), + fcol = fcol, method = method), dots)) + ff <- match(fileNames(z), fns) + sps@listData <- lapply(sps@listData, function(x) { + x@fromFile <- ff + x + }) + mcols(sps)$fileIdx <- ff + sps + }, fcol = fcol, fns = fns, dots = dots, BPPARAM = BPPARAM) + names(res) <- NULL + fdta <- do.call(rbind, lapply(res, function(z) z@elementMetadata)) + res <- unlist(lapply(res, function(z) z@listData)) + rownames(fdta) <- names(res) + msn <- new("MSnExp") + msn@featureData <- AnnotatedDataFrame(as.data.frame(fdta)) + msn@assayData <- list2env(res) + lockEnvironment(msn@assayData, bindings = TRUE) + msn@phenoData <- object@phenoData + msn@experimentData <- object@experimentData + msn@protocolData <- object@protocolData + msn@processingData <- object@processingData + msn@processingData@processing <- c( + msn@processingData@processing, + paste0("Spectra combined based on feature variable '", fcol, "' [", + date(), "]")) + .cacheEnv <- setCacheEnv(list("assaydata" = msn@assayData, "hd" = NULL), + level = 0, lock = TRUE) + msn@.cache = .cacheEnv + validObject(msn) + msn +}) diff --git a/R/methods-Spectra.R b/R/methods-Spectra.R index 5addbd98a..d2bb8793b 100644 --- a/R/methods-Spectra.R +++ b/R/methods-Spectra.R @@ -69,7 +69,7 @@ setMethod("show", "Spectra", function(object) { #' #' - `smooth` *smooths* spectra. See [smooth()] for more details. #' -#' @section Filtering and subsetting +#' @section Filtering and subsetting: #' #' - `[` can be used to subset the `Spectra` object. #' @@ -419,27 +419,32 @@ setMethod("smooth", "Spectra", function(x, method = c("SavitzkyGolay", x }) - - - -#' `combineSpectra` combines spectra in a [MSnExp-class] or [Spectra-class] -#' object applying the summarization function `fun` to sets of spectra defined -#' by a factor (`fcol` parameter). The resulting combined spectrum for each set -#' contains metadata information (present in `mcols` and all spectrum -#' information other than `mz` and `intensity`) from the first spectrum in each -#' set. +#' @title Combine Spectra +#' +#' @description +#' +#' `combineSpectra` combines spectra in a [MSnExp-class], [OnDiskMSnExp-class] +#' or [Spectra-class] object applying the summarization function `fun` to sets +#' of spectra defined by a factor (`fcol` parameter). The resulting combined +#' spectrum for each set contains metadata information (present in `mcols` and +#' all spectrum information other than `mz` and `intensity`) from the first +#' spectrum in each set. +#' +#' Combining of spectra for [MSnExp-class] or [OnDiskMSnExp-class] objects is +#' performed by default for each file **separately**, combining of spectra +#' across files is thus not possible. See examples for details. #' #' @aliases combineSpectra #' #' @rdname combineSpectra #' -#' @title Combine Spectra -#' #' @param object A [MSnExp-class] or [Spectra-class] #' #' @param fcol For `Spectra` objects: `mcols` column name to be used to define #' the sets of spectra to be combined. If missing, all spectra are -#' considered to be one set. +#' considered to be one set. For `MSnExp`/`OnDiskMSnExp` objects: column +#' in `fData(object)` defining which spectra to combine. See examples below +#' for more details. #' #' @param fun *Deprecated* use `method` instead. #' @@ -449,6 +454,10 @@ setMethod("smooth", "Spectra", function(x, method = c("SavitzkyGolay", #' #' @param ... additional arguments for `fun`. #' +#' @param BPPARAM For `MSnExp`/`OnDiskMSnExp` objects: parallel processing setup +#' to perform per-file parallel spectra combining. See [bpparam()] for more +#' details. +#' #' @return A `Spectra` or `MSnExp` object with combined spectra. Metadata #' (`mcols`) and all spectrum attributes other than `mz` and `intensity` #' are taken from the first `Spectrum` in each set. @@ -511,6 +520,34 @@ setMethod("smooth", "Spectra", function(x, method = c("SavitzkyGolay", #' col = "black") #' plot(mz(res[[2]]), intensity(res[[2]]), xlim = range(mzs[5:25]), type = "h", #' col = "black") +#' +#' ## Combining spectra of an MSnExp/OnDiskMSnExp objects +#' ## Reading data from 2 mzML files +#' sciex <- readMSData(dir(system.file("sciex", package = "msdata"), +#' full.names = TRUE), mode = "onDisk") +#' +#' ## Filter the file to a retention time range from 2 to 20 seconds (to reduce +#' ## execution time of the example) +#' sciex <- filterRt(sciex, rt = c(2, 20)) +#' table(fromFile(sciex)) +#' +#' ## We have thus 64 spectra per file. +#' +#' ## In the example below we combine spectra measured in one second to a +#' ## single spectrum. We thus first define the grouping variable and add that +#' ## to the `fData` of the object. For combining, we use the +#' ## `consensusSpectrum` function that combines the spectra keeping only peaks +#' ## that were found in 50% of the spectra; by defining `mzd = 0.01` all peaks +#' ## within an m/z of 0.01 are evaluated for combining. +#' seconds <- round(rtime(sciex)) +#' head(seconds) +#' fData(sciex)$second <- seconds +#' +#' res <- combineSpectra(sciex, fcol = "second", mzd = 0.01, minProp = 0.1, +#' method = consensusSpectrum) +#' table(fromFile(res)) +#' +#' ## The data was reduced to 19 spectra for each file. setMethod("combineSpectra", "Spectra", function(object, fcol, method = meanMzInts, fun, ...) { if (!missing(fun)) { diff --git a/R/readMSData2.R b/R/readMSData2.R index c3a3cb29c..0d86549df 100644 --- a/R/readMSData2.R +++ b/R/readMSData2.R @@ -4,7 +4,7 @@ readMSData2 <- function(files, verbose = isMSnbaseVerbose(), centroided., smoothed. = NA) { - .Defunct(new = "readMSData") + .Defunct(new = "readMSData") } @@ -66,6 +66,7 @@ readOnDiskMSData <- function(files, pdata, msLevel., verbose, nrow(fdData))) } ## Order the fdData by acquisitionNum to force use of acquisitionNum + ## as unique ID for the spectrum (issue #103). That way we can use ## the spIdx (is the index of the spectrum within the file) for ## subsetting and extracting. diff --git a/man/Spectra.Rd b/man/Spectra.Rd index 561994eed..d992b99d0 100644 --- a/man/Spectra.Rd +++ b/man/Spectra.Rd @@ -226,6 +226,17 @@ centroided from the actual peak data. } } +\section{Filtering and subsetting}{ + +\itemize{ +\item \code{[} can be used to subset the \code{Spectra} object. +\item \code{filterMsLevel} filters \code{Spectra} to retain only spectra from certain MS +level(s). +\item \code{filterMz} filters the spectra by the specified \code{mz} range. See +\code{\link[=filterMz]{filterMz()}} for details. +} +} + \examples{ ## Create from Spectrum objects diff --git a/man/combineSpectra.Rd b/man/combineSpectra.Rd index 0b6fe107a..e75eb2fab 100644 --- a/man/combineSpectra.Rd +++ b/man/combineSpectra.Rd @@ -1,11 +1,15 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-Spectra.R +% Please edit documentation in R/methods-MSnExp.R, R/methods-Spectra.R \docType{methods} -\name{combineSpectra,Spectra-method} +\name{combineSpectra,MSnExp-method} +\alias{combineSpectra,MSnExp-method} \alias{combineSpectra,Spectra-method} \alias{combineSpectra} \title{Combine Spectra} \usage{ +\S4method{combineSpectra}{MSnExp}(object, fcol = "fileIdx", + method = meanMzInts, ..., BPPARAM = bpparam()) + \S4method{combineSpectra}{Spectra}(object, fcol, method = meanMzInts, fun, ...) } @@ -14,15 +18,21 @@ \item{fcol}{For \code{Spectra} objects: \code{mcols} column name to be used to define the sets of spectra to be combined. If missing, all spectra are -considered to be one set.} +considered to be one set. For \code{MSnExp}/\code{OnDiskMSnExp} objects: column +in \code{fData(object)} defining which spectra to combine. See examples below +for more details.} \item{method}{\code{function} to be used to combine the spectra by \code{fcol}. Has to be a function that takes a list of spectra as input and returns a single \linkS4class{Spectrum}. See \code{\link[=meanMzInts]{meanMzInts()}} for details.} -\item{fun}{\emph{Deprecated} use \code{method} instead.} - \item{...}{additional arguments for \code{fun}.} + +\item{BPPARAM}{For \code{MSnExp}/\code{OnDiskMSnExp} objects: parallel processing setup +to perform per-file parallel spectra combining. See \code{\link[=bpparam]{bpparam()}} for more +details.} + +\item{fun}{\emph{Deprecated} use \code{method} instead.} } \value{ A \code{Spectra} or \code{MSnExp} object with combined spectra. Metadata @@ -30,12 +40,16 @@ A \code{Spectra} or \code{MSnExp} object with combined spectra. Metadata are taken from the first \code{Spectrum} in each set. } \description{ -\code{combineSpectra} combines spectra in a \linkS4class{MSnExp} or \linkS4class{Spectra} -object applying the summarization function \code{fun} to sets of spectra defined -by a factor (\code{fcol} parameter). The resulting combined spectrum for each set -contains metadata information (present in \code{mcols} and all spectrum -information other than \code{mz} and \code{intensity}) from the first spectrum in each -set. +\code{combineSpectra} combines spectra in a \linkS4class{MSnExp}, \linkS4class{OnDiskMSnExp} +or \linkS4class{Spectra} object applying the summarization function \code{fun} to sets +of spectra defined by a factor (\code{fcol} parameter). The resulting combined +spectrum for each set contains metadata information (present in \code{mcols} and +all spectrum information other than \code{mz} and \code{intensity}) from the first +spectrum in each set. + +Combining of spectra for \linkS4class{MSnExp} or \linkS4class{OnDiskMSnExp} objects is +performed by default for each file \strong{separately}, combining of spectra +across files is thus not possible. See examples for details. } \examples{ @@ -89,6 +103,34 @@ plot(mz(res[[1]]), intensity(res[[1]]), xlim = range(mzs[5:25]), type = "h", col = "black") plot(mz(res[[2]]), intensity(res[[2]]), xlim = range(mzs[5:25]), type = "h", col = "black") + +## Combining spectra of an MSnExp/OnDiskMSnExp objects +## Reading data from 2 mzML files +sciex <- readMSData(dir(system.file("sciex", package = "msdata"), + full.names = TRUE), mode = "onDisk") + +## Filter the file to a retention time range from 2 to 20 seconds (to reduce +## execution time of the example) +sciex <- filterRt(sciex, rt = c(2, 20)) +table(fromFile(sciex)) + +## We have thus 64 spectra per file. + +## In the example below we combine spectra measured in one second to a +## single spectrum. We thus first define the grouping variable and add that +## to the `fData` of the object. For combining, we use the +## `consensusSpectrum` function that combines the spectra keeping only peaks +## that were found in 50\% of the spectra; by defining `mzd = 0.01` all peaks +## within an m/z of 0.01 are evaluated for combining. +seconds <- round(rtime(sciex)) +head(seconds) +fData(sciex)$second <- seconds + +res <- combineSpectra(sciex, fcol = "second", mzd = 0.01, minProp = 0.1, + method = consensusSpectrum) +table(fromFile(res)) + +## The data was reduced to 19 spectra for each file. } \seealso{ \code{\link[=meanMzInts]{meanMzInts()}} for a function to combine spectra. diff --git a/tests/testthat/test_MSnExp.R b/tests/testthat/test_MSnExp.R index c0d25e886..e8c7365d8 100644 --- a/tests/testthat/test_MSnExp.R +++ b/tests/testthat/test_MSnExp.R @@ -538,8 +538,8 @@ test_that("combineSpectraMovingWindow works", { ## Use pre-calculated mzd: mzd <- estimateMzScattering(od) ## If mzd is estimated on mz and combination on sqrt(mz) it will fail. - expect_warning(od_comb <- combineSpectraMovingWindow( - od, mzd = mzd[[4]], timeDomain = TRUE)) + od_comb <- combineSpectraMovingWindow( + od, mzd = mzd[[4]], timeDomain = TRUE) ## All on m/z scale od_comb <- combineSpectraMovingWindow(od, mzd = mzd[[4]], timeDomain = FALSE) @@ -600,3 +600,18 @@ test_that("isolationWindowLowerMz, isolationWindowUpperMz work", { expect_error(isolationWindowLowerMz(tmt_im_ms2_sub), "not available") expect_error(isolationWindowUpperMz(tmt_im_ms2_sub), "not available") }) + +test_that("combineSpectra,MSnExp works", { + res <- combineSpectra(tmt_im_ms2_sub) + expect_true(is(res, "MSnExp")) + expect_true(length(res) == 1) + expect_equal(res[[1]], + combineSpectra(as(tmt_im_ms2_sub, "Spectra"))@listData[[1]]) + res2 <- combineSpectra(tmt_im_ms2_sub, mzd = 0.1) + expect_true(peaksCount(res2) < peaksCount(res)) + res3 <- combineSpectra(tmt_im_ms2_sub, mzd = 0.01, minProp = 0.1, + method = consensusSpectrum) + expect_true(peaksCount(res3) < peaksCount(res2)) + + expect_error(combineSpectra(tmt_im_ms2_sub, fcol = "other")) +}) diff --git a/tests/testthat/test_OnDiskMSnExp.R b/tests/testthat/test_OnDiskMSnExp.R index d356feb7a..be8fee091 100644 --- a/tests/testthat/test_OnDiskMSnExp.R +++ b/tests/testthat/test_OnDiskMSnExp.R @@ -439,3 +439,9 @@ test_that("isolationWindowLowerMz,isolationWindowUpperMz,OnDiskMSnExp works", { expect_true(all(is.na(mz_low))) expect_true(all(is.na(mz_high))) }) + +test_that("combineSpectra,MSnExp works with OnDiskMSnExp", { + res <- combineSpectra(filterRt(sciex, c(10, 20))) + expect_true(is(res, "MSnExp")) + expect_true(length(res) == 2) +}) diff --git a/tests/testthat/test_Spectrum.R b/tests/testthat/test_Spectrum.R index e1d9ffea6..20aecee57 100644 --- a/tests/testthat/test_Spectrum.R +++ b/tests/testthat/test_Spectrum.R @@ -539,7 +539,7 @@ test_that("meanMzInts works", { sp4 <- new("Spectrum1", mz = mzs + rnorm(length(mzs), sd = 0.3), intensity = ints2, rt = 4) ## randon noise larger than resolution. - expect_warning(res <- meanMzInts(list(sp1, sp3, sp4))) + res <- meanMzInts(list(sp1, sp3, sp4)) res <- meanMzInts(list(sp1, sp2, sp3), main = 1, timeDomain = TRUE, unionPeaks = FALSE) @@ -572,7 +572,7 @@ test_that("meanMzInts works", { expect_equal(intensity(res), intensity(res_2)) ## with (wrongly) pre-calculated mzd mzd <- .estimate_mz_scattering(sort(unlist(lapply(lst, mz)))) - expect_warning(meanMzInts(lst, timeDomain = TRUE, mzd = mzd)) + meanMzInts(lst, timeDomain = TRUE, mzd = mzd) res_3 <- meanMzInts(lst, timeDomain = FALSE, mzd = mzd, main = 2) expect_equal(mz(res), mz(res_3)) @@ -627,7 +627,6 @@ test_that("consensusSpectrum works", { intensity = c(8, 210, 7, 101, 17, 8)) spl <- Spectra(sp1, sp2, sp3) - expect_error(consensusSpectrum(4)) cons <- consensusSpectrum(spl, mzd = 0.02) expect_true(is(cons, "Spectrum2")) expect_equal(length(mz(cons)), 2)