From 96483fe319ba4846da3915f1ac097d04ae5645af Mon Sep 17 00:00:00 2001 From: Laurent Date: Wed, 4 Apr 2018 11:37:10 +0100 Subject: [PATCH] Assure feature CV feature variable names are unique when combining feature repeatedly - closes issue #303 --- NEWS | 2 ++ NEWS.md | 2 ++ R/combineFeatures.R | 16 ++++++++++- R/functions-MSnSet.R | 55 +++++++++++++++++++++--------------- tests/testthat/test_MSnSet.R | 21 ++++++++++++-- 5 files changed, 69 insertions(+), 27 deletions(-) diff --git a/NEWS b/NEWS index c61a357e1..e27822e15 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,8 @@ Changes in version 2.5.10 <2018-03-26>. - Centroiding information is retrieved from raw files (for mzML/mzXML files;. see issue [#325](https://github.com/lgatto/MSnbase/issues/325) <2018-03-27> +- Assure feature CV feature variable names are unique when combining + feature repeatedly (see issue #303) <2018-04-04 Wed> Changes in version 2.5.9 ------------------------ diff --git a/NEWS.md b/NEWS.md index 85c320997..ee6688860 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,6 +10,8 @@ <2018-03-26>. - Centroiding information is retrieved from raw files (for mzML/mzXML files;. see issue [#325](https://github.com/lgatto/MSnbase/issues/325) <2018-03-27> +- Assure feature CV feature variable names are unique when combining + feature repeatedly (see issue #303) <2018-04-04 Wed> ## Changes in version 2.5.9 - New combineSpectra, combineSpectraMovingWindow, estimateMzScattering and diff --git a/R/combineFeatures.R b/R/combineFeatures.R index 88f43ae39..3893b700d 100644 --- a/R/combineFeatures.R +++ b/R/combineFeatures.R @@ -89,8 +89,22 @@ combineFeaturesV <- function(object, ## MSnSet ) { groupBy <- as.character(groupBy) if (cv) { + ## New cv feature variable names + cvfvars <- paste("CV", sampleNames(object), sep = ".") + ## If not already present, use as is (i.e no suffix needed) + if (!any(cvfvars %in% fvarLabels(object))) { + .suffix <- NULL + } else { + ## Add a numeric suffix that isn't already in use + .suffix <- 0 + while (any(cvfvars %in% fvarLabels(object))) { + .suffix <- .suffix + 1 + cvfvars <- paste(cvfvars, .suffix, sep = ".") + } + } cv.mat <- featureCV(object, groupBy = groupBy, - norm = cv.norm) + norm = cv.norm, + suffix = .suffix) } n1 <- nrow(object) ## !! order of features in matRes is defined by the groupBy factor !! diff --git a/R/functions-MSnSet.R b/R/functions-MSnSet.R index 7e7d5c27c..ab575d419 100644 --- a/R/functions-MSnSet.R +++ b/R/functions-MSnSet.R @@ -50,40 +50,49 @@ normalise_MSnSet <- function(object, method, ...) { } -##' This function calculates the column-wise coefficient of variation (CV), i.e. -##' the ration between the standard deviation and the mean, for the features -##' in an \code{"\linkS4class{MSnSet}"}. The CVs are calculated for the groups -##' of features defined by \code{groupBy}. For groups defined by single features, -##' \code{NA} is returned. +##' This function calculates the column-wise coefficient of variation +##' (CV), i.e. the ration between the standard deviation and the +##' mean, for the features in an [`MSnSet`]. The CVs are calculated +##' for the groups of features defined by `groupBy`. For groups +##' defined by single features, `NA` is returned. ##' ##' @title Calculates coeffivient of variation for features -##' @param x An instance of class \code{"\linkS4class{MSnSet}"}. -##' @param groupBy An object of class \code{factor} defining how to summerise the features. -##' @param na.rm A \code{logical} defining whether missing values should be removed. -##' @param norm One of 'none' (default), 'sum', 'max', 'center.mean', 'center.median' -##' 'quantiles' or 'quantiles.robust' defining if and how the data should be normalised -##' prior to CV calculation. See \code{\link{normalise}} for more details. -##' @return A \code{matrix} of dimensions \code{length(levels(groupBy))} by \code{ncol(x)} -##' with the respecive CVs. -##' @author Laurent Gatto , -##' Sebastian Gibb -##' @seealso \code{\link{combineFeatures}} +##' @param x An instance of class [`MSnSet`]. +##' @param groupBy An object of class `factor` defining how to +##' summarise the features. +##' @param na.rm A `logical(1)` defining whether missing values should +##' be removed. +##' @param norm One of normalisation methods applied prior to CV +##' calculation. See [normalise()] for more details. Here, the +##' default is `'none'`, i.e. no normalisation. +##' @param suffix A `character(1)` to be used to name the new CV +##' columns. Default is `NULL` to ignore this. This argument +##' should be set when CV values are already present in the +##' [`MSnSet`] feature variables. +##' @return A `matrix` of dimensions `length(levels(groupBy))` by +##' `ncol(x)` with the respecive CVs. The column names are formed +##' by pasting `CV.` and the sample names of object `x`, possibly +##' suffixed by `.suffix`. +##' @author Laurent Gatto and Sebastian Gibb +##' @seealso [combineFeatures()] +##' @md ##' @examples ##' data(msnset) ##' msnset <- msnset[1:4] ##' gb <- factor(rep(1:2, each = 2)) ##' featureCV(msnset, gb) +##' featureCV(msnset, gb, suffix = "2") featureCV <- function(x, groupBy, na.rm = TRUE, - norm = c("sum", "max", "none", - "center.mean", "center.median", - "quantiles", "quantiles.robust")) { - norm <- match.arg(norm) + norm = "none", + suffix = NULL) { if (norm != "none") x <- normalise(x, method = norm) - cv <- rowsd(exprs(x), group=groupBy, reorder=TRUE, na.rm=na.rm) / - rowmean(exprs(x), group=groupBy, reorder=TRUE, na.rm=na.rm) - colnames(cv) <- paste("CV", colnames(cv), sep=".") + cv <- rowsd(exprs(x), group = groupBy, reorder = TRUE, na.rm = na.rm) / + rowmean(exprs(x), group = groupBy, reorder = TRUE, na.rm = na.rm) + colnames(cv) <- paste("CV", colnames(cv), sep = ".") + if (!is.null(suffix)) + colnames(cv) <- paste(colnames(cv), suffix, sep = ".") cv } diff --git a/tests/testthat/test_MSnSet.R b/tests/testthat/test_MSnSet.R index e4c9d1fa0..3e365bc62 100644 --- a/tests/testthat/test_MSnSet.R +++ b/tests/testthat/test_MSnSet.R @@ -48,7 +48,7 @@ test_that("Combine MSnSet features: groupBy and fcol", { x1 <- combineFeatures(msnset, groupBy = grp) x2 <- combineFeatures(msnset, fcol = "k") x2@processingData <- x1@processingData - expect_equal(x1, x2) + expect_equal(x1, x2) }) test_that("Combine MSnSet features (V)", { @@ -98,7 +98,7 @@ test_that("Combine MSnSet features (V)", { matrix(c(5*4, 5*10, 5*4, 5*10), ncol = 2), tolerance = .001, check.attributes = FALSE) - + expect_true(all(fData(bb)[, 1] == c("A", "B"))) expect_true(all(fData(bb)[, 2] == c("A.1", "B.6"))) gb2 <- factor(c("a", "c", "z", "a", "z", "b", "b", "a", "c", "a")) @@ -181,6 +181,17 @@ test_that("Combine MSnSet features (L)", { colMeans(exprs(ee)[sapply(L, function(l) any(grepl("C", l))), ])) }) +test_that("Combine MSnSet features repeatedly", { + data(msnset) + fv1 <- fvarLabels(msnset) + pep <- combineFeatures(msnset, groupBy = fData(msnset)$PeptideSequence, + cv = TRUE) + prot <- combineFeatures(pep, groupBy = fData(pep)$ProteinAccession, + cv = TRUE) + expect_identical(length(fv1) + 4L, length(fvarLabels(pep))) + expect_identical(length(fv1) + 8L, length(fvarLabels(prot))) +}) + test_that("makeImpuritiesMatrix", { i4 <- dir(system.file("extdata", package = "MSnbase"), pattern = "iTRAQ4plexPurityCorrection", @@ -290,6 +301,10 @@ test_that("featureCV", { sd((exprs(m)/div)[ii]/mean((exprs(m)/div)[ii])) }), nrow=2, ncol=2, dimnames=list(c("A", "B"), c("CV.1", "CV.2"))) expect_equal(featureCV(m, group=fData(m)$accession, norm="sum"), cv) + + cv1 <- featureCV(m, group=fData(m)$accession) + cv2 <- featureCV(m, group=fData(m)$accession, suffix = "2") + expect_identical(colnames(cv1), sub("\\.2", "", colnames(cv2))) }) context("MSnSet identification data") @@ -429,7 +444,7 @@ test_that("nFeatures are added correctly", { "'Protein.Group.Accessions.nFeatures' already present.") expect_error(nFeatures(hyperLOPIT2015ms3r1psm, "foo")) tmp <- fData(res)$Protein.Group.Accessions.nFeatures - names(tmp) <- fData(res)$Protein.Group.Accessions + names(tmp) <- fData(res)$Protein.Group.Accessions sel <- !duplicated(names(tmp)) g <- tmp[sel] expect_equivalent(g[sort(names(g))], k0[sort(names(k0))])