Assure feature CV feature variable names are unique when combining fe…

…ature repeatedly - closes issue #303
lgatto · Apr 4, 2018 · 96483fe · 96483fe
1 parent 29ffec9
commit 96483fe
Show file tree

Hide file tree

Showing 5 changed files with 69 additions and 27 deletions.
diff --git a/NEWS b/NEWS
@@ -9,6 +9,8 @@ Changes in version 2.5.10
   <2018-03-26>.
 - Centroiding information is retrieved from raw files (for mzML/mzXML files;.
   see issue [#325](https://github.com/lgatto/MSnbase/issues/325) <2018-03-27>
+- Assure feature CV feature variable names are unique when combining
+  feature repeatedly (see issue #303) <2018-04-04 Wed>
 
 Changes in version 2.5.9
 ------------------------

diff --git a/NEWS.md b/NEWS.md
@@ -10,6 +10,8 @@
   <2018-03-26>.
 - Centroiding information is retrieved from raw files (for mzML/mzXML files;.
   see issue [#325](https://github.com/lgatto/MSnbase/issues/325) <2018-03-27>
+- Assure feature CV feature variable names are unique when combining
+  feature repeatedly (see issue #303) <2018-04-04 Wed>
 
 ## Changes in version 2.5.9
 - New combineSpectra, combineSpectraMovingWindow, estimateMzScattering and

diff --git a/R/combineFeatures.R b/R/combineFeatures.R
@@ -89,8 +89,22 @@ combineFeaturesV <- function(object,   ## MSnSet
                              ) {
     groupBy <- as.character(groupBy)
     if (cv) {
+        ## New cv feature variable names
+        cvfvars <- paste("CV", sampleNames(object), sep = ".")
+        ## If not already present, use as is (i.e no suffix needed)
+        if (!any(cvfvars %in% fvarLabels(object))) {
+            .suffix <- NULL
+        } else {
+            ## Add a numeric suffix that isn't already in use
+            .suffix <- 0
+            while (any(cvfvars %in% fvarLabels(object))) {
+                .suffix <- .suffix + 1
+                cvfvars <- paste(cvfvars, .suffix, sep = ".")
+            }
+        }
         cv.mat <- featureCV(object, groupBy = groupBy,
-                            norm = cv.norm)
+                            norm = cv.norm,
+                            suffix = .suffix)
     }
     n1 <- nrow(object)
     ## !! order of features in matRes is defined by the groupBy factor !!

diff --git a/R/functions-MSnSet.R b/R/functions-MSnSet.R
@@ -50,40 +50,49 @@ normalise_MSnSet <- function(object, method, ...) {
 }
 
 
-##' This function calculates the column-wise coefficient of variation (CV), i.e.
-##' the ration between the standard deviation and the mean, for the features
-##' in an \code{"\linkS4class{MSnSet}"}. The CVs are calculated for the groups
-##' of features defined by \code{groupBy}. For groups defined by single features,
-##' \code{NA} is returned.
+##' This function calculates the column-wise coefficient of variation
+##' (CV), i.e.  the ration between the standard deviation and the
+##' mean, for the features in an [`MSnSet`]. The CVs are calculated
+##' for the groups of features defined by `groupBy`. For groups
+##' defined by single features, `NA` is returned.
 ##'
 ##' @title Calculates coeffivient of variation for features
-##' @param x An instance of class \code{"\linkS4class{MSnSet}"}.
-##' @param groupBy An object of class \code{factor} defining how to summerise the features.
-##' @param na.rm A \code{logical} defining whether missing values should be removed.
-##' @param norm One of 'none' (default), 'sum', 'max', 'center.mean', 'center.median'
-##' 'quantiles' or 'quantiles.robust' defining if and how the data should be normalised
-##' prior to CV calculation. See \code{\link{normalise}} for more details.
-##' @return A \code{matrix} of dimensions \code{length(levels(groupBy))} by \code{ncol(x)}
-##' with the respecive CVs.
-##' @author Laurent Gatto <lg390@@cam.ac.uk>,
-##' Sebastian Gibb <mail@@sebastiangibb.de>
-##' @seealso \code{\link{combineFeatures}}
+##' @param x An instance of class [`MSnSet`].
+##' @param groupBy An object of class `factor` defining how to
+##'     summarise the features.
+##' @param na.rm A `logical(1)` defining whether missing values should
+##'     be removed.
+##' @param norm One of normalisation methods applied prior to CV
+##'     calculation. See [normalise()] for more details. Here, the
+##'     default is `'none'`, i.e. no normalisation.
+##' @param suffix A `character(1)` to be used to name the new CV
+##'     columns. Default is `NULL` to ignore this. This argument
+##'     should be set when CV values are already present in the
+##'     [`MSnSet`] feature variables.
+##' @return A `matrix` of dimensions `length(levels(groupBy))` by
+##'     `ncol(x)` with the respecive CVs. The column names are formed
+##'     by pasting `CV.` and the sample names of object `x`, possibly
+##'     suffixed by `.suffix`.
+##' @author Laurent Gatto and Sebastian Gibb
+##' @seealso [combineFeatures()]
+##' @md
 ##' @examples
 ##' data(msnset)
 ##' msnset <- msnset[1:4]
 ##' gb <- factor(rep(1:2, each = 2))
 ##' featureCV(msnset, gb)
+##' featureCV(msnset, gb, suffix = "2")
 featureCV <- function(x, groupBy, na.rm = TRUE,
-                      norm = c("sum", "max", "none",
-                        "center.mean", "center.median",
-                        "quantiles", "quantiles.robust")) {
-  norm <- match.arg(norm)
+                      norm = "none",
+                      suffix = NULL) {
   if (norm != "none")
     x <- normalise(x, method = norm)
 
-  cv <- rowsd(exprs(x), group=groupBy, reorder=TRUE, na.rm=na.rm) /
-          rowmean(exprs(x), group=groupBy, reorder=TRUE, na.rm=na.rm)
-  colnames(cv) <- paste("CV", colnames(cv), sep=".")
+  cv <- rowsd(exprs(x), group = groupBy, reorder = TRUE, na.rm = na.rm) /
+      rowmean(exprs(x), group = groupBy, reorder = TRUE, na.rm = na.rm)
+  colnames(cv) <- paste("CV", colnames(cv), sep = ".")
+  if (!is.null(suffix))
+      colnames(cv) <- paste(colnames(cv), suffix, sep = ".")
   cv
 }
 

diff --git a/tests/testthat/test_MSnSet.R b/tests/testthat/test_MSnSet.R
@@ -48,7 +48,7 @@ test_that("Combine MSnSet features: groupBy and fcol", {
     x1 <- combineFeatures(msnset, groupBy = grp)
     x2 <- combineFeatures(msnset, fcol = "k")
     x2@processingData <- x1@processingData
-    expect_equal(x1, x2)    
+    expect_equal(x1, x2)
 })
 
 test_that("Combine MSnSet features (V)", {
@@ -98,7 +98,7 @@ test_that("Combine MSnSet features (V)", {
                  matrix(c(5*4, 5*10, 5*4, 5*10), ncol = 2),
                  tolerance = .001,
                  check.attributes = FALSE)
-    
+
     expect_true(all(fData(bb)[, 1] == c("A", "B")))
     expect_true(all(fData(bb)[, 2] == c("A.1", "B.6")))
     gb2 <- factor(c("a", "c", "z", "a", "z", "b", "b", "a", "c", "a"))
@@ -181,6 +181,17 @@ test_that("Combine MSnSet features (L)", {
                  colMeans(exprs(ee)[sapply(L, function(l) any(grepl("C", l))), ]))
 })
 
+test_that("Combine MSnSet features repeatedly", {
+    data(msnset)
+    fv1 <- fvarLabels(msnset)
+    pep <- combineFeatures(msnset, groupBy = fData(msnset)$PeptideSequence,
+                           cv = TRUE)
+    prot <- combineFeatures(pep, groupBy = fData(pep)$ProteinAccession,
+                            cv = TRUE)
+    expect_identical(length(fv1) + 4L, length(fvarLabels(pep)))
+    expect_identical(length(fv1) + 8L, length(fvarLabels(prot)))
+})
+
 test_that("makeImpuritiesMatrix", {
     i4 <- dir(system.file("extdata", package = "MSnbase"),
               pattern = "iTRAQ4plexPurityCorrection",
@@ -290,6 +301,10 @@ test_that("featureCV", {
                    sd((exprs(m)/div)[ii]/mean((exprs(m)/div)[ii])) }),
                  nrow=2, ncol=2, dimnames=list(c("A", "B"), c("CV.1", "CV.2")))
     expect_equal(featureCV(m, group=fData(m)$accession, norm="sum"), cv)
+
+    cv1 <- featureCV(m, group=fData(m)$accession)
+    cv2 <- featureCV(m, group=fData(m)$accession, suffix = "2")
+    expect_identical(colnames(cv1), sub("\\.2", "", colnames(cv2)))
 })
 
 context("MSnSet identification data")
@@ -429,7 +444,7 @@ test_that("nFeatures are added correctly", {
                  "'Protein.Group.Accessions.nFeatures' already present.")
     expect_error(nFeatures(hyperLOPIT2015ms3r1psm, "foo"))
     tmp <- fData(res)$Protein.Group.Accessions.nFeatures
-    names(tmp) <- fData(res)$Protein.Group.Accessions    
+    names(tmp) <- fData(res)$Protein.Group.Accessions
     sel <- !duplicated(names(tmp))
     g <- tmp[sel]
     expect_equivalent(g[sort(names(g))], k0[sort(names(k0))])