Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

getPrevalence, NA values #486

Merged
merged 14 commits into from
Mar 5, 2024
Merged
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: mia
Type: Package
Version: 1.11.7
Version: 1.11.8
Authors@R:
c(person(given = "Felix G.M.", family = "Ernst", role = c("aut"),
email = "[email protected]",
Expand Down
1 change: 1 addition & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,4 @@ Changes in version 1.11.x
+ perSampleDominantFeatures: add new arguments (n, other.name, complete)
+ loadFromMetaphlan: support "taxonomy" column for specifying taxonomy
+ cluster: Overwrite old results instead of failing
+ getPrevalence: bugfix, if assay contains NA values, it does not end up to NA anymore.
44 changes: 34 additions & 10 deletions R/getPrevalence.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,17 @@
#'
#' @param rank a single character defining a taxonomic rank. Must be a value of
#' \code{taxonomyRanks()} function.
#'
#'
#' @param na.rm logical scalar: Should NA values be omitted when calculating
#' prevalence? (default: \code{na.rm = TRUE})
#'
#' @param ... additional arguments
#' \itemize{
#' \item{If \code{!is.null(rank)} arguments are passed on to
#' \code{\link[=agglomerate-methods]{agglomerateByRank}}. See
#' \code{\link[=agglomerate-methods]{?agglomerateByRank}} for more details.
#' Note that you can specify whether to remove empty ranks with
#' \code{agg.na.rm} instead of \code{na.rm}. (default: \code{FALSE})
#' }
#' \item{for \code{getPrevalentFeatures}, \code{getRareFeatures},
#' \code{subsetByPrevalentFeatures} and \code{subsetByRareFeatures} additional
Expand Down Expand Up @@ -186,16 +191,19 @@ setGeneric("getPrevalence", signature = "x",

#' @rdname getPrevalence
#' @export
setMethod("getPrevalence", signature = c(x = "ANY"),
function(x, detection = 0, include_lowest = FALSE, sort = FALSE, ...){

setMethod("getPrevalence", signature = c(x = "ANY"), function(
x, detection = 0, include_lowest = FALSE, sort = FALSE, na.rm = TRUE, ...){
# input check
if (!.is_numeric_string(detection)) {
stop("'detection' must be a single numeric value or coercible to ",
"one.",
call. = FALSE)
}

#
if(!.is_a_bool(na.rm)){
stop("'na.rm' must be TRUE or FALSE.", call. = FALSE)
}
#
detection <- as.numeric(detection)
if(!.is_a_bool(include_lowest)){
stop("'include_lowest' must be TRUE or FALSE.", call. = FALSE)
Expand All @@ -204,13 +212,21 @@ setMethod("getPrevalence", signature = c(x = "ANY"),
stop("'sort' must be TRUE or FALSE.", call. = FALSE)
}
#

# Give warning if there are taxa with NA values
if( any( is.na(x) ) ){
msg <- paste0(
"The abundance table contains NA values and they are",
ifelse(na.rm, " not ", " "), "excluded (see 'na.rm').")
warning(msg, call. = FALSE)
}
#
if (include_lowest) {
prev <- x >= detection
} else {
prev <- x > detection
}
prev <- rowSums(prev)
# Calculate prevalence for each taxa
prev <- rowSums(prev, na.rm = na.rm)
TuomasBorman marked this conversation as resolved.
Show resolved Hide resolved
# Always return prevalence as a relative frequency.
# This helps to avoid confusion with detection limit
prev <- prev / ncol(x)
Expand All @@ -222,15 +238,23 @@ setMethod("getPrevalence", signature = c(x = "ANY"),
)

.agg_for_prevalence <- function(
x, rank, relabel = FALSE, make_unique = TRUE, na.rm = FALSE, ...){
# Check na.rm
x, rank, relabel = FALSE, make_unique = TRUE, na.rm = FALSE,
agg.na.rm = FALSE, ...){
# Check na.rm. It is not used in this function, it is only catched so that
# it can be passed to getPrevalence(matrix) and not use it here in
# agglomerateByRank function.
if(!.is_a_bool(na.rm)){
stop("'na.rm' must be TRUE or FALSE.", call. = FALSE)
}
#
# Check drop.empty.rank
if(!.is_a_bool(agg.na.rm)){
stop("'agg.na.rm' must be TRUE or FALSE.", call. = FALSE)
}
#
if(!is.null(rank)){
.check_taxonomic_rank(rank, x)
args <- c(list(x = x, rank = rank, na.rm = na.rm), list(...))
args <- c(list(x = x, rank = rank, na.rm = agg.na.rm), list(...))
argNames <- c("x","rank","onRankOnly","na.rm","empty.fields",
"archetype","mergeTree","average","BPPARAM")
args <- args[names(args) %in% argNames]
Expand Down
14 changes: 13 additions & 1 deletion man/getPrevalence.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

50 changes: 50 additions & 0 deletions tests/testthat/test-5prevalence.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,56 @@ test_that("getPrevalence", {
pr2 <- getPrevalence(gp_null, detection=0.004, as_relative=TRUE, rank = "Family")
expect_equal(pr1, pr2)

# Check that na.rm works correctly
tse <- GlobalPatterns
# Get reference value
ref <- getPrevalence(tse, assay.type = "counts")
# Add NA values to matrix
remove <- c(1, 3, 10)
assay(tse, "counts")[remove, ] <- NA
# There should be 3 NA values if na.rm = FALSE. Otherwise there should be 0
expect_warning(
res <- getPrevalence(tse, assay.type = "counts", na.rm = FALSE) )
expect_true( sum(is.na(res)) == 3)
expect_warning(
res <- getPrevalence(tse, assay.type = "counts", na.rm = TRUE) )
expect_true( sum(is.na(res)) == 0)
# Expect that other than features with NA values are the same as in reference
expect_warning(
res <- getPrevalence(tse, assay.type = "counts", na.rm = TRUE))
res <- res[ !names(res) %in% remove ]
ref <- ref[ !names(ref) %in% remove ]
expect_equal( res[ names(ref) ], res[ names(ref) ] )

# Now test that the number of samples where feature was detected is correct
tse <- GlobalPatterns
ref <- getPrevalence(tse, assay.type = "counts")
# Add NA values to specific feature that has non-zero value
feature <- rownames(tse)[[7]]
assay(tse, "counts")[feature, 1] <- NA
expect_warning(
res <- getPrevalence(tse, assay.type = "counts", na.rm = TRUE))
# Get the feature values and check that they have correct number of samples
res <- res[ feature ]
ref <- ref[ feature ]
expect_true(res*ncol(tse) == 2)
expect_true(ref*ncol(tse) == 3)

#
tse <- GlobalPatterns
rank <- "Genus"
# Add NA values to matrix
remove <- c(15, 200)
assay(tse, "counts")[remove, ] <- NA
# Check that agglomeration works
tse_agg <- agglomerateByRank(tse, rank = rank)
expect_warning(ref <- getPrevalence(tse_agg, na.rm = FALSE))
expect_warning(res <- getPrevalence(tse, na.rm = FALSE, rank = "Genus"))
expect_true( all(res == ref, na.rm = TRUE) )
#
expect_warning(ref <- getPrevalence(tse_agg, na.rm = TRUE))
expect_warning(res <- getPrevalence(tse, na.rm = TRUE, rank = "Genus"))
expect_true( all(res == ref, na.rm = TRUE) )
})


Expand Down
Loading