diff --git a/.Rbuildignore b/.Rbuildignore index b8a7f44..0abfe23 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,3 +4,4 @@ README.md TODO.txt ^.*\.DS_Store docs +inst/extras diff --git a/DESCRIPTION b/DESCRIPTION index 0457073..781f43d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,13 @@ Package: philr Type: Package Title: Phylogenetic partitioning based ILR transform for metagenomics data -Version: 1.13.1 -Date: 2017-04-07 -Authors@R: c(person("Justin", "Silverman", role=c("aut", "cre"), - email = "jsilve24@gmail.com")) -Author: Justin Silverman -Maintainer: Justin Silverman +Version: 1.19.1 +Date: 2021-08-11 +Authors@R: + c(person("Justin", "Silverman", role=c("aut", "cre"), + email = "jsilve24@gmail.com"), + person(given = "Leo", family = "Lahti", role = c("ctb"), + comment = c(ORCID = "0000-0001-5537-637X"))) Description: PhILR is short for Phylogenetic Isometric Log-Ratio Transform. This package provides functions for the analysis of compositional data (e.g., data representing proportions of different variables/parts). @@ -15,11 +16,26 @@ Description: PhILR is short for Phylogenetic Isometric Log-Ratio Transform. data) and makes available the Isometric Log Ratio transform built from the phylogenetic tree and utilizing a weighted reference measure. License: GPL-3 -LazyData: TRUE -RoxygenNote: 6.1.1 -Imports: ape, phangorn, tidyr, ggplot2, ggtree -Depends: -Suggests: testthat, knitr, rmarkdown, BiocStyle, phyloseq, glmnet, dplyr +RoxygenNote: 7.1.1 +Imports: + ape, + phangorn, + tidyr, + ggplot2, + ggtree, + methods +Suggests: + testthat, + knitr, + ecodist, + rmarkdown, + BiocStyle, + phyloseq, + SummarizedExperiment, + TreeSummarizedExperiment, + glmnet, + dplyr, + mia VignetteBuilder: knitr biocViews: ImmunoOncology, Sequencing, Microbiome, Metagenomics, Software URL: https://github.com/jsilve24/philr diff --git a/NAMESPACE b/NAMESPACE index 19e5678..caf6707 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,12 @@ # Generated by roxygen2: do not edit by hand +S3method(philr,TreeSummarizedExperiment) +S3method(philr,array) +S3method(philr,data.frame) +S3method(philr,default) +S3method(philr,matrix) +S3method(philr,numeric) +S3method(philr,phyloseq) export(annotate_balance) export(buildilrBasep) export(calculate.blw) @@ -24,5 +31,7 @@ importFrom(ape,rtree) importFrom(ggplot2,annotate) importFrom(ggtree,get_clade_position) importFrom(ggtree,ggtree) +importFrom(methods,as) +importFrom(methods,is) importFrom(phangorn,Children) importFrom(tidyr,gather_) diff --git a/R/branch_length_calculations.R b/R/branch_length_calculations.R index 3f05591..b113c57 100644 --- a/R/branch_length_calculations.R +++ b/R/branch_length_calculations.R @@ -3,23 +3,28 @@ #' Calculates the weightings for ILR coordinates based on branch lenghts of #' a phylogenetic tree via a few different methods (see details). #' @aliases calc.blw -#' @param tree a \code{phylo} class tree object that is binary (see \code{\link[ape]{multi2di}}) -#' @param method options include: (default) \code{'sum.children'} and \code{'mean.descendants'} +#' @param tree a \code{phylo} class tree object that is binary (see +#' \code{\link[ape]{multi2di}}) +#' @param method options include: (default) \code{'sum.children'} and +#' \code{'mean.descendants'} #' see details for more information. -#' @return vector of weightings for ILR coordinates produced via specified method. +#' @return vector of weightings for ILR coordinates produced via specified +#' method. #' @details #' ILR balances built from a binary partition of a phylogenetic tree #' can be imbued with branch length information. This function is helpful in #' calculating those weightings.\cr \cr #' There are a number of methods for calculating these weightings, the default #' \code{'sum.children'} calculates the weighting for a given balance as the sum -#' of its two direct children's branch length. An alternative that has been as yet less -#' studied is \code{'mean.descendants'} to calculate the weighting for a given balance -#' as the sum of its two direct children's branch lengths PLUS for each child the average -#' distance from it to its descendant tips. \cr \cr -#' \emph{Note:} That some trees contain tips with branch lengths of zero length. This can result -#' in that tip being unreasonably downweighted, as such this function automatically -#' adds a small pseudocount to those tips with zero length (equal to the smallest non-zero) +#' of its two direct children's branch length. An alternative that has been +#' as yet less studied is \code{'mean.descendants'} to calculate the weighting +#' for a given balance as the sum of its two direct children's branch lengths +#' PLUS for each child the average distance from it to its descendant tips. +#' \cr \cr +#' \emph{Note:} That some trees contain tips with branch lengths of zero length. +#' This can result in that tip being unreasonably downweighted, as such this +#' function automatically adds a small pseudocount to those tips with zero +#' length (equal to the smallest non-zero) #' branch length on the tree. #' @author Justin Silverman #' @export @@ -35,20 +40,24 @@ calculate.blw <- function(tree, method='sum.children'){ # In these cases will replace those zero values with # the minimum branch length (greater than 0) found in the tree. # Note this is like adding a psudocount to branch lengths. - min.nonzero <- min(tree$edge.length[tree$edge.length>0 & !is.na(tree$edge.length)]) + min.nonzero <- min(tree$edge.length[tree$edge.length>0 & + !is.na(tree$edge.length)]) # Find the edges that end in tips and have zero length tip.edges.zero <- (tree$edge[,2] <= nTips) & (tree$edge.length == 0) # print warning/note statement n.replace <- sum(tip.edges.zero) if (n.replace >0 ){ - warning(paste('Note: a total of', sum(tip.edges.zero), 'tip edges with zero length', + warning(paste('Note: a total of', sum(tip.edges.zero), + 'tip edges with zero length', 'have been replaced with a small pseudocount of the minimum', 'non-zero edge length (',min.nonzero,').',sep=" ")) tree$edge.length[tip.edges.zero] <- min.nonzero } if (method=='sum.children')return(blw.sum.children(tree)) - else if (method=='mean.descendants')return(blw.mean.descendants.sum.children(tree)) + else if (method=='mean.descendants') { + return(blw.mean.descendants.sum.children(tree)) + } } # Now calculate Branch Length Weighting as the sum of a nodes @@ -64,26 +73,27 @@ blw.sum.children <- function(tree){ res } -# Calculates the sum of the children's nodes average distance to descendant tips +# Calculates the sum of the children's nodes average distance to +# descendant tips # Which should be a slightly better calculation than MDTT when we are # primarily interested in weightings on ILR coordinates blw.mean.descendants.sum.children <- function(tree){ - nTips = ape::Ntip(tree) - X <- phangorn::Children(tree, (nTips+1):(nTips+tree$Nnode)) + nTips = ape::Ntip(tree) + X <- phangorn::Children(tree, (nTips+1):(nTips+tree$Nnode)) - # Children's average branch length to tips (zero for tips) - BMD <- mean_dist_to_tips(tree) - names(BMD) <- NULL - BMD <- c(numeric(nTips), BMD) + # Children's average branch length to tips (zero for tips) + BMD <- mean_dist_to_tips(tree) + names(BMD) <- NULL + BMD <- c(numeric(nTips), BMD) - # Each child's edge length - EL <- numeric(max(tree$edge)) - EL[tree$edge[,2]] <- tree$edge.length + # Each child's edge length + EL <- numeric(max(tree$edge)) + EL[tree$edge[,2]] <- tree$edge.length - fun <- function(x, el, bmd)sum(el[x] + bmd[x]) - res <- sapply(X, fun, EL, BMD) - if(!is.null(tree$node.label)) names(res) <- tree$node.label - return(res) + fun <- function(x, el, bmd)sum(el[x] + bmd[x]) + res <- sapply(X, fun, EL, BMD) + if(!is.null(tree$node.label)) names(res) <- tree$node.label + return(res) } #' Mean distance from internal nodes to descendant tips @@ -105,7 +115,8 @@ mean_dist_to_tips <- function(tree){ dist <- ape::dist.nodes(tree) node.numbers <- (nTips+1):(nTips+tree$Nnode) - chld.tips <- phangorn::Descendants(tree, (nTips+1):(nTips+tree$Nnode), "tips") + chld.tips <- phangorn::Descendants(tree, (nTips+1):(nTips+tree$Nnode), + "tips") # Turn dist into a nodes (rows) x tips (columns) matrix dist <- dist[node.numbers, 1:nTips] diff --git a/R/general_utils.R b/R/general_utils.R index 6ae1c37..076b429 100644 --- a/R/general_utils.R +++ b/R/general_utils.R @@ -1,22 +1,21 @@ - # Generally Helpful for Validation ---------------------------------------- # check for zeros and throw error if present # target is just a name for the warning message check.zeroes <- function(x, target){ - if (any(x==0)){ - warning(paste(target, "should not contain zeroes")) - } + if (any(x==0)){ + warning(paste(target, "should not contain zeroes")) + } } # convert vector to row vector. vec_to_mat <- function(x){ - if (is.vector(x)) { - n <- names(x) - x <- matrix(x, nrow = 1) - colnames(x) <- n - } - x + if (is.vector(x)) { + n <- names(x) + x <- matrix(x, nrow = 1) + colnames(x) <- n + } + x } # ACCESSOR FUNCTIONS FROM NAMES TO NUMBERS -------------------------------- @@ -41,17 +40,17 @@ NULL #' @rdname name_nodenumber_conversion #' @export nn.to.name <- function(tr, x){ - if(!is.numeric(x)) stop('node numbers must be numeric') - labels <- c(tr$tip.label, tr$node.label) - labels[x] + if(!is.numeric(x)) stop('node numbers must be numeric') + labels <- c(tr$tip.label, tr$node.label) + labels[x] } #' @rdname name_nodenumber_conversion #' @export name.to.nn <- function(tr, x){ - if(!is.character(x)) stop('node/tip names (x) should be a character vector') - labels <- c(tr$tip.label, tr$node.label) - match(x, labels) + if(!is.character(x)) stop('node/tip names (x) should be a character vector') + labels <- c(tr$tip.label, tr$node.label) + match(x, labels) } @@ -59,12 +58,15 @@ name.to.nn <- function(tr, x){ #' Converts wide format ILR transformed data to long format #' -#' Converts wide format ILR transformed data (see \code{\link{philr}}) to long format -#' useful in various plotting functions where long format data is required. +#' Converts wide format ILR transformed data (see \code{\link{philr}}) to +#' long format useful in various plotting functions where long format data is +#' required. #' -#' @param df PhILR transformed data in wide format (samples by balances) (see \code{\link{philr}}) -#' @param labels vector (of length \code{nrow(df)}) with labels to group samples by -#' @return \code{df} in long format with columns +#' @param x PhILR transformed data in wide format (samples by balances) +#' (see \code{\link{philr}}) +#' @param labels vector (of length \code{nrow(x)}) with labels to group +#' samples by +#' @return \code{x} in long format with columns #' \itemize{ #' \item sample #' \item labels @@ -75,20 +77,20 @@ name.to.nn <- function(tr, x){ #' @importFrom tidyr gather_ #' @examples #' tr <- named_rtree(5) -#' df <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add a small pseudocount -#' colnames(df) <- tr$tip.label +#' x <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add small pseudocount +#' colnames(x) <- tr$tip.label #' -#' df.philr <- philr(df, tr, part.weights='enorm.x.gm.counts', -#' ilr.weights='blw.sqrt', return.all=FALSE) -#' convert_to_long(df.philr, rep(c('a','b'), 5)) -convert_to_long <- function(df, labels){ - coord.names <- colnames(df) - df.long <- as.data.frame(df) - df.long$sample <- rownames(df.long) - df.long$labels <- labels - df.long <- gather_(df.long, "coord", "value", - setdiff(names(df.long), c("labels", "sample"))) - return(df.long) +#' x.philr <- philr(x, tree=tr, part.weights='enorm.x.gm.counts', +#' ilr.weights='blw.sqrt', return.all=FALSE) +#' convert_to_long(x.philr, rep(c('a','b'), 5)) +convert_to_long <- function(x, labels){ + coord.names <- colnames(x) + x.long <- as.data.frame(x) + x.long$sample <- rownames(x.long) + x.long$labels <- labels + x.long <- gather_(x.long, "coord", "value", + setdiff(names(x.long), c("labels", "sample"))) + return(x.long) } @@ -107,6 +109,6 @@ convert_to_long <- function(df, labels){ #' @examples #' named_rtree(5) named_rtree <- function(n){ - tr <- rtree(n) - makeNodeLabel(tr, 'number', 'n') + tr <- rtree(n) + makeNodeLabel(tr, 'number', 'n') } diff --git a/R/name_balances.R b/R/name_balances.R index 8228570..06134a8 100644 --- a/R/name_balances.R +++ b/R/name_balances.R @@ -1,88 +1,105 @@ #' Name a balance (coordinate) based on taxonomy #' -#' For a given ILR balance (coordinate) assigns a name to the balance based on a -#' provided taxonomy table. This is useful for interpretation of the balances. +#' For a given ILR balance (coordinate) assigns a name to the balance based on +#' a provided taxonomy table. This is useful for interpretation of the balances. #' #' @param tr an object of class \code{'phylo'} -#' @param tax a matrix/data.frame of taxonomy, rownames should correspond to \code{tr$tip.labels} -#' columns should be taxonomic levels (named) with increasing taxonomic resolution from left to right -#' (e.g., Phylum to the left of Genus). -#' @param coord the name of a balance/internal node on the tree (given as a string) +#' @param tax a matrix/data.frame of taxonomy, rownames should correspond to +#' \code{tr$tip.labels} columns should be taxonomic levels (named) with +#' increasing taxonomic resolution from left to right (e.g., Phylum to the +#' left of Genus). +#' @param coord the name of a balance/internal node on the tree (given as a +#' string) #' @param method currently only \code{'voting'} implemented. See Details. -#' @param thresh threshold for assignment of taxonomy to a given part of a balance -#' (must be greater than 0.5 if \code{method='voting'}; see details). -#' @param return.votes whether voting results by taxonomic level should be shown for \code{coord}. Note: this is helpful when -#' \code{name.balance} does not return a clear winner, as may be the case when a given \code{coord} represents more than one -#' taxonomic lineage. votes are returned as a list indexed by \code{colnames(tax)} Options include: +#' @param thresh threshold for assignment of taxonomy to a given part of a +#' balance (must be greater than 0.5 if \code{method='voting'}; see details). +#' @param return.votes whether voting results by taxonomic level should be +#' shown for \code{coord}. Note: this is helpful when \code{name.balance} does +#' not return a clear winner, as may be the case when a given \code{coord} +#' represents more than one taxonomic lineage. votes are returned as a list +#' indexed by \code{colnames(tax)} Options include: #' \describe{ -#' \item{\code{NULL}}{(default) only returns the combined consensus name of the balance} +#' \item{\code{NULL}}{(default) only returns the combined consensus name of +#' the balance} #' \item{\code{'up'}}{adds tallied votes for the 'up' node to the output list} -#' \item{\code{'down'}}{adds tallied votes for the 'down'node to the output list} +#' \item{\code{'down'}}{adds tallied votes for the 'down' node to the output +#' list} #' \item{\code{'self'}}{adds tallied votes for \code{coord} to the output list} #' } -#' @return If \code{return.votes=NULL} returns a string of the form (ex. 'Genus_Bacteroides/Phylum_Firmicutes'). Otherwise returns -#' a list with the above string as 'name', see Arguments for \code{show.votes} for other optional returned items. +#' @return If \code{return.votes=NULL} returns a string of the form +#' (ex. 'Genus_Bacteroides/Phylum_Firmicutes'). Otherwise returns +#' a list with the above string as 'name', see Arguments for \code{show.votes} +#' for other optional returned items. #' @details A bit of terminology: #' \describe{ -#' \item{coord}{this is the same as the names of the balances which should be the same as the names of the internal nodes of \code{tr}} -#' \item{'up'}{this is the child node of \code{coord} that is represented in the numerator of the \code{coord} balance.} -#' \item{'down'}{this is the child node of \code{coord} that is represented in the denominator of the \code{coord} balance} +#' \item{coord}{this is the same as the names of the balances which should be +#' the same as the names of the internal nodes of \code{tr}} +#' \item{'up'}{this is the child node of \code{coord} that is represented in +#' the numerator of the \code{coord} balance.} +#' \item{'down'}{this is the child node of \code{coord} that is represented in +#' the denominator of the \code{coord} balance} #' } -#' The method \code{'voting'} assigns the name of the each part of a balance (e.g., numerator and denominator / each -#' child of \code{coord}) as follows: +#' The method \code{'voting'} assigns the name of the each part of a balance +#' (e.g., numerator and denominator / each child of \code{coord}) as follows: #' \enumerate{ -#' \item First Subset \code{tax} to contain only descendent tips of the given child of \code{coord} -#' \item Second At the finest taxonomic (farthest right of \code{tax}) see if any one taxonomic label -#' is present at or above \code{thresh}. If yes output that taxonomic label (at that taxonomic level) as the label -#' for that child of \code{coord}. If no then move to coarser taxonomic level (leftward) and repeat. +#' \item First Subset \code{tax} to contain only descendent tips of the given +#' child of \code{coord} +#' \item Second At the finest taxonomic (farthest right of \code{tax}) see if +#' any one taxonomic label is present at or above \code{thresh}. If yes output +#' that taxonomic label (at that taxonomic level) as the label for that child +#' of \code{coord}. If no then move to coarser taxonomic level (leftward) and +#' repeat. #' } #' @author Justin Silverman +#' @importFrom methods as +#' @importFrom methods is #' @export #' @seealso \code{\link{philr}} #' @examples #' tr <- named_rtree(40) #' tax <- data.frame(Kingdom=rep('A', 40), -#' Phylum=rep(c('B','C'), each=20), -#' Genus=c(sample(c('D','F'),20, replace=TRUE), -#' sample(c('G','H'), 20, replace=TRUE))) +#' Phylum=rep(c('B','C'), each=20), +#' Genus=c(sample(c('D','F'),20, replace=TRUE), +#' sample(c('G','H'), 20, replace=TRUE))) #' rownames(tax) <- tr$tip.label #' name.balance(tr, tax, 'n1') #' name.balance(tr, tax, 'n34') #' name.balance(tr,tax, 'n34', return.votes = c('up', 'down')) name.balance <- function(tr, tax, coord, method="voting", thresh=0.95, return.votes=NULL){ - if (method=="voting"){ - # Get tips in 'up' and 'down' subtree - l.tips <- get.ud.tips(tr,coord) + if (method=="voting"){ + # Get tips in 'up' and 'down' subtree + l.tips <- get.ud.tips(tr,coord) - tax <- as(tax, "matrix") - # Subset tax table based on above - tax.up <- tax[l.tips[['up']],] - tax.down <- tax[l.tips[['down']],] + tax <- as(tax, "matrix") + # Subset tax table based on above + tax.up <- tax[l.tips[['up']],] + tax.down <- tax[l.tips[['down']],] - # Get Voted Consensus for up and down taxa (character strings) - up.voted <- vote.annotation(tax.up, voting.threshold=thresh) - down.voted <- vote.annotation(tax.down, voting.threshold=thresh) + # Get Voted Consensus for up and down taxa (character strings) + up.voted <- vote.annotation(tax.up, voting.threshold=thresh) + down.voted <- vote.annotation(tax.down, voting.threshold=thresh) - # Combine into a string and output - name <- paste(up.voted,"/",down.voted,sep="") + # Combine into a string and output + name <- paste(up.voted,"/",down.voted,sep="") - if (is.null(return.votes)){ - return(name) - } else { - res <- list('name'=name) - } - if ('up' %in% return.votes){ - res[['up.votes']] <- tally.votes(tax, l.tips[['up']]) - } - if ('down' %in% return.votes){ - res[['down.votes']] <- tally.votes(tax, l.tips[['down']]) - } - if ('self' %in% return.votes){ - res[['self.votes']] <- tally.votes(tax, unlist(l.tips)) - } + if (is.null(return.votes)){ + return(name) + } else { + res <- list('name'=name) + } + if ('up' %in% return.votes){ + res[['up.votes']] <- tally.votes(tax, l.tips[['up']]) + } + if ('down' %in% return.votes){ + res[['down.votes']] <- tally.votes(tax, l.tips[['down']]) + } + if ('self' %in% return.votes){ + res[['self.votes']] <- tally.votes(tax, unlist(l.tips)) + } return(res) } - # In the future can extend to other methods of annotation/naming (other than just voting) + # In the future can extend to other methods of annotation/naming + # (other than just voting) } @@ -91,31 +108,34 @@ name.balance <- function(tr, tax, coord, method="voting", thresh=0.95, return.vo # nn is node number #' @importFrom phangorn Children get.ud.nodes <- function(tr,coord, return.nn=FALSE){ - nn <- name.to.nn(tr, coord) # get node number - l.nodes <- list() - child <- phangorn::Children(tr, nn) - if (length(child) < 2) stop(paste0(coord,' is a tip')) - if (return.nn==TRUE){ - l.nodes[['up']] <- child[1] - l.nodes[['down']] <- child[2] - } else{ - l.nodes[['up']] <- nn.to.name(tr, child[1]) - l.nodes[['down']] <- nn.to.name(tr, child[2]) - } - return(l.nodes) + nn <- name.to.nn(tr, coord) # get node number + l.nodes <- list() + child <- phangorn::Children(tr, nn) + if (length(child) < 2) stop(paste0(coord,' is a tip')) + if (return.nn==TRUE){ + l.nodes[['up']] <- child[1] + l.nodes[['down']] <- child[2] + } else{ + l.nodes[['up']] <- nn.to.name(tr, child[1]) + l.nodes[['down']] <- nn.to.name(tr, child[2]) + } + return(l.nodes) } -# Returns a list of the 'up' and 'down' subtree's values as a vector of tip ids (corresponds -# to up and down used for sbp creation) +# Returns a list of the 'up' and 'down' subtree's values as a vector of tip +# ids (corresponds to up and down used for sbp creation) # Each value is the ID of a tip get.ud.tips <- function(tr,coord){ - l.tips <- list() - child <- phangorn::Children(tr, name.to.nn(tr,coord)) - if (length(child) < 2) stop(paste0(coord,' is a tip')) - if (length(child) > 2) stop("Tree is not soley binary.") #TODO: Bit of validation - consider better location - l.tips[['up']] <- sapply(unlist(phangorn::Descendants(tr,child[1],type='tips')), function(x) nn.to.name(tr, x)) - l.tips[['down']] <- sapply(unlist(phangorn::Descendants(tr,child[2],type='tips')), function(x) nn.to.name(tr, x)) - return(l.tips) + l.tips <- list() + child <- phangorn::Children(tr, name.to.nn(tr,coord)) + if (length(child) < 2) stop(paste0(coord,' is a tip')) + # TODO: Bit of validation - consider better location + if (length(child) > 2) stop("Tree is not soley binary.") + l.tips[['up']] <- sapply(unlist(phangorn::Descendants(tr,child[1], + type='tips')), function(x) nn.to.name(tr, x)) + l.tips[['down']] <- sapply(unlist(phangorn::Descendants(tr,child[2], + type='tips')), function(x) nn.to.name(tr, x)) + return(l.tips) } # Find most concerved @@ -123,33 +143,40 @@ get.ud.tips <- function(tr,coord){ # NA are not counted but held against the winner in voting # Candidate must have >= voting.threshold to be considered the winner vote.annotation <- function(tax, voting.threshold=0.95){ - if (voting.threshold <= 0.5)stop('voting.threshold must be > 0.5 for unique winner') - if (is(tax, "character")){ # e.g., is there only 1 voter here - tmp.names <- names(tax) - tax <- matrix(tax,nrow=1) - colnames(tax) <- tmp.names - } - nr <- nrow(tax) # the number of tips - nc <- ncol(tax) # the number of taxonomic ranks in table - name <- NULL - - for (i in seq(nc,1)){ # evaluate in decreasing order of taxonomic resolution - votes <- tax[,i] - if (all(is.na(votes))) {next} - votes <- votes[!is.na(votes)] # drop NA votes but hold against the total number - winner <- sort(table(votes), decreasing=TRUE)[1] - if (!is.na(winner) & (winner/nr >= voting.threshold)){ # Arbitrarily set threshold to 95% of votes - name <- names(winner) - # Try and append taxonomic rank to name if columns of tax table are labled - if (!is.null(colnames(tax))) {name <- paste(colnames(tax)[i],'_',name,sep="")} - break - } # Else try the next level up - } + if (voting.threshold <= 0.5) { + stop('voting.threshold must be > 0.5 for unique winner') + } + if (is(tax, "character")){ # e.g., is there only 1 voter here + tmp.names <- names(tax) + tax <- matrix(tax,nrow=1) + colnames(tax) <- tmp.names + } + nr <- nrow(tax) # the number of tips + nc <- ncol(tax) # the number of taxonomic ranks in table + name <- NULL - if (is.null(name)){ # If no consensus vote found (above threshold) - name <- "Unclear_Lineage_Identity" - } - return(name) + # evaluate in decreasing order of taxonomic resolution + for (i in seq(nc,1)){ + votes <- tax[,i] + if (all(is.na(votes))) {next} + # drop NA votes but hold against the total number + votes <- votes[!is.na(votes)] + winner <- sort(table(votes), decreasing=TRUE)[1] + # Arbitrarily set threshold to 95% of votes + if (!is.na(winner) & (winner/nr >= voting.threshold)){ + name <- names(winner) + # Try and append taxonomic rank to name if columns of tax table + # are labeled + if (!is.null(colnames(tax))) { + name <- paste(colnames(tax)[i],'_',name,sep="") + } + break + } # Else try the next level up + } + if (is.null(name)){ # If no consensus vote found (above threshold) + name <- "Unclear_Lineage_Identity" + } + return(name) } # Given a list of tax IDs (e.g., rownames in tax table) @@ -157,13 +184,13 @@ vote.annotation <- function(tax, voting.threshold=0.95){ # This is really to be used when You don't get a result you like with # vote.annotation() tally.votes <- function(tax,ids){ - l.votes <- list() - nc <- ncol(tax) # the number of taxonomic ranks in the table - for (i in seq(nc,1)){ - votes <- tax[ids,i] - votes <- votes[!is.na(votes)] - rank <- ifelse(!is.null(colnames(tax)), colnames(tax)[i],i) - l.votes[[rank]] <- table(votes) - } - return(l.votes) -} + l.votes <- list() + nc <- ncol(tax) # the number of taxonomic ranks in the table + for (i in seq(nc,1)){ + votes <- tax[ids,i] + votes <- votes[!is.na(votes)] + rank <- ifelse(!is.null(colnames(tax)), colnames(tax)[i],i) + l.votes[[rank]] <- table(votes) + } + return(l.votes) +} \ No newline at end of file diff --git a/R/philr.R b/R/philr.R index 84f3a02..d4d6d55 100644 --- a/R/philr.R +++ b/R/philr.R @@ -1,38 +1,51 @@ #' Data transformation and driver of PhILR. #' -#' This is the main function for building the phylogenetic ILR basis, calculating the -#' weightings (of the parts and the ILR coordinates) and then transforming the data. +#' This is the main function for building the phylogenetic ILR basis, +#' calculating the weightings (of the parts and the ILR coordinates) and then +#' transforming the data. +#' #' @aliases build.phylo.ilr -#' @param df \strong{matrix} of data to be transformed (samples are rows, -#' compositional parts are columns) - zero must be dealt with either with pseudocount, -#' multiplicative replacement, or another method. +#' @param x \strong{matrix} of data to be transformed (samples are rows, +#' compositional parts are columns) - zero must be dealt with either with +#' pseudocount, multiplicative replacement, or another method. #' @param sbp (Optional) give a precomputed sbp matrix \code{\link{phylo2sbp}} -#' if you are going to build multiple ILR bases (e.g., with different weightings). +#' if you are going to build multiple ILR bases (e.g., with different +#' weightings). #' @param part.weights weightings for parts, can be a named vector with names -#' corresponding to \code{colnames(df)} otherwise can be a string, options include: +#' corresponding to \code{colnames(x)} otherwise can be a string, options +#' include: #' \describe{ #' \item{\code{'uniform'}}{(default) uses the uniform reference measure} -#' \item{\code{'gm.counts'}}{geometric mean of parts of df} -#' \item{\code{'anorm'}}{aitchison norm of parts of df (after closure)} +#' \item{\code{'gm.counts'}}{geometric mean of parts of x} +#' \item{\code{'anorm'}}{aitchison norm of parts of x (after closure)} #' \item{\code{'anorm.x.gm.counts'}}{\code{'anorm'} times \code{'gm.counts'}} -#' \item{\code{'enorm'}}{euclidean norm of parts of df (after closure)} -#' \item{\code{'enorm.x.gm.counts'}}{\code{'enorm'} times \code{'gm.counts'}, often gives good results} +#' \item{\code{'enorm'}}{euclidean norm of parts of x (after closure)} +#' \item{\code{'enorm.x.gm.counts'}}{\code{'enorm'} times \code{'gm.counts'}, +#' often gives good results} #' } -#' @param ilr.weights weightings for the ILR coordiantes can be a named vector with names -#' corresponding to names of internal nodes of \code{tree} otherwise can be a string, +#' @param ilr.weights weightings for the ILR coordiantes can be a named vector +#' with names corresponding to names of internal nodes of \code{tree} +#' otherwise can be a string, #' options include: #' \describe{ #' \item{\code{'uniform'}}{(default) no weighting of the ILR basis} #' \item{\code{'blw'}}{sum of children's branch lengths} #' \item{\code{'blw.sqrt'}}{square root of \code{'blw'} option} -#' \item{\code{'mean.descendants'}}{sum of children's branch lengths PLUS the sum of -#' each child's mean distance to its descendent tips} +#' \item{\code{'mean.descendants'}}{sum of children's branch lengths PLUS the +#' sum of each child's mean distance to its descendent tips} #' } -#' @param return.all return all computed parts (e.g., computed sign matrix(\code{sbp}), -#' part weightings (code{p}), ilr weightings (code{ilr.weights}), contrast matrix (\code{V})) -#' as a list (default=\code{FALSE}) in addition to in addition to returning the transformed data (\code{df.ilrp}). -#' If \code{return.all==FALSE} then only returns the transformed data (not in list format) -#' If \code{FALSE} then just returns list containing \code{df.ilrp}. +#' @param return.all return all computed parts (e.g., computed sign +#' matrix(\code{sbp}), part weightings (code{p}), ilr weightings +#' (code{ilr.weights}), contrast matrix (\code{V})) as a list +#' (default=\code{FALSE}) in addition to in addition to returning the +#' transformed data (\code{.ilrp}). +#' If \code{return.all==FALSE} then only returns the transformed data +#' (not in list format) +#' If \code{FALSE} then just returns list containing \code{x.ilrp}. +#' @param abund_values A single character value for selecting the +#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{assay}} +#' to be used. Only used when \code{x} is object from this class. +#' Default: "counts". #' @inheritParams calculate.blw #' @details #' This is a utility function that pulls together a number of other functions @@ -40,164 +53,269 @@ #' \enumerate{ #' \item Create sbp (sign matrix) if not given #' \item Create parts weightings if not given -#' \item Shift the dataset with respect to the new reference measure (e.g., part weightings) -#' \item Create the basis contrast matrix from the sign matrix and the reference measure -#' \item Transform the data based on the contrast matrix and the reference measure -#' \item Calculate the specified ILR weightings and multiply each balance by the corresponding weighting +#' \item Shift the dataset with respect to the new reference measure +#' (e.g., part weightings) +#' \item Create the basis contrast matrix from the sign matrix and the +#' reference measure +#' \item Transform the data based on the contrast matrix and the reference +#' measure +#' \item Calculate the specified ILR weightings and multiply each balance by +#' the corresponding weighting #' } -#' Note for both the reference measure (part weightings) and the ILR weightings, specifying \code{'uniform'} will -#' give the same results as not weighting at all. \cr \cr -#' Note that some of the prespecified part.weights assume \code{df} is given as +#' Note for both the reference measure (part weightings) and the ILR weightings, +#' specifying \code{'uniform'} will give the same results as not weighting at +#' all. \cr \cr +#' Note that some of the prespecified part.weights assume \code{x} is given as #' counts and not as relative abundances. Except in this case counts or relative #' abundances can be given. -#' @return matrix if \code{return.all=FALSE}, if \code{return.all=TRUE} then a list is returned (see above). -#' @author Justin Silverman -#' @export +#' +#' The tree argument is ignored if the \code{x} argument is +#' \code{\link[phyloseq:phyloseq-class]{assay}} or +#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{assay}} +#' These objects can include a phylogenetic tree. If the phylogenetic tree +#' is missing from these objects, it should be integrated directly in these +#' data objects before running \code{philr}. Alternatively, you can always +#' provide the abundance matrix and tree separately in their standard formats. +#' +#' If you have a +#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{assay}}, +#' this can be converted into +#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{assay}}, +#' to incorporate tree information. +#' +#' @return matrix if \code{return.all=FALSE}, if \code{return.all=TRUE} then +#' a list is returned (see above). +#' @author Justin Silverman; S3 methods by Leo Lahti +#' @export #' @seealso \code{\link{phylo2sbp}} \code{\link{calculate.blw}} #' @examples +#' # Prepare example data #' tr <- named_rtree(5) -#' df <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add a small pseudocount -#' colnames(df) <- tr$tip.label +#' x <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add a small pseudocount +#' colnames(x) <- tr$tip.label +#' philr(x, tr, part.weights='enorm.x.gm.counts', +#' ilr.weights='blw.sqrt', return.all=FALSE) +#' +#' # Running philr on a TreeSummarizedExperiment object +#' +#' ## Prepare example data +#' library(mia) +#' library(tidyr) +#' data(GlobalPatterns, package="mia") +#' +#' ## Select prevalent taxa +#' tse <- GlobalPatterns %>% subsetByPrevalentTaxa( +#' detection = 3, +#' prevalence = 20/100, +#' as_relative = FALSE) #' -#' philr(df, tr, part.weights='enorm.x.gm.counts', -#' ilr.weights='blw.sqrt', return.all=FALSE) -philr <- function(df, tree, sbp=NULL, +#' ## Pick taxa that have notable abundance variation across sammples +#' variable.taxa <- apply(assay(tse, "counts"), 1, function(x) sd(x)/mean(x) > 3.0) +#' tse <- tse[variable.taxa,] +#' +#' # Collapse the tree +#' tree <- ape::keep.tip(phy = rowTree(tse), tip = rowLinks(tse)$nodeNum) +#' rowTree(tse) <- tree +#' +#' ## Add a new assay with a pseudocount +#' assays(tse)$counts.shifted <- assay(tse, "counts") + 1 +#' +#' ## Run philr for TreeSummarizedExperiment object +#' ## using the pseudocount data +#' res.tse <- philr(tse, part.weights='enorm.x.gm.counts', +#' ilr.weights='blw.sqrt', return.all=FALSE, +#' abund_values="counts.shifted") +#' +#' # Running philr on a phyloseq object +#' \donttest{ +#' pseq <- makePhyloseqFromTreeSummarizedExperiment(tse) +#' res.pseq <- philr(pseq, part.weights='enorm.x.gm.counts', +#' ilr.weights='blw.sqrt', return.all=FALSE) +#' } +#' +philr <- function(x, tree=NULL, sbp=NULL, part.weights='uniform', ilr.weights='uniform', return.all=FALSE, abund_values="counts") { UseMethod("philr") } + + +#' @export +philr.default <- function(x, ...){ + warning(paste("philr does not know how to handle object of class ", + class(x), + "and can only be used on classes TBA")) +} + +#' @export +philr.numeric <- function(x, tree, ...){ + philr.data.frame(x, tree, ...) +} + +#' @export +philr.phyloseq <- function(x, ...){ + + otu.table <- t(phyloseq::otu_table(x)) + tree <- phyloseq::phy_tree(x) + + philr.data.frame(otu.table, tree=tree, ...) + +} + + +#' @export +philr.TreeSummarizedExperiment <- function(x, tree=NULL, sbp=NULL, + part.weights='uniform', ilr.weights='uniform', + return.all=FALSE, abund_values="counts", ...){ + + tree <- TreeSummarizedExperiment::rowTree(x) + otu.table <- t(SummarizedExperiment::assays(x)[[abund_values]]) + philr.data.frame(otu.table, tree=tree, ...) +} + +#' @export +philr.matrix <- function(x, tree, ...){ + philr.data.frame(x, tree, ...) +} + +#' @export +philr.array <- function(x, tree, ...){ + philr.data.frame(x, tree, ...) +} + +#' @export +philr.data.frame <- function(x, tree, sbp=NULL, part.weights='uniform', ilr.weights='uniform', - return.all=FALSE){ - - # Convert df to mat with warning - df.name <- deparse(substitute(df)) - if (is.data.frame(df)) { - st <- paste("Converting ", df.name, " to matrix from data.frame", - " consider adding as.matrix(", df.name, ") to function call", - " to control conversion manually", sep="") - warning(st) - df <- as.matrix(df) - } - - # Convert vector input for df to matrix - df <- vec_to_mat(df) - - # Check for Zero values in df - if (any(df == 0)){ - stop('Zero values must be removed either through use of pseudocount, multiplicative replacement or other method.') - } - - # # Check to make sure df is a matrix otherwise convert it to one - # if(is.vector(df)) { - # tmp <- names(df) - # df <- matrix(df, nrow=1) - # colnames(df) <- tmp - # } - - # Create the sequential binary partition sign matrix - if (is.null(sbp)){ - message('Building Sequential Binary Partition from Tree...') - sbp <- phylo2sbp(tree) - } else { - if ( (nrow(sbp)!=ncol(df)) | (ncol(sbp)!=ncol(df)-1) ){ - stop("given sbp does not match dimentions required dimentions (e.g., ncol(df) x ncol(df)-1") + return.all=FALSE, abund_values=NULL, ...){ + # Convert x to mat with warning + x.name <- deparse(substitute(x)) + if (is.data.frame(x)) { + st <- paste("Converting ", x.name, " to matrix from data.frame", + " consider adding as.matrix(", x.name, ") to function call", + " to control conversion manually", sep="") + warning(st) + x <- as.matrix(x) + } + + # Convert vector input for x to matrix + x <- vec_to_mat(x) + + # Check for Zero values in x + if (any(x == 0)){ + stop('Zero values must be removed either through use of pseudocount, + multiplicative replacement or other method.') + } + + # Create the sequential binary partition sign matrix + if (is.null(sbp)){ + message('Building Sequential Binary Partition from Tree...') + sbp <- phylo2sbp(tree) + } else { + if ( (nrow(sbp)!=ncol(x)) | (ncol(sbp)!=ncol(x)-1) ){ + stop("given sbp does not match dimentions required dimentions + (e.g., ncol(x) x ncol(x)-1") + } + } + if (is.null(colnames(x))) { + stop("Input data must have column names that align with phylogenetic tree to prevent logical errors.") } - } - if (is.null(colnames(df))) stop("Input data must have column names that align with phylogenetic tree to prevent logical errors.") - sbp <- sbp[colnames(df), ] #Very Important line to avoid logical errors - - # Now need to create the weights on the parts - if (is.null(part.weights)){part.weights <- 'uniform'} - if (part.weights=='uniform'){ - p <- rep(1, ncol(df)) - names(p) <- rownames(sbp) - } else if (part.weights=='gm.counts'){ - p <- g.colMeans(df) - p <- p[rownames(sbp)] - } else if (part.weights=='anorm'){ - p <- apply(miniclo(t(df)), 1, function(x) normp(x, rep(1, nrow(df)))) - } else if (part.weights=='anorm.x.gm.counts'){ - gm.counts <- g.colMeans(df) - gm.counts <- gm.counts[rownames(sbp)] - anorm <- apply(miniclo(t(df)), 1, function(x) normp(x, rep(1, nrow(df)))) - anorm <- anorm[rownames(sbp)] - p <- gm.counts*anorm - } else if (part.weights=='enorm') { - enorm <- apply(miniclo(t(df)), 1, function(x) sqrt(sum(x^2))) - enorm <- enorm[rownames(sbp)] - p <- enorm - } else if (part.weights=='enorm.x.gm.counts'){ - gm.counts <- g.colMeans(df) - gm.counts <- gm.counts[rownames(sbp)] - enorm <- apply(miniclo(t(df)), 1, function(x) sqrt(sum(x^2))) - enorm <- enorm[rownames(sbp)] - p <- gm.counts*enorm - } - - # Make sure everything is lined up (else throw an error) - if (!all.equal(rownames(sbp), colnames(df), names(p))) { - stop("Rownames of SBP, Colnames of df, and names of part weights not equal!") - } - - # shift the dataset with respect to the new reference measure - df <- shiftp(miniclo(df), p) - - # Now create basis contrast matrix - message('Building Contrast Matrix...') - V <- buildilrBasep(sbp, p) - - # Finally transform the df - message('Transforming the Data...') - df.ilrp <- ilrp(df, p, V) - - # Now calculate ILR Weightings - if (is.character(ilr.weights)){ - message('Calculating ILR Weights...') - if (ilr.weights=='blw'){ - ilr.weights <- calculate.blw(tree, method='sum.children') - } else if (ilr.weights=='blw.sqrt'){ - ilr.weights <- sqrt(calculate.blw(tree, method='sum.children')) - } else if (ilr.weights=='uniform'){ - ilr.weights <- rep(1, ncol(df.ilrp)) - names(ilr.weights) <- colnames(df.ilrp) - } else if (ilr.weights=='mean.descendants'){ - ilr.weights <- calculate.blw(tree, method='mean.descendants') + sbp <- sbp[colnames(x), ] #Very Important line to avoid logical errors + + # Now need to create the weights on the parts + if (is.null(part.weights)){part.weights <- 'uniform'} + if (part.weights=='uniform'){ + p <- rep(1, ncol(x)) + names(p) <- rownames(sbp) + } else if (part.weights=='gm.counts'){ + p <- g.colMeans(x) + p <- p[rownames(sbp)] + } else if (part.weights=='anorm'){ + p <- apply(miniclo(t(x)), 1, function(x) normp(x, rep(1, nrow(x)))) + } else if (part.weights=='anorm.x.gm.counts'){ + gm.counts <- g.colMeans(x) + gm.counts <- gm.counts[rownames(sbp)] + anorm <- apply(miniclo(t(x)), 1, function(x) normp(x, rep(1, nrow(x)))) + anorm <- anorm[rownames(sbp)] + p <- gm.counts*anorm + } else if (part.weights=='enorm') { + enorm <- apply(miniclo(t(x)), 1, function(x) sqrt(sum(x^2))) + enorm <- enorm[rownames(sbp)] + p <- enorm + } else if (part.weights=='enorm.x.gm.counts'){ + gm.counts <- g.colMeans(x) + gm.counts <- gm.counts[rownames(sbp)] + enorm <- apply(miniclo(t(x)), 1, function(x) sqrt(sum(x^2))) + enorm <- enorm[rownames(sbp)] + p <- gm.counts*enorm } - } else { # Validate input of ilr.weights - if (!is.numeric(ilr.weights) | length(ilr.weights) != ncol(df.ilrp)){ - stop("ilr.weights must be recognized string or numeric vector of length = ncol(df)-1") + + # Make sure everything is lined up (else throw an error) + if (!all.equal(rownames(sbp), colnames(x), names(p))) { + stop("Rownames of SBP, Colnames of x, and names of part weights not equal!") } - } - - #TODO: Speed up by not computing if 'uniform' - if (!is.null(colnames(df.ilrp))){ - ilr.weights <- ilr.weights[colnames(df.ilrp)] - } - #ilr.weights <- ilr.weights[colnames(df.ilrp)] - tmp.names <- colnames(df.ilrp) - df.ilrp <- df.ilrp %*% diag(ilr.weights) - colnames(df.ilrp) <- tmp.names - - # Build return list - if (return.all==FALSE)return(df.ilrp) - if (return.all==TRUE){ - l.return = list() - l.return[['df.ilrp']] <- df.ilrp - l.return[['sbp']] <- sbp - l.return[['p']] <- p # the part weights - l.return[['V']] <- V # the contrast matrix - l.return[['ilr.weights']] <- ilr.weights - } - return(l.return) + + # shift the dataset with respect to the new reference measure + x <- shiftp(miniclo(x), p) + + # Now create basis contrast matrix + message('Building Contrast Matrix...') + V <- buildilrBasep(sbp, p) + + # Finally transform the x + message('Transforming the Data...') + x.ilrp <- ilrp(x, p, V) + + # Now calculate ILR Weightings + if (is.character(ilr.weights)){ + message('Calculating ILR Weights...') + if (ilr.weights=='blw'){ + ilr.weights <- calculate.blw(tree, method='sum.children') + } else if (ilr.weights=='blw.sqrt'){ + ilr.weights <- sqrt(calculate.blw(tree, method='sum.children')) + } else if (ilr.weights=='uniform'){ + ilr.weights <- rep(1, ncol(x.ilrp)) + names(ilr.weights) <- colnames(x.ilrp) + } else if (ilr.weights=='mean.descendants'){ + ilr.weights <- calculate.blw(tree, method='mean.descendants') + } + } else { # Validate input of ilr.weights + if (!is.numeric(ilr.weights) | length(ilr.weights) != ncol(x.ilrp)){ + stop("ilr.weights must be recognized string or numeric vector of + length = ncol(x)-1") + } + } + + # TODO: Speed up by not computing if 'uniform' + if (!is.null(colnames(x.ilrp))){ + ilr.weights <- ilr.weights[colnames(x.ilrp)] + } + # ilr.weights <- ilr.weights[colnames(x.ilrp)] + tmp.names <- colnames(x.ilrp) + x.ilrp <- x.ilrp %*% diag(ilr.weights) + colnames(x.ilrp) <- tmp.names + + # Build return list + if (return.all==FALSE)return(x.ilrp) + if (return.all==TRUE){ + l.return = list() + l.return[['x.ilrp']] <- x.ilrp + l.return[['sbp']] <- sbp + l.return[['p']] <- p # the part weights + l.return[['V']] <- V # the contrast matrix + l.return[['ilr.weights']] <- ilr.weights + } + return(l.return) } #' Inverse of PhILR Transform #' -#' @param df.ilrp transformed data to which the inverse transform will be applied -#' @param tree (optional) to be used to build sbp and contrast matrix (see details) +#' @param x.ilrp transformed data to which the inverse transform will be applied +#' @param tree (optional) to be used to build sbp and contrast matrix +#' (see details) #' @param sbp (optional) the sbp (sequential binary partition) used to build a #' contrast matrix (see details) #' @param V (optional) the contrast matrix (see details) #' @param part.weights weightings for parts, can be a named vector with names -#' corresponding to \code{colnames(df)}. Defaults to 'uniform' (part.weights = 1,...,1) -#' @param ilr.weights weightings for the ILR coordiantes can be a named vector with names -#' corresponding to names of internal nodes of \code{tree}. +#' corresponding to \code{colnames(x)}. Defaults to 'uniform' +#' (part.weights = 1,...,1) +#' @param ilr.weights weightings for the ILR coordiantes can be a named vector +#' with names corresponding to names of internal nodes of \code{tree}. #' Defaults to 'uniform' (ilr.weights = 1,...,1) #' #' @details This is a utility function for calculating the inverse of the @@ -213,57 +331,51 @@ philr <- function(df, tree, sbp=NULL, #' #' @examples #' tr <- named_rtree(5) -#' df <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add a small pseudocount -#' colnames(df) <- tr$tip.label -#' d <- philr(df, tr, part.weights='enorm.x.gm.counts', +#' x <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add small pseudocount +#' colnames(x) <- tr$tip.label +#' d <- philr(x, tr, part.weights='enorm.x.gm.counts', #' ilr.weights='blw.sqrt', return.all=TRUE) -#' d.inverted <- philrInv(d$df.ilrp, V=d$V, part.weights = d$p, +#' d.inverted <- philrInv(d$x.ilrp, V=d$V, part.weights = d$p, #' ilr.weights = d$ilr.weights) -#' all.equal(miniclo(df), d.inverted) -philrInv <- function(df.ilrp, tree=NULL, sbp=NULL, V=NULL, part.weights=NULL, ilr.weights=NULL){ - if (all(c(is.null(tree), is.null(sbp), is.null(V)))){ - stop('At least one of the parameters (tree, sbp, V) must be non-null') - } - - # Convert vector input for df to matrix - df.ilrp <- vec_to_mat(df.ilrp) - - # if(is.vector(df.ilrp)) { - # tmp <- names(df.ilrp) - # df.ilrp <- matrix(df.ilrp, nrow=1) - # colnames(df.ilrp) <- tmp - # } - - # Inverse ILR -> y (the shifted composition - but closed) - if (is.null(part.weights)){ - part.weights <- rep(1, ncol(df.ilrp)+1) - } - - if (!is.null(V)) NULL - else if (!is.null(sbp)) V <- buildilrBasep(sbp, part.weights) - else V <- buildilrBasep(phylo2sbp(tree), part.weights) - V <- V[,colnames(df.ilrp)] # make sure everything is lined up - - # Undo ILR weights - note: doing nothing (e.g., not matching criteria) - # is equivalent to undoing uniform ilr.weights (1,...,1) - if (!is.null(ilr.weights)) { - ilr.weights <- ilr.weights[colnames(df.ilrp)] # ensure things are lined up - df.ilrp <- df.ilrp / outer(rep(1,nrow(df.ilrp)), ilr.weights) - } - - # Make sure everything is lined up (else throw an error) - if (!all.equal(colnames(V), colnames(df.ilrp))) { - stop("Colnames of V, Colnames of df!") - } - - if (!all(part.weights==1)){ - if (!all.equal(rownames(V), names(part.weights))){ - stop("rownames of V and names of parts.weights not equal") +#' all.equal(miniclo(x), d.inverted) +philrInv <- function(x.ilrp, tree=NULL, sbp=NULL, V=NULL, part.weights=NULL, ilr.weights=NULL){ + if (all(c(is.null(tree), is.null(sbp), is.null(V)))){ + stop('At least one of the parameters (tree, sbp, V) must be non-null') + } + + # Convert vector input for x to matrix + x.ilrp <- vec_to_mat(x.ilrp) + + # Inverse ILR -> y (the shifted composition - but closed) + if (is.null(part.weights)){ + part.weights <- rep(1, ncol(x.ilrp)+1) + } + + if (!is.null(V)) NULL + else if (!is.null(sbp)) V <- buildilrBasep(sbp, part.weights) + else V <- buildilrBasep(phylo2sbp(tree), part.weights) + V <- V[,colnames(x.ilrp)] # make sure everything is lined up + + # Undo ILR weights - note: doing nothing (e.g., not matching criteria) + # is equivalent to undoing uniform ilr.weights (1,...,1) + if (!is.null(ilr.weights)) { + ilr.weights <- ilr.weights[colnames(x.ilrp)] # ensure things are lined up + x.ilrp <- x.ilrp / outer(rep(1,nrow(x.ilrp)), ilr.weights) + } + + # Make sure everything is lined up (else throw an error) + if (!all.equal(colnames(V), colnames(x.ilrp))) { + stop("Colnames of V, Colnames of x!") + } + + if (!all(part.weights==1)){ + if (!all.equal(rownames(V), names(part.weights))){ + stop("rownames of V and names of parts.weights not equal") + } } - } - y <- ilrpInv(df.ilrp, V) - x <- miniclo(shiftpInv(y, part.weights)) - x + y <- ilrpInv(x.ilrp, V) + x <- miniclo(shiftpInv(y, part.weights)) + x } diff --git a/R/plot_utils.R b/R/plot_utils.R index dac92a6..cfd313b 100644 --- a/R/plot_utils.R +++ b/R/plot_utils.R @@ -8,7 +8,8 @@ #' @param coord named internal node/balance to annotate #' @param p ggtree plot (tree layer), if \code{NULL} then a new plot will be #' created. -#' @param labels label for the numerator and denominator of the balance respectively +#' @param labels label for the numerator and denominator of the balance +#' respectively #' @param offset offset for bar (if \code{bar=TRUE}) from tips #' @param offset.text offset of text from bar (if \code{bar=TRUE}) or from tips #' (if \code{bar=FALSE}) @@ -16,7 +17,8 @@ #' @param barsize width of bar (if \code{bar=TRUE}) #' @param barfill fill of bar #' @param geom geom used to draw label (e.g., \code{'text'} or \code{'label'}) -#' @param ... additional parameters passed to \code{geom_rect} and specified \code{geom} +#' @param ... additional parameters passed to \code{geom_rect} and +#' specified \code{geom} #' #' @return ggplot object #' @importFrom ggplot2 annotate @@ -24,18 +26,21 @@ #' #' @export #' @author Justin Silverman -#' @references Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. -#' \emph{ggtree: an R package for visualization and annotation of phylogenetic trees with -#' their covariates and other associated data.} -#' Methods in Ecology and Evolution 2016, doi:10.1111/2041-210X.12628 +#' @references Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, +#' Tommy Tsan-Yuk Lam. +#' \emph{ggtree: an R package for visualization and annotation of +#' phylogenetic trees with their covariates and other associated data.} +#' Methods in Ecology and Evolution 2016, \url{doi:10.1111/2041-210X.12628} #' #' @examples #' tr <- named_rtree(10) #' #' annotate_balance(tr, 'n4', size=7) -#' annotate_balance(tr, 'n4', size=7, barsize=0.04, barfill='darkgreen', offset.text=0.05, color='red') +#' annotate_balance(tr, 'n4', size=7, barsize=0.04, barfill='darkgreen', +#' offset.text=0.05, color='red') #' annotate_balance(tr, 'n4', bar=FALSE, size=7) -#' annotate_balance(tr, 'n4', bar=TRUE, size=7, labels=c('Num', 'Denom'), offset.text=.3) +#' annotate_balance(tr, 'n4', bar=TRUE, size=7, labels=c('Num', 'Denom'), +#' offset.text=.3) #' annotate_balance(tr, 'n4', bar=TRUE, geom='label', size=8, offset.text=0.1) annotate_balance <- function(tr, coord, p=NULL, labels=c('+','-'), offset=0, offset.text=0.03, bar=TRUE, barsize=0.01, @@ -46,7 +51,8 @@ annotate_balance <- function(tr, coord, p=NULL, labels=c('+','-'), offset=0, names(labels) <- c('up', 'down') # get node numbers of children - ch.coord <- unlist(get.ud.nodes(tr, coord)) # get ud.nodes (to orient) - tests if coord is a tip + # get ud.nodes (to orient) - tests if coord is a tip + ch.coord <- unlist(get.ud.nodes(tr, coord)) ch.nn <- name.to.nn(tr, ch.coord) # Convert to node numbers names(ch.nn) <- names(ch.coord) @@ -56,38 +62,38 @@ annotate_balance <- function(tr, coord, p=NULL, labels=c('+','-'), offset=0, # Create Dataframe that contains location and dimentions of bar xmax <- get_clade_position(p, node=name.to.nn(tr, coord))[,'xmax'] - df.up <- get_clade_position(p, node=ch.nn['up']) - df.down <- get_clade_position(p, node=ch.nn['down']) - df <- rbind(df.up, df.down) - rownames(df) <- c('up','down') - df[,'xmin'] <- xmax + offset - df[, 'xmax'] <- df[,'xmin'] + barsize - df[,'ymin'] <- df[,'ymin'] + 0.2 - df[,'ymax'] <- df[,'ymax'] - 0.2 + x.up <- get_clade_position(p, node=ch.nn['up']) + x.down <- get_clade_position(p, node=ch.nn['down']) + x <- rbind(x.up, x.down) + rownames(x) <- c('up','down') + x[,'xmin'] <- xmax + offset + x[, 'xmax'] <- x[,'xmin'] + barsize + x[,'ymin'] <- x[,'ymin'] + 0.2 + x[,'ymax'] <- x[,'ymax'] - 0.2 if (bar){ p <- p + annotate(geom = geom, - x=df['up',]$xmax+offset.text, - y=(df['up',]$ymax + df['up',]$ymin)/2, + x=x['up',]$xmax+offset.text, + y=(x['up',]$ymax + x['up',]$ymin)/2, label=labels['up'], ...) + annotate(geom = geom, - x=df['down',]$xmax+offset.text, - y=(df['down',]$ymax + df['down',]$ymin)/2, + x=x['down',]$xmax+offset.text, + y=(x['down',]$ymax + x['down',]$ymin)/2, label=labels['down'], ...) + - annotate('rect', xmin=df['up',]$xmin, xmax=df['up',]$xmax, - ymin=df['up',]$ymin, ymax=df['up',]$ymax, fill=barfill) + - annotate('rect', xmin=df['down',]$xmin, xmax=df['down',]$xmax, - ymin=df['down',]$ymin, ymax=df['down',]$ymax, fill=barfill) + annotate('rect', xmin=x['up',]$xmin, xmax=x['up',]$xmax, + ymin=x['up',]$ymin, ymax=x['up',]$ymax, fill=barfill) + + annotate('rect', xmin=x['down',]$xmin, xmax=x['down',]$xmax, + ymin=x['down',]$ymin, ymax=x['down',]$ymax, fill=barfill) } else { p <- p + annotate(geom = geom, x=xmax+offset.text, - y=(df['up',]$ymax + df['up',]$ymin)/2, + y=(x['up',]$ymax + x['up',]$ymin)/2, label=labels['up'], ...) + annotate(geom = geom, x=xmax+offset.text, - y=(df['down',]$ymax + df['down',]$ymin)/2, + y=(x['down',]$ymax + x['down',]$ymin)/2, label=labels['down'], ...) } p diff --git a/inst/NEWS b/inst/NEWS index 8a6e43e..5c724f6 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,3 +1,17 @@ +CHANGES IN VERSION 1.19.1 +-------------------------- + +USER-VISIBLE CHANGES + + o Added support for TreeSummarizedExperiment class + o Added support for phyloseq class + o Updated vignette + o Changed main argument name from df to x + +INTERNAL + o Implemented philr as S3 method + + CHANGES IN VERSION 1.3.1 -------------------------- diff --git a/man/annotate_balance.Rd b/man/annotate_balance.Rd index 07953c3..8f37820 100644 --- a/man/annotate_balance.Rd +++ b/man/annotate_balance.Rd @@ -4,9 +4,19 @@ \alias{annotate_balance} \title{annotate_balance} \usage{ -annotate_balance(tr, coord, p = NULL, labels = c("+", "-"), - offset = 0, offset.text = 0.03, bar = TRUE, barsize = 0.01, - barfill = "darkgrey", geom = "text", ...) +annotate_balance( + tr, + coord, + p = NULL, + labels = c("+", "-"), + offset = 0, + offset.text = 0.03, + bar = TRUE, + barsize = 0.01, + barfill = "darkgrey", + geom = "text", + ... +) } \arguments{ \item{tr}{phylo object} @@ -16,7 +26,8 @@ annotate_balance(tr, coord, p = NULL, labels = c("+", "-"), \item{p}{ggtree plot (tree layer), if \code{NULL} then a new plot will be created.} -\item{labels}{label for the numerator and denominator of the balance respectively} +\item{labels}{label for the numerator and denominator of the balance +respectively} \item{offset}{offset for bar (if \code{bar=TRUE}) from tips} @@ -31,7 +42,8 @@ created.} \item{geom}{geom used to draw label (e.g., \code{'text'} or \code{'label'})} -\item{...}{additional parameters passed to \code{geom_rect} and specified \code{geom}} +\item{...}{additional parameters passed to \code{geom_rect} and +specified \code{geom}} } \value{ ggplot object @@ -45,16 +57,19 @@ denominator (\code{down}). tr <- named_rtree(10) annotate_balance(tr, 'n4', size=7) -annotate_balance(tr, 'n4', size=7, barsize=0.04, barfill='darkgreen', offset.text=0.05, color='red') +annotate_balance(tr, 'n4', size=7, barsize=0.04, barfill='darkgreen', + offset.text=0.05, color='red') annotate_balance(tr, 'n4', bar=FALSE, size=7) -annotate_balance(tr, 'n4', bar=TRUE, size=7, labels=c('Num', 'Denom'), offset.text=.3) +annotate_balance(tr, 'n4', bar=TRUE, size=7, labels=c('Num', 'Denom'), + offset.text=.3) annotate_balance(tr, 'n4', bar=TRUE, geom='label', size=8, offset.text=0.1) } \references{ -Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. -\emph{ggtree: an R package for visualization and annotation of phylogenetic trees with -their covariates and other associated data.} -Methods in Ecology and Evolution 2016, doi:10.1111/2041-210X.12628 +Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, +Tommy Tsan-Yuk Lam. +\emph{ggtree: an R package for visualization and annotation of +phylogenetic trees with their covariates and other associated data.} +Methods in Ecology and Evolution 2016, \url{doi:10.1111/2041-210X.12628} } \author{ Justin Silverman diff --git a/man/calculate.blw.Rd b/man/calculate.blw.Rd index d400393..5fb3cca 100644 --- a/man/calculate.blw.Rd +++ b/man/calculate.blw.Rd @@ -8,13 +8,16 @@ calculate.blw(tree, method = "sum.children") } \arguments{ -\item{tree}{a \code{phylo} class tree object that is binary (see \code{\link[ape]{multi2di}})} +\item{tree}{a \code{phylo} class tree object that is binary (see +\code{\link[ape]{multi2di}})} -\item{method}{options include: (default) \code{'sum.children'} and \code{'mean.descendants'} +\item{method}{options include: (default) \code{'sum.children'} and +\code{'mean.descendants'} see details for more information.} } \value{ -vector of weightings for ILR coordinates produced via specified method. +vector of weightings for ILR coordinates produced via specified +method. } \description{ Calculates the weightings for ILR coordinates based on branch lenghts of @@ -26,13 +29,15 @@ can be imbued with branch length information. This function is helpful in calculating those weightings.\cr \cr There are a number of methods for calculating these weightings, the default \code{'sum.children'} calculates the weighting for a given balance as the sum -of its two direct children's branch length. An alternative that has been as yet less -studied is \code{'mean.descendants'} to calculate the weighting for a given balance -as the sum of its two direct children's branch lengths PLUS for each child the average -distance from it to its descendant tips. \cr \cr -\emph{Note:} That some trees contain tips with branch lengths of zero length. This can result -in that tip being unreasonably downweighted, as such this function automatically -adds a small pseudocount to those tips with zero length (equal to the smallest non-zero) +of its two direct children's branch length. An alternative that has been +as yet less studied is \code{'mean.descendants'} to calculate the weighting +for a given balance as the sum of its two direct children's branch lengths +PLUS for each child the average distance from it to its descendant tips. +\cr \cr +\emph{Note:} That some trees contain tips with branch lengths of zero length. +This can result in that tip being unreasonably downweighted, as such this +function automatically adds a small pseudocount to those tips with zero +length (equal to the smallest non-zero) branch length on the tree. } \examples{ diff --git a/man/convert_to_long.Rd b/man/convert_to_long.Rd index d72b427..3f24c35 100644 --- a/man/convert_to_long.Rd +++ b/man/convert_to_long.Rd @@ -4,15 +4,17 @@ \alias{convert_to_long} \title{Converts wide format ILR transformed data to long format} \usage{ -convert_to_long(df, labels) +convert_to_long(x, labels) } \arguments{ -\item{df}{PhILR transformed data in wide format (samples by balances) (see \code{\link{philr}})} +\item{x}{PhILR transformed data in wide format (samples by balances) +(see \code{\link{philr}})} -\item{labels}{vector (of length \code{nrow(df)}) with labels to group samples by} +\item{labels}{vector (of length \code{nrow(x)}) with labels to group +samples by} } \value{ -\code{df} in long format with columns +\code{x} in long format with columns \itemize{ \item sample \item labels @@ -21,15 +23,16 @@ convert_to_long(df, labels) } } \description{ -Converts wide format ILR transformed data (see \code{\link{philr}}) to long format -useful in various plotting functions where long format data is required. +Converts wide format ILR transformed data (see \code{\link{philr}}) to +long format useful in various plotting functions where long format data is +required. } \examples{ tr <- named_rtree(5) -df <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add a small pseudocount -colnames(df) <- tr$tip.label +x <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add small pseudocount +colnames(x) <- tr$tip.label -df.philr <- philr(df, tr, part.weights='enorm.x.gm.counts', - ilr.weights='blw.sqrt', return.all=FALSE) -convert_to_long(df.philr, rep(c('a','b'), 5)) +x.philr <- philr(x, tree=tr, part.weights='enorm.x.gm.counts', + ilr.weights='blw.sqrt', return.all=FALSE) + convert_to_long(x.philr, rep(c('a','b'), 5)) } diff --git a/man/mean_dist_to_tips.Rd b/man/mean_dist_to_tips.Rd index 99bdd56..b6d4996 100644 --- a/man/mean_dist_to_tips.Rd +++ b/man/mean_dist_to_tips.Rd @@ -7,7 +7,8 @@ mean_dist_to_tips(tree) } \arguments{ -\item{tree}{a \code{phylo} class tree object that is binary (see \code{\link[ape]{multi2di}})} +\item{tree}{a \code{phylo} class tree object that is binary (see +\code{\link[ape]{multi2di}})} } \value{ vector (named if internal nodes are named) diff --git a/man/name.balance.Rd b/man/name.balance.Rd index 13bdcdb..66c3abb 100644 --- a/man/name.balance.Rd +++ b/man/name.balance.Rd @@ -4,63 +4,83 @@ \alias{name.balance} \title{Name a balance (coordinate) based on taxonomy} \usage{ -name.balance(tr, tax, coord, method = "voting", thresh = 0.95, - return.votes = NULL) +name.balance( + tr, + tax, + coord, + method = "voting", + thresh = 0.95, + return.votes = NULL +) } \arguments{ \item{tr}{an object of class \code{'phylo'}} -\item{tax}{a matrix/data.frame of taxonomy, rownames should correspond to \code{tr$tip.labels} -columns should be taxonomic levels (named) with increasing taxonomic resolution from left to right -(e.g., Phylum to the left of Genus).} +\item{tax}{a matrix/data.frame of taxonomy, rownames should correspond to +\code{tr$tip.labels} columns should be taxonomic levels (named) with +increasing taxonomic resolution from left to right (e.g., Phylum to the +left of Genus).} -\item{coord}{the name of a balance/internal node on the tree (given as a string)} +\item{coord}{the name of a balance/internal node on the tree (given as a +string)} \item{method}{currently only \code{'voting'} implemented. See Details.} -\item{thresh}{threshold for assignment of taxonomy to a given part of a balance -(must be greater than 0.5 if \code{method='voting'}; see details).} +\item{thresh}{threshold for assignment of taxonomy to a given part of a +balance (must be greater than 0.5 if \code{method='voting'}; see details).} -\item{return.votes}{whether voting results by taxonomic level should be shown for \code{coord}. Note: this is helpful when -\code{name.balance} does not return a clear winner, as may be the case when a given \code{coord} represents more than one -taxonomic lineage. votes are returned as a list indexed by \code{colnames(tax)} Options include: +\item{return.votes}{whether voting results by taxonomic level should be +shown for \code{coord}. Note: this is helpful when \code{name.balance} does +not return a clear winner, as may be the case when a given \code{coord} +represents more than one taxonomic lineage. votes are returned as a list +indexed by \code{colnames(tax)} Options include: \describe{ -\item{\code{NULL}}{(default) only returns the combined consensus name of the balance} +\item{\code{NULL}}{(default) only returns the combined consensus name of + the balance} \item{\code{'up'}}{adds tallied votes for the 'up' node to the output list} -\item{\code{'down'}}{adds tallied votes for the 'down'node to the output list} +\item{\code{'down'}}{adds tallied votes for the 'down' node to the output + list} \item{\code{'self'}}{adds tallied votes for \code{coord} to the output list} }} } \value{ -If \code{return.votes=NULL} returns a string of the form (ex. 'Genus_Bacteroides/Phylum_Firmicutes'). Otherwise returns -a list with the above string as 'name', see Arguments for \code{show.votes} for other optional returned items. +If \code{return.votes=NULL} returns a string of the form +(ex. 'Genus_Bacteroides/Phylum_Firmicutes'). Otherwise returns +a list with the above string as 'name', see Arguments for \code{show.votes} +for other optional returned items. } \description{ -For a given ILR balance (coordinate) assigns a name to the balance based on a -provided taxonomy table. This is useful for interpretation of the balances. +For a given ILR balance (coordinate) assigns a name to the balance based on +a provided taxonomy table. This is useful for interpretation of the balances. } \details{ A bit of terminology: \describe{ -\item{coord}{this is the same as the names of the balances which should be the same as the names of the internal nodes of \code{tr}} -\item{'up'}{this is the child node of \code{coord} that is represented in the numerator of the \code{coord} balance.} -\item{'down'}{this is the child node of \code{coord} that is represented in the denominator of the \code{coord} balance} +\item{coord}{this is the same as the names of the balances which should be +the same as the names of the internal nodes of \code{tr}} +\item{'up'}{this is the child node of \code{coord} that is represented in +the numerator of the \code{coord} balance.} +\item{'down'}{this is the child node of \code{coord} that is represented in +the denominator of the \code{coord} balance} } -The method \code{'voting'} assigns the name of the each part of a balance (e.g., numerator and denominator / each -child of \code{coord}) as follows: +The method \code{'voting'} assigns the name of the each part of a balance +(e.g., numerator and denominator / each child of \code{coord}) as follows: \enumerate{ -\item First Subset \code{tax} to contain only descendent tips of the given child of \code{coord} -\item Second At the finest taxonomic (farthest right of \code{tax}) see if any one taxonomic label -is present at or above \code{thresh}. If yes output that taxonomic label (at that taxonomic level) as the label -for that child of \code{coord}. If no then move to coarser taxonomic level (leftward) and repeat. +\item First Subset \code{tax} to contain only descendent tips of the given +child of \code{coord} +\item Second At the finest taxonomic (farthest right of \code{tax}) see if +any one taxonomic label is present at or above \code{thresh}. If yes output +that taxonomic label (at that taxonomic level) as the label for that child +of \code{coord}. If no then move to coarser taxonomic level (leftward) and +repeat. } } \examples{ tr <- named_rtree(40) tax <- data.frame(Kingdom=rep('A', 40), - Phylum=rep(c('B','C'), each=20), - Genus=c(sample(c('D','F'),20, replace=TRUE), - sample(c('G','H'), 20, replace=TRUE))) + Phylum=rep(c('B','C'), each=20), + Genus=c(sample(c('D','F'),20, replace=TRUE), + sample(c('G','H'), 20, replace=TRUE))) rownames(tax) <- tr$tip.label name.balance(tr, tax, 'n1') name.balance(tr, tax, 'n34') diff --git a/man/philr.Rd b/man/philr.Rd index 4de9984..89cbb2c 100644 --- a/man/philr.Rd +++ b/man/philr.Rd @@ -5,53 +5,75 @@ \alias{build.phylo.ilr} \title{Data transformation and driver of PhILR.} \usage{ -philr(df, tree, sbp = NULL, part.weights = "uniform", - ilr.weights = "uniform", return.all = FALSE) +philr( + x, + tree = NULL, + sbp = NULL, + part.weights = "uniform", + ilr.weights = "uniform", + return.all = FALSE, + abund_values = "counts" +) } \arguments{ -\item{df}{\strong{matrix} of data to be transformed (samples are rows, -compositional parts are columns) - zero must be dealt with either with pseudocount, -multiplicative replacement, or another method.} +\item{x}{\strong{matrix} of data to be transformed (samples are rows, +compositional parts are columns) - zero must be dealt with either with +pseudocount, multiplicative replacement, or another method.} -\item{tree}{a \code{phylo} class tree object that is binary (see \code{\link[ape]{multi2di}})} +\item{tree}{a \code{phylo} class tree object that is binary (see +\code{\link[ape]{multi2di}})} \item{sbp}{(Optional) give a precomputed sbp matrix \code{\link{phylo2sbp}} -if you are going to build multiple ILR bases (e.g., with different weightings).} +if you are going to build multiple ILR bases (e.g., with different +weightings).} \item{part.weights}{weightings for parts, can be a named vector with names -corresponding to \code{colnames(df)} otherwise can be a string, options include: +corresponding to \code{colnames(x)} otherwise can be a string, options +include: \describe{ \item{\code{'uniform'}}{(default) uses the uniform reference measure} -\item{\code{'gm.counts'}}{geometric mean of parts of df} -\item{\code{'anorm'}}{aitchison norm of parts of df (after closure)} +\item{\code{'gm.counts'}}{geometric mean of parts of x} +\item{\code{'anorm'}}{aitchison norm of parts of x (after closure)} \item{\code{'anorm.x.gm.counts'}}{\code{'anorm'} times \code{'gm.counts'}} -\item{\code{'enorm'}}{euclidean norm of parts of df (after closure)} -\item{\code{'enorm.x.gm.counts'}}{\code{'enorm'} times \code{'gm.counts'}, often gives good results} +\item{\code{'enorm'}}{euclidean norm of parts of x (after closure)} +\item{\code{'enorm.x.gm.counts'}}{\code{'enorm'} times \code{'gm.counts'}, + often gives good results} }} -\item{ilr.weights}{weightings for the ILR coordiantes can be a named vector with names -corresponding to names of internal nodes of \code{tree} otherwise can be a string, +\item{ilr.weights}{weightings for the ILR coordiantes can be a named vector +with names corresponding to names of internal nodes of \code{tree} +otherwise can be a string, options include: \describe{ \item{\code{'uniform'}}{(default) no weighting of the ILR basis} \item{\code{'blw'}}{sum of children's branch lengths} \item{\code{'blw.sqrt'}}{square root of \code{'blw'} option} -\item{\code{'mean.descendants'}}{sum of children's branch lengths PLUS the sum of -each child's mean distance to its descendent tips} +\item{\code{'mean.descendants'}}{sum of children's branch lengths PLUS the + sum of each child's mean distance to its descendent tips} }} -\item{return.all}{return all computed parts (e.g., computed sign matrix(\code{sbp}), -part weightings (code{p}), ilr weightings (code{ilr.weights}), contrast matrix (\code{V})) -as a list (default=\code{FALSE}) in addition to in addition to returning the transformed data (\code{df.ilrp}). -If \code{return.all==FALSE} then only returns the transformed data (not in list format) -If \code{FALSE} then just returns list containing \code{df.ilrp}.} +\item{return.all}{return all computed parts (e.g., computed sign + matrix(\code{sbp}), part weightings (code{p}), ilr weightings + (code{ilr.weights}), contrast matrix (\code{V})) as a list + (default=\code{FALSE}) in addition to in addition to returning the + transformed data (\code{.ilrp}). +If \code{return.all==FALSE} then only returns the transformed data +(not in list format) +If \code{FALSE} then just returns list containing \code{x.ilrp}.} + +\item{abund_values}{A single character value for selecting the + \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{assay}} +to be used. Only used when \code{x} is object from this class. +Default: "counts".} } \value{ -matrix if \code{return.all=FALSE}, if \code{return.all=TRUE} then a list is returned (see above). +matrix if \code{return.all=FALSE}, if \code{return.all=TRUE} then +a list is returned (see above). } \description{ -This is the main function for building the phylogenetic ILR basis, calculating the -weightings (of the parts and the ILR coordinates) and then transforming the data. +This is the main function for building the phylogenetic ILR basis, +calculating the weightings (of the parts and the ILR coordinates) and then +transforming the data. } \details{ This is a utility function that pulls together a number of other functions @@ -59,28 +81,85 @@ in \code{philr}. The steps that are executed are as follows: \enumerate{ \item Create sbp (sign matrix) if not given \item Create parts weightings if not given -\item Shift the dataset with respect to the new reference measure (e.g., part weightings) -\item Create the basis contrast matrix from the sign matrix and the reference measure -\item Transform the data based on the contrast matrix and the reference measure -\item Calculate the specified ILR weightings and multiply each balance by the corresponding weighting +\item Shift the dataset with respect to the new reference measure + (e.g., part weightings) +\item Create the basis contrast matrix from the sign matrix and the + reference measure +\item Transform the data based on the contrast matrix and the reference + measure +\item Calculate the specified ILR weightings and multiply each balance by + the corresponding weighting } -Note for both the reference measure (part weightings) and the ILR weightings, specifying \code{'uniform'} will -give the same results as not weighting at all. \cr \cr -Note that some of the prespecified part.weights assume \code{df} is given as +Note for both the reference measure (part weightings) and the ILR weightings, +specifying \code{'uniform'} will give the same results as not weighting at +all. \cr \cr +Note that some of the prespecified part.weights assume \code{x} is given as counts and not as relative abundances. Except in this case counts or relative abundances can be given. + +The tree argument is ignored if the \code{x} argument is +\code{\link[phyloseq:phyloseq-class]{assay}} or +\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{assay}} +These objects can include a phylogenetic tree. If the phylogenetic tree +is missing from these objects, it should be integrated directly in these +data objects before running \code{philr}. Alternatively, you can always +provide the abundance matrix and tree separately in their standard formats. + +If you have a +\code{\link[SummarizedExperiment:SummarizedExperiment-class]{assay}}, +this can be converted into +\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{assay}}, +to incorporate tree information. } \examples{ +# Prepare example data tr <- named_rtree(5) -df <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add a small pseudocount -colnames(df) <- tr$tip.label +x <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add a small pseudocount +colnames(x) <- tr$tip.label +philr(x, tr, part.weights='enorm.x.gm.counts', + ilr.weights='blw.sqrt', return.all=FALSE) + +# Running philr on a TreeSummarizedExperiment object + +## Prepare example data +library(mia) +library(tidyr) +data(GlobalPatterns, package="mia") + +## Select prevalent taxa +tse <- GlobalPatterns \%>\% subsetByPrevalentTaxa( + detection = 3, + prevalence = 20/100, + as_relative = FALSE) + +## Pick taxa that have notable abundance variation across sammples +variable.taxa <- apply(assay(tse, "counts"), 1, function(x) sd(x)/mean(x) > 3.0) +tse <- tse[variable.taxa,] + +# Collapse the tree +tree <- ape::keep.tip(phy = rowTree(tse), tip = rowLinks(tse)$nodeNum) +rowTree(tse) <- tree + +## Add a new assay with a pseudocount +assays(tse)$counts.shifted <- assay(tse, "counts") + 1 + +## Run philr for TreeSummarizedExperiment object +## using the pseudocount data +res.tse <- philr(tse, part.weights='enorm.x.gm.counts', + ilr.weights='blw.sqrt', return.all=FALSE, + abund_values="counts.shifted") + +# Running philr on a phyloseq object +\donttest{ + pseq <- makePhyloseqFromTreeSummarizedExperiment(tse) + res.pseq <- philr(pseq, part.weights='enorm.x.gm.counts', + ilr.weights='blw.sqrt', return.all=FALSE) +} -philr(df, tr, part.weights='enorm.x.gm.counts', - ilr.weights='blw.sqrt', return.all=FALSE) } \seealso{ \code{\link{phylo2sbp}} \code{\link{calculate.blw}} } \author{ -Justin Silverman +Justin Silverman; S3 methods by Leo Lahti } diff --git a/man/philrInv.Rd b/man/philrInv.Rd index fa57f66..238b99c 100644 --- a/man/philrInv.Rd +++ b/man/philrInv.Rd @@ -4,13 +4,20 @@ \alias{philrInv} \title{Inverse of PhILR Transform} \usage{ -philrInv(df.ilrp, tree = NULL, sbp = NULL, V = NULL, - part.weights = NULL, ilr.weights = NULL) +philrInv( + x.ilrp, + tree = NULL, + sbp = NULL, + V = NULL, + part.weights = NULL, + ilr.weights = NULL +) } \arguments{ -\item{df.ilrp}{transformed data to which the inverse transform will be applied} +\item{x.ilrp}{transformed data to which the inverse transform will be applied} -\item{tree}{(optional) to be used to build sbp and contrast matrix (see details)} +\item{tree}{(optional) to be used to build sbp and contrast matrix +(see details)} \item{sbp}{(optional) the sbp (sequential binary partition) used to build a contrast matrix (see details)} @@ -18,10 +25,11 @@ contrast matrix (see details)} \item{V}{(optional) the contrast matrix (see details)} \item{part.weights}{weightings for parts, can be a named vector with names -corresponding to \code{colnames(df)}. Defaults to 'uniform' (part.weights = 1,...,1)} +corresponding to \code{colnames(x)}. Defaults to 'uniform' +(part.weights = 1,...,1)} -\item{ilr.weights}{weightings for the ILR coordiantes can be a named vector with names -corresponding to names of internal nodes of \code{tree}. +\item{ilr.weights}{weightings for the ILR coordiantes can be a named vector +with names corresponding to names of internal nodes of \code{tree}. Defaults to 'uniform' (ilr.weights = 1,...,1)} } \value{ @@ -39,13 +47,13 @@ parameters must be specified (\code{tree}, \code{sbp}, or \code{V}). } \examples{ tr <- named_rtree(5) - df <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add a small pseudocount - colnames(df) <- tr$tip.label - d <- philr(df, tr, part.weights='enorm.x.gm.counts', + x <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add small pseudocount + colnames(x) <- tr$tip.label + d <- philr(x, tr, part.weights='enorm.x.gm.counts', ilr.weights='blw.sqrt', return.all=TRUE) - d.inverted <- philrInv(d$df.ilrp, V=d$V, part.weights = d$p, + d.inverted <- philrInv(d$x.ilrp, V=d$V, part.weights = d$p, ilr.weights = d$ilr.weights) - all.equal(miniclo(df), d.inverted) + all.equal(miniclo(x), d.inverted) } \seealso{ \code{\link{philr}} diff --git a/tests/testthat/test-BLcalcs.R b/tests/testthat/test-BLcalcs.R index 575408b..8699041 100644 --- a/tests/testthat/test-BLcalcs.R +++ b/tests/testthat/test-BLcalcs.R @@ -10,7 +10,7 @@ test_that('mean_dist_to_tips gives correct results on fixed tree', { mdtt <- mean_dist_to_tips(tr.fixed) expect_equal(length(mdtt), 4) - expect_equal(mdtt[3], 1.018737, tolerance=3e-7, check.attributes=F) + expect_equal(mdtt[3], 1.018737, tolerance=3e-7, check.attributes=FALSE) expect_equivalent(mdtt[4], 0.5168615) }) diff --git a/tests/testthat/test-philr.R b/tests/testthat/test-philr.R index 32c07e3..59c3bbb 100644 --- a/tests/testthat/test-philr.R +++ b/tests/testthat/test-philr.R @@ -2,32 +2,31 @@ context("philr and philrInv functions") test_that("philrInv inverses the philr transform", { tr <- named_rtree(5) - df <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add a small pseudocount - colnames(df) <- tr$tip.label + x <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add a small pseudocount + colnames(x) <- tr$tip.label - d <- philr(df, tr, part.weights='enorm.x.gm.counts', + d <- philr(x, tree=tr, part.weights='enorm.x.gm.counts', ilr.weights='blw.sqrt', return.all=TRUE) - expect_equal(philrInv(d$df.ilrp, V=d$V, part.weights = d$p, ilr.weights = d$ilr.weights), - miniclo(df)) + expect_equal(philrInv(d$x.ilrp, V=d$V, part.weights = d$p, ilr.weights = d$ilr.weights), + miniclo(x)) }) test_that("philr and philrInv handles vectors and default uniform arguments", { tr <- named_rtree(5) - df <- c(1,4,1,22,2) - names(df) <- tr$tip.label + x <- c(1,4,1,22,2) + names(x) <- tr$tip.label - df.ilr <- philr(df, tr, return.all=F) + x.ilr <- philr(x, tree=tr, return.all=FALSE) - expect_equivalent(philrInv(df.ilr, tr), - miniclo(df)) + expect_equivalent(philrInv(x.ilr, tr), + miniclo(x)) }) test_that("philr and philrInv conserve distances of circle", { phy1 <- named_rtree(3) phy2 <- named_rtree(3) - t <- seq(0, 2*pi, by = 0.1) x <- cos(t) y <- sin(t) @@ -42,9 +41,9 @@ test_that("philr and philrInv conserve distances of circle", { test_that("philr handles data.frame input with warning", { tr <- named_rtree(5) - df <- c(1,4,1,22,2) - names(df) <- tr$tip.label - dfxxx <- as.data.frame(t(as.data.frame(df))) + x <- c(1,4,1,22,2) + names(x) <- tr$tip.label + xxxx <- as.data.frame(t(as.data.frame(x))) - expect_warning(philr(dfxxx, tr, return.all=F), "dfxxx") + expect_warning(philr(xxxx, tree=tr, return.all=FALSE), "xxxx") }) diff --git a/tests/testthat/test-wilr.R b/tests/testthat/test-wilr.R index e0b0971..0c613d8 100644 --- a/tests/testthat/test-wilr.R +++ b/tests/testthat/test-wilr.R @@ -6,16 +6,16 @@ test_that('check.zeroes throws proper waring',{ test_that('shiftp function handles both matricies and vectors', { x1 <- c(1,2,3) - x2 <- matrix(c(1,2,3,1,2,3,1,2,3), nrow = 3, byrow=T) + x2 <- matrix(c(1,2,3,1,2,3,1,2,3), nrow = 3, byrow=TRUE) p <- c(1, .1, .5) expect_equal(shiftp(x1, p), matrix(c(1, 20, 6), nrow=1)) - expect_equal(shiftp(x2, p), matrix(rep(c(1, 20, 6), 3), nrow=3, byrow=T)) + expect_equal(shiftp(x2, p), matrix(rep(c(1, 20, 6), 3), nrow=3, byrow=TRUE)) }) test_that('shiftpInv function reverses Shiftp', { x1 <- matrix(c(1,2,3), nrow=1) - x2 <- matrix(c(1,2,3,1,2,3,1,2,3), nrow = 3, byrow=T) + x2 <- matrix(c(1,2,3,1,2,3,1,2,3), nrow = 3, byrow=TRUE) p <- c(1, .1, .5) y1 <- shiftp(x1, p) y2 <- shiftp(x2, p) @@ -73,9 +73,9 @@ test_that('ilrpInv reverses effect of ilrp', { tr <- named_rtree(5) sbp <- phylo2sbp(tr) V <- buildilrBasep(sbp, p) - df <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add a small pseudocount - x <- miniclo(df) - y <- shiftp(miniclo(df), p) + x <- t(rmultinom(10,100,c(.1,.6,.2,.3,.2))) + 0.65 # add a small pseudocount + x <- miniclo(x) + y <- shiftp(miniclo(x), p) y.star <- ilrp(y, p, V) y.inversed <- ilrpInv(y.star, V) diff --git a/vignettes/philr-intro.Rmd b/vignettes/philr-intro.Rmd index b671a5a..d359b1b 100644 --- a/vignettes/philr-intro.Rmd +++ b/vignettes/philr-intro.Rmd @@ -10,88 +10,173 @@ vignette: > %\VignetteEncoding{UTF-8} --- + # Introduction -PhILR is short for "Phylogenetic Isometric Log-Ratio Transform" [@silverman2017]. -This package provides functions for the analysis of [compositional data](https://en.wikipedia.org/wiki/Compositional_data) (e.g., data representing proportions of different variables/parts). Specifically this package allows analysis of compositional data where the parts can be related through a phylogenetic tree (as is common in microbiota survey data) and makes available the Isometric Log Ratio transform built from the phylogenetic tree and utilizing a weighted reference measure [@egozcue2016]. + +PhILR is short for "Phylogenetic Isometric Log-Ratio Transform" +[@silverman2017]. This package provides functions for the analysis of +[compositional data](https://en.wikipedia.org/wiki/Compositional_data) +(e.g., data representing proportions of different +variables/parts). Specifically this package allows analysis of +compositional data where the parts can be related through a +phylogenetic tree (as is common in microbiota survey data) and makes +available the Isometric Log Ratio transform built from the +phylogenetic tree and utilizing a weighted reference measure +[@egozcue2016]. # Overview of PhILR Analysis -The goal of PhILR is to transform compositional data into an orthogonal unconstrained space (real space) with phylogenetic / evolutionary interpretation while preserving all information contained in the original composition. Unlike in the original compositional space, in the transformed real space, standard statistical tools may be applied. For a given set of samples consisting of measurements of taxa, we transform data into a new space of samples and orthonormal coordinates termed ‘balances’. Each balance is associated with a single internal node of a phylogenetic tree with the taxa as leaves. The balance represents the log-ratio of the geometric mean abundance of the two groups of taxa that descend from the given internal node. More details on this method can be found in @silverman2017 ([Link](https://elifesciences.org/content/6/e21887)). -# Loading and Preprocessing Dataset -Here we will demonstrate PhILR analysis using the Global Patterns dataset that -was originally published by @caporaso2011. This dataset is provided with the -`phyloseq` package [@mcmurdie2013] and our analysis follows, in part, that of the authors -[GitHub Tutorial](http://joey711.github.io/phyloseq/preprocess.html). +The goal of PhILR is to transform compositional data into an +orthogonal unconstrained space (real space) with phylogenetic / +evolutionary interpretation while preserving all information contained +in the original composition. Unlike in the original compositional +space, in the transformed real space, standard statistical tools may +be applied. For a given set of samples consisting of measurements of +taxa, we transform data into a new space of samples and orthonormal +coordinates termed ‘balances’. Each balance is associated with a +single internal node of a phylogenetic tree with the taxa as +leaves. The balance represents the log-ratio of the geometric mean +abundance of the two groups of taxa that descend from the given +internal node. More details on this method can be found in +@silverman2017 ([Link](https://elifesciences.org/content/6/e21887)). + +The analysis uses abundance table and a phylogenetic tree. These can +be provided as separate data objects, or embedded in standard +R/Bioconductor data containers. The philr R package supports two +alternative data containers for microbiome data, `TreeSE` [@huang2021] +and `phyloseq` [@mcmurdie2013]. + + +# Loading and Preprocessing Dataset + +We demonstrate PhILR analysis by using the Global Patterns +dataset that was originally published by @caporaso2011. + +Let us first load necessary libraries. ```{r, echo=TRUE, message=FALSE} library(philr); packageVersion("philr") -library(phyloseq); packageVersion("phyloseq") library(ape); packageVersion("ape") library(ggplot2); packageVersion("ggplot2") +``` + + +# Data preparation: TreeSE -data(GlobalPatterns) +We show the GlobalPatterns example workflow as initially +outlined in [@mcmurdie2013]. + +We retrieve the example data in `TreeSummarizedExperiment` (`TreeSE`) +data format in this vignette [@huang2021], and then show example also +for the [phyloseq](philr-intro-phyloseq.md) format. The TreeSE version +for the GlobalPatterns data is provided with the [`mia` +package](microbiome.github.io/mia) [@lahti2020]. + +Let us load the data. + +```{r, echo=TRUE, message=FALSE} +library(mia); packageVersion("mia") +library(dplyr); packageVersion("dplyr") +data(GlobalPatterns, package = "mia") ``` + ## Filter Extremely Low-Abundance OTUs -Taxa that were not seen with more than 3 counts in at least 20% of samples are filtered. -Subsequently, those witha coefficient of variation ≤ 3 are filtered. These steps follow -those of [@mcmurdie2013]. Finally we add a pseudocount of 1 to the remaining OTUs to avoid -calculating log-ratios involving zeros. Alternatively other replacement methods -(multiplicative replacement etc...) may be used instead if desired; the -subsequent taxa weighting procedure we will describe complements a variety of -zero replacement methods. + +Taxa that were not seen with more than 3 counts in at least 20% of +samples are filtered. Subsequently, those with a coefficient of +variation ≤ 3 are filtered. Finally we add a pseudocount of 1 to the +remaining OTUs to avoid calculating log-ratios involving +zeros. Alternatively other replacement methods (multiplicative +replacement etc...) may be used instead if desired; the subsequent +taxa weighting procedure we will describe complements a variety of +zero replacement methods. ```{r, message=FALSE, warning=FALSE} -GP <- filter_taxa(GlobalPatterns, function(x) sum(x > 3) > (0.2*length(x)), TRUE) -GP <- filter_taxa(GP, function(x) sd(x)/mean(x) > 3.0, TRUE) -GP <- transform_sample_counts(GP, function(x) x+1) +## Select prevalent taxa +tse <- GlobalPatterns %>% subsetByPrevalentTaxa( + detection = 3, + prevalence = 20/100, + as_relative = FALSE) + +## Pick taxa that have notable abundance variation across sammples +variable.taxa <- apply(assays(tse)$counts, 1, function(x) sd(x)/mean(x) > 3.0) +tse <- tse[variable.taxa,] +# Collapse the tree! +# Otherwise the original tree with all nodes is kept +# (including those that were filtered out from rowData) +tree <- ape::keep.tip(phy = rowTree(tse), tip = rowLinks(tse)$nodeNum) +rowTree(tse) <- tree + +## Add a new assay with a pseudocount +assays(tse)$counts.shifted <- assays(tse)$counts + 1 ``` -With these two commands we have removed the filtered taxa from the OTU table, +We have now removed the filtered taxa from the OTU table, pruned the phylogenetic tree, and subset the taxa table. -Here is the result of those filtering steps +Here is the result of those filtering steps. + ```{r, echo=FALSE} -GP +tse ``` + + ## Process Phylogenetic Tree -Next we check that the tree is rooted and binary (all multichotomies have been resolved). + +Next we check that the tree is rooted and binary (all multichotomies +have been resolved). + ```{r, message=FALSE, warning=FALSE} -is.rooted(phy_tree(GP)) # Is the tree Rooted? -is.binary.tree(phy_tree(GP)) # All multichotomies resolved? +library(ape); packageVersion("ape") +is.rooted(tree) # Is the tree Rooted? +is.binary(tree) # All multichotomies resolved? ``` -Note that if the tree is not binary, the function `multi2di` from the `ape` package -can be used to replace multichotomies with a series of dichotomies with one (or several) -branch(es) of zero length . +Note that if the tree is not binary, the function `multi2di` from the +`ape` package can be used to replace multichotomies with a series of +dichotomies with one (or several) branch(es) of zero length. -Once this is done, we name the internal nodes of the tree so they -are easier to work with. We prefix the node number with `n` and thus the root -is named `n1`. +Once this is done, we name the internal nodes of the tree so they are +easier to work with. We prefix the node number with `n` and thus the +root is named `n1`. ```{r, message=FALSE, warning=FALSE} -phy_tree(GP) <- makeNodeLabel(phy_tree(GP), method="number", prefix='n') +tree <- makeNodeLabel(tree, method="number", prefix='n') + +# Add the modified tree back to the (`TreeSE`) data object +rowTree(tse) <- tree ``` -We note that the tree is already rooted with Archea as the outgroup and -no multichotomies are present. This uses the function `name.balance` from the `philr` -package. This function uses a simple voting scheme to find a consensus naming -for the two clades that descend from a given balance. Specifically for a -balance named `x/y`, `x` refers to the consensus name of the clade in the numerator -of the log-ratio and `y` refers to the denominator. +We note that the tree is already rooted with Archea as the outgroup +and no multichotomies are present. This uses the function +`name.balance` from the `philr` package. This function uses a simple +voting scheme to find a consensus naming for the two clades that +descend from a given balance. Specifically for a balance named `x/y`, +`x` refers to the consensus name of the clade in the numerator of the +log-ratio and `y` refers to the denominator. + ```{r} -name.balance(phy_tree(GP), tax_table(GP), 'n1') +# Extract taxonomy table from the TreeSE object +tax <- rowData(tse)[,taxonomyRanks(tse)] + +# Get name balances +name.balance(tree, tax, 'n1') ``` + ## Investigate Dataset Components -Finally we transpose the OTU table (`philr` uses the conventions of the `compositions` -package for compositional data analysis in R, taxa are columns, samples are rows). -Then we will take a look at part of the dataset in more detail + +Finally we transpose the OTU table (`philr` uses the conventions of +the `compositions` package for compositional data analysis in R, taxa +are columns, samples are rows). Then we will take a look at part of +the dataset in more detail. + ```{r} -otu.table <- t(otu_table(GP)) -tree <- phy_tree(GP) -metadata <- sample_data(GP) -tax <- tax_table(GP) +otu.table <- t(as(assays(tse)$counts.shifted, "matrix")) +tree <- rowTree(tse) +metadata <- colData(tse) +tax <- rowData(tse)[,taxonomyRanks(tse)] otu.table[1:2,1:2] # OTU Table tree # Phylogenetic Tree @@ -100,7 +185,16 @@ head(tax,2) # taxonomy table ``` +A new variable distinguishing human/non-human: + +```{r} +human.samples <- factor(colData(tse)$SampleType %in% c("Feces", "Mock", "Skin", "Tongue")) +``` + + + # Transform Data using PhILR + The function `philr::philr()` implements a user friendly wrapper for the key steps in the philr transform. @@ -116,54 +210,105 @@ using the function `philr::phylo2sbp()` weights are either provided by the user or can be calculated by the function `philr::calculate.blw()`. -Note: The preprocessed OTU table should be passed -to the function `philr::philr()` before it is closed (normalized) to relative abundances, as -some of the preset weightings of the taxa use the original count data to down weight low -abundance taxa. +Note: The preprocessed OTU table should be passed to the function +`philr::philr()` before it is closed (normalized) to relative +abundances, as some of the preset weightings of the taxa use the +original count data to down weight low abundance taxa. Here we will use the same weightings as we used in the main paper. -```{r} +You can run `philr` with the abundance table and phylogenetic tree. + +```{r, message=FALSE} gp.philr <- philr(otu.table, tree, part.weights='enorm.x.gm.counts', ilr.weights='blw.sqrt') gp.philr[1:5,1:5] ``` -Now the transformed data is represented in terms of balances and since -each balance is associated with a single internal node of the tree, we denote the balances -using the same names we assigned to the internal nodes (e.g., `n1`). + +Alternatively, you can provide the data directly in `TreeSE` format. + +```{r, message=FALSE} +gp.philr <- philr(tse, abund_values = "counts.shifted", + part.weights='enorm.x.gm.counts', + ilr.weights='blw.sqrt') +``` + +Alternatively, you can provide the data in `phyloseq` format. For +simplicity, let us just convert the `TreeSE` object to `phyloseq` +object to give a brief example. + +```{r, message=FALSE} +pseq <- makePhyloseqFromTreeSummarizedExperiment(tse, abund_values="counts.shifted") +gp.philr <- philr(pseq, + part.weights='enorm.x.gm.counts', + ilr.weights='blw.sqrt') + +``` + + + +After running `philr` the transformed data is represented in terms of +balances and since each balance is associated with a single internal +node of the tree, we denote the balances using the same names we +assigned to the internal nodes (e.g., `n1`). + # Ordination in PhILR Space -Euclidean distance in PhILR space can be used for ordination analysis. We -do this ordination using tools from the `phyloseq` package. + +Euclidean distance in PhILR space can be used for ordination analysis. Let us first calculate distances and then calculate standard MDS ordination. ```{r} -gp.dist <- dist(gp.philr, method="euclidean") -gp.pcoa <- ordinate(GP, 'PCoA', distance=gp.dist) -plot_ordination(GP, gp.pcoa, color='SampleType') + geom_point(size=4) +# Distances between samples based on philr transformed data +gp.dist <- dist(gp.philr, method="euclidean") + +# Calculate MDS for the distance matrix +d <- as.data.frame(cmdscale(gp.dist)) +colnames(d) <- paste0("PC", 1:2) ``` -# Identify Balances that Distinguish Human/Non-Human -More than just ordination analysis, PhILR provides an entire coordinate system -in which standard multivariate tools can be used. Here we will make use of sparse -logistic regression (from the `glmnet` pacakge) to identify a small number of -balances that best distinguish human from non-human samples. -First we will make a new variable distinguishing human/non-human -```{r, message=FALSE, warning=FALSE} -sample_data(GP)$human <- factor(get_variable(GP, "SampleType") %in% c("Feces", "Mock", "Skin", "Tongue")) +# Visualization with TreeSE + +Let us next visualize the ordination. This example employs standard +tools for ordination and visualization that can be used regardless of +the preferred data container. Note that the `phyloseq` and `TreeSE` frameworks +may provide access to additional ordination and visualization methods. + +```{r} +# Add some metadata for the visualization +d$SampleType <- factor(metadata$SampleType) + +# Create a plot +ggplot(data = d, + aes(x=PC1, y=PC2, color=SampleType)) + + geom_point() + + labs(title = "Euclidean distances with phILR") ``` -Now we will fit a sparse logistic regression model (logistic regression with $l_1$ penalty) + + +# Identify Balances that Distinguish Human/Non-Human + +More than just ordination analysis, PhILR provides an entire +coordinate system in which standard multivariate tools can be +used. Here we will make use of sparse logistic regression (from the +`glmnet` package) to identify a small number of balances that best +distinguish human from non-human samples. + +Now we will fit a sparse logistic regression model (logistic +regression with $l_1$ penalty) + ```{r, message=FALSE, warning=FALSE} library(glmnet); packageVersion('glmnet') -glmmod <- glmnet(gp.philr, sample_data(GP)$human, alpha=1, family="binomial") +glmmod <- glmnet(gp.philr, human.samples, alpha=1, family="binomial") ``` -We will use a hard-threshold for the $l_1$ penalty of $\lambda = 0.2526$ which -we choose so that the resulting number of non-zero coefficients is $\approx 5$ -(for easy of visualization in this tutorial). +We will use a hard-threshold for the $l_1$ penalty of $\lambda = +0.2526$ which we choose so that the resulting number of non-zero +coefficients is $\approx 5$ (for easy of visualization in this +tutorial). ```{r} top.coords <- as.matrix(coefficients(glmmod, s=0.2526)) @@ -172,13 +317,17 @@ top.coords <- rownames(top.coords)[which(top.coords != 0)] ``` # Name Balances -To find the taxonomic labels that correspond to these balances we can use the function -`philr::name.balance()`. This funciton uses a simple voting scheme to name -the two descendent clades of a given balance separately. For a given clade, -the taxonomy table is subset to only contain taxa from that clade. Starting -at the finest taxonomic rank (e.g., species) the subset taxonomy table is checked to see -if any label (e.g., species name) represents ≥ threshold (default 95%) of -the table entries at that taxonomic rank. If no consensus identifier is found, the table is checked at the next-most specific taxonomic rank (etc...). + +To find the taxonomic labels that correspond to these balances we can +use the function `philr::name.balance()`. This funciton uses a simple +voting scheme to name the two descendent clades of a given balance +separately. For a given clade, the taxonomy table is subset to only +contain taxa from that clade. Starting at the finest taxonomic rank +(e.g., species) the subset taxonomy table is checked to see if any +label (e.g., species name) represents ≥ threshold (default 95%) of the +table entries at that taxonomic rank. If no consensus identifier is +found, the table is checked at the next-most specific taxonomic rank +(etc...). ```{r} tc.names <- sapply(top.coords, function(x) name.balance(tree, tax, x)) @@ -190,12 +339,13 @@ directly. ```{r} votes <- name.balance(tree, tax, 'n730', return.votes = c('up', 'down')) - -votes[[c('up.votes', 'Family')]] # Numerator at Family Level +votes[[c('up.votes', 'Family')]] # Numerator at Family Level votes[[c('down.votes', 'Family')]] # Denominator at Family Level ``` + + # Visualize Results ```{r, message=FALSE, warning=FALSE} @@ -203,16 +353,18 @@ library(ggtree); packageVersion("ggtree") library(dplyr); packageVersion('dplyr') ``` -Above we found the top 5 coordinates (balances) that distinguish whether a -sample is from a human or non-human source. Now using the `ggtree` [@yu2016] package we -can visualize these balances on the tree using the `geom_balance` object. -To use these functions we need to know the acctual node number (not just the names we have given) of these balances -on the tree. To convert between node number and name, we have added the functions -`philr::name.to.nn()` and `philr::nn.to.name()`. -In addition, it is important that we know which clade of the balance is in the -numerator (+) and which is in the denominator (-) of the log-ratio. To help us keep track -we have created the function `philr::annotate_balance()` which allows us to easily -label these two clades. +Above we found the top 5 coordinates (balances) that distinguish +whether a sample is from a human or non-human source. Now using the +`ggtree` [@yu2016] package we can visualize these balances on the tree +using the `geom_balance` object. To use these functions we need to +know the acctual node number (not just the names we have given) of +these balances on the tree. To convert between node number and name, +we have added the functions `philr::name.to.nn()` and +`philr::nn.to.name()`. In addition, it is important that we know +which clade of the balance is in the numerator (+) and which is in the +denominator (-) of the log-ratio. To help us keep track we have +created the function `philr::annotate_balance()` which allows us to +easily label these two clades. ```{r, message=FALSE, warning=FALSE} tc.nn <- name.to.nn(tree, top.coords) @@ -235,30 +387,43 @@ data to long format. We have included a function `philr::convert_to_long()` for this purpose. ```{r} -gp.philr.long <- convert_to_long(gp.philr, get_variable(GP, 'human')) %>% +gp.philr.long <- convert_to_long(gp.philr, human.samples) %>% filter(coord %in% top.coords) ggplot(gp.philr.long, aes(x=labels, y=value)) + geom_boxplot(fill='lightgrey') + facet_grid(.~coord, scales='free_x') + - xlab('Human') + ylab('Balance Value') + + labs(x = 'Human', y = 'Balance Value') + theme_bw() ``` # Use Balances for Dimension Reduction -Lets just look at balance n16 vs. balance n730 (the ones we annotated in the above tree). + +Lets just look at balance n16 vs. balance n730 (the ones we annotated +in the above tree). + ```{r, message=FALSE, warning=FALSE} library(tidyr); packageVersion('tidyr') gp.philr.long %>% - rename(Human=labels) %>% - filter(coord %in% c('n16', 'n730')) %>% - spread(coord, value) %>% + dplyr::rename(Human=labels) %>% + dplyr::filter(coord %in% c('n16', 'n730')) %>% + tidyr::spread(coord, value) %>% ggplot(aes(x=n16, y=n730, color=Human)) + geom_point(size=4) + - xlab(tc.names['n16']) + ylab(tc.names['n730']) + + labs(x = tc.names['n16'], y = tc.names['n730']) + theme_bw() ``` + +# Package versions + +```{r, echo=TRUE, message=FALSE} +sessionInfo() +``` + + # References + + diff --git a/vignettes/philr.bib b/vignettes/philr.bib index e6de1f8..68d80e0 100644 --- a/vignettes/philr.bib +++ b/vignettes/philr.bib @@ -1,3 +1,35 @@ + +@article{huang2021, + title = {{\it TreeSummarizedExperiment}: a S4 class for data with + hierarchical structure [version 2]}, + author = {Ruizhu Huang and Charlotte Soneson and Felix + G.M. Ernst and Kevin C. Rue-Albrecht and Guangchuang + Yu and Stephanie C. Hicks and Mark D. Robinson}, + year = 2021, + journal = {F1000Research}, + volume = 9, + pages = 1246, + doi = {10.12688/f1000research.26669.2} +} + + + + +@article{lahti2020, + title = {Upgrading the R/Bioconductor ecosystem for + microbiome research [version 1]}, + author = {Lahti, L and Ernst, FGM and Shetty, SA and Borman, T + and Huang, T and Braccia, DJ and Bravo, HC}, + year = 2020, + journal = {F1000Research}, + volume = 9, + pages = 1464, + note = {Presentation at the European Bioconductor Meeting}, + doi = {10.7490/f1000research.1118447.1}, + URL = {microbiome.github.io} +} + + @article{silverman2017, title = {A phylogenetic transform enhances analysis of compositional microbiota data}, author = {Justin D Silverman and Alex D Washburne and Sayan Mukherjee and Lawrence A David},