diff --git a/.gitignore b/.gitignore index 35485e96..04c16e17 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,4 @@ .Rhistory .RData .Ruserdata -pagoda2*.tar.gz \ No newline at end of file +pagoda2*.tar.gz diff --git a/CHANGELOG.md b/CHANGELOG.md index efff2ec9..b64e7577 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ ## Upcoming +## [1.0.3] - 2020-05-01 + +- Removed `jsDist()` as it's in sccore +- Removed `multi2dend()` as it's in sccore +- Removed strong dependency for p2data + ## [1.0.2] - 2020-03-03 ### Changed diff --git a/DESCRIPTION b/DESCRIPTION index 0e45adf6..91d4f894 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,11 @@ Package: pagoda2 Title: Single Cell Analysis and Differential Expression -Version: 1.0.2 +Version: 1.0.3 Authors@R: c(person("Nikolas","Barkas", email="nikolas_barkas@hms.harvard.edu", role="aut"), person("Viktor", "Petukhov", email="viktor.s.petuhov@ya.ru", role="aut"), person("Peter", "Kharchenko", email = "peter_kharchenko@hms.harvard.edu", role = "aut"), person("Simon", "Steiger", email = "simon.steiger@gmail.com", role = "ctb"), person("Evan", "Biederstedt", email="evan.biederstedt@gmail.com", role=c("cre", "aut"))) Description: Analyzing and interactively exploring large-scale single-cell RNA-seq datasets. 'pagoda2' primarily performs normalization and differential gene expression analysis, with an interactive application for exploring single-cell RNA-seq datasets. It performs basic tasks such as cell size normalization, gene variance normalization, and can be used to identify subpopulations and run differential expression within individual samples. 'pagoda2' was written to rapidly process modern large-scale scRNAseq datasets of approximately 1e6 cells. The companion web application allows users to explore which gene expression patterns form the different subpopulations within your data. The package also serves as the primary method for preprocessing data for conos, . This package interacts with data available through the 'p2data' package, which is available in a 'drat' repository. To access this data package, see the instructions at . The size of the 'p2data' package is approximately 6 MB. License: GPL-3 Copyright: See the file COPYRIGHTS for various pagoda2 copyright details Encoding: UTF-8 -LazyData: true Depends: R (>= 3.5.0), Matrix, igraph biocViews: diff --git a/R/Pagoda2.R b/R/Pagoda2.R index b6f78ac6..84f432b0 100644 --- a/R/Pagoda2.R +++ b/R/Pagoda2.R @@ -23,6 +23,7 @@ NULL #' @param n.cores numeric Number of cores to use (default=1) #' @param n.odgenes integer Number of overdispersed genes to retrieve (default=NULL). If NULL, will return all. #' @param verbose boolean Whether to give verbose output (default=TRUE) +#' @param batch fctor Batch factor for the dataset (default=NULL) #' @param lib.sizes character vector of library sizes (default=NULL) #' @param log.scale boolean If TRUE, scale counts by log() (default=TRUE) #' @param min.cells.per.gene integer Minimum number of cells per gene, used to subset counts for coverage (default=0) @@ -98,10 +99,6 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, min.transcripts.per.cell=10, batch=NULL, lib.sizes=NULL, log.scale=TRUE, keep.genes=NULL) { - if (!requireNamespace("p2data", quietly = TRUE)) { - stop("Package \"p2data\" needed for the Pagoda2 class to work. This can be installed via a drat repository, using \"install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')\". Please read the details provided within the README at https://github.com/kharchenkolab/pagoda2.", call. = FALSE) - } - if ('Pagoda2' %in% class(x)) { # copy constructor for (n in ls(x)) { if (!is.function(get(n, x))) assign(n, get(n, x), self) @@ -309,8 +306,8 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, #' @param persist boolean Whether to save results (default=TRUE, i.e. is.null(cells)). #' @examples #' \donttest{ - #' ## Load pre-generated a dataset of 3000 bone marrow cells as matrix - #' cm <- p2data::sample_BM1 + #' ## Load pre-generated a dataset of 50 bone marrow cells as matrix + #' cm <- readRDS(system.file("extdata", "sample_BM1_50.rds", package="pagoda2")) #' ## Perform QC, i.e. filter any cells that # ## don't fit the expected detected gene vs molecule count relationship #' counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) @@ -451,8 +448,8 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, #' @param var.scale boolean Apply scaling if using raw counts (default=TRUE). If type="counts", var.scale is TRUE by default. #' @examples #' \donttest{ - #' ## Load pre-generated a dataset of 3000 bone marrow cells as matrix - #' cm <- p2data::sample_BM1 + #' ## Load pre-generated a dataset of 50 bone marrow cells as matrix + #' cm <- readRDS(system.file("extdata", "sample_BM1_50.rds", package="pagoda2")) #' ## Perform QC, i.e. filter any cells that # ## don't fit the expected detected gene vs molecule count relationship #' counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=300) @@ -575,25 +572,6 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, #' @param min.cluster.size Minimum size of clusters (default=1). This parameter is primarily used to remove very small clusters. #' @param persist boolean Whether to save the clusters and community structure (default=TRUE) #' @param ... Additional parameters to pass to 'method' - #' @examples - #' \donttest{ - #' ## Load pre-generated a dataset of 3000 bone marrow cells as matrix - #' cm <- p2data::sample_BM1 - #' ## Perform QC, i.e. filter any cells that - # ## don't fit the expected detected gene vs molecule count relationship - #' counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=900) - #' rownames(counts) <- make.unique(rownames(counts)) - #' ## Generate Pagoda2 object - #' p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) - #' ## Normalize gene expression variance - #' p2_object$adjustVariance(plot=TRUE, gam.k=10) - #' ## Reduce the dataset dimensions by running PCA - #' p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) - #' ## Generate a KNN graph of cells that will allow us to identify clusters of cells - #' p2_object$makeKnnGraph(k=20, center=FALSE, distance='L2') - #' ## Call clusters based on KNN - #' p2_object$getKnnClusters(method=infomap.community, type='counts') - #' } #' #' @return the community structure calculated from 'method' getKnnClusters=function(type='counts',method=igraph::multilevel.community, name='community', @@ -662,32 +640,6 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, #' @param persist boolean Whether to save the clusters and community structure (default=TRUE) #' @param z.threshold numeric Threshold of z-scores to filter, >=z.threshold are kept (default=2) #' @param min.set.size integer Minimum threshold of sets to keep (default=5) - #' @examples - #' \donttest{ - #' ## Load pre-generated a dataset of 3000 bone marrow cells as matrix - #' cm <- p2data::sample_BM1 - #' ## Perform QC, i.e. filter any cells that - # ## don't fit the expected detected gene vs molecule count relationship - #' counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=400) - #' rownames(counts) <- make.unique(rownames(counts)) - #' ## Generate Pagoda2 object - #' p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) - #' ## Normalize gene expression variance - #' p2_object$adjustVariance(plot=TRUE, gam.k=10) - #' ## Reduce the dataset dimensions by running PCA - #' p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) - #' ## Generate a KNN graph of cells that will allow us to identify clusters of cells - #' p2_object$makeKnnGraph(k=20, type='PCA', center=FALSE, distance='cosine') - #' ## Call clusters based on KNN - #' p2_object$getKnnClusters(method=walktrap.community,type='PCA',name='walktrap') - #' ## Generate embedding of the data - #' p2_object$getEmbedding(type='PCA', embeddingType = 'largeVis', M=30, perplexity=30, gamma=1/30) - #' ## Perform differential expression - #' p2_object$getDifferentialGenes(type='PCA', verbose=TRUE, clusterType='walktrap') - #' ## Perform differential expression - #' hdea <- p2_object$getHierarchicalDiffExpressionAspects(type='PCA', - #' clusterName='walktrap', z.threshold=3) - #' } #' #' @return hierarchical clustering getHierarchicalDiffExpressionAspects = function(type='counts', groups=NULL, clusterName=NULL, method='ward.D', @@ -751,7 +703,7 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, # this one has to be done on counts, even if we're working with a reduction cl <- as.factor(cl[match(rownames(self$misc[['rawCounts']]),names(cl))]) lvec <- colSumByFac(self$misc[['rawCounts']],as.integer(cl))[-1,] + 1 - d <- jsDist(t(lvec/pmax(1,Matrix::rowSums(lvec)))) + d <- sccore::jsDist(t(lvec/pmax(1,Matrix::rowSums(lvec)))) colnames(d) <- rownames(d) <- which(table(cl)>0) d <- as.dist(d) } else { @@ -839,22 +791,6 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, #' @param fastpath boolean Whether to try a (fast) C algorithm implementation if possible (default=TRUE). This parameter is equivalent to 'fastpath' in irlba::irlba(). #' @param maxit integer Maximum number of iterations (default=1000). This parameter is equivalent to 'maxit' in irlba::irlba(). #' @param k integer Number of k clusters for calculating k-NN on the resulting principal components (default=30). - #' @examples - #' \donttest{ - #' ## Load pre-generated a dataset of 3000 bone marrow cells as matrix - #' cm <- p2data::sample_BM1 - #' ## Perform QC, i.e. filter any cells that - # ## don't fit the expected detected gene vs molecule count relationship - #' counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) - #' rownames(counts) <- make.unique(rownames(counts)) - #' ## Generate Pagoda2 object - #' p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) - #' ## Normalize gene expression variance - #' p2_object$adjustVariance(plot=TRUE, gam.k=10) - #' ## Reduce the dataset dimensions by running PCA - #' p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) - #' p2_object$makeGeneKnnGraph(nPcs=50, k=20, center=TRUE) - #' } #' #' @return graph with gene similarity makeGeneKnnGraph = function(nPcs=100, center=TRUE, fastpath=TRUE, maxit=1000, k=30, n.cores=self$n.cores, verbose=TRUE) { @@ -910,26 +846,6 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, #' @param s numeric The “saturation” to be used to complete the HSV color descriptions (default=1). Equivalent to the 's' parameter in grDevices::rainbow(). #' @param verbose boolean Whether to give verbose output (default=TRUE) #' @param ... Additional parameters passed to dbscan::dbscan(emb, ...) - #' @examples - #' \donttest{ - #' ## Load pre-generated a dataset of 3000 bone marrow cells as matrix - #' cm <- p2data::sample_BM1 - #' ## Perform QC, i.e. filter any cells that - # ## don't fit the expected detected gene vs molecule count relationship - #' counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) - #' rownames(counts) <- make.unique(rownames(counts)) - #' ## Generate Pagoda2 object - #' p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) - #' ## Normalize gene expression variance - #' p2_object$adjustVariance(plot=TRUE, gam.k=10) - #' ## Reduce the dataset dimensions by running PCA - #' p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) - #' ## Generate a KNN graph of cells that will allow us to identify clusters of cells - #' p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') - #' ## Generate embedding of the data - #' p2_object$getEmbedding(type='PCA', embeddingType = 'UMAP', M=20, perplexity=30, gamma=1/20) - #' p2_object$getDensityClusters(type='PCA') - #' } #' #' @return density-based clusters getDensityClusters=function(type='counts', embeddingType=NULL, name='density', eps=0.5, v=0.7, s=1, verbose=TRUE, ...) { @@ -973,29 +889,6 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, #' @param verbose boolean Whether to give verbose output (default=FALSE) #' @param append.specificity.metrics boolean Whether to append specifity metrics (default=TRUE). Uses the function sccore::appendSpecificityMetricsToDE(). #' @param append.auc boolean If TRUE, append AUC values (default=FALSE). Parameter ignored if append.specificity.metrics is FALSE. - #' @examples - #' \donttest{ - #' ## Load pre-generated a dataset of 3000 bone marrow cells as matrix - #' cm <- p2data::sample_BM1 - #' ## Perform QC, i.e. filter any cells that - # ## don't fit the expected detected gene vs molecule count relationship - #' counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) - #' rownames(counts) <- make.unique(rownames(counts)) - #' ## Generate Pagoda2 object - #' p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) - #' ## Normalize gene expression variance - #' p2_object$adjustVariance(plot=TRUE, gam.k=10) - #' ## Reduce the dataset dimensions by running PCA - #' p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) - #' ## Generate a KNN graph of cells that will allow us to identify clusters of cells - #' p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') - #' ## Call clusters based on KNN - #' p2_object$getKnnClusters(method=multilevel.community,type='PCA',name='multilevel') - #' ## Generate embedding of the data - #' p2_object$getEmbedding(type='PCA', embeddingType = 'UMAP', M=20, perplexity=30, gamma=1/20) - #' ## Perform differential expression - #' p2_object$getDifferentialGenes(type='PCA',verbose=TRUE,clusterType='multilevel') - #' } #' #' @return List with each element of the list corresponding to a cell group in the provided/used factor (i.e. factor levels) #' Each element of a list is a data frame listing the differentially epxressed genes (row names), with the following columns: @@ -1255,28 +1148,10 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, }, #' @description Recalculate library sizes using robust regression within clusters - #' - #' @examples - #' \donttest{ - #' ## Load pre-generated a dataset of 3000 bone marrow cells as matrix - #' cm <- p2data::sample_BM1 - #' ## Perform QC, i.e. filter any cells that - # ## don't fit the expected detected gene vs molecule count relationship - #' counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) - #' rownames(counts) <- make.unique(rownames(counts)) - #' ## Generate Pagoda2 object - #' p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) - #' ## Normalize gene expression variance - #' p2_object$adjustVariance(plot=TRUE, gam.k=10) - #' ## Reduce the dataset dimensions by running PCA - #' p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) - #' ## Generate a kNN graph of cells that will allow us to identify clusters of cells - #' p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') - #' ## Call clusters based on kNN - #' p2_object$getKnnClusters(method=infomap.community, type='PCA') - #' p2_object$getRefinedLibSizes(type='PCA') - #' lib.sizes <- p2_object$getRefinedLibSizes(type="PCA") - #' } + #' @param clusterType Name of cluster to access (default=NULL). If NULL, takes the most recently generated clustering. Parameter ignored if groups is not NULL. + #' @param groups factor named with cell names specifying the clusters of cells (default=NULL) + #' @param type string Either 'counts' or the name of a stored embedding, names(self$embeddings) (default=NULL) + #' @param n.cores numeric Number of cores to use (default=self$n.cores=1) #' #' @return recalculated library sizes getRefinedLibSizes=function(clusterType=NULL, groups=NULL, type='counts', n.cores=self$n.cores) { @@ -1347,32 +1222,6 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, #' @param useRaster boolean If TRUE a bitmap raster is used to plot the image instead of polygons (default=TRUE). The grid must be regular in that case, otherwise an error is raised. For more information, see graphics::image(). #' @param smooth.span (default=max(1,round(nrow(self$counts)/1024))) #' @param ... Additional parameters passed to internal function used for heatmap plotting, my.heatmap2() - #' @examples - #' \donttest{ - #' ## Load pre-generated a dataset of 3000 bone marrow cells as matrix - #' cm <- p2data::sample_BM1 - #' ## Perform QC, i.e. filter any cells that - # ## don't fit the expected detected gene vs molecule count relationship - #' counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) - #' rownames(counts) <- make.unique(rownames(counts)) - #' ## Generate Pagoda2 object - #' p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) - #' ## Normalize gene expression variance - #' p2_object$adjustVariance(plot=TRUE, gam.k=10) - #' ## Reduce the dataset dimensions by running PCA - #' p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) - #' ## Generate a kNN graph of cells that will allow us to identify clusters of cells - #' p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') - #' ## Call clusters based on kNN - #' p2_object$getKnnClusters(method=multilevel.community,type='PCA',name='multilevel') - #' ## Generate embedding of the data - #' p2_object$getEmbedding(type='PCA', embeddingType = 'UMAP', M=20, perplexity=30, gamma=1/20) - #' ## Perform differential expression - #' p2_object$getDifferentialGenes(type='PCA',verbose=TRUE,clusterType='multilevel') - #' de <- p2_object$diffgenes$PCA[[1]][['2']] - #' p2_object$plotGeneHeatmap(genes=rownames(de)[1:15], - #' groups=p2_object$clusters$PCA[[1]], cluster.genes=TRUE) - #' } #' #' @return plot of gene heatmap plotGeneHeatmap=function(genes, type='counts', clusterType=NULL, groups=NULL, @@ -1496,31 +1345,6 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, #' @param gene (default=NULL) #' @param plot.theme (default=ggplot2::theme_bw()) #' @param ... Additional parameters passed to sccore::embeddingPlot() - #' @examples - #' \donttest{ - #' ## Load pre-generated a dataset of 3000 bone marrow cells as matrix - #' cm <- p2data::sample_BM1 - #' ## Perform QC, i.e. filter any cells that - # ## don't fit the expected detected gene vs molecule count relationship - #' counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) - #' rownames(counts) <- make.unique(rownames(counts)) - #' ## Generate Pagoda2 object - #' p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) - #' ## Normalize gene expression variance - #' p2_object$adjustVariance(plot=TRUE, gam.k=10) - #' ## Reduce the dataset dimensions by running PCA - #' p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) - #' ## Generate a kNN graph of cells that will allow us to identify clusters of cells - #' p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') - #' ## Call clusters based on kNN - #' p2_object$getKnnClusters(method=multilevel.community,type='PCA',name='multilevel') - #' ## Generate embedding of the data - #' p2_object$getEmbedding(type='PCA', embeddingType = 'UMAP', M=20, perplexity=30, gamma=1/20) - #' library(ggplot2) - #' p2_object$plotEmbedding(type='PCA', show.legend=FALSE, mark.groups=TRUE, min.cluster.size=50, - #' shuffle.colors=FALSE, font.size=1, alpha=0.1, title='clusters (UMAP)', - #' plot.theme=theme(plot.title = element_text(hjust = 0.5))) - #' } #' #' @return plot of the embedding plotEmbedding=function(type=NULL, embeddingType=NULL, clusterType=NULL, @@ -1584,30 +1408,6 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, #' #' @param alpha numeric The Type I error probability or the significance level (default=5e-2). This is the criterion used to measure statistical significance, i.e. if the p-value < alpha, then it is statistically significant. #' @param use.unadjusted.pvals boolean Whether to use Benjamini-Hochberg adjusted p-values (default=FALSE). - #' @examples - #' \donttest{ - #' ## Load pre-generated a dataset of 3000 bone marrow cells as matrix - #' cm <- p2data::sample_BM1 - #' ## Perform QC, i.e. filter any cells that - # ## don't fit the expected detected gene vs molecule count relationship - #' counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) - #' rownames(counts) <- make.unique(rownames(counts)) - #' ## Generate Pagoda2 object - #' p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) - #' ## Normalize gene expression variance - #' p2_object$adjustVariance(plot=TRUE, gam.k=10) - #' ## Reduce the dataset dimensions by running PCA - #' p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) - #' ## Generate a kNN graph of cells that will allow us to identify clusters of cells - #' p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') - #' ## Call clusters based on kNN - #' p2_object$getKnnClusters(method=infomap.community, type='PCA') - #' ## Generate embedding of the data - #' p2_object$getEmbedding(type='PCA', M=20, perplexity=30, gamma=1/20) - #' ## Perform differential expression - #' p2_object$getDifferentialGenes(type='PCA',verbose=TRUE) - #' odGenes <- p2_object$getOdGenes(use.unadjusted.pvals=FALSE) - #' } #' #' @return vector of overdispersed genes getOdGenes=function(n.odgenes=NULL, alpha=5e-2, use.unadjusted.pvals=FALSE) { @@ -1628,20 +1428,6 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, #' @description Return variance-normalized matrix for specified genes or a number of OD genes #' #' @param genes vector of gene names to explicitly return (default=NULL) - #' @examples - #' \donttest{ - #' ## Load pre-generated a dataset of 3000 bone marrow cells as matrix - #' cm <- p2data::sample_BM1 - #' ## Perform QC, i.e. filter any cells that - # ## don't fit the expected detected gene vs molecule count relationship - #' counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) - #' rownames(counts) <- make.unique(rownames(counts)) - #' ## Generate Pagoda2 object - #' p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) - #' ## Normalize gene expression variance - #' p2_object$adjustVariance(plot=TRUE, gam.k=10) - #' p2_object$getNormalizedExpressionMatrix() - #' } #' #' @return cell by gene matrix getNormalizedExpressionMatrix=function(genes=NULL, n.odgenes=NULL) { @@ -1667,21 +1453,6 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, #' @param maxit numeric Maximum number of iterations (default=100). For more information, see 'maxit' parameter in irlba::irlba(). #' @param var.scale boolean Apply scaling if using raw counts (default=TRUE). If type="counts", var.scale is TRUE by default. #' @param ... additional arguments forwarded to irlba::irlba - #' @examples - #' \donttest{ - #' ## Load pre-generated a dataset of 3000 bone marrow cells as matrix - #' cm <- p2data::sample_BM1 - #' ## Perform QC, i.e. filter any cells that - # ## don't fit the expected detected gene vs molecule count relationship - #' counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=600) - #' rownames(counts) <- make.unique(rownames(counts)) - #' ## Generate Pagoda2 object - #' p2_object <- Pagoda2$new(counts, log.scale=FALSE, min.cells.per.gene=30, n.cores=1) - #' ## Normalize gene expression variance - #' p2_object$adjustVariance(plot=TRUE, gam.k=15) - #' ## Reduce the dataset dimensions by running PCA - #' p2_object$calculatePcaReduction(nPcs=50, n.odgenes=2e3) - #' } #' #' @return Invisible PCA result (the reduction itself is saved in self$reductions[[name]])" calculatePcaReduction=function(nPcs=20, type='counts', name='PCA', use.odgenes=TRUE, n.odgenes=NULL, @@ -1783,30 +1554,6 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, #' @param min.odgenes integer Minimum number of overdispersed genes to use (default=10) #' @param max.odgenes integer Maximum number of overdispersed genes to use (default=Inf) #' @param recursive boolean Whether to determine groups for which variance normalization will be rerun (default=TRUE) - #' @examples - #' \donttest{ - #' ## Load pre-generated a dataset of 3000 bone marrow cells as matrix - #' cm <- p2data::sample_BM1 - #' ## Perform QC, i.e. filter any cells that - # ## don't fit the expected detected gene vs molecule count relationship - #' counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) - #' rownames(counts) <- make.unique(rownames(counts)) - #' ## Generate Pagoda2 object - #' p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) - #' ## Normalize gene expression variance - #' p2_object$adjustVariance(plot=TRUE, gam.k=10) - #' ## Reduce the dataset dimensions by running PCA - #' p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) - #' ## Generate a kNN graph of cells that will allow us to identify clusters of cells - #' p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') - #' ## Call clusters based on kNN - #' p2_object$getKnnClusters(method=infomap.community, type='PCA') - #' ## Generate embedding of the data - #' p2_object$getEmbedding(type='PCA', M=20, perplexity=30, gamma=1/20) - #' ## Perform differential expression - #' p2_object$getDifferentialGenes(type='PCA',verbose=TRUE) - #' p2_object$expandOdGenes(type='PCA') - #' } #' #' @return List of overdispersed genes expandOdGenes=function(type='counts', clusterType=NULL, groups=NULL , min.group.size=30, od.alpha=1e-1, @@ -2465,27 +2212,6 @@ Pagoda2 <- R6::R6Class("Pagoda2", lock_objects=FALSE, #' @param distance string 'pearson', 'spearman', 'euclidean', 'L2', 'JS' (default='pearson') #' @param n.sgd.cores numeric Number of cores to use (default=n.cores) #' @param ... Additional parameters passed to embedding functions, Rtsne::Rtsne() if 'L2', uwot::umap() if 'UMAP', embedKnnGraphUmap() if 'UMAP_graph' - #' @examples - #' \donttest{ - #' ## Load pre-generated a dataset of 3000 bone marrow cells as matrix - #' cm <- p2data::sample_BM1 - #' ## Perform QC, i.e. filter any cells that - # ## don't fit the expected detected gene vs molecule count relationship - #' counts <- gene.vs.molecule.cell.filter(cm,min.cell.size=500) - #' rownames(counts) <- make.unique(rownames(counts)) - #' ## Generate Pagoda2 object - #' p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) - #' ## Normalize gene expression variance - #' p2_object$adjustVariance(plot=TRUE, gam.k=10) - #' ## Reduce the dataset dimensions by running PCA - #' p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) - #' ## Generate a kNN graph of cells that will allow us to identify clusters of cells - #' p2_object$makeKnnGraph(k=40, type='PCA', center=TRUE, distance='cosine') - #' ## Call clusters based on kNN - #' p2_object$getKnnClusters(method=infomap.community, type='PCA') - #' ## Generate embedding of the data - #' p2_object$getEmbedding(type='PCA', embeddingType = 'UMAP', M=30, perplexity=30, gamma=1/30) - #' } #' #' @return embedding stored in self$embedding getEmbedding=function(type='counts', embeddingType='largeVis', name=NULL, dims=2, M=1, gamma=1/M, perplexity=50, verbose=TRUE, diff --git a/R/RcppExports.R b/R/RcppExports.R index 45bd8b96..9291ec57 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -41,10 +41,6 @@ inplaceWinsorizeSparseCols <- function(sY, n, ncores = 1L) { .Call('_pagoda2_inplaceWinsorizeSparseCols', PACKAGE = 'pagoda2', sY, n, ncores) } -jsDist <- function(m, ncores = 1L) { - .Call('_pagoda2_jsDist', PACKAGE = 'pagoda2', m, ncores) -} - orderColumnRows <- function(p, i) { .Call('_pagoda2_orderColumnRows', PACKAGE = 'pagoda2', p, i) } diff --git a/R/helpers.R b/R/helpers.R index 81a0fbea..376e1f35 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -18,48 +18,6 @@ NULL library.dynam.unload("pagoda2", libpath) } -#' Translate multilevel segmentation into a dendrogram, with the lowest level of the dendrogram listing the cells -#' -#' @param cl clusters -#' @param counts matrix of counts -#' @param deep boolean (default=FALSE) -#' @param dist character vector Distance metric (default='cor') -#' @return cell dendrogram -#' @keywords internal -multi2dend <- function(cl, counts, deep=FALSE, dist='cor') { - if (deep) { - clf <- as.integer(cl$memberships[1,]) # take the lowest level - } else { - clf <- as.integer(membership(cl)) - } - names(clf) <- names(membership(cl)) - clf.size <- unlist(tapply(clf,factor(clf,levels=seq(1,max(clf))),length)) - rowFac <- rep(NA, nrow(counts)); - rowFac[match(names(clf),rownames(counts))] <- clf - lvec <- colSumByFac(counts,rowFac)[-1,,drop=FALSE] - if(dist=='JS') { - lvec.dist <- jsDist(t(lvec/pmax(1,Matrix::rowSums(lvec)))) - } else { # use correlation distance in log10 space - lvec.dist <- 1-cor(t(log10(lvec/pmax(1,Matrix::rowSums(lvec))+1))) - } - d <- as.dendrogram(stats::hclust(as.dist(lvec.dist),method='ward.D')) - # add cell info to the laves - addinfo <- function(l, env) { - v <- as.integer(mget("index",envir=env,ifnotfound=0)[[1]])+1; - attr(l,'nodeId') <- v - assign("index",v,envir=env) - attr(l,'nCells') <- sum(clf.size[as.integer(unlist(l))]) - if(is.leaf(l)) { - attr(l,'cells') <- names(clf)[clf==attr(l,'label')] - } - attr(l,'root') <- FALSE - return(l); - } - d <- dendrapply(d,addinfo,env=environment()) - attr(d,'root') <- TRUE - return(d) -} - #' Quick utility to check if given character vector is colors #' Thanks to Stackoverflow: http://stackoverflow.com/questions/13289009/check-if-character-string-is-a-valid-color-representation #' @@ -77,7 +35,7 @@ areColors <- function(x) { #' @param mc.preschedule See ?parallel::mclapply (default=FALSE). If TRUE then the computation is first divided to (at most) as many jobs are there are cores and then the jobs are started, each job possibly covering more than one value. If FALSE, then one job is forked for each value of X. The former is better for short computations or large number of values in X, the latter is better for jobs that have high variance of completion time and not too many values of X compared to mc.cores. #' @return list, as returned by lapply #' @keywords internal -papply <- function(..., n.cores=parallel::detectCores(), mc.preschedule=FALSE) { +papply <- function(..., n.cores=parallel::detectCores(), mc.preschedule=FALSE) { # TODO: replace it with sccore::plapply if(n.cores>1) { if(requireNamespace("parallel", quietly = TRUE)) { return(mclapply(...,mc.cores=n.cores,mc.preschedule=mc.preschedule)) diff --git a/R/largeVis.R b/R/largeVis.R index 8f2dcbab..f796c926 100644 --- a/R/largeVis.R +++ b/R/largeVis.R @@ -71,18 +71,6 @@ buildWijMatrix <- function(x, threads = NULL, perplexity = 50) { #' is to use the edge weights, consistent with the reference implementation. #' #' @return A dense [N,D] matrix of the coordinates projecting the w_ij matrix into the lower-dimensional space. -#' @examples -#' \donttest{ -#' data(CO2) -#' CO2$Plant <- as.integer(CO2$Plant) -#' CO2$Type <- as.integer(CO2$Type) -#' CO2$Treatment <- as.integer(CO2$Treatment) -#' co <- scale(as.matrix(CO2)) -#' # Very small datasets often produce a warning regarding the alias table. This is safely ignored. -#' suppressWarnings(vis <- largeVis(t(co), K = 20, sgd_batches = 1, threads = 2)) -#' suppressWarnings(coords <- projectKNNs(vis$wij, threads = 2)) -#' plot(t(coords)) -#' } #' @export projectKNNs <- function(wij, # symmetric sparse matrix dim = 2, # dimension of the projection space @@ -98,10 +86,14 @@ projectKNNs <- function(wij, # symmetric sparse matrix threads = NULL, verbose = getOption("verbose", TRUE)) { - if (alpha < 0) stop("alpha < 0 is meaningless") + if (alpha < 0) { + stop("alpha < 0 is meaningless") + } N <- (length(wij@p) - 1) js <- rep(0:(N - 1), diff(wij@p)) - if (any(is.na(js))) stop("NAs in the index vector.") + if (any(is.na(js))) { + stop("NAs in the index vector") + } is <- wij@i ############################################## diff --git a/R/pagoda2WebApp.R b/R/pagoda2WebApp.R index 445d9f05..f8fe752b 100644 --- a/R/pagoda2WebApp.R +++ b/R/pagoda2WebApp.R @@ -136,7 +136,7 @@ pagoda2WebApp$methods( lvec <- t(lvec/pmax(1,rowSums(lvec))) colnames(lvec) <- which(table(cl0)>0) rownames(lvec) <- colnames(r$misc[['rawCounts']]) - ld <- jsDist(lvec) + ld <- sccore::jsDist(lvec) colnames(ld) <- rownames(ld) <- colnames(lvec) #hcGroup is a hclust object of whatever cell groupings we provided above diff --git a/R/pipelineHelpers.R b/R/pipelineHelpers.R index 6ba62194..fdce4dec 100644 --- a/R/pipelineHelpers.R +++ b/R/pipelineHelpers.R @@ -19,22 +19,12 @@ #' @param get.tsne boolean Whether to calculate tSNE embedding (default=TRUE) #' @param make.geneknn boolean Whether pre-calculate gene kNN (for gene search) (default=TRUE) #' @return a new 'Pagoda2' object -#' @examples -#' \donttest{ -#' ## load count matrix -#' cm <- p2data::sample_BM1 -#' ## perform basic p2 processing -#' p2 <- basicP2proc(cm) -#' } #' #' @export basicP2proc <- function(cd, n.cores=1, n.odgenes=3e3, nPcs=100, k=30, perplexity=50, log.scale=TRUE, trim=10, keep.genes=NULL, min.cells.per.gene=0, min.transcripts.per.cell=100, get.largevis=TRUE, get.tsne=TRUE, make.geneknn=TRUE) { - if (!requireNamespace("p2data", quietly = TRUE)) { - stop("Package \"p2data\" needed for the Pagoda2 class to work. This can be installed via a drat repository, using \"install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')\". Please read the details provided within the README at https://github.com/kharchenkolab/pagoda2.", call. = FALSE) - } rownames(cd) <- make.unique(rownames(cd)) ## Basic Processing p2 <- Pagoda2$new(cd, n.cores = n.cores, keep.genes = keep.genes, trim=trim, log.scale=log.scale, min.cells.per.gene=min.cells.per.gene, min.transcripts.per.cell=min.transcripts.per.cell) @@ -77,10 +67,6 @@ basicP2proc <- function(cd, n.cores=1, n.odgenes=3e3, nPcs=100, k=30, perplexity #' @export extendedP2proc <- function(p2, organism = 'hs') { - if (!requireNamespace("p2data", quietly = TRUE)) { - stop("Package \"p2data\" needed for the Pagoda2 class to work. This can be installed via a drat repository, using \"install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')\". Please read the details provided within the README at https://github.com/kharchenkolab/pagoda2.", call. = FALSE) - } - if (organism == 'hs') { go.env <- p2.generate.human.go(p2) } else if (organism == 'mm') { @@ -201,19 +187,10 @@ webP2proc <- function(p2, additionalMetadata=NULL, title='Pagoda2', #' annotated at that GO term or to one of its child nodes in the GO ontology (default=NULL) #' @param eg.alias2eg mappings between common gene symbol identifiers and entrez gene identifiers (default=NULL) #' @param min.env.length numeric Minimum environment length (default=5) -#' @examples -#' \donttest{ -#' cm <- p2data::sample_BM1 -#' p2 <- basicP2proc(cm) -#' p2.generate.go(p2, organism='hs') -#' } #' #' @export p2.generate.go <- function(r, organism=NULL, go2all.egs=NULL, eg.alias2eg=NULL, min.env.length=5) { - if (!requireNamespace("p2data", quietly = TRUE)) { - stop("Package \"p2data\" needed for the Pagoda2 class to work. This can be installed via a drat repository, using \"install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')\". Please read the details provided within the README at https://github.com/kharchenkolab/pagoda2.", call. = FALSE) - } if (is.null(organism) && (is.null(go2all.egs) || is.null(eg.alias2eg))) { stop('Either organism or go2all.egs and eg.alias2eg must be specified'); @@ -265,18 +242,9 @@ p2.generate.go <- function(r, organism=NULL, go2all.egs=NULL, eg.alias2eg=NULL, #' #' @param r a 'Pagoda2' object #' @return a GO environment object -#' @examples -#' \donttest{ -#' cm <- p2data::sample_BM1 -#' p2 <- basicP2proc(cm) -#' p2.generate.dr.go(p2) -#' } #' #' @export p2.generate.dr.go <- function(r) { - if (!requireNamespace("p2data", quietly = TRUE)) { - stop("Package \"p2data\" needed for the Pagoda2 class to work. This can be installed via a drat repository, using \"install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')\". Please read the details provided within the README at https://github.com/kharchenkolab/pagoda2.", call. = FALSE) - } p2.generate.go(r, "dr") } @@ -285,18 +253,9 @@ p2.generate.dr.go <- function(r) { #' #' @param r a 'Pagoda2' object #' @return a GO environment object -#' @examples -#' \donttest{ -#' cm <- p2data::sample_BM1 -#' p2 <- basicP2proc(cm) -#' p2.generate.human.go(p2) -#' } #' #' @export p2.generate.human.go <- function(r) { - if (!requireNamespace("p2data", quietly = TRUE)) { - stop("Package \"p2data\" needed for the Pagoda2 class to work. This can be installed via a drat repository, using \"install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')\". Please read the details provided within the README at https://github.com/kharchenkolab/pagoda2.", call. = FALSE) - } p2.generate.go(r, "hs") } @@ -304,18 +263,9 @@ p2.generate.human.go <- function(r) { #' #' @param r a 'Pagoda2' object #' @return a GO environment object -#' @examples -#' \donttest{ -#' cm <- p2data::sample_BM1 -#' p2 <- basicP2proc(cm) -#' p2.generate.mouse.go(p2) -#' } #' #' @export p2.generate.mouse.go <- function(r) { - if (!requireNamespace("p2data", quietly = TRUE)) { - stop("Package \"p2data\" needed for the Pagoda2 class to work. This can be installed via a drat repository, using \"install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')\". Please read the details provided within the README at https://github.com/kharchenkolab/pagoda2.", call. = FALSE) - } p2.generate.go(r, "mm") } diff --git a/README.md b/README.md index 9c0ed2b9..cce1bf08 100644 --- a/README.md +++ b/README.md @@ -53,32 +53,6 @@ install.packages('devtools') devtools::install_github('kharchenkolab/pagoda2', build_vignettes = TRUE) ``` -Please note that the package `pagoda2` depends on data in a data package (`p2data`) that is available through a `drat` repository on GitHub. To use the `pagoda2` package, you will need to install `p2data`. There are two equally valid options to install this package: - -A) Users could install `p2data` by adding the `drat` archive to the list of repositories your system will query when adding and updating R packages. Once you do this, you can install `p2data` with `install.packages()`, using the command: - -```r -library(drat) -addRepo("kharchenkolab") -install.packages("p2data") -``` - -The following command is also a valid approach: - -```r -install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source') -``` - -Please see the [drat documentation](https://dirk.eddelbuettel.com/code/drat.html) for more comprehensive explanations and vignettes. - - -B) Another way to install the package `p2data` is to use `devtools::install_github()`: - -```r -library(devtools) -install_github("kharchenkolab/p2data") -``` - ### Installing Linux dependencies diff --git a/doc/pagoda2.walkthrough.R b/doc/pagoda2.walkthrough.R index 2bbf8f3a..f83e7234 100644 --- a/doc/pagoda2.walkthrough.R +++ b/doc/pagoda2.walkthrough.R @@ -12,7 +12,7 @@ install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type=' ## load the dataset countMatrix <- p2data::sample_BM1 ## all basic pagoda2 processing with basicP2proc() -p2.processed <- basicP2proc(countMatrix, n.cores=2, min.cells.per.gene=10, +p2.processed <- basicP2proc(countMatrix, n.cores=1, min.cells.per.gene=10, n.odgenes=2e3, get.largevis=FALSE, make.geneknn=FALSE) ## ----------------------------------------------------------------------------- diff --git a/doc/pagoda2.walkthrough.Rmd b/doc/pagoda2.walkthrough.Rmd index 9a85814b..fdb9809e 100644 --- a/doc/pagoda2.walkthrough.Rmd +++ b/doc/pagoda2.walkthrough.Rmd @@ -31,14 +31,15 @@ library(dplyr) library(ggplot2) ``` -We have pre-generated a dataset of 3000 bone marrow cells that you can load as a matrix directly using the package `p2data` (See the README of pagoda2 for installation details). +We have pre-generated a dataset of 3000 bone marrow cells that you can load as a matrix directly using the package `p2data`, which is available through a `drat` repository on GitHub. Note that the size of the 'p2data' package is approximately 6 MB. This package may be installed as follows: ```{r message=FALSE} install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source') ``` -The following command load the dataset of 3000 bone marrow cells as a sparse matrix: +(Please see the [drat documentation](https://dirk.eddelbuettel.com/code/drat.html) for more comprehensive explanations and vignettes regarding `drat` repositories.) +The following command load the dataset of 3000 bone marrow cells as a sparse matrix: ```r countMatrix <- p2data::sample_BM1 @@ -52,7 +53,7 @@ Next we feed this input into the function `basicP2proc()`, which performs all ba ## load the dataset countMatrix <- p2data::sample_BM1 ## all basic pagoda2 processing with basicP2proc() -p2.processed <- basicP2proc(countMatrix, n.cores=2, min.cells.per.gene=10, +p2.processed <- basicP2proc(countMatrix, n.cores=1, min.cells.per.gene=10, n.odgenes=2e3, get.largevis=FALSE, make.geneknn=FALSE) ``` diff --git a/doc/pagoda2.walkthrough.html b/doc/pagoda2.walkthrough.html index b03cdb48..531d0a84 100644 --- a/doc/pagoda2.walkthrough.html +++ b/doc/pagoda2.walkthrough.html @@ -168,8 +168,9 @@

Preliminary: Loading the libraries

library(pagoda2) library(dplyr) library(ggplot2) -

We have pre-generated a dataset of 3000 bone marrow cells that you can load as a matrix directly using the package p2data (See the README of pagoda2 for installation details).

+

We have pre-generated a dataset of 3000 bone marrow cells that you can load as a matrix directly using the package p2data, which is available through a drat repository on GitHub. Note that the size of the ‘p2data’ package is approximately 6 MB. This package may be installed as follows:

install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')
+

(Please see the drat documentation for more comprehensive explanations and vignettes regarding drat repositories.)

The following command load the dataset of 3000 bone marrow cells as a sparse matrix:

countMatrix <- p2data::sample_BM1

Note that many users will wish to read in their own data from the outputs of the 10x preprocessing pipeline CellRanger, i.e. the gzipped tsv files of matrices, features, and barcodes. For this, we have provided the function read10xMatrix().

@@ -177,7 +178,7 @@

Preliminary: Loading the libraries

## load the dataset
 countMatrix <- p2data::sample_BM1
 ## all basic pagoda2 processing with basicP2proc()
-p2.processed <- basicP2proc(countMatrix, n.cores=2, min.cells.per.gene=10, 
+p2.processed <- basicP2proc(countMatrix, n.cores=1, min.cells.per.gene=10, 
                     n.odgenes=2e3, get.largevis=FALSE, make.geneknn=FALSE)
## creating space of type angular done
 ## adding data ... done
@@ -302,21 +303,21 @@ 

Part 2: Analysing Data with Pagoda2

(Note that largeVis is much faster that the tSNE, which often used in single-cell analysis.)

We now visualize the data:

r$plotEmbedding(type='PCA', show.legend=FALSE, mark.groups=TRUE, min.cluster.size=50, shuffle.colors=FALSE, font.size=3, alpha=0.3, title='clusters (largeVis)', plot.theme=theme_bw() + theme(plot.title = element_text(hjust = 0.5)))
-

+

We next can constructr and plot a tSNE embedding. (This can take some time to complete.)

r$getEmbedding(type='PCA', embeddingType='tSNE', perplexity=50,verbose=FALSE)
 r$plotEmbedding(type='PCA', embeddingType='tSNE', show.legend=FALSE, mark.groups=TRUE, min.cluster.size=1, shuffle.colors=FALSE, font.size=3, alpha=0.3, title='clusters (tSNE)', plot.theme=theme_bw() + theme(plot.title = element_text(hjust = 0.5)))
-

+

Note that we are overlay the expresssion of specific marker genes on this embedding to identify clusters. For instance, subsetting by "HBB" will identify heme cells:

gene <-"HBB"
 r$plotEmbedding(type='PCA', embeddingType='tSNE', colors=r$counts[,gene], shuffle.colors=FALSE, 
     font.size=3, alpha=0.3, title=gene, plot.theme=theme_bw() + theme(plot.title = element_text(hjust = 0.5)))
-

+

Similarly, subsetting by the marker gene "LYZ" should show us CD14+ Monocytes:

gene <-"LYZ"
 r$plotEmbedding(type='PCA', embeddingType='tSNE', colors=r$counts[,gene], shuffle.colors=FALSE, 
     font.size=3, alpha=0.3, title=gene, plot.theme=theme_bw() + theme(plot.title = element_text(hjust = 0.5)))
-

+

Pagoda2 allows us to generate multiple alternative clusterings. Here we will construct multilevel and walktrap clusterings (along with the infomap clusterings generated above):

r$getKnnClusters(method=multilevel.community, type='PCA', name='multilevel')
 r$getKnnClusters(method=walktrap.community, type='PCA', name='walktrap')
@@ -324,11 +325,11 @@

Part 2: Analysing Data with Pagoda2

str(r$clusters)
## List of 1
 ##  $ PCA:List of 3
-##   ..$ community : Factor w/ 23 levels "1","2","3","4",..: 5 1 1 6 6 1 2 4 2 13 ...
+##   ..$ community : Factor w/ 21 levels "1","2","3","4",..: 5 1 1 6 6 1 2 4 2 7 ...
 ##   .. ..- attr(*, "names")= chr [1:2998] "MantonBM1_HiSeq_1-TCTATTGGTCTCTCGT-1" "MantonBM1_HiSeq_1-GAATAAGTCACGCATA-1" "MantonBM1_HiSeq_1-ACACCGGTCTAACTTC-1" "MantonBM1_HiSeq_1-TCATTTGGTACGCTGC-1" ...
-##   ..$ multilevel: Factor w/ 11 levels "1","2","3","4",..: 4 10 10 11 11 10 5 2 5 7 ...
+##   ..$ multilevel: Factor w/ 11 levels "1","2","3","4",..: 10 4 4 1 1 4 7 5 7 9 ...
 ##   .. ..- attr(*, "names")= chr [1:2998] "MantonBM1_HiSeq_1-TCTATTGGTCTCTCGT-1" "MantonBM1_HiSeq_1-GAATAAGTCACGCATA-1" "MantonBM1_HiSeq_1-ACACCGGTCTAACTTC-1" "MantonBM1_HiSeq_1-TCATTTGGTACGCTGC-1" ...
-##   ..$ walktrap  : Factor w/ 11 levels "1","2","3","4",..: 3 8 8 7 7 8 9 5 9 4 ...
+##   ..$ walktrap  : Factor w/ 11 levels "1","2","3","4",..: 3 8 8 6 6 8 9 4 9 5 ...
 ##   .. ..- attr(*, "names")= chr [1:2998] "MantonBM1_HiSeq_1-TCTATTGGTCTCTCGT-1" "MantonBM1_HiSeq_1-GAATAAGTCACGCATA-1" "MantonBM1_HiSeq_1-ACACCGGTCTAACTTC-1" "MantonBM1_HiSeq_1-TCATTTGGTACGCTGC-1" ...

We can now compare these against infomap.community.

@@ -337,21 +338,21 @@

Infomap.community vs. multilevel.community vs. walktrap.community

plt2 = r$plotEmbedding(type='PCA',embeddingType='tSNE', clusterType='multilevel', show.legend=FALSE, mark.groups=TRUE, min.cluster.size=1, shuffle.colors=FALSE, font.size=3, alpha=0.3, title='multlevel clusters (tSNE)', plot.theme=theme_bw() + theme(plot.title = element_text(hjust = 0.5))) plt3 = r$plotEmbedding(type='PCA',embeddingType='tSNE', clusterType='walktrap', show.legend=FALSE, mark.groups=TRUE, min.cluster.size=1, shuffle.colors=FALSE, font.size=3, alpha=0.3, title='walktrap clusters (tSNE)', plot.theme=theme_bw() + theme(plot.title = element_text(hjust = 0.5))) gridExtra::grid.arrange(plt1, plt2, plt3, ncol=3)
-

+

We can then perform differential expression between these clusters:

r$getDifferentialGenes(type='PCA', verbose=TRUE, clusterType='community')
-
## running differential expression with 23 clusters ...
+
## running differential expression with 21 clusters ...
## adjusting p-values ...
## done.

and visualise the top markers of a specific cluster:

de <- r$diffgenes$PCA[[1]][['2']]
 r$plotGeneHeatmap(genes=rownames(de)[1:15], groups=r$clusters$PCA[[1]])
-

+

Let’s further investigate the marker gene "CD74" as shown above, with plotEmbedding():

gene <-"CD74"
 r$plotEmbedding(type='PCA', embeddingType='tSNE', colors=r$counts[,gene], shuffle.colors=FALSE, 
     font.size=3, alpha=0.3, title=gene, legend.title=gene)
-

+

At this point we can perform pathway overdispersion analysis (in the same way we would with pagoda1 in scde) or investigate hierarchical differential expression. The following two code snippetss will run overdispersion analysis (although we don’t run the second in this tutorial, as it takes too long to complete). Overdispersion analysis usually takes too long with the latest datasets composed of +1000’s of cells—for this reason we prefer hierarchical differential expression.

We will need the output of the first of the following two blocks for our web app generation:

suppressMessages(library(org.Hs.eg.db))
@@ -380,18 +381,18 @@ 

Part 3: Generate Frontend Application

genesets <- hierDiffToGenesets(hdea)
 str(genesets[1:2])
## List of 2
-##  $ 16.vs.18  :List of 2
+##  $ 18.vs.7   :List of 2
 ##   ..$ properties:List of 3
 ##   .. ..$ locked          : logi TRUE
-##   .. ..$ genesetname     : chr "16.vs.18"
-##   .. ..$ shortdescription: chr "16.vs.18"
-##   ..$ genes     : chr [1:157] "TSC22D3" "SSR4" "CD79A" "KLF2" ...
-##  $ 16.18.vs.2:List of 2
+##   .. ..$ genesetname     : chr "18.vs.7"
+##   .. ..$ shortdescription: chr "18.vs.7"
+##   ..$ genes     : chr [1:35] "PF4" "TMSB4X" "FERMT3" "PPBP" ...
+##  $ 19.vs.7.18:List of 2
 ##   ..$ properties:List of 3
 ##   .. ..$ locked          : logi TRUE
-##   .. ..$ genesetname     : chr "16.18.vs.2"
-##   .. ..$ shortdescription: chr "16.18.vs.2"
-##   ..$ genes     : chr [1:312] "MZB1" "JCHAIN" "HSP90B1" "ITM2C" ...
+## .. ..$ genesetname : chr "19.vs.7.18" +## .. ..$ shortdescription: chr "19.vs.7.18" +## ..$ genes : chr [1:17] "LEPR" "IGFBP7" "IFITM3" "CXCL12" ...

To add GO Terms as genesets, run the following

library(GO.db)
## 
diff --git a/doc/pagoda2.walkthrough.md b/doc/pagoda2.walkthrough.md new file mode 100644 index 00000000..c552cd4c --- /dev/null +++ b/doc/pagoda2.walkthrough.md @@ -0,0 +1,633 @@ +# Pagoda2 Walkthrough + +## Overview + +This walkthrough will guide you through the analysis of single-cell RNA-seq with pagoda2. + +Pagoda2 performs basic tasks such as cell size normalization/corrections and residual gene variance normalization, and can then be used to perform tasks such as identifying subpopulations and running differential expression within individual samples. The companion web application allows users to interactively explore the transcriptional signatures of subpopulations within the dataset. Users are able to investigate the molecular identity of selected entities, and inspect the associated gene expression patterns through annotated gene sets and pathways, including Gene Ontology (GO) categories. Users may also perform differential expression of selected cells via the frontend application. + +We will begin by showing the quickest way to process data with pagoda2, using the function `basicP2proc()`. We will then systematically re-run this analysis step-by-step, beginning with loading the dataset and performing QC. This will more thoroughly detail and motivate the steps involved in quality control/processing. Finally we will generate an interactive web application in order to explore the dataset. + +## I. Fast Processing and Exploration with Pagoda2 + +This is the rapid walkthrough of pagoda2, showing how the package allows users to quickly process their datasets and load them into an interactive frontend application. + +### Preliminary: Loading the libraries + + +```r +library(Matrix) +library(igraph) +library(pagoda2) +library(dplyr) +library(ggplot2) +``` + +We have pre-generated a dataset of 3000 bone marrow cells that you can load as a matrix directly using the package `p2data`, which is available through a `drat` repository on GitHub. There are two equally valid options to install this package: + +A) Users could install `p2data` by adding the `drat` archive to the list of repositories your system will query when adding and updating R packages. Once you do this, you can install `p2data` with `install.packages()`, using the command: + +```r +library(drat) +addRepo("kharchenkolab") +install.packages("p2data") +``` + +The following command is also a valid approach: + +```r +install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source') +``` + +Please see the [drat documentation](https://dirk.eddelbuettel.com/code/drat.html) for more comprehensive explanations and vignettes. + + +B) Another way to install the package `p2data` is to use `devtools::install_github()`: + +```r +library(devtools) +install_github("kharchenkolab/p2data") +``` + +The following command load the dataset of 3000 bone marrow cells as a sparse matrix: + + +```r +countMatrix <- p2data::sample_BM1 +``` + +Note that many users will wish to read in their own data from the outputs of the 10x preprocessing pipeline [CellRanger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/matrices), i.e. the gzipped tsv files of matrices, features, and barcodes. For this, we have provided the function `read10xMatrix()`. + +Next we feed this input into the function `basicP2proc()`, which performs all basic pagoda2 processing. That is, the function will adjust the variance, calculate PCA reduction, make a KNN graph, identify clusters by [multilevel optimization](https://igraph.org/c/doc/igraph-Community.html#igraph_community_multilevel) (the Louvain algorithm), and generate largeVis and tSNE embeddings. + + +```r +## load the dataset +countMatrix <- p2data::sample_BM1 +## all basic pagoda2 processing with basicP2proc() +p2.processed <- basicP2proc(countMatrix, n.cores=2, min.cells.per.gene=10, + n.odgenes=2e3, get.largevis=FALSE, make.geneknn=FALSE) +``` + +``` +## creating space of type angular done +## adding data ... done +## building index ... done +## querying ... done +``` + +We can now quickly view the results via the interactive web application. First we run `extendedP2proc()` to calculate pathway overdispersion for a specific organism using GO. We currently support three organisms: 'hs' (Homo Sapiens), 'mm' (Mus Musculus, mouse) or 'dr' (Danio Rerio, zebrafish). (This can take some time to run, so we'll omit it for the vignettes.) Then we create a pagoda2 "web object" to be used for the application. This can be accessed via your web browser with the function `show.app()`. + + +```r +## calculate pathway overdispersion for human +## ext.res <- extendedP2proc(p2.processed, organism = 'hs') + +## create app object +## p2app <- webP2proc(ext.res$p2, title = 'Quick pagoda2 app', go.env = ext.res$go.env) + +## open app in the web browser via R session +## show.app(app=p2app, name='pagoda2 app') +``` + +And that's it! You will now be able to interact with the processed dataset via the web browser. The more in-depth demo regarding the web application can be found [here](https://www.youtube.com/watch?v=xzpG1ZYE4Og). + + +## II. In-Depth Processing and Analysis + +We now will re-run and explain each step within `basicP2proc()`, starting from the beginning: + + +### Preliminary: Loading the libraries + +```r +library(Matrix) +library(igraph) +library(pagoda2) +library(dplyr) +library(ggplot2) +``` + +### Part 1: Loading and QC'ing the dataset + +For the purposes of this walkthrough, we have pre-generated a dataset of 3000 bone marrow cells that you can load as a matrix directly. The following command load the data as a sparse matrix and checks its size: + +```r +cm <- p2data::sample_BM1 +dim(cm) +``` + +``` +## [1] 33694 3000 +``` + +We see that the matrix has 33k rows and 3k columns. Next let's have a look at our matrix to see what is in it. We see that genes are named using common gene names and columns by cell barcode. + +```r +cm[1:3, 1:3] +``` + +``` +## 3 x 3 sparse Matrix of class "dgCMatrix" +## MantonBM1_HiSeq_1-TCTATTGGTCTCTCGT-1 +## RP11-34P13.3 . +## FAM138A . +## OR4F5 . +## MantonBM1_HiSeq_1-GAATAAGTCACGCATA-1 +## RP11-34P13.3 . +## FAM138A . +## OR4F5 . +## MantonBM1_HiSeq_1-ACACCGGTCTAACTTC-1 +## RP11-34P13.3 . +## FAM138A . +## OR4F5 . +``` + +We can get more information about how the matrix is stored by running `str()`. To find out more information about the sparse matrix format, check the documentation of the 'Matrix' package. + +```r +str(cm) +``` + +``` +## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots +## ..@ i : int [1:2613488] 33 45 72 153 353 406 436 440 457 484 ... +## ..@ p : int [1:3001] 0 864 1701 2607 3256 3856 4537 5271 6030 7002 ... +## ..@ Dim : int [1:2] 33694 3000 +## ..@ Dimnames:List of 2 +## .. ..$ : chr [1:33694(1d)] "RP11-34P13.3" "FAM138A" "OR4F5" "RP11-34P13.7" ... +## .. ..$ : chr [1:3000(1d)] "MantonBM1_HiSeq_1-TCTATTGGTCTCTCGT-1" "MantonBM1_HiSeq_1-GAATAAGTCACGCATA-1" "MantonBM1_HiSeq_1-ACACCGGTCTAACTTC-1" "MantonBM1_HiSeq_1-TCATTTGGTACGCTGC-1" ... +## ..@ x : num [1:2613488] 1 1 1 9 1 3 1 2 2 20 ... +## ..@ factors : list() +``` + +In order to catch outliers, we can begin with a fairly basic procedure of looking at the dependency between the number of molecules measured per cell and the number of genes per cell. Let's plot the distribution of molecules per cell and molecules per gene for this dataset in log10 scale: + +```r +old_par <- par(mfrow=c(1,2), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0) +on.exit(par(old_par)) +hist(log10(colSums(cm)+1), main='molecules per cell', col='cornsilk', xlab='molecules per cell (log10)') +``` + +![plot of chunk unnamed-chunk-9](figure_pagoda2/unnamed-chunk-9-1.png) + +```r +hist(log10(rowSums(cm)+1), main='molecules per gene', col='cornsilk', xlab='molecules per gene (log10)') +``` + +![plot of chunk unnamed-chunk-9](figure_pagoda2/unnamed-chunk-9-2.png) + +This dataset has already been filtered for low quality cells, so we don't see any cells with fewer that 10^3 UMIs. We can still use the default QC function `gene.vs.molecule.cell.filter()` to filter any cells that don't fit the expected detected gene vs molecule count relationship. In this case we filter out only 2 cells. + +```r +counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) +``` + +![plot of chunk unnamed-chunk-10](figure_pagoda2/unnamed-chunk-10-1.png) + +Next thing we want to do is to find lowly expressed genes and remove them from the dataset. (Subsequent pagoda2 steps will do this automatically for extremely lowly expressed genes anyway, but for the purpose of this tutorial, we demonstrate this.) + + +```r +hist(log10(rowSums(counts)+1), main='Molecules per gene', xlab='molecules (log10)', col='cornsilk') +abline(v=1, lty=2, col=2) +``` + +![plot of chunk unnamed-chunk-11](figure_pagoda2/unnamed-chunk-11-1.png) + +Let's filter out counts less than 10 and check the size of the resulting matrix: + + +```r +counts <- counts[rowSums(counts)>=10, ] +dim(counts) +``` + +``` +## [1] 12693 2998 +``` + + +### Part 2: Analysing Data with Pagoda2 + +We see that we now have 12k genes and 2998 cells. We are now ready to analyze our data with Pagoda2. Remember: all of the following steps can be done with just two functions automatically (see above) but for the purposes of this tutorial we will go over them step by step to understand what we are doing in more detail. Doing these steps manually also allows us to tune parameters. + +First we will generate a pagoda2 object that will contain all our results. Our input matrix contains duplicated gene names (usually originating from different transcripts in the counting process). The easier way to resolve this problem is by making the gene names unique: + + +```r +rownames(counts) <- make.unique(rownames(counts)) +r <- Pagoda2$new(counts, log.scale=TRUE, n.cores=1) +``` + +``` +## 2998 cells, 12693 genes; normalizing ... +``` + +``` +## Using plain model +``` + +``` +## log scale ... +``` + +``` +## done. +``` + +Check that you have the matrix in the correct orientation and that number of cells you are getting here is what you expect (like we do here). The input matrix must be in the genes by cells configuration. + +Next, we’ll adjust the variance with `adjustVariance()` in order to normalize the extent to which genes with (very) different expression magnitudes will contribute to the downstream analysis. + +In order to motivate what we are doing by variance normalization, recall that our goal is to measure the variance of a given gene. (Remember: we are looking +at the variation of this gene across the population of cells measured.) +The key dependency of this variance is the magnitude. If you thus observe highly-expressed genes, these will always give you high expression variance, +regardless of whether these are specific to a cell subpopulation or not. + +For variance normalization, we begin by fitting a smooth linear model of variance by magnitude for the dataset. We then quantify the deviation against this dataset-wide trend, and rescale the variance to put the genes on a comparable scale for downstream analysis. + + + +```r +r$adjustVariance(plot=TRUE, gam.k=10) +``` + +``` +## calculating variance fit ... +``` + +``` +## using gam +``` + +``` +## 187 overdispersed genes ... 187 +``` + +``` +## persisting ... +``` + +``` +## using gam +``` + +``` +## done. +``` + +![plot of chunk unnamed-chunk-14](figure_pagoda2/unnamed-chunk-14-1.png) + +Now that the variance of the gene expression is on a comparable scale, there are many alternative ways of proceeding with the downstream analysis. Below we’ll use the simplest default scenario, whereby we first reduce the dataset dimensions by running PCA, and then move into the k-nearest neighbor (KNN) graph space for clustering and visualization calculations. + +First, we generate the PCA reduction. Depending on the complexity of the dataset you are analyzing, you may want to adjust the parameter `nPcs`. + + +```r +r$calculatePcaReduction(nPcs=50, n.odgenes=3e3) +``` + +``` +## running PCA using 3000 OD genes . +``` + +``` +## . +## . +## . +``` + +``` +## done +``` + +We will now construct a KNN graph space that will allow us to identify clusters of cells: + +```r +r$makeKnnGraph(k=40, type='PCA', center=TRUE, distance='cosine') +``` + +``` +## creating space of type angular done +## adding data ... done +## building index ... done +## querying ... done +``` + +On the basis of this KNN graph, we will call clusters + +```r +r$getKnnClusters(method=infomap.community, type='PCA') +``` + +Next we generate a 2D embedding of the data with largeVis for visualization: + +```r +M <- 30 +r$getEmbedding(type='PCA', embeddingType = 'largeVis', M=M, perplexity=30, gamma=1/M) +``` + +``` +## Estimating embeddings. +``` +(Note that largeVis is much faster that the tSNE, which often used in single-cell analysis.) + +We now visualize the data: + + +```r +r$plotEmbedding(type='PCA', show.legend=FALSE, mark.groups=TRUE, min.cluster.size=50, shuffle.colors=FALSE, font.size=3, alpha=0.3, title='clusters (largeVis)', plot.theme=theme_bw() + theme(plot.title = element_text(hjust = 0.5))) +``` + +![plot of chunk unnamed-chunk-19](figure_pagoda2/unnamed-chunk-19-1.png) + +We next can constructr and plot a tSNE embedding. (This can take some time to complete.) + +```r +r$getEmbedding(type='PCA', embeddingType='tSNE', perplexity=50,verbose=FALSE) +r$plotEmbedding(type='PCA', embeddingType='tSNE', show.legend=FALSE, mark.groups=TRUE, min.cluster.size=1, shuffle.colors=FALSE, font.size=3, alpha=0.3, title='clusters (tSNE)', plot.theme=theme_bw() + theme(plot.title = element_text(hjust = 0.5))) +``` + +![plot of chunk unnamed-chunk-20](figure_pagoda2/unnamed-chunk-20-1.png) + +Note that we are overlay the expresssion of specific marker genes on this embedding to identify clusters. For instance, subsetting by `"HBB"` will identify heme cells: + + +```r +gene <-"HBB" +r$plotEmbedding(type='PCA', embeddingType='tSNE', colors=r$counts[,gene], shuffle.colors=FALSE, + font.size=3, alpha=0.3, title=gene, plot.theme=theme_bw() + theme(plot.title = element_text(hjust = 0.5))) +``` + +![plot of chunk unnamed-chunk-21](figure_pagoda2/unnamed-chunk-21-1.png) + +Similarly, subsetting by the marker gene `"LYZ"` should show us CD14+ Monocytes: + + +```r +gene <-"LYZ" +r$plotEmbedding(type='PCA', embeddingType='tSNE', colors=r$counts[,gene], shuffle.colors=FALSE, + font.size=3, alpha=0.3, title=gene, plot.theme=theme_bw() + theme(plot.title = element_text(hjust = 0.5))) +``` + +![plot of chunk unnamed-chunk-22](figure_pagoda2/unnamed-chunk-22-1.png) + +Pagoda2 allows us to generate multiple alternative clusterings. Here we will construct multilevel and walktrap clusterings (along with the infomap clusterings generated above): + +```r +r$getKnnClusters(method=multilevel.community, type='PCA', name='multilevel') +r$getKnnClusters(method=walktrap.community, type='PCA', name='walktrap') +``` + +Internally the clusters are saved in the clusters variable under the reduction from which they were obtained: + + +```r +str(r$clusters) +``` + +``` +## List of 1 +## $ PCA:List of 3 +## ..$ community : Factor w/ 20 levels "1","2","3","4",..: 5 1 1 6 6 1 2 4 2 13 ... +## .. ..- attr(*, "names")= chr [1:2998] "MantonBM1_HiSeq_1-TCTATTGGTCTCTCGT-1" "MantonBM1_HiSeq_1-GAATAAGTCACGCATA-1" "MantonBM1_HiSeq_1-ACACCGGTCTAACTTC-1" "MantonBM1_HiSeq_1-TCATTTGGTACGCTGC-1" ... +## ..$ multilevel: Factor w/ 11 levels "1","2","3","4",..: 8 10 10 2 2 10 4 1 4 7 ... +## .. ..- attr(*, "names")= chr [1:2998] "MantonBM1_HiSeq_1-TCTATTGGTCTCTCGT-1" "MantonBM1_HiSeq_1-GAATAAGTCACGCATA-1" "MantonBM1_HiSeq_1-ACACCGGTCTAACTTC-1" "MantonBM1_HiSeq_1-TCATTTGGTACGCTGC-1" ... +## ..$ walktrap : Factor w/ 13 levels "1","2","3","4",..: 1 8 8 7 7 8 10 3 10 2 ... +## .. ..- attr(*, "names")= chr [1:2998] "MantonBM1_HiSeq_1-TCTATTGGTCTCTCGT-1" "MantonBM1_HiSeq_1-GAATAAGTCACGCATA-1" "MantonBM1_HiSeq_1-ACACCGGTCTAACTTC-1" "MantonBM1_HiSeq_1-TCATTTGGTACGCTGC-1" ... +``` + +We can now compare these against `infomap.community`. + +#### Infomap.community vs. multilevel.community vs. walktrap.community + + +```r +plt1 = r$plotEmbedding(type='PCA', embeddingType='tSNE', groups=r$clusters$PCA$community, show.legend=FALSE, mark.groups=TRUE, min.cluster.size=1, shuffle.colors=FALSE, font.size=3, alpha=0.3, title='infomap clusters (tSNE)', plot.theme=theme_bw() + theme(plot.title = element_text(hjust = 0.5))) +plt2 = r$plotEmbedding(type='PCA',embeddingType='tSNE', clusterType='multilevel', show.legend=FALSE, mark.groups=TRUE, min.cluster.size=1, shuffle.colors=FALSE, font.size=3, alpha=0.3, title='multlevel clusters (tSNE)', plot.theme=theme_bw() + theme(plot.title = element_text(hjust = 0.5))) +plt3 = r$plotEmbedding(type='PCA',embeddingType='tSNE', clusterType='walktrap', show.legend=FALSE, mark.groups=TRUE, min.cluster.size=1, shuffle.colors=FALSE, font.size=3, alpha=0.3, title='walktrap clusters (tSNE)', plot.theme=theme_bw() + theme(plot.title = element_text(hjust = 0.5))) +gridExtra::grid.arrange(plt1, plt2, plt3, ncol=3) +``` + +![plot of chunk unnamed-chunk-25](figure_pagoda2/unnamed-chunk-25-1.png) + +We can then perform differential expression between these clusters: + + +```r +r$getDifferentialGenes(type='PCA', verbose=TRUE, clusterType='community') +``` + +``` +## running differential expression with 20 clusters ... +``` + +``` +## adjusting p-values ... +``` + +``` +## done. +``` + + +and visualise the top markers of a specific cluster: + + +```r +de <- r$diffgenes$PCA[[1]][['2']] +r$plotGeneHeatmap(genes=rownames(de)[1:15], groups=r$clusters$PCA[[1]]) +``` + +![plot of chunk unnamed-chunk-27](figure_pagoda2/unnamed-chunk-27-1.png) + +Let's further investigate the marker gene `"CD74"` as shown above, with `plotEmbedding()`: + + +```r +gene <-"CD74" +r$plotEmbedding(type='PCA', embeddingType='tSNE', colors=r$counts[,gene], shuffle.colors=FALSE, + font.size=3, alpha=0.3, title=gene, legend.title=gene) +``` + +![plot of chunk unnamed-chunk-28](figure_pagoda2/unnamed-chunk-28-1.png) + +At this point we can perform pathway overdispersion analysis (in the same way we would with pagoda1 in [scde](https://hms-dbmi.github.io/scde/)) or investigate hierarchical differential expression. The following two code snippetss will run overdispersion analysis (although we don't run the second in this tutorial, as it takes too long to complete). Overdispersion analysis usually takes too long with the latest datasets composed of +1000's of cells---for this reason we prefer hierarchical differential expression. + +We will need the output of the first of the following two blocks for our web app generation: + +```r +suppressMessages(library(org.Hs.eg.db)) +# translate gene names to ids +ids <- unlist(lapply(mget(colnames(r$counts), org.Hs.egALIAS2EG, ifnotfound=NA), function(x) x[1])) +# reverse map +rids <- names(ids) +names(rids) <- ids +# list all the ids per GO category +go.env <- list2env(eapply(org.Hs.egGO2ALLEGS,function(x) as.character(na.omit(rids[x])))) +``` + + +```r +## DON'T RUN +## test overdispersion +# r$testPathwayOverdispersion(go.env, verbose=TRUE, correlation.distance.threshold=0.95, recalculate.pca=FALSE, top.aspects=15) +``` + + +Run hierarchical differential expression. This examines cells down a hierarchy of clusters and determines differentially expressed genes at every split. + +```r +hdea <- r$getHierarchicalDiffExpressionAspects(type='PCA', clusterName='community', z.threshold=3) +``` + +``` +## Using community clustering for PCA space +``` + +Finally, please do not forget to save your pagoda2 object as an rds object: + +```r +## saveRDS(r, 'pagoda2object.rds') +``` + +This is very important for future reproducibility, as well as any collaborations you may have. + +### Part 3: Generate Frontend Application + +Next we will generate a web app that will allow us to browse the dataset interactively. (Note that all these steps can be performed with the `basicP2web()` function, as described above in the first section.) + +We will need to prepare a set of genes that we want to be accessible from the application. For the hierarchical differential expression to work, we must include the geneset used from the hierarchical differential expression. However we can include any genesets we want, including GO geneset and custom sets of marker genes: + +```r +genesets <- hierDiffToGenesets(hdea) +str(genesets[1:2]) +``` + +``` +## List of 2 +## $ 12.20.vs.18.19:List of 2 +## ..$ properties:List of 3 +## .. ..$ locked : logi TRUE +## .. ..$ genesetname : chr "12.20.vs.18.19" +## .. ..$ shortdescription: chr "12.20.vs.18.19" +## ..$ genes : chr [1:36] "MALAT1" "MT-CO1" "MT-CO2" "MT-CYB" ... +## $ 15.vs.17 :List of 2 +## ..$ properties:List of 3 +## .. ..$ locked : logi TRUE +## .. ..$ genesetname : chr "15.vs.17" +## .. ..$ shortdescription: chr "15.vs.17" +## ..$ genes : chr [1:157] "TSC22D3" "SSR4" "CD79A" "KLF2" ... +``` + +To add GO Terms as genesets, run the following + +```r +library(GO.db) +``` + +``` +## +``` + +```r +termDescriptions <- Term(GOTERM[names(go.env)]) # saves a good minute or so compared to individual lookups + +sn <- function(x) { names(x) <- x; x} ## utility function + +genesets.go <- lapply(sn(names(go.env)),function(x) { + list(properties=list(locked=TRUE, genesetname=x, shortdescription=as.character(termDescriptions[x])), genes=c(go.env[[x]])) +}) + +## concatenate +genesets <- c(genesets, genesets.go) +``` + +Add per cluster differentially expressed genes to the gene sets: + +```r +deSets <- get.de.geneset(r, groups = r$clusters$PCA[['community']], prefix = 'de_') +## concatenate +genesets <- c(genesets, deSets) +``` + +Next we can add metadata to our web app. The metadata we add can be completely arbitrary and include processing parameters, notes of anything else we like. They are provided in a list of strings. If we include an entry called apptitle, this will appear as an app title in our web browser when we open the app. + +```r +appmetadata <- list(apptitle = 'Demo_App') +``` + +We also want to update the original pagoda2 object to contain a KNN graph of genes. We will need this to enable the 'find similar gene' feature of our application. This takes a moment to complete. + +```r +r$makeGeneKnnGraph(n.cores = 1) +``` + +``` +## creating space of type angular done +## adding data ... done +## building index ... done +## querying ... done +``` + +Finally before we make our web app we want to generate metadata for the cells. The exact data that we will want to incorporate will depend on the dataset we are processing. For example if we are co-processing multiple samples we will usually want to label cells by the sample of origin. We might also want to add our clusterings as metadata. + +Several functions give very detailed control over how metadata will be presented and generated. For example we can generate different palettes or set colors manually. The factorListToMetadata() function will do everything for us, check its documentation for more details. Here we will generate metadata for the different clusterings we previously generated manually: + + +```r +## # Make a list for our metadata +additionalMetadata <- list() +## for Infomap use hue values from 0.1 to 0.5 +additionalMetadata$community <- p2.metadata.from.factor(r$clusters$PCA[['community']], displayname = 'Infomap', s = 0.7, v = 0.8,start = 0.1, end = 0.5) +# use different colors for multilevel +additionalMetadata$multilevel <- p2.metadata.from.factor(r$clusters$PCA[['multilevel']], displayname = 'Multilevel', s = 0.9, v = 0.8,start = 0.5, end = 1) +## Manual palette generation for walktrap +a <- r$clusters$PCA[['walktrap']] +library(colorRamps) +p1 <- colorRamps::primary.colors(n = nlevels(a)) +names(p1) <- levels(a) +additionalMetadata$walktrap <- p2.metadata.from.factor(r$clusters$PCA[['walktrap']], displayname = 'Walktrap', pal = p1) +``` + +We are now ready to build our app: + +```r +p2web <- make.p2.app(r, + dendrogramCellGroups = r$clusters$PCA$community, + additionalMetadata = additionalMetadata, + geneSets = genesets, + appmetadata = appmetadata, + show.clusters = FALSE # Hide the clusters that were used for the dendrogram from the metadata + ) +``` + +We can view this app directly from our R session, which will open the application in the browser: + + +```r +## show.app(app=p2web, name='app') +``` + +This app will now be viewable as long as our R session is running. However we also have the option to serialize this app into a binary file `*.bin` on our hard drive that will allow us to view it after we close the R session directly from our browser: + + +```r +## p2web$serializeToStaticFast('demo_pbmc.bin', verbose=TRUE) +``` + +### View Conos Object in Pagoda2 Frontend Application + +Users may also interactively explore [Conos](https://github.com/kharchenkolab/conos) objects with the Pagoda2 application. + +After constructing the Conos object `con` as shown in the Conos [walkthrough](https://github.com/kharchenkolab/conos/blob/master/doc/walkthrough.md), users can save to a serialized `*.bin` file and upload into the pagoda application with the `p2app4conos()` function, using `p2app4conos(conos=con)`. Please see Conos for more details. + +### More Details + +For the more in-depth demo regarding how to use the web application to analyze datasets, please check [here](https://www.youtube.com/watch?v=xzpG1ZYE4Og). + +You can view the offline app by pointing your browser to http://pklab.med.harvard.edu/nikolas/pagoda2/frontend/current/pagodaLocal/index.html + +The file can also be shared by uploading it to a web server and be viewed remotely by navigating to the following URL: http://pklab.med.harvard.edu/nikolas/pagoda2/frontend/current/pagodaURL/index.html?fileURL= [URL TO FILE] + diff --git a/man/Pagoda2.Rd b/man/Pagoda2.Rd index 0cdfd4e2..1cdda621 100644 --- a/man/Pagoda2.Rd +++ b/man/Pagoda2.Rd @@ -28,8 +28,8 @@ p2_object <- Pagoda2$new(counts, log.scale=TRUE, min.cells.per.gene=10, n.cores= ## ------------------------------------------------ \donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 +## Load pre-generated a dataset of 50 bone marrow cells as matrix +cm <- readRDS(system.file("extdata", "sample_BM1_50.rds", package="pagoda2")) ## Perform QC, i.e. filter any cells that counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) rownames(counts) <- make.unique(rownames(counts)) @@ -45,8 +45,8 @@ p2_object$adjustVariance(plot=TRUE, gam.k=10) ## ------------------------------------------------ \donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 +## Load pre-generated a dataset of 50 bone marrow cells as matrix +cm <- readRDS(system.file("extdata", "sample_BM1_50.rds", package="pagoda2")) ## Perform QC, i.e. filter any cells that counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=300) rownames(counts) <- make.unique(rownames(counts)) @@ -58,333 +58,6 @@ p2_object$adjustVariance(plot=TRUE, gam.k=10) p2_object$makeKnnGraph(k=20, center=FALSE, distance='L2') } - -## ------------------------------------------------ -## Method `Pagoda2$getKnnClusters` -## ------------------------------------------------ - -\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=900) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a KNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=20, center=FALSE, distance='L2') -## Call clusters based on KNN -p2_object$getKnnClusters(method=infomap.community, type='counts') -} - - -## ------------------------------------------------ -## Method `Pagoda2$getHierarchicalDiffExpressionAspects` -## ------------------------------------------------ - -\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=400) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a KNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=20, type='PCA', center=FALSE, distance='cosine') -## Call clusters based on KNN -p2_object$getKnnClusters(method=walktrap.community,type='PCA',name='walktrap') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', embeddingType = 'largeVis', M=30, perplexity=30, gamma=1/30) -## Perform differential expression -p2_object$getDifferentialGenes(type='PCA', verbose=TRUE, clusterType='walktrap') -## Perform differential expression -hdea <- p2_object$getHierarchicalDiffExpressionAspects(type='PCA', - clusterName='walktrap', z.threshold=3) -} - - -## ------------------------------------------------ -## Method `Pagoda2$makeGeneKnnGraph` -## ------------------------------------------------ - -\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -p2_object$makeGeneKnnGraph(nPcs=50, k=20, center=TRUE) -} - - -## ------------------------------------------------ -## Method `Pagoda2$getDensityClusters` -## ------------------------------------------------ - - -\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a KNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', embeddingType = 'UMAP', M=20, perplexity=30, gamma=1/20) -p2_object$getDensityClusters(type='PCA') -} - - -## ------------------------------------------------ -## Method `Pagoda2$getDifferentialGenes` -## ------------------------------------------------ - -\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a KNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') -## Call clusters based on KNN -p2_object$getKnnClusters(method=multilevel.community,type='PCA',name='multilevel') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', embeddingType = 'UMAP', M=20, perplexity=30, gamma=1/20) -## Perform differential expression -p2_object$getDifferentialGenes(type='PCA',verbose=TRUE,clusterType='multilevel') -} - - -## ------------------------------------------------ -## Method `Pagoda2$getRefinedLibSizes` -## ------------------------------------------------ - -\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a kNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') -## Call clusters based on kNN -p2_object$getKnnClusters(method=infomap.community, type='PCA') -p2_object$getRefinedLibSizes(type='PCA') -lib.sizes <- p2_object$getRefinedLibSizes(type="PCA") -} - - -## ------------------------------------------------ -## Method `Pagoda2$plotGeneHeatmap` -## ------------------------------------------------ - -\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a kNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') -## Call clusters based on kNN -p2_object$getKnnClusters(method=multilevel.community,type='PCA',name='multilevel') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', embeddingType = 'UMAP', M=20, perplexity=30, gamma=1/20) -## Perform differential expression -p2_object$getDifferentialGenes(type='PCA',verbose=TRUE,clusterType='multilevel') -de <- p2_object$diffgenes$PCA[[1]][['2']] -p2_object$plotGeneHeatmap(genes=rownames(de)[1:15], - groups=p2_object$clusters$PCA[[1]], cluster.genes=TRUE) -} - - -## ------------------------------------------------ -## Method `Pagoda2$plotEmbedding` -## ------------------------------------------------ - -\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a kNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') -## Call clusters based on kNN -p2_object$getKnnClusters(method=multilevel.community,type='PCA',name='multilevel') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', embeddingType = 'UMAP', M=20, perplexity=30, gamma=1/20) -library(ggplot2) -p2_object$plotEmbedding(type='PCA', show.legend=FALSE, mark.groups=TRUE, min.cluster.size=50, - shuffle.colors=FALSE, font.size=1, alpha=0.1, title='clusters (UMAP)', - plot.theme=theme(plot.title = element_text(hjust = 0.5))) -} - - -## ------------------------------------------------ -## Method `Pagoda2$getOdGenes` -## ------------------------------------------------ - -\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a kNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') -## Call clusters based on kNN -p2_object$getKnnClusters(method=infomap.community, type='PCA') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', M=20, perplexity=30, gamma=1/20) -## Perform differential expression -p2_object$getDifferentialGenes(type='PCA',verbose=TRUE) -odGenes <- p2_object$getOdGenes(use.unadjusted.pvals=FALSE) -} - - -## ------------------------------------------------ -## Method `Pagoda2$getNormalizedExpressionMatrix` -## ------------------------------------------------ - -\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -p2_object$getNormalizedExpressionMatrix() -} - - -## ------------------------------------------------ -## Method `Pagoda2$calculatePcaReduction` -## ------------------------------------------------ - -\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=600) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts, log.scale=FALSE, min.cells.per.gene=30, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=15) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=2e3) -} - - -## ------------------------------------------------ -## Method `Pagoda2$expandOdGenes` -## ------------------------------------------------ - -\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a kNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') -## Call clusters based on kNN -p2_object$getKnnClusters(method=infomap.community, type='PCA') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', M=20, perplexity=30, gamma=1/20) -## Perform differential expression -p2_object$getDifferentialGenes(type='PCA',verbose=TRUE) -p2_object$expandOdGenes(type='PCA') -} - - -## ------------------------------------------------ -## Method `Pagoda2$getEmbedding` -## ------------------------------------------------ - -\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm,min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a kNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=40, type='PCA', center=TRUE, distance='cosine') -## Call clusters based on kNN -p2_object$getKnnClusters(method=infomap.community, type='PCA') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', embeddingType = 'UMAP', M=30, perplexity=30, gamma=1/30) -} - } \author{ Simon Steiger @@ -463,6 +136,7 @@ Initialize Pagoda2 class min.cells.per.gene = 0, trim = round(min.cells.per.gene/2), min.transcripts.per.cell = 10, + batch = NULL, lib.sizes = NULL, log.scale = TRUE, keep.genes = NULL @@ -486,6 +160,8 @@ Initialize Pagoda2 class \item{\code{min.transcripts.per.cell}}{integer Minimum number of transcripts per cells, used to subset counts for coverage (default=10)} +\item{\code{batch}}{fctor Batch factor for the dataset (default=NULL)} + \item{\code{lib.sizes}}{character vector of library sizes (default=NULL)} \item{\code{log.scale}}{boolean If TRUE, scale counts by log() (default=TRUE)} @@ -622,8 +298,8 @@ residual matrix with adjusted variance \subsection{Examples}{ \if{html}{\out{
}} \preformatted{\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 +## Load pre-generated a dataset of 50 bone marrow cells as matrix +cm <- readRDS(system.file("extdata", "sample_BM1_50.rds", package="pagoda2")) ## Perform QC, i.e. filter any cells that counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) rownames(counts) <- make.unique(rownames(counts)) @@ -696,8 +372,8 @@ kNN graph, stored in self$graphs \subsection{Examples}{ \if{html}{\out{
}} \preformatted{\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 +## Load pre-generated a dataset of 50 bone marrow cells as matrix +cm <- readRDS(system.file("extdata", "sample_BM1_50.rds", package="pagoda2")) ## Perform QC, i.e. filter any cells that counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=300) rownames(counts) <- make.unique(rownames(counts)) @@ -758,31 +434,6 @@ If NULL, if the number of vertices of the graph is greater than or equal to 2000 \subsection{Returns}{ the community structure calculated from 'method' } -\subsection{Examples}{ -\if{html}{\out{
}} -\preformatted{\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=900) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a KNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=20, center=FALSE, distance='L2') -## Call clusters based on KNN -p2_object$getKnnClusters(method=infomap.community, type='counts') -} - -} -\if{html}{\out{
}} - -} - } \if{html}{\out{
}} \if{html}{\out{}} @@ -842,38 +493,6 @@ Take a given clustering and generate a hierarchical clustering \subsection{Returns}{ hierarchical clustering } -\subsection{Examples}{ -\if{html}{\out{
}} -\preformatted{\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=400) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a KNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=20, type='PCA', center=FALSE, distance='cosine') -## Call clusters based on KNN -p2_object$getKnnClusters(method=walktrap.community,type='PCA',name='walktrap') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', embeddingType = 'largeVis', M=30, perplexity=30, gamma=1/30) -## Perform differential expression -p2_object$getDifferentialGenes(type='PCA', verbose=TRUE, clusterType='walktrap') -## Perform differential expression -hdea <- p2_object$getHierarchicalDiffExpressionAspects(type='PCA', - clusterName='walktrap', z.threshold=3) -} - -} -\if{html}{\out{
}} - -} - } \if{html}{\out{
}} \if{html}{\out{}} @@ -914,28 +533,6 @@ Calculates gene Knn network for gene similarity \subsection{Returns}{ graph with gene similarity } -\subsection{Examples}{ -\if{html}{\out{
}} -\preformatted{\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -p2_object$makeGeneKnnGraph(nPcs=50, k=20, center=TRUE) -} - -} -\if{html}{\out{
}} - -} - } \if{html}{\out{
}} \if{html}{\out{}} @@ -979,33 +576,6 @@ Calculate density-based clusters \subsection{Returns}{ density-based clusters } -\subsection{Examples}{ -\if{html}{\out{
}} -\preformatted{ -\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a KNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', embeddingType = 'UMAP', M=20, perplexity=30, gamma=1/20) -p2_object$getDensityClusters(type='PCA') -} - -} -\if{html}{\out{
}} - -} - } \if{html}{\out{
}} \if{html}{\out{}} @@ -1057,35 +627,6 @@ List with each element of the list corresponding to a cell group in the provided highest- a boolean flag indicating whether the expression of a given gene in a given vcell group was on average higher than in every other cell group fe - fraction of cells in a given group having non-zero expression level of a given gene } -\subsection{Examples}{ -\if{html}{\out{
}} -\preformatted{\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a KNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') -## Call clusters based on KNN -p2_object$getKnnClusters(method=multilevel.community,type='PCA',name='multilevel') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', embeddingType = 'UMAP', M=20, perplexity=30, gamma=1/20) -## Perform differential expression -p2_object$getDifferentialGenes(type='PCA',verbose=TRUE,clusterType='multilevel') -} - -} -\if{html}{\out{
}} - -} - } \if{html}{\out{
}} \if{html}{\out{}} @@ -1162,46 +703,19 @@ Recalculate library sizes using robust regression within clusters \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{clusterType}}{Optional cluster type to use as a group-defining factor (default=NULL)} +\item{\code{clusterType}}{Name of cluster to access (default=NULL). If NULL, takes the most recently generated clustering. Parameter ignored if groups is not NULL.} -\item{\code{groups}}{factor named with cell names specifying the clusters of cells to be compared (one against all) (default=NULL). To compare two cell clusters against each other, simply pass a factor containing only two levels.} +\item{\code{groups}}{factor named with cell names specifying the clusters of cells (default=NULL)} -\item{\code{type}}{string Data type (default='counts'). Currently only 'counts' supported.} +\item{\code{type}}{string Either 'counts' or the name of a stored embedding, names(self$embeddings) (default=NULL)} -\item{\code{n.cores}}{numeric Number of cores to use (default=1)} +\item{\code{n.cores}}{numeric Number of cores to use (default=self$n.cores=1)} } \if{html}{\out{
}} } \subsection{Returns}{ recalculated library sizes } -\subsection{Examples}{ -\if{html}{\out{
}} -\preformatted{\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a kNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') -## Call clusters based on kNN -p2_object$getKnnClusters(method=infomap.community, type='PCA') -p2_object$getRefinedLibSizes(type='PCA') -lib.sizes <- p2_object$getRefinedLibSizes(type="PCA") -} - -} -\if{html}{\out{
}} - -} - } \if{html}{\out{
}} \if{html}{\out{}} @@ -1266,38 +780,6 @@ Plot heatmap for a given set of genes \subsection{Returns}{ plot of gene heatmap } -\subsection{Examples}{ -\if{html}{\out{
}} -\preformatted{\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a kNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') -## Call clusters based on kNN -p2_object$getKnnClusters(method=multilevel.community,type='PCA',name='multilevel') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', embeddingType = 'UMAP', M=20, perplexity=30, gamma=1/20) -## Perform differential expression -p2_object$getDifferentialGenes(type='PCA',verbose=TRUE,clusterType='multilevel') -de <- p2_object$diffgenes$PCA[[1]][['2']] -p2_object$plotGeneHeatmap(genes=rownames(de)[1:15], - groups=p2_object$clusters$PCA[[1]], cluster.genes=TRUE) -} - -} -\if{html}{\out{
}} - -} - } \if{html}{\out{
}} \if{html}{\out{}} @@ -1341,37 +823,6 @@ Show embedding \subsection{Returns}{ plot of the embedding } -\subsection{Examples}{ -\if{html}{\out{
}} -\preformatted{\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a kNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') -## Call clusters based on kNN -p2_object$getKnnClusters(method=multilevel.community,type='PCA',name='multilevel') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', embeddingType = 'UMAP', M=20, perplexity=30, gamma=1/20) -library(ggplot2) -p2_object$plotEmbedding(type='PCA', show.legend=FALSE, mark.groups=TRUE, min.cluster.size=50, - shuffle.colors=FALSE, font.size=1, alpha=0.1, title='clusters (UMAP)', - plot.theme=theme(plot.title = element_text(hjust = 0.5))) -} - -} -\if{html}{\out{
}} - -} - } \if{html}{\out{
}} \if{html}{\out{}} @@ -1400,36 +851,6 @@ Get overdispersed genes \subsection{Returns}{ vector of overdispersed genes } -\subsection{Examples}{ -\if{html}{\out{
}} -\preformatted{\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a kNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') -## Call clusters based on kNN -p2_object$getKnnClusters(method=infomap.community, type='PCA') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', M=20, perplexity=30, gamma=1/20) -## Perform differential expression -p2_object$getDifferentialGenes(type='PCA',verbose=TRUE) -odGenes <- p2_object$getOdGenes(use.unadjusted.pvals=FALSE) -} - -} -\if{html}{\out{
}} - -} - } \if{html}{\out{
}} \if{html}{\out{}} @@ -1452,26 +873,6 @@ Return variance-normalized matrix for specified genes or a number of OD genes \subsection{Returns}{ cell by gene matrix } -\subsection{Examples}{ -\if{html}{\out{
}} -\preformatted{\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -p2_object$getNormalizedExpressionMatrix() -} - -} -\if{html}{\out{
}} - -} - } \if{html}{\out{
}} \if{html}{\out{}} @@ -1530,27 +931,6 @@ Calculate PCA reduction of the data \subsection{Returns}{ Invisible PCA result (the reduction itself is saved in self$reductions[[name]])" } -\subsection{Examples}{ -\if{html}{\out{
}} -\preformatted{\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=600) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts, log.scale=FALSE, min.cells.per.gene=30, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=15) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=2e3) -} - -} -\if{html}{\out{
}} - -} - } \if{html}{\out{
}} \if{html}{\out{}} @@ -1616,36 +996,6 @@ Reset overdispersed genes 'odgenes' to be a superset of the standard odgene sele \subsection{Returns}{ List of overdispersed genes } -\subsection{Examples}{ -\if{html}{\out{
}} -\preformatted{\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm, min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a kNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=50, type='PCA', center=TRUE, distance='cosine') -## Call clusters based on kNN -p2_object$getKnnClusters(method=infomap.community, type='PCA') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', M=20, perplexity=30, gamma=1/20) -## Perform differential expression -p2_object$getDifferentialGenes(type='PCA',verbose=TRUE) -p2_object$expandOdGenes(type='PCA') -} - -} -\if{html}{\out{
}} - -} - } \if{html}{\out{
}} \if{html}{\out{}} @@ -1873,33 +1223,6 @@ between \code{0} and \code{1}, the default value will be multiplied by the param \subsection{Returns}{ embedding stored in self$embedding } -\subsection{Examples}{ -\if{html}{\out{
}} -\preformatted{\donttest{ -## Load pre-generated a dataset of 3000 bone marrow cells as matrix -cm <- p2data::sample_BM1 -## Perform QC, i.e. filter any cells that -counts <- gene.vs.molecule.cell.filter(cm,min.cell.size=500) -rownames(counts) <- make.unique(rownames(counts)) -## Generate Pagoda2 object -p2_object <- Pagoda2$new(counts,log.scale=TRUE, min.cells.per.gene=10, n.cores=1) -## Normalize gene expression variance -p2_object$adjustVariance(plot=TRUE, gam.k=10) -## Reduce the dataset dimensions by running PCA -p2_object$calculatePcaReduction(nPcs=50, n.odgenes=3e3) -## Generate a kNN graph of cells that will allow us to identify clusters of cells -p2_object$makeKnnGraph(k=40, type='PCA', center=TRUE, distance='cosine') -## Call clusters based on kNN -p2_object$getKnnClusters(method=infomap.community, type='PCA') -## Generate embedding of the data -p2_object$getEmbedding(type='PCA', embeddingType = 'UMAP', M=30, perplexity=30, gamma=1/30) -} - -} -\if{html}{\out{
}} - -} - } \if{html}{\out{
}} \if{html}{\out{}} diff --git a/man/basicP2proc.Rd b/man/basicP2proc.Rd index 7f2098bb..6f953e2f 100644 --- a/man/basicP2proc.Rd +++ b/man/basicP2proc.Rd @@ -60,12 +60,3 @@ Perform basic 'pagoda2' processing, i.e. adjust variance, calculate pca reductio make knn graph, identify clusters with multilevel, and generate largeVis and tSNE embeddings. } -\examples{ -\donttest{ -## load count matrix -cm <- p2data::sample_BM1 -## perform basic p2 processing -p2 <- basicP2proc(cm) -} - -} diff --git a/man/multi2dend.Rd b/man/multi2dend.Rd deleted file mode 100644 index 56a7f285..00000000 --- a/man/multi2dend.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpers.R -\name{multi2dend} -\alias{multi2dend} -\title{Translate multilevel segmentation into a dendrogram, with the lowest level of the dendrogram listing the cells} -\usage{ -multi2dend(cl, counts, deep = FALSE, dist = "cor") -} -\arguments{ -\item{cl}{clusters} - -\item{counts}{matrix of counts} - -\item{deep}{boolean (default=FALSE)} - -\item{dist}{character vector Distance metric (default='cor')} -} -\value{ -cell dendrogram -} -\description{ -Translate multilevel segmentation into a dendrogram, with the lowest level of the dendrogram listing the cells -} -\keyword{internal} diff --git a/man/p2.generate.dr.go.Rd b/man/p2.generate.dr.go.Rd index f6cc3c49..ac011a4b 100644 --- a/man/p2.generate.dr.go.Rd +++ b/man/p2.generate.dr.go.Rd @@ -15,11 +15,3 @@ a GO environment object \description{ Generate a GO environment for human for overdispersion analysis for the the back end } -\examples{ -\donttest{ -cm <- p2data::sample_BM1 -p2 <- basicP2proc(cm) -p2.generate.dr.go(p2) -} - -} diff --git a/man/p2.generate.go.Rd b/man/p2.generate.go.Rd index a07c4ff0..bd400499 100644 --- a/man/p2.generate.go.Rd +++ b/man/p2.generate.go.Rd @@ -27,11 +27,3 @@ annotated at that GO term or to one of its child nodes in the GO ontology (defau \description{ Generate a GO environment for the organism specified } -\examples{ -\donttest{ -cm <- p2data::sample_BM1 -p2 <- basicP2proc(cm) -p2.generate.go(p2, organism='hs') -} - -} diff --git a/man/p2.generate.human.go.Rd b/man/p2.generate.human.go.Rd index 6eaa5402..faef0308 100644 --- a/man/p2.generate.human.go.Rd +++ b/man/p2.generate.human.go.Rd @@ -15,11 +15,3 @@ a GO environment object \description{ Generate a GO environment for human for overdispersion analysis for the the back end } -\examples{ -\donttest{ -cm <- p2data::sample_BM1 -p2 <- basicP2proc(cm) -p2.generate.human.go(p2) -} - -} diff --git a/man/p2.generate.mouse.go.Rd b/man/p2.generate.mouse.go.Rd index f09b7b77..cf367ba6 100644 --- a/man/p2.generate.mouse.go.Rd +++ b/man/p2.generate.mouse.go.Rd @@ -15,11 +15,3 @@ a GO environment object \description{ Generate a GO environment for mouse for overdispersion analysis for the the back end } -\examples{ -\donttest{ -cm <- p2data::sample_BM1 -p2 <- basicP2proc(cm) -p2.generate.mouse.go(p2) -} - -} diff --git a/man/projectKNNs.Rd b/man/projectKNNs.Rd index 15567cce..03516d14 100644 --- a/man/projectKNNs.Rd +++ b/man/projectKNNs.Rd @@ -84,16 +84,3 @@ connecting to the vertex. The reference implementation, however, uses the sum of difference was imperceptible with small (MNIST-size) datasets, but the results seems aesthetically preferrable using degree. The default is to use the edge weights, consistent with the reference implementation. } -\examples{ -\donttest{ -data(CO2) -CO2$Plant <- as.integer(CO2$Plant) -CO2$Type <- as.integer(CO2$Type) -CO2$Treatment <- as.integer(CO2$Treatment) -co <- scale(as.matrix(CO2)) -# Very small datasets often produce a warning regarding the alias table. This is safely ignored. -suppressWarnings(vis <- largeVis(t(co), K = 20, sgd_batches = 1, threads = 2)) -suppressWarnings(coords <- projectKNNs(vis$wij, threads = 2)) -plot(t(coords)) -} -} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index b55bd379..9650b58a 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -147,18 +147,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// jsDist -arma::mat jsDist(const arma::mat& m, int ncores); -RcppExport SEXP _pagoda2_jsDist(SEXP mSEXP, SEXP ncoresSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const arma::mat& >::type m(mSEXP); - Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP); - rcpp_result_gen = Rcpp::wrap(jsDist(m, ncores)); - return rcpp_result_gen; -END_RCPP -} // orderColumnRows arma::ivec orderColumnRows(const arma::ivec& p, arma::ivec& i); RcppExport SEXP _pagoda2_orderColumnRows(SEXP pSEXP, SEXP iSEXP) { @@ -306,7 +294,6 @@ static const R_CallMethodDef CallEntries[] = { {"_pagoda2_colSumByFac", (DL_FUNC) &_pagoda2_colSumByFac, 2}, {"_pagoda2_inplaceColMult", (DL_FUNC) &_pagoda2_inplaceColMult, 4}, {"_pagoda2_inplaceWinsorizeSparseCols", (DL_FUNC) &_pagoda2_inplaceWinsorizeSparseCols, 3}, - {"_pagoda2_jsDist", (DL_FUNC) &_pagoda2_jsDist, 2}, {"_pagoda2_orderColumnRows", (DL_FUNC) &_pagoda2_orderColumnRows, 2}, {"_pagoda2_smatColVecCorr", (DL_FUNC) &_pagoda2_smatColVecCorr, 3}, {"_pagoda2_arma_mat_cor", (DL_FUNC) &_pagoda2_arma_mat_cor, 1}, diff --git a/src/misc2.cpp b/src/misc2.cpp index 5f248d3c..70c26b0d 100644 --- a/src/misc2.cpp +++ b/src/misc2.cpp @@ -88,7 +88,7 @@ Rcpp::DataFrame colMeanVarS(SEXP sY, SEXP rowSel, int ncores=1) { double m=sum(ly)/nrows; meanV[g]=m; - ly-=m; + ly-=m; // ly%=ly; *copyly%=ly; // we need to avoid warning which checks whether assignment operation survives self-assignment. ly = *copyly; @@ -226,30 +226,6 @@ int inplaceWinsorizeSparseCols(SEXP sY,const int n, int ncores=1) { } -// JS distance metric (sqrt(JS div)) between the columns of a dense matrix m -// returns vectorized version of the lower triangle (as R dist oject) -// [[Rcpp::export]] -arma::mat jsDist(const arma::mat& m, int ncores=1) { - //arma::vec d(m.n_cols*(m.n_cols-1)/2); - arma::mat d(m.n_cols,m.n_cols,arma::fill::zeros); -//#pragma omp parallel for num_threads(ncores) shared(d) - for(int i=0;i<(m.n_cols-1);i++) { - arma::vec li=log(m.col(i)); - for(int j=i+1;j