From 9c1b3e83714e8a08077b7ca541c7635483ce1b0e Mon Sep 17 00:00:00 2001 From: Catalina Vallejos Date: Tue, 26 Sep 2017 17:49:30 +0100 Subject: [PATCH 1/7] No-spikes implementation - `makeExampleBASiCS_Data` + `newBASiCS_Data` adapted to no-spikes case - Temporary vignette file for no-spikes case has been created - `HiddenBASiCS_MCMCcppNoSpikes` updated to recycle vertical integration update functions --- NEWS | 6 + R/BASiCS_MCMC.R | 11 +- R/RcppExports.R | 4 +- R/makeExampleBASiCS_Data.R | 29 +- R/newBASiCS_Data.R | 59 ++-- man/newBASiCS_Data.Rd | 2 +- src/BASiCS_CPPcode.cpp | 499 ++++++++++++++-------------------- src/RcppExports.cpp | 17 +- vignettes/BASiCS_nospikes.Rmd | 484 +++++++++++++++++++++++++++++++++ 9 files changed, 763 insertions(+), 348 deletions(-) create mode 100644 vignettes/BASiCS_nospikes.Rmd diff --git a/NEWS b/NEWS index 9808676c..bff14a7b 100644 --- a/NEWS +++ b/NEWS @@ -591,6 +591,12 @@ version 0.99.4: (2017-09-25) version 0.99.5: (2017-09-26) - Change of default `PriorDelta` value in `BASiCS_MCMC` - Unit tests modified accordingly + +No-spikes implementation +- `makeExampleBASiCS_Data` + `newBASiCS_Data` adapted to no-spikes case +- Temporary vignette file for no-spikes case has been created +- `HiddenBASiCS_MCMCcppNoSpikes` updated to recycle vertical integration +update functions - DEVELOPMENT NOTES (potential future changes): * Delete BASiCS_VarianceDecomp? diff --git a/R/BASiCS_MCMC.R b/R/BASiCS_MCMC.R index e99815dc..cc892d1b 100644 --- a/R/BASiCS_MCMC.R +++ b/R/BASiCS_MCMC.R @@ -418,7 +418,8 @@ BASiCS_MCMC <- function(Data, N, Thin, Burn, ...) { as.matrix(assay(Data)), BatchDesign, mu0, delta0, - phi0, nu0, theta0, + phi0, nu0, + rep(theta0, nBatch), PriorParam$s2.mu, PriorParam$a.delta, PriorParam$b.delta, @@ -426,19 +427,19 @@ BASiCS_MCMC <- function(Data, N, Thin, Burn, ...) { PriorParam$b.phi, PriorParam$a.theta, PriorParam$b.theta, + PriorParam$s2.delta, + PriorDeltaNum, AR, ls.mu0, ls.delta0, - ls.nu0, ls.theta0, + ls.nu0, + rep(ls.theta0, nBatch), sum.bycell.all, sum.bygene.all, StoreAdaptNumber, StopAdapt, as.numeric(PrintProgress), - PriorParam$s2.delta, - PriorDeltaNum, metadata(Data)$BatchInfo, BatchIds, - as.vector(BatchSizes), BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, diff --git a/R/RcppExports.R b/R/RcppExports.R index 86cf8f98..91caeb30 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -9,7 +9,7 @@ HiddenBASiCS_DenoisedRates <- function(CountsBio, Mu, TransInvDelta, PhiNu, N, q .Call('_BASiCS_HiddenBASiCS_DenoisedRates', PACKAGE = 'BASiCS', CountsBio, Mu, TransInvDelta, PhiNu, N, q0, n) } -HiddenBASiCS_MCMCcppNoSpikes <- function(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, phi0, nu0, theta0, s2mu, adelta, bdelta, aphi, bphi, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, s2delta, prior_delta, BatchInfo, BatchIds, BatchSizes, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType) { - .Call('_BASiCS_HiddenBASiCS_MCMCcppNoSpikes', PACKAGE = 'BASiCS', N, Thin, Burn, Counts, BatchDesign, mu0, delta0, phi0, nu0, theta0, s2mu, adelta, bdelta, aphi, bphi, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, s2delta, prior_delta, BatchInfo, BatchIds, BatchSizes, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType) +HiddenBASiCS_MCMCcppNoSpikes <- function(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, phi0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, aphi, bphi, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType) { + .Call('_BASiCS_HiddenBASiCS_MCMCcppNoSpikes', PACKAGE = 'BASiCS', N, Thin, Burn, Counts, BatchDesign, mu0, delta0, phi0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, aphi, bphi, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType) } diff --git a/R/makeExampleBASiCS_Data.R b/R/makeExampleBASiCS_Data.R index 3ce84a65..c341d6aa 100644 --- a/R/makeExampleBASiCS_Data.R +++ b/R/makeExampleBASiCS_Data.R @@ -34,17 +34,17 @@ makeExampleBASiCS_Data <- function(WithBatch = FALSE, WithSpikes = TRUE) { Mu <- c( 8.36, 10.65, 4.88, 6.29, 21.72, 12.93, 30.19, 83.92, 3.89, 6.34, - 57.87, 12.45, 8.08, 7.31, 15.56, 15.91, 12.24, 15.96, 19.14, 4.20, + 57.87, 12.45, 8.08, 7.31, 15.56, 15.91, 12.24, 15.96, 19.14, 4.20, 6.75, 27.74, 8.88, 21.21, 19.89, 7.14, 11.09, 7.19, 20.64, 73.9, 9.05, 6.13, 16.40, 6.08, 17.89, 6.98, 10.25, 14.05, 8.14, 5.67, 6.95, 11.16, 11.83, 7.56,159.05, 16.41, 4.58, 15.46, 10.96, 25.05, - 18.54, 8.5, 4.05, 5.37, 4.63, 4.08, 3.75, 5.02, 27.74, 10.28, + 18.54, 8.5, 4.05, 5.37, 4.63, 4.08, 3.75, 5.02, 27.74, 10.28, 3.91, 13.10, 8.23, 3.64, 80.77, 20.36, 3.47, 20.12, 13.29, 7.92, - 25.51,173.01, 27.71, 4.89, 33.10, 3.42, 6.19, 4.29, 5.19, 8.36, - 10.27, 6.86, 5.72, 37.25, 3.82, 23.97, 5.80, 14.30, 29.07, 5.30, + 25.51,173.01, 27.71, 4.89, 33.10, 3.42, 6.19, 4.29, 5.19, 8.36, + 10.27, 6.86, 5.72, 37.25, 3.82, 23.97, 5.80, 14.30, 29.07, 5.30, 7.47, 8.44, 4.24, 16.15, 23.39,120.22, 8.92, 97.15, 9.75, 10.07, - 1010.72, 7.90, 31.59, 63.17, 1.97,252.68, 31.59, 31.59, 31.59, 63.17, - 4042.89,4042.89, 3.95,126.34,252.68, 7.90, 15.79,126.34, 3.95, 15.79) + 1010.72, 7.90, 31.59, 63.17, 1.97,252.68, 31.59, 31.59, 31.59, 63.17, + 4042.89,4042.89, 3.95,126.34,252.68, 7.90, 15.79,126.34, 3.95, 15.79) Delta <- c(1.29, 0.88, 1.51, 1.49, 0.54, 0.40, 0.85, 0.27, 0.53, 1.31, 0.26, 0.81, 0.72, 0.70, 0.96, 0.58, 1.15, 0.82, 0.25, 5.32, 1.13, 0.31, 0.66, 0.27, 0.76, 1.39, 1.18, 1.57, 0.55, 0.17, @@ -59,14 +59,14 @@ makeExampleBASiCS_Data <- function(WithBatch = FALSE, 1.09, 1.16, 1.19, 1.14, 0.87, 1.10, 0.48, 1.06, 0.94, 0.97) S <- c(0.38, 0.40, 0.38, 0.39, 0.34, 0.39, 0.31, 0.39, 0.40, 0.37, 0.38, 0.40, 0.38, 0.39, 0.34, 0.39, 0.31, 0.39, 0.40, 0.37) - + Mu <- c(Mu[1:50], Mu[101:120]) Delta <- Delta[1:50] - + q <- length(Mu) q.bio <- length(Delta) n <- length(Phi) - + if (!WithBatch) { Theta <- 0.5 } else { @@ -75,7 +75,7 @@ makeExampleBASiCS_Data <- function(WithBatch = FALSE, Theta2 <- 0.75 Theta <- ifelse(seq_len(n) <= 10, Theta1, Theta2) } - + if (WithSpikes) { # Matrix where simulated counts will be stored @@ -115,7 +115,7 @@ makeExampleBASiCS_Data <- function(WithBatch = FALSE, rownames(Counts.sim) <- paste0("Gene", seq_len(q)) SpikeInfo <- data.frame(paste0("Gene", seq(q.bio+1, q)), Mu[seq(q.bio+1, q)]) - + if (!WithBatch) { Data <- newBASiCS_Data(Counts = Counts.sim, @@ -160,12 +160,11 @@ makeExampleBASiCS_Data <- function(WithBatch = FALSE, Counts.sim[i, ] <- rpois(n, lambda = Nu * Rho[i, ] * Mu[i]) } rownames(Counts.sim) <- paste0("Gene", seq_len(q.bio)) - + Data <- newBASiCS_Data(Counts = Counts.sim, Tech = rep(FALSE, q.bio), - SpikeInfo = 1, BatchInfo = c(rep(1, 10), rep(2, 10))) - + } return(Data) -} +} \ No newline at end of file diff --git a/R/newBASiCS_Data.R b/R/newBASiCS_Data.R index 1318805c..f5a4d16a 100644 --- a/R/newBASiCS_Data.R +++ b/R/newBASiCS_Data.R @@ -48,54 +48,65 @@ #' @references #' Vallejos, Marioni and Richardson (2015). PLoS Computational Biology. #' -newBASiCS_Data <- function(Counts, Tech, SpikeInfo, BatchInfo = NULL) +newBASiCS_Data <- function(Counts, + Tech, + SpikeInfo = NULL, + BatchInfo = NULL) { # Validity checks for SpikeInfo - if (!is.data.frame(SpikeInfo)) - stop("'SpikeInfo' must be a 'data.frame'") - if (data.table::is.data.table(SpikeInfo)) + if(!is.null(SpikeInfo)) + { + if (!is.data.frame(SpikeInfo)) stop("'SpikeInfo' must be a 'data.frame'") - + if (data.table::is.data.table(SpikeInfo)) + stop("'SpikeInfo' must be a 'data.frame'") + } + else + { + message("Spikes information is not provided.\n", + "Only the horizontal integration method can be used.") + } + if (is.null(BatchInfo)) { BatchInfo <- rep(1, times = ncol(Counts)) } - + # Re-ordering genes Counts <- as.matrix(rbind(Counts[!Tech, ], Counts[Tech, ])) Tech <- c(Tech[!Tech], Tech[Tech]) GeneName <- rownames(Counts) - + if (!is.null(SpikeInfo)) { # Extracting spike-in input molecules in the correct order if (sum(!(GeneName[Tech] %in% SpikeInfo[, 1])) > 0) - stop("'SpikeInfo' is missing information for some of the spikes") + stop("'SpikeInfo' is missing information for some of the spikes") if (sum(!(SpikeInfo[, 1] %in% GeneName[Tech])) > 0) - stop("'SpikeInfo' includes spikes that are not in 'Counts'") + stop("'SpikeInfo' includes spikes that are not in 'Counts'") matching <- match(GeneName[Tech], SpikeInfo[, 1]) SpikeInput <- SpikeInfo[matching, 2] } else { SpikeInput <- 1 } - + # Checks to assess if the data contains the required information errors <- HiddenChecksBASiCS_Data(Counts, Tech, SpikeInput, GeneName, BatchInfo) if (length(errors) > 0) stop(errors) - + # Create a SingleCellExperiment data object Data <- SingleCellExperiment::SingleCellExperiment( - assays = list(Counts = as.matrix(Counts)), - metadata = list(SpikeInput = SpikeInput, - BatchInfo = BatchInfo)) + assays = list(Counts = as.matrix(Counts)), + metadata = list(SpikeInput = SpikeInput, + BatchInfo = BatchInfo)) isSpike(Data, "ERCC") <- Tech - + message("\n", "NOTICE: BASiCS requires a pre-filtered dataset \n", - " - You must remove poor quality cells before hand \n", - " - We recommend to pre-filter lowly expressed transcripts. \n", - " Inclusion criteria may vary for each data. \n", - " For example, remove transcripts: \n", - " - with low total counts across of all of the cells \n", - " - that are only expressed in a few cells \n", - " (genes expressed in only 1 cell are not accepted) \n", - "\n BASiCS_Filter can be used for this purpose. \n") + " - You must remove poor quality cells before hand \n", + " - We recommend to pre-filter lowly expressed transcripts. \n", + " Inclusion criteria may vary for each data. \n", + " For example, remove transcripts: \n", + " - with low total counts across of all of the cells \n", + " - that are only expressed in a few cells \n", + " (genes expressed in only 1 cell are not accepted) \n", + "\n BASiCS_Filter can be used for this purpose. \n") return(Data) -} +} \ No newline at end of file diff --git a/man/newBASiCS_Data.Rd b/man/newBASiCS_Data.Rd index 220d06e8..86b0e619 100644 --- a/man/newBASiCS_Data.Rd +++ b/man/newBASiCS_Data.Rd @@ -5,7 +5,7 @@ \title{Creates a SingleCellExperiment object from a matrix of expression counts and experimental information about spike-in genes} \usage{ -newBASiCS_Data(Counts, Tech, SpikeInfo, BatchInfo = NULL) +newBASiCS_Data(Counts, Tech, SpikeInfo = NULL, BatchInfo = NULL) } \arguments{ \item{Counts}{Matrix of dimensions \code{q} times \code{n} whose elements diff --git a/src/BASiCS_CPPcode.cpp b/src/BASiCS_CPPcode.cpp index b057b6f8..bb347f49 100644 --- a/src/BASiCS_CPPcode.cpp +++ b/src/BASiCS_CPPcode.cpp @@ -1264,152 +1264,6 @@ arma::mat muUpdateNoSpikes( return join_rows(mu, ind); } -/* Metropolis-Hastings updates of delta - * Updates are implemented simulateaneously for all biological genes - */ -arma::mat deltaUpdateNoSpikes( - arma::vec const& delta0, - arma::vec const& prop_var, - arma::mat const& Counts, - arma::vec const& mu, - arma::vec const& nu, - double const& a_delta, - double const& b_delta, - int const& q0, - int const& n, - double const& s2delta, - double const& prior_delta) -{ - using arma::span; - - // CREATING VARIABLES WHERE TO STORE DRAWS - arma::vec delta = arma::zeros(q0); - arma::vec ind = arma::zeros(q0); - - // PROPOSAL STEP - arma::vec y = exp(arma::randn(q0) % sqrt(prop_var) + log(delta0)); - arma::vec u = arma::randu(q0); - - // ACCEPT/REJECT STEP - arma::vec log_aux = - n * (lgamma_cpp(1/y)-lgamma_cpp(1/delta0)); - // +1 should appear because we update log(delta) not delta. - // However, it cancels out with the prior. - log_aux -= n * ( (log(y)/y) - (log(delta0)/delta0) ); - - // Loop to replace matrix operations, through genes and cells - for (int i=0; i < q0; i++) - { - for (int j=0; j < n; j++) - { - log_aux(i) += R::lgammafn(Counts(i,j) + (1/y(i))) - R::lgammafn(Counts(i,j) + (1/delta0(i))); - log_aux(i) -= ( Counts(i,j) + (1/y(i)) ) * log( nu(j)*mu(i)+(1/y(i)) ); - log_aux(i) += ( Counts(i,j) + (1/delta0(i)) ) * log( nu(j)*mu(i)+(1/delta0(i)) ); - } - } - - // Component related to the prior - if(prior_delta == 1) {log_aux += (log(y) - log(delta0)) * a_delta - b_delta * (y - delta0);} - else {log_aux -= (0.5/s2delta) * (pow(log(y),2) - pow(log(delta0),2));} - - // CREATING OUTPUT VARIABLE & DEBUG - for (int i=0; i < q0; i++) - { - if((arma::is_finite(log_aux(i)))) - { - if((y(i) > 1e-3) & (log(u(i)) < log_aux(i))) - { - // DEBUG: Reject very small values to avoid numerical issues - if(arma::is_finite(y(i))) {ind(i) = 1; delta(i) = y(i);} - else{ind(i) = 0; delta(i) = delta0(i);} - } - else{ind(i) = 0; delta(i) = delta0(i);} - } - // DEBUG: Reject values such that acceptance rate cannot be computed (due no numerical innacuracies) - // DEBUG: Reject values such that the proposed value is not finite (due no numerical innacuracies) - else - { - ind(i) = 0; delta(i) = delta0(i); - Rcpp::Rcout << "delta0(i): " << delta0(i) << std::endl; - Rcpp::Rcout << "y(i): " << y(i) << std::endl; - Rcpp::Rcout << "mu(i): " << mu(i) << std::endl; - Rcpp::Rcout << "SomeThing went wrong when updating delta " << i+1 << std::endl; - Rcpp::warning("If this error repeats often, please consider additional filter of the input dataset or use a smaller value for s2mu."); - } - } - - // OUTPUT - return join_rows(delta, ind); -} - -/* Auxiliary function required for some of the Metropolis-Hastings updates of nu */ -arma::mat UpdateAux_nuTrick( - arma::vec const& nu, /* Auxiliary variable (function of the current and proposed value of nu) */ -arma::vec const& mu, /* Current value of $\mu=(\mu_1,...,\mu_q)'$ */ -arma::vec const& delta, /* Current value of $\delta=(\delta_1,...,\delta_{q_0})'$ */ -arma::vec const& phi) /* Current value of $\phi=(\phi_1,...,\phi_n)$' */ -{ - arma::mat x = ((phi % nu) * mu.t()).t(); - x.each_col() += 1/delta; - return x; -} - -/* Metropolis-Hastings updates of nu - * Updates are implemented simulateaneously for all cells, significantly reducing the computational burden. - */ -arma::mat nuUpdateNoSpikes( - arma::vec const& nu0, /* Current value of $\nu=(\nu_1,...,\nu_n)'$ */ - arma::vec const& prop_var, /* Current value of the proposal variances for $\nu=(\nu_1,...,\nu_n)'$ */ - arma::mat const& Counts, /* $q \times n$ matrix of expression counts (technical genes at the bottom) */ - arma::mat const& BatchDesign, - arma::vec const& mu, /* Current value of $\mu=(\mu_1,...,\mu_q)'$ */ - arma::vec const& delta, /* Current value of $\delta=(\delta_1,...,\delta_{q_0})'$ */ - arma::vec const& phi, /* Current value of $\phi=(\phi_1,...,\phi_n)$' */ - arma::vec const& theta, /* Current value of $\theta$' */ - arma::vec const& sum_bygene_all, /* Sum of expression counts by gene (all genes) */ - int const& q0, - int const& n, - arma::vec const& BatchInfo, - arma::vec const& BatchIds, - int const& nBatch, - arma::vec const& BatchSizes, - arma::vec const& BatchOffSet) -{ - using arma::span; - - // CREATING VARIABLES WHERE TO STORE DRAWS - arma::vec lognu = log(nu0); - arma::vec thetaBatch = BatchDesign * theta; - - // PROPOSAL STEP - arma::vec y = arma::randn(n) % sqrt(prop_var) + lognu; - arma::vec u = arma::randu(n); - - // ACCEPT/REJECT STEP - arma::vec log_aux = (y - lognu) % (sum_bygene_all + 1/thetaBatch); - log_aux -= nu0 % (exp(y-lognu)-1) % (1/(thetaBatch % phi)); - arma::mat m = Counts.rows(0, q0 - 1); - m.each_col() += 1 / delta; - arma::mat num = UpdateAux_nuTrick(exp(y), mu, delta, arma::ones(n)); - num /= UpdateAux_nuTrick(nu0, mu, delta, arma::ones(n)); - m %= log(num); - log_aux -= sum(m, 0).t(); - arma::umat ind = log(u) < log_aux; - // DEBUG: Reject values such that acceptance rate cannot be computed (due no numerical innacuracies) - // DEBUG: Print warning message - ind.elem(find_nonfinite(log_aux)).zeros(); - if(size(find_nonfinite(log_aux),0)>0) - { - Rcpp::Rcout << "SomeThing went wrong when updating nu" << size(find_nonfinite(log_aux),0) << std::endl; - Rcpp::stop("Please consider additional filter of the input dataset."); - } - - // CREATING OUTPUT VARIABLE - arma::vec nu = ind % exp(y) + (1 - ind) % nu0; - - // OUTPUT - return join_rows(nu, arma::conv_to::from(ind)); -} - // [[Rcpp::export]] arma::mat HiddenBASiCS_DenoisedRates( NumericMatrix CountsBio, @@ -1450,6 +1304,36 @@ arma::mat HiddenBASiCS_DenoisedRates( /* MCMC sampler + * N: Total number of MCMC draws + * Thin: Thinning period for MCMC chain + * Burn: Burning period for MCMC chain + * Counts: Matrix of expression counts + * BatchDesign: Design matrix representing batch information + * mu0: Starting value for mu + * delta0: Starting value for delta + * phi0: Starting value for phi + * nu0: Starting value for nu + * theta0: Starting value for theta + * s2mu: prior variance for mu + * adelta: prior shape for delta (when using a gamma prior) + * bdelta: prior rate for delta (when using a gamma prior) + * s2delta: prior variance for delta (when using a log-normal prior) + * prior_delta: whether gamma or log-normal prior is used for delta + * aphi: Shape hyper-parameter for phi + * bphi: Rate hyper-parameter for phi + * atheta: prior shape for theta + * btheta: prior rate for theta + * ar: Optimal acceptance rate for adaptive Metropolis-Hastings updates + * LSmu0: Starting value of adaptive proposal variance of mu (log-scale) + * LSdelta0: Starting value of adaptive proposal variance of delta (log-scale) + * LSnu0: Starting value of adaptive proposal variance of nu (log-scale) + * LStheta0: Starting value of adaptive proposal variance of theta (log-scale) + * sumByCellAll: Sum of expression counts by cell (all genes) + * sumByGeneAll: Sum of expression counts by gene (all genes) + * StoreAdapt: whether to store adaptive variances + * EndAdapt: when to stop the adaptation + * PrintProgress: whether to print progress report + * ADD ADDITIONAL ARGS HERE! */ // [[Rcpp::export]] Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( @@ -1462,10 +1346,12 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( NumericVector delta0, // Starting value of $\delta=(\delta_1,...,\delta_{q_0})'$ NumericVector phi0, // Starting value of $\phi=(\phi_1,...,\phi_n)$'$ NumericVector nu0, // Starting value of $\nu=(\nu_1,...,\nu_n)$'$ - double theta0, // Starting value of $\theta$ + NumericVector theta0, // Starting value of $\theta$ double s2mu, double adelta, // Shape hyper-parameter of the Gamma($a_{\delta}$,$b_{\delta}$) prior assigned to each $\delta_i$ double bdelta, // Rate hyper-parameter of the Gamma($a_{\delta}$,$b_{\delta}$) prior assigned to each $\delta_i$ + double s2delta, + double prior_delta, double aphi, double bphi, double atheta, // Shape hyper-parameter of the Gamma($a_{\theta}$,$b_{\theta}$) prior assigned to $\theta$ @@ -1474,17 +1360,14 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( NumericVector LSmu0, // Starting value of adaptive proposal variance of $\mu=(\mu_1,...,\mu_q_0)'$ (log-scale) NumericVector LSdelta0, // Starting value of adaptive proposal variance of $\delta=(\delta_1,...,\delta_{q_0})'$ (log-scale) NumericVector LSnu0, // Starting value of adaptive proposal variance of $\nu=(\nu_1,...,\nu_n)'$ (log-scale) - double LStheta0, // Starting value of adaptive proposal variance of $\theta$ (log-scale) + NumericVector LStheta0, // Starting value of adaptive proposal variance of $\theta$ (log-scale) NumericVector sumByCellAll, // Sum of expression counts by cell (all genes) NumericVector sumByGeneAll, // Sum of expression counts by gene (all genes) int StoreAdapt, int EndAdapt, int PrintProgress, - double s2delta, - double prior_delta, NumericVector BatchInfo, NumericVector BatchIds, - NumericVector BatchSizes, NumericVector BatchOffSet, double Constrain, NumericVector Index, @@ -1494,31 +1377,39 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( NumericVector NotConstrainGene, int ConstrainType) { + + using arma::ones; + using arma::zeros; + using Rcpp::Rcout; + // NUMBER OF CELLS, GENES AND STORED DRAWS - int n = Counts.ncol(); int q0 = Counts.nrow(); int Naux = N/Thin - Burn/Thin; + int n = Counts.ncol(); int nBatch = BatchDesign.ncol(); + int q0 = delta0.size(); + int Naux = N/Thin - Burn/Thin; // TRANSFORMATION TO ARMA ELEMENTS arma::vec sumByCellAll_arma = as_arma(sumByCellAll); arma::vec sumByGeneAll_arma = as_arma(sumByGeneAll); - arma::mat Counts_arma = as_arma(Counts); arma::mat BatchDesign_arma = as_arma(BatchDesign); + arma::mat Counts_arma = as_arma(Counts); + arma::mat BatchDesign_arma = as_arma(BatchDesign); arma::vec BatchInfo_arma = as_arma(BatchInfo); arma::vec BatchIds_arma = as_arma(BatchIds); - arma::vec BatchSizes_arma = as_arma(BatchSizes); arma::vec BatchOffSet_arma = as_arma(BatchOffSet); - arma::vec mu0_arma = as_arma(mu0); - arma::vec phi0_arma = as_arma(phi0); arma::vec Index_arma = as_arma(Index); arma::uvec ConstrainGene_arma = Rcpp::as(ConstrainGene); arma::uvec NotConstrainGene_arma = Rcpp::as(NotConstrainGene); arma::vec RefGenes_arma = as_arma(RefGenes); + // OTHER GLOBAL QUANTITIES + arma::vec BatchSizes = sum(BatchDesign_arma,0).t(); + // OBJECTS WHERE DRAWS WILL BE STORED - arma::mat mu = arma::zeros(Naux, q0); - arma::mat delta = arma::zeros(Naux, q0); - arma::mat phi = arma::ones(Naux, n); - arma::mat nu = arma::zeros(Naux, n); - arma::mat theta = arma::zeros(Naux, nBatch); + arma::mat mu = zeros(q0, Naux); + arma::mat delta = zeros(q0, Naux); + arma::mat phi = ones(n, Naux); + arma::mat nu = zeros(n, Naux); + arma::mat theta = zeros(nBatch, Naux); arma::mat LSmu; arma::mat LSdelta; arma::mat LSnu; @@ -1527,70 +1418,86 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( // LOG-PROPOSAL VARIANCES if(StoreAdapt == 1) { - LSmu = arma::zeros(Naux, q0); - LSdelta = arma::zeros(Naux, q0); - LSnu = arma::zeros(Naux, n); - LStheta = arma::zeros(Naux, nBatch); + LSmu = zeros(q0, Naux); + LSdelta = zeros(q0, Naux); + LSnu = zeros(n, Naux); + LStheta = zeros(nBatch, Naux); } // ACCEPTANCE RATES FOR ADAPTIVE METROPOLIS-HASTINGS UPDATES - arma::vec muAccept = arma::zeros(q0); arma::vec PmuAux = arma::zeros(q0); - arma::vec deltaAccept = arma::zeros(q0); arma::vec PdeltaAux = arma::zeros(q0); - arma::vec nuAccept = arma::zeros(n); arma::vec PnuAux = arma::zeros(n); - arma::vec thetaAccept = arma::zeros(nBatch); arma::vec PthetaAux = arma::zeros(nBatch); + arma::vec muAccept = zeros(q0); arma::vec PmuAux = zeros(q0); + arma::vec deltaAccept = zeros(q0); arma::vec PdeltaAux = zeros(q0); + arma::vec nuAccept = zeros(n); arma::vec PnuAux = zeros(n); + arma::vec thetaAccept = zeros(nBatch); arma::vec PthetaAux = zeros(nBatch); + // INITIALIZATION OF VALUES FOR MCMC RUN - arma::mat muAux = arma::zeros(q0,2); muAux.col(0)=as_arma(mu0); arma::vec LSmuAux = as_arma(LSmu0); - arma::mat deltaAux = arma::zeros(q0,2); deltaAux.col(0)=as_arma(delta0); arma::vec LSdeltaAux = as_arma(LSdelta0); - arma::vec phiAux = phi0_arma; - arma::mat nuAux = arma::zeros(n,2); nuAux.col(0)=as_arma(nu0); arma::vec LSnuAux = as_arma(LSnu0); - arma::mat thetaAux = arma::zeros(nBatch, 2); thetaAux.col(0) = theta0 * arma::ones(nBatch); - arma::vec LSthetaAux = LStheta0 * arma::ones(nBatch); + arma::mat muAux = zeros(q0,2); muAux.col(0)=as_arma(mu0); + arma::mat deltaAux = zeros(q0,2); deltaAux.col(0)=as_arma(delta0); + arma::vec phiAux = as_arma(phi0); + arma::mat nuAux = zeros(n,2); nuAux.col(0)=as_arma(nu0); + arma::mat thetaAux = zeros(nBatch, 2); thetaAux.col(0) = as_arma(theta0); + arma::vec thetaBatch = BatchDesign_arma * as_arma(theta0); + + // INITIALIZATION OF ADAPTIVE VARIANCES + arma::vec LSmuAux = as_arma(LSmu0); + arma::vec LSdeltaAux = as_arma(LSdelta0); + arma::vec LSnuAux = as_arma(LSnu0); + arma::vec LSthetaAux = as_arma(LStheta0); + + // OTHER AUXILIARY QUANTITIES FOR ADAPTIVE METROPOLIS UPDATES - arma::vec PmuAux0 = arma::zeros(q0); arma::vec PdeltaAux0 = arma::zeros(q0); - arma::vec PnuAux0 = arma::zeros(n); arma::vec PthetaAux0 = arma::ones(nBatch); + arma::vec PmuAux0 = zeros(q0); arma::vec PdeltaAux0 = zeros(q0); + arma::vec PnuAux0 = zeros(n); arma::vec PthetaAux0 = zeros(nBatch); - // BATCH INITIALIZATION FOR ADAPTIVE METROPOLIS UPDATES (RE-INITIALIZE EVERY 50 ITERATIONS) - int Ibatch = 0; int i; + // BATCH INITIALIZATION FOR ADAPTIVE METROPOLIS UPDATES + // (RE-INITIALIZE EVERY 50 ITERATIONS) + int Ibatch = 0; // INITIALIZATION OF PARAMETERS TO RETURN IN UPDATE FUNCTIONS - arma::vec muUpdateAux = arma::ones(q0); - arma::vec indQ = arma::zeros(q0); + // To avoid repeated initialisation + arma::vec y_q0 = ones(q0); arma::vec y_n = ones(n); + arma::vec ind_q0 = zeros(q0); arma::vec ind_n = zeros(n); + arma::vec u_q0 = zeros(q0); arma::vec u_n = zeros(n); + arma::vec ones_n = ones(n); + + // INITIALIZATION OF PARAMETERS TO RETURN IN UPDATE FUNCTIONS + arma::vec muUpdateAux = ones(q0); double LSmuAuxExtra = 0; - arma::vec RefFreq = arma::zeros(q0); + arma::vec RefFreq = zeros(q0); int RefAux; - arma::vec y_n = arma::ones(n); - - Rcpp::Rcout << "--------------------------------------------------------------------" << std::endl; - Rcpp::Rcout << "MCMC sampler has been started: " << N << " iterations to go." << std::endl; - Rcpp::Rcout << "--------------------------------------------------------------------" << std::endl; + Rcout << "-------------------------------------------------------------" << std::endl; + Rcout << "MCMC sampler has been started: " << N << " iterations to go." << std::endl; + Rcout << "-------------------------------------------------------------" << std::endl; // START OF MCMC LOOP - for (i=0; i=Burn) {thetaAccept += thetaAux.col(1);} -// Rcpp::Rcout << "theta" << thetaAux.col(0).t() << std::endl; - + thetaBatch = BatchDesign_arma * thetaAux.col(0); + + // UPDATE OF MU: 1st COLUMN IS THE UPDATE, 2nd COLUMN IS THE ACCEPTANCE INDICATOR // Additional steps required for ConstrainType = 3 (stochastic reference) if((ConstrainType == 3) | (ConstrainType == 5)) @@ -1601,26 +1508,29 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( } muAux = muUpdateNoSpikes(muAux.col(0), exp(LSmuAux), Constrain, Counts_arma, deltaAux.col(0), nuAux.col(0), sumByCellAll_arma, s2mu, q0, n, - muUpdateAux, indQ, RefGene, ConstrainGene_arma, NotConstrainGene_arma, + muUpdateAux, ind_q0, RefGene, ConstrainGene_arma, NotConstrainGene_arma, ConstrainType); PmuAux += muAux.col(1); if(i>=Burn) {muAccept += muAux.col(1);} -// Rcpp::Rcout << "mu" << muAux.col(0).t() << std::endl; + - // UPDATE OF DELTA: 1st COLUMN IS THE UPDATE, 2nd COLUMN IS THE ACCEPTANCE INDICATOR - deltaAux = deltaUpdateNoSpikes(deltaAux.col(0), exp(LSdeltaAux), Counts_arma, - muAux.col(0), nuAux.col(0), adelta, bdelta, q0, n, s2delta, prior_delta); + // UPDATE OF DELTA: + // 1st COLUMN IS THE UPDATE, + // 2nd COLUMN IS THE ACCEPTANCE INDICATOR + deltaAux = deltaUpdate(deltaAux.col(0), exp(LSdeltaAux), Counts_arma, + muAux.col(0), nuAux.col(0), + adelta, bdelta, s2delta, prior_delta, + q0, n, y_q0, u_q0, ind_q0); PdeltaAux += deltaAux.col(1); if(i>=Burn) {deltaAccept += deltaAux.col(1);} -// Rcpp::Rcout << "delta" << deltaAux.col(0).t() << std::endl; - - // UPDATE OF NU: 1st COLUMN IS THE UPDATE, 2nd COLUMN IS THE ACCEPTANCE INDICATOR - nuAux = nuUpdateNoSpikes(nuAux.col(0), exp(LSnuAux), Counts_arma, - BatchDesign_arma, - muAux.col(0), deltaAux.col(0), - phiAux, thetaAux.col(0), sumByGeneAll_arma, q0, n, - BatchInfo_arma, BatchIds_arma, nBatch, - BatchSizes_arma, BatchOffSet_arma); + + // UPDATE OF NU: + // 1st COLUMN IS THE UPDATE, + // 2nd COLUMN IS THE ACCEPTANCE INDICATOR + nuAux = nuUpdateBatch(nuAux.col(0), exp(LSnuAux), Counts_arma, 0, + BatchDesign_arma, + muAux.col(0), 1/deltaAux.col(0), + ones_n, phiAux, thetaBatch, sumByGeneAll_arma, q0, n, + y_n, u_n, ind_n); PnuAux += nuAux.col(1); if(i>=Burn) {nuAccept += nuAux.col(1);} -// Rcpp::Rcout << "nu" << nuAux.col(0).t() << std::endl; // STOP ADAPTING THE PROPOSAL VARIANCES AFTER EndAdapt ITERATIONS if(i < EndAdapt) @@ -1628,14 +1538,18 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( // UPDATE OF PROPOSAL VARIANCES (ONLY EVERY 50 ITERATIONS) if(Ibatch==50) { - PmuAux=PmuAux/(50-RefFreq); PmuAux = -1+2*arma::conv_to::from(PmuAux>ar);// + PmuAux = PmuAux / (50-RefFreq); + PmuAux = -1 + 2*arma::conv_to::from(PmuAux>ar); LSmuAux.elem(find(Index_arma != RefGene)) = LSmuAux.elem(find(Index_arma != RefGene)) + PmuAux.elem(find(Index_arma != RefGene))*0.1; - PdeltaAux=PdeltaAux/50; PdeltaAux = -1+2*arma::conv_to::from(PdeltaAux>ar); - LSdeltaAux=LSdeltaAux+PdeltaAux*0.1; - PnuAux=PnuAux/50; PnuAux = -1+2*arma::conv_to::from(PnuAux>ar); - LSnuAux=LSnuAux+PnuAux*0.1; - PthetaAux=PthetaAux/50; PthetaAux = -1+2*arma::conv_to::from(PthetaAux>ar); - LSthetaAux= LSthetaAux + PthetaAux*0.1; + PdeltaAux = PdeltaAux / 50; + PdeltaAux = -1 + 2*arma::conv_to::from(PdeltaAux>ar); + LSdeltaAux = LSdeltaAux + PdeltaAux*0.1; + PnuAux = PnuAux / 50; + PnuAux = -1 + 2*arma::conv_to::from(PnuAux>ar); + LSnuAux = LSnuAux + PnuAux*0.1; + PthetaAux = PthetaAux / 50; + PthetaAux = -1 + 2*arma::conv_to::from(PthetaAux>ar); + LSthetaAux = LSthetaAux + PthetaAux*0.1; Ibatch = 0; PmuAux = PmuAux0; PdeltaAux = PdeltaAux0; @@ -1643,92 +1557,93 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( } } + // STORAGE OF DRAWS if(i%Thin==0 & i>=Burn) - { - mu.row(i/Thin - Burn/Thin) = muAux.col(0).t(); - delta.row(i/Thin - Burn/Thin) = deltaAux.col(0).t(); - phi.row(i/Thin - Burn/Thin) = phiAux.t(); - nu.row(i/Thin - Burn/Thin) = nuAux.col(0).t(); - theta.row(i/Thin - Burn/Thin) = thetaAux.col(0).t(); + { + mu.col(i/Thin - Burn/Thin) = muAux.col(0); + delta.col(i/Thin - Burn/Thin) = deltaAux.col(0); + phi.col(i/Thin - Burn/Thin) = phiAux; + nu.col(i/Thin - Burn/Thin) = nuAux.col(0); + theta.col(i/Thin - Burn/Thin) = thetaAux.col(0); if(StoreAdapt == 1) { - LSmu.row(i/Thin - Burn/Thin) = LSmuAux.t(); - LSdelta.row(i/Thin - Burn/Thin) = LSdeltaAux.t(); - LSnu.row(i/Thin - Burn/Thin) = LSnuAux.t(); - LStheta.row(i/Thin - Burn/Thin) = LSthetaAux.t(); + LSmu.col(i/Thin - Burn/Thin) = LSmuAux; + LSdelta.col(i/Thin - Burn/Thin) = LSdeltaAux; + LSnu.col(i/Thin - Burn/Thin) = LSnuAux; + LStheta.col(i/Thin - Burn/Thin) = LSthetaAux; } } // PRINT IN CONSOLE SAMPLED VALUES FOR FEW SELECTED PARAMETERS if(i%(2*Thin) == 0 & PrintProgress == 1) { - Rcpp::Rcout << "--------------------------------------------------------------------" << std::endl; - Rcpp::Rcout << "MCMC iteration " << i << " out of " << N << " has been completed." << std::endl; - Rcpp::Rcout << "--------------------------------------------------------------------" << std::endl; - Rcpp::Rcout << "Current draws of some selected parameters are displayed below." << std::endl; - Rcpp::Rcout << "mu (gene 1): " << muAux(0,0) << std::endl; - Rcpp::Rcout << "delta (gene 1): " << deltaAux(0,0) << std::endl; - Rcpp::Rcout << "phi (cell 1): " << phiAux(0) << std::endl; - Rcpp::Rcout << "nu (cell 1): " << nuAux(0,0) << std::endl; - Rcpp::Rcout << "theta (batch 1): " << thetaAux(0,0) << std::endl; - Rcpp::Rcout << "--------------------------------------------------------------------" << std::endl; - Rcpp::Rcout << "Current proposal variances for Metropolis Hastings updates (log-scale)." << std::endl; - Rcpp::Rcout << "LSmu (gene 1): " << LSmuAuxExtra + LSmuAux(0) << std::endl; - Rcpp::Rcout << "LSdelta (gene 1): " << LSdeltaAux(0) << std::endl; - Rcpp::Rcout << "LSnu (cell 1): " << LSnuAux(0) << std::endl; - Rcpp::Rcout << "LStheta (batch 1): " << LSthetaAux(0) << std::endl; + Rcout << "--------------------------------------------------------------------" << std::endl; + Rcout << "MCMC iteration " << i << " out of " << N << " has been completed." << std::endl; + Rcout << "--------------------------------------------------------------------" << std::endl; + Rcout << "Current draws of some selected parameters are displayed below." << std::endl; + Rcout << "mu (gene 1): " << muAux(0,0) << std::endl; + Rcout << "delta (gene 1): " << deltaAux(0,0) << std::endl; + Rcout << "phi (cell 1): " << phiAux(0) << std::endl; + Rcout << "nu (cell 1): " << nuAux(0,0) << std::endl; + Rcout << "theta (batch 1): " << thetaAux(0,0) << std::endl; + Rcout << "--------------------------------------------------------------------" << std::endl; + Rcout << "Current proposal variances for Metropolis Hastings updates (log-scale)." << std::endl; + Rcout << "LSmu (gene 1): " << LSmuAuxExtra + LSmuAux(0) << std::endl; + Rcout << "LSdelta (gene 1): " << LSdeltaAux(0) << std::endl; + Rcout << "LSnu (cell 1): " << LSnuAux(0) << std::endl; + Rcout << "LStheta (batch 1): " << LSthetaAux(0) << std::endl; } } // ACCEPTANCE RATE CONSOLE OUTPUT - Rcpp::Rcout << " " << std::endl; - Rcpp::Rcout << "--------------------------------------------------------------------" << std::endl; - Rcpp::Rcout << "--------------------------------------------------------------------" << std::endl; - Rcpp::Rcout << "All " << N << " MCMC iterations have been completed." << std::endl; - Rcpp::Rcout << "--------------------------------------------------------------------" << std::endl; - Rcpp::Rcout << "--------------------------------------------------------------------" << std::endl; - Rcpp::Rcout << " " << std::endl; + Rcout << " " << std::endl; + Rcout << "--------------------------------------------------------------------" << std::endl; + Rcout << "--------------------------------------------------------------------" << std::endl; + Rcout << "All " << N << " MCMC iterations have been completed." << std::endl; + Rcout << "--------------------------------------------------------------------" << std::endl; + Rcout << "--------------------------------------------------------------------" << std::endl; + Rcout << " " << std::endl; // ACCEPTANCE RATE CONSOLE OUTPUT - Rcpp::Rcout << "--------------------------------------------------------------------" << std::endl; - Rcpp::Rcout << "Please see below a summary of the overall acceptance rates." << std::endl; - Rcpp::Rcout << "--------------------------------------------------------------------" << std::endl; - Rcpp::Rcout << " " << std::endl; - - Rcpp::Rcout << "Minimum acceptance rate among mu[i]'s: " << min(muAccept.elem(find(Index_arma != RefGene))/(N-Burn)) << std::endl; - Rcpp::Rcout << "Average acceptance rate among mu[i]'s: " << mean(muAccept.elem(find(Index_arma != RefGene))/(N-Burn)) << std::endl; - Rcpp::Rcout << "Maximum acceptance rate among mu[i]'s: " << max(muAccept.elem(find(Index_arma != RefGene))/(N-Burn)) << std::endl; - - Rcpp::Rcout << " " << std::endl; - Rcpp::Rcout << "Minimum acceptance rate among delta[i]'s: " << min(deltaAccept/(N-Burn)) << std::endl; - Rcpp::Rcout << "Average acceptance rate among delta[i]'s: " << mean(deltaAccept/(N-Burn)) << std::endl; - Rcpp::Rcout << "Average acceptance rate among delta[i]'s: " << max(deltaAccept/(N-Burn)) << std::endl; - Rcpp::Rcout << " " << std::endl; - Rcpp::Rcout << "Minimum acceptance rate among nu[jk]'s: " << min(nuAccept/(N-Burn)) << std::endl; - Rcpp::Rcout << "Average acceptance rate among nu[jk]'s: " << mean(nuAccept/(N-Burn)) << std::endl; - Rcpp::Rcout << "Maximum acceptance rate among nu[jk]'s: " << max(nuAccept/(N-Burn)) << std::endl; - Rcpp::Rcout << " " << std::endl; - Rcpp::Rcout << "Minimum acceptance rate among theta[k]'s: " << min(thetaAccept/(N-Burn)) << std::endl; - Rcpp::Rcout << "Average acceptance rate among theta[k]'s: " << mean(thetaAccept/(N-Burn)) << std::endl; - Rcpp::Rcout << "Maximum acceptance rate among theta[k]'s: " << max(thetaAccept/(N-Burn)) << std::endl; - Rcpp::Rcout << "--------------------------------------------------------------------" << std::endl; - Rcpp::Rcout << " " << std::endl; + Rcout << "--------------------------------------------------------------------" << std::endl; + Rcout << "Please see below a summary of the overall acceptance rates." << std::endl; + Rcout << "--------------------------------------------------------------------" << std::endl; + Rcout << " " << std::endl; + + Rcout << "Minimum acceptance rate among mu[i]'s: " << min(muAccept.elem(find(Index_arma != RefGene))/(N-Burn)) << std::endl; + Rcout << "Average acceptance rate among mu[i]'s: " << mean(muAccept.elem(find(Index_arma != RefGene))/(N-Burn)) << std::endl; + Rcout << "Maximum acceptance rate among mu[i]'s: " << max(muAccept.elem(find(Index_arma != RefGene))/(N-Burn)) << std::endl; + + Rcout << " " << std::endl; + Rcout << "Minimum acceptance rate among delta[i]'s: " << min(deltaAccept/(N-Burn)) << std::endl; + Rcout << "Average acceptance rate among delta[i]'s: " << mean(deltaAccept/(N-Burn)) << std::endl; + Rcout << "Average acceptance rate among delta[i]'s: " << max(deltaAccept/(N-Burn)) << std::endl; + Rcout << " " << std::endl; + Rcout << "Minimum acceptance rate among nu[jk]'s: " << min(nuAccept/(N-Burn)) << std::endl; + Rcout << "Average acceptance rate among nu[jk]'s: " << mean(nuAccept/(N-Burn)) << std::endl; + Rcout << "Maximum acceptance rate among nu[jk]'s: " << max(nuAccept/(N-Burn)) << std::endl; + Rcout << " " << std::endl; + Rcout << "Minimum acceptance rate among theta[k]'s: " << min(thetaAccept/(N-Burn)) << std::endl; + Rcout << "Average acceptance rate among theta[k]'s: " << mean(thetaAccept/(N-Burn)) << std::endl; + Rcout << "Maximum acceptance rate among theta[k]'s: " << max(thetaAccept/(N-Burn)) << std::endl; + Rcout << "--------------------------------------------------------------------" << std::endl; + Rcout << " " << std::endl; if(StoreAdapt == 1) { // OUTPUT (AS A LIST) return(Rcpp::List::create( - Rcpp::Named("mu") = mu, - Rcpp::Named("delta") = delta, - Rcpp::Named("phi") = phi, - Rcpp::Named("nu") = nu, - Rcpp::Named("theta") = theta, - Rcpp::Named("ls.mu") = LSmu, - Rcpp::Named("ls.delta") = LSdelta, - Rcpp::Named("ls.nu") = LSnu, - Rcpp::Named("ls.theta") = LStheta, + Rcpp::Named("mu") = mu.t(), + Rcpp::Named("delta") = delta.t(), + Rcpp::Named("phi") = phi.t(), + Rcpp::Named("nu") = nu.t(), + Rcpp::Named("theta") = theta.t(), + Rcpp::Named("ls.mu") = LSmu.t(), + Rcpp::Named("ls.delta") = LSdelta.t(), + Rcpp::Named("ls.nu") = LSnu.t(), + Rcpp::Named("ls.theta") = LStheta.t(), Rcpp::Named("RefFreq") = RefFreq/(N-Burn))); } @@ -1736,11 +1651,11 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( { // OUTPUT (AS A LIST) return(Rcpp::List::create( - Rcpp::Named("mu") = mu, - Rcpp::Named("delta") = delta, - Rcpp::Named("phi") = phi, - Rcpp::Named("nu") = nu, - Rcpp::Named("theta") = theta, + Rcpp::Named("mu") = mu.t(), + Rcpp::Named("delta") = delta.t(), + Rcpp::Named("phi") = phi.t(), + Rcpp::Named("nu") = nu.t(), + Rcpp::Named("theta") = theta.t(), Rcpp::Named("RefFreq") = RefFreq/(N-Burn))); } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 7507ba4c..ecb36545 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -69,8 +69,8 @@ BEGIN_RCPP END_RCPP } // HiddenBASiCS_MCMCcppNoSpikes -Rcpp::List HiddenBASiCS_MCMCcppNoSpikes(int N, int Thin, int Burn, NumericMatrix Counts, NumericMatrix BatchDesign, NumericVector mu0, NumericVector delta0, NumericVector phi0, NumericVector nu0, double theta0, double s2mu, double adelta, double bdelta, double aphi, double bphi, double atheta, double btheta, double ar, NumericVector LSmu0, NumericVector LSdelta0, NumericVector LSnu0, double LStheta0, NumericVector sumByCellAll, NumericVector sumByGeneAll, int StoreAdapt, int EndAdapt, int PrintProgress, double s2delta, double prior_delta, NumericVector BatchInfo, NumericVector BatchIds, NumericVector BatchSizes, NumericVector BatchOffSet, double Constrain, NumericVector Index, int RefGene, NumericVector RefGenes, NumericVector ConstrainGene, NumericVector NotConstrainGene, int ConstrainType); -RcppExport SEXP _BASiCS_HiddenBASiCS_MCMCcppNoSpikes(SEXP NSEXP, SEXP ThinSEXP, SEXP BurnSEXP, SEXP CountsSEXP, SEXP BatchDesignSEXP, SEXP mu0SEXP, SEXP delta0SEXP, SEXP phi0SEXP, SEXP nu0SEXP, SEXP theta0SEXP, SEXP s2muSEXP, SEXP adeltaSEXP, SEXP bdeltaSEXP, SEXP aphiSEXP, SEXP bphiSEXP, SEXP athetaSEXP, SEXP bthetaSEXP, SEXP arSEXP, SEXP LSmu0SEXP, SEXP LSdelta0SEXP, SEXP LSnu0SEXP, SEXP LStheta0SEXP, SEXP sumByCellAllSEXP, SEXP sumByGeneAllSEXP, SEXP StoreAdaptSEXP, SEXP EndAdaptSEXP, SEXP PrintProgressSEXP, SEXP s2deltaSEXP, SEXP prior_deltaSEXP, SEXP BatchInfoSEXP, SEXP BatchIdsSEXP, SEXP BatchSizesSEXP, SEXP BatchOffSetSEXP, SEXP ConstrainSEXP, SEXP IndexSEXP, SEXP RefGeneSEXP, SEXP RefGenesSEXP, SEXP ConstrainGeneSEXP, SEXP NotConstrainGeneSEXP, SEXP ConstrainTypeSEXP) { +Rcpp::List HiddenBASiCS_MCMCcppNoSpikes(int N, int Thin, int Burn, NumericMatrix Counts, NumericMatrix BatchDesign, NumericVector mu0, NumericVector delta0, NumericVector phi0, NumericVector nu0, NumericVector theta0, double s2mu, double adelta, double bdelta, double s2delta, double prior_delta, double aphi, double bphi, double atheta, double btheta, double ar, NumericVector LSmu0, NumericVector LSdelta0, NumericVector LSnu0, NumericVector LStheta0, NumericVector sumByCellAll, NumericVector sumByGeneAll, int StoreAdapt, int EndAdapt, int PrintProgress, NumericVector BatchInfo, NumericVector BatchIds, NumericVector BatchOffSet, double Constrain, NumericVector Index, int RefGene, NumericVector RefGenes, NumericVector ConstrainGene, NumericVector NotConstrainGene, int ConstrainType); +RcppExport SEXP _BASiCS_HiddenBASiCS_MCMCcppNoSpikes(SEXP NSEXP, SEXP ThinSEXP, SEXP BurnSEXP, SEXP CountsSEXP, SEXP BatchDesignSEXP, SEXP mu0SEXP, SEXP delta0SEXP, SEXP phi0SEXP, SEXP nu0SEXP, SEXP theta0SEXP, SEXP s2muSEXP, SEXP adeltaSEXP, SEXP bdeltaSEXP, SEXP s2deltaSEXP, SEXP prior_deltaSEXP, SEXP aphiSEXP, SEXP bphiSEXP, SEXP athetaSEXP, SEXP bthetaSEXP, SEXP arSEXP, SEXP LSmu0SEXP, SEXP LSdelta0SEXP, SEXP LSnu0SEXP, SEXP LStheta0SEXP, SEXP sumByCellAllSEXP, SEXP sumByGeneAllSEXP, SEXP StoreAdaptSEXP, SEXP EndAdaptSEXP, SEXP PrintProgressSEXP, SEXP BatchInfoSEXP, SEXP BatchIdsSEXP, SEXP BatchOffSetSEXP, SEXP ConstrainSEXP, SEXP IndexSEXP, SEXP RefGeneSEXP, SEXP RefGenesSEXP, SEXP ConstrainGeneSEXP, SEXP NotConstrainGeneSEXP, SEXP ConstrainTypeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -83,10 +83,12 @@ BEGIN_RCPP Rcpp::traits::input_parameter< NumericVector >::type delta0(delta0SEXP); Rcpp::traits::input_parameter< NumericVector >::type phi0(phi0SEXP); Rcpp::traits::input_parameter< NumericVector >::type nu0(nu0SEXP); - Rcpp::traits::input_parameter< double >::type theta0(theta0SEXP); + Rcpp::traits::input_parameter< NumericVector >::type theta0(theta0SEXP); Rcpp::traits::input_parameter< double >::type s2mu(s2muSEXP); Rcpp::traits::input_parameter< double >::type adelta(adeltaSEXP); Rcpp::traits::input_parameter< double >::type bdelta(bdeltaSEXP); + Rcpp::traits::input_parameter< double >::type s2delta(s2deltaSEXP); + Rcpp::traits::input_parameter< double >::type prior_delta(prior_deltaSEXP); Rcpp::traits::input_parameter< double >::type aphi(aphiSEXP); Rcpp::traits::input_parameter< double >::type bphi(bphiSEXP); Rcpp::traits::input_parameter< double >::type atheta(athetaSEXP); @@ -95,17 +97,14 @@ BEGIN_RCPP Rcpp::traits::input_parameter< NumericVector >::type LSmu0(LSmu0SEXP); Rcpp::traits::input_parameter< NumericVector >::type LSdelta0(LSdelta0SEXP); Rcpp::traits::input_parameter< NumericVector >::type LSnu0(LSnu0SEXP); - Rcpp::traits::input_parameter< double >::type LStheta0(LStheta0SEXP); + Rcpp::traits::input_parameter< NumericVector >::type LStheta0(LStheta0SEXP); Rcpp::traits::input_parameter< NumericVector >::type sumByCellAll(sumByCellAllSEXP); Rcpp::traits::input_parameter< NumericVector >::type sumByGeneAll(sumByGeneAllSEXP); Rcpp::traits::input_parameter< int >::type StoreAdapt(StoreAdaptSEXP); Rcpp::traits::input_parameter< int >::type EndAdapt(EndAdaptSEXP); Rcpp::traits::input_parameter< int >::type PrintProgress(PrintProgressSEXP); - Rcpp::traits::input_parameter< double >::type s2delta(s2deltaSEXP); - Rcpp::traits::input_parameter< double >::type prior_delta(prior_deltaSEXP); Rcpp::traits::input_parameter< NumericVector >::type BatchInfo(BatchInfoSEXP); Rcpp::traits::input_parameter< NumericVector >::type BatchIds(BatchIdsSEXP); - Rcpp::traits::input_parameter< NumericVector >::type BatchSizes(BatchSizesSEXP); Rcpp::traits::input_parameter< NumericVector >::type BatchOffSet(BatchOffSetSEXP); Rcpp::traits::input_parameter< double >::type Constrain(ConstrainSEXP); Rcpp::traits::input_parameter< NumericVector >::type Index(IndexSEXP); @@ -114,7 +113,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< NumericVector >::type ConstrainGene(ConstrainGeneSEXP); Rcpp::traits::input_parameter< NumericVector >::type NotConstrainGene(NotConstrainGeneSEXP); Rcpp::traits::input_parameter< int >::type ConstrainType(ConstrainTypeSEXP); - rcpp_result_gen = Rcpp::wrap(HiddenBASiCS_MCMCcppNoSpikes(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, phi0, nu0, theta0, s2mu, adelta, bdelta, aphi, bphi, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, s2delta, prior_delta, BatchInfo, BatchIds, BatchSizes, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType)); + rcpp_result_gen = Rcpp::wrap(HiddenBASiCS_MCMCcppNoSpikes(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, phi0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, aphi, bphi, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType)); return rcpp_result_gen; END_RCPP } @@ -122,7 +121,7 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_BASiCS_HiddenBASiCS_MCMCcpp", (DL_FUNC) &_BASiCS_HiddenBASiCS_MCMCcpp, 35}, {"_BASiCS_HiddenBASiCS_DenoisedRates", (DL_FUNC) &_BASiCS_HiddenBASiCS_DenoisedRates, 7}, - {"_BASiCS_HiddenBASiCS_MCMCcppNoSpikes", (DL_FUNC) &_BASiCS_HiddenBASiCS_MCMCcppNoSpikes, 40}, + {"_BASiCS_HiddenBASiCS_MCMCcppNoSpikes", (DL_FUNC) &_BASiCS_HiddenBASiCS_MCMCcppNoSpikes, 39}, {NULL, NULL, 0} }; diff --git a/vignettes/BASiCS_nospikes.Rmd b/vignettes/BASiCS_nospikes.Rmd new file mode 100644 index 00000000..c2263ef5 --- /dev/null +++ b/vignettes/BASiCS_nospikes.Rmd @@ -0,0 +1,484 @@ + + + ```{r, echo=FALSE, results="hide", message=TRUE} +require(knitr) +opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE) +``` + +```{r style, echo=FALSE, results='asis'} +BiocStyle::markdown() +``` + +```{r library, echo=FALSE} +library(BASiCS) +``` + +# BASiCS: vertical versus horizontal integration. + +Package: `r Biocpkg("BASiCS")`
+Authors: Catalina Vallejos (cnvallej@uc.cl) and Nils Eling (eling@ebi.ac.uk)
+Compilation date: `r Sys.Date()` + +*** + +# Introduction + +This vignette is to illustrate the no-spikes case of BASiCS. + +# Quick start + +### The input dataset + +The input dataset for BASiCS must be stored as an `SingleCellExperiment` +object (see `r Biocpkg("SingleCellExperiment")` package). + +The `newBASiCS_Data` function can be used to create the input data object based +on the following information: + + * `Counts`: a matrix of raw expression counts with dimensions $q$ times $n$. +Within this matrix, $q_0$ rows must correspond to biological genes and $q-q_0$ + rows must correspond to technical spike-in genes. Gene names must be stored as `rownames(Counts)`. + +* `Tech`: a vector of `TRUE`/`FALSE` elements with length $q$. +If `Tech[i] = FALSE` the gene `i` is biological; otherwise the gene is spike-in. +This vector must be specified in the same order of genes as in the +`Counts` matrix. + +* `SpikeInfo`: a `data.frame` with $q-q_0$ rows. First column must contain the +names associated to the spike-in genes (as in `rownames(Counts)`). Second column +must contain the input number of molecules for the spike-in genes +(amount per cell). + +* `BatchInfo` (optional): vector of length $n$ to indicate batch structure +(whenever cells have been processed using multiple batches). + +For example, the following code simulates a dataset with 50 genes +(40 biological and 10 spike-in) and 40 cells. + +```{r ExampleDataTest} +set.seed(1) +Counts <- matrix(rpois(50*40, 2), ncol = 40) +rownames(Counts) <- c(paste0("Gene", 1:40), paste0("Spike", 1:10)) +Tech <- c(rep(FALSE,40),rep(TRUE,10)) +set.seed(2) +SpikeInput <- rgamma(10,1,1) +SpikeInfo <- data.frame("SpikeID" = paste0("Spike", 1:10), + "SpikeInput" = SpikeInput) + +# With spikes; with batches +DataExample <- newBASiCS_Data(Counts, Tech, SpikeInfo, + BatchInfo = rep(c(1,2), each = 20)) + +# Without spikes; with batches +DataExample <- newBASiCS_Data(Counts, Tech, + BatchInfo = rep(c(1,2), each = 20)) +``` + + +*** + + + ### Running the MCMC sampler + + Parameter estimation is performed using the `BASiCS_MCMC` function. +Essential parameters for running this algorithm are: + + * `N`: total number of iterations +* `Thin`: length of the thining period (i.e. only every `Thin` + iterations will be stored in the output of the `BASiCS_MCMC`) +* `Burn`: length of burn-in period (i.e. the initial `Burn` + iterations that will be discarded from the output of the `BASiCS_MCMC`) + +If the optional parameter `PrintProgress` is set to `TRUE`, the R +console will display the progress of the MCMC algorithm. +For other optional parameters refer to `help(BASiCS_MCMC)`. + +Here, we illustrate the usage of `BASiCS_MCMC` using a built-in +synthetic datasets. + +- Vertical integration (with spikes) + +```{r quick-start-MCMC-vertical} +Data1 <- makeExampleBASiCS_Data(WithSpikes = TRUE, WithBatch = TRUE) +Chain1 <- BASiCS_MCMC(Data = Data1, N = 1000, Thin = 10, Burn = 500, + PrintProgress = FALSE) +``` + + +- Horizontal integration (no spikes case) + +```{r quick-start-MCMC-horizontal} +Data2 <- makeExampleBASiCS_Data(WithSpikes = FALSE, WithBatch = TRUE) +Chain2 <- BASiCS_MCMC(Data = Data2, N = 10000, Thin = 10, Burn = 5000, + PrintProgress = FALSE, StoreDir = tempdir()) +plot(Chain2, Param = "phi", Cell = 1) +plot(Chain2, Param = "theta", Batch = 1) +plot(Chain2, Param = "mu", Gene = 1) +plot(Chain2, Param = "delta", Gene = 1) +``` + + + +**Important remarks:** + + - Please ensure the acceptance rates displayed in the console output of +`BASiCS_MCMC` are around 0.44. If they are too far from this value, you +should increase `N` and `Burn`. + +- It is **essential** to assess the convergence of the MCMC algorithm +**before** performing downstream analyses. For guidance regarding this step, +refer to the 'Convergence assessment' section of this vignette + +Typically, setting `N=20000`, `Thin=20` and `Burn=10000` leads to +stable results. + +### Analysis for a single group of cells + +We illustrate this analysis using a small extract from the MCMC chain obtained +in [2] when analysing the single cell samples provided in [5]. This is included +within `BASiCS` as the `ChainSC` dataset. + +```{r LoadSingleData} +data(ChainSC) +``` + +The following code is used to identify **highly variable genes (HVG)** and +**lowly variable genes (LVG)** within these cells. The `VarThreshold` parameter +sets a lower threshold for the proportion of variability that is assigned to +the biological component (`Sigma`). In the examples below: + + - HVG are defined as those genes for which **at least** 60\% of their total +variability is attributed to the biological variability component. +- LVG are defined as those genes for which **at most** 40\% of their total +variability is attributed to the biological variability component. + +For each gene, these functions return posterior probabilities as a measure of +HVG/LVG evidence. A cut-off value for these posterior probabilities is set by +controlling EFDR (defaul option: `EviThreshold` defined such that EFDR = 0.10). + +```{r quick-start-HVGdetection, fig.height = 6, fig.width = 6} +par(mfrow = c(2,2)) +HVG <- BASiCS_DetectHVG(ChainSC, VarThreshold = 0.6, Plot = TRUE) +LVG <- BASiCS_DetectLVG(ChainSC, VarThreshold = 0.2, Plot = TRUE) +``` + +To access the results of these tests, please use. + +```{r quick-start-HVGdetectionTable} +head(HVG$Table) +head(LVG$Table) +``` + +```{r quick-start-HVGdetectionPlot} +SummarySC <- Summary(ChainSC) +plot(SummarySC, Param = "mu", Param2 = "delta", log = "xy") +with(HVG$Table[HVG$Table$HVG == TRUE,], points(Mu, Delta)) +with(LVG$Table[LVG$Table$LVG == TRUE,], points(Mu, Delta)) +``` + +**Note**: this criteria for threshold has changed with respect to the original +release of BASiCS (where `EviThreshold` was defined such that EFDR = EFNR). +However, the new choice is more stable (sometimes, it was not posible to + find a threshold such that EFDR = EFNR). + +### Analysis for two groups of cells + +To illustrate the use of the differential mean expression and differential +over-dispersion tests between two cell populations, we use extracts from the +MCMC chains obtained in [2] when analysing the [5] dataset (single cells vs + pool-and-split samples). These were obtained by independently running the +`BASiCS_MCMC` function for each group of cells. + +```{r quick-start-LoadBothData} +data(ChainSC) +data(ChainRNA) +``` + +```{r quick-start-TestDE, fig.width=10, fig.height=5} +Test <- BASiCS_TestDE(Chain1 = ChainSC, Chain2 = ChainRNA, + GroupLabel1 = "SC", GroupLabel2 = "PaS", + EpsilonM = log2(1.5), EpsilonD = log2(1.5), + EFDR_M = 0.10, EFDR_D = 0.10, + OffSet = TRUE, OffsetPlot = TRUE, Plot = TRUE) +``` + +In `BASiCS_TestDE`, `EpsilonM` sets the log2 fold change (log2FC) in expression +($\mu$) and `EpsilonD` the log2FC in over-dispersion ($\delta$). As a default +option: `EpsilonM = EpsilonD = log2(1.5)` (i.e. + 50\% increase). To adjust for differences in overall RNA content, an internal +offset correction is performed when `OffSet=TRUE`. +This is the recommended default. + +**Note:** due to the confounding between mean and over-dispersion that is +typically observed in scRNA-seq datasets, we only assess changes in +over-dispersion for those genes in which the mean does not change +between the groups. + +The resulting output list can be displayed using + +```{r quick-start-DE} +head(Test$TableMean) +head(Test$TableDisp) +``` + +# Additional details + +### Storing and loading MCMC chains + +To externally store the output of `BASiCS_MCMC` (recommended), additional +parameters `StoreChains`, `StoreDir` and `RunName` are required. For example: + + ```{r MCMCNotRun} +Data <- makeExampleBASiCS_Data() +Chain <- BASiCS_MCMC(Data, N = 1000, Thin = 10, Burn = 500, + PrintProgress = FALSE, StoreChains = TRUE, + StoreDir = tempdir(), RunName = "Example") +``` + +In this example, the output of `BASiCS_MCMC` will be stored as a `BASiCS_Chain` +object in the file "chain_Example.Rds", within the `tempdir()` directory. + +To load pre-computed MCMC chains, + +```{r LoadChainNotRun} +Chain <- BASiCS_LoadChain("Example", StoreDir = tempdir()) +``` + +## Convergence assessment + +To assess convergence of the chain, the convergence diagnostics provided by the +package `coda` can be used. Additionally, the chains can be visually inspected. +For example: + + ```{r Traceplots, fig.height = 3, fig.width = 6} +plot(Chain, Param = "mu", Gene = 1, log = "y") +plot(Chain, Param = "phi", Cell = 1) +``` + +In the figures above: + + - Left panels show traceplots for the chains +- Right panels show the autocorrelation function (see `?acf`) + + ## Summarising the posterior distribution + + To access the MCMC chains associated to individual parameter use the function `displayChainBASiCS`. For example, + +```{r AccessChains} +displayChainBASiCS(Chain, Param = "mu")[1:5,1:5] +``` + +As a summary of the posterior distribution, the function `Summary` calculates +posterior medians and the High Posterior Density (HPD) intervals for each model +parameter. As a default option, HPD intervals contain 0.95 probability. + +```{r Summary} +ChainSummary <- Summary(Chain) +``` + +The function `displaySummaryBASiCS` extract posterior summaries for individual +parameters. For example + +```{r SummaryExample} +head(displaySummaryBASiCS(ChainSummary, Param = "mu")) +``` + + +The following figures display posterior medians and the corresponding HPD 95% +intervals for gene-specific parameters $\mu_i$ (mean) and $\delta_i$ + (over-dispersion) + +```{r OtherHPD, fig.width = 7, fig.height = 7} +par(mfrow = c(2,2)) +plot(ChainSummary, Param = "mu", main = "All genes", log = "y") +plot(ChainSummary, Param = "mu", Genes = 1:10, main = "First 10 genes") +plot(ChainSummary, Param = "delta", main = "All genes") +plot(ChainSummary, Param = "delta", Genes = c(2,5,10,50), main = "5 customized genes") +``` + +It is also possible to obtain similar summaries for the normalising constants +$\phi_j$ and $s_j$. + +```{r Normalisation, fig.width = 7, fig.height = 3.5} +par(mfrow = c(1,2)) +plot(ChainSummary, Param = "phi") +plot(ChainSummary, Param = "s", Cells = 1:5) +``` + +Finally, it is also possible to create a scatterplot of posterior estimates for gene-specific parameters. Typically, this plot will exhibit the confounding +effect that is observed between mean and over-dispersion. + +```{r DispVsExp, fig.width = 7, fig.height = 3.5} +par(mfrow = c(1,2)) +plot(ChainSummary, Param = "mu", Param2 = "delta", log = "x", SmoothPlot = FALSE) +plot(ChainSummary, Param = "mu", Param2 = "delta", log = "x", SmoothPlot = TRUE) +``` + +The option `SmoothPlot = TRUE` is generally recommended as this plot will +contain thousands of genes when analysing real datasets. + +## Normalisation and removal of technical variation + +It is also possible to produce a matrix of normalised and denoised expression +counts for which the effect of technical variation is removed. For this purpose, +we implemented the function `BASiCS_DenoisedCounts`. For each gene $i$ and +cell $j$ this function returns + +$$ x^*_{ij} = \frac{ x_{ij} } {\hat{\phi}_j \hat{\nu}_j}, $$ + + where $x_{ij}$ is the observed expression count of gene $i$ in cell $j$, +$\hat{\phi}_j$ denotes the posterior median of $\phi_j$ and $\hat{\nu}_j$ is +the posterior median of $\nu_j$. + +```{r DenoisedCounts} +DenoisedCounts = BASiCS_DenoisedCounts(Data = Data, Chain = Chain) +DenoisedCounts[1:5, 1:5] +``` + +Alternativelly, the user can compute the normalised and denoised expression +rates underlying the expression of all genes across cells using +`BASiCS_DenoisedRates`. The output of this function is given by + +$$ \Lambda_{ij} = \hat{\mu_i} \hat{\rho}_{ij}, $$ + + where $\hat{\mu_i}$ represents the posterior median of $\mu_j$ and +$\hat{\rho}_{ij}$ is given by its posterior mean (Monte Carlo estimate based + on the MCMC sample of all model parameters). + +```{r DenoisedProp} +DenoisedRates <- BASiCS_DenoisedRates(Data = Data, Chain = Chain, + Propensities = FALSE) +DenoisedRates[1:5, 1:5] +``` + +Alternative, denoised expression propensities $\hat{\rho}_{ij}$ can +also be extracted + +```{r DenoisedRates} +DenoisedProp = BASiCS_DenoisedRates(Data = Data, Chain = Chain, + Propensities = TRUE) +DenoisedProp[1:5, 1:5] +``` + +# Methodology + +We first describe the model introduced in [1], which relates to a single group +of cells. + +Throughout, we consider the expression counts of $q$ genes, where $q_0$ are +expressed in the population of cells under study (biological genes) and the +remaining $q-q_0$ are extrinsic spike-in (technical) genes. Let $X_{ij}$ be a +random variable representing the expression count of a gene $i$ in cell $j$ ($i=1,\ldots,q$; $j=1,\ldots,n$). BASiCS is based on the following hierarchical +model: $$X_{ij} \big| \mu_i, \phi_j, \nu_j, \rho_{ij} \sim \left\{ \begin{array}{ll} \mbox{Poisson}(\phi_j \nu_j \mu_i \rho_{ij}), \mbox{ for }i=1,\ldots,q_0, j=1,\ldots,n \\ \mbox{Poisson}(\nu_j \mu_i), \mbox{ for }i=q_0+1,\ldots,q, j=1,\ldots,n, \end{array} \right.$$ + + where $\nu_j$ and $\rho_{ij}$ are mutually independent random effects such that $\nu_j|s_j,\theta \sim \mbox{Gamma}(1/\theta,1/ (s_j \theta))$ and $\rho_{ij} | \delta_i \sim \mbox{Gamma} (1/\delta_i,1/\delta_i)$[^footnoteGamma]. + +A graphical representation of this model is displayed below. This is based on +the expression counts of 2 genes ($i$: biological and $i'$: technical) at 2 + cells ($j$ and $j'$). Squared and circular nodes denote known observed +quantities (observed expression counts and added number of spike-in mRNA + molecules) and unknown elements, respectively. Whereas black circular nodes +represent the random effects that play an intermediate role in our hierarchical +structure, red circular nodes relate to unknown model parameters in the top +layer of hierarchy in our model. Blue, green and grey areas highlight elements +that are shared within a biological gene, technical gene or cell, respectively. + + + + \centerline{\includegraphics[height=4in]{./BASiCS_DAG.jpg}} + +In this setting, the key parameters to be used for downstream analyses are: + + * $\mu_i$: mean expression parameter for gene $i$ in the group of cells under +study. In case of the spike-in technical genes, $\mu_i$ is assumed to be known +and equal to the input number of molecules of the corresponding spike-in gene). + +* $\delta_i$: over-dispersion parameter for gene $i$, a measure for the excess +of variation that is observed after accounting for technical noise (with respect + to Poisson sampling) + +Additional (nuisance) parameters are interpreted as follows: + + * $\phi_j$: cell-specific normalizing parameters related to differences in mRNA +content (identifiability constrain: $\sum_{j=1}^n \phi_j = n$). + +* $s_j$: cell-specific normalizing parameters related to technical cell-specific +biases (for more details regarding this interpretation see [6]). + +* $\theta$: technical over-dispersion parameter, controlling the strenght of +cell-to-cell technical variability. + +When cells from the same group are processed in multiple sequencing batches, +this model is extended so that the technical over-dispersion parameter $\theta$ + is batch-specific. This extension allows a different strenght of technical noise +to be inferred for each batch of cells. + +[^footnoteGamma]: We parametrize the Gamma distribution such that if $X \sim \mbox{Gamma}(a,b)$, then $\mbox{E}(X)=a/b$ and $\mbox{var}(X)=a/b^2$. + +In [2], this model has been extended to cases where multiple groups of cells are +under study. This is achieved by assuming gene-specific parameters to be also group-specific. Based on this setup, evidence of differential expression is +quantified through log2-fold changes of gene-specific parameters +(mean and over-dispersion) between the groups. + +More details regarding the model setup, prior specification and implementation +are described in [1] and [2]. + +*** + + # Acknowledgements + + We thank several members of the Marioni laboratory (EMBL-EBI; CRUK-CI) for +support and discussions throughout the development of this R library. +In particular, we are grateful to Aaron Lun (@LTLA, CRUK-CI) for advise and +support during the preparation the Bioconductor submission. + +We also acknowledge feedback and contributions from (Github aliases provided + within parenthesis): Ben Dulken (@bdulken), Chang Xu (@xuchang116), +Danilo Horta (@Horta), Dmitriy Zhukov (@dvzhukov), +Jens Preußner (@jenzopr), Joanna Dreux (@Joannacodes), Kevin Rue-Albrecht (@kevinrue), Luke Zappia (@lazappi), Simon Anders (@s-andrews), Yongchao Ge and +Yuan Cao (@yuancao90), among others. + +This work has been funded by the MRC Biostatistics Unit (MRC grant no. + MRC_MC_UP_0801/1; Catalina Vallejos and Sylvia Richardson), EMBL European +Bioinformatics Institute (core European Molecular Biology Laboratory funding; + Catalina Vallejos, Nils Eling and John Marioni), CRUK Cambridge Institute +(core CRUK funding; John Marioni) and The Alan Turing Institute (EPSRC grant no. EP/N510129/1; Catalina Vallejos). + +*** + + # References + + [1] Vallejos CA, Marioni JCM and Richardson S (2015) BASiCS: Bayesian analysis +of single-cell sequencing data. *PLoS Computational Biology* 11 (6), e1004333. + +[2] Vallejos CA, Richardson S and Marioni JCM (2016) Beyond comparisons of +means: understanding changes in gene expression at the single-cell level. +*Genome Biology* 17 (1), 1-14. + +[3] Martinez-Jimenez CP, Eling N, Chen H, Vallejos CA, Kolodziejczyk AA, +Connor F, Stojic L, Rayner TF, Stubbington MJT, Teichmann SA, de la Roche M, +Marioni JC and Odom DT (2017) Aging increases cell-to-cell transcriptional +variability upon immune stimulation. *Science* 355 (6332), 1433-1436. + +[4] Newton MA, Noueiry A, Sarkar D, Ahlquist P (2004) Detecting differential +gene expression with a semiparametric hierarchical mixture method. +*Biostatistics* 5 (2), 155-76. + +[5] Grün D, Kester L, van Oudenaarden A (2014) Validation of noise models +for single-cell transcriptomics. *Nature Methods* 11 (6), 637-40. + +[6] Vallejos CA, Risso D, Scialdone A, Dudoit S and Marioni JCM (2017) +Normalizing single-cell RNA-sequencing data: challenges and opportunities. +*Nature Methods* 14, 565-571. + +[7] Roberts GO and Rosenthal JS (2009). Examples of adaptive MCMC. *Journal of Computational and Graphical Statistics* 18: 349-367. + +# Session information + +```{r SessionInfo} +sessionInfo() +``` From a4490bc5955b9db60a93d605e7ef1796eaf7f010 Mon Sep 17 00:00:00 2001 From: Catalina Vallejos Date: Tue, 26 Sep 2017 17:58:35 +0100 Subject: [PATCH 2/7] updated to-do list only --- NEWS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS b/NEWS index bff14a7b..888c3ecb 100644 --- a/NEWS +++ b/NEWS @@ -597,6 +597,9 @@ No-spikes implementation - Temporary vignette file for no-spikes case has been created - `HiddenBASiCS_MCMCcppNoSpikes` updated to recycle vertical integration update functions + +TO-DO +- Implement Giacomo's suggestion? - DEVELOPMENT NOTES (potential future changes): * Delete BASiCS_VarianceDecomp? From 8db0b38a3793c844933ee6ae51fe9e3a1e0ddce8 Mon Sep 17 00:00:00 2001 From: catavallejos Date: Wed, 27 Sep 2017 10:55:14 +0100 Subject: [PATCH 3/7] No-spikes implementation - `makeExampleBASiCS_Data` + `newBASiCS_Data` adapted to no-spikes case - Temporary vignette file for no-spikes case has been created - `HiddenBASiCS_MCMCcppNoSpikes` updated to recycle vertical integration update functions - Fixed bug to allow use of stochastic reference --- NEWS | 9 + R/BASiCS_MCMC.R | 36 +--- R/RcppExports.R | 4 +- src/BASiCS_CPPcode.cpp | 6 +- src/RcppExports.cpp | 9 +- vignettes/BASiCS_nospikes.Rmd | 379 ++-------------------------------- 6 files changed, 50 insertions(+), 393 deletions(-) diff --git a/NEWS b/NEWS index 888c3ecb..2898446d 100644 --- a/NEWS +++ b/NEWS @@ -597,9 +597,18 @@ No-spikes implementation - Temporary vignette file for no-spikes case has been created - `HiddenBASiCS_MCMCcppNoSpikes` updated to recycle vertical integration update functions +- Fixed bug to allow use of stochastic reference TO-DO - Implement Giacomo's suggestion? + +Summary of constrain types +- ConstrainType: full or subset of genes + * For the second case, different filter thresholds have been proposed +- StochasticRef or no +- Idea: + * Try StochasticRef with no filter + * Try StochasticRef with filter > 0 total counts - DEVELOPMENT NOTES (potential future changes): * Delete BASiCS_VarianceDecomp? diff --git a/R/BASiCS_MCMC.R b/R/BASiCS_MCMC.R index cc892d1b..e822be24 100644 --- a/R/BASiCS_MCMC.R +++ b/R/BASiCS_MCMC.R @@ -343,12 +343,8 @@ BASiCS_MCMC <- function(Data, N, Thin, Burn, ...) { # 1: Full constrain; 2: Non-zero genes only ConstrainType <- ifelse("ConstrainType" %in% names(args), args$ConstrainType, 2) - ConstrainLimit <- ifelse("ConstrainLimit" %in% names(args), - args$ConstrainLimit, 1) - ConstrainAlpha <- ifelse("ConstrainAlpha" %in% names(args), - args$ConstrainAlpha, 0.05) - ConstrainProb <- ifelse("ConstrainProb" %in% names(args), - args$ConstrainProb, 0.95) + StochasticRef <- ifelse("StochasticRef" %in% names(args), + args$StochasticRef, TRUE) BatchDesign <- model.matrix(~as.factor(metadata(Data)$BatchInfo) - 1) BatchSizes <- table(metadata(Data)$BatchInfo) @@ -368,34 +364,19 @@ BASiCS_MCMC <- function(Data, N, Thin, Burn, ...) { # '0' as its first element Constrain for gene-specific expression rates if (ConstrainType == 1) { - # Full constrain Note we use 'ConstrainLimit + 1' as 1 - # pseudo-count was added - # when computing 'mu0' (to avoid numerical issues) + # Full constrain ConstrainGene <- (1:q.bio) - 1 NotConstrainGene <- 0 Constrain <- mean(log(mu0[ConstrainGene + 1])) } if (ConstrainType == 2) { - # Trimmed constrain based on mean Note we use 'ConstrainLimit + 1' - # as 1 pseudo-count - # was added when computing 'mu0' (to avoid numerical issues) - ConstrainGene <- which(mu0 >= ConstrainLimit + 1) - 1 - NotConstrainGene <- which(mu0 < ConstrainLimit + 1) - 1 - Constrain <- mean(log(mu0[ConstrainGene + 1])) - } - if (ConstrainType == 3) - { - # Trimmed constrain based on detection - Detection <- rowMeans(assay(Data) > 0) - ConstrainGene <- which(Detection >= ConstrainLimit) - 1 - NotConstrainGene <- which(Detection < ConstrainLimit) - 1 + # Trimmed constrain based on total counts > 0 + ConstrainGene <- which(sum.bycell.bio > 0) - 1 + NotConstrainGene <- which(sum.bycell.bio == 0) - 1 Constrain <- mean(log(mu0[ConstrainGene + 1])) } - - StochasticRef <- ifelse("StochasticRef" %in% names(args), - args$StochasticRef, FALSE) - + if (StochasticRef == TRUE) { aux.ref <- cbind(ConstrainGene, @@ -444,7 +425,8 @@ BASiCS_MCMC <- function(Data, N, Thin, Burn, ...) { Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, - ConstrainType)) + ConstrainType, + as.numeric(StochasticRef))) } Chain$mu <- Chain$mu[, 1:q.bio] diff --git a/R/RcppExports.R b/R/RcppExports.R index 91caeb30..2d4a6c51 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -9,7 +9,7 @@ HiddenBASiCS_DenoisedRates <- function(CountsBio, Mu, TransInvDelta, PhiNu, N, q .Call('_BASiCS_HiddenBASiCS_DenoisedRates', PACKAGE = 'BASiCS', CountsBio, Mu, TransInvDelta, PhiNu, N, q0, n) } -HiddenBASiCS_MCMCcppNoSpikes <- function(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, phi0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, aphi, bphi, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType) { - .Call('_BASiCS_HiddenBASiCS_MCMCcppNoSpikes', PACKAGE = 'BASiCS', N, Thin, Burn, Counts, BatchDesign, mu0, delta0, phi0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, aphi, bphi, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType) +HiddenBASiCS_MCMCcppNoSpikes <- function(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, phi0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, aphi, bphi, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType, StochasticRef) { + .Call('_BASiCS_HiddenBASiCS_MCMCcppNoSpikes', PACKAGE = 'BASiCS', N, Thin, Burn, Counts, BatchDesign, mu0, delta0, phi0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, aphi, bphi, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType, StochasticRef) } diff --git a/src/BASiCS_CPPcode.cpp b/src/BASiCS_CPPcode.cpp index bb347f49..9212f7f5 100644 --- a/src/BASiCS_CPPcode.cpp +++ b/src/BASiCS_CPPcode.cpp @@ -1375,7 +1375,8 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( NumericVector RefGenes, NumericVector ConstrainGene, NumericVector NotConstrainGene, - int ConstrainType) + int ConstrainType, + int StochasticRef) { using arma::ones; @@ -1499,8 +1500,7 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( // UPDATE OF MU: 1st COLUMN IS THE UPDATE, 2nd COLUMN IS THE ACCEPTANCE INDICATOR - // Additional steps required for ConstrainType = 3 (stochastic reference) - if((ConstrainType == 3) | (ConstrainType == 5)) + if(StochasticRef == 1) { RefAux = as_scalar(arma::randi( 1, arma::distr_param(0, RefGenes_arma.size()-1) )); RefGene = RefGenes(RefAux); diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index ecb36545..b5cef5a5 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -69,8 +69,8 @@ BEGIN_RCPP END_RCPP } // HiddenBASiCS_MCMCcppNoSpikes -Rcpp::List HiddenBASiCS_MCMCcppNoSpikes(int N, int Thin, int Burn, NumericMatrix Counts, NumericMatrix BatchDesign, NumericVector mu0, NumericVector delta0, NumericVector phi0, NumericVector nu0, NumericVector theta0, double s2mu, double adelta, double bdelta, double s2delta, double prior_delta, double aphi, double bphi, double atheta, double btheta, double ar, NumericVector LSmu0, NumericVector LSdelta0, NumericVector LSnu0, NumericVector LStheta0, NumericVector sumByCellAll, NumericVector sumByGeneAll, int StoreAdapt, int EndAdapt, int PrintProgress, NumericVector BatchInfo, NumericVector BatchIds, NumericVector BatchOffSet, double Constrain, NumericVector Index, int RefGene, NumericVector RefGenes, NumericVector ConstrainGene, NumericVector NotConstrainGene, int ConstrainType); -RcppExport SEXP _BASiCS_HiddenBASiCS_MCMCcppNoSpikes(SEXP NSEXP, SEXP ThinSEXP, SEXP BurnSEXP, SEXP CountsSEXP, SEXP BatchDesignSEXP, SEXP mu0SEXP, SEXP delta0SEXP, SEXP phi0SEXP, SEXP nu0SEXP, SEXP theta0SEXP, SEXP s2muSEXP, SEXP adeltaSEXP, SEXP bdeltaSEXP, SEXP s2deltaSEXP, SEXP prior_deltaSEXP, SEXP aphiSEXP, SEXP bphiSEXP, SEXP athetaSEXP, SEXP bthetaSEXP, SEXP arSEXP, SEXP LSmu0SEXP, SEXP LSdelta0SEXP, SEXP LSnu0SEXP, SEXP LStheta0SEXP, SEXP sumByCellAllSEXP, SEXP sumByGeneAllSEXP, SEXP StoreAdaptSEXP, SEXP EndAdaptSEXP, SEXP PrintProgressSEXP, SEXP BatchInfoSEXP, SEXP BatchIdsSEXP, SEXP BatchOffSetSEXP, SEXP ConstrainSEXP, SEXP IndexSEXP, SEXP RefGeneSEXP, SEXP RefGenesSEXP, SEXP ConstrainGeneSEXP, SEXP NotConstrainGeneSEXP, SEXP ConstrainTypeSEXP) { +Rcpp::List HiddenBASiCS_MCMCcppNoSpikes(int N, int Thin, int Burn, NumericMatrix Counts, NumericMatrix BatchDesign, NumericVector mu0, NumericVector delta0, NumericVector phi0, NumericVector nu0, NumericVector theta0, double s2mu, double adelta, double bdelta, double s2delta, double prior_delta, double aphi, double bphi, double atheta, double btheta, double ar, NumericVector LSmu0, NumericVector LSdelta0, NumericVector LSnu0, NumericVector LStheta0, NumericVector sumByCellAll, NumericVector sumByGeneAll, int StoreAdapt, int EndAdapt, int PrintProgress, NumericVector BatchInfo, NumericVector BatchIds, NumericVector BatchOffSet, double Constrain, NumericVector Index, int RefGene, NumericVector RefGenes, NumericVector ConstrainGene, NumericVector NotConstrainGene, int ConstrainType, int StochasticRef); +RcppExport SEXP _BASiCS_HiddenBASiCS_MCMCcppNoSpikes(SEXP NSEXP, SEXP ThinSEXP, SEXP BurnSEXP, SEXP CountsSEXP, SEXP BatchDesignSEXP, SEXP mu0SEXP, SEXP delta0SEXP, SEXP phi0SEXP, SEXP nu0SEXP, SEXP theta0SEXP, SEXP s2muSEXP, SEXP adeltaSEXP, SEXP bdeltaSEXP, SEXP s2deltaSEXP, SEXP prior_deltaSEXP, SEXP aphiSEXP, SEXP bphiSEXP, SEXP athetaSEXP, SEXP bthetaSEXP, SEXP arSEXP, SEXP LSmu0SEXP, SEXP LSdelta0SEXP, SEXP LSnu0SEXP, SEXP LStheta0SEXP, SEXP sumByCellAllSEXP, SEXP sumByGeneAllSEXP, SEXP StoreAdaptSEXP, SEXP EndAdaptSEXP, SEXP PrintProgressSEXP, SEXP BatchInfoSEXP, SEXP BatchIdsSEXP, SEXP BatchOffSetSEXP, SEXP ConstrainSEXP, SEXP IndexSEXP, SEXP RefGeneSEXP, SEXP RefGenesSEXP, SEXP ConstrainGeneSEXP, SEXP NotConstrainGeneSEXP, SEXP ConstrainTypeSEXP, SEXP StochasticRefSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -113,7 +113,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< NumericVector >::type ConstrainGene(ConstrainGeneSEXP); Rcpp::traits::input_parameter< NumericVector >::type NotConstrainGene(NotConstrainGeneSEXP); Rcpp::traits::input_parameter< int >::type ConstrainType(ConstrainTypeSEXP); - rcpp_result_gen = Rcpp::wrap(HiddenBASiCS_MCMCcppNoSpikes(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, phi0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, aphi, bphi, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType)); + Rcpp::traits::input_parameter< int >::type StochasticRef(StochasticRefSEXP); + rcpp_result_gen = Rcpp::wrap(HiddenBASiCS_MCMCcppNoSpikes(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, phi0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, aphi, bphi, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType, StochasticRef)); return rcpp_result_gen; END_RCPP } @@ -121,7 +122,7 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_BASiCS_HiddenBASiCS_MCMCcpp", (DL_FUNC) &_BASiCS_HiddenBASiCS_MCMCcpp, 35}, {"_BASiCS_HiddenBASiCS_DenoisedRates", (DL_FUNC) &_BASiCS_HiddenBASiCS_DenoisedRates, 7}, - {"_BASiCS_HiddenBASiCS_MCMCcppNoSpikes", (DL_FUNC) &_BASiCS_HiddenBASiCS_MCMCcppNoSpikes, 39}, + {"_BASiCS_HiddenBASiCS_MCMCcppNoSpikes", (DL_FUNC) &_BASiCS_HiddenBASiCS_MCMCcppNoSpikes, 40}, {NULL, NULL, 0} }; diff --git a/vignettes/BASiCS_nospikes.Rmd b/vignettes/BASiCS_nospikes.Rmd index c2263ef5..47b771c6 100644 --- a/vignettes/BASiCS_nospikes.Rmd +++ b/vignettes/BASiCS_nospikes.Rmd @@ -39,9 +39,9 @@ object (see `r Biocpkg("SingleCellExperiment")` package). The `newBASiCS_Data` function can be used to create the input data object based on the following information: - * `Counts`: a matrix of raw expression counts with dimensions $q$ times $n$. +* `Counts`: a matrix of raw expression counts with dimensions $q$ times $n$. Within this matrix, $q_0$ rows must correspond to biological genes and $q-q_0$ - rows must correspond to technical spike-in genes. Gene names must be stored as `rownames(Counts)`. +rows must correspond to technical spike-in genes. Gene names must be stored as `rownames(Counts)`. * `Tech`: a vector of `TRUE`/`FALSE` elements with length $q$. If `Tech[i] = FALSE` the gene `i` is biological; otherwise the gene is spike-in. @@ -82,16 +82,16 @@ DataExample <- newBASiCS_Data(Counts, Tech, *** - ### Running the MCMC sampler +### Running the MCMC sampler Parameter estimation is performed using the `BASiCS_MCMC` function. Essential parameters for running this algorithm are: - * `N`: total number of iterations -* `Thin`: length of the thining period (i.e. only every `Thin` - iterations will be stored in the output of the `BASiCS_MCMC`) -* `Burn`: length of burn-in period (i.e. the initial `Burn` - iterations that will be discarded from the output of the `BASiCS_MCMC`) +* `N`: total number of iterations +* `Thin`: length of the thining period (i.e. only every `Thin` iterations will +be stored in the output of the `BASiCS_MCMC`) +* `Burn`: length of burn-in period (i.e. the initial `Burn` iterations that will +be discarded from the output of the `BASiCS_MCMC`) If the optional parameter `PrintProgress` is set to `TRUE`, the R console will display the progress of the MCMC algorithm. @@ -106,6 +106,7 @@ synthetic datasets. Data1 <- makeExampleBASiCS_Data(WithSpikes = TRUE, WithBatch = TRUE) Chain1 <- BASiCS_MCMC(Data = Data1, N = 1000, Thin = 10, Burn = 500, PrintProgress = FALSE) +Summary1 <- Summary(Chain1) ``` @@ -114,368 +115,32 @@ Chain1 <- BASiCS_MCMC(Data = Data1, N = 1000, Thin = 10, Burn = 500, ```{r quick-start-MCMC-horizontal} Data2 <- makeExampleBASiCS_Data(WithSpikes = FALSE, WithBatch = TRUE) Chain2 <- BASiCS_MCMC(Data = Data2, N = 10000, Thin = 10, Burn = 5000, - PrintProgress = FALSE, StoreDir = tempdir()) + PrintProgress = FALSE, StoreDir = tempdir(), + StochasticRef = FALSE) plot(Chain2, Param = "phi", Cell = 1) plot(Chain2, Param = "theta", Batch = 1) plot(Chain2, Param = "mu", Gene = 1) plot(Chain2, Param = "delta", Gene = 1) -``` - - - -**Important remarks:** - - - Please ensure the acceptance rates displayed in the console output of -`BASiCS_MCMC` are around 0.44. If they are too far from this value, you -should increase `N` and `Burn`. - -- It is **essential** to assess the convergence of the MCMC algorithm -**before** performing downstream analyses. For guidance regarding this step, -refer to the 'Convergence assessment' section of this vignette - -Typically, setting `N=20000`, `Thin=20` and `Burn=10000` leads to -stable results. - -### Analysis for a single group of cells - -We illustrate this analysis using a small extract from the MCMC chain obtained -in [2] when analysing the single cell samples provided in [5]. This is included -within `BASiCS` as the `ChainSC` dataset. - -```{r LoadSingleData} -data(ChainSC) -``` - -The following code is used to identify **highly variable genes (HVG)** and -**lowly variable genes (LVG)** within these cells. The `VarThreshold` parameter -sets a lower threshold for the proportion of variability that is assigned to -the biological component (`Sigma`). In the examples below: - - - HVG are defined as those genes for which **at least** 60\% of their total -variability is attributed to the biological variability component. -- LVG are defined as those genes for which **at most** 40\% of their total -variability is attributed to the biological variability component. - -For each gene, these functions return posterior probabilities as a measure of -HVG/LVG evidence. A cut-off value for these posterior probabilities is set by -controlling EFDR (defaul option: `EviThreshold` defined such that EFDR = 0.10). - -```{r quick-start-HVGdetection, fig.height = 6, fig.width = 6} -par(mfrow = c(2,2)) -HVG <- BASiCS_DetectHVG(ChainSC, VarThreshold = 0.6, Plot = TRUE) -LVG <- BASiCS_DetectLVG(ChainSC, VarThreshold = 0.2, Plot = TRUE) -``` - -To access the results of these tests, please use. - -```{r quick-start-HVGdetectionTable} -head(HVG$Table) -head(LVG$Table) -``` - -```{r quick-start-HVGdetectionPlot} -SummarySC <- Summary(ChainSC) -plot(SummarySC, Param = "mu", Param2 = "delta", log = "xy") -with(HVG$Table[HVG$Table$HVG == TRUE,], points(Mu, Delta)) -with(LVG$Table[LVG$Table$LVG == TRUE,], points(Mu, Delta)) -``` - -**Note**: this criteria for threshold has changed with respect to the original -release of BASiCS (where `EviThreshold` was defined such that EFDR = EFNR). -However, the new choice is more stable (sometimes, it was not posible to - find a threshold such that EFDR = EFNR). - -### Analysis for two groups of cells - -To illustrate the use of the differential mean expression and differential -over-dispersion tests between two cell populations, we use extracts from the -MCMC chains obtained in [2] when analysing the [5] dataset (single cells vs - pool-and-split samples). These were obtained by independently running the -`BASiCS_MCMC` function for each group of cells. - -```{r quick-start-LoadBothData} -data(ChainSC) -data(ChainRNA) -``` - -```{r quick-start-TestDE, fig.width=10, fig.height=5} -Test <- BASiCS_TestDE(Chain1 = ChainSC, Chain2 = ChainRNA, - GroupLabel1 = "SC", GroupLabel2 = "PaS", - EpsilonM = log2(1.5), EpsilonD = log2(1.5), - EFDR_M = 0.10, EFDR_D = 0.10, - OffSet = TRUE, OffsetPlot = TRUE, Plot = TRUE) -``` - -In `BASiCS_TestDE`, `EpsilonM` sets the log2 fold change (log2FC) in expression -($\mu$) and `EpsilonD` the log2FC in over-dispersion ($\delta$). As a default -option: `EpsilonM = EpsilonD = log2(1.5)` (i.e. - 50\% increase). To adjust for differences in overall RNA content, an internal -offset correction is performed when `OffSet=TRUE`. -This is the recommended default. - -**Note:** due to the confounding between mean and over-dispersion that is -typically observed in scRNA-seq datasets, we only assess changes in -over-dispersion for those genes in which the mean does not change -between the groups. - -The resulting output list can be displayed using - -```{r quick-start-DE} -head(Test$TableMean) -head(Test$TableDisp) -``` - -# Additional details - -### Storing and loading MCMC chains - -To externally store the output of `BASiCS_MCMC` (recommended), additional -parameters `StoreChains`, `StoreDir` and `RunName` are required. For example: - - ```{r MCMCNotRun} -Data <- makeExampleBASiCS_Data() -Chain <- BASiCS_MCMC(Data, N = 1000, Thin = 10, Burn = 500, - PrintProgress = FALSE, StoreChains = TRUE, - StoreDir = tempdir(), RunName = "Example") -``` - -In this example, the output of `BASiCS_MCMC` will be stored as a `BASiCS_Chain` -object in the file "chain_Example.Rds", within the `tempdir()` directory. - -To load pre-computed MCMC chains, +Summary2 <- Summary(Chain2) -```{r LoadChainNotRun} -Chain <- BASiCS_LoadChain("Example", StoreDir = tempdir()) +Chain3 <- BASiCS_MCMC(Data = Data2, N = 10000, Thin = 10, Burn = 5000, + PrintProgress = FALSE, StoreDir = tempdir(), + StochasticRef = TRUE) +plot(Chain3, Param = "phi", Cell = 1) +plot(Chain3, Param = "theta", Batch = 1) +plot(Chain3, Param = "mu", Gene = 1) +plot(Chain3, Param = "delta", Gene = 1) +Summary3 <- Summary(Chain3) ``` -## Convergence assessment - -To assess convergence of the chain, the convergence diagnostics provided by the -package `coda` can be used. Additionally, the chains can be visually inspected. -For example: - - ```{r Traceplots, fig.height = 3, fig.width = 6} -plot(Chain, Param = "mu", Gene = 1, log = "y") -plot(Chain, Param = "phi", Cell = 1) -``` - -In the figures above: - - - Left panels show traceplots for the chains -- Right panels show the autocorrelation function (see `?acf`) - - ## Summarising the posterior distribution - - To access the MCMC chains associated to individual parameter use the function `displayChainBASiCS`. For example, - -```{r AccessChains} -displayChainBASiCS(Chain, Param = "mu")[1:5,1:5] -``` - -As a summary of the posterior distribution, the function `Summary` calculates -posterior medians and the High Posterior Density (HPD) intervals for each model -parameter. As a default option, HPD intervals contain 0.95 probability. - -```{r Summary} -ChainSummary <- Summary(Chain) -``` - -The function `displaySummaryBASiCS` extract posterior summaries for individual -parameters. For example - -```{r SummaryExample} -head(displaySummaryBASiCS(ChainSummary, Param = "mu")) -``` - - -The following figures display posterior medians and the corresponding HPD 95% -intervals for gene-specific parameters $\mu_i$ (mean) and $\delta_i$ - (over-dispersion) - -```{r OtherHPD, fig.width = 7, fig.height = 7} -par(mfrow = c(2,2)) -plot(ChainSummary, Param = "mu", main = "All genes", log = "y") -plot(ChainSummary, Param = "mu", Genes = 1:10, main = "First 10 genes") -plot(ChainSummary, Param = "delta", main = "All genes") -plot(ChainSummary, Param = "delta", Genes = c(2,5,10,50), main = "5 customized genes") -``` - -It is also possible to obtain similar summaries for the normalising constants -$\phi_j$ and $s_j$. - -```{r Normalisation, fig.width = 7, fig.height = 3.5} -par(mfrow = c(1,2)) -plot(ChainSummary, Param = "phi") -plot(ChainSummary, Param = "s", Cells = 1:5) -``` - -Finally, it is also possible to create a scatterplot of posterior estimates for gene-specific parameters. Typically, this plot will exhibit the confounding -effect that is observed between mean and over-dispersion. - -```{r DispVsExp, fig.width = 7, fig.height = 3.5} -par(mfrow = c(1,2)) -plot(ChainSummary, Param = "mu", Param2 = "delta", log = "x", SmoothPlot = FALSE) -plot(ChainSummary, Param = "mu", Param2 = "delta", log = "x", SmoothPlot = TRUE) -``` - -The option `SmoothPlot = TRUE` is generally recommended as this plot will -contain thousands of genes when analysing real datasets. - -## Normalisation and removal of technical variation - -It is also possible to produce a matrix of normalised and denoised expression -counts for which the effect of technical variation is removed. For this purpose, -we implemented the function `BASiCS_DenoisedCounts`. For each gene $i$ and -cell $j$ this function returns - -$$ x^*_{ij} = \frac{ x_{ij} } {\hat{\phi}_j \hat{\nu}_j}, $$ - - where $x_{ij}$ is the observed expression count of gene $i$ in cell $j$, -$\hat{\phi}_j$ denotes the posterior median of $\phi_j$ and $\hat{\nu}_j$ is -the posterior median of $\nu_j$. - -```{r DenoisedCounts} -DenoisedCounts = BASiCS_DenoisedCounts(Data = Data, Chain = Chain) -DenoisedCounts[1:5, 1:5] -``` - -Alternativelly, the user can compute the normalised and denoised expression -rates underlying the expression of all genes across cells using -`BASiCS_DenoisedRates`. The output of this function is given by - -$$ \Lambda_{ij} = \hat{\mu_i} \hat{\rho}_{ij}, $$ - - where $\hat{\mu_i}$ represents the posterior median of $\mu_j$ and -$\hat{\rho}_{ij}$ is given by its posterior mean (Monte Carlo estimate based - on the MCMC sample of all model parameters). - -```{r DenoisedProp} -DenoisedRates <- BASiCS_DenoisedRates(Data = Data, Chain = Chain, - Propensities = FALSE) -DenoisedRates[1:5, 1:5] -``` - -Alternative, denoised expression propensities $\hat{\rho}_{ij}$ can -also be extracted - -```{r DenoisedRates} -DenoisedProp = BASiCS_DenoisedRates(Data = Data, Chain = Chain, - Propensities = TRUE) -DenoisedProp[1:5, 1:5] -``` - -# Methodology - -We first describe the model introduced in [1], which relates to a single group -of cells. - -Throughout, we consider the expression counts of $q$ genes, where $q_0$ are -expressed in the population of cells under study (biological genes) and the -remaining $q-q_0$ are extrinsic spike-in (technical) genes. Let $X_{ij}$ be a -random variable representing the expression count of a gene $i$ in cell $j$ ($i=1,\ldots,q$; $j=1,\ldots,n$). BASiCS is based on the following hierarchical -model: $$X_{ij} \big| \mu_i, \phi_j, \nu_j, \rho_{ij} \sim \left\{ \begin{array}{ll} \mbox{Poisson}(\phi_j \nu_j \mu_i \rho_{ij}), \mbox{ for }i=1,\ldots,q_0, j=1,\ldots,n \\ \mbox{Poisson}(\nu_j \mu_i), \mbox{ for }i=q_0+1,\ldots,q, j=1,\ldots,n, \end{array} \right.$$ - - where $\nu_j$ and $\rho_{ij}$ are mutually independent random effects such that $\nu_j|s_j,\theta \sim \mbox{Gamma}(1/\theta,1/ (s_j \theta))$ and $\rho_{ij} | \delta_i \sim \mbox{Gamma} (1/\delta_i,1/\delta_i)$[^footnoteGamma]. - -A graphical representation of this model is displayed below. This is based on -the expression counts of 2 genes ($i$: biological and $i'$: technical) at 2 - cells ($j$ and $j'$). Squared and circular nodes denote known observed -quantities (observed expression counts and added number of spike-in mRNA - molecules) and unknown elements, respectively. Whereas black circular nodes -represent the random effects that play an intermediate role in our hierarchical -structure, red circular nodes relate to unknown model parameters in the top -layer of hierarchy in our model. Blue, green and grey areas highlight elements -that are shared within a biological gene, technical gene or cell, respectively. - - - - \centerline{\includegraphics[height=4in]{./BASiCS_DAG.jpg}} - -In this setting, the key parameters to be used for downstream analyses are: - - * $\mu_i$: mean expression parameter for gene $i$ in the group of cells under -study. In case of the spike-in technical genes, $\mu_i$ is assumed to be known -and equal to the input number of molecules of the corresponding spike-in gene). - -* $\delta_i$: over-dispersion parameter for gene $i$, a measure for the excess -of variation that is observed after accounting for technical noise (with respect - to Poisson sampling) - -Additional (nuisance) parameters are interpreted as follows: - - * $\phi_j$: cell-specific normalizing parameters related to differences in mRNA -content (identifiability constrain: $\sum_{j=1}^n \phi_j = n$). - -* $s_j$: cell-specific normalizing parameters related to technical cell-specific -biases (for more details regarding this interpretation see [6]). - -* $\theta$: technical over-dispersion parameter, controlling the strenght of -cell-to-cell technical variability. - -When cells from the same group are processed in multiple sequencing batches, -this model is extended so that the technical over-dispersion parameter $\theta$ - is batch-specific. This extension allows a different strenght of technical noise -to be inferred for each batch of cells. - -[^footnoteGamma]: We parametrize the Gamma distribution such that if $X \sim \mbox{Gamma}(a,b)$, then $\mbox{E}(X)=a/b$ and $\mbox{var}(X)=a/b^2$. - -In [2], this model has been extended to cases where multiple groups of cells are -under study. This is achieved by assuming gene-specific parameters to be also group-specific. Based on this setup, evidence of differential expression is -quantified through log2-fold changes of gene-specific parameters -(mean and over-dispersion) between the groups. - -More details regarding the model setup, prior specification and implementation -are described in [1] and [2]. - -*** - - # Acknowledgements - - We thank several members of the Marioni laboratory (EMBL-EBI; CRUK-CI) for -support and discussions throughout the development of this R library. -In particular, we are grateful to Aaron Lun (@LTLA, CRUK-CI) for advise and -support during the preparation the Bioconductor submission. - -We also acknowledge feedback and contributions from (Github aliases provided - within parenthesis): Ben Dulken (@bdulken), Chang Xu (@xuchang116), -Danilo Horta (@Horta), Dmitriy Zhukov (@dvzhukov), -Jens Preußner (@jenzopr), Joanna Dreux (@Joannacodes), Kevin Rue-Albrecht (@kevinrue), Luke Zappia (@lazappi), Simon Anders (@s-andrews), Yongchao Ge and -Yuan Cao (@yuancao90), among others. - -This work has been funded by the MRC Biostatistics Unit (MRC grant no. - MRC_MC_UP_0801/1; Catalina Vallejos and Sylvia Richardson), EMBL European -Bioinformatics Institute (core European Molecular Biology Laboratory funding; - Catalina Vallejos, Nils Eling and John Marioni), CRUK Cambridge Institute -(core CRUK funding; John Marioni) and The Alan Turing Institute (EPSRC grant no. EP/N510129/1; Catalina Vallejos). - -*** - - # References - - [1] Vallejos CA, Marioni JCM and Richardson S (2015) BASiCS: Bayesian analysis -of single-cell sequencing data. *PLoS Computational Biology* 11 (6), e1004333. +# Case studies -[2] Vallejos CA, Richardson S and Marioni JCM (2016) Beyond comparisons of -means: understanding changes in gene expression at the single-cell level. -*Genome Biology* 17 (1), 1-14. +The following datasets can be used to test the performance of this approach: -[3] Martinez-Jimenez CP, Eling N, Chen H, Vallejos CA, Kolodziejczyk AA, -Connor F, Stojic L, Rayner TF, Stubbington MJT, Teichmann SA, de la Roche M, -Marioni JC and Odom DT (2017) Aging increases cell-to-cell transcriptional -variability upon immune stimulation. *Science* 355 (6332), 1433-1436. -[4] Newton MA, Noueiry A, Sarkar D, Ahlquist P (2004) Detecting differential -gene expression with a semiparametric hierarchical mixture method. -*Biostatistics* 5 (2), 155-76. -[5] Grün D, Kester L, van Oudenaarden A (2014) Validation of noise models -for single-cell transcriptomics. *Nature Methods* 11 (6), 637-40. -[6] Vallejos CA, Risso D, Scialdone A, Dudoit S and Marioni JCM (2017) -Normalizing single-cell RNA-sequencing data: challenges and opportunities. -*Nature Methods* 14, 565-571. -[7] Roberts GO and Rosenthal JS (2009). Examples of adaptive MCMC. *Journal of Computational and Graphical Statistics* 18: 349-367. # Session information From 818eba25f43b8935cf6e8023513992b2e99531b6 Mon Sep 17 00:00:00 2001 From: catavallejos Date: Wed, 27 Sep 2017 22:19:16 +0100 Subject: [PATCH 4/7] Back-up commit while debugging no-spikes version --- R/BASiCS_MCMC.R | 14 +++--- src/BASiCS_CPPcode.cpp | 107 +++++++++++++++++++++++++++++++++-------- 2 files changed, 93 insertions(+), 28 deletions(-) diff --git a/R/BASiCS_MCMC.R b/R/BASiCS_MCMC.R index e822be24..993e46cf 100644 --- a/R/BASiCS_MCMC.R +++ b/R/BASiCS_MCMC.R @@ -399,13 +399,13 @@ BASiCS_MCMC <- function(Data, N, Thin, Burn, ...) { as.matrix(assay(Data)), BatchDesign, mu0, delta0, - phi0, nu0, - rep(theta0, nBatch), + s0, nu0, + c(1.936500, 1.601918), # rep(theta0, nBatch), PriorParam$s2.mu, PriorParam$a.delta, PriorParam$b.delta, - PriorParam$a.phi, - PriorParam$b.phi, + PriorParam$a.s, + PriorParam$b.s, PriorParam$a.theta, PriorParam$b.theta, PriorParam$s2.delta, @@ -433,8 +433,8 @@ BASiCS_MCMC <- function(Data, N, Thin, Burn, ...) { colnames(Chain$mu) <- rownames(assay(Data))[!isSpike(Data)] colnames(Chain$delta) <- rownames(assay(Data))[!isSpike(Data)] CellLabels <- paste0("Cell", 1:n, "_Batch", metadata(Data)$BatchInfo) - colnames(Chain$phi) <- CellLabels - if (length(metadata(Data)$SpikeInput) > 1) { colnames(Chain$s) <- CellLabels } + colnames(Chain$s) <- CellLabels + if (length(metadata(Data)$SpikeInput) > 1) { colnames(Chain$phi) <- CellLabels } colnames(Chain$nu) <- CellLabels colnames(Chain$theta) <- paste0("Batch", unique(metadata(Data)$BatchInfo)) @@ -451,7 +451,7 @@ BASiCS_MCMC <- function(Data, N, Thin, Burn, ...) { if (length(metadata(Data)$SpikeInput) == 1) { - Chain$s <- matrix(1, ncol = ncol(Chain$phi), nrow = nrow(Chain$phi)) + Chain$phi <- matrix(1, ncol = ncol(Chain$s), nrow = nrow(Chain$s)) } ChainClass <- newBASiCS_Chain(mu = Chain$mu, delta = Chain$delta, diff --git a/src/BASiCS_CPPcode.cpp b/src/BASiCS_CPPcode.cpp index 9212f7f5..d6efcc9b 100644 --- a/src/BASiCS_CPPcode.cpp +++ b/src/BASiCS_CPPcode.cpp @@ -1302,6 +1302,71 @@ arma::mat HiddenBASiCS_DenoisedRates( /* IGNORE CODE FROM HERE ONWARDS */ +/* Metropolis-Hastings updates of nu (batch case) + * Updates are implemented simulateaneously for all cells. + */ +arma::mat nuUpdateNoSpikes( + arma::vec const& nu0, + arma::vec const& prop_var, + arma::mat const& Counts, + arma::mat const& BatchDesign, + arma::vec const& mu, + arma::vec const& invdelta, + arma::vec const& phi, + arma::vec const& thetaBatch, + arma::vec const& sum_bygene_all, + int const& q0, + int const& n, + arma::vec & nu1, + arma::vec & u, + arma::vec & ind) +{ + using arma::span; + + // PROPOSAL STEP + nu1 = exp(arma::randn(n) % sqrt(prop_var) + log(nu0)); + u = arma::randu(n); + + // ACCEPT/REJECT STEP + arma::vec log_aux = arma::zeros(n); + + for (int j=0; j < n; j++) + { + for (int i=0; i < q0; i++) + { + log_aux(j) -= ( Counts(i,j) + invdelta(i) ) * + log( ( nu1(j)*mu(i) + invdelta(i) ) / + ( nu0(j)*mu(i) + invdelta(i) )); + } + } + + log_aux += (log(nu1) - log(nu0)) % (sum_bygene_all + 1/thetaBatch); + log_aux -= (nu1 - nu0) % (1/(thetaBatch % phi)); + + /* CREATING OUTPUT VARIABLE & DEBUG + * Proposed values are automatically rejected in the following cases: + * - If smaller than 1e-5 + * - If the proposed value is not finite + * - When the acceptance rate cannot be numerally computed + */ + for (int j=0; j < n; j++) + { + if(arma::is_finite(log_aux(j)) & arma::is_finite(nu1(j))) + { + if( (log(u(j)) < log_aux(j)) & (nu1(j) > 1e-5) ) { ind(j) = 1; } + else {ind(j) = 0; nu1(j) = nu0(j); } + } + else + { + ind(j) = 0; nu1(j) = nu0(j); + Rcpp::Rcout << "Error when updating nu " << j+1 << std::endl; + Rcpp::warning("Consider additional data filter if error is frequent."); + } + } + + // OUTPUT + return join_rows(nu1, ind); +} /* MCMC sampler * N: Total number of MCMC draws @@ -1344,7 +1409,7 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( NumericMatrix BatchDesign, // Design matrix representing batch information (number of columns must be equal to number of batches) NumericVector mu0, // Starting value of $\mu=(\mu_1,...,\mu_q_0)'$ (true mRNA content for technical genes) NumericVector delta0, // Starting value of $\delta=(\delta_1,...,\delta_{q_0})'$ - NumericVector phi0, // Starting value of $\phi=(\phi_1,...,\phi_n)$'$ + NumericVector s0, // Starting value of $\phi=(\phi_1,...,\phi_n)$'$ NumericVector nu0, // Starting value of $\nu=(\nu_1,...,\nu_n)$'$ NumericVector theta0, // Starting value of $\theta$ double s2mu, @@ -1352,8 +1417,8 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( double bdelta, // Rate hyper-parameter of the Gamma($a_{\delta}$,$b_{\delta}$) prior assigned to each $\delta_i$ double s2delta, double prior_delta, - double aphi, - double bphi, + double as, + double bs, double atheta, // Shape hyper-parameter of the Gamma($a_{\theta}$,$b_{\theta}$) prior assigned to $\theta$ double btheta, // Rate hyper-parameter of the Gamma($a_{\theta}$,$b_{\theta}$) prior assigned to $\theta$ double ar, // Optimal acceptance rate for adaptive Metropolis-Hastings updates @@ -1408,7 +1473,7 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( // OBJECTS WHERE DRAWS WILL BE STORED arma::mat mu = zeros(q0, Naux); arma::mat delta = zeros(q0, Naux); - arma::mat phi = ones(n, Naux); + arma::mat s = zeros(n, Naux); arma::mat nu = zeros(n, Naux); arma::mat theta = zeros(nBatch, Naux); arma::mat LSmu; @@ -1431,10 +1496,10 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( arma::vec thetaAccept = zeros(nBatch); arma::vec PthetaAux = zeros(nBatch); // INITIALIZATION OF VALUES FOR MCMC RUN - arma::mat muAux = zeros(q0,2); muAux.col(0)=as_arma(mu0); - arma::mat deltaAux = zeros(q0,2); deltaAux.col(0)=as_arma(delta0); - arma::vec phiAux = as_arma(phi0); - arma::mat nuAux = zeros(n,2); nuAux.col(0)=as_arma(nu0); + arma::mat muAux = zeros(q0,2); muAux.col(0) = as_arma(mu0); + arma::mat deltaAux = zeros(q0,2); deltaAux.col(0) = as_arma(delta0); + arma::vec sAux = as_arma(s0); + arma::mat nuAux = zeros(n,2); nuAux.col(0) = as_arma(nu0); arma::mat thetaAux = zeros(nBatch, 2); thetaAux.col(0) = as_arma(theta0); arma::vec thetaBatch = BatchDesign_arma * as_arma(theta0); @@ -1486,17 +1551,17 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( // UPDATE OF PHI: // SAME AS FULL CONDITIONAL FOR S IN THE VERTICAL INTEGRATION CASE - phiAux = sUpdateBatch(phiAux, nuAux.col(0), thetaBatch, - aphi, bphi, BatchDesign_arma, n, y_n); + sAux = sUpdateBatch(sAux, nuAux.col(0), thetaBatch, + as, bs, BatchDesign_arma, n, y_n); // UPDATE OF THETA: // 1st ELEMENT IS THE UPDATE, // 2nd ELEMENT IS THE ACCEPTANCE INDICATOR - thetaAux = thetaUpdateBatch(thetaAux.col(0), exp(LSthetaAux), - BatchDesign_arma, BatchSizes, - phiAux, nuAux.col(0), atheta, btheta, n, nBatch); - PthetaAux += thetaAux.col(1); if(i>=Burn) {thetaAccept += thetaAux.col(1);} - thetaBatch = BatchDesign_arma * thetaAux.col(0); +// thetaAux = thetaUpdateBatch(thetaAux.col(0), exp(LSthetaAux), +// BatchDesign_arma, BatchSizes, +// phiAux, nuAux.col(0), atheta, btheta, n, nBatch); +// PthetaAux += thetaAux.col(1); if(i>=Burn) {thetaAccept += thetaAux.col(1);} +// thetaBatch = BatchDesign_arma * thetaAux.col(0); // UPDATE OF MU: 1st COLUMN IS THE UPDATE, 2nd COLUMN IS THE ACCEPTANCE INDICATOR @@ -1525,10 +1590,10 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( // UPDATE OF NU: // 1st COLUMN IS THE UPDATE, // 2nd COLUMN IS THE ACCEPTANCE INDICATOR - nuAux = nuUpdateBatch(nuAux.col(0), exp(LSnuAux), Counts_arma, 0, + nuAux = nuUpdateNoSpikes(nuAux.col(0), exp(LSnuAux), Counts_arma, BatchDesign_arma, muAux.col(0), 1/deltaAux.col(0), - ones_n, phiAux, thetaBatch, sumByGeneAll_arma, q0, n, + sAux, thetaBatch, sumByGeneAll_arma, q0, n, y_n, u_n, ind_n); PnuAux += nuAux.col(1); if(i>=Burn) {nuAccept += nuAux.col(1);} @@ -1564,7 +1629,7 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( { mu.col(i/Thin - Burn/Thin) = muAux.col(0); delta.col(i/Thin - Burn/Thin) = deltaAux.col(0); - phi.col(i/Thin - Burn/Thin) = phiAux; + s.col(i/Thin - Burn/Thin) = sAux; nu.col(i/Thin - Burn/Thin) = nuAux.col(0); theta.col(i/Thin - Burn/Thin) = thetaAux.col(0); @@ -1586,7 +1651,7 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( Rcout << "Current draws of some selected parameters are displayed below." << std::endl; Rcout << "mu (gene 1): " << muAux(0,0) << std::endl; Rcout << "delta (gene 1): " << deltaAux(0,0) << std::endl; - Rcout << "phi (cell 1): " << phiAux(0) << std::endl; + Rcout << "s (cell 1): " << sAux(0) << std::endl; Rcout << "nu (cell 1): " << nuAux(0,0) << std::endl; Rcout << "theta (batch 1): " << thetaAux(0,0) << std::endl; Rcout << "--------------------------------------------------------------------" << std::endl; @@ -1637,7 +1702,7 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( return(Rcpp::List::create( Rcpp::Named("mu") = mu.t(), Rcpp::Named("delta") = delta.t(), - Rcpp::Named("phi") = phi.t(), + Rcpp::Named("s") = s.t(), Rcpp::Named("nu") = nu.t(), Rcpp::Named("theta") = theta.t(), Rcpp::Named("ls.mu") = LSmu.t(), @@ -1653,7 +1718,7 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( return(Rcpp::List::create( Rcpp::Named("mu") = mu.t(), Rcpp::Named("delta") = delta.t(), - Rcpp::Named("phi") = phi.t(), + Rcpp::Named("s") = s.t(), Rcpp::Named("nu") = nu.t(), Rcpp::Named("theta") = theta.t(), Rcpp::Named("RefFreq") = RefFreq/(N-Burn))); From 0b89f353952cea38479ccf4409962d7ebbeedbe4 Mon Sep 17 00:00:00 2001 From: catavallejos Date: Wed, 27 Sep 2017 23:28:50 +0100 Subject: [PATCH 5/7] No-spikes implementation - `makeExampleBASiCS_Data` + `newBASiCS_Data` adapted to no-spikes case - Temporary vignette file for no-spikes case has been created - `HiddenBASiCS_MCMCcppNoSpikes` updated to recycle vertical integration update functions - Fixed bug to allow use of stochastic reference - Fixed bug in argument order for no-spikes version --- R/BASiCS_MCMC.R | 9 ++++----- R/RcppExports.R | 4 ++-- src/BASiCS_CPPcode.cpp | 10 +++++----- src/RcppExports.cpp | 12 ++++++------ vignettes/BASiCS_nospikes.Rmd | 2 ++ 5 files changed, 19 insertions(+), 18 deletions(-) diff --git a/R/BASiCS_MCMC.R b/R/BASiCS_MCMC.R index 993e46cf..eeffab05 100644 --- a/R/BASiCS_MCMC.R +++ b/R/BASiCS_MCMC.R @@ -346,7 +346,6 @@ BASiCS_MCMC <- function(Data, N, Thin, Burn, ...) { StochasticRef <- ifelse("StochasticRef" %in% names(args), args$StochasticRef, TRUE) - BatchDesign <- model.matrix(~as.factor(metadata(Data)$BatchInfo) - 1) BatchSizes <- table(metadata(Data)$BatchInfo) BatchIds <- as.numeric(names(BatchSizes)) BatchOffSet <- rep(1, times = nBatch) @@ -399,17 +398,17 @@ BASiCS_MCMC <- function(Data, N, Thin, Burn, ...) { as.matrix(assay(Data)), BatchDesign, mu0, delta0, - s0, nu0, - c(1.936500, 1.601918), # rep(theta0, nBatch), + phi0, nu0, + rep(theta0, nBatch), PriorParam$s2.mu, PriorParam$a.delta, PriorParam$b.delta, + PriorParam$s2.delta, + PriorDeltaNum, PriorParam$a.s, PriorParam$b.s, PriorParam$a.theta, PriorParam$b.theta, - PriorParam$s2.delta, - PriorDeltaNum, AR, ls.mu0, ls.delta0, ls.nu0, diff --git a/R/RcppExports.R b/R/RcppExports.R index 2d4a6c51..b8db1c6b 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -9,7 +9,7 @@ HiddenBASiCS_DenoisedRates <- function(CountsBio, Mu, TransInvDelta, PhiNu, N, q .Call('_BASiCS_HiddenBASiCS_DenoisedRates', PACKAGE = 'BASiCS', CountsBio, Mu, TransInvDelta, PhiNu, N, q0, n) } -HiddenBASiCS_MCMCcppNoSpikes <- function(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, phi0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, aphi, bphi, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType, StochasticRef) { - .Call('_BASiCS_HiddenBASiCS_MCMCcppNoSpikes', PACKAGE = 'BASiCS', N, Thin, Burn, Counts, BatchDesign, mu0, delta0, phi0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, aphi, bphi, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType, StochasticRef) +HiddenBASiCS_MCMCcppNoSpikes <- function(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, s0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, as, bs, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType, StochasticRef) { + .Call('_BASiCS_HiddenBASiCS_MCMCcppNoSpikes', PACKAGE = 'BASiCS', N, Thin, Burn, Counts, BatchDesign, mu0, delta0, s0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, as, bs, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType, StochasticRef) } diff --git a/src/BASiCS_CPPcode.cpp b/src/BASiCS_CPPcode.cpp index d6efcc9b..8b7f7f8d 100644 --- a/src/BASiCS_CPPcode.cpp +++ b/src/BASiCS_CPPcode.cpp @@ -1557,11 +1557,11 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( // UPDATE OF THETA: // 1st ELEMENT IS THE UPDATE, // 2nd ELEMENT IS THE ACCEPTANCE INDICATOR -// thetaAux = thetaUpdateBatch(thetaAux.col(0), exp(LSthetaAux), -// BatchDesign_arma, BatchSizes, -// phiAux, nuAux.col(0), atheta, btheta, n, nBatch); -// PthetaAux += thetaAux.col(1); if(i>=Burn) {thetaAccept += thetaAux.col(1);} -// thetaBatch = BatchDesign_arma * thetaAux.col(0); + thetaAux = thetaUpdateBatch(thetaAux.col(0), exp(LSthetaAux), + BatchDesign_arma, BatchSizes, + sAux, nuAux.col(0), atheta, btheta, n, nBatch); + PthetaAux += thetaAux.col(1); if(i>=Burn) {thetaAccept += thetaAux.col(1);} + thetaBatch = BatchDesign_arma * thetaAux.col(0); // UPDATE OF MU: 1st COLUMN IS THE UPDATE, 2nd COLUMN IS THE ACCEPTANCE INDICATOR diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index b5cef5a5..a954c484 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -69,8 +69,8 @@ BEGIN_RCPP END_RCPP } // HiddenBASiCS_MCMCcppNoSpikes -Rcpp::List HiddenBASiCS_MCMCcppNoSpikes(int N, int Thin, int Burn, NumericMatrix Counts, NumericMatrix BatchDesign, NumericVector mu0, NumericVector delta0, NumericVector phi0, NumericVector nu0, NumericVector theta0, double s2mu, double adelta, double bdelta, double s2delta, double prior_delta, double aphi, double bphi, double atheta, double btheta, double ar, NumericVector LSmu0, NumericVector LSdelta0, NumericVector LSnu0, NumericVector LStheta0, NumericVector sumByCellAll, NumericVector sumByGeneAll, int StoreAdapt, int EndAdapt, int PrintProgress, NumericVector BatchInfo, NumericVector BatchIds, NumericVector BatchOffSet, double Constrain, NumericVector Index, int RefGene, NumericVector RefGenes, NumericVector ConstrainGene, NumericVector NotConstrainGene, int ConstrainType, int StochasticRef); -RcppExport SEXP _BASiCS_HiddenBASiCS_MCMCcppNoSpikes(SEXP NSEXP, SEXP ThinSEXP, SEXP BurnSEXP, SEXP CountsSEXP, SEXP BatchDesignSEXP, SEXP mu0SEXP, SEXP delta0SEXP, SEXP phi0SEXP, SEXP nu0SEXP, SEXP theta0SEXP, SEXP s2muSEXP, SEXP adeltaSEXP, SEXP bdeltaSEXP, SEXP s2deltaSEXP, SEXP prior_deltaSEXP, SEXP aphiSEXP, SEXP bphiSEXP, SEXP athetaSEXP, SEXP bthetaSEXP, SEXP arSEXP, SEXP LSmu0SEXP, SEXP LSdelta0SEXP, SEXP LSnu0SEXP, SEXP LStheta0SEXP, SEXP sumByCellAllSEXP, SEXP sumByGeneAllSEXP, SEXP StoreAdaptSEXP, SEXP EndAdaptSEXP, SEXP PrintProgressSEXP, SEXP BatchInfoSEXP, SEXP BatchIdsSEXP, SEXP BatchOffSetSEXP, SEXP ConstrainSEXP, SEXP IndexSEXP, SEXP RefGeneSEXP, SEXP RefGenesSEXP, SEXP ConstrainGeneSEXP, SEXP NotConstrainGeneSEXP, SEXP ConstrainTypeSEXP, SEXP StochasticRefSEXP) { +Rcpp::List HiddenBASiCS_MCMCcppNoSpikes(int N, int Thin, int Burn, NumericMatrix Counts, NumericMatrix BatchDesign, NumericVector mu0, NumericVector delta0, NumericVector s0, NumericVector nu0, NumericVector theta0, double s2mu, double adelta, double bdelta, double s2delta, double prior_delta, double as, double bs, double atheta, double btheta, double ar, NumericVector LSmu0, NumericVector LSdelta0, NumericVector LSnu0, NumericVector LStheta0, NumericVector sumByCellAll, NumericVector sumByGeneAll, int StoreAdapt, int EndAdapt, int PrintProgress, NumericVector BatchInfo, NumericVector BatchIds, NumericVector BatchOffSet, double Constrain, NumericVector Index, int RefGene, NumericVector RefGenes, NumericVector ConstrainGene, NumericVector NotConstrainGene, int ConstrainType, int StochasticRef); +RcppExport SEXP _BASiCS_HiddenBASiCS_MCMCcppNoSpikes(SEXP NSEXP, SEXP ThinSEXP, SEXP BurnSEXP, SEXP CountsSEXP, SEXP BatchDesignSEXP, SEXP mu0SEXP, SEXP delta0SEXP, SEXP s0SEXP, SEXP nu0SEXP, SEXP theta0SEXP, SEXP s2muSEXP, SEXP adeltaSEXP, SEXP bdeltaSEXP, SEXP s2deltaSEXP, SEXP prior_deltaSEXP, SEXP asSEXP, SEXP bsSEXP, SEXP athetaSEXP, SEXP bthetaSEXP, SEXP arSEXP, SEXP LSmu0SEXP, SEXP LSdelta0SEXP, SEXP LSnu0SEXP, SEXP LStheta0SEXP, SEXP sumByCellAllSEXP, SEXP sumByGeneAllSEXP, SEXP StoreAdaptSEXP, SEXP EndAdaptSEXP, SEXP PrintProgressSEXP, SEXP BatchInfoSEXP, SEXP BatchIdsSEXP, SEXP BatchOffSetSEXP, SEXP ConstrainSEXP, SEXP IndexSEXP, SEXP RefGeneSEXP, SEXP RefGenesSEXP, SEXP ConstrainGeneSEXP, SEXP NotConstrainGeneSEXP, SEXP ConstrainTypeSEXP, SEXP StochasticRefSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -81,7 +81,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< NumericMatrix >::type BatchDesign(BatchDesignSEXP); Rcpp::traits::input_parameter< NumericVector >::type mu0(mu0SEXP); Rcpp::traits::input_parameter< NumericVector >::type delta0(delta0SEXP); - Rcpp::traits::input_parameter< NumericVector >::type phi0(phi0SEXP); + Rcpp::traits::input_parameter< NumericVector >::type s0(s0SEXP); Rcpp::traits::input_parameter< NumericVector >::type nu0(nu0SEXP); Rcpp::traits::input_parameter< NumericVector >::type theta0(theta0SEXP); Rcpp::traits::input_parameter< double >::type s2mu(s2muSEXP); @@ -89,8 +89,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< double >::type bdelta(bdeltaSEXP); Rcpp::traits::input_parameter< double >::type s2delta(s2deltaSEXP); Rcpp::traits::input_parameter< double >::type prior_delta(prior_deltaSEXP); - Rcpp::traits::input_parameter< double >::type aphi(aphiSEXP); - Rcpp::traits::input_parameter< double >::type bphi(bphiSEXP); + Rcpp::traits::input_parameter< double >::type as(asSEXP); + Rcpp::traits::input_parameter< double >::type bs(bsSEXP); Rcpp::traits::input_parameter< double >::type atheta(athetaSEXP); Rcpp::traits::input_parameter< double >::type btheta(bthetaSEXP); Rcpp::traits::input_parameter< double >::type ar(arSEXP); @@ -114,7 +114,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< NumericVector >::type NotConstrainGene(NotConstrainGeneSEXP); Rcpp::traits::input_parameter< int >::type ConstrainType(ConstrainTypeSEXP); Rcpp::traits::input_parameter< int >::type StochasticRef(StochasticRefSEXP); - rcpp_result_gen = Rcpp::wrap(HiddenBASiCS_MCMCcppNoSpikes(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, phi0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, aphi, bphi, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType, StochasticRef)); + rcpp_result_gen = Rcpp::wrap(HiddenBASiCS_MCMCcppNoSpikes(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, s0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, as, bs, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType, StochasticRef)); return rcpp_result_gen; END_RCPP } diff --git a/vignettes/BASiCS_nospikes.Rmd b/vignettes/BASiCS_nospikes.Rmd index 47b771c6..c805d629 100644 --- a/vignettes/BASiCS_nospikes.Rmd +++ b/vignettes/BASiCS_nospikes.Rmd @@ -122,7 +122,9 @@ plot(Chain2, Param = "theta", Batch = 1) plot(Chain2, Param = "mu", Gene = 1) plot(Chain2, Param = "delta", Gene = 1) Summary2 <- Summary(Chain2) +``` +```{r} Chain3 <- BASiCS_MCMC(Data = Data2, N = 10000, Thin = 10, Burn = 5000, PrintProgress = FALSE, StoreDir = tempdir(), StochasticRef = TRUE) From 2d4bac44e635fe887b86f66ee6eed65eb70a09aa Mon Sep 17 00:00:00 2001 From: catavallejos Date: Wed, 27 Sep 2017 23:42:17 +0100 Subject: [PATCH 6/7] No-spikes implementation - `makeExampleBASiCS_Data` + `newBASiCS_Data` adapted to no-spikes case - Temporary vignette file for no-spikes case has been created - `HiddenBASiCS_MCMCcppNoSpikes` updated to recycle vertical integration update functions - Fixed bug to allow use of stochastic reference - Fixed bug in argument order for no-spikes version - Spikes version code clean-up --- NEWS | 2 + R/BASiCS_MCMC.R | 15 +----- R/RcppExports.R | 4 +- src/BASiCS_CPPcode.cpp | 119 +++++++++-------------------------------- src/RcppExports.cpp | 11 ++-- 5 files changed, 33 insertions(+), 118 deletions(-) diff --git a/NEWS b/NEWS index 2898446d..5d192591 100644 --- a/NEWS +++ b/NEWS @@ -598,6 +598,8 @@ No-spikes implementation - `HiddenBASiCS_MCMCcppNoSpikes` updated to recycle vertical integration update functions - Fixed bug to allow use of stochastic reference +- Fixed bug in argument order for no-spikes version +- Spikes version code clean-up TO-DO - Implement Giacomo's suggestion? diff --git a/R/BASiCS_MCMC.R b/R/BASiCS_MCMC.R index eeffab05..8b8d4e2e 100644 --- a/R/BASiCS_MCMC.R +++ b/R/BASiCS_MCMC.R @@ -346,17 +346,6 @@ BASiCS_MCMC <- function(Data, N, Thin, Burn, ...) { StochasticRef <- ifelse("StochasticRef" %in% names(args), args$StochasticRef, TRUE) - BatchSizes <- table(metadata(Data)$BatchInfo) - BatchIds <- as.numeric(names(BatchSizes)) - BatchOffSet <- rep(1, times = nBatch) - for (k in 2:nBatch) - { - aux1 <- matrixStats::colSums2(assay(Data)[, - metadata(Data)$BatchInfo == BatchIds[k]]) - aux2 <- colSums(assay(Data)[, metadata(Data)$BatchInfo == BatchIds[1]]) - BatchOffSet[k] <- median(aux1) / median(aux2) - } - # Auxiliary vector contaning a gene index Index <- (1:q.bio) - 1 # In the following '+1' is used as c++ vector indexes vectors setting @@ -418,9 +407,7 @@ BASiCS_MCMC <- function(Data, N, Thin, Burn, ...) { StoreAdaptNumber, StopAdapt, as.numeric(PrintProgress), - metadata(Data)$BatchInfo, - BatchIds, - BatchOffSet, Constrain, + Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, diff --git a/R/RcppExports.R b/R/RcppExports.R index b8db1c6b..cb824ffd 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -9,7 +9,7 @@ HiddenBASiCS_DenoisedRates <- function(CountsBio, Mu, TransInvDelta, PhiNu, N, q .Call('_BASiCS_HiddenBASiCS_DenoisedRates', PACKAGE = 'BASiCS', CountsBio, Mu, TransInvDelta, PhiNu, N, q0, n) } -HiddenBASiCS_MCMCcppNoSpikes <- function(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, s0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, as, bs, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType, StochasticRef) { - .Call('_BASiCS_HiddenBASiCS_MCMCcppNoSpikes', PACKAGE = 'BASiCS', N, Thin, Burn, Counts, BatchDesign, mu0, delta0, s0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, as, bs, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType, StochasticRef) +HiddenBASiCS_MCMCcppNoSpikes <- function(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, s0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, as, bs, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType, StochasticRef) { + .Call('_BASiCS_HiddenBASiCS_MCMCcppNoSpikes', PACKAGE = 'BASiCS', N, Thin, Burn, Counts, BatchDesign, mu0, delta0, s0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, as, bs, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType, StochasticRef) } diff --git a/src/BASiCS_CPPcode.cpp b/src/BASiCS_CPPcode.cpp index 8b7f7f8d..c6e968cb 100644 --- a/src/BASiCS_CPPcode.cpp +++ b/src/BASiCS_CPPcode.cpp @@ -1185,20 +1185,21 @@ arma::mat muUpdateNoSpikes( double const& s2_mu, int const& q0, int const& n, - arma::vec & mu, - arma::vec & ind, int const& RefGene, arma::uvec const& ConstrainGene, arma::uvec const& NotConstrainGene, - int const& ConstrainType) + int const& ConstrainType, + arma::vec & mu1, + arma::vec & u, + arma::vec & ind) { using arma::span; // PROPOSAL STEP - arma::vec y = exp(arma::randn(q0) % sqrt(prop_var) + log(mu0)); - arma::vec u = arma::randu(q0); + mu1 = exp(arma::randn(q0) % sqrt(prop_var) + log(mu0)); + u = arma::randu(q0); // INITIALIZE MU - mu = mu0 + 1 - 1; + arma::vec mu = mu0 + 1 - 1; double aux; double iAux; double sumAux = sum(log(mu0.elem(ConstrainGene))) - log(mu0(RefGene)); @@ -1207,7 +1208,7 @@ arma::mat muUpdateNoSpikes( // Step 1: Computing the likelihood contribution of the acceptance rate // Calculated in the same way for all genes, // but the reference one (no need to be sequential) - arma::vec log_aux = (log(y) - log(mu0)) % sum_bycell_all; + arma::vec log_aux = (log(mu1) - log(mu0)) % sum_bycell_all; for (int i=0; i < q0; i++) { if(i != RefGene) @@ -1215,7 +1216,7 @@ arma::mat muUpdateNoSpikes( for (int j=0; j < n; j++) { log_aux(i) -= ( Counts(i,j) + (1/delta(i)) ) * - log( ( nu(j)*y(i)+(1/delta(i)) ) / ( nu(j)*mu(i)+(1/delta(i)) )); + log( ( nu(j)*mu1(i)+(1/delta(i)) ) / ( nu(j)*mu(i)+(1/delta(i)) )); } } } @@ -1229,12 +1230,12 @@ arma::mat muUpdateNoSpikes( if(iAux != RefGene) { aux = 0.5 * (ConstrainGene.size() * Constrain - (sumAux - log(mu(iAux)))); - log_aux(iAux) -= (0.5 * 2 /s2_mu) * (pow(log(y(iAux)) - aux,2)); + log_aux(iAux) -= (0.5 * 2 /s2_mu) * (pow(log(mu1(iAux)) - aux,2)); log_aux(iAux) += (0.5 * 2 /s2_mu) * (pow(log(mu0(iAux)) - aux,2)); // ACCEPT REJECT - if((log(u(iAux)) < log_aux(iAux)) & (y(iAux) > 1e-3)) + if((log(u(iAux)) < log_aux(iAux)) & (mu1(iAux) > 1e-3)) { - ind(iAux) = 1; mu(iAux) = y(iAux); + ind(iAux) = 1; mu(iAux) = mu1(iAux); sumAux += log(mu(iAux)) - log(mu0(iAux)); } else{ind(iAux) = 0; mu(iAux) = mu0(iAux); } @@ -1251,11 +1252,11 @@ arma::mat muUpdateNoSpikes( for (int i=0; i < NotConstrainGene.size(); i++) { iAux = NotConstrainGene(i); - log_aux(iAux) -= (0.5/s2_mu) * (pow(log(y(iAux)),2) - pow(log(mu0(iAux)),2)); + log_aux(iAux) -= (0.5/s2_mu) * (pow(log(mu1(iAux)),2) - pow(log(mu0(iAux)),2)); // ACCEPT REJECT - if((log(u(iAux)) < log_aux(iAux)) & (y(iAux) > 1e-3)) + if((log(u(iAux)) < log_aux(iAux)) & (mu1(iAux) > 1e-3)) { - ind(iAux) = 1; mu(iAux) = y(iAux); + ind(iAux) = 1; mu(iAux) = mu1(iAux); } else{ind(iAux) = 0; mu(iAux) = mu0(iAux);} } @@ -1302,71 +1303,6 @@ arma::mat HiddenBASiCS_DenoisedRates( /* IGNORE CODE FROM HERE ONWARDS */ -/* Metropolis-Hastings updates of nu (batch case) - * Updates are implemented simulateaneously for all cells. - */ -arma::mat nuUpdateNoSpikes( - arma::vec const& nu0, - arma::vec const& prop_var, - arma::mat const& Counts, - arma::mat const& BatchDesign, - arma::vec const& mu, - arma::vec const& invdelta, - arma::vec const& phi, - arma::vec const& thetaBatch, - arma::vec const& sum_bygene_all, - int const& q0, - int const& n, - arma::vec & nu1, - arma::vec & u, - arma::vec & ind) -{ - using arma::span; - - // PROPOSAL STEP - nu1 = exp(arma::randn(n) % sqrt(prop_var) + log(nu0)); - u = arma::randu(n); - - // ACCEPT/REJECT STEP - arma::vec log_aux = arma::zeros(n); - - for (int j=0; j < n; j++) - { - for (int i=0; i < q0; i++) - { - log_aux(j) -= ( Counts(i,j) + invdelta(i) ) * - log( ( nu1(j)*mu(i) + invdelta(i) ) / - ( nu0(j)*mu(i) + invdelta(i) )); - } - } - - log_aux += (log(nu1) - log(nu0)) % (sum_bygene_all + 1/thetaBatch); - log_aux -= (nu1 - nu0) % (1/(thetaBatch % phi)); - - /* CREATING OUTPUT VARIABLE & DEBUG - * Proposed values are automatically rejected in the following cases: - * - If smaller than 1e-5 - * - If the proposed value is not finite - * - When the acceptance rate cannot be numerally computed - */ - for (int j=0; j < n; j++) - { - if(arma::is_finite(log_aux(j)) & arma::is_finite(nu1(j))) - { - if( (log(u(j)) < log_aux(j)) & (nu1(j) > 1e-5) ) { ind(j) = 1; } - else {ind(j) = 0; nu1(j) = nu0(j); } - } - else - { - ind(j) = 0; nu1(j) = nu0(j); - Rcpp::Rcout << "Error when updating nu " << j+1 << std::endl; - Rcpp::warning("Consider additional data filter if error is frequent."); - } - } - - // OUTPUT - return join_rows(nu1, ind); -} /* MCMC sampler * N: Total number of MCMC draws @@ -1431,9 +1367,6 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( int StoreAdapt, int EndAdapt, int PrintProgress, - NumericVector BatchInfo, - NumericVector BatchIds, - NumericVector BatchOffSet, double Constrain, NumericVector Index, int RefGene, @@ -1459,9 +1392,6 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( arma::vec sumByGeneAll_arma = as_arma(sumByGeneAll); arma::mat Counts_arma = as_arma(Counts); arma::mat BatchDesign_arma = as_arma(BatchDesign); - arma::vec BatchInfo_arma = as_arma(BatchInfo); - arma::vec BatchIds_arma = as_arma(BatchIds); - arma::vec BatchOffSet_arma = as_arma(BatchOffSet); arma::vec Index_arma = as_arma(Index); arma::uvec ConstrainGene_arma = Rcpp::as(ConstrainGene); arma::uvec NotConstrainGene_arma = Rcpp::as(NotConstrainGene); @@ -1526,8 +1456,6 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( arma::vec ones_n = ones(n); // INITIALIZATION OF PARAMETERS TO RETURN IN UPDATE FUNCTIONS - arma::vec muUpdateAux = ones(q0); - double LSmuAuxExtra = 0; arma::vec RefFreq = zeros(q0); int RefAux; @@ -1567,14 +1495,15 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( // UPDATE OF MU: 1st COLUMN IS THE UPDATE, 2nd COLUMN IS THE ACCEPTANCE INDICATOR if(StochasticRef == 1) { - RefAux = as_scalar(arma::randi( 1, arma::distr_param(0, RefGenes_arma.size()-1) )); + RefAux = as_scalar(arma::randi(1,arma::distr_param(0, RefGenes_arma.size()-1))); RefGene = RefGenes(RefAux); if(i >= Burn) {RefFreq(RefGene) += 1;} } - muAux = muUpdateNoSpikes(muAux.col(0), exp(LSmuAux), Constrain, Counts_arma, deltaAux.col(0), - nuAux.col(0), sumByCellAll_arma, s2mu, q0, n, - muUpdateAux, ind_q0, RefGene, ConstrainGene_arma, NotConstrainGene_arma, - ConstrainType); + muAux = muUpdateNoSpikes(muAux.col(0), exp(LSmuAux), Constrain, Counts_arma, + deltaAux.col(0), nuAux.col(0), sumByCellAll_arma, + s2mu, q0, n, RefGene, + ConstrainGene_arma, NotConstrainGene_arma, + ConstrainType, y_q0, u_q0, ind_q0); PmuAux += muAux.col(1); if(i>=Burn) {muAccept += muAux.col(1);} @@ -1590,10 +1519,10 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( // UPDATE OF NU: // 1st COLUMN IS THE UPDATE, // 2nd COLUMN IS THE ACCEPTANCE INDICATOR - nuAux = nuUpdateNoSpikes(nuAux.col(0), exp(LSnuAux), Counts_arma, + nuAux = nuUpdateBatch(nuAux.col(0), exp(LSnuAux), Counts_arma, 0, BatchDesign_arma, muAux.col(0), 1/deltaAux.col(0), - sAux, thetaBatch, sumByGeneAll_arma, q0, n, + ones_n, sAux, thetaBatch, sumByGeneAll_arma, q0, n, y_n, u_n, ind_n); PnuAux += nuAux.col(1); if(i>=Burn) {nuAccept += nuAux.col(1);} @@ -1656,7 +1585,7 @@ Rcpp::List HiddenBASiCS_MCMCcppNoSpikes( Rcout << "theta (batch 1): " << thetaAux(0,0) << std::endl; Rcout << "--------------------------------------------------------------------" << std::endl; Rcout << "Current proposal variances for Metropolis Hastings updates (log-scale)." << std::endl; - Rcout << "LSmu (gene 1): " << LSmuAuxExtra + LSmuAux(0) << std::endl; + Rcout << "LSmu (gene 1): " << LSmuAux(0) << std::endl; Rcout << "LSdelta (gene 1): " << LSdeltaAux(0) << std::endl; Rcout << "LSnu (cell 1): " << LSnuAux(0) << std::endl; Rcout << "LStheta (batch 1): " << LSthetaAux(0) << std::endl; diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index a954c484..6e51767d 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -69,8 +69,8 @@ BEGIN_RCPP END_RCPP } // HiddenBASiCS_MCMCcppNoSpikes -Rcpp::List HiddenBASiCS_MCMCcppNoSpikes(int N, int Thin, int Burn, NumericMatrix Counts, NumericMatrix BatchDesign, NumericVector mu0, NumericVector delta0, NumericVector s0, NumericVector nu0, NumericVector theta0, double s2mu, double adelta, double bdelta, double s2delta, double prior_delta, double as, double bs, double atheta, double btheta, double ar, NumericVector LSmu0, NumericVector LSdelta0, NumericVector LSnu0, NumericVector LStheta0, NumericVector sumByCellAll, NumericVector sumByGeneAll, int StoreAdapt, int EndAdapt, int PrintProgress, NumericVector BatchInfo, NumericVector BatchIds, NumericVector BatchOffSet, double Constrain, NumericVector Index, int RefGene, NumericVector RefGenes, NumericVector ConstrainGene, NumericVector NotConstrainGene, int ConstrainType, int StochasticRef); -RcppExport SEXP _BASiCS_HiddenBASiCS_MCMCcppNoSpikes(SEXP NSEXP, SEXP ThinSEXP, SEXP BurnSEXP, SEXP CountsSEXP, SEXP BatchDesignSEXP, SEXP mu0SEXP, SEXP delta0SEXP, SEXP s0SEXP, SEXP nu0SEXP, SEXP theta0SEXP, SEXP s2muSEXP, SEXP adeltaSEXP, SEXP bdeltaSEXP, SEXP s2deltaSEXP, SEXP prior_deltaSEXP, SEXP asSEXP, SEXP bsSEXP, SEXP athetaSEXP, SEXP bthetaSEXP, SEXP arSEXP, SEXP LSmu0SEXP, SEXP LSdelta0SEXP, SEXP LSnu0SEXP, SEXP LStheta0SEXP, SEXP sumByCellAllSEXP, SEXP sumByGeneAllSEXP, SEXP StoreAdaptSEXP, SEXP EndAdaptSEXP, SEXP PrintProgressSEXP, SEXP BatchInfoSEXP, SEXP BatchIdsSEXP, SEXP BatchOffSetSEXP, SEXP ConstrainSEXP, SEXP IndexSEXP, SEXP RefGeneSEXP, SEXP RefGenesSEXP, SEXP ConstrainGeneSEXP, SEXP NotConstrainGeneSEXP, SEXP ConstrainTypeSEXP, SEXP StochasticRefSEXP) { +Rcpp::List HiddenBASiCS_MCMCcppNoSpikes(int N, int Thin, int Burn, NumericMatrix Counts, NumericMatrix BatchDesign, NumericVector mu0, NumericVector delta0, NumericVector s0, NumericVector nu0, NumericVector theta0, double s2mu, double adelta, double bdelta, double s2delta, double prior_delta, double as, double bs, double atheta, double btheta, double ar, NumericVector LSmu0, NumericVector LSdelta0, NumericVector LSnu0, NumericVector LStheta0, NumericVector sumByCellAll, NumericVector sumByGeneAll, int StoreAdapt, int EndAdapt, int PrintProgress, double Constrain, NumericVector Index, int RefGene, NumericVector RefGenes, NumericVector ConstrainGene, NumericVector NotConstrainGene, int ConstrainType, int StochasticRef); +RcppExport SEXP _BASiCS_HiddenBASiCS_MCMCcppNoSpikes(SEXP NSEXP, SEXP ThinSEXP, SEXP BurnSEXP, SEXP CountsSEXP, SEXP BatchDesignSEXP, SEXP mu0SEXP, SEXP delta0SEXP, SEXP s0SEXP, SEXP nu0SEXP, SEXP theta0SEXP, SEXP s2muSEXP, SEXP adeltaSEXP, SEXP bdeltaSEXP, SEXP s2deltaSEXP, SEXP prior_deltaSEXP, SEXP asSEXP, SEXP bsSEXP, SEXP athetaSEXP, SEXP bthetaSEXP, SEXP arSEXP, SEXP LSmu0SEXP, SEXP LSdelta0SEXP, SEXP LSnu0SEXP, SEXP LStheta0SEXP, SEXP sumByCellAllSEXP, SEXP sumByGeneAllSEXP, SEXP StoreAdaptSEXP, SEXP EndAdaptSEXP, SEXP PrintProgressSEXP, SEXP ConstrainSEXP, SEXP IndexSEXP, SEXP RefGeneSEXP, SEXP RefGenesSEXP, SEXP ConstrainGeneSEXP, SEXP NotConstrainGeneSEXP, SEXP ConstrainTypeSEXP, SEXP StochasticRefSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -103,9 +103,6 @@ BEGIN_RCPP Rcpp::traits::input_parameter< int >::type StoreAdapt(StoreAdaptSEXP); Rcpp::traits::input_parameter< int >::type EndAdapt(EndAdaptSEXP); Rcpp::traits::input_parameter< int >::type PrintProgress(PrintProgressSEXP); - Rcpp::traits::input_parameter< NumericVector >::type BatchInfo(BatchInfoSEXP); - Rcpp::traits::input_parameter< NumericVector >::type BatchIds(BatchIdsSEXP); - Rcpp::traits::input_parameter< NumericVector >::type BatchOffSet(BatchOffSetSEXP); Rcpp::traits::input_parameter< double >::type Constrain(ConstrainSEXP); Rcpp::traits::input_parameter< NumericVector >::type Index(IndexSEXP); Rcpp::traits::input_parameter< int >::type RefGene(RefGeneSEXP); @@ -114,7 +111,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< NumericVector >::type NotConstrainGene(NotConstrainGeneSEXP); Rcpp::traits::input_parameter< int >::type ConstrainType(ConstrainTypeSEXP); Rcpp::traits::input_parameter< int >::type StochasticRef(StochasticRefSEXP); - rcpp_result_gen = Rcpp::wrap(HiddenBASiCS_MCMCcppNoSpikes(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, s0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, as, bs, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, BatchInfo, BatchIds, BatchOffSet, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType, StochasticRef)); + rcpp_result_gen = Rcpp::wrap(HiddenBASiCS_MCMCcppNoSpikes(N, Thin, Burn, Counts, BatchDesign, mu0, delta0, s0, nu0, theta0, s2mu, adelta, bdelta, s2delta, prior_delta, as, bs, atheta, btheta, ar, LSmu0, LSdelta0, LSnu0, LStheta0, sumByCellAll, sumByGeneAll, StoreAdapt, EndAdapt, PrintProgress, Constrain, Index, RefGene, RefGenes, ConstrainGene, NotConstrainGene, ConstrainType, StochasticRef)); return rcpp_result_gen; END_RCPP } @@ -122,7 +119,7 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_BASiCS_HiddenBASiCS_MCMCcpp", (DL_FUNC) &_BASiCS_HiddenBASiCS_MCMCcpp, 35}, {"_BASiCS_HiddenBASiCS_DenoisedRates", (DL_FUNC) &_BASiCS_HiddenBASiCS_DenoisedRates, 7}, - {"_BASiCS_HiddenBASiCS_MCMCcppNoSpikes", (DL_FUNC) &_BASiCS_HiddenBASiCS_MCMCcppNoSpikes, 40}, + {"_BASiCS_HiddenBASiCS_MCMCcppNoSpikes", (DL_FUNC) &_BASiCS_HiddenBASiCS_MCMCcppNoSpikes, 37}, {NULL, NULL, 0} }; From 16e9d70548b411eb250bed3e67c50451428454a4 Mon Sep 17 00:00:00 2001 From: Nils Eling Date: Sat, 30 Sep 2017 23:27:48 +0100 Subject: [PATCH 7/7] Test commit --- R/BASiCS_TestDE.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/BASiCS_TestDE.R b/R/BASiCS_TestDE.R index b7f7bf39..bffb7e9e 100644 --- a/R/BASiCS_TestDE.R +++ b/R/BASiCS_TestDE.R @@ -157,6 +157,7 @@ BASiCS_TestDE <- function(Chain1, GenesSelect = NULL, ...) { # Checking validity of input arguments + # Test if (!is(Chain1, "BASiCS_Chain")) stop("'Chain1' is not a 'BASiCS_Chain' class object.") if (!is(Chain2, "BASiCS_Chain"))