From d88b9d850fd9271371c3241ad90061634ffbf56b Mon Sep 17 00:00:00 2001 From: Pol Alvarez Date: Fri, 5 Aug 2022 13:31:21 +0200 Subject: [PATCH] Revert "Revert "Revert "[BIOMAGE-2004] Merging in QC""" --- Makefile | 7 +- local-runner/cf-local-container-launcher.yaml | 4 +- pipeline-runner/Dockerfile | 5 +- pipeline-runner/NAMESPACE | 11 +- pipeline-runner/R/gem2s-2-load_user_files.R | 7 +- pipeline-runner/R/gem2s-4-score_doublets.R | 2 +- pipeline-runner/R/gem2s-5-create_seurat.R | 39 +- .../R/gem2s-6-construct_qc_config.R | 62 ++- .../R/gem2s-6-prepare_experiment.R | 69 +-- pipeline-runner/R/gem2s-7-upload_to_aws.R | 101 ++-- pipeline-runner/R/handle_data.R | 68 +-- pipeline-runner/R/qc-1-filter_emptydrops.R | 38 +- pipeline-runner/R/qc-2-filter_low_cellsize.R | 50 +- pipeline-runner/R/qc-3-filter_high_mito.R | 38 +- .../R/qc-4-filter_gene_umi_outlier.R | 48 +- pipeline-runner/R/qc-5-filter_doublets.R | 40 +- pipeline-runner/R/qc-6-integrate_scdata.R | 111 +---- pipeline-runner/R/qc-7-embed_and_cluster.R | 2 +- pipeline-runner/R/qc-helpers.R | 13 +- pipeline-runner/R/sysdata.rda | Bin 2529 -> 2520 bytes pipeline-runner/README.md | 4 +- pipeline-runner/data-raw/sysdata.R | 6 +- pipeline-runner/init.R | 18 +- pipeline-runner/man/add_metadata.Rd | 21 - .../man/build_metadata_cellsets.Rd | 23 - pipeline-runner/man/construct_metadata.Rd | 22 - pipeline-runner/man/construct_qc_config.Rd | 21 - pipeline-runner/man/create_scdata.Rd | 19 - pipeline-runner/man/filter_doublets.Rd | 2 +- .../man/filter_gene_umi_outlier.Rd | 4 +- pipeline-runner/man/filter_high_mito.Rd | 2 +- pipeline-runner/man/filter_low_cellsize.Rd | 10 +- .../man/filter_unnamed_features.Rd | 3 - .../man/generate_first_step_ids.Rd | 4 +- pipeline-runner/man/get_gem2s_file_v1.Rd | 20 + ...get_gem2s_file.Rd => get_gem2s_file_v2.Rd} | 6 +- pipeline-runner/man/ids_to_sym.Rd | 3 - pipeline-runner/man/merge_scdata_list.Rd | 18 - pipeline-runner/man/merge_scdatas.Rd | 14 + pipeline-runner/man/sym_to_ids.Rd | 3 - pipeline-runner/start.sh | 3 - .../_snaps/gem2s-6-prepare_experiment.md | 6 +- .../testthat/_snaps/gem2s-7-upload_to_aws.md | 449 +----------------- pipeline-runner/tests/testthat/qc_mock.R | 1 - .../test-gem2s-6-construct_qc_config.R | 4 +- .../test-gem2s-6-prepare_experiment.R | 195 +++----- .../testthat/test-gem2s-7-upload_to_aws.R | 232 +++++---- .../testthat/test-qc-1-filter_emptydrops.R | 115 ++--- .../testthat/test-qc-2-filter_low_cellsize.R | 109 ++--- .../testthat/test-qc-3-filter_high_mito.R | 84 ++-- .../test-qc-4-filter_gene_umi_outlier.R | 237 ++++----- .../testthat/test-qc-5-filter_doublets.R | 68 ++- .../testthat/test-qc-6-integrate_scdata.R | 222 +++------ 53 files changed, 890 insertions(+), 1773 deletions(-) delete mode 100644 pipeline-runner/man/add_metadata.Rd delete mode 100644 pipeline-runner/man/build_metadata_cellsets.Rd delete mode 100644 pipeline-runner/man/construct_metadata.Rd delete mode 100644 pipeline-runner/man/construct_qc_config.Rd delete mode 100644 pipeline-runner/man/create_scdata.Rd create mode 100644 pipeline-runner/man/get_gem2s_file_v1.Rd rename pipeline-runner/man/{get_gem2s_file.Rd => get_gem2s_file_v2.Rd} (81%) delete mode 100644 pipeline-runner/man/merge_scdata_list.Rd create mode 100644 pipeline-runner/man/merge_scdatas.Rd delete mode 100755 pipeline-runner/start.sh diff --git a/Makefile b/Makefile index 422b10b8..5c959f6d 100644 --- a/Makefile +++ b/Makefile @@ -16,13 +16,12 @@ endif #-------------------------------------------------- # Targets #-------------------------------------------------- -install: +install: @echo "Installing local runner" @(cd ./local-runner && npm install) - @echo "Installing renv packages" + @echo "Installing R env packages" @(cd ./pipeline-runner && R -e "renv::restore()") - @(cd ./pipeline-runner && R -e "renv::install()") -build: +build: # regenerate sysdata.rda env file @(cd ./pipeline-runner && Rscript data-raw/sysdata.R) @(cd ./local-runner && npm run build) diff --git a/local-runner/cf-local-container-launcher.yaml b/local-runner/cf-local-container-launcher.yaml index 6d46aa36..2ed9a436 100644 --- a/local-runner/cf-local-container-launcher.yaml +++ b/local-runner/cf-local-container-launcher.yaml @@ -30,7 +30,7 @@ Resources: docker_command = ' '.join([ "docker run --rm -t", - f"--name {event['name']}-{random_string(10)}", + f"--name {event['name']}-{random_string(10)}", f"{'-d' if event['detached'] else ''}", f"--env ACTIVITY_ARN={event.get('activityArn', '')}", f"--env HOST_IP=__HOST_IP__", @@ -47,7 +47,7 @@ Resources: Timeout: 25 QualityControlActivity: Type: AWS::StepFunctions::Activity - Properties: + Properties: Name: biomage-qc-activity-development RemovePreviousPipelineContainers: Type: "AWS::Lambda::Function" diff --git a/pipeline-runner/Dockerfile b/pipeline-runner/Dockerfile index a42a177f..30b84246 100644 --- a/pipeline-runner/Dockerfile +++ b/pipeline-runner/Dockerfile @@ -105,8 +105,6 @@ FROM common AS dev # also install watchdog to automatically restart # when source files change RUN pip install -U jedi radian PyYAML watchdog[watchmedo] -RUN apt update && apt -y install git -RUN pip install memory_profiler # add R package files and runner ADD R ./R @@ -114,5 +112,4 @@ ADD tests ./tests COPY DESCRIPTION NAMESPACE init.R ./ # start app -COPY start.sh start.sh -ENTRYPOINT ["./start.sh"] \ No newline at end of file +ENTRYPOINT ["Rscript", "init.R"] diff --git a/pipeline-runner/NAMESPACE b/pipeline-runner/NAMESPACE index e9da0fb3..5519ebf0 100644 --- a/pipeline-runner/NAMESPACE +++ b/pipeline-runner/NAMESPACE @@ -1,9 +1,6 @@ # Generated by roxygen2: do not edit by hand -export(add_metadata) export(build_cc_gene_list) -export(build_metadata_cellsets) -export(create_scdata) export(create_seurat) export(download_user_files) export(filter_doublets) @@ -11,27 +8,21 @@ export(filter_emptydrops) export(filter_gene_umi_outlier) export(filter_high_mito) export(filter_low_cellsize) -export(filter_unnamed_features) export(generate_default_values_cellSizeDistribution) export(generate_default_values_classifier) export(generate_first_step_ids) export(getClusters) export(get_feature_types) export(get_gem2s_file) -export(ids_to_sym) export(list_exclude_genes) export(load_user_files) -export(make_annot_with_ids) -export(merge_scdata_list) -export(normalize_annotation_types) +export(merge_scdatas) export(prepare_experiment) -export(read_10x_annotations) export(remove_genes) export(runClusters) export(run_emptydrops) export(score_doublets) export(subset_ids) export(subset_safe) -export(sym_to_ids) import(data.table) importFrom(magrittr,"%>%") diff --git a/pipeline-runner/R/gem2s-2-load_user_files.R b/pipeline-runner/R/gem2s-2-load_user_files.R index 2a70e913..8d8c85c9 100644 --- a/pipeline-runner/R/gem2s-2-load_user_files.R +++ b/pipeline-runner/R/gem2s-2-load_user_files.R @@ -222,6 +222,7 @@ parse_rhapsody_matrix <- function(config, input_dir) { #' @return list of annotations data.frame #' @export #' +#' @examples read_10x_annotations <- function(annot_fpath, sample) { gene_column <- 1 @@ -408,7 +409,7 @@ make_annot_with_ids <- function(annot_list, feature_types_list) { #' @param sample_annot data.frame of annotations #' @param annot_with_ids data.frame of annotations with IDs. Key data.frame #' -#' @return data.frame of annotations +#' @return #' @export #' sym_to_ids <- function(sample_annot, annot_with_ids) { @@ -436,7 +437,7 @@ sym_to_ids <- function(sample_annot, annot_with_ids) { #' @param sample_annot data.frame of annotations #' @param annot_with_ids data.frame of annotations with IDs. Key data.frame #' -#' @return data.frame of annotations +#' @return #' @export #' ids_to_sym <- function(sample_annot, annot_with_ids) { @@ -464,7 +465,7 @@ ids_to_sym <- function(sample_annot, annot_with_ids) { #' @param annotations list of annotations data.frame, feature types and gene_column #' @param sample character specifying current sample #' -#' @return list of counts and annotations +#' @return #' @export #' filter_unnamed_features <- function(counts, annotations, sample) { diff --git a/pipeline-runner/R/gem2s-4-score_doublets.R b/pipeline-runner/R/gem2s-4-score_doublets.R index b68405f9..436452ea 100644 --- a/pipeline-runner/R/gem2s-4-score_doublets.R +++ b/pipeline-runner/R/gem2s-4-score_doublets.R @@ -55,7 +55,7 @@ score_doublets <- function(input, pipeline_config, prev_out) { #' @return data.frame with doublet scores and assigned classes #' compute_sample_doublet_scores <- function(sample_counts) { - set.seed(RANDOM_SEED) + set.seed(gem2s$random.seed) sce <- scDblFinder::scDblFinder(sample_counts) doublet_res <- data.frame( row.names = colnames(sce), diff --git a/pipeline-runner/R/gem2s-5-create_seurat.R b/pipeline-runner/R/gem2s-5-create_seurat.R index b61fbc22..2fea8475 100644 --- a/pipeline-runner/R/gem2s-5-create_seurat.R +++ b/pipeline-runner/R/gem2s-5-create_seurat.R @@ -15,11 +15,7 @@ create_seurat <- function(input, pipeline_config, prev_out) { check_prev_out(prev_out, check_names) # destructure previous output: config, counts_list, annot, and doublet_scores - config <- prev_out$config - counts_list <- prev_out$counts_list - annot <- prev_out$annot - doublet_scores <- prev_out$doublet_scores - edrops <- prev_out$edrops + list2env(prev_out, envir = environment()) samples <- names(counts_list) scdata_list <- list() @@ -68,33 +64,26 @@ construct_scdata <- function(counts, doublet_score, edrops_out, sample, annot, c } -#' Construct metadata for each SeuratObject -#' -#' This function creates a `data.frame` with the barcodes, sampleIDs and user supplied -#' metadata that corresponds to each one. -#' -#' @param counts count matrix -#' @param sample character sample ID -#' @param config list containing experiment config -#' -#' @return data.frame of sample metadata -#' + +# NOTE: any changes here must be reflected in meta_sets + +# construct metadata for each SeuratObject construct_metadata <- function(counts, sample, config) { - message("Constructing metadata data.frame...") - metadata_df <- data.frame(row.names = colnames(counts), samples = rep(sample, ncol(counts))) + message("Constructing metadata df...") + metadata <- data.frame(row.names = colnames(counts), samples = rep(sample, ncol(counts))) # Add "metadata" if exists in config - user_metadata <- config$metadata - if (!is.null(user_metadata)) { - user_metadata <- lapply(user_metadata, unlist) - user_metadata <- data.frame(user_metadata, row.names = config$samples, check.names = FALSE) - metadata_df[names(user_metadata)] <- user_metadata[sample, ] + rest <- config$metadata + if (!is.null(rest)) { + rest <- lapply(rest, unlist) + rest <- data.frame(rest, row.names = config$samples, check.names = FALSE) + metadata[names(rest)] <- rest[sample, ] } # make syntactically valid column names - colnames(metadata_df) <- make.names(colnames(metadata_df), unique = TRUE) + colnames(metadata) <- make.names(colnames(metadata), unique = TRUE) - return(metadata_df) + return(metadata) } # add mitochondrial percent to SeuratObject diff --git a/pipeline-runner/R/gem2s-6-construct_qc_config.R b/pipeline-runner/R/gem2s-6-construct_qc_config.R index 34f7c2e1..7060767a 100644 --- a/pipeline-runner/R/gem2s-6-construct_qc_config.R +++ b/pipeline-runner/R/gem2s-6-construct_qc_config.R @@ -1,16 +1,6 @@ -#' Constructs default QC configuration -#' -#' This function returns the default parameters used during QC as a nested list. -#' It is sent to the API, which in turn saves it as a jsonb object in the PostgreSQL -#' database. -#' -#' @param scdata_list list of seurat objects -#' @param any_filtered boolean indicating if barcodes were filtered by the emptyDrops. -#' -#' @return list of QC configuration parameters -#' -construct_qc_config <- function(scdata_list, any_filtered) { - samples <- names(scdata_list) +# constructs default QC configuration for merged SeuratObject +construct_qc_config <- function(scdata, any_filtered) { + samples <- scdata$samples # classifier config.classifier <- list( @@ -36,7 +26,7 @@ construct_qc_config <- function(scdata_list, any_filtered) { filterSettings = list(minCellSize = 1080, binStep = 200) ) - config.cellSizeDistribution <- add_custom_config_per_sample(get_cellsize_config, config.cellSizeDistribution, scdata_list) + config.cellSizeDistribution <- add_custom_config_per_sample(get_cellsize_config, config.cellSizeDistribution, scdata) # mito config.mitochondrialContent <- list( @@ -53,7 +43,7 @@ construct_qc_config <- function(scdata_list, any_filtered) { ) ) - config.mitochondrialContent <- add_custom_config_per_sample(get_sample_mitochondrial_config, config.mitochondrialContent, scdata_list) + config.mitochondrialContent <- add_custom_config_per_sample(get_sample_mitochondrial_config, config.mitochondrialContent, scdata) # ngenes vs umis config.numGenesVsNumUmis <- list( @@ -68,7 +58,7 @@ construct_qc_config <- function(scdata_list, any_filtered) { ) ) - config.numGenesVsNumUmis <- add_custom_config_per_sample(get_gene_umi_config, config.numGenesVsNumUmis, scdata_list) + config.numGenesVsNumUmis <- add_custom_config_per_sample(get_gene_umi_config, config.numGenesVsNumUmis, scdata) # doublet scores @@ -81,7 +71,7 @@ construct_qc_config <- function(scdata_list, any_filtered) { ) ) - config.doubletScores <- add_custom_config_per_sample(get_dblscore_config, config.doubletScores, scdata_list) + config.doubletScores <- add_custom_config_per_sample(get_dblscore_config, config.doubletScores, scdata) # data integration config.dataIntegration <- list( @@ -112,8 +102,8 @@ construct_qc_config <- function(scdata_list, any_filtered) { distanceMetric = "cosine" ), tsne = list( - perplexity = min(30, ncol(scdata_list) / 100), - learningRate = max(200, ncol(scdata_list) / 12) + perplexity = min(30, ncol(scdata) / 100), + learningRate = max(200, ncol(scdata) / 12) ) ) ), @@ -138,13 +128,13 @@ construct_qc_config <- function(scdata_list, any_filtered) { } -get_cellsize_config <- function(scdata_list, config) { - minCellSize <- generate_default_values_cellSizeDistribution(scdata_list, config) +get_cellsize_config <- function(scdata, config) { + minCellSize <- generate_default_values_cellSizeDistribution(scdata, config) config$filterSettings$minCellSize <- minCellSize return(config) } -get_sample_mitochondrial_config <- function(scdata_list.sample, config) { +get_sample_mitochondrial_config <- function(scdata.sample, config) { config.sample <- list( auto = TRUE, @@ -155,32 +145,32 @@ get_sample_mitochondrial_config <- function(scdata_list.sample, config) { ) config.sample$filterSettings$methodSettings$absoluteThreshold <- list( - maxFraction = generate_default_values_mitochondrialContent(scdata_list.sample, config.sample), + maxFraction = generate_default_values_mitochondrialContent(scdata.sample, config.sample), binStep = 0.3 ) return(config.sample) } - # threshold for doublet score is the max score given to a singlet (above score => doublets) -get_dblscore_config <- function(scdata_list, config) { - probabilityThreshold <- max(scdata_list$doublet_scores[scdata_list$doublet_class == "singlet"], na.rm = TRUE) +get_dblscore_config <- function(scdata, config) { + probabilityThreshold <- max(scdata$doublet_scores[scdata$doublet_class == "singlet"], na.rm = TRUE) config$filterSettings$probabilityThreshold <- probabilityThreshold return(config) } -get_gene_umi_config <- function(scdata_list, config) { +get_gene_umi_config <- function(scdata, config) { # Sensible values are based on the function "gene.vs.molecule.cell.filter" from the pagoda2 package - p.level <- min(0.001, 1 / ncol(scdata_list)) + p.level <- min(0.001, 1 / ncol(scdata)) config$filterSettings$regressionTypeSettings[[config$filterSettings$regressionType]]$p.level <- p.level return(config) } + duplicate_config_per_sample <- function(step_config, config, samples) { for (sample in unique(samples)) { config[[sample]] <- step_config @@ -190,23 +180,25 @@ duplicate_config_per_sample <- function(step_config, config, samples) { return(config) } +add_custom_config_per_sample <- function(generate_sample_config, config, scdata) { -add_custom_config_per_sample <- function(generate_sample_config, config, scdata_list) { # We update the config file, so to be able to access the raw config we create a copy - raw_config <- config + config.raw <- config - for (sample in names(scdata_list)) { + samples <- scdata$samples + + for (sample in unique(samples)) { # subset the Seurat object to a single sample - sample_data <- scdata_list[[sample]] + scdata.sample <- scdata[, samples %in% sample] # run the function to generate config for a sample - sample_config <- generate_sample_config(sample_data, raw_config) + config.sample <- generate_sample_config(scdata.sample, config.raw) # update sample config thresholds - config[[sample]] <- sample_config + config[[sample]] <- config.sample # add auto settings - config[[sample]]$defaultFilterSettings <- sample_config$filterSettings + config[[sample]]$defaultFilterSettings <- config.sample$filterSettings } return(config) diff --git a/pipeline-runner/R/gem2s-6-prepare_experiment.R b/pipeline-runner/R/gem2s-6-prepare_experiment.R index d49c780c..c81e7a71 100644 --- a/pipeline-runner/R/gem2s-6-prepare_experiment.R +++ b/pipeline-runner/R/gem2s-6-prepare_experiment.R @@ -14,57 +14,70 @@ #' prepare_experiment <- function(input, pipeline_config, prev_out) { message("Preparing experiment ...") - check_names <- c("config", "counts_list", "annot", "doublet_scores", "scdata_list") check_prev_out(prev_out, check_names) scdata_list <- prev_out$scdata_list samples <- names(scdata_list) - message("Total cells:", sum(sapply(scdata_list, ncol))) + message("Merging Seurat Objects...") + scdata <- merge_scdatas(scdata_list) + + #If subsetting all cells, Seurat will not reorder the cells in the object. We need to subset to [-1] and [1] and merge to shuffle. + set.seed(gem2s$random.seed) + shuffle_mask <- sample(colnames(scdata)) + scdata <- merge(scdata[,shuffle_mask[1]],scdata[,shuffle_mask[-1]]) - scdata_list <- add_metadata_to_samples(scdata_list, prev_out$annot, input$experimentId) - prev_out$scdata_list <- scdata_list + scdata <- add_metadata(scdata, prev_out$annot, input$experimentId) + prev_out$scdata <- scdata # construct default QC config and update prev out message("Constructing default QC configuration...") any_filtered <- !(length(prev_out$edrops) == length(samples)) - prev_out$qc_config <- construct_qc_config(scdata_list, any_filtered) + prev_out$qc_config <- construct_qc_config(scdata, any_filtered) res <- list( data = list(), output = prev_out ) - message("\nPreparation for AWS upload step complete.") + message("\nPreperation for AWS upload step complete.") return(res) } -add_metadata_to_samples <- function(scdata_list, annot, experiment_id) { - # cell ids will be generated at random in order to shuffle samples. it is done - # here because merging samples in QC means that shuffling the cells will not - # result in a shuffled cell_ids - set.seed(RANDOM_SEED) - total_cells <- sum(sapply(scdata_list, ncol)) - cell_ids <- 0:total_cells-1 +#' Merge scdatas: merge rds files in input list +#' +#' @param scdata_list +#' +#' @return +#' @export +#' +#' @examples +merge_scdatas <- function(scdata_list) { + if (length(scdata_list) == 1) { + scdata <- scdata_list[[1]] + } else { + scdata <- merge(scdata_list[[1]], y = scdata_list[-1]) + } + + return(scdata) +} - for (sample in names(scdata_list)) { - sample_size <- ncol(scdata_list[[sample]]) +add_metadata <- function(scdata, annot, experiment_id) { - # select only the annotations of the current sample - sample_annotations_idx <- match(rownames(scdata_list[[sample]]), annot$input) - sample_annot <- annot[sample_annotations_idx, ] - scdata_list[[sample]]@misc[["gene_annotations"]] <- sample_annot + # Ensure index by rownames in scdata + annot <- annot[match(rownames(scdata), annot$input), ] + scdata@misc[["gene_annotations"]] <- annot - # add the experiment ID so it's available later - scdata_list[[sample]]@misc[["experimentId"]] <- experiment_id + message("Storing cells id...") + # Keeping old version of ids starting from 0 + scdata$cells_id <- 0:(ncol(scdata) - 1) - # sample cell ids to shuffle them - idxs <- sample(seq_along(cell_ids), sample_size) - scdata_list[[sample]]$cells_id <- cell_ids[idxs] - # remove the selected cell ids for next samples - cell_ids <- cell_ids[-idxs] - } + message("Storing color pool...") + # We store the color pool in a slot in order to be able to access it during configureEmbedding + scdata@misc[["color_pool"]] <- get_color_pool() + scdata@misc[["experimentId"]] <- experiment_id + scdata@misc[["ingestionDate"]] <- Sys.time() - return(scdata_list) + return(scdata) } diff --git a/pipeline-runner/R/gem2s-7-upload_to_aws.R b/pipeline-runner/R/gem2s-7-upload_to_aws.R index e5420b76..e0f389f0 100644 --- a/pipeline-runner/R/gem2s-7-upload_to_aws.R +++ b/pipeline-runner/R/gem2s-7-upload_to_aws.R @@ -1,19 +1,19 @@ upload_to_aws <- function(input, pipeline_config, prev_out) { message("Uploading to AWS ...") - check_names <- c("config", "counts_list", "annot", "doublet_scores", "scdata_list", "qc_config") + check_names <- c("config", "counts_list", "annot", "doublet_scores", "scdata_list", "scdata", "qc_config") check_prev_out(prev_out, check_names) experiment_id <- input$experimentId project_id <- input$projectId # destructure what need from prev_out - scdata_list <- prev_out$scdata_list + scdata <- prev_out$scdata config <- prev_out$config - qc_config <- prev_out$qc_config + config_dataProcessing <- prev_out$qc_config message("Constructing cell sets ...") - cell_sets <- get_cell_sets(scdata_list, input) + cell_sets <- get_cell_sets(scdata, input) # cell sets file to s3 cell_sets_data <- RJSONIO::toJSON(cell_sets) @@ -24,20 +24,17 @@ upload_to_aws <- function(input, pipeline_config, prev_out) { key = experiment_id ) - for (sample in names(scdata_list)) { - message("Uploading sample ", sample, " object to S3 ...") - fpath <- file.path(tempdir(), "experiment.rds") - saveRDS(scdata_list[[sample]], fpath, compress = FALSE) - # can only upload up to 50Gb because multipart uploads work by uploading - # smaller chunks (parts) and the max amount of parts is 10,000. - put_object_in_s3_multipart(pipeline_config, - bucket = pipeline_config$source_bucket, - object = fpath, - key = file.path(experiment_id, sample, "r.rds") - ) - } + # seurat object to s3 + message("Uploading Seurat Object to S3 ...") + fpath <- file.path(tempdir(), "experiment.rds") + saveRDS(scdata, fpath, compress = FALSE) - message("Samples uploaded") + # can only upload up to 50Gb because part numbers can be any number from 1 to 10,000, inclusive. + put_object_in_s3_multipart(pipeline_config, + bucket = pipeline_config$source_bucket, + object = fpath, + key = file.path(experiment_id, "r.rds") + ) cluster_env <- pipeline_config$cluster_env @@ -49,7 +46,7 @@ upload_to_aws <- function(input, pipeline_config, prev_out) { organism = config$organism, type = config$input$type ), - processingConfig = qc_config + processingConfig = config_dataProcessing ) res <- list( @@ -61,12 +58,11 @@ upload_to_aws <- function(input, pipeline_config, prev_out) { ) message("\nUpload to AWS step complete.") - return(res) } # creates initial cell sets object -get_cell_sets <- function(scdata_list, input) { +get_cell_sets <- function(scdata, input) { scratchpad <- list( key = "scratchpad", name = "Custom cell sets", @@ -76,24 +72,24 @@ get_cell_sets <- function(scdata_list, input) { ) color_pool <- get_color_pool() - sample_cellsets <- build_sample_cellsets(input, scdata_list, color_pool) + samples_set <- samples_sets(input, scdata, color_pool) # remove used colors from pool - used <- seq_along(sample_cellsets$children) + used <- seq_along(samples_set$children) color_pool <- color_pool[-used] # Design cell_set meta_data for DynamoDB - cell_sets <- c(list(scratchpad), list(sample_cellsets)) + cell_sets <- c(list(scratchpad), list(samples_set)) if ("metadata" %in% names(input)) { - cell_sets <- c(cell_sets, build_metadata_cellsets(input, scdata_list, color_pool)) + cell_sets <- c(cell_sets, meta_sets(input, scdata, color_pool)) } cell_sets <- list(cellSets = cell_sets) } # cell_sets fn for seurat samples information -build_sample_cellsets <- function(input, scdata_list, color_pool) { +samples_sets <- function(input, scdata, color_pool) { cell_set <- list( key = "sample", name = "Samples", @@ -102,15 +98,14 @@ build_sample_cellsets <- function(input, scdata_list, color_pool) { type = "metadataCategorical" ) + cells_sample <- scdata$samples sample_ids <- unlist(input$sampleIds) sample_names <- unlist(input$sampleNames) for (i in seq_along(sample_ids)) { - scdata <- scdata_list[[i]] sample_id <- sample_ids[i] sample_name <- sample_names[i] - - cell_ids <- scdata$cells_id + cell_ids <- scdata$cells_id[cells_sample == sample_id] cell_set$children[[i]] <- list( key = sample_id, @@ -124,60 +119,40 @@ build_sample_cellsets <- function(input, scdata_list, color_pool) { } - -#' Create cellsets from user-supplied metadata -#' -#' This function creates the cellsets for the user-supplied metadata (in data -#' management module). It also takes care of assigning a color from the color pool -#' to each cellset. -#' -#' @param input list -#' @param scdata_list list of Seurat objects -#' @param color_pool list of colors to use -#' -#' @return list of cellsets -#' @export -#' -build_metadata_cellsets <- function(input, scdata_list, color_pool) { +# cell_sets fn for seurat metadata information +meta_sets <- function(input, scdata, color_pool) { cell_set_list <- c() - user_metadata <- lapply(input$metadata, unlist) + meta <- lapply(input$metadata, unlist) # user-supplied metadata track names - metadata_names <- names(user_metadata) + keys <- names(meta) - # user supplied metadata names must be made syntactically valid, as seurat does. - # We use them to access the cell ids stored in the seurat object, to create the - # cellsets. The same names as used in construct_metadata including internal - # 'samples' column which is dropped after making the names. - valid_metadata_names <- make.names(c("samples", metadata_names), unique = TRUE)[-1] + # syntactically valid metadata names as stored in scdata + # same names as used in construct_metadata including internal 'samples' column (dropped) + seurat_keys <- make.names(c("samples", keys), unique = TRUE)[-1] color_index <- 1 - for (i in seq_along(metadata_names)) { - user_metadata_name <- metadata_names[i] - valid_metadata_name <- valid_metadata_names[i] + for (i in seq_along(keys)) { + key <- keys[i] + seurat_key <- seurat_keys[i] cell_set <- list( - "key" = user_metadata_name, - "name" = user_metadata_name, + "key" = key, + "name" = key, "rootNode" = TRUE, "children" = c(), "type" = "metadataCategorical" ) # values of current metadata track - values <- unique(user_metadata[[i]]) + values <- unique(meta[[i]]) for (j in seq_along(values)) { value <- values[j] - - cell_ids <- list() - for (scdata in scdata_list) { - cells_in_value <- scdata[[valid_metadata_name]] == value - cell_ids <- append(cell_ids, scdata$cells_id[cells_in_value]) - } + cell_ids <- scdata$cells_id[scdata[[seurat_key]] == value] cell_set$children[[j]] <- list( - "key" = paste(user_metadata_name, value, sep = "-"), + "key" = paste(key, value, sep = "-"), "name" = value, "color" = color_pool[color_index], "cellIds" = unname(cell_ids) diff --git a/pipeline-runner/R/handle_data.R b/pipeline-runner/R/handle_data.R index fd48f93d..1f9ad076 100644 --- a/pipeline-runner/R/handle_data.R +++ b/pipeline-runner/R/handle_data.R @@ -30,8 +30,17 @@ upload_cells_id <- function(pipeline_config, object_key, cells_id) { return(object_key) } -reload_scdata_from_s3 <- function (s3, pipeline_config, experiment_id) { - bucket <- pipeline_config$processed_bucket + +reload_scdata_from_s3 <- function(pipeline_config, experiment_id, task_name, tasks) { + # If the task is after data integration, we need to get scdata from processed_matrix + task_names <- names(tasks) + integration_index <- match("dataIntegration", task_names) + if (match(task_name, task_names) > integration_index) { + bucket <- pipeline_config$processed_bucket + } else { + bucket <- pipeline_config$source_bucket + } + s3 <- paws::s3(config = pipeline_config$aws_config) message(bucket) message(paste(experiment_id, "r.rds", sep = "/")) @@ -42,54 +51,10 @@ reload_scdata_from_s3 <- function (s3, pipeline_config, experiment_id) { obj <- readRDS(rawConnection(body)) return(obj) } -reload_scdata_list_from_s3 <- function (s3, pipeline_config, experiment_id) { - bucket <- pipeline_config$source_bucket - objects <- s3$list_objects( - Bucket = bucket, - Prefix = experiment_id - ) - samples <- objects$Contents - - scdata_list <- list() - for (sample in samples) { - key <- sample$Key - - c(body, ...rest) %<-% s3$get_object( - Bucket = bucket, - Key = paste(key, sep = "/") - ) - obj <- readRDS(rawConnection(body)) - sample_id <- strsplit(key, "/")[[1]][[2]] - scdata_list[[sample_id]] <- obj - } - - return(scdata_list) -} - -# reload_data_from_s3 will reload: -# * scdata_list for all steps before integration (included) -# * scdata file for all steps after data integration -reload_data_from_s3 <- function(pipeline_config, experiment_id, task_name, tasks) { - task_names <- names(tasks) - integration_index <- match("dataIntegration", task_names) - s3 <- paws::s3(config = pipeline_config$aws_config) - - # If the task is after data integration, we need to get scdata from processed_matrix - if (match(task_name, task_names) > integration_index) { - return(reload_scdata_from_s3(s3, pipeline_config, experiment_id)) - } - - # Otherwise, return scdata_list - return(reload_scdata_list_from_s3(s3, pipeline_config, experiment_id)) - -} load_cells_id_from_s3 <- function(pipeline_config, experiment_id, task_name, tasks, samples) { s3 <- paws::s3(config = pipeline_config$aws_config) - object_list <- s3$list_objects( - Bucket = pipeline_config$cells_id_bucket, - Prefix = paste0(experiment_id, "/", task_name, "/") - ) + object_list <- s3$list_objects(pipeline_config$cells_id_bucket, Prefix = paste0(experiment_id, "/", task_name, "/")) message(pipeline_config$cells_id_bucket) message(paste(experiment_id, "r.rds", sep = "/")) cells_id <- list() @@ -97,8 +62,6 @@ load_cells_id_from_s3 <- function(pipeline_config, experiment_id, task_name, tas task_names <- names(tasks) integration_index <- match("dataIntegration", task_names) - # after data integration the cell ids are no longer used because there is no filtering - # so they are not loaded if (match(task_name, task_names) <= integration_index) { for (object in object_list$Contents) { key <- object$Key @@ -155,7 +118,6 @@ send_output_to_api <- function(pipeline_config, input, plot_data_keys, output) { message("Sending to SNS topic ", pipeline_config$sns_topic) sns <- paws::sns(config = pipeline_config$aws_config) - message("Building the message") msg <- list( experimentId = input$experimentId, taskName = input$taskName, @@ -169,7 +131,6 @@ send_output_to_api <- function(pipeline_config, input, plot_data_keys, output) { ) ) - message("Publishing the message") result <- sns$publish( Message = RJSONIO::toJSON(msg), TopicArn = pipeline_config$sns_topic, @@ -181,7 +142,6 @@ send_output_to_api <- function(pipeline_config, input, plot_data_keys, output) { ) ) ) - message("Done publishing") return(result$MessageId) } @@ -212,7 +172,7 @@ send_pipeline_fail_update <- function(pipeline_config, input, error_message) { error_msg <- list() - # TODO - REMOVE THE DUPLICATE EXPERIMENT ID FROM INPUT RESPONSE + # TODO - REMOVE THE DUPLICATE EXPERIMETN ID FROM INPUT RESPONSE error_msg$experimentId <- input$experimentId error_msg$taskName <- input$taskName @@ -327,6 +287,7 @@ put_object_in_s3_multipart <- function(pipeline_config, bucket, object, key) { message(sprintf("Putting %s in %s from object %s", key, bucket, object)) s3 <- paws::s3(config = pipeline_config$aws_config) + multipart <- s3$create_multipart_upload( Bucket = bucket, Key = key @@ -341,7 +302,6 @@ put_object_in_s3_multipart <- function(pipeline_config, bucket, object, key) { ) } }) - message("Uploading multiparts") resp <- try({ parts <- upload_multipart_parts(s3, bucket, object, key, multipart$UploadId) s3$complete_multipart_upload( diff --git a/pipeline-runner/R/qc-1-filter_emptydrops.R b/pipeline-runner/R/qc-1-filter_emptydrops.R index e808e33f..80ef2fd8 100644 --- a/pipeline-runner/R/qc-1-filter_emptydrops.R +++ b/pipeline-runner/R/qc-1-filter_emptydrops.R @@ -16,42 +16,38 @@ #' @export #' @return a list with the filtered seurat object by mitochondrial content, the config and the plot values #' -filter_emptydrops <- function(scdata_list, config, sample_id, cells_id, task_name = "classifier", num_cells_to_downsample = 6000) { - sample_cell_ids <- cells_id[[sample_id]] - message("Number of cells IDs: ", length(sample_cell_ids)) - message("Number of cells: ", ncol(scdata_list[[sample_id]])) +filter_emptydrops <- function(scdata, config, sample_id, cells_id, task_name = "classifier", num_cells_to_downsample = 6000) { + cells_id.sample <- cells_id[[sample_id]] - if (length(sample_cell_ids) == 0) { - return(list(data = scdata_list[[sample_id]], new_ids = cells_id, config = config, plotData = list())) + if (length(cells_id.sample) == 0) { + return(list(data = scdata, new_ids = cells_id, config = config, plotData = list())) } - # TODO this is probably not needed because it's the first filter and it has not been yet applied - # so the cells_id are the ones generate in gem2s - sample_data <- subset_ids(scdata_list[[sample_id]], sample_cell_ids) + scdata.sample <- subset_ids(scdata, cells_id.sample) FDR <- config$filterSettings$FDR if (isTRUE(config$auto)) { - FDR <- generate_default_values_classifier(sample_data, config) + FDR <- generate_default_values_classifier(scdata.sample, config) } # for plots and filter stats to be populated guidata <- list() # check if filter data is actually available - if (is.null(scdata_list[[sample_id]]@meta.data$emptyDrops_FDR)) { + if (is.null(scdata@meta.data$emptyDrops_FDR)) { message("Classify is enabled but has no classify data available: will dissable it: no filtering!") config$enabled <- FALSE } if (config$enabled) { message("Classify is enabled: filtering with FDR=", FDR) - meta <- sample_data@meta.data + meta <- scdata.sample@meta.data message("Info: empty-drops table of FDR threshold categories (# UMIs for a given threshold interval)") print(table(meta$samples, cut(meta$emptyDrops_FDR, breaks = c(-Inf, 0, 0.0001, 0.01, 0.1, 0.5, 1, Inf)), useNA = "ifany")) # prevents filtering of NA FDRs if FDR=1 - ed_fdr <- sample_data$emptyDrops_FDR + ed_fdr <- scdata.sample$emptyDrops_FDR ed_fdr[is.na(ed_fdr)] <- 1 message( @@ -59,20 +55,20 @@ filter_emptydrops <- function(scdata_list, config, sample_id, cells_id, task_nam sum(ed_fdr > FDR, na.rm = TRUE), "/", length(ed_fdr) ) - numis <- log10(sample_data@meta.data$nCount_RNA) + numis <- log10(scdata.sample@meta.data$nCount_RNA) fdr_data <- unname(purrr::map2(ed_fdr, numis, function(x, y) { c("FDR" = x, "log_u" = y) })) - fdr_data <- fdr_data[get_positions_to_keep(sample_data, num_cells_to_downsample)] + fdr_data <- fdr_data[get_positions_to_keep(scdata.sample, num_cells_to_downsample)] - remaining_ids <- sample_data@meta.data$cells_id[ed_fdr <= FDR] + remaining_ids <- scdata.sample@meta.data$cells_id[ed_fdr <= FDR] # update config config$filterSettings$FDR <- FDR # Downsample plotData - knee_data <- get_bcranks_plot_data(sample_data, is.cellsize = FALSE)[["knee"]] + knee_data <- get_bcranks_plot_data(scdata.sample, is.cellsize = FALSE)[["knee"]] # Populate guidata list guidata[[generate_gui_uuid(sample_id, task_name, 0)]] <- fdr_data @@ -83,13 +79,13 @@ filter_emptydrops <- function(scdata_list, config, sample_id, cells_id, task_nam guidata[[generate_gui_uuid(sample_id, task_name, 0)]] <- list() guidata[[generate_gui_uuid(sample_id, task_name, 1)]] <- list() guidata[[generate_gui_uuid(sample_id, task_name, 2)]] <- list() - remaining_ids <- sample_cell_ids + remaining_ids <- cells_id.sample } # get filter stats after filtering filter_stats <- list( - before = calc_filter_stats(sample_data), - after = calc_filter_stats(subset_ids(sample_data, remaining_ids)) + before = calc_filter_stats(scdata.sample), + after = calc_filter_stats(subset_ids(scdata.sample, remaining_ids)) ) guidata[[generate_gui_uuid(sample_id, task_name, 2)]] <- filter_stats @@ -97,7 +93,7 @@ filter_emptydrops <- function(scdata_list, config, sample_id, cells_id, task_nam cells_id[[sample_id]] <- remaining_ids result <- list( - data = scdata_list, + data = scdata, new_ids = cells_id, config = config, plotData = guidata diff --git a/pipeline-runner/R/qc-2-filter_low_cellsize.R b/pipeline-runner/R/qc-2-filter_low_cellsize.R index 4b3148c8..84a5b7b1 100644 --- a/pipeline-runner/R/qc-2-filter_low_cellsize.R +++ b/pipeline-runner/R/qc-2-filter_low_cellsize.R @@ -2,11 +2,9 @@ #' #' cell_size_distribution_filter function #' -#' This is a simplest filter that looks at the shape of the cell size (# of UMIs -#' per cell) distribution and looks for some local minima, minimum of second derivative, etc. -#' To separate the large cell barcodes that correspond to real cells from the -#' tail containing 'empty droplets'. This can be a useful first guess. The settings -#'for such a filter can also contain a simple "min cell size" setting. +#' This is a simplest filter that looks at the shape of the cell size (# of UMIs per cell) distribution and looks for some local minima, minimum of second derivative, etc. +#' To separate the large cell barcodes that correspond to real cells from the tail containing 'empty droplets'. +#' This can be a useful first guess. The settings for such a filter can also contain a simple "min cell size" setting. #' #' @param config list containing the following information #' - enable: true/false. Refering to apply or not the filter. @@ -18,43 +16,43 @@ #' @export #' @return a list with the filtered seurat object by cell size ditribution, the config and the plot values #' -filter_low_cellsize <- function(scdata_list, config, sample_id, cells_id, task_name = "cellSizeDistribution", num_cells_to_downsample = 6000) { - sample_cell_ids <- cells_id[[sample_id]] +filter_low_cellsize <- function(scdata, config, sample_id, cells_id, task_name = "cellSizeDistribution", num_cells_to_downsample = 6000) { + cells_id.sample <- cells_id[[sample_id]] - if (length(sample_cell_ids) == 0) { - return(list(data = scdata_list[[sample_id]], new_ids = cells_id, config = config, plotData = list())) + if (length(cells_id.sample) == 0) { + return(list(data = scdata, new_ids = cells_id, config = config, plotData = list())) } - sample_data <- subset_ids(scdata_list[[sample_id]], sample_cell_ids) + scdata.sample <- subset_ids(scdata, cells_id.sample) minCellSize <- as.numeric(config$filterSettings$minCellSize) # extract plotting data of original data to return to plot slot later - plot_data <- get_bcranks_plot_data(sample_data, num_cells_to_downsample) + plot_data <- get_bcranks_plot_data(scdata.sample, num_cells_to_downsample) # Check if it is required to compute sensible values. From the function 'generate_default_values_cellSizeDistribution', it is expected # to get a list with two elements {minCellSize and binStep} if (exists("auto", where = config)) { if (as.logical(toupper(config$auto))) { - # config not really needed for this one (maybe later for threshold_low/high): - # HARDCODE Value. threshold_low + # config not really needed for this one (maybe later for threshold.low/high): + # HARDCODE Value. threshold.low # [ Parameter for function CalculateBarcodeInflections. Description: Ignore barcodes of rank below this threshold in inflection calculation] - threshold_low <- 1e2 - # If there are less cells than the value threshold_low, the function CalculateBarcodeInflections fails. So we need to handle by not removing any cells, that is, + threshold.low <- 1e2 + # If there are less cells than the value threshold.low, the function CalculateBarcodeInflections fails. So we need to handle by not removing any cells, that is, # consider the minCellSize as the minimun UMIs in the dataset. # This should be handled in a long-term by adding a different function for computing the default value. - if (ncol(sample_data) < threshold_low) { - minCellSize <- min(sample_data$nCount_RNA) + if (ncol(scdata.sample) < threshold.low) { + minCellSize <- min(scdata.sample$nCount_RNA) } else { - minCellSize <- generate_default_values_cellSizeDistribution(sample_data, config) + minCellSize <- generate_default_values_cellSizeDistribution(scdata.sample, config) } } } if (as.logical(toupper(config$enabled))) { - remaining_ids <- sample_data@meta.data$cells_id[sample_data$nCount_RNA >= minCellSize] + remaining_ids <- scdata.sample@meta.data$cells_id[scdata.sample$nCount_RNA >= minCellSize] } else { - remaining_ids <- sample_cell_ids + remaining_ids <- cells_id.sample } # update config @@ -65,15 +63,15 @@ filter_low_cellsize <- function(scdata_list, config, sample_id, cells_id, task_n guidata[[generate_gui_uuid(sample_id, task_name, 1)]] <- plot_data[["hist"]] # Populate with filter statistics filter_stats <- list( - before = calc_filter_stats(sample_data), - after = calc_filter_stats(subset_ids(sample_data, remaining_ids)) + before = calc_filter_stats(scdata.sample), + after = calc_filter_stats(subset_ids(scdata.sample, remaining_ids)) ) guidata[[generate_gui_uuid(sample_id, task_name, 3)]] <- filter_stats cells_id[[sample_id]] <- remaining_ids result <- list( - data = scdata_list, + data = scdata, new_ids = cells_id, config = config, plotData = guidata @@ -96,7 +94,7 @@ filter_low_cellsize <- function(scdata_list, config, sample_id, cells_id, task_n #' @importFrom magrittr %>% #' get_bcranks_plot_data <- function(sample_subset, nmax = 6000, is.cellsize = TRUE) { - set.seed(RANDOM_SEED) + set.seed(gem2s$random.seed) plot_data <- list() numis <- unname(sample_subset$nCount_RNA) ord <- order(numis, decreasing = TRUE) @@ -188,12 +186,12 @@ plot_knee_regions <- function(dt, thresh = 0.01) { # samples. generate_default_values_cellSizeDistribution <- function(scdata, config) { # `CalculateBarcodeInflections` including inflection point calculation - threshold_low <- if (ncol(scdata) <= 200) NULL else 100 + threshold.low <- if (ncol(scdata) <= 200) NULL else 100 scdata_tmp <- Seurat::CalculateBarcodeInflections( object = scdata, barcode.column = "nCount_RNA", group.column = "samples", - threshold.low = threshold_low + threshold.low = threshold.low ) # returned is both the rank(s) as well as inflection point # extracting only inflection point(s) diff --git a/pipeline-runner/R/qc-3-filter_high_mito.R b/pipeline-runner/R/qc-3-filter_high_mito.R index 0eae72d3..12413d9e 100644 --- a/pipeline-runner/R/qc-3-filter_high_mito.R +++ b/pipeline-runner/R/qc-3-filter_high_mito.R @@ -19,45 +19,45 @@ #' @export #' @return a list with the filtered seurat object by mitochondrial content, the config and the plot values #' -filter_high_mito <- function(scdata_list, config, sample_id, cells_id, task_name = "mitochondrialContent", num_cells_to_downsample = 6000) { - sample_cell_ids <- cells_id[[sample_id]] +filter_high_mito <- function(scdata, config, sample_id, cells_id, task_name = "mitochondrialContent", num_cells_to_downsample = 6000) { + cells_id.sample <- cells_id[[sample_id]] - if (length(sample_cell_ids) == 0) { - return(list(data = scdata_list[[sample_id]], new_ids = cells_id, config = config, plotData = list())) + if (length(cells_id.sample) == 0) { + return(list(data = scdata, new_ids = cells_id, config = config, plotData = list())) } - sample_data <- subset_ids(scdata_list[[sample_id]], sample_cell_ids) + scdata.sample <- subset_ids(scdata, cells_id.sample) # Check if the experiment has MT-content - if (!"percent.mt" %in% colnames(scdata_list[[sample_id]]@meta.data)) { + if (!"percent.mt" %in% colnames(scdata@meta.data)) { message("Warning! No MT-content has been computed for this experiment!") guidata <- list() - return(list(data = scdata_list[[sample_id]], config = config, plotData = guidata)) + return(list(data = scdata, config = config, plotData = guidata)) } - max_fraction <- config$filterSettings$methodSettings[[config$filterSettings$method]]$maxFraction + maxFraction <- config$filterSettings$methodSettings[[config$filterSettings$method]]$maxFraction - plot_data <- generate_mito_plot_data(sample_data) + plot_data <- generate_mito_plot_data(scdata.sample) if (exists("auto", where = config)) { if (as.logical(toupper(config$auto))) { - max_fraction <- generate_default_values_mitochondrialContent(sample_data, config) + maxFraction <- generate_default_values_mitochondrialContent(scdata.sample, config) } } if (as.logical(toupper(config$enabled))) { - remaining_ids <- sample_data@meta.data$cells_id[sample_data$percent.mt <= max_fraction * 100] + remaining_ids <- scdata.sample@meta.data$cells_id[scdata.sample$percent.mt <= maxFraction * 100] } else { - remaining_ids <- sample_cell_ids + remaining_ids <- cells_id.sample } - config$filterSettings$methodSettings[[config$filterSettings$method]]$maxFraction <- max_fraction + config$filterSettings$methodSettings[[config$filterSettings$method]]$maxFraction <- maxFraction # Downsample plotData - num_cells_to_downsample <- downsample_plotdata(ncol(sample_data), num_cells_to_downsample) + num_cells_to_downsample <- downsample_plotdata(ncol(scdata.sample), num_cells_to_downsample) - set.seed(RANDOM_SEED) - cells_position_to_keep <- sample(1:ncol(sample_data), num_cells_to_downsample, replace = FALSE) + set.seed(gem2s$random.seed) + cells_position_to_keep <- sample(1:ncol(scdata.sample), num_cells_to_downsample, replace = FALSE) cells_position_to_keep <- sort(cells_position_to_keep) plot1_data <- plot_data$plot1_data[cells_position_to_keep] plot2_data <- plot_data$plot2_data[cells_position_to_keep] @@ -71,8 +71,8 @@ filter_high_mito <- function(scdata_list, config, sample_id, cells_id, task_name # populate with filter statistics filter_stats <- list( - before = calc_filter_stats(sample_data), - after = calc_filter_stats(subset_ids(sample_data, remaining_ids)) + before = calc_filter_stats(scdata.sample), + after = calc_filter_stats(subset_ids(scdata.sample, remaining_ids)) ) guidata[[generate_gui_uuid(sample_id, task_name, 2)]] <- filter_stats @@ -80,7 +80,7 @@ filter_high_mito <- function(scdata_list, config, sample_id, cells_id, task_name cells_id[[sample_id]] <- remaining_ids result <- list( - data = scdata_list, + data = scdata, new_ids = cells_id, config = config, plotData = guidata diff --git a/pipeline-runner/R/qc-4-filter_gene_umi_outlier.R b/pipeline-runner/R/qc-4-filter_gene_umi_outlier.R index f475c6a1..041b6c8a 100644 --- a/pipeline-runner/R/qc-4-filter_gene_umi_outlier.R +++ b/pipeline-runner/R/qc-4-filter_gene_umi_outlier.R @@ -15,7 +15,7 @@ #' - linear and spline: for each there is only one element: #' - p.level: which refers to confidence level for deviation from the main trend #' -#' @param scdata_list list of \code{SeuratObject} +#' @param scdata \code{SeuratObject} #' @param sample_id value in \code{scdata$samples} to apply filter for #' @param task_name name of task: \code{'numGenesVsNumUmis'} #' @param num_cells_to_downsample maximum number of cells for returned plots @@ -24,34 +24,34 @@ #' @return a list with the filtered seurat object by numGenesVsNumUmis, the config and the plot values #' #' -filter_gene_umi_outlier <- function(scdata_list, config, sample_id, cells_id, task_name = "numGenesVsNumUmis", num_cells_to_downsample = 6000) { - sample_cell_ids <- cells_id[[sample_id]] +filter_gene_umi_outlier <- function(scdata, config, sample_id, cells_id, task_name = "numGenesVsNumUmis", num_cells_to_downsample = 6000) { + cells_id.sample <- cells_id[[sample_id]] - if (length(sample_cell_ids) == 0) { - return(list(data = scdata_list[[sample_id]], new_ids = cells_id, config = config, plotData = list())) + if (length(cells_id.sample) == 0) { + return(list(data = scdata, new_ids = cells_id, config = config, plotData = list())) } - sample_data <- subset_ids(scdata_list[[sample_id]], sample_cell_ids) + scdata.sample <- subset_ids(scdata, cells_id.sample) type <- config$filterSettings$regressionType - # get p_level and update in config + # get p.level and update in config # defaults from "gene.vs.molecule.cell.filter" in pagoda2 if (safeTRUE(config$auto)) - p_level <- min(0.001, 1 / ncol(sample_data)) + p.level <- min(0.001, 1 / ncol(scdata.sample)) else - p_level <- config$filterSettings$regressionTypeSettings[[type]]$p.level + p.level <- config$filterSettings$regressionTypeSettings[[type]]$p.level - p_level <- suppressWarnings(as.numeric(p_level)) - if(is.na(p_level)) stop("p_level couldn't be interpreted as a number.") + p.level <- suppressWarnings(as.numeric(p.level)) + if(is.na(p.level)) stop("p.level couldnt be interpreted as a number.") - config$filterSettings$regressionTypeSettings[[type]]$p.level <- p_level + config$filterSettings$regressionTypeSettings[[type]]$p.level <- p.level # regress log10 molecules vs genes fit.data <- data.frame( - log_molecules = log10(sample_data$nCount_RNA), - log_genes = log10(sample_data$nFeature_RNA), - row.names = sample_data$cells_id + log_molecules = log10(scdata.sample$nCount_RNA), + log_genes = log10(scdata.sample$nFeature_RNA), + row.names = scdata.sample$cells_id ) fit.data <- fit.data[order(fit.data$log_molecules), ] @@ -60,21 +60,21 @@ filter_gene_umi_outlier <- function(scdata_list, config, sample_id, cells_id, ta else fit <- MASS::rlm(log_genes ~ log_molecules, data = fit.data) if (safeTRUE(config$enabled)) { - # get the interval based on p_level parameter - preds <- suppressWarnings(predict(fit, interval = "prediction", level = 1 - p_level)) + # get the interval based on p.level parameter + preds <- suppressWarnings(predict(fit, interval = "prediction", level = 1 - p.level)) # filter outliers above/below cutoff bands is.outlier <- fit.data$log_genes > preds[, 'upr'] | fit.data$log_genes < preds[, 'lwr'] remaining_ids <- as.numeric(rownames(fit.data)[!is.outlier]) remaining_ids <- remaining_ids[order(remaining_ids)] } else { - remaining_ids <- sample_cell_ids + remaining_ids <- cells_id.sample } # downsample for plot data - nkeep <- downsample_plotdata(ncol(sample_data), num_cells_to_downsample) + nkeep <- downsample_plotdata(ncol(scdata.sample), num_cells_to_downsample) - set.seed(RANDOM_SEED) + set.seed(gem2s$random.seed) keep_rows <- sample(nrow(fit.data), nkeep) keep_rows <- sort(keep_rows) downsampled_data <- fit.data[keep_rows, ] @@ -82,7 +82,7 @@ filter_gene_umi_outlier <- function(scdata_list, config, sample_id, cells_id, ta # get evenly spaced predictions on downsampled data for plotting lines xrange <- range(downsampled_data$log_molecules) newdata <- data.frame(log_molecules = seq(xrange[1], xrange[2], length.out = 10)) - line_preds <- suppressWarnings(predict(fit, newdata, interval = "prediction", level = 1 - p_level)) + line_preds <- suppressWarnings(predict(fit, newdata, interval = "prediction", level = 1 - p.level)) line_preds <- cbind(newdata, line_preds) %>% dplyr::select(-fit) %>% @@ -95,8 +95,8 @@ filter_gene_umi_outlier <- function(scdata_list, config, sample_id, cells_id, ta # Populate with filter statistics and plot data filter_stats <- list( - before = calc_filter_stats(sample_data), - after = calc_filter_stats(subset_ids(sample_data, remaining_ids)) + before = calc_filter_stats(scdata.sample), + after = calc_filter_stats(subset_ids(scdata.sample, remaining_ids)) ) guidata <- list() @@ -106,7 +106,7 @@ filter_gene_umi_outlier <- function(scdata_list, config, sample_id, cells_id, ta cells_id[[sample_id]] <- remaining_ids result <- list( - data = scdata_list, + data = scdata, new_ids = cells_id, config = config, plotData = guidata diff --git a/pipeline-runner/R/qc-5-filter_doublets.R b/pipeline-runner/R/qc-5-filter_doublets.R index 044ac2df..b99bf9ce 100644 --- a/pipeline-runner/R/qc-5-filter_doublets.R +++ b/pipeline-runner/R/qc-5-filter_doublets.R @@ -17,47 +17,47 @@ #' @export #' @return a list with the filtered seurat object by doublet score, the config and the plot values #' -filter_doublets <- function(scdata_list, config, sample_id, cells_id, task_name = "doubletScores", num_cells_to_downsample = 6000) { - sample_cell_ids <- cells_id[[sample_id]] +filter_doublets <- function(scdata, config, sample_id, cells_id, task_name = "doubletScores", num_cells_to_downsample = 6000) { + cells_id.sample <- cells_id[[sample_id]] - if (length(sample_cell_ids) == 0) { - return(list(data = scdata_list[[sample_id]], new_ids = cells_id, config = config, plotData = list())) + if (length(cells_id.sample) == 0) { + return(list(data = scdata, new_ids = cells_id, config = config, plotData = list())) } - sample_data <- subset_ids(scdata_list[[sample_id]], sample_cell_ids) + scdata.sample <- subset_ids(scdata, cells_id.sample) # Check if the experiment has doubletScores - if (!"doublet_scores" %in% colnames(scdata_list[[sample_id]]@meta.data)) { + if (!"doublet_scores" %in% colnames(scdata@meta.data)) { message("Warning! No doubletScores scores has been computed for this experiment!") guidata <- list() - return(list(data = scdata_list[[sample_id]], config = config, plotData = guidata)) + return(list(data = scdata, config = config, plotData = guidata)) } - probability_threshold <- config$filterSettings[["probabilityThreshold"]] + probabilityThreshold <- config$filterSettings[["probabilityThreshold"]] # Check if it is required to compute sensible values. From the function 'generate_default_values_doubletScores', it is expected - # to get a probability threshold value + # to get a value --> probabilityThreshold. if (exists("auto", where = config)) { if (as.logical(toupper(config$auto))) { - probability_threshold <- generate_default_values_doubletScores(sample_data) + probabilityThreshold <- generate_default_values_doubletScores(scdata.sample) } } - plot1_data <- generate_doublets_plot_data(sample_data, num_cells_to_downsample) + plot1_data <- generate_doublets_plot_data(scdata.sample, num_cells_to_downsample) # Check whether the filter is set to true or false if (as.logical(toupper(config$enabled))) { # all barcodes that match threshold in the subset data # treat NA doublet scores as defacto singlets - doublet_scores <- sample_data$doublet_scores + doublet_scores <- scdata.sample$doublet_scores doublet_scores[is.na(doublet_scores)] <- 0 - remaining_ids <- sample_data@meta.data$cells_id[doublet_scores <= probability_threshold] + remaining_ids <- scdata.sample@meta.data$cells_id[doublet_scores <= probabilityThreshold] } else { - remaining_ids <- sample_cell_ids + remaining_ids <- cells_id.sample } # update config - config$filterSettings$probabilityThreshold <- probability_threshold + config$filterSettings$probabilityThreshold <- probabilityThreshold guidata <- list() @@ -66,8 +66,8 @@ filter_doublets <- function(scdata_list, config, sample_id, cells_id, task_name # populate with filter statistics filter_stats <- list( - before = calc_filter_stats(sample_data), - after = calc_filter_stats(subset_ids(sample_data, remaining_ids)) + before = calc_filter_stats(scdata.sample), + after = calc_filter_stats(subset_ids(scdata.sample, remaining_ids)) ) guidata[[generate_gui_uuid(sample_id, task_name, 1)]] <- filter_stats @@ -75,7 +75,7 @@ filter_doublets <- function(scdata_list, config, sample_id, cells_id, task_name cells_id[[sample_id]] <- remaining_ids result <- list( - data = scdata_list, + data = scdata, new_ids = cells_id, config = config, plotData = guidata @@ -85,8 +85,8 @@ filter_doublets <- function(scdata_list, config, sample_id, cells_id, task_name generate_default_values_doubletScores <- function(scdata) { # default doublet score based of scDblFinder classification - is_singlet <- scdata$doublet_class == "singlet" - threshold <- max(scdata$doublet_scores[is_singlet], na.rm = TRUE) + is.singlet <- scdata$doublet_class == "singlet" + threshold <- max(scdata$doublet_scores[is.singlet], na.rm = TRUE) return(threshold) } diff --git a/pipeline-runner/R/qc-6-integrate_scdata.R b/pipeline-runner/R/qc-6-integrate_scdata.R index 646d818b..38c1383a 100644 --- a/pipeline-runner/R/qc-6-integrate_scdata.R +++ b/pipeline-runner/R/qc-6-integrate_scdata.R @@ -20,41 +20,33 @@ # } # }, -integrate_scdata <- function(scdata_list, config, sample_id, cells_id, task_name = "dataIntegration") { - # subset each of the samples before merging them - for (sample in names(scdata_list)) { - flat_cell_ids <- unname(unlist(cells_id[[sample]])) - scdata_list[[sample]] <- subset_ids(scdata_list[[sample]], flat_cell_ids) - } - - message("Total cells: ", sum(sapply(scdata_list, ncol))) - scdata <- create_scdata(scdata_list) - +integrate_scdata <- function(scdata, config, sample_id, cells_id, task_name = "dataIntegration") { + flat_cells_id <- unname(unlist(cells_id)) + scdata <- subset_ids(scdata, flat_cells_id) # main function - set.seed(RANDOM_SEED) - message("running data integration") - scdata_integrated <- run_dataIntegration(scdata, config) + set.seed(gem2s$random.seed) + scdata.integrated <- run_dataIntegration(scdata, config) # get npcs from the UMAP call in integration functions - npcs <- length(scdata_integrated@commands$RunUMAP@params$dims) + npcs <- length(scdata.integrated@commands$RunUMAP@params$dims) message("\nSet config numPCs to npcs used in last UMAP call: ", npcs, "\n") config$dimensionalityReduction$numPCs <- npcs - var_explained <- get_explained_variance(scdata_integrated) + var_explained <- get_explained_variance(scdata.integrated) # This same numPCs will be used throughout the platform. - scdata_integrated@misc[["numPCs"]] <- config$dimensionalityReduction$numPCs + scdata.integrated@misc[["numPCs"]] <- config$dimensionalityReduction$numPCs - scdata_integrated <- colorObject(scdata_integrated) - cells_order <- rownames(scdata_integrated@meta.data) - plot1_data <- unname(purrr::map2(scdata_integrated@reductions$umap@cell.embeddings[, 1], scdata_integrated@reductions$umap@cell.embeddings[, 2], function(x, y) { + scdata.integrated <- colorObject(scdata.integrated) + cells_order <- rownames(scdata.integrated@meta.data) + plot1_data <- unname(purrr::map2(scdata.integrated@reductions$umap@cell.embeddings[, 1], scdata.integrated@reductions$umap@cell.embeddings[, 2], function(x, y) { c("x" = x, "y" = y) })) # Adding color and sample id plot1_data <- purrr::map2( plot1_data, - unname(scdata_integrated@meta.data[cells_order, "samples"]), + unname(scdata.integrated@meta.data[cells_order, "samples"]), function(x, y) { append(x, list("sample" = y)) } @@ -62,7 +54,7 @@ integrate_scdata <- function(scdata_list, config, sample_id, cells_id, task_name plot1_data <- purrr::map2( plot1_data, - unname(scdata_integrated@meta.data[cells_order, "color_samples"]), + unname(scdata.integrated@meta.data[cells_order, "color_samples"]), function(x, y) { append(x, list("col" = y)) } @@ -79,7 +71,7 @@ integrate_scdata <- function(scdata_list, config, sample_id, cells_id, task_name # the result object will have to conform to this format: {data, config, plotData : {plot1, plot2}} result <- list( - data = scdata_integrated, + data = scdata.integrated, new_ids = cells_id, config = config, plotData = plots @@ -88,47 +80,6 @@ integrate_scdata <- function(scdata_list, config, sample_id, cells_id, task_name return(result) } -#' Create the merged Seurat object -#' -#' This function takes care of merging the sample seurat objects, shuffling -#' and adding the metadata to the complete Seurat object. It does so by calling -#' the corresponding functions. -#' -#' @param scdata_list -#' -#' @return SeuratObject -#' @export -#' -create_scdata <- function(scdata_list) { - - merged_scdatas <- merge_scdata_list(scdata_list) - merged_scdatas <- add_metadata(merged_scdatas, scdata_list) - - return(merged_scdatas) -} - -#' Merge the list of sample Seurat objects -#' -#' If the list contains more than one seurat object, it merges them all. Else, -#' returns the only element. -#' -#' @param scdata_list list of SeuratObjects -#' -#' @return SeuratObject -#' @export -#' -merge_scdata_list <- function(scdata_list) { - - if (length(scdata_list) == 1) { - scdata <- scdata_list[[1]] - } else { - scdata <- merge(scdata_list[[1]], y = scdata_list[-1]) - } - - return(scdata) - -} - # This function covers # - Integrate the data using the variable "type" (in case of data integration method is selected) and normalize using LogNormalize method. # - Compute PCA analysis @@ -481,37 +432,3 @@ build_cc_gene_list <- function(all_genes) { return(cc_gene_indices) } - -#' Add the metadata present in scdata_list into the merged SeuratObject -#' -#' This function adds metadata, some of which is present in the sample SeuratObjects -#' (the experimentID, the gene annotations), and some that is not (the color pool) -#' to the merged SeuratObject. -#' -#' @param scdata SeuratObject -#' @param scdata_list list of sample SeuratObjects -#' -#' @return SeuratObject with added metadata -#' @export -#' -add_metadata <- function(scdata, scdata_list) { - # misc data is duplicated in each of the samples and it does not - # need to be merge so pick the data in the first one and add it to the merged dataset - scdata@misc <- scdata_list[[1]]@misc - experiment_id <- scdata_list[[1]]@misc[["experimentId"]] - - annot_list <- list() - for (sample in names(scdata_list)) { - annot_list[[sample]] <- scdata_list[[sample]]@misc[["gene_annotations"]] - } - - scdata@misc[["gene_annotations"]] <- format_annot(annot_list) - - # We store the color pool in a slot in order to be able to access it during configureEmbedding - scdata@misc[["color_pool"]] <- get_color_pool() - scdata@misc[["experimentId"]] <- experiment_id - scdata@misc[["ingestionDate"]] <- Sys.time() - - return(scdata) -} - diff --git a/pipeline-runner/R/qc-7-embed_and_cluster.R b/pipeline-runner/R/qc-7-embed_and_cluster.R index 709033de..572f6971 100644 --- a/pipeline-runner/R/qc-7-embed_and_cluster.R +++ b/pipeline-runner/R/qc-7-embed_and_cluster.R @@ -9,6 +9,7 @@ embed_and_cluster <- sample_id, cells_id, task_name = "configureEmbedding") { + message("starting clusters") clustering_method <- config$clusteringSettings$method methodSettings <- @@ -77,7 +78,6 @@ update_sets_through_api <- experiment_id, cell_set_key, auth_JWT) { - httr_query <- paste0("$[?(@.key == \"", cell_set_key, "\")]") httr::PATCH( diff --git a/pipeline-runner/R/qc-helpers.R b/pipeline-runner/R/qc-helpers.R index 4a9af388..dcc9b574 100644 --- a/pipeline-runner/R/qc-helpers.R +++ b/pipeline-runner/R/qc-helpers.R @@ -1,14 +1,15 @@ #' Title #' -#' @param scdata_list +#' @param scdata #' #' @return #' @export #' -generate_first_step_ids <- function(scdata_list) { +#' @examples +generate_first_step_ids <- function(scdata) { cells_id <- list() - for (sample_id in names(scdata_list)) { - cells_id[[sample_id]] <- scdata_list[[sample_id]]$cells_id + for (sample_id in unique(scdata$samples)) { + cells_id[[sample_id]] <- scdata$cells_id[scdata$samples == sample_id] } return(cells_id) } @@ -44,7 +45,7 @@ remove_cell_ids <- function(pipeline_config, experiment_id) { get_positions_to_keep <- function(scdata, num_cells_to_downsample) { # Downsample plotData num_cells_to_downsample <- downsample_plotdata(ncol(scdata), num_cells_to_downsample) - set.seed(RANDOM_SEED) + set.seed(gem2s$random.seed) cells_position_to_keep <- sample(1:ncol(scdata), num_cells_to_downsample, replace = FALSE) cells_position_to_keep <- sort(cells_position_to_keep) @@ -180,6 +181,7 @@ calc_filter_stats <- function(scdata) { #' @return #' @export #' +#' @examples runClusters <- function(clustering_method, resolution, data) { data <- getClusters(clustering_method, resolution, data) res_col <- paste0(data@active.assay, "_snn_res.", toString(resolution)) @@ -201,6 +203,7 @@ runClusters <- function(clustering_method, resolution, data) { #' @return #' @export #' +#' @examples getClusters <- function(clustering_method, resolution, data) { res_col <- paste0(data@active.assay, "_snn_res.", toString(resolution)) algorithm <- list("louvain" = 1, "leiden" = 4)[[clustering_method]] diff --git a/pipeline-runner/R/sysdata.rda b/pipeline-runner/R/sysdata.rda index d71d6e79d68ff93481dc693bca91c7463d41cf23..27434084e7f46e0d61fd1d38159f1bbe684811e2 100644 GIT binary patch literal 2520 zcmV;}2`BbKT4*^jL0KkKS-btAe*h7C|MLI;|NsC0|NsC0|NsC0|L{OS03ZM$5C9+r z;0d1>#&5jXaG50JC;*)SZgiqlB~24RnlTebX*E34NwSzGihD{uO_VpOJtmC+^neWj z000J>4^U~OA)%nuVgh1El|3|1C~Zfi(V);Y01W^D00uw+2rvLl00hA>00000Oafpb zfB+)^00F7!0znF=`kJ0Z#XUiZp_)d4lST*t00t8vVgLg`$&(3HK*AWBWYB4*ni@3I zKp2KhMw&Ee(TS0SFd#4>VloDSB8p<0LY|L7r2qf`02*ij0MG`2000_#MuFuTa(TH~ z$E!C#OzE&-+Zfs%(4ekKC99;xK&Yh!WQ<{Epa?NY5J>{Y8W2!4i$fK$ta)GsfuiYZ z{o=tTkM#s?j07xBo>yJF)#R-o)Eq(1u`TY0;<{lL0+g%sm&`>5GkAL;Nm0IKGa1bW zlStDw4O3WVG#Z-b!Ll|Rn?r%f=-W33V`Fr9ZyDzJ^x!y0#}URqISx_*0D?ju5RPIQ z5PEZ+1X3{^en>h_qpXriB@)Sm(s9^!oyU3Lc$3L=2*ipssnqiNW&LmFeZ7Bt-^CxA z3wS-$ycyU}kO;wt6-0^}2P6O~bdiv+4HK;LOZ8={Bmhpv(m;UBGUhr)^L?f6jSh-> zeLm~GrLh?&Nz#a%_gx|(4&<{nt>JCvwX*qcad9u*4b$&m1y$qq4;~lE5{5j&DDCx= z!aDY0X?0Z}J^qf4wc@QaG4e2SF5ZgGzrS{C=lS)1Rpx6ma%X*+^0q9;Jrf*y zs%E8GGBI_kT19Z$I2A)f`pUejs9p$-*02V+oz*rpb>0z?7gn)dz6a^X+^D0;5YUXw zzuukMF`8LNsJ3sjKVzw4+8qg-i&%wphNehqOKYM+G8hc zxnvW$q1{hqr(YV~jo9^BG~#n{;`0WZEELM5&aM$zKmkt-ZIoI!E#66 zQqYA4F<}}EtdU6FxS8J2We^jDxlw|0KxaT3h1Oy)P88_Y?C65c6H`dtJB4si} zT1l%3g)FKUOpJl$wiy;cApqEQ@<=mOwn~gt&w6UlB2PN-;KPT|X2wxOE4Ir;H2N{| zQ?BOYtkHKC^#trHTa>uR5~a1PM-d7Vmzd$Jj+#GyJ^_=Cu9w_rNC$AQ!zmaFsB==; z%q#-(LpKKnlx2i6W+Z5hp^3L^ZMM=HG&s9vV2mSxvE35}HDICzI7oDp&e9^+!CdDl zhQ4MhIN06SZ|2gun9;U*=a+Nj=MG#$ZrzpFOZDsJ&xTCvp`d$iE>D2XU5YpoIeA*#7IkaXdM@Z0gbWrK(=X;JRro0-_oi&a!?(#X!Lo+;8EAbOZ zNfkA1$mL%<>7jTsDL&WEUD@+Hqug%0wKLAEVxtdNJjZjn$DZT6-QR~;aVmJS?(NDW zfQgyp+>~oFs<~90}rh`gimkmNzRWH!lG({Q}&5d^&pfRJud zlQ(8n;50fuYq#32V{MES(dpMLL;n#HgR!Apr(bfcaYf7JPRK{6JQ2?IkEki$9*)QTjX%BFB&*g>{oj2#7P zw;)a#<``W0J2JB=w&t_afQA=CHKDjIF4>mQKV%33exL+Il^`1j#_RO4E4oR<<29I& zA|OVEo?Q4i($!(nIDL&rWSNM}l8yf?X$l3m4SN}0mx?qM4bz0NLv^J7o%&yB^-iW9+-0~?+2CQf<$8zC*IvrHc03srVK=!XCV!5xyRn|PuLbFUohEOOd z5fCG$CS?l&Pbo;SO@-L$C8aPiA*$^#b3`VRTPqGDuPt7s8>N&IECKdlbUIyiS=N4N zUq(@k7RW_3(!d!3SdzV%&Gbuikr4qFG=zrR1jfJ^l+%#`B`rsZW|F};+Q&xXd)LYo z-l ztmj4aG63;1_7lQR$=EA8L=re;HhAOejc#zu+U1Z_PjHxT?vGSdm?2CBRA18}nH5is zRhKFz&X@Ki0C9?dmw}N>Yl6{gUSzquxv>UDw-90l-QLd}@@UASV$l-2*>;Ee2&zlS zX(y}}P&?hVEz6rDuUsq}cS$B>IL28bRM%pz`|))5G&prxj0^-trg#!eaSi2d*;i{2 zU(mCIfLHi5K~N^-t4XZl$$|wvcXMu|h6iedI_EH~MAYBlXBM@pwlv8^&Dv5KmZxqQ zpGx%(T-;tWlDwE2V6kIZ$0dd(kgOdi^uI&F)4ureXEKS6+Aq9QEEL#d<~wHf1WG^BP7qZ!+P)FriOnRsi&qz!=5LlxXjW^*f~o?-BKEE3fL z2G?sBgm%=`VoiEx7jSDvrRhlA$n)&cS;S1x5$#{Fj*8MmjI`tBjOhOK1dq^>TU*96 zNwBp%v_51UPuYulM2|ZdVr8U0tjyI&1Cx3iuSlm}M$xPlJ@6$75=B3aRwA#<$mr!^ iO=F7l&jHm(S5#_KPN%dWFb53h{x0N-aG@c0`$GPNR+-QM literal 2529 zcmV<72_E)BT4*^jL0KkKSwsIA%K#CCfAatT|NsC0|NsC0|NsC0|L{OS03ZM$5C9+r z;0eEdWY*&Nc&;ZbaWYB4;jX^a%GN$yVo~NdfI{GefB*mh0BC3sU;vl^34&k%0001( z1i(W807d`+15?lhgqc+TP-<#^ihik+VNE?x)iY2sXlP^%jD`@<7@9OR0LWz1MiD9| zNj9dQlhHH{00006fB*mh000000p%W`l{FJY%@AbB0000002(yYO)>z}Lrnpo0MG%T zgPj}rmFq=7;Yfghh=K?rgnSOiBMdxzHX0~s6@zWIPy!fb6eh=sx_h(X08xvgLw z2%+%uT?pu5A%46XJrD&7nu!}(5{b(Uukf-i4oFqI*FqS{MS!I%YXP)jF&NB)(-WI! zQ*iDXjng@x-Z#$>(rP#k6U95H5yo;Hr!nIk=ZRWIZu+uP?fb{M1iO06NlHZ}R11wC3$QatxXMK(w`$x4060Y{5KsyO z$nb1w7x*;&qoLI|XQSHQwXqo&Vc3YAw%S5K9s04l>$}d}b=PY7E^qHHLd5Gn)hSW( z4mQ5zM*`+B0P- z^gcQ>E=sdsGZ6o8^MWTa)4V5_osHEHM~P1Ev;g^ z+2~D)+^D0ZMP^1K@yAaAnUeOclQS)@qQ$g26E_yI3h50@kkXddW*o^yc2ub<$|+r{ z-rOoI(#69h7G+z9xuK};{ARqwttdt%%chLcq&vQ_k2I19ZAHveSTbsok(KQNSe6fF z0!j#niVjj0BBGvRR?xbn(RG`uxo-xH8#NeO(0Wl+7LA?P0D;I-9G9m>t9WiNOvC0%V78*MSATkuM#&|0hHe!c% zSP9d{FfaxOhk=Y>Vnn$SVlvdsfl!sVhk*!XRjH6ulwmGi11m#23N&SqMlqo7b9`K~ z%En6Ta>OTpGju(cZCX~^jj5usyMgT6tI+<2(2_YXZ4wc!*HtJ zB4si}T1l%3g)FKUNCrOg-GU{Lh(0?8t_X%`t5}@DG`Az7@+7jZ{Fm?f%+|~}U|!2B zlTGK;-voOMu+baFzva)Q0V~>tQ#&X1b zCcTl}(CyFQanqq&v$KLb%Z-j0;DQITKHam1xsM#RDnkAH{5LGxxVx?(%Mo2{h>{_* zODovP3`HANrB-5uWfLgKW%~VSwn(6?sbsQeUaskri3e=k0Kr(ghatkDEw!_hZc*Cn zapL#C7McY{O}4}FW?j`obF&-Z>2I4wl^Jwnf{>>#66vnG1r?nN^X!hdpaBD;o9u!!bN8~JEQ`-va&Nv_ucfjCDw8x@{hhlkp|_E+a+5~!tty` z67((tLAgpy-I-T_(CGQD$6mONwlGsir(CfR84j*zV@nl)PO4nUWSdZ7Bw8ES=vVI*xVU1+(A`8s4e8At4QJkAYEe71|$k2aywzdHB)3+ z0T32MLSpM94#r!(Sa1_)6A_i-+YU@l_N`rN#dE?LG#*lzrfZf)sbi6}w&@W8r zkscRdVPMd}aLV(yzMRsU>yBn-wcd!ZF6|9NEyJ?%7+-=<8{o?v@BtAdO++bTbmAiX za$AMZI$K!F%TadrzTO@*KOWCIxL-8#sJpS71vULRe~Y0*@5NlS;ov(sum#>Qb_@XfDl{@ zMIbIEjqj1L$lAwKaGqz%6x~#(;&bI#5(=V%K?RTyY(Cdwag->R9jpSBr8!uAjAuIdNj)x zY>Fc*kgLI$VCo^AE+M*009HWoca*zY>DQzb%NEF^WjsQpDlGY{$0sKBL4%oBfiZB2 z(@~k7(j4Sn`IBmhVZk1ROV<7AM5r)riv3aqb9u2H@s)0jk5gu`$& rZc`_Pk+rO`Xlkaz%s++w75nzqqitsV3P7-G&+&I8Q-uixKk= 0) - #Verify that we have percent mt and not fraction - expect_true(max(metadata$percent.mt) > 1 || all(metadata$percent.mt == 0)) + expect_true("doublet_scores" %in% colnames(md)) + expect_true("cells_id" %in% colnames(md)) + expect_true("samples" %in% colnames(md)) + + test_that("Cell ids are assigned correctly", { + cellNumber <- ncol(scdata@assays$RNA@data) + expect_equal(md$cells_id, 0:(cellNumber - 1)) + }) + + test_that("Mitochondrial percentage is correct", { + expect_true(max(md$percent.mt) <= 100) + expect_true(min(md$percent.mt) >= 0) + #Verify that we have percent mt and not fraction + expect_true(max(md$percent.mt) > 1 || all(md$percent.mt == 0)) + }) + }) +} - } +test_that("prepare_experiment creates a valid Seurat object", { + expect_true(TRUE) + suppressWarnings(test_object()) }) diff --git a/pipeline-runner/tests/testthat/test-gem2s-7-upload_to_aws.R b/pipeline-runner/tests/testthat/test-gem2s-7-upload_to_aws.R index 9af76d18..b0554952 100644 --- a/pipeline-runner/tests/testthat/test-gem2s-7-upload_to_aws.R +++ b/pipeline-runner/tests/testthat/test-gem2s-7-upload_to_aws.R @@ -1,188 +1,168 @@ -set.seed(1) - -mock_doublet_scores <- function(counts) { - doublet_scores <- runif(ncol(counts)) - doublet_class <- ifelse(doublet_scores < 0.8, "singlet", "doublet") - - data.frame( - row.names = colnames(counts), - barcodes = colnames(counts), - doublet_class = doublet_class, - doublet_scores = doublet_scores - ) -} - -mock_scdata_list = function (config) { - prev_out <- mock_prev_out(config) - scdata_list <- prev_out$scdata_list - - task_out <- prepare_experiment(NULL, NULL, prev_out)$output - scdata_list <- task_out$scdata_list -} - -mock_input <- function(metadata = NULL) { - input <- list( - name = "project name", - sampleNames = list("a", "b", "c"), - sampleIds = list("123abc", "123def", "123ghi"), +mock_config <- function(metadata = NULL) { + config <- list( + sampleNames = list("WT1", "WT2"), + sampleIds = list("123abc", "123def"), metadata = metadata ) - return(input) + return(config) } -mock_config <- function(input) { - config <- list( - name = input$name, - samples = input$sampleIds, - metadata = input$metadata + +mock_scdata <- function(config) { + pbmc_raw <- read.table( + file = system.file("extdata", "pbmc_raw.txt", package = "Seurat"), + as.is = TRUE ) - return (config) -} + # construct metadata + # add samples + samples <- unlist(config$sampleIds) -mock_prev_out <- function(config, counts = NULL) { - samples <- config$samples + rest <- config$metadata + keys <- c("samples", names(rest)) + metadata <- data.frame(row.names = colnames(pbmc_raw), samples = rep(samples, each = 40)) - if (is.null(counts)) { - counts <- DropletUtils:::simCounts() - colnames(counts) <- paste0("cell", seq_len(ncol(counts))) + # Add "metadata" if exists in config + if (!is.null(rest)) { + rest <- lapply(rest, unlist) + rest <- data.frame(rest, row.names = samples, check.names = FALSE) + metadata[names(rest)] <- rest[metadata$samples, ] } - eout <- DropletUtils::emptyDrops(counts) - - counts_list <- list() - edrops <- list() - doublet_scores <- list() + # make syntactically valid + colnames(metadata) <- make.names(colnames(metadata), unique = TRUE) - for (sampleId in samples) { - counts_list[[sampleId]] <- counts - edrops[[sampleId]] <- eout - doublet_scores[[sampleId]] <- mock_doublet_scores(counts) - } + scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw, meta.data = metadata) - # as passed to create_seurat - prev_out <- list( - counts_list = counts_list, - edrops = edrops, - doublet_scores = doublet_scores, - annot = data.frame(name = row.names(counts), input = row.names(counts)), - config = config - ) - # call create_seurat to get prev_out to pass to prepare_experiment - return(create_seurat(NULL, NULL, prev_out)$output) + scdata$cells_id <- seq(0, ncol(scdata) - 1) + return(scdata) } -check_metadata_cell_ids <- function(metadata_key, metadata_value, sample_keys, cell_sets) { - keys <- sapply(cell_sets$cellSets, `[[`, "key") - metadata_set <- cell_sets$cellSets[[which(keys == metadata_key)]] - metadata_name <- sapply(metadata_set$children, `[[`, "name") +test_that("get_cell_sets creates scratchpad and sample sets if no metadata", { + config <- mock_config() + scdata <- mock_scdata(config) - sample_set <- cell_sets$cellSets[[which(keys == "sample")]] - sample_names <- sapply(sample_set$children, `[[`, "name") + cell_sets <- get_cell_sets(scdata, config) + keys <- sapply(cell_sets$cellSets, `[[`, "key") - metadata_cell_ids <- unlist(metadata_set$children[[which(metadata_name == metadata_value)]]$cellId) + expect_setequal(keys, c("scratchpad", "sample")) +}) - sample_cell_sets = purrr::keep(sample_set$children, \(x) x[["key"]] %in% sample_keys) - sample_cell_ids = unlist(lapply(sample_cell_sets, `[[`, "cellIds")) - expect_equal(metadata_cell_ids, sample_cell_ids) -} +test_that("get_cell_sets adds correct cell ids for each sample", { + config <- mock_config() + scdata <- mock_scdata(config) -test_that("get_cell_sets creates scratchpad and sample sets if no metadata", { - input <- mock_input() - config <- mock_config(input) + cell_sets <- get_cell_sets(scdata, config) + sets_key <- sapply(cell_sets$cellSets, `[[`, "key") - scdata_list <- mock_scdata_list(config) + sample_sets <- cell_sets$cellSets[[which(sets_key == "sample")]] + samples_key <- sapply(sample_sets$children, `[[`, "key") - cell_sets <- get_cell_sets(scdata_list, input) - keys <- sapply(cell_sets$cellSets, `[[`, "key") + for (sample_id in config$sampleIds) { + sample_cells <- sample_sets$children[[which(samples_key == sample_id)]]$cellIds + expected_cells <- unname(scdata$cells_id)[scdata$samples == sample_id] - expect_setequal(keys, c("scratchpad", "sample")) + expect_equal(sample_cells, expected_cells) + } }) test_that("get_cell_sets adds correct cell ids for each sample", { - input <- mock_input() - config <- mock_config(input) - scdata_list <- mock_scdata_list(config) + config <- mock_config() + scdata <- mock_scdata(config) - cell_sets <- get_cell_sets(scdata_list, input) + cell_sets <- get_cell_sets(scdata, config) sets_key <- sapply(cell_sets$cellSets, `[[`, "key") sample_sets <- cell_sets$cellSets[[which(sets_key == "sample")]] samples_key <- sapply(sample_sets$children, `[[`, "key") - for (sample_id in config$samples) { + # ids are correct for each child + for (sample_id in config$sampleIds) { sample_cells <- sample_sets$children[[which(samples_key == sample_id)]]$cellIds - expected_cells <- unname(scdata_list[[sample_id]]$cells_id) + expected_cells <- unname(scdata$cells_id)[scdata$samples == sample_id] expect_equal(sample_cells, expected_cells) } }) + test_that("get_cell_sets adds a single metadata column", { - metadata <- list(Group = list("Hello", "WT2", "WT2")) - input <- mock_input(metadata) - config <- mock_config(input) - scdata_list <- mock_scdata_list(config) + metadata <- list(Group = list("Hello", "WT2")) + config <- mock_config(metadata) + scdata <- mock_scdata(config) - cell_sets <- get_cell_sets(scdata_list, input) + cell_sets <- get_cell_sets(scdata, config) + # have it as a key keys <- sapply(cell_sets$cellSets, `[[`, "key") - # has sample key as one of the keys expect_setequal(keys, c("scratchpad", "sample", "Group")) - # Check that each sample/metadata intersection contains the correct cell ids - check_metadata_cell_ids("Group", "WT2", c("123def", "123ghi"), cell_sets) - check_metadata_cell_ids("Group", "Hello", c("123abc"), cell_sets) + group_set <- cell_sets$cellSets[[which(keys == "Group")]] + group_names <- sapply(group_set$children, `[[`, "name") + + + # cell ids are correct for each child + for (group_name in group_names) { + group_cells <- group_set$children[[which(group_names == group_name)]]$cellIds + expected_cells <- unname(scdata$cells_id)[scdata$Group == group_name] + + expect_equal(group_cells, expected_cells) + } }) + test_that("get_cell_sets uses user-supplied syntactically invalid metadata column names", { - metadata <- list("TRUE" = list("Hello", "WT2", "WT2")) - input <- mock_input(metadata) - config <- mock_config(input) - scdata_list <- mock_scdata_list(config) + metadata <- list("TRUE" = list("Hello", "WT2")) + config <- mock_config(metadata) + scdata <- mock_scdata(config) - cell_sets <- get_cell_sets(scdata_list, input) + cell_sets <- get_cell_sets(scdata, config) - # has sample key as one of the keys + # have TRUE as a key keys <- sapply(cell_sets$cellSets, `[[`, "key") - expect_setequal(keys, c("scratchpad", "sample", "TRUE")) + expect_true("TRUE" %in% keys) - check_metadata_cell_ids("TRUE", "WT2", c("123def", "123ghi"), cell_sets) - check_metadata_cell_ids("TRUE", "Hello", c("123abc"), cell_sets) + group_set <- cell_sets$cellSets[[which(keys == "TRUE")]] + group_names <- sapply(group_set$children, `[[`, "name") + + # cell ids are correct for each child + for (group_name in group_names) { + group_cells <- group_set$children[[which(group_names == group_name)]]$cellIds + expected_cells <- unname(scdata$cells_id)[scdata$TRUE. == group_name] + + expect_equal(group_cells, expected_cells) + } }) test_that("get_cell_sets adds two metadata columns", { - metadata <- list(Group1 = list("Hello", "WT2", "WT2"), Group2 = list("WT", "WT", "WTA")) - input <- mock_input(metadata) - config <- mock_config(input) - scdata_list <- mock_scdata_list(config) + metadata <- list(Group1 = list("Hello", "WT2"), Group2 = list("WT", "WT")) + config <- mock_config(metadata) + scdata <- mock_scdata(config) - cell_sets <- get_cell_sets(scdata_list, input) + cell_sets <- get_cell_sets(scdata, config) # have as keys keys <- sapply(cell_sets$cellSets, `[[`, "key") expect_setequal(keys, c("scratchpad", "sample", "Group1", "Group2")) - check_metadata_cell_ids("Group1", "Hello", c("123abc"), cell_sets) - check_metadata_cell_ids("Group1", "WT2", c("123def", "123ghi"), cell_sets) - - check_metadata_cell_ids("Group2", "WT", c("123abc", "123def"), cell_sets) - check_metadata_cell_ids("Group2", "WTA", c( "123ghi"), cell_sets) + # check that Group2 has all cells + group2_set <- cell_sets$cellSets[[which(keys == "Group2")]] + group2_cells <- group2_set$children[[1]]$cellIds + expect_equal(group2_cells, unname(scdata$cells_id)) }) test_that("get_cell_sets uses unique colors for each cell set", { - metadata <- list(Group1 = list("Hello", "WT2", "WT2"), Group2 = list("WT", "WT", "WTA")) - input <- mock_input(metadata) - config <- mock_config(input) - scdata_list <- mock_scdata_list(config) + metadata <- list(Group1 = list("Hello", "WT2"), Group2 = list("WT", "WT")) + config <- mock_config(metadata) + scdata <- mock_scdata(config) - cell_sets <- get_cell_sets(scdata_list, input) + cell_sets <- get_cell_sets(scdata, config) flat_cell_sets <- unlist(cell_sets) colors <- flat_cell_sets[grepl("[.]color", names(flat_cell_sets))] @@ -193,22 +173,22 @@ test_that("get_cell_sets uses unique colors for each cell set", { test_that("get_cell_sets without metadata matches snapshot", { - input <- mock_input() - config <- mock_config(input) - scdata_list <- mock_scdata_list(config) + config <- mock_config() + scdata <- mock_scdata(config) - cell_sets <- get_cell_sets(scdata_list, input) + cell_sets <- get_cell_sets(scdata, config) expect_snapshot(str(cell_sets)) }) test_that("get_cell_sets with two metadata groups matches snapshot", { - metadata <- list(Group1 = list("Hello", "WT2", "WT2"), Group2 = list("WT", "WT", "WT124")) - input <- mock_input(metadata) - config <- mock_config(input) - scdata_list <- mock_scdata_list(config) + metadata <- list(Group1 = list("Hello", "WT2"), Group2 = list("WT", "WT")) + + config <- mock_config(metadata) + scdata <- mock_scdata(config) - cell_sets <- get_cell_sets(scdata_list, input) + scdata@misc <- list(metadata_lookups = c(Group1 = "Group1", Group2 = "Group2")) + cell_sets <- get_cell_sets(scdata, config) expect_snapshot(str(cell_sets)) }) diff --git a/pipeline-runner/tests/testthat/test-qc-1-filter_emptydrops.R b/pipeline-runner/tests/testthat/test-qc-1-filter_emptydrops.R index 7224bd57..46bb29ec 100644 --- a/pipeline-runner/tests/testthat/test-qc-1-filter_emptydrops.R +++ b/pipeline-runner/tests/testthat/test-qc-1-filter_emptydrops.R @@ -12,106 +12,89 @@ mock_config <- function() { return(config) } - -mock_scdata_list <- function() { +mock_scdata <- function() { pbmc_raw <- read.table( file = system.file("extdata", "pbmc_raw.txt", package = "Seurat"), as.is = TRUE ) - sample_ids <- c("sample_1", "sample_2") - scdata_list <- {} - for (i in seq_along(sample_ids)) { - message(i) - scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) - n_cells <- ncol(scdata) - start_idx <- (i-1)*n_cells - end_idx <- ((i)*n_cells) - 1 - - scdata$cells_id <- start_idx:end_idx - - # add empty drops stuff - scdata$emptyDrops_FDR <- NA - scdata$emptyDrops_FDR[0:70] <- 0.009 - - # add samples - scdata$samples <- rep(sample_ids[i], each = n_cells) - # scdata_list <- Seurat::RenameCells(scdata, paste(scdata$samples, colnames(scdata), sep = "")) - scdata_list[[sample_ids[i]]] <- scdata - } - return(scdata_list) + scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) + scdata$cells_id <- 0:(ncol(scdata) - 1) + + # add empty drops stuff + scdata$emptyDrops_FDR <- NA + scdata$emptyDrops_FDR[1:70] <- 0.009 + + # add samples + scdata$samples <- rep(c("123abc", "123def"), each = 40) + scdata <- Seurat::RenameCells(scdata, paste(scdata$samples, colnames(scdata), sep = "")) + return(scdata) } test_that("Initial cells_id are correct", { - scdata_list <- mock_scdata_list() - cells_id <- generate_first_step_ids(scdata_list) - expect_equal(unique(cells_id[["sample_1"]]), 0:79) - expect_equal(unique(cells_id[["sample_2"]]), 80:159) + scdata <- mock_scdata() + cells_id <- generate_first_step_ids(scdata) + expect_equal(unique(cells_id$`123abc`), 0:39) + expect_equal(unique(cells_id$`123def`), 40:79) }) test_that("filter_emptydrops removes NA with threshold < 1", { - scdata_list <- mock_scdata_list() + scdata <- mock_scdata() config <- mock_config() - cells_id <- generate_first_step_ids(scdata_list) + cells_id <- mock_ids() - - for (sample_id in c("sample_1", "sample_2")){ - out <- filter_emptydrops(scdata_list, config, sample_id, cells_id) - expect_equal(ncol(out$data[[sample_id]]), 80) - expect_equal(length(out$new_ids[[sample_id]]), 70) - } + out <- filter_emptydrops(scdata, config, "123def", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 40) + expect_equal(length(out$new_ids$`123def`), 30) }) test_that("filter_emptydrops is sample aware", { - scdata_list <- mock_scdata_list() + scdata <- mock_scdata() config <- mock_config() - cells_id <- generate_first_step_ids(scdata_list) - - # NA (empty) drops are in sample_2 only - scdata_list[["sample_1"]]$emptyDrops_FDR <- 0.009 - out <- filter_emptydrops(scdata_list, config, "sample_2", cells_id) - expect_equal(ncol(out$data[["sample_1"]]), 80) - expect_equal(length(out$new_ids[["sample_1"]]), 80) - expect_equal(length(out$new_ids[["sample_2"]]), 70) + cells_id <- mock_ids() + + # NA (empty) drops are in 123def only + out <- filter_emptydrops(scdata, config, "123abc", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 40) + expect_equal(length(out$new_ids$`123def`), 40) }) test_that("if FDR=1 filter_emptydrops keeps everything", { - scdata_list <- mock_scdata_list() + scdata <- mock_scdata() config <- mock_config() config$filterSettings$FDR <- 1 - cells_id <- generate_first_step_ids(scdata_list) + cells_id <- mock_ids() - out <- filter_emptydrops(scdata_list, config, "sample_2", cells_id) - expect_equal(ncol(out$data[["sample_2"]]), 80) - expect_equal(length(out$new_ids[["sample_1"]]), 80) - expect_equal(length(out$new_ids[["sample_2"]]), 80) + out <- filter_emptydrops(scdata, config, "123def", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 40) + expect_equal(length(out$new_ids$`123def`), 40) }) test_that("filter_emptydrops can be disabled", { - scdata_list <- mock_scdata_list() + scdata <- mock_scdata() config <- mock_config() config$enabled <- FALSE - cells_id <- generate_first_step_ids(scdata_list) + cells_id <- mock_ids() - out <- filter_emptydrops(scdata_list, config, "sample_2", cells_id) - expect_equal(ncol(out$data[["sample_2"]]), 80) - expect_equal(length(out$new_ids[["sample_1"]]), 80) - expect_equal(length(out$new_ids[["sample_2"]]), 80) + out <- filter_emptydrops(scdata, config, "123def", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 40) + expect_equal(length(out$new_ids$`123def`), 40) }) test_that("filter_emptydrops handles missing emptyDrops_FDR", { - scdata_list <- mock_scdata_list() + scdata <- mock_scdata() config <- mock_config() - cells_id <- generate_first_step_ids(scdata_list) - - # remove emptyDrops_FDR field - scdata_list[["sample_1"]]$emptyDrops_FDR <- NULL - scdata_list[["sample_2"]]$emptyDrops_FDR <- NULL + scdata$emptyDrops_FDR <- NULL + cells_id <- mock_ids() - out <- filter_emptydrops(scdata_list, config, "sample_2", cells_id) - expect_equal(ncol(out$data[["sample_2"]]), 80) - expect_equal(length(out$new_ids[["sample_1"]]), 80) - expect_equal(length(out$new_ids[["sample_2"]]), 80) + out <- filter_emptydrops(scdata, config, "123def", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 40) + expect_equal(length(out$new_ids$`123def`), 40) }) diff --git a/pipeline-runner/tests/testthat/test-qc-2-filter_low_cellsize.R b/pipeline-runner/tests/testthat/test-qc-2-filter_low_cellsize.R index 861d98ca..0e409f5e 100644 --- a/pipeline-runner/tests/testthat/test-qc-2-filter_low_cellsize.R +++ b/pipeline-runner/tests/testthat/test-qc-2-filter_low_cellsize.R @@ -15,129 +15,120 @@ mock_scdata <- function() { as.is = TRUE ) - sample_1_id <- "123abc" - sample_2_id <- "123def" - scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) scdata$cells_id <- 0:(ncol(scdata) - 1) # add samples - scdata$samples <- rep(c(sample_1_id, sample_2_id), each = 40) + scdata$samples <- rep(c("123abc", "123def"), each = 40) scdata <- Seurat::RenameCells(scdata, paste(scdata$samples, colnames(scdata), sep = "")) - scdata$emptyDrops_FDR <- NA - - scdata_sample1 <- subset(scdata, samples == sample_1_id) - scdata_sample2 <- subset(scdata, samples == sample_2_id) - scdata_list <- list(scdata_sample1, scdata_sample2) - names(scdata_list) <- c(sample_1_id, sample_2_id) - - return(list(scdata_list, sample_1_id, sample_2_id)) + scdata$emptyDrops_FDR <- NA + return(scdata) } test_that("filter_low_cellsize removes cells", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() config <- mock_config(mcs = 10000) cells_id <- mock_ids() - out <- filter_low_cellsize(scdata_list, config, sample_1_id, cells_id) + out <- filter_low_cellsize(scdata, config, "123abc", cells_id) - expect_equal(ncol(out$data[[sample_1_id]]), ncol(scdata_list[[sample_1_id]])) - expect_lt(length(out$new_ids[[sample_1_id]]), length(cells_id[[sample_1_id]])) - - expect_equal(length(out$new_ids[[sample_2_id]]), length(cells_id[[sample_2_id]])) + expect_equal(ncol(out$data), ncol(scdata)) + expect_lt(length(out$new_ids$`123abc`), length(cells_id$`123abc`)) + expect_equal(length(out$new_ids$`123def`), length(cells_id$`123def`)) }) test_that("filter_low_cellsize filters only appropiate cells", { mcs <- 100 - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() config <- mock_config(mcs) cells_id <- mock_ids() - out <- filter_low_cellsize(scdata_list, config, sample_1_id, cells_id) - + out <- filter_low_cellsize(scdata, config, "123abc", cells_id) - data_sample_1 <- out$data[[sample_1_id]] - - barcode_names_this_sample <- colnames(data_sample_1)[data_sample_1$samples == sample_1_id] - sample_subset <- subset(data_sample_1, cells = barcode_names_this_sample) - sample_subset <- subset_ids(sample_subset, out$new_ids[[sample_1_id]]) + data <- out$data + barcode_names_this_sample <- colnames(data)[data$samples == "123abc"] + sample_subset <- subset(data, cells = barcode_names_this_sample) + sample_subset <- subset_ids(sample_subset, out$new_ids$`123abc`) expect_false(all(sample_subset$nCount_RNA <= mcs)) }) test_that("filter_low_cellsize is sample aware", { - mcs <- 200 - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + mcs <- 10000 + scdata <- mock_scdata() config <- mock_config(mcs) cells_id <- mock_ids() - out <- filter_low_cellsize(scdata_list, config, sample_1_id, cells_id) + out <- filter_low_cellsize(scdata, config, "123abc", cells_id) data <- out$data - barcode_names_this_sample <- colnames(data[[sample_1_id]]) + barcode_names_this_sample <- colnames(data)[data$samples == "123abc"] expect_equal(length(barcode_names_this_sample), 40) - barcode_names_this_sample <- colnames(data[[sample_2_id]]) + barcode_names_this_sample <- colnames(data)[data$samples == "123def"] expect_equal(length(barcode_names_this_sample), 40) - sample_1_new_ids <- out$new_ids[[sample_1_id]] - sample_2_new_ids <- out$new_ids[[sample_2_id]] + expect_lt(length(out$new_ids$`123abc`), 40) + expect_equal(length(out$new_ids$`123def`), 40) + + out <- filter_low_cellsize(scdata, config, "123def", cells_id) + data <- out$data + + barcode_names_this_sample <- colnames(data)[data$samples == "123def"] + expect_equal(length(barcode_names_this_sample), 40) - expect_lt(length(sample_1_new_ids), 40) - expect_equal(length(sample_2_new_ids), 40) + expect_equal(length(out$new_ids$`123abc`), 40) + expect_lt(length(out$new_ids$`123def`), 40) }) test_that("filter_low_cellsize works on empty data/wrong sample", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - config <- mock_config(1000000) + scdata <- mock_scdata() + config <- mock_config(100000) cells_id <- mock_ids() - out <- filter_low_cellsize(scdata_list, config, sample_1_id, cells_id) + out <- filter_low_cellsize(scdata, config, "123abc", cells_id) - out <- filter_low_cellsize(out$data, config, sample_2_id, out$new_ids) + out <- filter_low_cellsize(out$data, config, "123def", out$new_ids) new_ids <- out$new_ids - expect_equal(0, length(new_ids[[sample_1_id]])) - expect_equal(0, length(new_ids[[sample_2_id]])) + expect_equal(0, length(new_ids$`123abc`)) + expect_equal(0, length(new_ids$`123def`)) - out <- filter_low_cellsize(out$data, config, sample_1_id, new_ids) - expect_equal(0, length(new_ids[[sample_1_id]])) - expect_equal(0, length(new_ids[[sample_2_id]])) + out <- filter_low_cellsize(out$data, config, "123abc", new_ids) + expect_equal(0, length(new_ids$`123abc`)) + expect_equal(0, length(new_ids$`123def`)) - # Seurat object wasn't changed - expect_equal(ncol(out$data), 40) + expect_equal(ncol(out$data), 80) }) test_that("filter_low_cellsize works with auto", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() config <- mock_config(auto_settings = TRUE) cells_id <- mock_ids() - out <- filter_low_cellsize(scdata_list, config, sample_2_id, cells_id) + out <- filter_low_cellsize(scdata, config, "123def", cells_id) expect_false(config$filterSettings$minCellSize == out$config$filterSettings$minCellSize) - data <- out$data[[sample_2_id]] - barcode_names_this_sample <- colnames(data)[data$samples == sample_2_id] + data <- out$data + barcode_names_this_sample <- colnames(data)[data$samples == "123def"] sample_subset <- subset(data, cells = barcode_names_this_sample) - sample_subset <- subset_ids(data, out$new_ids[[sample_2_id]]) + sample_subset <- subset_ids(data, out$new_ids$`123def`) - expect_equal(ncol(data), 40) + expect_equal(ncol(out$data), 80) expect_true(all(sample_subset$nCount_RNA >= out$config$filterSettings$minCellSize)) }) test_that("filter_low_cellsize can be disabled", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() config <- mock_config(mcs = 10000, enabled = FALSE) cells_id <- mock_ids() - out <- filter_low_cellsize(scdata_list, config, sample_2_id, cells_id) + out <- filter_low_cellsize(scdata, config, "123def", cells_id) - data <- out$data - expect_equal(sum(purrr::map_int(data, ncol)), 80) - expect_equal(length(out$new_ids[[sample_1_id]]), 40) - expect_equal(length(out$new_ids[[sample_2_id]]), 40) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 40) + expect_equal(length(out$new_ids$`123def`), 40) }) - diff --git a/pipeline-runner/tests/testthat/test-qc-3-filter_high_mito.R b/pipeline-runner/tests/testthat/test-qc-3-filter_high_mito.R index f1c361d9..3a3fa661 100644 --- a/pipeline-runner/tests/testthat/test-qc-3-filter_high_mito.R +++ b/pipeline-runner/tests/testthat/test-qc-3-filter_high_mito.R @@ -13,15 +13,13 @@ mock_config <- function(max_fraction = 0.1) { return(config) } + mock_scdata <- function() { pbmc_raw <- read.table( file = system.file("extdata", "pbmc_raw.txt", package = "Seurat"), as.is = TRUE ) - sample_1_id <- "123abc" - sample_2_id <- "123def" - scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) scdata$cells_id <- 0:(ncol(scdata) - 1) @@ -30,111 +28,103 @@ mock_scdata <- function() { scdata@meta.data$percent.mt[1:10] <- 11 # add samples - scdata$samples <- rep(c(sample_1_id, sample_2_id), each = 40) + scdata$samples <- rep(c("123abc", "123def"), each = 40) scdata <- Seurat::RenameCells(scdata, paste(scdata$samples, colnames(scdata), sep = "")) - - scdata_sample1 <- subset(scdata, samples == sample_1_id) - scdata_sample2 <- subset(scdata, samples == sample_2_id) - - scdata_list <- list(scdata_sample1, scdata_sample2) - names(scdata_list) <- c(sample_1_id, sample_2_id) - - return(list(scdata_list, sample_1_id, sample_2_id)) + return(scdata) } get_threshold <- function(config) config$filterSettings$methodSettings$absoluteThreshold$maxFraction test_that("filter_high_mito filters based on threshold", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() # should filter first 10 cells config <- mock_config(0.1) config$auto <- FALSE - out <- filter_high_mito(scdata_list, config, sample_1_id, cells_id) - expect_equal(ncol(out$data[[sample_1_id]]), 40) - expect_equal(length(out$new_ids[[sample_1_id]]), 30) + out <- filter_high_mito(scdata, config, "123abc", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 30) # should filter all cells in first sample config <- mock_config(0.01) config$auto <- FALSE - out <- filter_high_mito(scdata_list, config, sample_1_id, cells_id) - expect_equal(ncol(out$data[[sample_1_id]]), 40) - expect_equal(length(out$new_ids[[sample_1_id]]), 0) + out <- filter_high_mito(scdata, config, "123abc", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 0) }) test_that("filter_high_mito can be disabled", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() - nstart <- ncol(scdata_list[[sample_1_id]]) + nstart <- ncol(scdata) # should filter first 10 cells config <- mock_config(0.1) config$enabled <- FALSE - out <- filter_high_mito(scdata_list, config, sample_1_id, cells_id) - expect_equal(ncol(out$data[[sample_1_id]]), nstart) - expect_equal(out$new_ids[[sample_1_id]], 0:39) - expect_equal(out$new_ids[[sample_2_id]], 40:79) + out <- filter_high_mito(scdata, config, "123abc", cells_id) + expect_equal(ncol(out$data), nstart) + expect_equal(out$new_ids$`123abc`, 0:39) + expect_equal(out$new_ids$`123def`, 40:79) }) test_that("filter_high_mito is sample specific", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() + nstart <- ncol(scdata) # should filter first 10 cells of 123abc only config <- mock_config(0.1) config$auto <- FALSE - out1 <- filter_high_mito(scdata_list, config, sample_1_id, cells_id) - out2 <- filter_high_mito(scdata_list, config, sample_2_id, cells_id) + out1 <- filter_high_mito(scdata, config, "123abc", cells_id) + out2 <- filter_high_mito(scdata, config, "123def", cells_id) - expect_equal(ncol(out1$data[[sample_1_id]]), 40) - expect_equal(length(out1$new_ids[[sample_1_id]]), 30) - expect_equal(length(out1$new_ids[[sample_2_id]]), 40) - expect_equal(ncol(out2$data[[sample_2_id]]), 40) - expect_equal(length(out2$new_ids[[sample_1_id]]), 40) - expect_equal(length(out2$new_ids[[sample_2_id]]), 40) + expect_equal(ncol(out1$data), 80) + expect_equal(length(out1$new_ids$`123abc`), 30) + expect_equal(length(out1$new_ids$`123def`), 40) + expect_equal(ncol(out2$data), 80) + expect_equal(length(out2$new_ids$`123abc`), 40) + expect_equal(length(out2$new_ids$`123def`), 40) }) test_that("filter_high_mito can be set to auto", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() + nstart <- ncol(scdata) # would filter all 40 cells in first sample unless auto config <- mock_config(0.01) - out <- filter_high_mito(scdata_list, config, sample_1_id, cells_id) + out <- filter_high_mito(scdata, config, "123abc", cells_id) # check that threshold was updated expect_true(get_threshold(out$config) > get_threshold(config)) # check that updated threshold was used - nkeep <- length(unlist(out$new_ids[[sample_1_id]])) - expected <- sum(scdata_list[[sample_1_id]]$percent.mt <= get_threshold(out$config) * 100) + nkeep <- length(unlist(out$new_ids)) + expected <- sum(scdata$percent.mt <= get_threshold(out$config) * 100) expect_equal(nkeep, expected) # didn't subset original data - expect_equal(ncol(out$data[[sample_1_id]]), 40) + expect_equal(ncol(out$data), 80) }) test_that("data without percent.mt outliers uses the max percentage as the threshold", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() set.seed(gem2s$random.seed) - - scdata_sample_1 <- scdata_list[[sample_1_id]] - - scdata_list$percent.mt <- rnorm(ncol(scdata_sample_1), 6) + scdata$percent.mt <- rnorm(ncol(scdata), 6) cells_id <- mock_ids() config <- mock_config() - out <- filter_high_mito(scdata_list, config, sample_1_id, cells_id) + out <- filter_high_mito(scdata, config, "123abc", cells_id) # nothing filtered - expect_length(unlist(out$new_ids[[sample_1_id]]), ncol(scdata_sample_1)) + expect_length(unlist(out$new_ids), ncol(scdata)) # threshold equals max for sample - expect_equal(get_threshold(out$config), max(scdata_sample_1$percent.mt / 100)) + expect_equal(get_threshold(out$config), max(scdata$percent.mt[scdata$samples == "123abc"]) / 100) }) diff --git a/pipeline-runner/tests/testthat/test-qc-4-filter_gene_umi_outlier.R b/pipeline-runner/tests/testthat/test-qc-4-filter_gene_umi_outlier.R index 45476767..52e3d64f 100644 --- a/pipeline-runner/tests/testthat/test-qc-4-filter_gene_umi_outlier.R +++ b/pipeline-runner/tests/testthat/test-qc-4-filter_gene_umi_outlier.R @@ -3,60 +3,46 @@ mock_ids <- function() { } mock_config <- function() { - config <- list( - auto = TRUE, - enabled = TRUE, - filterSettings = list( - regressionType = "linear", - regressionTypeSettings = list( - linear = list(p.level = 0.1) - ) + config <- list( + auto = TRUE, + enabled = TRUE, + filterSettings = list( + regressionType = 'linear', + regressionTypeSettings = list( + linear = list(p.level = 0.1) + ) + ) ) - ) return(config) } -mock_scdata <- function(with_outlier = FALSE) { - pbmc_raw <- read.table( - file = system.file("extdata", "pbmc_raw.txt", package = "Seurat"), - as.is = TRUE - ) - - if (with_outlier) { - pbmc_raw[, 1] <- 0 - pbmc_raw[1:10, 1] <- 100 - } - - sample_1_id <- "123abc" - sample_2_id <- "123def" - scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) - scdata$cells_id <- 0:(ncol(scdata) - 1) - - # add samples - scdata$samples <- rep(c(sample_1_id, sample_2_id), each = 40) - scdata <- Seurat::RenameCells(scdata, paste(scdata$samples, colnames(scdata), sep = "")) - scdata$emptyDrops_FDR <- NA +mock_scdata <- function(with_outlier = FALSE) { + pbmc_raw <- read.table( + file = system.file("extdata", "pbmc_raw.txt", package = "Seurat"), + as.is = TRUE) - scdata_sample1 <- subset(scdata, samples == sample_1_id) - scdata_sample2 <- subset(scdata, samples == sample_2_id) + if (with_outlier) { + pbmc_raw[, 1] <- 0 + pbmc_raw[1:10, 1] <- 100 + } - scdata_list <- list(scdata_sample1, scdata_sample2) - names(scdata_list) <- c(sample_1_id, sample_2_id) + scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) + scdata$cells_id <- 0:(ncol(scdata)-1) - return(list(scdata_list, sample_1_id, sample_2_id)) + # add samples + scdata$samples <- rep(c('123abc', '123def'), each = 40) + return(scdata) } - test_that("filter_gene_umi_outlier updates linear p.level in config if auto", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - config <- mock_config() - cells_id <- mock_ids() - - config$filterSettings$regressionTypeSettings$linear$p.level <- 1 - out <- filter_gene_umi_outlier(scdata_list, config, sample_2_id, cells_id) - new <- out$config$filterSettings$regressionTypeSettings$linear$p.level + scdata <- mock_scdata() + config <- mock_config() + cells_id <- mock_ids() + config$filterSettings$regressionTypeSettings$linear$p.level <- 1 + out <- filter_gene_umi_outlier(scdata, config, '123def', cells_id) + new <- out$config$filterSettings$regressionTypeSettings$linear$p.level expect_lt(new, 1) }) @@ -64,122 +50,151 @@ test_that("filter_gene_umi_outlier updates linear p.level in config if auto", { test_that("filter_gene_umi_outlier is sample specific", { - # one outlier in first sample - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(with_outlier = T) - config <- mock_config() - cells_id <- mock_ids() + # one outlier in first sample + scdata <- mock_scdata(with_outlier = TRUE) + config <- mock_config() + cells_id <- mock_ids() + + cell_ids1 <- scdata$cells_id[scdata$samples == '123abc'] + cell_ids2 <- scdata$cells_id[scdata$samples == '123def'] - cell_ids1 <- scdata_list[[sample_1_id]]$cells_id - cell_ids2 <- scdata_list[[sample_2_id]]$cells_id - out1 <- filter_gene_umi_outlier(scdata_list, config, sample_1_id, cells_id) - out2 <- filter_gene_umi_outlier(scdata_list, config, sample_2_id, cells_id) + out1 <- filter_gene_umi_outlier(scdata, config, '123abc', cells_id) + out2 <- filter_gene_umi_outlier(scdata, config, '123def', cells_id) - # filtered something - expect_lt(length(out1$new_ids[[sample_1_id]]), length(cells_id[[sample_1_id]])) - expect_lt(length(out2$new_ids[[sample_2_id]]), length(cells_id[[sample_2_id]])) + # filtered something + expect_lt(length(out1$new_ids$`123abc`), length(cells_id$`123abc`)) + expect_lt(length(out2$new_ids$`123def`), length(cells_id$`123def`)) - # didn't filter other sample - expect_true(all(cell_ids1 %in% out2$new_ids[[sample_1_id]])) - expect_true(all(cell_ids2 %in% out1$new_ids[[sample_2_id]])) + # didn't filter other sample + expect_true(all(cell_ids1 %in% out2$new_ids$`123abc`)) + expect_true(all(cell_ids2 %in% out1$new_ids$`123def`)) }) test_that("filter_gene_umi_outlier can filter cells", { - # single outlier in single sample - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(with_outlier = T) - config <- mock_config() - cells_id <- mock_ids() - nstart <- ncol(scdata_list[[sample_1_id]]) - out <- - filter_gene_umi_outlier(scdata_list, config, sample_1_id, cells_id) - expect_lt(length(unlist(out$new_ids[[sample_1_id]])), nstart) + # single outlier in single sample + scdata <- mock_scdata(with_outlier = TRUE) + config <- mock_config() + cells_id <- mock_ids() + nstart <- ncol(scdata) + scdata$samples <- '123abc' + + out <- filter_gene_umi_outlier(scdata, config, '123abc', cells_id) + expect_lt(length(unlist(out$new_ids)), nstart) }) +test_that("filter_gene_umi_outlier uses linear if regressionType is gam (legacy)", { + + # single outlier in single sample + scdata <- mock_scdata(with_outlier = TRUE) + config <- mock_config() + cells_id <- mock_ids() + nstart <- ncol(scdata) + scdata$samples <- '123abc' + + out1 <- filter_gene_umi_outlier(scdata, config, '123abc', cells_id) + expect_lt(length(unlist(out1$new_ids)), nstart) + + + config$filterSettings$regressionType <- 'gam' + out2 <- filter_gene_umi_outlier(scdata, config, '123abc', cells_id) + expect_identical(out1$new_ids, out2$new_ids) +}) test_that("filter_gene_umi_outlier can be disabled", { - # single outlier in single sample - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(with_outlier = T) - config <- mock_config() - cells_id <- mock_ids() - nstart <- ncol(scdata_list[[sample_1_id]]) + # single outlier in single sample + scdata <- mock_scdata(with_outlier = TRUE) + config <- mock_config() + nstart <- ncol(scdata) + cells_id <- mock_ids() - # disabled - config$enabled <- FALSE + # disabled + config$enabled <- FALSE - out <- filter_gene_umi_outlier(scdata_list, config, sample_1_id, cells_id) - expect_equal(ncol(out$data[[sample_1_id]]), nstart) - expect_equal(out$new_ids[[sample_1_id]], 0:39) - expect_equal(out$new_ids[[sample_2_id]], 40:79) + out <- filter_gene_umi_outlier(scdata, config, '123abc',cells_id) + expect_equal(ncol(out$data), nstart) + expect_equal(out$new_ids$`123abc`,0:39) + expect_equal(out$new_ids$`123def`,40:79) }) test_that("filter_gene_umi_outlier return the appropriate plot data", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - config <- mock_config() + scdata <- mock_scdata() cells_id <- mock_ids() + config <- mock_config() - out <- filter_gene_umi_outlier(scdata_list, config, sample_2_id, cells_id) - plot_data <- out$plotData[[1]] + out <- filter_gene_umi_outlier(scdata, config, '123def',cells_id) + plot_data <- out$plotData[[1]] - # points and lines data - expect_equal(names(plot_data), c("pointsData", "linesData")) + # points and lines data + expect_equal(names(plot_data), c('pointsData', 'linesData')) - # one value per cell - points_data <- plot_data$pointsData - lines_data <- plot_data$linesData - expect_equal(length(points_data), ncol(scdata_list[[sample_2_id]])) + # one value per cell + points_data <- plot_data$pointsData + lines_data <- plot_data$linesData + expect_equal(length(points_data), sum(scdata$samples == '123def')) - # has the right names - expect_equal(names(points_data[[1]]), c("log_molecules", "log_genes")) - expect_equal(names(lines_data[[1]]), c("log_molecules", "lower_cutoff", "upper_cutoff")) + # has the right names + expect_equal(names(points_data[[1]]), c('log_molecules', 'log_genes')) + expect_equal(names(lines_data[[1]]), c('log_molecules', 'lower_cutoff', 'upper_cutoff')) }) +test_that("filter_gene_umi_outlier skips if no barcodes for this sample", { + scdata <- mock_scdata(with_outlier = TRUE) + config <- mock_config() + cells_id <- mock_ids() + out <- filter_gene_umi_outlier(scdata, config, 'not a sample',cells_id) + + expect_equal(ncol(out$data), ncol(scdata)) + expect_equal(out$new_ids$`123abc`,0:39) + expect_equal(out$new_ids$`123def`,40:79) +}) test_that("filter_gene_umi_outlier gives different results with spline fit", { # single outlier in single sample - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(with_outlier = T) - config <- mock_config() + scdata <- mock_scdata(with_outlier = TRUE) cells_id <- mock_ids() - nstart <- ncol(scdata_list[[sample_1_id]]) + config <- mock_config() + nstart <- ncol(scdata) + scdata$samples <- '123abc' - out1 <- filter_gene_umi_outlier(scdata_list, config, sample_1_id, cells_id) - expect_equal(ncol(out1$data[[sample_1_id]]), nstart) - expect_lt(length(out1$new_ids[[sample_1_id]]), length(cells_id[[sample_1_id]])) + out1 <- filter_gene_umi_outlier(scdata, config, '123abc',cells_id) + expect_equal(ncol(out1$data), nstart) + expect_lt(length(out1$new_ids$`123abc`),length(cells_id$`123abc`)) - config$filterSettings$regressionType <- "spline" + config$filterSettings$regressionType <- 'spline' - out2 <- filter_gene_umi_outlier(scdata_list, config, sample_1_id, cells_id) + out2 <- filter_gene_umi_outlier(scdata, config, '123abc',cells_id) expect_true(!identical(out1$plotData, out2$plotData)) - expect_true(!identical(out1$new_ids[[sample_1_id]], out2$new_ids[[sample_1_id]])) + expect_true(!identical(out1$new_ids$`123abc`, out2$new_ids$`123abc`)) }) test_that("Gene UMI filter works if input is a a float-interpretable string", { - # single outlier in single sample - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(with_outlier = T) - config <- mock_config() + scdata <- mock_scdata() cells_id <- mock_ids() - nstart <- ncol(scdata_list[[sample_2_id]]) + config <- mock_config() + nstart <- ncol(scdata) config$auto <- FALSE - out_number <- filter_gene_umi_outlier(scdata_list, config, sample_2_id, cells_id) + out_number <- filter_gene_umi_outlier(scdata, config, "123def", cells_id) config$filterSettings$regressionTypeSettings$linear$p.level <- "0.1" - out_string <- filter_gene_umi_outlier(scdata_list, config, sample_2_id, cells_id) + out_string <- filter_gene_umi_outlier(scdata, config, '123def',cells_id) - expect_equal(ncol(out_number$data[[sample_2_id]]), nstart) - expect_equal(out_string$new_ids[[sample_2_id]], out_number$new_ids[[sample_2_id]]) + expect_equal(ncol(out_number$data), nstart) + expect_equal(out_string$new_ids$`123def`, out_number$new_ids$`123def`) }) test_that("Gene UMI filter throws error if input is a non float-interpretable string", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - config <- mock_config() - config$auto <- FALSE - config$filterSettings$regressionTypeSettings$linear$p.level <- - "asd" + scdata <- mock_scdata() + config <- mock_config() + config$auto <- FALSE + config$filterSettings$regressionTypeSettings$linear$p.level <- "asd" - expect_error(filter_gene_umi_outlier(scdata, config, sample_2_id, cells_id)) + expect_error(filter_gene_umi_outlier(scdata, config, "123def", cells_id)) }) + diff --git a/pipeline-runner/tests/testthat/test-qc-5-filter_doublets.R b/pipeline-runner/tests/testthat/test-qc-5-filter_doublets.R index 045afceb..21ac5ca0 100644 --- a/pipeline-runner/tests/testthat/test-qc-5-filter_doublets.R +++ b/pipeline-runner/tests/testthat/test-qc-5-filter_doublets.R @@ -9,15 +9,13 @@ mock_config <- function(thr = 0.1, auto = FALSE, enabled = TRUE) { return(config) } + mock_scdata <- function() { pbmc_raw <- read.table( file = system.file("extdata", "pbmc_raw.txt", package = "Seurat"), as.is = TRUE ) - sample_1_id <- "123abc" - sample_2_id <- "123def" - scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) scdata$cells_id <- 0:(ncol(scdata) - 1) @@ -29,70 +27,64 @@ mock_scdata <- function() { scdata@meta.data$doublet_class[1:10] <- rep("doublet", 10) # add samples - scdata$samples <- rep(c(sample_1_id, sample_2_id), each = 40) + scdata$samples <- rep(c("123abc", "123def"), each = 40) scdata <- Seurat::RenameCells(scdata, paste(scdata$samples, colnames(scdata), sep = "")) + return(scdata) +} - scdata_sample1 <- subset(scdata, samples == sample_1_id) - scdata_sample2 <- subset(scdata, samples == sample_2_id) - - scdata_list <- list(scdata_sample1, scdata_sample2) - names(scdata_list) <- c(sample_1_id, sample_2_id) - return(list(scdata_list, sample_1_id, sample_2_id)) -} test_that("filter_doublets filters based on threshold", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() # should filter first 10 cells config <- mock_config(0.5) - out <- filter_doublets(scdata_list, config, sample_1_id, cells_id) - - expect_equal(ncol(out$data[[sample_1_id]]), 40) - expect_equal(out$new_ids[[sample_1_id]], 10:39) + out <- filter_doublets(scdata, config, "123abc", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(out$new_ids$`123abc`, 10:39) }) test_that("filter_doublets is sample aware", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() - scdata_list[[sample_2_id]]@meta.data$doublet_scores[31:40] <- 0.9 + scdata@meta.data$doublet_scores[71:80] <- 0.9 config <- mock_config(0.5) - out <- filter_doublets(scdata_list, config, sample_1_id, cells_id) - expect_equal(ncol(out$data[[sample_1_id]]), 40) - expect_equal(out$new_ids[[sample_1_id]], 10:39) - expect_equal(out$new_ids[[sample_2_id]], 40:79) + out <- filter_doublets(scdata, config, "123abc", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(out$new_ids$`123abc`, 10:39) + expect_equal(out$new_ids$`123def`, 40:79) - out <- filter_doublets(out$data, config, sample_2_id, out$new_ids) - expect_equal(ncol(out$data[[sample_2_id]]), 40) - expect_equal(out$new_ids[[sample_1_id]], 10:39) - expect_equal(out$new_ids[[sample_2_id]], 40:69) + out <- filter_doublets(out$data, config, "123def", out$new_ids) + expect_equal(ncol(out$data), 80) + expect_equal(out$new_ids$`123abc`, 10:39) + expect_equal(out$new_ids$`123def`, 40:69) }) test_that("filter_doublets filters works with auto", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() # should filter first 10 cells config <- mock_config(0.001, auto = TRUE) - out <- filter_doublets(scdata_list, config, sample_1_id, cells_id) - expect_equal(out$new_ids[[sample_1_id]], 10:39) + out <- filter_doublets(scdata, config, "123abc", cells_id) + expect_equal(out$new_ids$`123abc`, 10:39) config <- mock_config(0.001, auto = FALSE) - out <- filter_doublets(scdata_list, config, sample_1_id, cells_id) - expect_equal(length(out$new_ids[[sample_1_id]]), 0) + out <- filter_doublets(scdata, config, "123abc", cells_id) + expect_equal(length(out$new_ids$`123abc`), 0) }) test_that("filter_doublets can be disabled", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() config <- mock_config(0.5, enabled = FALSE) - out <- filter_doublets(scdata_list, config, sample_1_id, cells_id) - expect_equal(out$new_ids[[sample_1_id]], 0:39) - expect_equal(out$new_ids[[sample_2_id]], 40:79) + out <- filter_doublets(scdata, config, "123abc", cells_id) + expect_equal(out$new_ids$`123abc`, 0:39) + expect_equal(out$new_ids$`123def`, 40:79) config <- mock_config(0.5, enabled = TRUE) - out <- filter_doublets(scdata_list, config, sample_1_id, cells_id) - expect_equal(out$new_ids[[sample_1_id]], 10:39) - expect_equal(out$new_ids[[sample_2_id]], 40:79) + out <- filter_doublets(scdata, config, "123abc", cells_id) + expect_equal(out$new_ids$`123abc`, 10:39) + expect_equal(out$new_ids$`123def`, 40:79) }) diff --git a/pipeline-runner/tests/testthat/test-qc-6-integrate_scdata.R b/pipeline-runner/tests/testthat/test-qc-6-integrate_scdata.R index 07d6cf81..6612458c 100644 --- a/pipeline-runner/tests/testthat/test-qc-6-integrate_scdata.R +++ b/pipeline-runner/tests/testthat/test-qc-6-integrate_scdata.R @@ -1,43 +1,5 @@ human_cc_genes <- cc_genes[["human"]] -mock_prev_out <- function(samples = "sample_a", counts = NULL) { - if (is.null(counts)) { - counts <- DropletUtils:::simCounts() - colnames(counts) <- paste0("cell", seq_len(ncol(counts))) - } - - eout <- DropletUtils::emptyDrops(counts) - - counts_list <- list() - edrops <- list() - doublet_scores <- list() - - for (sample in samples) { - counts_list[[sample]] <- counts - edrops[[sample]] <- eout - doublet_scores[[sample]] <- mock_doublet_scores(counts) - } - - annot <- data.frame(name = row.names(counts), input = row.names(counts)) - - # as passed to create_seurat - prev_out <- list( - counts_list = counts_list, - edrops = edrops, - doublet_scores = doublet_scores, - annot = annot, - config = list(name = "project name") - ) - - # call create_seurat to get prev_out to pass to prepare_experiment - prev_out <- create_seurat(NULL, NULL, prev_out)$output - - - prev_out$scdata_list <- add_metadata_to_samples(prev_out$scdata_list, annot = annot, experiment_id = "expid") - - return(prev_out) -} - mock_scdata <- function(rename_genes = c(), n_rep = 1) { pbmc_raw <- read.table( file = system.file("extdata", "pbmc_raw.txt", package = "Seurat"), @@ -52,33 +14,12 @@ mock_scdata <- function(rename_genes = c(), n_rep = 1) { rownames(pbmc_raw)[some_genes] <- rename_genes } - sample_1_id <- "123abc" - sample_2_id <- "123def" - scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) # add samples - scdata$samples <- rep(c(sample_1_id, sample_2_id), each = ncol(scdata) / 2) + scdata$samples <- rep(c("123abc", "123def"), each = ncol(scdata) / 2) scdata$cells_id <- 0:(ncol(scdata) - 1) - scdata@misc$gene_annotations <- data.frame(input = rownames(scdata), name = paste("SYMBOL -", rownames(scdata))) - rownames(scdata@misc$gene_annotations) <- rownames(scdata) - - scdata_sample1 <- subset(scdata, samples == sample_1_id) - scdata_sample2 <- subset(scdata, samples == sample_2_id) - - - scdata_list <- list(scdata_sample1, scdata_sample2) - names(scdata_list) <- c(sample_1_id, sample_2_id) - - return(list(scdata_list, sample_1_id, sample_2_id)) - -} - -mock_ids <- function() { - return(list("123abc" = 0:39, "123def" = 40:79)) -} - -scdata_preprocessing <- function(scdata) { + scdata@misc$gene_annotations$input <- rownames(scdata) # scale and PCA scdata <- Seurat::NormalizeData(scdata, normalization.method = "LogNormalize", verbose = FALSE) @@ -88,101 +29,79 @@ scdata_preprocessing <- function(scdata) { scdata@misc[["active.reduction"]] <- "pca" return(scdata) - -} - -mock_doublet_scores <- function(counts) { - doublet_scores <- runif(ncol(counts)) - doublet_class <- ifelse(doublet_scores < 0.8, "singlet", "doublet") - - data.frame( - row.names = colnames(counts), - barcodes = colnames(counts), - doublet_class = doublet_class, - doublet_scores = doublet_scores - ) } test_that("Integrate scdata works", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - cells_id <- mock_ids() + scdata <- mock_scdata() + cells_id <- 0:79 config <- list( dimensionalityReduction = list(numPCs = 2), dataIntegration = list(method = "harmony", methodSettings = list(harmony = list(numGenes = 10, normalisation = "logNormalize"))) ) - integrated_scdata <- suppressWarnings(integrate_scdata(scdata_list, config, "", cells_id, task_name = "dataIntegration"))$data - expect_s4_class(integrated_scdata, "Seurat") - expect_equal(ncol(integrated_scdata), 80) + scdata <- suppressWarnings(integrate_scdata(scdata, config, "", cells_id, task_name = "dataIntegration"))$data + expect_s4_class(scdata, "Seurat") + expect_equal(ncol(scdata), 80) }) test_that("Integrate scdata filters out cells ids", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - cells_id <- mock_ids() - cells_id[[sample_1_id]] <- cells_id[[sample_1_id]][-c(23:31)] - cells_id[[sample_2_id]] <- cells_id[[sample_2_id]][-c(40:47)] - + scdata <- mock_scdata() + cells_id <- 0:40 config <- list( dimensionalityReduction = list(numPCs = 2), dataIntegration = list(method = "harmony", methodSettings = list(harmony = list(numGenes = 10, normalisation = "logNormalize"))) ) - scdata <- suppressWarnings(integrate_scdata(scdata_list, config, "", cells_id, task_name = "dataIntegration"))$data + scdata <- suppressWarnings(integrate_scdata(scdata, config, "", cells_id, task_name = "dataIntegration"))$data expect_lt(ncol(scdata), 80) }) test_that("harmony integration works", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata() config <- list( dimensionalityReduction = list(numPCs = 2), dataIntegration = list(method = "harmony", methodSettings = list(harmony = list(numGenes = 10, normalisation = "logNormalize"))) ) - integrated_scdata <- suppressWarnings(run_dataIntegration(merged_scdata, config)) - expect_s4_class(integrated_scdata, "Seurat") + scdata <- suppressWarnings(run_dataIntegration(scdata, config)) + expect_s4_class(scdata, "Seurat") }) test_that("SeuratV4 integration doesnt error out with small dataset", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata() config <- list( dimensionalityReduction = list(numPCs = 2, method = "rpca"), dataIntegration = list(method = "seuratv4", methodSettings = list(seuratv4 = list(numGenes = 1000, normalisation = "logNormalize"))) ) - integrated_scdata <- suppressWarnings(run_dataIntegration(merged_scdata, config)) - expect_s4_class(integrated_scdata, "Seurat") + scdata <- suppressWarnings(run_dataIntegration(scdata, config)) + expect_s4_class(scdata, "Seurat") }) test_that("Unisample integration works", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata() config <- list( dimensionalityReduction = list(numPCs = 2), dataIntegration = list(method = "unisample", methodSettings = list(unisample = list(numGenes = 1000, normalisation = "logNormalize"))) ) - integrated_scdata <- suppressWarnings(run_dataIntegration(merged_scdata, config)) - expect_s4_class(integrated_scdata, "Seurat") + scdata <- suppressWarnings(run_dataIntegration(scdata, config)) + expect_s4_class(scdata, "Seurat") }) test_that("FastMNN is not working", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata() config <- list( dimensionalityReduction = list(numPCs = 2), dataIntegration = list(method = "fastmnn", methodSettings = list(fastmnn = list(numGenes = 1000, normalisation = "logNormalize"))) ) - expect_error(suppressWarnings(run_dataIntegration(merged_scdata, config))) + expect_error(suppressWarnings(run_dataIntegration(scdata, config))) }) test_that("numPCs estimation works", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - merged_scdata <- create_scdata(scdata_list) - merged_scdata <- suppressWarnings(scdata_preprocessing(merged_scdata)) - npcs <- get_npcs(merged_scdata) + scdata <- suppressWarnings(mock_scdata()) + npcs <- get_npcs(scdata) expect_lte(npcs, 30) }) @@ -190,9 +109,8 @@ test_that("numPCs estimation works", { test_that("build_cc_gene_list correctly makes the list of cc genes when there are matches", { n_rename <- 10 some_cc_genes <- sample(human_cc_genes$symbol, n_rename) - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(rename_genes = some_cc_genes) - merged_scdata <- create_scdata(scdata_list) - all_genes <- merged_scdata@misc$gene_annotations$input + scdata <- suppressWarnings(mock_scdata(rename_genes = some_cc_genes)) + all_genes <- scdata@misc$gene_annotations$input expected_res <- match(some_cc_genes, all_genes) res <- build_cc_gene_list(all_genes) @@ -202,9 +120,8 @@ test_that("build_cc_gene_list correctly makes the list of cc genes when there ar test_that("build_cc_gene_list returns empty int vector when there aren't matches", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - merged_scdata <- create_scdata(scdata_list) - all_genes <- merged_scdata@misc$gene_annotations$input + scdata <- suppressWarnings(mock_scdata()) + all_genes <- scdata@misc$gene_annotations$input # empty integer vector expected_res <- integer() @@ -217,9 +134,8 @@ test_that("build_cc_gene_list returns empty int vector when there aren't matches test_that("list_exclude_genes adds custom genes to exclusion", { n_rename <- 10 some_cc_genes <- sample(human_cc_genes$symbol, n_rename) - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(rename_genes = some_cc_genes) - merged_scdata <- create_scdata(scdata_list) - all_genes <- merged_scdata@misc$gene_annotations$input + scdata <- suppressWarnings(mock_scdata(rename_genes = some_cc_genes)) + all_genes <- scdata@misc$gene_annotations$input exclude_custom <- sample(setdiff(all_genes, some_cc_genes), 7) exclude_custom_indices <- match(exclude_custom, all_genes) @@ -234,132 +150,108 @@ test_that("list_exclude_genes adds custom genes to exclusion", { test_that("remove_genes removes the correct genes when there are genes to remove", { n_rename <- 10 some_cc_genes <- sample(human_cc_genes$symbol, n_rename) - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(rename_genes = some_cc_genes) - merged_scdata <- create_scdata(scdata_list) - all_genes <- merged_scdata@misc$gene_annotations$input + scdata <- suppressWarnings(mock_scdata(rename_genes = some_cc_genes)) + all_genes <- scdata@misc$gene_annotations$input - res <- remove_genes(merged_scdata, exclude_groups = list("cellCycle")) + res <- remove_genes(scdata, exclude_groups = list("cellCycle")) # only cc genes - expect_equal(nrow(res), nrow(merged_scdata) - 10) + expect_equal(nrow(res), nrow(scdata) - 10) expect_false(any(some_cc_genes %in% rownames(res))) exclude_custom <- sample(setdiff(all_genes, some_cc_genes), 7) # cc genes and custom - res <- remove_genes(merged_scdata, exclude_groups = list("cellCycle"), exclude_custom) + res <- remove_genes(scdata, exclude_groups = list("cellCycle"), exclude_custom) - expect_equal(nrow(res), nrow(merged_scdata) - 17) + expect_equal(nrow(res), nrow(scdata) - 17) expect_false(any(c(some_cc_genes, exclude_custom) %in% rownames(res))) }) test_that("remove_genes doesn't modify the object when there are no matches", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - merged_scdata <- create_scdata(scdata_list) + scdata <- suppressWarnings(mock_scdata()) + # empty integer vector - res <- remove_genes(merged_scdata, exclude_groups = "cellCycle") + res <- remove_genes(scdata, exclude_groups = "cellCycle") - expect_equal(res, merged_scdata) + expect_equal(res, scdata) }) test_that("SeuratV4 integration works", { # mock a bigger dataset to run Seurat v4 integration without skipping it - c(scdata_list, sample_1_id, sample_2_id) %<-% suppressWarnings(mock_scdata(n_rep = 3)) - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata(n_rep = 3) - merged_scdata <- suppressWarnings(scdata_preprocessing(merged_scdata)) - npcs <- get_npcs(merged_scdata) + npcs <- get_npcs(scdata) config <- list( dimensionalityReduction = list(numPCs = npcs, method = "rpca"), dataIntegration = list(method = "seuratv4", methodSettings = list(seuratv4 = list(numGenes = 1000, normalisation = "logNormalize"))) ) - merged_scdata <- suppressWarnings(run_dataIntegration(merged_scdata, config)) - expect_s4_class(merged_scdata, "Seurat") + scdata <- suppressWarnings(run_dataIntegration(scdata, config)) + expect_s4_class(scdata, "Seurat") }) test_that("PCA is computed when RPCA method is selected within SeuratV4 integration", { # mock a bigger dataset to run Seurat v4 integration without skipping it - c(scdata_list, sample_1_id, sample_2_id) %<-% suppressWarnings(mock_scdata(n_rep = 3)) - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata(n_rep = 3) - merged_scdata <- suppressWarnings(scdata_preprocessing(merged_scdata)) - npcs <- get_npcs(merged_scdata) + npcs <- get_npcs(scdata) config <- list( dimensionalityReduction = list(numPCs = npcs, method = "rpca"), dataIntegration = list(method = "seuratv4", methodSettings = list(seuratv4 = list(numGenes = 1000, normalisation = "logNormalize"))) ) - expect_message(run_dataIntegration(merged_scdata, config), "Running PCA") + expect_message(run_dataIntegration(scdata, config), "Running PCA") }) test_that("PCA is not computed when CCA method is selected within SeuratV4 integration", { # mock a bigger dataset to run Seurat v4 integration without skipping it - c(scdata_list, sample_1_id, sample_2_id) %<-% suppressWarnings(mock_scdata(n_rep = 3)) - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata(n_rep = 3) - merged_scdata <- suppressWarnings(scdata_preprocessing(merged_scdata)) - npcs <- get_npcs(merged_scdata) + npcs <- get_npcs(scdata) config <- list( dimensionalityReduction = list(numPCs = npcs, method = "cca"), dataIntegration = list(method = "seuratv4", methodSettings = list(seuratv4 = list(numGenes = 1000, normalisation = "logNormalize"))) ) - expect_message(run_dataIntegration(merged_scdata, config), "PCA is not running .*") + expect_message(run_dataIntegration(scdata, config), "PCA is not running .*") }) test_that("SeuratV4 integration finds integration anchors using RPCA method, if method in config is RPCA", { # mock a bigger dataset to run Seurat v4 integration without skipping it - c(scdata_list, sample_1_id, sample_2_id) %<-% suppressWarnings(mock_scdata(n_rep = 3)) - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata(n_rep = 3) - merged_scdata <- suppressWarnings(scdata_preprocessing(merged_scdata)) - npcs <- get_npcs(merged_scdata) + npcs <- get_npcs(scdata) config <- list( dimensionalityReduction = list(numPCs = npcs, method = "rpca"), dataIntegration = list(method = "seuratv4", methodSettings = list(seuratv4 = list(numGenes = 1000, normalisation = "logNormalize"))) ) - expect_message(merged_scdata <- suppressWarnings(run_dataIntegration(merged_scdata, config)), "Finding integration anchors using RPCA reduction") - expect_equal(merged_scdata@commands$FindIntegrationAnchors$reduction, "pca") + expect_message(scdata <- suppressWarnings(run_dataIntegration(scdata, config)), "Finding integration anchors using RPCA reduction") + expect_equal(scdata@commands$FindIntegrationAnchors$reduction, "pca") }) test_that("SeuratV4 integration finds integration anchors using CCA method, if method in config is CCA", { # mock a bigger dataset to run Seurat v4 integration without skipping it - c(scdata_list, sample_1_id, sample_2_id) %<-% suppressWarnings(mock_scdata(n_rep = 3)) - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata(n_rep = 3) - merged_scdata <- suppressWarnings(scdata_preprocessing(merged_scdata)) - npcs <- get_npcs(merged_scdata) + npcs <- get_npcs(scdata) config <- list( dimensionalityReduction = list(numPCs = npcs, method = "cca"), dataIntegration = list(method = "seuratv4", methodSettings = list(seuratv4 = list(numGenes = 1000, normalisation = "logNormalize"))) ) - expect_message(merged_scdata <- suppressWarnings(run_dataIntegration(merged_scdata, config)), "Finding integration anchors using CCA reduction") - expect_equal(merged_scdata@commands$FindIntegrationAnchors$reduction, "cca") + expect_message(scdata <- suppressWarnings(run_dataIntegration(scdata, config)), "Finding integration anchors using CCA reduction") + expect_equal(scdata@commands$FindIntegrationAnchors$reduction, "cca") }) - - -test_that("merge_scdata_list correctly merges seurat objects", { - prev_out <- mock_prev_out(samples = c("a", "b", "c")) - scdata_list <- prev_out$scdata_list - - - scdata <- suppressWarnings(merge_scdata_list(scdata_list)) - - expect_equal(sum(unlist(lapply(scdata_list, ncol))), ncol(scdata)) - expect_true(all(scdata$samples[1:ncol(scdata_list[[1]])] == "a")) - -}) \ No newline at end of file