diff --git a/Makefile b/Makefile index 422b10b8..5c959f6d 100644 --- a/Makefile +++ b/Makefile @@ -16,13 +16,12 @@ endif #-------------------------------------------------- # Targets #-------------------------------------------------- -install: +install: @echo "Installing local runner" @(cd ./local-runner && npm install) - @echo "Installing renv packages" + @echo "Installing R env packages" @(cd ./pipeline-runner && R -e "renv::restore()") - @(cd ./pipeline-runner && R -e "renv::install()") -build: +build: # regenerate sysdata.rda env file @(cd ./pipeline-runner && Rscript data-raw/sysdata.R) @(cd ./local-runner && npm run build) diff --git a/local-runner/cf-local-container-launcher.yaml b/local-runner/cf-local-container-launcher.yaml index 6d46aa36..2ed9a436 100644 --- a/local-runner/cf-local-container-launcher.yaml +++ b/local-runner/cf-local-container-launcher.yaml @@ -30,7 +30,7 @@ Resources: docker_command = ' '.join([ "docker run --rm -t", - f"--name {event['name']}-{random_string(10)}", + f"--name {event['name']}-{random_string(10)}", f"{'-d' if event['detached'] else ''}", f"--env ACTIVITY_ARN={event.get('activityArn', '')}", f"--env HOST_IP=__HOST_IP__", @@ -47,7 +47,7 @@ Resources: Timeout: 25 QualityControlActivity: Type: AWS::StepFunctions::Activity - Properties: + Properties: Name: biomage-qc-activity-development RemovePreviousPipelineContainers: Type: "AWS::Lambda::Function" diff --git a/pipeline-runner/Dockerfile b/pipeline-runner/Dockerfile index a42a177f..30b84246 100644 --- a/pipeline-runner/Dockerfile +++ b/pipeline-runner/Dockerfile @@ -105,8 +105,6 @@ FROM common AS dev # also install watchdog to automatically restart # when source files change RUN pip install -U jedi radian PyYAML watchdog[watchmedo] -RUN apt update && apt -y install git -RUN pip install memory_profiler # add R package files and runner ADD R ./R @@ -114,5 +112,4 @@ ADD tests ./tests COPY DESCRIPTION NAMESPACE init.R ./ # start app -COPY start.sh start.sh -ENTRYPOINT ["./start.sh"] \ No newline at end of file +ENTRYPOINT ["Rscript", "init.R"] diff --git a/pipeline-runner/NAMESPACE b/pipeline-runner/NAMESPACE index e9da0fb3..5519ebf0 100644 --- a/pipeline-runner/NAMESPACE +++ b/pipeline-runner/NAMESPACE @@ -1,9 +1,6 @@ # Generated by roxygen2: do not edit by hand -export(add_metadata) export(build_cc_gene_list) -export(build_metadata_cellsets) -export(create_scdata) export(create_seurat) export(download_user_files) export(filter_doublets) @@ -11,27 +8,21 @@ export(filter_emptydrops) export(filter_gene_umi_outlier) export(filter_high_mito) export(filter_low_cellsize) -export(filter_unnamed_features) export(generate_default_values_cellSizeDistribution) export(generate_default_values_classifier) export(generate_first_step_ids) export(getClusters) export(get_feature_types) export(get_gem2s_file) -export(ids_to_sym) export(list_exclude_genes) export(load_user_files) -export(make_annot_with_ids) -export(merge_scdata_list) -export(normalize_annotation_types) +export(merge_scdatas) export(prepare_experiment) -export(read_10x_annotations) export(remove_genes) export(runClusters) export(run_emptydrops) export(score_doublets) export(subset_ids) export(subset_safe) -export(sym_to_ids) import(data.table) importFrom(magrittr,"%>%") diff --git a/pipeline-runner/R/gem2s-2-load_user_files.R b/pipeline-runner/R/gem2s-2-load_user_files.R index 2a70e913..8d8c85c9 100644 --- a/pipeline-runner/R/gem2s-2-load_user_files.R +++ b/pipeline-runner/R/gem2s-2-load_user_files.R @@ -222,6 +222,7 @@ parse_rhapsody_matrix <- function(config, input_dir) { #' @return list of annotations data.frame #' @export #' +#' @examples read_10x_annotations <- function(annot_fpath, sample) { gene_column <- 1 @@ -408,7 +409,7 @@ make_annot_with_ids <- function(annot_list, feature_types_list) { #' @param sample_annot data.frame of annotations #' @param annot_with_ids data.frame of annotations with IDs. Key data.frame #' -#' @return data.frame of annotations +#' @return #' @export #' sym_to_ids <- function(sample_annot, annot_with_ids) { @@ -436,7 +437,7 @@ sym_to_ids <- function(sample_annot, annot_with_ids) { #' @param sample_annot data.frame of annotations #' @param annot_with_ids data.frame of annotations with IDs. Key data.frame #' -#' @return data.frame of annotations +#' @return #' @export #' ids_to_sym <- function(sample_annot, annot_with_ids) { @@ -464,7 +465,7 @@ ids_to_sym <- function(sample_annot, annot_with_ids) { #' @param annotations list of annotations data.frame, feature types and gene_column #' @param sample character specifying current sample #' -#' @return list of counts and annotations +#' @return #' @export #' filter_unnamed_features <- function(counts, annotations, sample) { diff --git a/pipeline-runner/R/gem2s-4-score_doublets.R b/pipeline-runner/R/gem2s-4-score_doublets.R index b68405f9..436452ea 100644 --- a/pipeline-runner/R/gem2s-4-score_doublets.R +++ b/pipeline-runner/R/gem2s-4-score_doublets.R @@ -55,7 +55,7 @@ score_doublets <- function(input, pipeline_config, prev_out) { #' @return data.frame with doublet scores and assigned classes #' compute_sample_doublet_scores <- function(sample_counts) { - set.seed(RANDOM_SEED) + set.seed(gem2s$random.seed) sce <- scDblFinder::scDblFinder(sample_counts) doublet_res <- data.frame( row.names = colnames(sce), diff --git a/pipeline-runner/R/gem2s-5-create_seurat.R b/pipeline-runner/R/gem2s-5-create_seurat.R index b61fbc22..2fea8475 100644 --- a/pipeline-runner/R/gem2s-5-create_seurat.R +++ b/pipeline-runner/R/gem2s-5-create_seurat.R @@ -15,11 +15,7 @@ create_seurat <- function(input, pipeline_config, prev_out) { check_prev_out(prev_out, check_names) # destructure previous output: config, counts_list, annot, and doublet_scores - config <- prev_out$config - counts_list <- prev_out$counts_list - annot <- prev_out$annot - doublet_scores <- prev_out$doublet_scores - edrops <- prev_out$edrops + list2env(prev_out, envir = environment()) samples <- names(counts_list) scdata_list <- list() @@ -68,33 +64,26 @@ construct_scdata <- function(counts, doublet_score, edrops_out, sample, annot, c } -#' Construct metadata for each SeuratObject -#' -#' This function creates a `data.frame` with the barcodes, sampleIDs and user supplied -#' metadata that corresponds to each one. -#' -#' @param counts count matrix -#' @param sample character sample ID -#' @param config list containing experiment config -#' -#' @return data.frame of sample metadata -#' + +# NOTE: any changes here must be reflected in meta_sets + +# construct metadata for each SeuratObject construct_metadata <- function(counts, sample, config) { - message("Constructing metadata data.frame...") - metadata_df <- data.frame(row.names = colnames(counts), samples = rep(sample, ncol(counts))) + message("Constructing metadata df...") + metadata <- data.frame(row.names = colnames(counts), samples = rep(sample, ncol(counts))) # Add "metadata" if exists in config - user_metadata <- config$metadata - if (!is.null(user_metadata)) { - user_metadata <- lapply(user_metadata, unlist) - user_metadata <- data.frame(user_metadata, row.names = config$samples, check.names = FALSE) - metadata_df[names(user_metadata)] <- user_metadata[sample, ] + rest <- config$metadata + if (!is.null(rest)) { + rest <- lapply(rest, unlist) + rest <- data.frame(rest, row.names = config$samples, check.names = FALSE) + metadata[names(rest)] <- rest[sample, ] } # make syntactically valid column names - colnames(metadata_df) <- make.names(colnames(metadata_df), unique = TRUE) + colnames(metadata) <- make.names(colnames(metadata), unique = TRUE) - return(metadata_df) + return(metadata) } # add mitochondrial percent to SeuratObject diff --git a/pipeline-runner/R/gem2s-6-construct_qc_config.R b/pipeline-runner/R/gem2s-6-construct_qc_config.R index 34f7c2e1..7060767a 100644 --- a/pipeline-runner/R/gem2s-6-construct_qc_config.R +++ b/pipeline-runner/R/gem2s-6-construct_qc_config.R @@ -1,16 +1,6 @@ -#' Constructs default QC configuration -#' -#' This function returns the default parameters used during QC as a nested list. -#' It is sent to the API, which in turn saves it as a jsonb object in the PostgreSQL -#' database. -#' -#' @param scdata_list list of seurat objects -#' @param any_filtered boolean indicating if barcodes were filtered by the emptyDrops. -#' -#' @return list of QC configuration parameters -#' -construct_qc_config <- function(scdata_list, any_filtered) { - samples <- names(scdata_list) +# constructs default QC configuration for merged SeuratObject +construct_qc_config <- function(scdata, any_filtered) { + samples <- scdata$samples # classifier config.classifier <- list( @@ -36,7 +26,7 @@ construct_qc_config <- function(scdata_list, any_filtered) { filterSettings = list(minCellSize = 1080, binStep = 200) ) - config.cellSizeDistribution <- add_custom_config_per_sample(get_cellsize_config, config.cellSizeDistribution, scdata_list) + config.cellSizeDistribution <- add_custom_config_per_sample(get_cellsize_config, config.cellSizeDistribution, scdata) # mito config.mitochondrialContent <- list( @@ -53,7 +43,7 @@ construct_qc_config <- function(scdata_list, any_filtered) { ) ) - config.mitochondrialContent <- add_custom_config_per_sample(get_sample_mitochondrial_config, config.mitochondrialContent, scdata_list) + config.mitochondrialContent <- add_custom_config_per_sample(get_sample_mitochondrial_config, config.mitochondrialContent, scdata) # ngenes vs umis config.numGenesVsNumUmis <- list( @@ -68,7 +58,7 @@ construct_qc_config <- function(scdata_list, any_filtered) { ) ) - config.numGenesVsNumUmis <- add_custom_config_per_sample(get_gene_umi_config, config.numGenesVsNumUmis, scdata_list) + config.numGenesVsNumUmis <- add_custom_config_per_sample(get_gene_umi_config, config.numGenesVsNumUmis, scdata) # doublet scores @@ -81,7 +71,7 @@ construct_qc_config <- function(scdata_list, any_filtered) { ) ) - config.doubletScores <- add_custom_config_per_sample(get_dblscore_config, config.doubletScores, scdata_list) + config.doubletScores <- add_custom_config_per_sample(get_dblscore_config, config.doubletScores, scdata) # data integration config.dataIntegration <- list( @@ -112,8 +102,8 @@ construct_qc_config <- function(scdata_list, any_filtered) { distanceMetric = "cosine" ), tsne = list( - perplexity = min(30, ncol(scdata_list) / 100), - learningRate = max(200, ncol(scdata_list) / 12) + perplexity = min(30, ncol(scdata) / 100), + learningRate = max(200, ncol(scdata) / 12) ) ) ), @@ -138,13 +128,13 @@ construct_qc_config <- function(scdata_list, any_filtered) { } -get_cellsize_config <- function(scdata_list, config) { - minCellSize <- generate_default_values_cellSizeDistribution(scdata_list, config) +get_cellsize_config <- function(scdata, config) { + minCellSize <- generate_default_values_cellSizeDistribution(scdata, config) config$filterSettings$minCellSize <- minCellSize return(config) } -get_sample_mitochondrial_config <- function(scdata_list.sample, config) { +get_sample_mitochondrial_config <- function(scdata.sample, config) { config.sample <- list( auto = TRUE, @@ -155,32 +145,32 @@ get_sample_mitochondrial_config <- function(scdata_list.sample, config) { ) config.sample$filterSettings$methodSettings$absoluteThreshold <- list( - maxFraction = generate_default_values_mitochondrialContent(scdata_list.sample, config.sample), + maxFraction = generate_default_values_mitochondrialContent(scdata.sample, config.sample), binStep = 0.3 ) return(config.sample) } - # threshold for doublet score is the max score given to a singlet (above score => doublets) -get_dblscore_config <- function(scdata_list, config) { - probabilityThreshold <- max(scdata_list$doublet_scores[scdata_list$doublet_class == "singlet"], na.rm = TRUE) +get_dblscore_config <- function(scdata, config) { + probabilityThreshold <- max(scdata$doublet_scores[scdata$doublet_class == "singlet"], na.rm = TRUE) config$filterSettings$probabilityThreshold <- probabilityThreshold return(config) } -get_gene_umi_config <- function(scdata_list, config) { +get_gene_umi_config <- function(scdata, config) { # Sensible values are based on the function "gene.vs.molecule.cell.filter" from the pagoda2 package - p.level <- min(0.001, 1 / ncol(scdata_list)) + p.level <- min(0.001, 1 / ncol(scdata)) config$filterSettings$regressionTypeSettings[[config$filterSettings$regressionType]]$p.level <- p.level return(config) } + duplicate_config_per_sample <- function(step_config, config, samples) { for (sample in unique(samples)) { config[[sample]] <- step_config @@ -190,23 +180,25 @@ duplicate_config_per_sample <- function(step_config, config, samples) { return(config) } +add_custom_config_per_sample <- function(generate_sample_config, config, scdata) { -add_custom_config_per_sample <- function(generate_sample_config, config, scdata_list) { # We update the config file, so to be able to access the raw config we create a copy - raw_config <- config + config.raw <- config - for (sample in names(scdata_list)) { + samples <- scdata$samples + + for (sample in unique(samples)) { # subset the Seurat object to a single sample - sample_data <- scdata_list[[sample]] + scdata.sample <- scdata[, samples %in% sample] # run the function to generate config for a sample - sample_config <- generate_sample_config(sample_data, raw_config) + config.sample <- generate_sample_config(scdata.sample, config.raw) # update sample config thresholds - config[[sample]] <- sample_config + config[[sample]] <- config.sample # add auto settings - config[[sample]]$defaultFilterSettings <- sample_config$filterSettings + config[[sample]]$defaultFilterSettings <- config.sample$filterSettings } return(config) diff --git a/pipeline-runner/R/gem2s-6-prepare_experiment.R b/pipeline-runner/R/gem2s-6-prepare_experiment.R index d49c780c..c81e7a71 100644 --- a/pipeline-runner/R/gem2s-6-prepare_experiment.R +++ b/pipeline-runner/R/gem2s-6-prepare_experiment.R @@ -14,57 +14,70 @@ #' prepare_experiment <- function(input, pipeline_config, prev_out) { message("Preparing experiment ...") - check_names <- c("config", "counts_list", "annot", "doublet_scores", "scdata_list") check_prev_out(prev_out, check_names) scdata_list <- prev_out$scdata_list samples <- names(scdata_list) - message("Total cells:", sum(sapply(scdata_list, ncol))) + message("Merging Seurat Objects...") + scdata <- merge_scdatas(scdata_list) + + #If subsetting all cells, Seurat will not reorder the cells in the object. We need to subset to [-1] and [1] and merge to shuffle. + set.seed(gem2s$random.seed) + shuffle_mask <- sample(colnames(scdata)) + scdata <- merge(scdata[,shuffle_mask[1]],scdata[,shuffle_mask[-1]]) - scdata_list <- add_metadata_to_samples(scdata_list, prev_out$annot, input$experimentId) - prev_out$scdata_list <- scdata_list + scdata <- add_metadata(scdata, prev_out$annot, input$experimentId) + prev_out$scdata <- scdata # construct default QC config and update prev out message("Constructing default QC configuration...") any_filtered <- !(length(prev_out$edrops) == length(samples)) - prev_out$qc_config <- construct_qc_config(scdata_list, any_filtered) + prev_out$qc_config <- construct_qc_config(scdata, any_filtered) res <- list( data = list(), output = prev_out ) - message("\nPreparation for AWS upload step complete.") + message("\nPreperation for AWS upload step complete.") return(res) } -add_metadata_to_samples <- function(scdata_list, annot, experiment_id) { - # cell ids will be generated at random in order to shuffle samples. it is done - # here because merging samples in QC means that shuffling the cells will not - # result in a shuffled cell_ids - set.seed(RANDOM_SEED) - total_cells <- sum(sapply(scdata_list, ncol)) - cell_ids <- 0:total_cells-1 +#' Merge scdatas: merge rds files in input list +#' +#' @param scdata_list +#' +#' @return +#' @export +#' +#' @examples +merge_scdatas <- function(scdata_list) { + if (length(scdata_list) == 1) { + scdata <- scdata_list[[1]] + } else { + scdata <- merge(scdata_list[[1]], y = scdata_list[-1]) + } + + return(scdata) +} - for (sample in names(scdata_list)) { - sample_size <- ncol(scdata_list[[sample]]) +add_metadata <- function(scdata, annot, experiment_id) { - # select only the annotations of the current sample - sample_annotations_idx <- match(rownames(scdata_list[[sample]]), annot$input) - sample_annot <- annot[sample_annotations_idx, ] - scdata_list[[sample]]@misc[["gene_annotations"]] <- sample_annot + # Ensure index by rownames in scdata + annot <- annot[match(rownames(scdata), annot$input), ] + scdata@misc[["gene_annotations"]] <- annot - # add the experiment ID so it's available later - scdata_list[[sample]]@misc[["experimentId"]] <- experiment_id + message("Storing cells id...") + # Keeping old version of ids starting from 0 + scdata$cells_id <- 0:(ncol(scdata) - 1) - # sample cell ids to shuffle them - idxs <- sample(seq_along(cell_ids), sample_size) - scdata_list[[sample]]$cells_id <- cell_ids[idxs] - # remove the selected cell ids for next samples - cell_ids <- cell_ids[-idxs] - } + message("Storing color pool...") + # We store the color pool in a slot in order to be able to access it during configureEmbedding + scdata@misc[["color_pool"]] <- get_color_pool() + scdata@misc[["experimentId"]] <- experiment_id + scdata@misc[["ingestionDate"]] <- Sys.time() - return(scdata_list) + return(scdata) } diff --git a/pipeline-runner/R/gem2s-7-upload_to_aws.R b/pipeline-runner/R/gem2s-7-upload_to_aws.R index e5420b76..e0f389f0 100644 --- a/pipeline-runner/R/gem2s-7-upload_to_aws.R +++ b/pipeline-runner/R/gem2s-7-upload_to_aws.R @@ -1,19 +1,19 @@ upload_to_aws <- function(input, pipeline_config, prev_out) { message("Uploading to AWS ...") - check_names <- c("config", "counts_list", "annot", "doublet_scores", "scdata_list", "qc_config") + check_names <- c("config", "counts_list", "annot", "doublet_scores", "scdata_list", "scdata", "qc_config") check_prev_out(prev_out, check_names) experiment_id <- input$experimentId project_id <- input$projectId # destructure what need from prev_out - scdata_list <- prev_out$scdata_list + scdata <- prev_out$scdata config <- prev_out$config - qc_config <- prev_out$qc_config + config_dataProcessing <- prev_out$qc_config message("Constructing cell sets ...") - cell_sets <- get_cell_sets(scdata_list, input) + cell_sets <- get_cell_sets(scdata, input) # cell sets file to s3 cell_sets_data <- RJSONIO::toJSON(cell_sets) @@ -24,20 +24,17 @@ upload_to_aws <- function(input, pipeline_config, prev_out) { key = experiment_id ) - for (sample in names(scdata_list)) { - message("Uploading sample ", sample, " object to S3 ...") - fpath <- file.path(tempdir(), "experiment.rds") - saveRDS(scdata_list[[sample]], fpath, compress = FALSE) - # can only upload up to 50Gb because multipart uploads work by uploading - # smaller chunks (parts) and the max amount of parts is 10,000. - put_object_in_s3_multipart(pipeline_config, - bucket = pipeline_config$source_bucket, - object = fpath, - key = file.path(experiment_id, sample, "r.rds") - ) - } + # seurat object to s3 + message("Uploading Seurat Object to S3 ...") + fpath <- file.path(tempdir(), "experiment.rds") + saveRDS(scdata, fpath, compress = FALSE) - message("Samples uploaded") + # can only upload up to 50Gb because part numbers can be any number from 1 to 10,000, inclusive. + put_object_in_s3_multipart(pipeline_config, + bucket = pipeline_config$source_bucket, + object = fpath, + key = file.path(experiment_id, "r.rds") + ) cluster_env <- pipeline_config$cluster_env @@ -49,7 +46,7 @@ upload_to_aws <- function(input, pipeline_config, prev_out) { organism = config$organism, type = config$input$type ), - processingConfig = qc_config + processingConfig = config_dataProcessing ) res <- list( @@ -61,12 +58,11 @@ upload_to_aws <- function(input, pipeline_config, prev_out) { ) message("\nUpload to AWS step complete.") - return(res) } # creates initial cell sets object -get_cell_sets <- function(scdata_list, input) { +get_cell_sets <- function(scdata, input) { scratchpad <- list( key = "scratchpad", name = "Custom cell sets", @@ -76,24 +72,24 @@ get_cell_sets <- function(scdata_list, input) { ) color_pool <- get_color_pool() - sample_cellsets <- build_sample_cellsets(input, scdata_list, color_pool) + samples_set <- samples_sets(input, scdata, color_pool) # remove used colors from pool - used <- seq_along(sample_cellsets$children) + used <- seq_along(samples_set$children) color_pool <- color_pool[-used] # Design cell_set meta_data for DynamoDB - cell_sets <- c(list(scratchpad), list(sample_cellsets)) + cell_sets <- c(list(scratchpad), list(samples_set)) if ("metadata" %in% names(input)) { - cell_sets <- c(cell_sets, build_metadata_cellsets(input, scdata_list, color_pool)) + cell_sets <- c(cell_sets, meta_sets(input, scdata, color_pool)) } cell_sets <- list(cellSets = cell_sets) } # cell_sets fn for seurat samples information -build_sample_cellsets <- function(input, scdata_list, color_pool) { +samples_sets <- function(input, scdata, color_pool) { cell_set <- list( key = "sample", name = "Samples", @@ -102,15 +98,14 @@ build_sample_cellsets <- function(input, scdata_list, color_pool) { type = "metadataCategorical" ) + cells_sample <- scdata$samples sample_ids <- unlist(input$sampleIds) sample_names <- unlist(input$sampleNames) for (i in seq_along(sample_ids)) { - scdata <- scdata_list[[i]] sample_id <- sample_ids[i] sample_name <- sample_names[i] - - cell_ids <- scdata$cells_id + cell_ids <- scdata$cells_id[cells_sample == sample_id] cell_set$children[[i]] <- list( key = sample_id, @@ -124,60 +119,40 @@ build_sample_cellsets <- function(input, scdata_list, color_pool) { } - -#' Create cellsets from user-supplied metadata -#' -#' This function creates the cellsets for the user-supplied metadata (in data -#' management module). It also takes care of assigning a color from the color pool -#' to each cellset. -#' -#' @param input list -#' @param scdata_list list of Seurat objects -#' @param color_pool list of colors to use -#' -#' @return list of cellsets -#' @export -#' -build_metadata_cellsets <- function(input, scdata_list, color_pool) { +# cell_sets fn for seurat metadata information +meta_sets <- function(input, scdata, color_pool) { cell_set_list <- c() - user_metadata <- lapply(input$metadata, unlist) + meta <- lapply(input$metadata, unlist) # user-supplied metadata track names - metadata_names <- names(user_metadata) + keys <- names(meta) - # user supplied metadata names must be made syntactically valid, as seurat does. - # We use them to access the cell ids stored in the seurat object, to create the - # cellsets. The same names as used in construct_metadata including internal - # 'samples' column which is dropped after making the names. - valid_metadata_names <- make.names(c("samples", metadata_names), unique = TRUE)[-1] + # syntactically valid metadata names as stored in scdata + # same names as used in construct_metadata including internal 'samples' column (dropped) + seurat_keys <- make.names(c("samples", keys), unique = TRUE)[-1] color_index <- 1 - for (i in seq_along(metadata_names)) { - user_metadata_name <- metadata_names[i] - valid_metadata_name <- valid_metadata_names[i] + for (i in seq_along(keys)) { + key <- keys[i] + seurat_key <- seurat_keys[i] cell_set <- list( - "key" = user_metadata_name, - "name" = user_metadata_name, + "key" = key, + "name" = key, "rootNode" = TRUE, "children" = c(), "type" = "metadataCategorical" ) # values of current metadata track - values <- unique(user_metadata[[i]]) + values <- unique(meta[[i]]) for (j in seq_along(values)) { value <- values[j] - - cell_ids <- list() - for (scdata in scdata_list) { - cells_in_value <- scdata[[valid_metadata_name]] == value - cell_ids <- append(cell_ids, scdata$cells_id[cells_in_value]) - } + cell_ids <- scdata$cells_id[scdata[[seurat_key]] == value] cell_set$children[[j]] <- list( - "key" = paste(user_metadata_name, value, sep = "-"), + "key" = paste(key, value, sep = "-"), "name" = value, "color" = color_pool[color_index], "cellIds" = unname(cell_ids) diff --git a/pipeline-runner/R/handle_data.R b/pipeline-runner/R/handle_data.R index fd48f93d..1f9ad076 100644 --- a/pipeline-runner/R/handle_data.R +++ b/pipeline-runner/R/handle_data.R @@ -30,8 +30,17 @@ upload_cells_id <- function(pipeline_config, object_key, cells_id) { return(object_key) } -reload_scdata_from_s3 <- function (s3, pipeline_config, experiment_id) { - bucket <- pipeline_config$processed_bucket + +reload_scdata_from_s3 <- function(pipeline_config, experiment_id, task_name, tasks) { + # If the task is after data integration, we need to get scdata from processed_matrix + task_names <- names(tasks) + integration_index <- match("dataIntegration", task_names) + if (match(task_name, task_names) > integration_index) { + bucket <- pipeline_config$processed_bucket + } else { + bucket <- pipeline_config$source_bucket + } + s3 <- paws::s3(config = pipeline_config$aws_config) message(bucket) message(paste(experiment_id, "r.rds", sep = "/")) @@ -42,54 +51,10 @@ reload_scdata_from_s3 <- function (s3, pipeline_config, experiment_id) { obj <- readRDS(rawConnection(body)) return(obj) } -reload_scdata_list_from_s3 <- function (s3, pipeline_config, experiment_id) { - bucket <- pipeline_config$source_bucket - objects <- s3$list_objects( - Bucket = bucket, - Prefix = experiment_id - ) - samples <- objects$Contents - - scdata_list <- list() - for (sample in samples) { - key <- sample$Key - - c(body, ...rest) %<-% s3$get_object( - Bucket = bucket, - Key = paste(key, sep = "/") - ) - obj <- readRDS(rawConnection(body)) - sample_id <- strsplit(key, "/")[[1]][[2]] - scdata_list[[sample_id]] <- obj - } - - return(scdata_list) -} - -# reload_data_from_s3 will reload: -# * scdata_list for all steps before integration (included) -# * scdata file for all steps after data integration -reload_data_from_s3 <- function(pipeline_config, experiment_id, task_name, tasks) { - task_names <- names(tasks) - integration_index <- match("dataIntegration", task_names) - s3 <- paws::s3(config = pipeline_config$aws_config) - - # If the task is after data integration, we need to get scdata from processed_matrix - if (match(task_name, task_names) > integration_index) { - return(reload_scdata_from_s3(s3, pipeline_config, experiment_id)) - } - - # Otherwise, return scdata_list - return(reload_scdata_list_from_s3(s3, pipeline_config, experiment_id)) - -} load_cells_id_from_s3 <- function(pipeline_config, experiment_id, task_name, tasks, samples) { s3 <- paws::s3(config = pipeline_config$aws_config) - object_list <- s3$list_objects( - Bucket = pipeline_config$cells_id_bucket, - Prefix = paste0(experiment_id, "/", task_name, "/") - ) + object_list <- s3$list_objects(pipeline_config$cells_id_bucket, Prefix = paste0(experiment_id, "/", task_name, "/")) message(pipeline_config$cells_id_bucket) message(paste(experiment_id, "r.rds", sep = "/")) cells_id <- list() @@ -97,8 +62,6 @@ load_cells_id_from_s3 <- function(pipeline_config, experiment_id, task_name, tas task_names <- names(tasks) integration_index <- match("dataIntegration", task_names) - # after data integration the cell ids are no longer used because there is no filtering - # so they are not loaded if (match(task_name, task_names) <= integration_index) { for (object in object_list$Contents) { key <- object$Key @@ -155,7 +118,6 @@ send_output_to_api <- function(pipeline_config, input, plot_data_keys, output) { message("Sending to SNS topic ", pipeline_config$sns_topic) sns <- paws::sns(config = pipeline_config$aws_config) - message("Building the message") msg <- list( experimentId = input$experimentId, taskName = input$taskName, @@ -169,7 +131,6 @@ send_output_to_api <- function(pipeline_config, input, plot_data_keys, output) { ) ) - message("Publishing the message") result <- sns$publish( Message = RJSONIO::toJSON(msg), TopicArn = pipeline_config$sns_topic, @@ -181,7 +142,6 @@ send_output_to_api <- function(pipeline_config, input, plot_data_keys, output) { ) ) ) - message("Done publishing") return(result$MessageId) } @@ -212,7 +172,7 @@ send_pipeline_fail_update <- function(pipeline_config, input, error_message) { error_msg <- list() - # TODO - REMOVE THE DUPLICATE EXPERIMENT ID FROM INPUT RESPONSE + # TODO - REMOVE THE DUPLICATE EXPERIMETN ID FROM INPUT RESPONSE error_msg$experimentId <- input$experimentId error_msg$taskName <- input$taskName @@ -327,6 +287,7 @@ put_object_in_s3_multipart <- function(pipeline_config, bucket, object, key) { message(sprintf("Putting %s in %s from object %s", key, bucket, object)) s3 <- paws::s3(config = pipeline_config$aws_config) + multipart <- s3$create_multipart_upload( Bucket = bucket, Key = key @@ -341,7 +302,6 @@ put_object_in_s3_multipart <- function(pipeline_config, bucket, object, key) { ) } }) - message("Uploading multiparts") resp <- try({ parts <- upload_multipart_parts(s3, bucket, object, key, multipart$UploadId) s3$complete_multipart_upload( diff --git a/pipeline-runner/R/qc-1-filter_emptydrops.R b/pipeline-runner/R/qc-1-filter_emptydrops.R index e808e33f..80ef2fd8 100644 --- a/pipeline-runner/R/qc-1-filter_emptydrops.R +++ b/pipeline-runner/R/qc-1-filter_emptydrops.R @@ -16,42 +16,38 @@ #' @export #' @return a list with the filtered seurat object by mitochondrial content, the config and the plot values #' -filter_emptydrops <- function(scdata_list, config, sample_id, cells_id, task_name = "classifier", num_cells_to_downsample = 6000) { - sample_cell_ids <- cells_id[[sample_id]] - message("Number of cells IDs: ", length(sample_cell_ids)) - message("Number of cells: ", ncol(scdata_list[[sample_id]])) +filter_emptydrops <- function(scdata, config, sample_id, cells_id, task_name = "classifier", num_cells_to_downsample = 6000) { + cells_id.sample <- cells_id[[sample_id]] - if (length(sample_cell_ids) == 0) { - return(list(data = scdata_list[[sample_id]], new_ids = cells_id, config = config, plotData = list())) + if (length(cells_id.sample) == 0) { + return(list(data = scdata, new_ids = cells_id, config = config, plotData = list())) } - # TODO this is probably not needed because it's the first filter and it has not been yet applied - # so the cells_id are the ones generate in gem2s - sample_data <- subset_ids(scdata_list[[sample_id]], sample_cell_ids) + scdata.sample <- subset_ids(scdata, cells_id.sample) FDR <- config$filterSettings$FDR if (isTRUE(config$auto)) { - FDR <- generate_default_values_classifier(sample_data, config) + FDR <- generate_default_values_classifier(scdata.sample, config) } # for plots and filter stats to be populated guidata <- list() # check if filter data is actually available - if (is.null(scdata_list[[sample_id]]@meta.data$emptyDrops_FDR)) { + if (is.null(scdata@meta.data$emptyDrops_FDR)) { message("Classify is enabled but has no classify data available: will dissable it: no filtering!") config$enabled <- FALSE } if (config$enabled) { message("Classify is enabled: filtering with FDR=", FDR) - meta <- sample_data@meta.data + meta <- scdata.sample@meta.data message("Info: empty-drops table of FDR threshold categories (# UMIs for a given threshold interval)") print(table(meta$samples, cut(meta$emptyDrops_FDR, breaks = c(-Inf, 0, 0.0001, 0.01, 0.1, 0.5, 1, Inf)), useNA = "ifany")) # prevents filtering of NA FDRs if FDR=1 - ed_fdr <- sample_data$emptyDrops_FDR + ed_fdr <- scdata.sample$emptyDrops_FDR ed_fdr[is.na(ed_fdr)] <- 1 message( @@ -59,20 +55,20 @@ filter_emptydrops <- function(scdata_list, config, sample_id, cells_id, task_nam sum(ed_fdr > FDR, na.rm = TRUE), "/", length(ed_fdr) ) - numis <- log10(sample_data@meta.data$nCount_RNA) + numis <- log10(scdata.sample@meta.data$nCount_RNA) fdr_data <- unname(purrr::map2(ed_fdr, numis, function(x, y) { c("FDR" = x, "log_u" = y) })) - fdr_data <- fdr_data[get_positions_to_keep(sample_data, num_cells_to_downsample)] + fdr_data <- fdr_data[get_positions_to_keep(scdata.sample, num_cells_to_downsample)] - remaining_ids <- sample_data@meta.data$cells_id[ed_fdr <= FDR] + remaining_ids <- scdata.sample@meta.data$cells_id[ed_fdr <= FDR] # update config config$filterSettings$FDR <- FDR # Downsample plotData - knee_data <- get_bcranks_plot_data(sample_data, is.cellsize = FALSE)[["knee"]] + knee_data <- get_bcranks_plot_data(scdata.sample, is.cellsize = FALSE)[["knee"]] # Populate guidata list guidata[[generate_gui_uuid(sample_id, task_name, 0)]] <- fdr_data @@ -83,13 +79,13 @@ filter_emptydrops <- function(scdata_list, config, sample_id, cells_id, task_nam guidata[[generate_gui_uuid(sample_id, task_name, 0)]] <- list() guidata[[generate_gui_uuid(sample_id, task_name, 1)]] <- list() guidata[[generate_gui_uuid(sample_id, task_name, 2)]] <- list() - remaining_ids <- sample_cell_ids + remaining_ids <- cells_id.sample } # get filter stats after filtering filter_stats <- list( - before = calc_filter_stats(sample_data), - after = calc_filter_stats(subset_ids(sample_data, remaining_ids)) + before = calc_filter_stats(scdata.sample), + after = calc_filter_stats(subset_ids(scdata.sample, remaining_ids)) ) guidata[[generate_gui_uuid(sample_id, task_name, 2)]] <- filter_stats @@ -97,7 +93,7 @@ filter_emptydrops <- function(scdata_list, config, sample_id, cells_id, task_nam cells_id[[sample_id]] <- remaining_ids result <- list( - data = scdata_list, + data = scdata, new_ids = cells_id, config = config, plotData = guidata diff --git a/pipeline-runner/R/qc-2-filter_low_cellsize.R b/pipeline-runner/R/qc-2-filter_low_cellsize.R index 4b3148c8..84a5b7b1 100644 --- a/pipeline-runner/R/qc-2-filter_low_cellsize.R +++ b/pipeline-runner/R/qc-2-filter_low_cellsize.R @@ -2,11 +2,9 @@ #' #' cell_size_distribution_filter function #' -#' This is a simplest filter that looks at the shape of the cell size (# of UMIs -#' per cell) distribution and looks for some local minima, minimum of second derivative, etc. -#' To separate the large cell barcodes that correspond to real cells from the -#' tail containing 'empty droplets'. This can be a useful first guess. The settings -#'for such a filter can also contain a simple "min cell size" setting. +#' This is a simplest filter that looks at the shape of the cell size (# of UMIs per cell) distribution and looks for some local minima, minimum of second derivative, etc. +#' To separate the large cell barcodes that correspond to real cells from the tail containing 'empty droplets'. +#' This can be a useful first guess. The settings for such a filter can also contain a simple "min cell size" setting. #' #' @param config list containing the following information #' - enable: true/false. Refering to apply or not the filter. @@ -18,43 +16,43 @@ #' @export #' @return a list with the filtered seurat object by cell size ditribution, the config and the plot values #' -filter_low_cellsize <- function(scdata_list, config, sample_id, cells_id, task_name = "cellSizeDistribution", num_cells_to_downsample = 6000) { - sample_cell_ids <- cells_id[[sample_id]] +filter_low_cellsize <- function(scdata, config, sample_id, cells_id, task_name = "cellSizeDistribution", num_cells_to_downsample = 6000) { + cells_id.sample <- cells_id[[sample_id]] - if (length(sample_cell_ids) == 0) { - return(list(data = scdata_list[[sample_id]], new_ids = cells_id, config = config, plotData = list())) + if (length(cells_id.sample) == 0) { + return(list(data = scdata, new_ids = cells_id, config = config, plotData = list())) } - sample_data <- subset_ids(scdata_list[[sample_id]], sample_cell_ids) + scdata.sample <- subset_ids(scdata, cells_id.sample) minCellSize <- as.numeric(config$filterSettings$minCellSize) # extract plotting data of original data to return to plot slot later - plot_data <- get_bcranks_plot_data(sample_data, num_cells_to_downsample) + plot_data <- get_bcranks_plot_data(scdata.sample, num_cells_to_downsample) # Check if it is required to compute sensible values. From the function 'generate_default_values_cellSizeDistribution', it is expected # to get a list with two elements {minCellSize and binStep} if (exists("auto", where = config)) { if (as.logical(toupper(config$auto))) { - # config not really needed for this one (maybe later for threshold_low/high): - # HARDCODE Value. threshold_low + # config not really needed for this one (maybe later for threshold.low/high): + # HARDCODE Value. threshold.low # [ Parameter for function CalculateBarcodeInflections. Description: Ignore barcodes of rank below this threshold in inflection calculation] - threshold_low <- 1e2 - # If there are less cells than the value threshold_low, the function CalculateBarcodeInflections fails. So we need to handle by not removing any cells, that is, + threshold.low <- 1e2 + # If there are less cells than the value threshold.low, the function CalculateBarcodeInflections fails. So we need to handle by not removing any cells, that is, # consider the minCellSize as the minimun UMIs in the dataset. # This should be handled in a long-term by adding a different function for computing the default value. - if (ncol(sample_data) < threshold_low) { - minCellSize <- min(sample_data$nCount_RNA) + if (ncol(scdata.sample) < threshold.low) { + minCellSize <- min(scdata.sample$nCount_RNA) } else { - minCellSize <- generate_default_values_cellSizeDistribution(sample_data, config) + minCellSize <- generate_default_values_cellSizeDistribution(scdata.sample, config) } } } if (as.logical(toupper(config$enabled))) { - remaining_ids <- sample_data@meta.data$cells_id[sample_data$nCount_RNA >= minCellSize] + remaining_ids <- scdata.sample@meta.data$cells_id[scdata.sample$nCount_RNA >= minCellSize] } else { - remaining_ids <- sample_cell_ids + remaining_ids <- cells_id.sample } # update config @@ -65,15 +63,15 @@ filter_low_cellsize <- function(scdata_list, config, sample_id, cells_id, task_n guidata[[generate_gui_uuid(sample_id, task_name, 1)]] <- plot_data[["hist"]] # Populate with filter statistics filter_stats <- list( - before = calc_filter_stats(sample_data), - after = calc_filter_stats(subset_ids(sample_data, remaining_ids)) + before = calc_filter_stats(scdata.sample), + after = calc_filter_stats(subset_ids(scdata.sample, remaining_ids)) ) guidata[[generate_gui_uuid(sample_id, task_name, 3)]] <- filter_stats cells_id[[sample_id]] <- remaining_ids result <- list( - data = scdata_list, + data = scdata, new_ids = cells_id, config = config, plotData = guidata @@ -96,7 +94,7 @@ filter_low_cellsize <- function(scdata_list, config, sample_id, cells_id, task_n #' @importFrom magrittr %>% #' get_bcranks_plot_data <- function(sample_subset, nmax = 6000, is.cellsize = TRUE) { - set.seed(RANDOM_SEED) + set.seed(gem2s$random.seed) plot_data <- list() numis <- unname(sample_subset$nCount_RNA) ord <- order(numis, decreasing = TRUE) @@ -188,12 +186,12 @@ plot_knee_regions <- function(dt, thresh = 0.01) { # samples. generate_default_values_cellSizeDistribution <- function(scdata, config) { # `CalculateBarcodeInflections` including inflection point calculation - threshold_low <- if (ncol(scdata) <= 200) NULL else 100 + threshold.low <- if (ncol(scdata) <= 200) NULL else 100 scdata_tmp <- Seurat::CalculateBarcodeInflections( object = scdata, barcode.column = "nCount_RNA", group.column = "samples", - threshold.low = threshold_low + threshold.low = threshold.low ) # returned is both the rank(s) as well as inflection point # extracting only inflection point(s) diff --git a/pipeline-runner/R/qc-3-filter_high_mito.R b/pipeline-runner/R/qc-3-filter_high_mito.R index 0eae72d3..12413d9e 100644 --- a/pipeline-runner/R/qc-3-filter_high_mito.R +++ b/pipeline-runner/R/qc-3-filter_high_mito.R @@ -19,45 +19,45 @@ #' @export #' @return a list with the filtered seurat object by mitochondrial content, the config and the plot values #' -filter_high_mito <- function(scdata_list, config, sample_id, cells_id, task_name = "mitochondrialContent", num_cells_to_downsample = 6000) { - sample_cell_ids <- cells_id[[sample_id]] +filter_high_mito <- function(scdata, config, sample_id, cells_id, task_name = "mitochondrialContent", num_cells_to_downsample = 6000) { + cells_id.sample <- cells_id[[sample_id]] - if (length(sample_cell_ids) == 0) { - return(list(data = scdata_list[[sample_id]], new_ids = cells_id, config = config, plotData = list())) + if (length(cells_id.sample) == 0) { + return(list(data = scdata, new_ids = cells_id, config = config, plotData = list())) } - sample_data <- subset_ids(scdata_list[[sample_id]], sample_cell_ids) + scdata.sample <- subset_ids(scdata, cells_id.sample) # Check if the experiment has MT-content - if (!"percent.mt" %in% colnames(scdata_list[[sample_id]]@meta.data)) { + if (!"percent.mt" %in% colnames(scdata@meta.data)) { message("Warning! No MT-content has been computed for this experiment!") guidata <- list() - return(list(data = scdata_list[[sample_id]], config = config, plotData = guidata)) + return(list(data = scdata, config = config, plotData = guidata)) } - max_fraction <- config$filterSettings$methodSettings[[config$filterSettings$method]]$maxFraction + maxFraction <- config$filterSettings$methodSettings[[config$filterSettings$method]]$maxFraction - plot_data <- generate_mito_plot_data(sample_data) + plot_data <- generate_mito_plot_data(scdata.sample) if (exists("auto", where = config)) { if (as.logical(toupper(config$auto))) { - max_fraction <- generate_default_values_mitochondrialContent(sample_data, config) + maxFraction <- generate_default_values_mitochondrialContent(scdata.sample, config) } } if (as.logical(toupper(config$enabled))) { - remaining_ids <- sample_data@meta.data$cells_id[sample_data$percent.mt <= max_fraction * 100] + remaining_ids <- scdata.sample@meta.data$cells_id[scdata.sample$percent.mt <= maxFraction * 100] } else { - remaining_ids <- sample_cell_ids + remaining_ids <- cells_id.sample } - config$filterSettings$methodSettings[[config$filterSettings$method]]$maxFraction <- max_fraction + config$filterSettings$methodSettings[[config$filterSettings$method]]$maxFraction <- maxFraction # Downsample plotData - num_cells_to_downsample <- downsample_plotdata(ncol(sample_data), num_cells_to_downsample) + num_cells_to_downsample <- downsample_plotdata(ncol(scdata.sample), num_cells_to_downsample) - set.seed(RANDOM_SEED) - cells_position_to_keep <- sample(1:ncol(sample_data), num_cells_to_downsample, replace = FALSE) + set.seed(gem2s$random.seed) + cells_position_to_keep <- sample(1:ncol(scdata.sample), num_cells_to_downsample, replace = FALSE) cells_position_to_keep <- sort(cells_position_to_keep) plot1_data <- plot_data$plot1_data[cells_position_to_keep] plot2_data <- plot_data$plot2_data[cells_position_to_keep] @@ -71,8 +71,8 @@ filter_high_mito <- function(scdata_list, config, sample_id, cells_id, task_name # populate with filter statistics filter_stats <- list( - before = calc_filter_stats(sample_data), - after = calc_filter_stats(subset_ids(sample_data, remaining_ids)) + before = calc_filter_stats(scdata.sample), + after = calc_filter_stats(subset_ids(scdata.sample, remaining_ids)) ) guidata[[generate_gui_uuid(sample_id, task_name, 2)]] <- filter_stats @@ -80,7 +80,7 @@ filter_high_mito <- function(scdata_list, config, sample_id, cells_id, task_name cells_id[[sample_id]] <- remaining_ids result <- list( - data = scdata_list, + data = scdata, new_ids = cells_id, config = config, plotData = guidata diff --git a/pipeline-runner/R/qc-4-filter_gene_umi_outlier.R b/pipeline-runner/R/qc-4-filter_gene_umi_outlier.R index f475c6a1..041b6c8a 100644 --- a/pipeline-runner/R/qc-4-filter_gene_umi_outlier.R +++ b/pipeline-runner/R/qc-4-filter_gene_umi_outlier.R @@ -15,7 +15,7 @@ #' - linear and spline: for each there is only one element: #' - p.level: which refers to confidence level for deviation from the main trend #' -#' @param scdata_list list of \code{SeuratObject} +#' @param scdata \code{SeuratObject} #' @param sample_id value in \code{scdata$samples} to apply filter for #' @param task_name name of task: \code{'numGenesVsNumUmis'} #' @param num_cells_to_downsample maximum number of cells for returned plots @@ -24,34 +24,34 @@ #' @return a list with the filtered seurat object by numGenesVsNumUmis, the config and the plot values #' #' -filter_gene_umi_outlier <- function(scdata_list, config, sample_id, cells_id, task_name = "numGenesVsNumUmis", num_cells_to_downsample = 6000) { - sample_cell_ids <- cells_id[[sample_id]] +filter_gene_umi_outlier <- function(scdata, config, sample_id, cells_id, task_name = "numGenesVsNumUmis", num_cells_to_downsample = 6000) { + cells_id.sample <- cells_id[[sample_id]] - if (length(sample_cell_ids) == 0) { - return(list(data = scdata_list[[sample_id]], new_ids = cells_id, config = config, plotData = list())) + if (length(cells_id.sample) == 0) { + return(list(data = scdata, new_ids = cells_id, config = config, plotData = list())) } - sample_data <- subset_ids(scdata_list[[sample_id]], sample_cell_ids) + scdata.sample <- subset_ids(scdata, cells_id.sample) type <- config$filterSettings$regressionType - # get p_level and update in config + # get p.level and update in config # defaults from "gene.vs.molecule.cell.filter" in pagoda2 if (safeTRUE(config$auto)) - p_level <- min(0.001, 1 / ncol(sample_data)) + p.level <- min(0.001, 1 / ncol(scdata.sample)) else - p_level <- config$filterSettings$regressionTypeSettings[[type]]$p.level + p.level <- config$filterSettings$regressionTypeSettings[[type]]$p.level - p_level <- suppressWarnings(as.numeric(p_level)) - if(is.na(p_level)) stop("p_level couldn't be interpreted as a number.") + p.level <- suppressWarnings(as.numeric(p.level)) + if(is.na(p.level)) stop("p.level couldnt be interpreted as a number.") - config$filterSettings$regressionTypeSettings[[type]]$p.level <- p_level + config$filterSettings$regressionTypeSettings[[type]]$p.level <- p.level # regress log10 molecules vs genes fit.data <- data.frame( - log_molecules = log10(sample_data$nCount_RNA), - log_genes = log10(sample_data$nFeature_RNA), - row.names = sample_data$cells_id + log_molecules = log10(scdata.sample$nCount_RNA), + log_genes = log10(scdata.sample$nFeature_RNA), + row.names = scdata.sample$cells_id ) fit.data <- fit.data[order(fit.data$log_molecules), ] @@ -60,21 +60,21 @@ filter_gene_umi_outlier <- function(scdata_list, config, sample_id, cells_id, ta else fit <- MASS::rlm(log_genes ~ log_molecules, data = fit.data) if (safeTRUE(config$enabled)) { - # get the interval based on p_level parameter - preds <- suppressWarnings(predict(fit, interval = "prediction", level = 1 - p_level)) + # get the interval based on p.level parameter + preds <- suppressWarnings(predict(fit, interval = "prediction", level = 1 - p.level)) # filter outliers above/below cutoff bands is.outlier <- fit.data$log_genes > preds[, 'upr'] | fit.data$log_genes < preds[, 'lwr'] remaining_ids <- as.numeric(rownames(fit.data)[!is.outlier]) remaining_ids <- remaining_ids[order(remaining_ids)] } else { - remaining_ids <- sample_cell_ids + remaining_ids <- cells_id.sample } # downsample for plot data - nkeep <- downsample_plotdata(ncol(sample_data), num_cells_to_downsample) + nkeep <- downsample_plotdata(ncol(scdata.sample), num_cells_to_downsample) - set.seed(RANDOM_SEED) + set.seed(gem2s$random.seed) keep_rows <- sample(nrow(fit.data), nkeep) keep_rows <- sort(keep_rows) downsampled_data <- fit.data[keep_rows, ] @@ -82,7 +82,7 @@ filter_gene_umi_outlier <- function(scdata_list, config, sample_id, cells_id, ta # get evenly spaced predictions on downsampled data for plotting lines xrange <- range(downsampled_data$log_molecules) newdata <- data.frame(log_molecules = seq(xrange[1], xrange[2], length.out = 10)) - line_preds <- suppressWarnings(predict(fit, newdata, interval = "prediction", level = 1 - p_level)) + line_preds <- suppressWarnings(predict(fit, newdata, interval = "prediction", level = 1 - p.level)) line_preds <- cbind(newdata, line_preds) %>% dplyr::select(-fit) %>% @@ -95,8 +95,8 @@ filter_gene_umi_outlier <- function(scdata_list, config, sample_id, cells_id, ta # Populate with filter statistics and plot data filter_stats <- list( - before = calc_filter_stats(sample_data), - after = calc_filter_stats(subset_ids(sample_data, remaining_ids)) + before = calc_filter_stats(scdata.sample), + after = calc_filter_stats(subset_ids(scdata.sample, remaining_ids)) ) guidata <- list() @@ -106,7 +106,7 @@ filter_gene_umi_outlier <- function(scdata_list, config, sample_id, cells_id, ta cells_id[[sample_id]] <- remaining_ids result <- list( - data = scdata_list, + data = scdata, new_ids = cells_id, config = config, plotData = guidata diff --git a/pipeline-runner/R/qc-5-filter_doublets.R b/pipeline-runner/R/qc-5-filter_doublets.R index 044ac2df..b99bf9ce 100644 --- a/pipeline-runner/R/qc-5-filter_doublets.R +++ b/pipeline-runner/R/qc-5-filter_doublets.R @@ -17,47 +17,47 @@ #' @export #' @return a list with the filtered seurat object by doublet score, the config and the plot values #' -filter_doublets <- function(scdata_list, config, sample_id, cells_id, task_name = "doubletScores", num_cells_to_downsample = 6000) { - sample_cell_ids <- cells_id[[sample_id]] +filter_doublets <- function(scdata, config, sample_id, cells_id, task_name = "doubletScores", num_cells_to_downsample = 6000) { + cells_id.sample <- cells_id[[sample_id]] - if (length(sample_cell_ids) == 0) { - return(list(data = scdata_list[[sample_id]], new_ids = cells_id, config = config, plotData = list())) + if (length(cells_id.sample) == 0) { + return(list(data = scdata, new_ids = cells_id, config = config, plotData = list())) } - sample_data <- subset_ids(scdata_list[[sample_id]], sample_cell_ids) + scdata.sample <- subset_ids(scdata, cells_id.sample) # Check if the experiment has doubletScores - if (!"doublet_scores" %in% colnames(scdata_list[[sample_id]]@meta.data)) { + if (!"doublet_scores" %in% colnames(scdata@meta.data)) { message("Warning! No doubletScores scores has been computed for this experiment!") guidata <- list() - return(list(data = scdata_list[[sample_id]], config = config, plotData = guidata)) + return(list(data = scdata, config = config, plotData = guidata)) } - probability_threshold <- config$filterSettings[["probabilityThreshold"]] + probabilityThreshold <- config$filterSettings[["probabilityThreshold"]] # Check if it is required to compute sensible values. From the function 'generate_default_values_doubletScores', it is expected - # to get a probability threshold value + # to get a value --> probabilityThreshold. if (exists("auto", where = config)) { if (as.logical(toupper(config$auto))) { - probability_threshold <- generate_default_values_doubletScores(sample_data) + probabilityThreshold <- generate_default_values_doubletScores(scdata.sample) } } - plot1_data <- generate_doublets_plot_data(sample_data, num_cells_to_downsample) + plot1_data <- generate_doublets_plot_data(scdata.sample, num_cells_to_downsample) # Check whether the filter is set to true or false if (as.logical(toupper(config$enabled))) { # all barcodes that match threshold in the subset data # treat NA doublet scores as defacto singlets - doublet_scores <- sample_data$doublet_scores + doublet_scores <- scdata.sample$doublet_scores doublet_scores[is.na(doublet_scores)] <- 0 - remaining_ids <- sample_data@meta.data$cells_id[doublet_scores <= probability_threshold] + remaining_ids <- scdata.sample@meta.data$cells_id[doublet_scores <= probabilityThreshold] } else { - remaining_ids <- sample_cell_ids + remaining_ids <- cells_id.sample } # update config - config$filterSettings$probabilityThreshold <- probability_threshold + config$filterSettings$probabilityThreshold <- probabilityThreshold guidata <- list() @@ -66,8 +66,8 @@ filter_doublets <- function(scdata_list, config, sample_id, cells_id, task_name # populate with filter statistics filter_stats <- list( - before = calc_filter_stats(sample_data), - after = calc_filter_stats(subset_ids(sample_data, remaining_ids)) + before = calc_filter_stats(scdata.sample), + after = calc_filter_stats(subset_ids(scdata.sample, remaining_ids)) ) guidata[[generate_gui_uuid(sample_id, task_name, 1)]] <- filter_stats @@ -75,7 +75,7 @@ filter_doublets <- function(scdata_list, config, sample_id, cells_id, task_name cells_id[[sample_id]] <- remaining_ids result <- list( - data = scdata_list, + data = scdata, new_ids = cells_id, config = config, plotData = guidata @@ -85,8 +85,8 @@ filter_doublets <- function(scdata_list, config, sample_id, cells_id, task_name generate_default_values_doubletScores <- function(scdata) { # default doublet score based of scDblFinder classification - is_singlet <- scdata$doublet_class == "singlet" - threshold <- max(scdata$doublet_scores[is_singlet], na.rm = TRUE) + is.singlet <- scdata$doublet_class == "singlet" + threshold <- max(scdata$doublet_scores[is.singlet], na.rm = TRUE) return(threshold) } diff --git a/pipeline-runner/R/qc-6-integrate_scdata.R b/pipeline-runner/R/qc-6-integrate_scdata.R index 646d818b..38c1383a 100644 --- a/pipeline-runner/R/qc-6-integrate_scdata.R +++ b/pipeline-runner/R/qc-6-integrate_scdata.R @@ -20,41 +20,33 @@ # } # }, -integrate_scdata <- function(scdata_list, config, sample_id, cells_id, task_name = "dataIntegration") { - # subset each of the samples before merging them - for (sample in names(scdata_list)) { - flat_cell_ids <- unname(unlist(cells_id[[sample]])) - scdata_list[[sample]] <- subset_ids(scdata_list[[sample]], flat_cell_ids) - } - - message("Total cells: ", sum(sapply(scdata_list, ncol))) - scdata <- create_scdata(scdata_list) - +integrate_scdata <- function(scdata, config, sample_id, cells_id, task_name = "dataIntegration") { + flat_cells_id <- unname(unlist(cells_id)) + scdata <- subset_ids(scdata, flat_cells_id) # main function - set.seed(RANDOM_SEED) - message("running data integration") - scdata_integrated <- run_dataIntegration(scdata, config) + set.seed(gem2s$random.seed) + scdata.integrated <- run_dataIntegration(scdata, config) # get npcs from the UMAP call in integration functions - npcs <- length(scdata_integrated@commands$RunUMAP@params$dims) + npcs <- length(scdata.integrated@commands$RunUMAP@params$dims) message("\nSet config numPCs to npcs used in last UMAP call: ", npcs, "\n") config$dimensionalityReduction$numPCs <- npcs - var_explained <- get_explained_variance(scdata_integrated) + var_explained <- get_explained_variance(scdata.integrated) # This same numPCs will be used throughout the platform. - scdata_integrated@misc[["numPCs"]] <- config$dimensionalityReduction$numPCs + scdata.integrated@misc[["numPCs"]] <- config$dimensionalityReduction$numPCs - scdata_integrated <- colorObject(scdata_integrated) - cells_order <- rownames(scdata_integrated@meta.data) - plot1_data <- unname(purrr::map2(scdata_integrated@reductions$umap@cell.embeddings[, 1], scdata_integrated@reductions$umap@cell.embeddings[, 2], function(x, y) { + scdata.integrated <- colorObject(scdata.integrated) + cells_order <- rownames(scdata.integrated@meta.data) + plot1_data <- unname(purrr::map2(scdata.integrated@reductions$umap@cell.embeddings[, 1], scdata.integrated@reductions$umap@cell.embeddings[, 2], function(x, y) { c("x" = x, "y" = y) })) # Adding color and sample id plot1_data <- purrr::map2( plot1_data, - unname(scdata_integrated@meta.data[cells_order, "samples"]), + unname(scdata.integrated@meta.data[cells_order, "samples"]), function(x, y) { append(x, list("sample" = y)) } @@ -62,7 +54,7 @@ integrate_scdata <- function(scdata_list, config, sample_id, cells_id, task_name plot1_data <- purrr::map2( plot1_data, - unname(scdata_integrated@meta.data[cells_order, "color_samples"]), + unname(scdata.integrated@meta.data[cells_order, "color_samples"]), function(x, y) { append(x, list("col" = y)) } @@ -79,7 +71,7 @@ integrate_scdata <- function(scdata_list, config, sample_id, cells_id, task_name # the result object will have to conform to this format: {data, config, plotData : {plot1, plot2}} result <- list( - data = scdata_integrated, + data = scdata.integrated, new_ids = cells_id, config = config, plotData = plots @@ -88,47 +80,6 @@ integrate_scdata <- function(scdata_list, config, sample_id, cells_id, task_name return(result) } -#' Create the merged Seurat object -#' -#' This function takes care of merging the sample seurat objects, shuffling -#' and adding the metadata to the complete Seurat object. It does so by calling -#' the corresponding functions. -#' -#' @param scdata_list -#' -#' @return SeuratObject -#' @export -#' -create_scdata <- function(scdata_list) { - - merged_scdatas <- merge_scdata_list(scdata_list) - merged_scdatas <- add_metadata(merged_scdatas, scdata_list) - - return(merged_scdatas) -} - -#' Merge the list of sample Seurat objects -#' -#' If the list contains more than one seurat object, it merges them all. Else, -#' returns the only element. -#' -#' @param scdata_list list of SeuratObjects -#' -#' @return SeuratObject -#' @export -#' -merge_scdata_list <- function(scdata_list) { - - if (length(scdata_list) == 1) { - scdata <- scdata_list[[1]] - } else { - scdata <- merge(scdata_list[[1]], y = scdata_list[-1]) - } - - return(scdata) - -} - # This function covers # - Integrate the data using the variable "type" (in case of data integration method is selected) and normalize using LogNormalize method. # - Compute PCA analysis @@ -481,37 +432,3 @@ build_cc_gene_list <- function(all_genes) { return(cc_gene_indices) } - -#' Add the metadata present in scdata_list into the merged SeuratObject -#' -#' This function adds metadata, some of which is present in the sample SeuratObjects -#' (the experimentID, the gene annotations), and some that is not (the color pool) -#' to the merged SeuratObject. -#' -#' @param scdata SeuratObject -#' @param scdata_list list of sample SeuratObjects -#' -#' @return SeuratObject with added metadata -#' @export -#' -add_metadata <- function(scdata, scdata_list) { - # misc data is duplicated in each of the samples and it does not - # need to be merge so pick the data in the first one and add it to the merged dataset - scdata@misc <- scdata_list[[1]]@misc - experiment_id <- scdata_list[[1]]@misc[["experimentId"]] - - annot_list <- list() - for (sample in names(scdata_list)) { - annot_list[[sample]] <- scdata_list[[sample]]@misc[["gene_annotations"]] - } - - scdata@misc[["gene_annotations"]] <- format_annot(annot_list) - - # We store the color pool in a slot in order to be able to access it during configureEmbedding - scdata@misc[["color_pool"]] <- get_color_pool() - scdata@misc[["experimentId"]] <- experiment_id - scdata@misc[["ingestionDate"]] <- Sys.time() - - return(scdata) -} - diff --git a/pipeline-runner/R/qc-7-embed_and_cluster.R b/pipeline-runner/R/qc-7-embed_and_cluster.R index 709033de..572f6971 100644 --- a/pipeline-runner/R/qc-7-embed_and_cluster.R +++ b/pipeline-runner/R/qc-7-embed_and_cluster.R @@ -9,6 +9,7 @@ embed_and_cluster <- sample_id, cells_id, task_name = "configureEmbedding") { + message("starting clusters") clustering_method <- config$clusteringSettings$method methodSettings <- @@ -77,7 +78,6 @@ update_sets_through_api <- experiment_id, cell_set_key, auth_JWT) { - httr_query <- paste0("$[?(@.key == \"", cell_set_key, "\")]") httr::PATCH( diff --git a/pipeline-runner/R/qc-helpers.R b/pipeline-runner/R/qc-helpers.R index 4a9af388..dcc9b574 100644 --- a/pipeline-runner/R/qc-helpers.R +++ b/pipeline-runner/R/qc-helpers.R @@ -1,14 +1,15 @@ #' Title #' -#' @param scdata_list +#' @param scdata #' #' @return #' @export #' -generate_first_step_ids <- function(scdata_list) { +#' @examples +generate_first_step_ids <- function(scdata) { cells_id <- list() - for (sample_id in names(scdata_list)) { - cells_id[[sample_id]] <- scdata_list[[sample_id]]$cells_id + for (sample_id in unique(scdata$samples)) { + cells_id[[sample_id]] <- scdata$cells_id[scdata$samples == sample_id] } return(cells_id) } @@ -44,7 +45,7 @@ remove_cell_ids <- function(pipeline_config, experiment_id) { get_positions_to_keep <- function(scdata, num_cells_to_downsample) { # Downsample plotData num_cells_to_downsample <- downsample_plotdata(ncol(scdata), num_cells_to_downsample) - set.seed(RANDOM_SEED) + set.seed(gem2s$random.seed) cells_position_to_keep <- sample(1:ncol(scdata), num_cells_to_downsample, replace = FALSE) cells_position_to_keep <- sort(cells_position_to_keep) @@ -180,6 +181,7 @@ calc_filter_stats <- function(scdata) { #' @return #' @export #' +#' @examples runClusters <- function(clustering_method, resolution, data) { data <- getClusters(clustering_method, resolution, data) res_col <- paste0(data@active.assay, "_snn_res.", toString(resolution)) @@ -201,6 +203,7 @@ runClusters <- function(clustering_method, resolution, data) { #' @return #' @export #' +#' @examples getClusters <- function(clustering_method, resolution, data) { res_col <- paste0(data@active.assay, "_snn_res.", toString(resolution)) algorithm <- list("louvain" = 1, "leiden" = 4)[[clustering_method]] diff --git a/pipeline-runner/R/sysdata.rda b/pipeline-runner/R/sysdata.rda index d71d6e79..27434084 100644 Binary files a/pipeline-runner/R/sysdata.rda and b/pipeline-runner/R/sysdata.rda differ diff --git a/pipeline-runner/README.md b/pipeline-runner/README.md index c3f5c48d..3ae97f33 100644 --- a/pipeline-runner/README.md +++ b/pipeline-runner/README.md @@ -13,7 +13,7 @@ Task definitions for individual pipeline items are located under `R/`. Development ----------- -You can open the VS Code workspace in a development container through VS Code, +You can open the VS Code wrkspace in a development container through VS Code, which will start the container. How to execute tasks locally @@ -24,4 +24,4 @@ AWS environment that. Task execution is done through Step Function activities, which are created by other parts of the platform. You can use the project under `local-runner` -to inspect this process. +to inspect this process. diff --git a/pipeline-runner/data-raw/sysdata.R b/pipeline-runner/data-raw/sysdata.R index 10b1b2ac..0ee07487 100644 --- a/pipeline-runner/data-raw/sysdata.R +++ b/pipeline-runner/data-raw/sysdata.R @@ -2,11 +2,10 @@ gem2s <- list( max.edrops.fdr = 0.001, max.empty.counts = 100, - max.empty.drops = 50 + max.empty.drops = 50, + random.seed = 42 ) -RANDOM_SEED <- 42 - # path where dump/log files are saved # mounted as a volume outside container to local-runner/debug DEBUG_PATH <- "/debug" @@ -37,7 +36,6 @@ IDS_IDS <- "ids_ids" usethis::use_data( gem2s, - RANDOM_SEED, DEBUG_PATH, file_names, file_types_by_technology, diff --git a/pipeline-runner/init.R b/pipeline-runner/init.R index eef3ab81..c77ceba8 100644 --- a/pipeline-runner/init.R +++ b/pipeline-runner/init.R @@ -217,7 +217,7 @@ call_data_processing <- function(task_name, input, pipeline_config) { 'configureEmbedding' = embed_and_cluster ) - # need this for embed_and_cluster + #need this for embed_and_cluster config$api_url <- pipeline_config$api_url config$auth_JWT <- input$authJWT @@ -226,7 +226,7 @@ call_data_processing <- function(task_name, input, pipeline_config) { # assign it to the global environment so we can # persist it across runs of the wrapper - assign("scdata", reload_data_from_s3(pipeline_config, experiment_id, task_name, tasks), pos = ".GlobalEnv") + assign("scdata", reload_scdata_from_s3(pipeline_config, experiment_id, task_name, tasks), pos = ".GlobalEnv") message("Single-cell data loaded.") } @@ -236,7 +236,7 @@ call_data_processing <- function(task_name, input, pipeline_config) { if(task_name == names(tasks)[1]){ assign("cells_id", generate_first_step_ids(scdata), pos = ".GlobalEnv") }else if(task_name %in% names(tasks)){ - samples <- names(scdata) + samples <- unique(scdata$samples) assign("cells_id", load_cells_id_from_s3(pipeline_config, experiment_id, task_name, tasks, samples), pos = ".GlobalEnv") }else{ stop("Invalid task name given: ", task_name) @@ -246,16 +246,19 @@ call_data_processing <- function(task_name, input, pipeline_config) { # call function to run and update global variable c( - data, new_ids,...rest_of_results + data,new_ids,...rest_of_results ) %<-% run_processing_step(scdata, config, tasks, task_name, cells_id, sample_id, debug_config) + message("Comparison between cell ids") + message("Old ids length ",length(cells_id[[sample_id]])) + message("New ids length ",length(new_ids[[sample_id]])) + assign("cells_id", new_ids, pos = ".GlobalEnv") task_names <- names(tasks) integration_index <- match("dataIntegration", task_names) task_index <- match(task_name, task_names) if(task_index < integration_index){ - message("Filtered cell ids from ", length(cells_id[[sample_id]]), " to ", length(new_ids[[sample_id]])) next_task <- names(tasks)[[task_index+1]] object_key <- paste0(experiment_id,"/",next_task,"/",sample_id,".rds") upload_cells_id(pipeline_config,object_key,cells_id) @@ -293,7 +296,7 @@ call_data_processing <- function(task_name, input, pipeline_config) { # start_heartbeat <- function(task_token, aws_config) { library(tryCatchLog) - message("Starting heartbeat") + message("Starting hearbeat") states <- paws::sfn(config=aws_config) keep_running <- TRUE @@ -313,7 +316,7 @@ start_heartbeat <- function(task_token, aws_config) { keep_running <- FALSE }) i <- i + 1 - # sleep until next heartbeat + # sleep until next hearbeat Sys.sleep(wait_time) } @@ -380,7 +383,6 @@ init <- function() { # parse data from state machine input input <- RJSONIO::fromJSON(input_json, simplify = FALSE) - message('Input json: ', input) # save logs to file debug_prefix <- file.path(input$experimentId, debug_timestamp) diff --git a/pipeline-runner/man/add_metadata.Rd b/pipeline-runner/man/add_metadata.Rd deleted file mode 100644 index 297d6808..00000000 --- a/pipeline-runner/man/add_metadata.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/qc-6-integrate_scdata.R -\name{add_metadata} -\alias{add_metadata} -\title{Add the metadata present in scdata_list into the merged SeuratObject} -\usage{ -add_metadata(scdata, scdata_list) -} -\arguments{ -\item{scdata}{SeuratObject} - -\item{scdata_list}{list of sample SeuratObjects} -} -\value{ -SeuratObject with added metadata -} -\description{ -This function adds metadata, some of which is present in the sample SeuratObjects -(the experimentID, the gene annotations), and some that is not (the color pool) -to the merged SeuratObject. -} diff --git a/pipeline-runner/man/build_metadata_cellsets.Rd b/pipeline-runner/man/build_metadata_cellsets.Rd deleted file mode 100644 index 849295c3..00000000 --- a/pipeline-runner/man/build_metadata_cellsets.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gem2s-7-upload_to_aws.R -\name{build_metadata_cellsets} -\alias{build_metadata_cellsets} -\title{Create cellsets from user-supplied metadata} -\usage{ -build_metadata_cellsets(input, scdata_list, color_pool) -} -\arguments{ -\item{input}{list} - -\item{scdata_list}{list of Seurat objects} - -\item{color_pool}{list of colors to use} -} -\value{ -list of cellsets -} -\description{ -This function creates the cellsets for the user-supplied metadata (in data -management module). It also takes care of assigning a color from the color pool -to each cellset. -} diff --git a/pipeline-runner/man/construct_metadata.Rd b/pipeline-runner/man/construct_metadata.Rd deleted file mode 100644 index 71d6de4d..00000000 --- a/pipeline-runner/man/construct_metadata.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gem2s-5-create_seurat.R -\name{construct_metadata} -\alias{construct_metadata} -\title{Construct metadata for each SeuratObject} -\usage{ -construct_metadata(counts, sample, config) -} -\arguments{ -\item{counts}{count matrix} - -\item{sample}{character sample ID} - -\item{config}{list containing experiment config} -} -\value{ -data.frame of sample metadata -} -\description{ -This function creates a \code{data.frame} with the barcodes, sampleIDs and user supplied -metadata that corresponds to each one. -} diff --git a/pipeline-runner/man/construct_qc_config.Rd b/pipeline-runner/man/construct_qc_config.Rd deleted file mode 100644 index e01390aa..00000000 --- a/pipeline-runner/man/construct_qc_config.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gem2s-6-construct_qc_config.R -\name{construct_qc_config} -\alias{construct_qc_config} -\title{Constructs default QC configuration} -\usage{ -construct_qc_config(scdata_list, any_filtered) -} -\arguments{ -\item{scdata_list}{list of seurat objects} - -\item{any_filtered}{boolean indicating if barcodes were filtered by the emptyDrops.} -} -\value{ -list of QC configuration parameters -} -\description{ -This function returns the default parameters used during QC as a nested list. -It is sent to the API, which in turn saves it as a jsonb object in the PostgreSQL -database. -} diff --git a/pipeline-runner/man/create_scdata.Rd b/pipeline-runner/man/create_scdata.Rd deleted file mode 100644 index 64aa32ca..00000000 --- a/pipeline-runner/man/create_scdata.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/qc-6-integrate_scdata.R -\name{create_scdata} -\alias{create_scdata} -\title{Create the merged Seurat object} -\usage{ -create_scdata(scdata_list) -} -\arguments{ -\item{scdata_list}{} -} -\value{ -SeuratObject -} -\description{ -This function takes care of merging the sample seurat objects, shuffling -and adding the metadata to the complete Seurat object. It does so by calling -the corresponding functions. -} diff --git a/pipeline-runner/man/filter_doublets.Rd b/pipeline-runner/man/filter_doublets.Rd index 03a66c72..7e2f5081 100644 --- a/pipeline-runner/man/filter_doublets.Rd +++ b/pipeline-runner/man/filter_doublets.Rd @@ -5,7 +5,7 @@ \title{STEP 5. Doublet score filter} \usage{ filter_doublets( - scdata_list, + scdata, config, sample_id, cells_id, diff --git a/pipeline-runner/man/filter_gene_umi_outlier.Rd b/pipeline-runner/man/filter_gene_umi_outlier.Rd index b017af9f..8ee4dbe1 100644 --- a/pipeline-runner/man/filter_gene_umi_outlier.Rd +++ b/pipeline-runner/man/filter_gene_umi_outlier.Rd @@ -5,7 +5,7 @@ \title{STEP 4. Number of genes vs UMIs filter} \usage{ filter_gene_umi_outlier( - scdata_list, + scdata, config, sample_id, cells_id, @@ -14,7 +14,7 @@ filter_gene_umi_outlier( ) } \arguments{ -\item{scdata_list}{list of \code{SeuratObject}} +\item{scdata}{\code{SeuratObject}} \item{config}{list containing the following information - enable: true/false. Refering to apply or not the filter. diff --git a/pipeline-runner/man/filter_high_mito.Rd b/pipeline-runner/man/filter_high_mito.Rd index 06ba6bd1..ee3009fc 100644 --- a/pipeline-runner/man/filter_high_mito.Rd +++ b/pipeline-runner/man/filter_high_mito.Rd @@ -5,7 +5,7 @@ \title{STEP 3. Mitochondrial content filter} \usage{ filter_high_mito( - scdata_list, + scdata, config, sample_id, cells_id, diff --git a/pipeline-runner/man/filter_low_cellsize.Rd b/pipeline-runner/man/filter_low_cellsize.Rd index 62fa54f7..cf2c5447 100644 --- a/pipeline-runner/man/filter_low_cellsize.Rd +++ b/pipeline-runner/man/filter_low_cellsize.Rd @@ -5,7 +5,7 @@ \title{STEP 2. Cell size distribution filter} \usage{ filter_low_cellsize( - scdata_list, + scdata, config, sample_id, cells_id, @@ -29,9 +29,7 @@ a list with the filtered seurat object by cell size ditribution, the config and cell_size_distribution_filter function } \details{ -This is a simplest filter that looks at the shape of the cell size (# of UMIs -per cell) distribution and looks for some local minima, minimum of second derivative, etc. -To separate the large cell barcodes that correspond to real cells from the -tail containing 'empty droplets'. This can be a useful first guess. The settings -for such a filter can also contain a simple "min cell size" setting. +This is a simplest filter that looks at the shape of the cell size (# of UMIs per cell) distribution and looks for some local minima, minimum of second derivative, etc. +To separate the large cell barcodes that correspond to real cells from the tail containing 'empty droplets'. +This can be a useful first guess. The settings for such a filter can also contain a simple "min cell size" setting. } diff --git a/pipeline-runner/man/filter_unnamed_features.Rd b/pipeline-runner/man/filter_unnamed_features.Rd index 0cc07f57..42c389cc 100644 --- a/pipeline-runner/man/filter_unnamed_features.Rd +++ b/pipeline-runner/man/filter_unnamed_features.Rd @@ -13,9 +13,6 @@ filter_unnamed_features(counts, annotations, sample) \item{sample}{character specifying current sample} } -\value{ -list of counts and annotations -} \description{ This function checks if genes whose annotations in the count matrix (\code{rownames(counts)}) are empty, can be annotated with a different column from the features file, diff --git a/pipeline-runner/man/generate_first_step_ids.Rd b/pipeline-runner/man/generate_first_step_ids.Rd index a7cd6149..bd2cc445 100644 --- a/pipeline-runner/man/generate_first_step_ids.Rd +++ b/pipeline-runner/man/generate_first_step_ids.Rd @@ -4,10 +4,10 @@ \alias{generate_first_step_ids} \title{Title} \usage{ -generate_first_step_ids(scdata_list) +generate_first_step_ids(scdata) } \arguments{ -\item{scdata_list}{} +\item{scdata}{} } \description{ Title diff --git a/pipeline-runner/man/get_gem2s_file_v1.Rd b/pipeline-runner/man/get_gem2s_file_v1.Rd new file mode 100644 index 00000000..d6b8f504 --- /dev/null +++ b/pipeline-runner/man/get_gem2s_file_v1.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gem2s-1-download_user_files.R +\name{get_gem2s_file_v1} +\alias{get_gem2s_file_v1} +\title{Download user files from S3} +\usage{ +get_gem2s_file_v1(input, originals_bucket, input_dir, s3) +} +\arguments{ +\item{input}{The input params received from the api} + +\item{originals_bucket}{The bucket where the sample files are} + +\item{input_dir}{A string, where the s3 object is going to be stored} + +\item{s3}{S3 aws client} +} +\description{ +Download user files from S3 +} diff --git a/pipeline-runner/man/get_gem2s_file.Rd b/pipeline-runner/man/get_gem2s_file_v2.Rd similarity index 81% rename from pipeline-runner/man/get_gem2s_file.Rd rename to pipeline-runner/man/get_gem2s_file_v2.Rd index f1baade9..2e844c45 100644 --- a/pipeline-runner/man/get_gem2s_file.Rd +++ b/pipeline-runner/man/get_gem2s_file_v2.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/gem2s-1-download_user_files.R -\name{get_gem2s_file} -\alias{get_gem2s_file} +\name{get_gem2s_file_v2} +\alias{get_gem2s_file_v2} \title{Download user files from S3} \usage{ -get_gem2s_file(input, originals_bucket, input_dir, s3) +get_gem2s_file_v2(input, originals_bucket, input_dir, s3) } \arguments{ \item{input}{The input params received from the api} diff --git a/pipeline-runner/man/ids_to_sym.Rd b/pipeline-runner/man/ids_to_sym.Rd index c22ec8d0..be0bbbcd 100644 --- a/pipeline-runner/man/ids_to_sym.Rd +++ b/pipeline-runner/man/ids_to_sym.Rd @@ -11,9 +11,6 @@ ids_to_sym(sample_annot, annot_with_ids) \item{annot_with_ids}{data.frame of annotations with IDs. Key data.frame} } -\value{ -data.frame of annotations -} \description{ This function tries to convert IDs to symbols using the key data.frame created with \code{make_annot_with_ids}. In case there's no symbol available, it keeps the diff --git a/pipeline-runner/man/merge_scdata_list.Rd b/pipeline-runner/man/merge_scdata_list.Rd deleted file mode 100644 index 124b8fe3..00000000 --- a/pipeline-runner/man/merge_scdata_list.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/qc-6-integrate_scdata.R -\name{merge_scdata_list} -\alias{merge_scdata_list} -\title{Merge the list of sample Seurat objects} -\usage{ -merge_scdata_list(scdata_list) -} -\arguments{ -\item{scdata_list}{list of SeuratObjects} -} -\value{ -SeuratObject -} -\description{ -If the list contains more than one seurat object, it merges them all. Else, -returns the only element. -} diff --git a/pipeline-runner/man/merge_scdatas.Rd b/pipeline-runner/man/merge_scdatas.Rd new file mode 100644 index 00000000..d96a9f07 --- /dev/null +++ b/pipeline-runner/man/merge_scdatas.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gem2s-6-prepare_experiment.R +\name{merge_scdatas} +\alias{merge_scdatas} +\title{Merge scdatas: merge rds files in input list} +\usage{ +merge_scdatas(scdata_list) +} +\arguments{ +\item{scdata_list}{} +} +\description{ +Merge scdatas: merge rds files in input list +} diff --git a/pipeline-runner/man/sym_to_ids.Rd b/pipeline-runner/man/sym_to_ids.Rd index 8c82d8b8..d99e5246 100644 --- a/pipeline-runner/man/sym_to_ids.Rd +++ b/pipeline-runner/man/sym_to_ids.Rd @@ -11,9 +11,6 @@ sym_to_ids(sample_annot, annot_with_ids) \item{annot_with_ids}{data.frame of annotations with IDs. Key data.frame} } -\value{ -data.frame of annotations -} \description{ This function tries to convert symbols to ids using the key data.frame created with \code{make_annot_with_ids}. In case there's no ID available, it keeps the diff --git a/pipeline-runner/start.sh b/pipeline-runner/start.sh deleted file mode 100755 index 9b7c13e2..00000000 --- a/pipeline-runner/start.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -mprof run -C -o /debug/mprof_$RANDOM.dat Rscript init.R \ No newline at end of file diff --git a/pipeline-runner/tests/testthat/_snaps/gem2s-6-prepare_experiment.md b/pipeline-runner/tests/testthat/_snaps/gem2s-6-prepare_experiment.md index 79e0230f..4afa8845 100644 --- a/pipeline-runner/tests/testthat/_snaps/gem2s-6-prepare_experiment.md +++ b/pipeline-runner/tests/testthat/_snaps/gem2s-6-prepare_experiment.md @@ -72,14 +72,14 @@ .. .. ..$ regressionType : chr "linear" .. .. ..$ regressionTypeSettings:List of 2 .. .. .. ..$ linear:List of 1 - .. .. .. .. ..$ p.level: num 0.000131 + .. .. .. .. ..$ p.level: num 0.00013 .. .. .. ..$ spline:List of 1 .. .. .. .. ..$ p.level: num 0.001 .. ..$ defaultFilterSettings:List of 2 .. .. ..$ regressionType : chr "linear" .. .. ..$ regressionTypeSettings:List of 2 .. .. .. ..$ linear:List of 1 - .. .. .. .. ..$ p.level: num 0.000131 + .. .. .. .. ..$ p.level: num 0.00013 .. .. .. ..$ spline:List of 1 .. .. .. .. ..$ p.level: num 0.001 $ doubletScores :List of 4 @@ -126,7 +126,7 @@ .. .. .. ..$ distanceMetric : chr "cosine" .. .. ..$ tsne:List of 2 .. .. .. ..$ perplexity : num 30 - .. .. .. ..$ learningRate: num 200 + .. .. .. ..$ learningRate: num 640 ..$ clusteringSettings:List of 2 .. ..$ method : chr "louvain" .. ..$ methodSettings:List of 1 diff --git a/pipeline-runner/tests/testthat/_snaps/gem2s-7-upload_to_aws.md b/pipeline-runner/tests/testthat/_snaps/gem2s-7-upload_to_aws.md index 58dab042..491ca021 100644 --- a/pipeline-runner/tests/testthat/_snaps/gem2s-7-upload_to_aws.md +++ b/pipeline-runner/tests/testthat/_snaps/gem2s-7-upload_to_aws.md @@ -15,22 +15,17 @@ .. ..$ key : chr "sample" .. ..$ name : chr "Samples" .. ..$ rootNode: logi TRUE - .. ..$ children:List of 3 + .. ..$ children:List of 2 .. .. ..$ :List of 4 .. .. .. ..$ key : chr "123abc" - .. .. .. ..$ name : chr "a" + .. .. .. ..$ name : chr "WT1" .. .. .. ..$ color : chr "#77aadd" - .. .. .. ..$ cellIds: num [1:7576] 18751 21655 9288 1250 15504 ... + .. .. .. ..$ cellIds: int [1:40] 0 1 2 3 4 5 6 7 8 9 ... .. .. ..$ :List of 4 .. .. .. ..$ key : chr "123def" - .. .. .. ..$ name : chr "b" + .. .. .. ..$ name : chr "WT2" .. .. .. ..$ color : chr "#ee8866" - .. .. .. ..$ cellIds: num [1:7576] 10944 12038 18839 9628 9767 ... - .. .. ..$ :List of 4 - .. .. .. ..$ key : chr "123ghi" - .. .. .. ..$ name : chr "c" - .. .. .. ..$ color : chr "#eedd88" - .. .. .. ..$ cellIds: num [1:7576] 2048 9385 17231 3431 20163 ... + .. .. .. ..$ cellIds: int [1:40] 40 41 42 43 44 45 46 47 48 49 ... .. ..$ type : chr "metadataCategorical" # get_cell_sets with two metadata groups matches snapshot @@ -50,22 +45,17 @@ .. ..$ key : chr "sample" .. ..$ name : chr "Samples" .. ..$ rootNode: logi TRUE - .. ..$ children:List of 3 + .. ..$ children:List of 2 .. .. ..$ :List of 4 .. .. .. ..$ key : chr "123abc" - .. .. .. ..$ name : chr "a" + .. .. .. ..$ name : chr "WT1" .. .. .. ..$ color : chr "#77aadd" - .. .. .. ..$ cellIds: num [1:7599] 18751 21655 9288 1250 15504 ... + .. .. .. ..$ cellIds: int [1:40] 0 1 2 3 4 5 6 7 8 9 ... .. .. ..$ :List of 4 .. .. .. ..$ key : chr "123def" - .. .. .. ..$ name : chr "b" + .. .. .. ..$ name : chr "WT2" .. .. .. ..$ color : chr "#ee8866" - .. .. .. ..$ cellIds: num [1:7599] 21991 11657 20196 4364 2677 ... - .. .. ..$ :List of 4 - .. .. .. ..$ key : chr "123ghi" - .. .. .. ..$ name : chr "c" - .. .. .. ..$ color : chr "#eedd88" - .. .. .. ..$ cellIds: num [1:7599] 19675 19816 13514 5567 11107 ... + .. .. .. ..$ cellIds: int [1:40] 40 41 42 43 44 45 46 47 48 49 ... .. ..$ type : chr "metadataCategorical" ..$ :List of 5 .. ..$ key : chr "Group1" @@ -75,428 +65,23 @@ .. .. ..$ :List of 4 .. .. .. ..$ key : chr "Group1-Hello" .. .. .. ..$ name : chr "Hello" - .. .. .. ..$ color : chr "#ffaabb" - .. .. .. ..$ cellIds:List of 7599 - .. .. .. .. ..$ : num 18751 - .. .. .. .. ..$ : num 21655 - .. .. .. .. ..$ : num 9288 - .. .. .. .. ..$ : num 1250 - .. .. .. .. ..$ : num 15504 - .. .. .. .. ..$ : num 8824 - .. .. .. .. ..$ : num 10287 - .. .. .. .. ..$ : num 13438 - .. .. .. .. ..$ : num 14358 - .. .. .. .. ..$ : num 16738 - .. .. .. .. ..$ : num 7698 - .. .. .. .. ..$ : num 3952 - .. .. .. .. ..$ : num 9089 - .. .. .. .. ..$ : num 5401 - .. .. .. .. ..$ : num 930 - .. .. .. .. ..$ : num 22019 - .. .. .. .. ..$ : num 15570 - .. .. .. .. ..$ : num 20384 - .. .. .. .. ..$ : num 12134 - .. .. .. .. ..$ : num 257 - .. .. .. .. ..$ : num 21816 - .. .. .. .. ..$ : num 479 - .. .. .. .. ..$ : num 13608 - .. .. .. .. ..$ : num 7324 - .. .. .. .. ..$ : num 2452 - .. .. .. .. ..$ : num 9172 - .. .. .. .. ..$ : num 7787 - .. .. .. .. ..$ : num 21850 - .. .. .. .. ..$ : num 22723 - .. .. .. .. ..$ : num 18656 - .. .. .. .. ..$ : num 2550 - .. .. .. .. ..$ : num 16304 - .. .. .. .. ..$ : num 11617 - .. .. .. .. ..$ : num 11222 - .. .. .. .. ..$ : num 17109 - .. .. .. .. ..$ : num 943 - .. .. .. .. ..$ : num 11507 - .. .. .. .. ..$ : num 17008 - .. .. .. .. ..$ : num 4356 - .. .. .. .. ..$ : num 12416 - .. .. .. .. ..$ : num 17778 - .. .. .. .. ..$ : num 21505 - .. .. .. .. ..$ : num 12660 - .. .. .. .. ..$ : num 15763 - .. .. .. .. ..$ : num 2816 - .. .. .. .. ..$ : num 9205 - .. .. .. .. ..$ : num 14180 - .. .. .. .. ..$ : num 12936 - .. .. .. .. ..$ : num 14374 - .. .. .. .. ..$ : num 515 - .. .. .. .. ..$ : num 22761 - .. .. .. .. ..$ : num 101 - .. .. .. .. ..$ : num 5346 - .. .. .. .. ..$ : num 10347 - .. .. .. .. ..$ : num 9543 - .. .. .. .. ..$ : num 18075 - .. .. .. .. ..$ : num 4170 - .. .. .. .. ..$ : num 15314 - .. .. .. .. ..$ : num 5895 - .. .. .. .. ..$ : num 5609 - .. .. .. .. ..$ : num 20001 - .. .. .. .. ..$ : num 10971 - .. .. .. .. ..$ : num 14 - .. .. .. .. ..$ : num 13667 - .. .. .. .. ..$ : num 15340 - .. .. .. .. ..$ : num 21827 - .. .. .. .. ..$ : num 986 - .. .. .. .. ..$ : num 15988 - .. .. .. .. ..$ : num 7808 - .. .. .. .. ..$ : num 8272 - .. .. .. .. ..$ : num 14230 - .. .. .. .. ..$ : num 14191 - .. .. .. .. ..$ : num 15718 - .. .. .. .. ..$ : num 15285 - .. .. .. .. ..$ : num 2342 - .. .. .. .. ..$ : num 147 - .. .. .. .. ..$ : num 7138 - .. .. .. .. ..$ : num 22071 - .. .. .. .. ..$ : num 16482 - .. .. .. .. ..$ : num 2344 - .. .. .. .. ..$ : num 2448 - .. .. .. .. ..$ : num 89 - .. .. .. .. ..$ : num 7347 - .. .. .. .. ..$ : num 21556 - .. .. .. .. ..$ : num 4814 - .. .. .. .. ..$ : num 5876 - .. .. .. .. ..$ : num 14394 - .. .. .. .. ..$ : num 15303 - .. .. .. .. ..$ : num 14711 - .. .. .. .. ..$ : num 6780 - .. .. .. .. ..$ : num 8617 - .. .. .. .. ..$ : num 5230 - .. .. .. .. ..$ : num 21062 - .. .. .. .. ..$ : num 7451 - .. .. .. .. ..$ : num 17333 - .. .. .. .. ..$ : num 8100 - .. .. .. .. ..$ : num 9727 - .. .. .. .. ..$ : num 5259 - .. .. .. .. ..$ : num 18307 - .. .. .. .. .. [list output truncated] + .. .. .. ..$ color : chr "#eedd88" + .. .. .. ..$ cellIds: int [1:40] 0 1 2 3 4 5 6 7 8 9 ... .. .. ..$ :List of 4 .. .. .. ..$ key : chr "Group1-WT2" .. .. .. ..$ name : chr "WT2" - .. .. .. ..$ color : chr "#99ddff" - .. .. .. ..$ cellIds:List of 15198 - .. .. .. .. ..$ : num 21991 - .. .. .. .. ..$ : num 11657 - .. .. .. .. ..$ : num 20196 - .. .. .. .. ..$ : num 4364 - .. .. .. .. ..$ : num 2677 - .. .. .. .. ..$ : num 20619 - .. .. .. .. ..$ : num 17193 - .. .. .. .. ..$ : num 7650 - .. .. .. .. ..$ : num 14391 - .. .. .. .. ..$ : num 8150 - .. .. .. .. ..$ : num 20461 - .. .. .. .. ..$ : num 2771 - .. .. .. .. ..$ : num 17019 - .. .. .. .. ..$ : num 7033 - .. .. .. .. ..$ : num 9250 - .. .. .. .. ..$ : num 13853 - .. .. .. .. ..$ : num 22446 - .. .. .. .. ..$ : num 18282 - .. .. .. .. ..$ : num 7177 - .. .. .. .. ..$ : num 20255 - .. .. .. .. ..$ : num 5019 - .. .. .. .. ..$ : num 20055 - .. .. .. .. ..$ : num 21369 - .. .. .. .. ..$ : num 16006 - .. .. .. .. ..$ : num 16904 - .. .. .. .. ..$ : num 16221 - .. .. .. .. ..$ : num 4105 - .. .. .. .. ..$ : num 18140 - .. .. .. .. ..$ : num 7521 - .. .. .. .. ..$ : num 17704 - .. .. .. .. ..$ : num 1413 - .. .. .. .. ..$ : num 5295 - .. .. .. .. ..$ : num 15210 - .. .. .. .. ..$ : num 7350 - .. .. .. .. ..$ : num 10544 - .. .. .. .. ..$ : num 4485 - .. .. .. .. ..$ : num 10344 - .. .. .. .. ..$ : num 1949 - .. .. .. .. ..$ : num 1926 - .. .. .. .. ..$ : num 20240 - .. .. .. .. ..$ : num 12413 - .. .. .. .. ..$ : num 13960 - .. .. .. .. ..$ : num 21303 - .. .. .. .. ..$ : num 5476 - .. .. .. .. ..$ : num 11969 - .. .. .. .. ..$ : num 6749 - .. .. .. .. ..$ : num 21412 - .. .. .. .. ..$ : num 10044 - .. .. .. .. ..$ : num 19445 - .. .. .. .. ..$ : num 18933 - .. .. .. .. ..$ : num 11149 - .. .. .. .. ..$ : num 17046 - .. .. .. .. ..$ : num 20439 - .. .. .. .. ..$ : num 830 - .. .. .. .. ..$ : num 14644 - .. .. .. .. ..$ : num 7237 - .. .. .. .. ..$ : num 9326 - .. .. .. .. ..$ : num 2106 - .. .. .. .. ..$ : num 3342 - .. .. .. .. ..$ : num 14931 - .. .. .. .. ..$ : num 819 - .. .. .. .. ..$ : num 11146 - .. .. .. .. ..$ : num 10467 - .. .. .. .. ..$ : num 15377 - .. .. .. .. ..$ : num 6594 - .. .. .. .. ..$ : num 6658 - .. .. .. .. ..$ : num 3779 - .. .. .. .. ..$ : num 22146 - .. .. .. .. ..$ : num 6656 - .. .. .. .. ..$ : num 14934 - .. .. .. .. ..$ : num 1587 - .. .. .. .. ..$ : num 4455 - .. .. .. .. ..$ : num 22744 - .. .. .. .. ..$ : num 466 - .. .. .. .. ..$ : num 11876 - .. .. .. .. ..$ : num 497 - .. .. .. .. ..$ : num 9673 - .. .. .. .. ..$ : num 7232 - .. .. .. .. ..$ : num 10792 - .. .. .. .. ..$ : num 19752 - .. .. .. .. ..$ : num 3965 - .. .. .. .. ..$ : num 8300 - .. .. .. .. ..$ : num 17940 - .. .. .. .. ..$ : num 17826 - .. .. .. .. ..$ : num 15608 - .. .. .. .. ..$ : num 18458 - .. .. .. .. ..$ : num 19348 - .. .. .. .. ..$ : num 1603 - .. .. .. .. ..$ : num 21975 - .. .. .. .. ..$ : num 644 - .. .. .. .. ..$ : num 20825 - .. .. .. .. ..$ : num 1961 - .. .. .. .. ..$ : num 14486 - .. .. .. .. ..$ : num 13703 - .. .. .. .. ..$ : num 11859 - .. .. .. .. ..$ : num 12673 - .. .. .. .. ..$ : num 4241 - .. .. .. .. ..$ : num 7036 - .. .. .. .. ..$ : num 15272 - .. .. .. .. .. [list output truncated] + .. .. .. ..$ color : chr "#ffaabb" + .. .. .. ..$ cellIds: int [1:40] 40 41 42 43 44 45 46 47 48 49 ... .. ..$ type : chr "metadataCategorical" ..$ :List of 5 .. ..$ key : chr "Group2" .. ..$ name : chr "Group2" .. ..$ rootNode: logi TRUE - .. ..$ children:List of 2 + .. ..$ children:List of 1 .. .. ..$ :List of 4 .. .. .. ..$ key : chr "Group2-WT" .. .. .. ..$ name : chr "WT" - .. .. .. ..$ color : chr "#44bb99" - .. .. .. ..$ cellIds:List of 15198 - .. .. .. .. ..$ : num 18751 - .. .. .. .. ..$ : num 21655 - .. .. .. .. ..$ : num 9288 - .. .. .. .. ..$ : num 1250 - .. .. .. .. ..$ : num 15504 - .. .. .. .. ..$ : num 8824 - .. .. .. .. ..$ : num 10287 - .. .. .. .. ..$ : num 13438 - .. .. .. .. ..$ : num 14358 - .. .. .. .. ..$ : num 16738 - .. .. .. .. ..$ : num 7698 - .. .. .. .. ..$ : num 3952 - .. .. .. .. ..$ : num 9089 - .. .. .. .. ..$ : num 5401 - .. .. .. .. ..$ : num 930 - .. .. .. .. ..$ : num 22019 - .. .. .. .. ..$ : num 15570 - .. .. .. .. ..$ : num 20384 - .. .. .. .. ..$ : num 12134 - .. .. .. .. ..$ : num 257 - .. .. .. .. ..$ : num 21816 - .. .. .. .. ..$ : num 479 - .. .. .. .. ..$ : num 13608 - .. .. .. .. ..$ : num 7324 - .. .. .. .. ..$ : num 2452 - .. .. .. .. ..$ : num 9172 - .. .. .. .. ..$ : num 7787 - .. .. .. .. ..$ : num 21850 - .. .. .. .. ..$ : num 22723 - .. .. .. .. ..$ : num 18656 - .. .. .. .. ..$ : num 2550 - .. .. .. .. ..$ : num 16304 - .. .. .. .. ..$ : num 11617 - .. .. .. .. ..$ : num 11222 - .. .. .. .. ..$ : num 17109 - .. .. .. .. ..$ : num 943 - .. .. .. .. ..$ : num 11507 - .. .. .. .. ..$ : num 17008 - .. .. .. .. ..$ : num 4356 - .. .. .. .. ..$ : num 12416 - .. .. .. .. ..$ : num 17778 - .. .. .. .. ..$ : num 21505 - .. .. .. .. ..$ : num 12660 - .. .. .. .. ..$ : num 15763 - .. .. .. .. ..$ : num 2816 - .. .. .. .. ..$ : num 9205 - .. .. .. .. ..$ : num 14180 - .. .. .. .. ..$ : num 12936 - .. .. .. .. ..$ : num 14374 - .. .. .. .. ..$ : num 515 - .. .. .. .. ..$ : num 22761 - .. .. .. .. ..$ : num 101 - .. .. .. .. ..$ : num 5346 - .. .. .. .. ..$ : num 10347 - .. .. .. .. ..$ : num 9543 - .. .. .. .. ..$ : num 18075 - .. .. .. .. ..$ : num 4170 - .. .. .. .. ..$ : num 15314 - .. .. .. .. ..$ : num 5895 - .. .. .. .. ..$ : num 5609 - .. .. .. .. ..$ : num 20001 - .. .. .. .. ..$ : num 10971 - .. .. .. .. ..$ : num 14 - .. .. .. .. ..$ : num 13667 - .. .. .. .. ..$ : num 15340 - .. .. .. .. ..$ : num 21827 - .. .. .. .. ..$ : num 986 - .. .. .. .. ..$ : num 15988 - .. .. .. .. ..$ : num 7808 - .. .. .. .. ..$ : num 8272 - .. .. .. .. ..$ : num 14230 - .. .. .. .. ..$ : num 14191 - .. .. .. .. ..$ : num 15718 - .. .. .. .. ..$ : num 15285 - .. .. .. .. ..$ : num 2342 - .. .. .. .. ..$ : num 147 - .. .. .. .. ..$ : num 7138 - .. .. .. .. ..$ : num 22071 - .. .. .. .. ..$ : num 16482 - .. .. .. .. ..$ : num 2344 - .. .. .. .. ..$ : num 2448 - .. .. .. .. ..$ : num 89 - .. .. .. .. ..$ : num 7347 - .. .. .. .. ..$ : num 21556 - .. .. .. .. ..$ : num 4814 - .. .. .. .. ..$ : num 5876 - .. .. .. .. ..$ : num 14394 - .. .. .. .. ..$ : num 15303 - .. .. .. .. ..$ : num 14711 - .. .. .. .. ..$ : num 6780 - .. .. .. .. ..$ : num 8617 - .. .. .. .. ..$ : num 5230 - .. .. .. .. ..$ : num 21062 - .. .. .. .. ..$ : num 7451 - .. .. .. .. ..$ : num 17333 - .. .. .. .. ..$ : num 8100 - .. .. .. .. ..$ : num 9727 - .. .. .. .. ..$ : num 5259 - .. .. .. .. ..$ : num 18307 - .. .. .. .. .. [list output truncated] - .. .. ..$ :List of 4 - .. .. .. ..$ key : chr "Group2-WT124" - .. .. .. ..$ name : chr "WT124" - .. .. .. ..$ color : chr "#bbcc33" - .. .. .. ..$ cellIds:List of 7599 - .. .. .. .. ..$ : num 19675 - .. .. .. .. ..$ : num 19816 - .. .. .. .. ..$ : num 13514 - .. .. .. .. ..$ : num 5567 - .. .. .. .. ..$ : num 11107 - .. .. .. .. ..$ : num 7686 - .. .. .. .. ..$ : num 14536 - .. .. .. .. ..$ : num 13233 - .. .. .. .. ..$ : num 22117 - .. .. .. .. ..$ : num 11047 - .. .. .. .. ..$ : num 18659 - .. .. .. .. ..$ : num 5431 - .. .. .. .. ..$ : num 7261 - .. .. .. .. ..$ : num 4842 - .. .. .. .. ..$ : num 15338 - .. .. .. .. ..$ : num 22236 - .. .. .. .. ..$ : num 16656 - .. .. .. .. ..$ : num 22490 - .. .. .. .. ..$ : num 15771 - .. .. .. .. ..$ : num 11213 - .. .. .. .. ..$ : num 637 - .. .. .. .. ..$ : num 19499 - .. .. .. .. ..$ : num 10179 - .. .. .. .. ..$ : num 2214 - .. .. .. .. ..$ : num 11253 - .. .. .. .. ..$ : num 17981 - .. .. .. .. ..$ : num 20460 - .. .. .. .. ..$ : num 6484 - .. .. .. .. ..$ : num 19949 - .. .. .. .. ..$ : num 5844 - .. .. .. .. ..$ : num 7571 - .. .. .. .. ..$ : num 14101 - .. .. .. .. ..$ : num 8366 - .. .. .. .. ..$ : num 12469 - .. .. .. .. ..$ : num 4201 - .. .. .. .. ..$ : num 14467 - .. .. .. .. ..$ : num 2022 - .. .. .. .. ..$ : num 11335 - .. .. .. .. ..$ : num 21767 - .. .. .. .. ..$ : num 18419 - .. .. .. .. ..$ : num 1660 - .. .. .. .. ..$ : num 5703 - .. .. .. .. ..$ : num 11390 - .. .. .. .. ..$ : num 2176 - .. .. .. .. ..$ : num 11306 - .. .. .. .. ..$ : num 13776 - .. .. .. .. ..$ : num 15414 - .. .. .. .. ..$ : num 4464 - .. .. .. .. ..$ : num 6026 - .. .. .. .. ..$ : num 16109 - .. .. .. .. ..$ : num 15928 - .. .. .. .. ..$ : num 3099 - .. .. .. .. ..$ : num 19729 - .. .. .. .. ..$ : num 21812 - .. .. .. .. ..$ : num 10721 - .. .. .. .. ..$ : num 18785 - .. .. .. .. ..$ : num 3247 - .. .. .. .. ..$ : num 5797 - .. .. .. .. ..$ : num 19780 - .. .. .. .. ..$ : num 212 - .. .. .. .. ..$ : num 7240 - .. .. .. .. ..$ : num 20631 - .. .. .. .. ..$ : num 1072 - .. .. .. .. ..$ : num 3391 - .. .. .. .. ..$ : num 18 - .. .. .. .. ..$ : num 2884 - .. .. .. .. ..$ : num 2987 - .. .. .. .. ..$ : num 8398 - .. .. .. .. ..$ : num 9595 - .. .. .. .. ..$ : num 19074 - .. .. .. .. ..$ : num 11837 - .. .. .. .. ..$ : num 4200 - .. .. .. .. ..$ : num 7868 - .. .. .. .. ..$ : num 16188 - .. .. .. .. ..$ : num 5082 - .. .. .. .. ..$ : num 3828 - .. .. .. .. ..$ : num 5216 - .. .. .. .. ..$ : num 129 - .. .. .. .. ..$ : num 6105 - .. .. .. .. ..$ : num 19394 - .. .. .. .. ..$ : num 15759 - .. .. .. .. ..$ : num 2284 - .. .. .. .. ..$ : num 22263 - .. .. .. .. ..$ : num 2206 - .. .. .. .. ..$ : num 16298 - .. .. .. .. ..$ : num 17306 - .. .. .. .. ..$ : num 20330 - .. .. .. .. ..$ : num 5932 - .. .. .. .. ..$ : num 1358 - .. .. .. .. ..$ : num 15223 - .. .. .. .. ..$ : num 9400 - .. .. .. .. ..$ : num 11626 - .. .. .. .. ..$ : num 2436 - .. .. .. .. ..$ : num 14848 - .. .. .. .. ..$ : num 7314 - .. .. .. .. ..$ : num 19944 - .. .. .. .. ..$ : num 13972 - .. .. .. .. ..$ : num 3775 - .. .. .. .. ..$ : num 22506 - .. .. .. .. .. [list output truncated] + .. .. .. ..$ color : chr "#99ddff" + .. .. .. ..$ cellIds: int [1:80] 0 1 2 3 4 5 6 7 8 9 ... .. ..$ type : chr "metadataCategorical" diff --git a/pipeline-runner/tests/testthat/qc_mock.R b/pipeline-runner/tests/testthat/qc_mock.R index b33cfe56..b4f033e4 100644 --- a/pipeline-runner/tests/testthat/qc_mock.R +++ b/pipeline-runner/tests/testthat/qc_mock.R @@ -5,6 +5,5 @@ #' #' @examples mock_ids <- function() { - # TODO: parametrize sample ids return(list("123abc" = 0:39, "123def" = 40:79)) } diff --git a/pipeline-runner/tests/testthat/test-gem2s-6-construct_qc_config.R b/pipeline-runner/tests/testthat/test-gem2s-6-construct_qc_config.R index 39d021c3..c497978e 100644 --- a/pipeline-runner/tests/testthat/test-gem2s-6-construct_qc_config.R +++ b/pipeline-runner/tests/testthat/test-gem2s-6-construct_qc_config.R @@ -23,7 +23,7 @@ mock_scdata <- function() { test_that("cellsize filter is disabled by default and classifier if pre-filtered", { scdata <- mock_scdata() - qc_config <- construct_qc_config(list(scdata), any_filtered = TRUE) + qc_config <- construct_qc_config(scdata, any_filtered = TRUE) expect_false(qc_config$classifier$enabled) expect_true(qc_config$classifier$prefiltered) @@ -34,7 +34,7 @@ test_that("cellsize filter is disabled by default and classifier if pre-filtered test_that("cellsize filter is disabled by default and classifier if not pre-filtered", { scdata <- mock_scdata() - qc_config <- construct_qc_config(list(scdata), any_filtered = FALSE) + qc_config <- construct_qc_config(scdata, any_filtered = FALSE) expect_false(qc_config$cellSizeDistribution$enabled) expect_true(qc_config$classifier$enabled) }) diff --git a/pipeline-runner/tests/testthat/test-gem2s-6-prepare_experiment.R b/pipeline-runner/tests/testthat/test-gem2s-6-prepare_experiment.R index 6ffa74c0..057c076e 100644 --- a/pipeline-runner/tests/testthat/test-gem2s-6-prepare_experiment.R +++ b/pipeline-runner/tests/testthat/test-gem2s-6-prepare_experiment.R @@ -49,166 +49,129 @@ mock_prev_out <- function(samples = "sample_a", counts = NULL) { } -test_that("prepare_experiment ensures gene_annotations are indexed correctly for each sample", { - samples <- c("a", "b", "c") - prev_out <- mock_prev_out(samples = samples) +test_that("prepare_experiment merges multiple SeuratObjects", { + prev_out <- mock_prev_out(samples = c("a", "b", "c")) + scdata_list <- prev_out$scdata_list - # remove some genes from each sample - prev_out$counts_list$a <- prev_out$counts_list$a[-c(1:9), ] - prev_out$counts_list$b <- prev_out$counts_list$b[-c(21:30), ] - prev_out$counts_list$c <- prev_out$counts_list$c[-c(5:25), ] + task_out <- suppressWarnings(prepare_experiment(NULL, NULL, prev_out)$output) - # re-create seurat object - prev_out <- create_seurat(NULL, NULL, prev_out)$output - scdata_list <- prepare_experiment(NULL, NULL, prev_out)$output$scdata + scdata <- task_out$scdata - # we expect that the input in gene_annotations is the same as the rownames of - # each sample seurat object - for (sample in samples) { - expect_equal(scdata_list[[sample]]@misc$gene_annotations$input, rownames(scdata_list[[sample]])) - } + expect_equal(ncol(scdata), sum(sapply(scdata_list, ncol))) +}) + +test_that("prepare_experiment shuffles cells after merge", { + prev_out <- mock_prev_out(samples = c("a", "b", "c")) + scdata_list <- prev_out$scdata_list + + task_out <- suppressWarnings(prepare_experiment(NULL, NULL, prev_out)$output) + + scdata <- task_out$scdata + merged_scdatas <- suppressWarnings(merge_scdatas(scdata_list)) + set.seed(gem2s$random.seed) + shuffle_mask <- sample(colnames(merged_scdatas)) + + expect_equal(ncol(scdata), ncol(merged_scdatas)) + expect_equal(merged_scdatas$samples[shuffle_mask],scdata$samples) + expect_true(all(shuffle_mask == colnames(scdata))) + expect_false(all(shuffle_mask == colnames(merged_scdatas))) + expect_true(all(merged_scdatas$samples[1:ncol(scdata_list[[1]])]=="a")) + expect_false(all(scdata$samples[1:ncol(scdata_list[[1]])]=="a")) }) -test_that("prepare_experiment adds 0 indexed cell_ids to each sample in scdata_list", { - # TODO make this test test add_metadata_to_samples instead of prepare experiment - samples <- c("a", "b", "c") - prev_out <- mock_prev_out(samples = samples) - input <- list(experimentId = "1234") +test_that("prepare_experiment ensures gene_annotations are indexed the same as scdata", { + prev_out <- mock_prev_out() - scdata_list <- prepare_experiment(input, NULL, prev_out)$output$scdata + # shuffle gene order of annot + annot <- prev_out$annot + prev_out$annot <- annot[sample(nrow(annot)), ] - added_to_misc <- c("gene_annotations", "experimentId") + scdata <- prepare_experiment(NULL, NULL, prev_out)$output$scdata - for (sample in samples) { - expect_true(all(added_to_misc %in% names(scdata_list[[sample]]@misc))) - } + expect_equal(row.names(scdata), scdata@misc$gene_annotations$input) +}) + +test_that("prepare_experiment adds 0 indexed cell_ids and other metadata to scdata", { + prev_out <- mock_prev_out() + input <- list(experimentId = "1234") + scdata <- prepare_experiment(input, NULL, prev_out)$output$scdata - # list of added cell_ids per sample in scdata_list - added_ids <- purrr::map(scdata_list, ~unname(.$cells_id)) + added_to_misc <- c("gene_annotations", "color_pool", "experimentId", "ingestionDate") + expect_true(all(added_to_misc %in% names(scdata@misc))) - set.seed(RANDOM_SEED) - total_cells <- sum(sapply(scdata_list, ncol)) - cell_ids <- 0:total_cells-1 - start <- 0 - expected_ids <- list() - for (sample in samples) { - sample_size <- ncol(scdata_list[[sample]]) - idxs <- sample(seq_along(cell_ids), sample_size) - expected_ids[[sample]] <- cell_ids[idxs] - # remove the selected cell ids for next samples - cell_ids <- cell_ids[-idxs] - } + added_ids <- unname(scdata$cells_id) + expected_ids <- seq(0, ncol(scdata) - 1) expect_equal(added_ids, expected_ids) }) - test_that("prepare_experiment generates qc_config that matches snapshot", { prev_out <- mock_prev_out() input <- list(experimentId = "1234") task_out <- prepare_experiment(input, NULL, prev_out)$output - # TODO fix snapshot test expect_snapshot(str(task_out$qc_config)) }) - -test_that("prepare_experiment creates a list of valid Seurat objects", { - - samples <- c("a", "b", "c") - prev_out <- mock_prev_out(samples ) +test_object <- function() { + prev_out <- mock_prev_out(samples = c("a", "b", "c")) scdata_list <- prev_out$scdata_list task_out <- prepare_experiment(NULL, NULL, prev_out)$output - scdata_list <- task_out$scdata_list + scdata <- task_out$scdata - expect_type(scdata_list, "list") - for (sample in samples) { - scdata <- scdata_list[[sample]] + test_that("prepare_experiment creates a valid Seurat object", { expect_type(scdata, "S4") expect_true(nrow(scdata) == 100, "Seurat") expect_true(scdata@active.assay == "RNA") - } -}) - - -test_that("prepare_experiment properly populates the misc slot", { - - samples <- c("a", "b", "c") - prev_out <- mock_prev_out(samples ) - scdata_list <- prev_out$scdata_list - - task_out <- prepare_experiment(NULL, NULL, prev_out)$output - scdata_list <- task_out$scdata_list - - for (sample in samples) { + }) - scdata <- scdata_list[[sample]] + test_that("prepare_experiment properly populates the misc slot", { expect_type(scdata@misc, "list") misc <- scdata@misc expect_true("gene_annotations" %in% names(misc)) + expect_true("color_pool" %in% names(misc)) + expect_type(misc$color_pool, "character") expect_true(all(misc$gene_annotations$input == rownames(scdata))) expect_equal(sum(duplicated(misc$gene_annotations$name)), 0) # check that in duplicated positions (including the first) we have the gene id instead of the name. - } + }) -}) - - -test_that("prepare_experiment properly populates the metadata slot", { - samples <- c("a", "b", "c") - prev_out <- mock_prev_out(samples) - scdata_list <- prev_out$scdata_list - - task_out <- prepare_experiment(NULL, NULL, prev_out)$output - scdata_list <- task_out$scdata_list - - for (sample in samples) { - scdata <- scdata_list[[sample]] - - metadata <- scdata@meta.data + test_that("prepare_experiment properly populates the metadata slot", { + md <- scdata@meta.data annotations <- scdata@misc$gene_annotations - expect_type(metadata, "list") - expect_true(nrow(metadata) == ncol(scdata)) - expect_true("barcode" %in% colnames(metadata)) - expect_true(all(colnames(scdata) %in% rownames(metadata))) - expect_true("samples" %in% colnames(metadata)) + expect_type(md, "list") + expect_true(nrow(md) == ncol(scdata)) + expect_true("barcode" %in% colnames(md)) + expect_true(all(colnames(scdata) %in% rownames(md))) + expect_true("samples" %in% colnames(md)) if (any(grepl("^mt-", annotations$name, ignore.case = T))) { - expect_true("percent.mt" %in% colnames(metadata)) + expect_true("percent.mt" %in% colnames(md)) } - expect_true("doublet_scores" %in% colnames(metadata)) - expect_true("cells_id" %in% colnames(metadata)) - expect_true("samples" %in% colnames(metadata)) - - } - -}) - - - -test_that("Mitochondrial percentage is correct", { - samples <- c("a", "b", "c") - prev_out <- mock_prev_out(samples) - scdata_list <- prev_out$scdata_list - - task_out <- prepare_experiment(NULL, NULL, prev_out)$output - scdata_list <- task_out$scdata_list - - for (sample in samples) { - scdata <- scdata_list[[sample]] - - metadata <- scdata@meta.data - - - expect_true(max(metadata$percent.mt) <= 100) - expect_true(min(metadata$percent.mt) >= 0) - #Verify that we have percent mt and not fraction - expect_true(max(metadata$percent.mt) > 1 || all(metadata$percent.mt == 0)) + expect_true("doublet_scores" %in% colnames(md)) + expect_true("cells_id" %in% colnames(md)) + expect_true("samples" %in% colnames(md)) + + test_that("Cell ids are assigned correctly", { + cellNumber <- ncol(scdata@assays$RNA@data) + expect_equal(md$cells_id, 0:(cellNumber - 1)) + }) + + test_that("Mitochondrial percentage is correct", { + expect_true(max(md$percent.mt) <= 100) + expect_true(min(md$percent.mt) >= 0) + #Verify that we have percent mt and not fraction + expect_true(max(md$percent.mt) > 1 || all(md$percent.mt == 0)) + }) + }) +} - } +test_that("prepare_experiment creates a valid Seurat object", { + expect_true(TRUE) + suppressWarnings(test_object()) }) diff --git a/pipeline-runner/tests/testthat/test-gem2s-7-upload_to_aws.R b/pipeline-runner/tests/testthat/test-gem2s-7-upload_to_aws.R index 9af76d18..b0554952 100644 --- a/pipeline-runner/tests/testthat/test-gem2s-7-upload_to_aws.R +++ b/pipeline-runner/tests/testthat/test-gem2s-7-upload_to_aws.R @@ -1,188 +1,168 @@ -set.seed(1) - -mock_doublet_scores <- function(counts) { - doublet_scores <- runif(ncol(counts)) - doublet_class <- ifelse(doublet_scores < 0.8, "singlet", "doublet") - - data.frame( - row.names = colnames(counts), - barcodes = colnames(counts), - doublet_class = doublet_class, - doublet_scores = doublet_scores - ) -} - -mock_scdata_list = function (config) { - prev_out <- mock_prev_out(config) - scdata_list <- prev_out$scdata_list - - task_out <- prepare_experiment(NULL, NULL, prev_out)$output - scdata_list <- task_out$scdata_list -} - -mock_input <- function(metadata = NULL) { - input <- list( - name = "project name", - sampleNames = list("a", "b", "c"), - sampleIds = list("123abc", "123def", "123ghi"), +mock_config <- function(metadata = NULL) { + config <- list( + sampleNames = list("WT1", "WT2"), + sampleIds = list("123abc", "123def"), metadata = metadata ) - return(input) + return(config) } -mock_config <- function(input) { - config <- list( - name = input$name, - samples = input$sampleIds, - metadata = input$metadata + +mock_scdata <- function(config) { + pbmc_raw <- read.table( + file = system.file("extdata", "pbmc_raw.txt", package = "Seurat"), + as.is = TRUE ) - return (config) -} + # construct metadata + # add samples + samples <- unlist(config$sampleIds) -mock_prev_out <- function(config, counts = NULL) { - samples <- config$samples + rest <- config$metadata + keys <- c("samples", names(rest)) + metadata <- data.frame(row.names = colnames(pbmc_raw), samples = rep(samples, each = 40)) - if (is.null(counts)) { - counts <- DropletUtils:::simCounts() - colnames(counts) <- paste0("cell", seq_len(ncol(counts))) + # Add "metadata" if exists in config + if (!is.null(rest)) { + rest <- lapply(rest, unlist) + rest <- data.frame(rest, row.names = samples, check.names = FALSE) + metadata[names(rest)] <- rest[metadata$samples, ] } - eout <- DropletUtils::emptyDrops(counts) - - counts_list <- list() - edrops <- list() - doublet_scores <- list() + # make syntactically valid + colnames(metadata) <- make.names(colnames(metadata), unique = TRUE) - for (sampleId in samples) { - counts_list[[sampleId]] <- counts - edrops[[sampleId]] <- eout - doublet_scores[[sampleId]] <- mock_doublet_scores(counts) - } + scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw, meta.data = metadata) - # as passed to create_seurat - prev_out <- list( - counts_list = counts_list, - edrops = edrops, - doublet_scores = doublet_scores, - annot = data.frame(name = row.names(counts), input = row.names(counts)), - config = config - ) - # call create_seurat to get prev_out to pass to prepare_experiment - return(create_seurat(NULL, NULL, prev_out)$output) + scdata$cells_id <- seq(0, ncol(scdata) - 1) + return(scdata) } -check_metadata_cell_ids <- function(metadata_key, metadata_value, sample_keys, cell_sets) { - keys <- sapply(cell_sets$cellSets, `[[`, "key") - metadata_set <- cell_sets$cellSets[[which(keys == metadata_key)]] - metadata_name <- sapply(metadata_set$children, `[[`, "name") +test_that("get_cell_sets creates scratchpad and sample sets if no metadata", { + config <- mock_config() + scdata <- mock_scdata(config) - sample_set <- cell_sets$cellSets[[which(keys == "sample")]] - sample_names <- sapply(sample_set$children, `[[`, "name") + cell_sets <- get_cell_sets(scdata, config) + keys <- sapply(cell_sets$cellSets, `[[`, "key") - metadata_cell_ids <- unlist(metadata_set$children[[which(metadata_name == metadata_value)]]$cellId) + expect_setequal(keys, c("scratchpad", "sample")) +}) - sample_cell_sets = purrr::keep(sample_set$children, \(x) x[["key"]] %in% sample_keys) - sample_cell_ids = unlist(lapply(sample_cell_sets, `[[`, "cellIds")) - expect_equal(metadata_cell_ids, sample_cell_ids) -} +test_that("get_cell_sets adds correct cell ids for each sample", { + config <- mock_config() + scdata <- mock_scdata(config) -test_that("get_cell_sets creates scratchpad and sample sets if no metadata", { - input <- mock_input() - config <- mock_config(input) + cell_sets <- get_cell_sets(scdata, config) + sets_key <- sapply(cell_sets$cellSets, `[[`, "key") - scdata_list <- mock_scdata_list(config) + sample_sets <- cell_sets$cellSets[[which(sets_key == "sample")]] + samples_key <- sapply(sample_sets$children, `[[`, "key") - cell_sets <- get_cell_sets(scdata_list, input) - keys <- sapply(cell_sets$cellSets, `[[`, "key") + for (sample_id in config$sampleIds) { + sample_cells <- sample_sets$children[[which(samples_key == sample_id)]]$cellIds + expected_cells <- unname(scdata$cells_id)[scdata$samples == sample_id] - expect_setequal(keys, c("scratchpad", "sample")) + expect_equal(sample_cells, expected_cells) + } }) test_that("get_cell_sets adds correct cell ids for each sample", { - input <- mock_input() - config <- mock_config(input) - scdata_list <- mock_scdata_list(config) + config <- mock_config() + scdata <- mock_scdata(config) - cell_sets <- get_cell_sets(scdata_list, input) + cell_sets <- get_cell_sets(scdata, config) sets_key <- sapply(cell_sets$cellSets, `[[`, "key") sample_sets <- cell_sets$cellSets[[which(sets_key == "sample")]] samples_key <- sapply(sample_sets$children, `[[`, "key") - for (sample_id in config$samples) { + # ids are correct for each child + for (sample_id in config$sampleIds) { sample_cells <- sample_sets$children[[which(samples_key == sample_id)]]$cellIds - expected_cells <- unname(scdata_list[[sample_id]]$cells_id) + expected_cells <- unname(scdata$cells_id)[scdata$samples == sample_id] expect_equal(sample_cells, expected_cells) } }) + test_that("get_cell_sets adds a single metadata column", { - metadata <- list(Group = list("Hello", "WT2", "WT2")) - input <- mock_input(metadata) - config <- mock_config(input) - scdata_list <- mock_scdata_list(config) + metadata <- list(Group = list("Hello", "WT2")) + config <- mock_config(metadata) + scdata <- mock_scdata(config) - cell_sets <- get_cell_sets(scdata_list, input) + cell_sets <- get_cell_sets(scdata, config) + # have it as a key keys <- sapply(cell_sets$cellSets, `[[`, "key") - # has sample key as one of the keys expect_setequal(keys, c("scratchpad", "sample", "Group")) - # Check that each sample/metadata intersection contains the correct cell ids - check_metadata_cell_ids("Group", "WT2", c("123def", "123ghi"), cell_sets) - check_metadata_cell_ids("Group", "Hello", c("123abc"), cell_sets) + group_set <- cell_sets$cellSets[[which(keys == "Group")]] + group_names <- sapply(group_set$children, `[[`, "name") + + + # cell ids are correct for each child + for (group_name in group_names) { + group_cells <- group_set$children[[which(group_names == group_name)]]$cellIds + expected_cells <- unname(scdata$cells_id)[scdata$Group == group_name] + + expect_equal(group_cells, expected_cells) + } }) + test_that("get_cell_sets uses user-supplied syntactically invalid metadata column names", { - metadata <- list("TRUE" = list("Hello", "WT2", "WT2")) - input <- mock_input(metadata) - config <- mock_config(input) - scdata_list <- mock_scdata_list(config) + metadata <- list("TRUE" = list("Hello", "WT2")) + config <- mock_config(metadata) + scdata <- mock_scdata(config) - cell_sets <- get_cell_sets(scdata_list, input) + cell_sets <- get_cell_sets(scdata, config) - # has sample key as one of the keys + # have TRUE as a key keys <- sapply(cell_sets$cellSets, `[[`, "key") - expect_setequal(keys, c("scratchpad", "sample", "TRUE")) + expect_true("TRUE" %in% keys) - check_metadata_cell_ids("TRUE", "WT2", c("123def", "123ghi"), cell_sets) - check_metadata_cell_ids("TRUE", "Hello", c("123abc"), cell_sets) + group_set <- cell_sets$cellSets[[which(keys == "TRUE")]] + group_names <- sapply(group_set$children, `[[`, "name") + + # cell ids are correct for each child + for (group_name in group_names) { + group_cells <- group_set$children[[which(group_names == group_name)]]$cellIds + expected_cells <- unname(scdata$cells_id)[scdata$TRUE. == group_name] + + expect_equal(group_cells, expected_cells) + } }) test_that("get_cell_sets adds two metadata columns", { - metadata <- list(Group1 = list("Hello", "WT2", "WT2"), Group2 = list("WT", "WT", "WTA")) - input <- mock_input(metadata) - config <- mock_config(input) - scdata_list <- mock_scdata_list(config) + metadata <- list(Group1 = list("Hello", "WT2"), Group2 = list("WT", "WT")) + config <- mock_config(metadata) + scdata <- mock_scdata(config) - cell_sets <- get_cell_sets(scdata_list, input) + cell_sets <- get_cell_sets(scdata, config) # have as keys keys <- sapply(cell_sets$cellSets, `[[`, "key") expect_setequal(keys, c("scratchpad", "sample", "Group1", "Group2")) - check_metadata_cell_ids("Group1", "Hello", c("123abc"), cell_sets) - check_metadata_cell_ids("Group1", "WT2", c("123def", "123ghi"), cell_sets) - - check_metadata_cell_ids("Group2", "WT", c("123abc", "123def"), cell_sets) - check_metadata_cell_ids("Group2", "WTA", c( "123ghi"), cell_sets) + # check that Group2 has all cells + group2_set <- cell_sets$cellSets[[which(keys == "Group2")]] + group2_cells <- group2_set$children[[1]]$cellIds + expect_equal(group2_cells, unname(scdata$cells_id)) }) test_that("get_cell_sets uses unique colors for each cell set", { - metadata <- list(Group1 = list("Hello", "WT2", "WT2"), Group2 = list("WT", "WT", "WTA")) - input <- mock_input(metadata) - config <- mock_config(input) - scdata_list <- mock_scdata_list(config) + metadata <- list(Group1 = list("Hello", "WT2"), Group2 = list("WT", "WT")) + config <- mock_config(metadata) + scdata <- mock_scdata(config) - cell_sets <- get_cell_sets(scdata_list, input) + cell_sets <- get_cell_sets(scdata, config) flat_cell_sets <- unlist(cell_sets) colors <- flat_cell_sets[grepl("[.]color", names(flat_cell_sets))] @@ -193,22 +173,22 @@ test_that("get_cell_sets uses unique colors for each cell set", { test_that("get_cell_sets without metadata matches snapshot", { - input <- mock_input() - config <- mock_config(input) - scdata_list <- mock_scdata_list(config) + config <- mock_config() + scdata <- mock_scdata(config) - cell_sets <- get_cell_sets(scdata_list, input) + cell_sets <- get_cell_sets(scdata, config) expect_snapshot(str(cell_sets)) }) test_that("get_cell_sets with two metadata groups matches snapshot", { - metadata <- list(Group1 = list("Hello", "WT2", "WT2"), Group2 = list("WT", "WT", "WT124")) - input <- mock_input(metadata) - config <- mock_config(input) - scdata_list <- mock_scdata_list(config) + metadata <- list(Group1 = list("Hello", "WT2"), Group2 = list("WT", "WT")) + + config <- mock_config(metadata) + scdata <- mock_scdata(config) - cell_sets <- get_cell_sets(scdata_list, input) + scdata@misc <- list(metadata_lookups = c(Group1 = "Group1", Group2 = "Group2")) + cell_sets <- get_cell_sets(scdata, config) expect_snapshot(str(cell_sets)) }) diff --git a/pipeline-runner/tests/testthat/test-qc-1-filter_emptydrops.R b/pipeline-runner/tests/testthat/test-qc-1-filter_emptydrops.R index 7224bd57..46bb29ec 100644 --- a/pipeline-runner/tests/testthat/test-qc-1-filter_emptydrops.R +++ b/pipeline-runner/tests/testthat/test-qc-1-filter_emptydrops.R @@ -12,106 +12,89 @@ mock_config <- function() { return(config) } - -mock_scdata_list <- function() { +mock_scdata <- function() { pbmc_raw <- read.table( file = system.file("extdata", "pbmc_raw.txt", package = "Seurat"), as.is = TRUE ) - sample_ids <- c("sample_1", "sample_2") - scdata_list <- {} - for (i in seq_along(sample_ids)) { - message(i) - scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) - n_cells <- ncol(scdata) - start_idx <- (i-1)*n_cells - end_idx <- ((i)*n_cells) - 1 - - scdata$cells_id <- start_idx:end_idx - - # add empty drops stuff - scdata$emptyDrops_FDR <- NA - scdata$emptyDrops_FDR[0:70] <- 0.009 - - # add samples - scdata$samples <- rep(sample_ids[i], each = n_cells) - # scdata_list <- Seurat::RenameCells(scdata, paste(scdata$samples, colnames(scdata), sep = "")) - scdata_list[[sample_ids[i]]] <- scdata - } - return(scdata_list) + scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) + scdata$cells_id <- 0:(ncol(scdata) - 1) + + # add empty drops stuff + scdata$emptyDrops_FDR <- NA + scdata$emptyDrops_FDR[1:70] <- 0.009 + + # add samples + scdata$samples <- rep(c("123abc", "123def"), each = 40) + scdata <- Seurat::RenameCells(scdata, paste(scdata$samples, colnames(scdata), sep = "")) + return(scdata) } test_that("Initial cells_id are correct", { - scdata_list <- mock_scdata_list() - cells_id <- generate_first_step_ids(scdata_list) - expect_equal(unique(cells_id[["sample_1"]]), 0:79) - expect_equal(unique(cells_id[["sample_2"]]), 80:159) + scdata <- mock_scdata() + cells_id <- generate_first_step_ids(scdata) + expect_equal(unique(cells_id$`123abc`), 0:39) + expect_equal(unique(cells_id$`123def`), 40:79) }) test_that("filter_emptydrops removes NA with threshold < 1", { - scdata_list <- mock_scdata_list() + scdata <- mock_scdata() config <- mock_config() - cells_id <- generate_first_step_ids(scdata_list) + cells_id <- mock_ids() - - for (sample_id in c("sample_1", "sample_2")){ - out <- filter_emptydrops(scdata_list, config, sample_id, cells_id) - expect_equal(ncol(out$data[[sample_id]]), 80) - expect_equal(length(out$new_ids[[sample_id]]), 70) - } + out <- filter_emptydrops(scdata, config, "123def", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 40) + expect_equal(length(out$new_ids$`123def`), 30) }) test_that("filter_emptydrops is sample aware", { - scdata_list <- mock_scdata_list() + scdata <- mock_scdata() config <- mock_config() - cells_id <- generate_first_step_ids(scdata_list) - - # NA (empty) drops are in sample_2 only - scdata_list[["sample_1"]]$emptyDrops_FDR <- 0.009 - out <- filter_emptydrops(scdata_list, config, "sample_2", cells_id) - expect_equal(ncol(out$data[["sample_1"]]), 80) - expect_equal(length(out$new_ids[["sample_1"]]), 80) - expect_equal(length(out$new_ids[["sample_2"]]), 70) + cells_id <- mock_ids() + + # NA (empty) drops are in 123def only + out <- filter_emptydrops(scdata, config, "123abc", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 40) + expect_equal(length(out$new_ids$`123def`), 40) }) test_that("if FDR=1 filter_emptydrops keeps everything", { - scdata_list <- mock_scdata_list() + scdata <- mock_scdata() config <- mock_config() config$filterSettings$FDR <- 1 - cells_id <- generate_first_step_ids(scdata_list) + cells_id <- mock_ids() - out <- filter_emptydrops(scdata_list, config, "sample_2", cells_id) - expect_equal(ncol(out$data[["sample_2"]]), 80) - expect_equal(length(out$new_ids[["sample_1"]]), 80) - expect_equal(length(out$new_ids[["sample_2"]]), 80) + out <- filter_emptydrops(scdata, config, "123def", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 40) + expect_equal(length(out$new_ids$`123def`), 40) }) test_that("filter_emptydrops can be disabled", { - scdata_list <- mock_scdata_list() + scdata <- mock_scdata() config <- mock_config() config$enabled <- FALSE - cells_id <- generate_first_step_ids(scdata_list) + cells_id <- mock_ids() - out <- filter_emptydrops(scdata_list, config, "sample_2", cells_id) - expect_equal(ncol(out$data[["sample_2"]]), 80) - expect_equal(length(out$new_ids[["sample_1"]]), 80) - expect_equal(length(out$new_ids[["sample_2"]]), 80) + out <- filter_emptydrops(scdata, config, "123def", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 40) + expect_equal(length(out$new_ids$`123def`), 40) }) test_that("filter_emptydrops handles missing emptyDrops_FDR", { - scdata_list <- mock_scdata_list() + scdata <- mock_scdata() config <- mock_config() - cells_id <- generate_first_step_ids(scdata_list) - - # remove emptyDrops_FDR field - scdata_list[["sample_1"]]$emptyDrops_FDR <- NULL - scdata_list[["sample_2"]]$emptyDrops_FDR <- NULL + scdata$emptyDrops_FDR <- NULL + cells_id <- mock_ids() - out <- filter_emptydrops(scdata_list, config, "sample_2", cells_id) - expect_equal(ncol(out$data[["sample_2"]]), 80) - expect_equal(length(out$new_ids[["sample_1"]]), 80) - expect_equal(length(out$new_ids[["sample_2"]]), 80) + out <- filter_emptydrops(scdata, config, "123def", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 40) + expect_equal(length(out$new_ids$`123def`), 40) }) diff --git a/pipeline-runner/tests/testthat/test-qc-2-filter_low_cellsize.R b/pipeline-runner/tests/testthat/test-qc-2-filter_low_cellsize.R index 861d98ca..0e409f5e 100644 --- a/pipeline-runner/tests/testthat/test-qc-2-filter_low_cellsize.R +++ b/pipeline-runner/tests/testthat/test-qc-2-filter_low_cellsize.R @@ -15,129 +15,120 @@ mock_scdata <- function() { as.is = TRUE ) - sample_1_id <- "123abc" - sample_2_id <- "123def" - scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) scdata$cells_id <- 0:(ncol(scdata) - 1) # add samples - scdata$samples <- rep(c(sample_1_id, sample_2_id), each = 40) + scdata$samples <- rep(c("123abc", "123def"), each = 40) scdata <- Seurat::RenameCells(scdata, paste(scdata$samples, colnames(scdata), sep = "")) - scdata$emptyDrops_FDR <- NA - - scdata_sample1 <- subset(scdata, samples == sample_1_id) - scdata_sample2 <- subset(scdata, samples == sample_2_id) - scdata_list <- list(scdata_sample1, scdata_sample2) - names(scdata_list) <- c(sample_1_id, sample_2_id) - - return(list(scdata_list, sample_1_id, sample_2_id)) + scdata$emptyDrops_FDR <- NA + return(scdata) } test_that("filter_low_cellsize removes cells", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() config <- mock_config(mcs = 10000) cells_id <- mock_ids() - out <- filter_low_cellsize(scdata_list, config, sample_1_id, cells_id) + out <- filter_low_cellsize(scdata, config, "123abc", cells_id) - expect_equal(ncol(out$data[[sample_1_id]]), ncol(scdata_list[[sample_1_id]])) - expect_lt(length(out$new_ids[[sample_1_id]]), length(cells_id[[sample_1_id]])) - - expect_equal(length(out$new_ids[[sample_2_id]]), length(cells_id[[sample_2_id]])) + expect_equal(ncol(out$data), ncol(scdata)) + expect_lt(length(out$new_ids$`123abc`), length(cells_id$`123abc`)) + expect_equal(length(out$new_ids$`123def`), length(cells_id$`123def`)) }) test_that("filter_low_cellsize filters only appropiate cells", { mcs <- 100 - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() config <- mock_config(mcs) cells_id <- mock_ids() - out <- filter_low_cellsize(scdata_list, config, sample_1_id, cells_id) - + out <- filter_low_cellsize(scdata, config, "123abc", cells_id) - data_sample_1 <- out$data[[sample_1_id]] - - barcode_names_this_sample <- colnames(data_sample_1)[data_sample_1$samples == sample_1_id] - sample_subset <- subset(data_sample_1, cells = barcode_names_this_sample) - sample_subset <- subset_ids(sample_subset, out$new_ids[[sample_1_id]]) + data <- out$data + barcode_names_this_sample <- colnames(data)[data$samples == "123abc"] + sample_subset <- subset(data, cells = barcode_names_this_sample) + sample_subset <- subset_ids(sample_subset, out$new_ids$`123abc`) expect_false(all(sample_subset$nCount_RNA <= mcs)) }) test_that("filter_low_cellsize is sample aware", { - mcs <- 200 - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + mcs <- 10000 + scdata <- mock_scdata() config <- mock_config(mcs) cells_id <- mock_ids() - out <- filter_low_cellsize(scdata_list, config, sample_1_id, cells_id) + out <- filter_low_cellsize(scdata, config, "123abc", cells_id) data <- out$data - barcode_names_this_sample <- colnames(data[[sample_1_id]]) + barcode_names_this_sample <- colnames(data)[data$samples == "123abc"] expect_equal(length(barcode_names_this_sample), 40) - barcode_names_this_sample <- colnames(data[[sample_2_id]]) + barcode_names_this_sample <- colnames(data)[data$samples == "123def"] expect_equal(length(barcode_names_this_sample), 40) - sample_1_new_ids <- out$new_ids[[sample_1_id]] - sample_2_new_ids <- out$new_ids[[sample_2_id]] + expect_lt(length(out$new_ids$`123abc`), 40) + expect_equal(length(out$new_ids$`123def`), 40) + + out <- filter_low_cellsize(scdata, config, "123def", cells_id) + data <- out$data + + barcode_names_this_sample <- colnames(data)[data$samples == "123def"] + expect_equal(length(barcode_names_this_sample), 40) - expect_lt(length(sample_1_new_ids), 40) - expect_equal(length(sample_2_new_ids), 40) + expect_equal(length(out$new_ids$`123abc`), 40) + expect_lt(length(out$new_ids$`123def`), 40) }) test_that("filter_low_cellsize works on empty data/wrong sample", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - config <- mock_config(1000000) + scdata <- mock_scdata() + config <- mock_config(100000) cells_id <- mock_ids() - out <- filter_low_cellsize(scdata_list, config, sample_1_id, cells_id) + out <- filter_low_cellsize(scdata, config, "123abc", cells_id) - out <- filter_low_cellsize(out$data, config, sample_2_id, out$new_ids) + out <- filter_low_cellsize(out$data, config, "123def", out$new_ids) new_ids <- out$new_ids - expect_equal(0, length(new_ids[[sample_1_id]])) - expect_equal(0, length(new_ids[[sample_2_id]])) + expect_equal(0, length(new_ids$`123abc`)) + expect_equal(0, length(new_ids$`123def`)) - out <- filter_low_cellsize(out$data, config, sample_1_id, new_ids) - expect_equal(0, length(new_ids[[sample_1_id]])) - expect_equal(0, length(new_ids[[sample_2_id]])) + out <- filter_low_cellsize(out$data, config, "123abc", new_ids) + expect_equal(0, length(new_ids$`123abc`)) + expect_equal(0, length(new_ids$`123def`)) - # Seurat object wasn't changed - expect_equal(ncol(out$data), 40) + expect_equal(ncol(out$data), 80) }) test_that("filter_low_cellsize works with auto", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() config <- mock_config(auto_settings = TRUE) cells_id <- mock_ids() - out <- filter_low_cellsize(scdata_list, config, sample_2_id, cells_id) + out <- filter_low_cellsize(scdata, config, "123def", cells_id) expect_false(config$filterSettings$minCellSize == out$config$filterSettings$minCellSize) - data <- out$data[[sample_2_id]] - barcode_names_this_sample <- colnames(data)[data$samples == sample_2_id] + data <- out$data + barcode_names_this_sample <- colnames(data)[data$samples == "123def"] sample_subset <- subset(data, cells = barcode_names_this_sample) - sample_subset <- subset_ids(data, out$new_ids[[sample_2_id]]) + sample_subset <- subset_ids(data, out$new_ids$`123def`) - expect_equal(ncol(data), 40) + expect_equal(ncol(out$data), 80) expect_true(all(sample_subset$nCount_RNA >= out$config$filterSettings$minCellSize)) }) test_that("filter_low_cellsize can be disabled", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() config <- mock_config(mcs = 10000, enabled = FALSE) cells_id <- mock_ids() - out <- filter_low_cellsize(scdata_list, config, sample_2_id, cells_id) + out <- filter_low_cellsize(scdata, config, "123def", cells_id) - data <- out$data - expect_equal(sum(purrr::map_int(data, ncol)), 80) - expect_equal(length(out$new_ids[[sample_1_id]]), 40) - expect_equal(length(out$new_ids[[sample_2_id]]), 40) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 40) + expect_equal(length(out$new_ids$`123def`), 40) }) - diff --git a/pipeline-runner/tests/testthat/test-qc-3-filter_high_mito.R b/pipeline-runner/tests/testthat/test-qc-3-filter_high_mito.R index f1c361d9..3a3fa661 100644 --- a/pipeline-runner/tests/testthat/test-qc-3-filter_high_mito.R +++ b/pipeline-runner/tests/testthat/test-qc-3-filter_high_mito.R @@ -13,15 +13,13 @@ mock_config <- function(max_fraction = 0.1) { return(config) } + mock_scdata <- function() { pbmc_raw <- read.table( file = system.file("extdata", "pbmc_raw.txt", package = "Seurat"), as.is = TRUE ) - sample_1_id <- "123abc" - sample_2_id <- "123def" - scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) scdata$cells_id <- 0:(ncol(scdata) - 1) @@ -30,111 +28,103 @@ mock_scdata <- function() { scdata@meta.data$percent.mt[1:10] <- 11 # add samples - scdata$samples <- rep(c(sample_1_id, sample_2_id), each = 40) + scdata$samples <- rep(c("123abc", "123def"), each = 40) scdata <- Seurat::RenameCells(scdata, paste(scdata$samples, colnames(scdata), sep = "")) - - scdata_sample1 <- subset(scdata, samples == sample_1_id) - scdata_sample2 <- subset(scdata, samples == sample_2_id) - - scdata_list <- list(scdata_sample1, scdata_sample2) - names(scdata_list) <- c(sample_1_id, sample_2_id) - - return(list(scdata_list, sample_1_id, sample_2_id)) + return(scdata) } get_threshold <- function(config) config$filterSettings$methodSettings$absoluteThreshold$maxFraction test_that("filter_high_mito filters based on threshold", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() # should filter first 10 cells config <- mock_config(0.1) config$auto <- FALSE - out <- filter_high_mito(scdata_list, config, sample_1_id, cells_id) - expect_equal(ncol(out$data[[sample_1_id]]), 40) - expect_equal(length(out$new_ids[[sample_1_id]]), 30) + out <- filter_high_mito(scdata, config, "123abc", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 30) # should filter all cells in first sample config <- mock_config(0.01) config$auto <- FALSE - out <- filter_high_mito(scdata_list, config, sample_1_id, cells_id) - expect_equal(ncol(out$data[[sample_1_id]]), 40) - expect_equal(length(out$new_ids[[sample_1_id]]), 0) + out <- filter_high_mito(scdata, config, "123abc", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(length(out$new_ids$`123abc`), 0) }) test_that("filter_high_mito can be disabled", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() - nstart <- ncol(scdata_list[[sample_1_id]]) + nstart <- ncol(scdata) # should filter first 10 cells config <- mock_config(0.1) config$enabled <- FALSE - out <- filter_high_mito(scdata_list, config, sample_1_id, cells_id) - expect_equal(ncol(out$data[[sample_1_id]]), nstart) - expect_equal(out$new_ids[[sample_1_id]], 0:39) - expect_equal(out$new_ids[[sample_2_id]], 40:79) + out <- filter_high_mito(scdata, config, "123abc", cells_id) + expect_equal(ncol(out$data), nstart) + expect_equal(out$new_ids$`123abc`, 0:39) + expect_equal(out$new_ids$`123def`, 40:79) }) test_that("filter_high_mito is sample specific", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() + nstart <- ncol(scdata) # should filter first 10 cells of 123abc only config <- mock_config(0.1) config$auto <- FALSE - out1 <- filter_high_mito(scdata_list, config, sample_1_id, cells_id) - out2 <- filter_high_mito(scdata_list, config, sample_2_id, cells_id) + out1 <- filter_high_mito(scdata, config, "123abc", cells_id) + out2 <- filter_high_mito(scdata, config, "123def", cells_id) - expect_equal(ncol(out1$data[[sample_1_id]]), 40) - expect_equal(length(out1$new_ids[[sample_1_id]]), 30) - expect_equal(length(out1$new_ids[[sample_2_id]]), 40) - expect_equal(ncol(out2$data[[sample_2_id]]), 40) - expect_equal(length(out2$new_ids[[sample_1_id]]), 40) - expect_equal(length(out2$new_ids[[sample_2_id]]), 40) + expect_equal(ncol(out1$data), 80) + expect_equal(length(out1$new_ids$`123abc`), 30) + expect_equal(length(out1$new_ids$`123def`), 40) + expect_equal(ncol(out2$data), 80) + expect_equal(length(out2$new_ids$`123abc`), 40) + expect_equal(length(out2$new_ids$`123def`), 40) }) test_that("filter_high_mito can be set to auto", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() + nstart <- ncol(scdata) # would filter all 40 cells in first sample unless auto config <- mock_config(0.01) - out <- filter_high_mito(scdata_list, config, sample_1_id, cells_id) + out <- filter_high_mito(scdata, config, "123abc", cells_id) # check that threshold was updated expect_true(get_threshold(out$config) > get_threshold(config)) # check that updated threshold was used - nkeep <- length(unlist(out$new_ids[[sample_1_id]])) - expected <- sum(scdata_list[[sample_1_id]]$percent.mt <= get_threshold(out$config) * 100) + nkeep <- length(unlist(out$new_ids)) + expected <- sum(scdata$percent.mt <= get_threshold(out$config) * 100) expect_equal(nkeep, expected) # didn't subset original data - expect_equal(ncol(out$data[[sample_1_id]]), 40) + expect_equal(ncol(out$data), 80) }) test_that("data without percent.mt outliers uses the max percentage as the threshold", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() set.seed(gem2s$random.seed) - - scdata_sample_1 <- scdata_list[[sample_1_id]] - - scdata_list$percent.mt <- rnorm(ncol(scdata_sample_1), 6) + scdata$percent.mt <- rnorm(ncol(scdata), 6) cells_id <- mock_ids() config <- mock_config() - out <- filter_high_mito(scdata_list, config, sample_1_id, cells_id) + out <- filter_high_mito(scdata, config, "123abc", cells_id) # nothing filtered - expect_length(unlist(out$new_ids[[sample_1_id]]), ncol(scdata_sample_1)) + expect_length(unlist(out$new_ids), ncol(scdata)) # threshold equals max for sample - expect_equal(get_threshold(out$config), max(scdata_sample_1$percent.mt / 100)) + expect_equal(get_threshold(out$config), max(scdata$percent.mt[scdata$samples == "123abc"]) / 100) }) diff --git a/pipeline-runner/tests/testthat/test-qc-4-filter_gene_umi_outlier.R b/pipeline-runner/tests/testthat/test-qc-4-filter_gene_umi_outlier.R index 45476767..52e3d64f 100644 --- a/pipeline-runner/tests/testthat/test-qc-4-filter_gene_umi_outlier.R +++ b/pipeline-runner/tests/testthat/test-qc-4-filter_gene_umi_outlier.R @@ -3,60 +3,46 @@ mock_ids <- function() { } mock_config <- function() { - config <- list( - auto = TRUE, - enabled = TRUE, - filterSettings = list( - regressionType = "linear", - regressionTypeSettings = list( - linear = list(p.level = 0.1) - ) + config <- list( + auto = TRUE, + enabled = TRUE, + filterSettings = list( + regressionType = 'linear', + regressionTypeSettings = list( + linear = list(p.level = 0.1) + ) + ) ) - ) return(config) } -mock_scdata <- function(with_outlier = FALSE) { - pbmc_raw <- read.table( - file = system.file("extdata", "pbmc_raw.txt", package = "Seurat"), - as.is = TRUE - ) - - if (with_outlier) { - pbmc_raw[, 1] <- 0 - pbmc_raw[1:10, 1] <- 100 - } - - sample_1_id <- "123abc" - sample_2_id <- "123def" - scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) - scdata$cells_id <- 0:(ncol(scdata) - 1) - - # add samples - scdata$samples <- rep(c(sample_1_id, sample_2_id), each = 40) - scdata <- Seurat::RenameCells(scdata, paste(scdata$samples, colnames(scdata), sep = "")) - scdata$emptyDrops_FDR <- NA +mock_scdata <- function(with_outlier = FALSE) { + pbmc_raw <- read.table( + file = system.file("extdata", "pbmc_raw.txt", package = "Seurat"), + as.is = TRUE) - scdata_sample1 <- subset(scdata, samples == sample_1_id) - scdata_sample2 <- subset(scdata, samples == sample_2_id) + if (with_outlier) { + pbmc_raw[, 1] <- 0 + pbmc_raw[1:10, 1] <- 100 + } - scdata_list <- list(scdata_sample1, scdata_sample2) - names(scdata_list) <- c(sample_1_id, sample_2_id) + scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) + scdata$cells_id <- 0:(ncol(scdata)-1) - return(list(scdata_list, sample_1_id, sample_2_id)) + # add samples + scdata$samples <- rep(c('123abc', '123def'), each = 40) + return(scdata) } - test_that("filter_gene_umi_outlier updates linear p.level in config if auto", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - config <- mock_config() - cells_id <- mock_ids() - - config$filterSettings$regressionTypeSettings$linear$p.level <- 1 - out <- filter_gene_umi_outlier(scdata_list, config, sample_2_id, cells_id) - new <- out$config$filterSettings$regressionTypeSettings$linear$p.level + scdata <- mock_scdata() + config <- mock_config() + cells_id <- mock_ids() + config$filterSettings$regressionTypeSettings$linear$p.level <- 1 + out <- filter_gene_umi_outlier(scdata, config, '123def', cells_id) + new <- out$config$filterSettings$regressionTypeSettings$linear$p.level expect_lt(new, 1) }) @@ -64,122 +50,151 @@ test_that("filter_gene_umi_outlier updates linear p.level in config if auto", { test_that("filter_gene_umi_outlier is sample specific", { - # one outlier in first sample - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(with_outlier = T) - config <- mock_config() - cells_id <- mock_ids() + # one outlier in first sample + scdata <- mock_scdata(with_outlier = TRUE) + config <- mock_config() + cells_id <- mock_ids() + + cell_ids1 <- scdata$cells_id[scdata$samples == '123abc'] + cell_ids2 <- scdata$cells_id[scdata$samples == '123def'] - cell_ids1 <- scdata_list[[sample_1_id]]$cells_id - cell_ids2 <- scdata_list[[sample_2_id]]$cells_id - out1 <- filter_gene_umi_outlier(scdata_list, config, sample_1_id, cells_id) - out2 <- filter_gene_umi_outlier(scdata_list, config, sample_2_id, cells_id) + out1 <- filter_gene_umi_outlier(scdata, config, '123abc', cells_id) + out2 <- filter_gene_umi_outlier(scdata, config, '123def', cells_id) - # filtered something - expect_lt(length(out1$new_ids[[sample_1_id]]), length(cells_id[[sample_1_id]])) - expect_lt(length(out2$new_ids[[sample_2_id]]), length(cells_id[[sample_2_id]])) + # filtered something + expect_lt(length(out1$new_ids$`123abc`), length(cells_id$`123abc`)) + expect_lt(length(out2$new_ids$`123def`), length(cells_id$`123def`)) - # didn't filter other sample - expect_true(all(cell_ids1 %in% out2$new_ids[[sample_1_id]])) - expect_true(all(cell_ids2 %in% out1$new_ids[[sample_2_id]])) + # didn't filter other sample + expect_true(all(cell_ids1 %in% out2$new_ids$`123abc`)) + expect_true(all(cell_ids2 %in% out1$new_ids$`123def`)) }) test_that("filter_gene_umi_outlier can filter cells", { - # single outlier in single sample - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(with_outlier = T) - config <- mock_config() - cells_id <- mock_ids() - nstart <- ncol(scdata_list[[sample_1_id]]) - out <- - filter_gene_umi_outlier(scdata_list, config, sample_1_id, cells_id) - expect_lt(length(unlist(out$new_ids[[sample_1_id]])), nstart) + # single outlier in single sample + scdata <- mock_scdata(with_outlier = TRUE) + config <- mock_config() + cells_id <- mock_ids() + nstart <- ncol(scdata) + scdata$samples <- '123abc' + + out <- filter_gene_umi_outlier(scdata, config, '123abc', cells_id) + expect_lt(length(unlist(out$new_ids)), nstart) }) +test_that("filter_gene_umi_outlier uses linear if regressionType is gam (legacy)", { + + # single outlier in single sample + scdata <- mock_scdata(with_outlier = TRUE) + config <- mock_config() + cells_id <- mock_ids() + nstart <- ncol(scdata) + scdata$samples <- '123abc' + + out1 <- filter_gene_umi_outlier(scdata, config, '123abc', cells_id) + expect_lt(length(unlist(out1$new_ids)), nstart) + + + config$filterSettings$regressionType <- 'gam' + out2 <- filter_gene_umi_outlier(scdata, config, '123abc', cells_id) + expect_identical(out1$new_ids, out2$new_ids) +}) test_that("filter_gene_umi_outlier can be disabled", { - # single outlier in single sample - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(with_outlier = T) - config <- mock_config() - cells_id <- mock_ids() - nstart <- ncol(scdata_list[[sample_1_id]]) + # single outlier in single sample + scdata <- mock_scdata(with_outlier = TRUE) + config <- mock_config() + nstart <- ncol(scdata) + cells_id <- mock_ids() - # disabled - config$enabled <- FALSE + # disabled + config$enabled <- FALSE - out <- filter_gene_umi_outlier(scdata_list, config, sample_1_id, cells_id) - expect_equal(ncol(out$data[[sample_1_id]]), nstart) - expect_equal(out$new_ids[[sample_1_id]], 0:39) - expect_equal(out$new_ids[[sample_2_id]], 40:79) + out <- filter_gene_umi_outlier(scdata, config, '123abc',cells_id) + expect_equal(ncol(out$data), nstart) + expect_equal(out$new_ids$`123abc`,0:39) + expect_equal(out$new_ids$`123def`,40:79) }) test_that("filter_gene_umi_outlier return the appropriate plot data", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - config <- mock_config() + scdata <- mock_scdata() cells_id <- mock_ids() + config <- mock_config() - out <- filter_gene_umi_outlier(scdata_list, config, sample_2_id, cells_id) - plot_data <- out$plotData[[1]] + out <- filter_gene_umi_outlier(scdata, config, '123def',cells_id) + plot_data <- out$plotData[[1]] - # points and lines data - expect_equal(names(plot_data), c("pointsData", "linesData")) + # points and lines data + expect_equal(names(plot_data), c('pointsData', 'linesData')) - # one value per cell - points_data <- plot_data$pointsData - lines_data <- plot_data$linesData - expect_equal(length(points_data), ncol(scdata_list[[sample_2_id]])) + # one value per cell + points_data <- plot_data$pointsData + lines_data <- plot_data$linesData + expect_equal(length(points_data), sum(scdata$samples == '123def')) - # has the right names - expect_equal(names(points_data[[1]]), c("log_molecules", "log_genes")) - expect_equal(names(lines_data[[1]]), c("log_molecules", "lower_cutoff", "upper_cutoff")) + # has the right names + expect_equal(names(points_data[[1]]), c('log_molecules', 'log_genes')) + expect_equal(names(lines_data[[1]]), c('log_molecules', 'lower_cutoff', 'upper_cutoff')) }) +test_that("filter_gene_umi_outlier skips if no barcodes for this sample", { + scdata <- mock_scdata(with_outlier = TRUE) + config <- mock_config() + cells_id <- mock_ids() + out <- filter_gene_umi_outlier(scdata, config, 'not a sample',cells_id) + + expect_equal(ncol(out$data), ncol(scdata)) + expect_equal(out$new_ids$`123abc`,0:39) + expect_equal(out$new_ids$`123def`,40:79) +}) test_that("filter_gene_umi_outlier gives different results with spline fit", { # single outlier in single sample - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(with_outlier = T) - config <- mock_config() + scdata <- mock_scdata(with_outlier = TRUE) cells_id <- mock_ids() - nstart <- ncol(scdata_list[[sample_1_id]]) + config <- mock_config() + nstart <- ncol(scdata) + scdata$samples <- '123abc' - out1 <- filter_gene_umi_outlier(scdata_list, config, sample_1_id, cells_id) - expect_equal(ncol(out1$data[[sample_1_id]]), nstart) - expect_lt(length(out1$new_ids[[sample_1_id]]), length(cells_id[[sample_1_id]])) + out1 <- filter_gene_umi_outlier(scdata, config, '123abc',cells_id) + expect_equal(ncol(out1$data), nstart) + expect_lt(length(out1$new_ids$`123abc`),length(cells_id$`123abc`)) - config$filterSettings$regressionType <- "spline" + config$filterSettings$regressionType <- 'spline' - out2 <- filter_gene_umi_outlier(scdata_list, config, sample_1_id, cells_id) + out2 <- filter_gene_umi_outlier(scdata, config, '123abc',cells_id) expect_true(!identical(out1$plotData, out2$plotData)) - expect_true(!identical(out1$new_ids[[sample_1_id]], out2$new_ids[[sample_1_id]])) + expect_true(!identical(out1$new_ids$`123abc`, out2$new_ids$`123abc`)) }) test_that("Gene UMI filter works if input is a a float-interpretable string", { - # single outlier in single sample - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(with_outlier = T) - config <- mock_config() + scdata <- mock_scdata() cells_id <- mock_ids() - nstart <- ncol(scdata_list[[sample_2_id]]) + config <- mock_config() + nstart <- ncol(scdata) config$auto <- FALSE - out_number <- filter_gene_umi_outlier(scdata_list, config, sample_2_id, cells_id) + out_number <- filter_gene_umi_outlier(scdata, config, "123def", cells_id) config$filterSettings$regressionTypeSettings$linear$p.level <- "0.1" - out_string <- filter_gene_umi_outlier(scdata_list, config, sample_2_id, cells_id) + out_string <- filter_gene_umi_outlier(scdata, config, '123def',cells_id) - expect_equal(ncol(out_number$data[[sample_2_id]]), nstart) - expect_equal(out_string$new_ids[[sample_2_id]], out_number$new_ids[[sample_2_id]]) + expect_equal(ncol(out_number$data), nstart) + expect_equal(out_string$new_ids$`123def`, out_number$new_ids$`123def`) }) test_that("Gene UMI filter throws error if input is a non float-interpretable string", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - config <- mock_config() - config$auto <- FALSE - config$filterSettings$regressionTypeSettings$linear$p.level <- - "asd" + scdata <- mock_scdata() + config <- mock_config() + config$auto <- FALSE + config$filterSettings$regressionTypeSettings$linear$p.level <- "asd" - expect_error(filter_gene_umi_outlier(scdata, config, sample_2_id, cells_id)) + expect_error(filter_gene_umi_outlier(scdata, config, "123def", cells_id)) }) + diff --git a/pipeline-runner/tests/testthat/test-qc-5-filter_doublets.R b/pipeline-runner/tests/testthat/test-qc-5-filter_doublets.R index 045afceb..21ac5ca0 100644 --- a/pipeline-runner/tests/testthat/test-qc-5-filter_doublets.R +++ b/pipeline-runner/tests/testthat/test-qc-5-filter_doublets.R @@ -9,15 +9,13 @@ mock_config <- function(thr = 0.1, auto = FALSE, enabled = TRUE) { return(config) } + mock_scdata <- function() { pbmc_raw <- read.table( file = system.file("extdata", "pbmc_raw.txt", package = "Seurat"), as.is = TRUE ) - sample_1_id <- "123abc" - sample_2_id <- "123def" - scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) scdata$cells_id <- 0:(ncol(scdata) - 1) @@ -29,70 +27,64 @@ mock_scdata <- function() { scdata@meta.data$doublet_class[1:10] <- rep("doublet", 10) # add samples - scdata$samples <- rep(c(sample_1_id, sample_2_id), each = 40) + scdata$samples <- rep(c("123abc", "123def"), each = 40) scdata <- Seurat::RenameCells(scdata, paste(scdata$samples, colnames(scdata), sep = "")) + return(scdata) +} - scdata_sample1 <- subset(scdata, samples == sample_1_id) - scdata_sample2 <- subset(scdata, samples == sample_2_id) - - scdata_list <- list(scdata_sample1, scdata_sample2) - names(scdata_list) <- c(sample_1_id, sample_2_id) - return(list(scdata_list, sample_1_id, sample_2_id)) -} test_that("filter_doublets filters based on threshold", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() # should filter first 10 cells config <- mock_config(0.5) - out <- filter_doublets(scdata_list, config, sample_1_id, cells_id) - - expect_equal(ncol(out$data[[sample_1_id]]), 40) - expect_equal(out$new_ids[[sample_1_id]], 10:39) + out <- filter_doublets(scdata, config, "123abc", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(out$new_ids$`123abc`, 10:39) }) test_that("filter_doublets is sample aware", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() - scdata_list[[sample_2_id]]@meta.data$doublet_scores[31:40] <- 0.9 + scdata@meta.data$doublet_scores[71:80] <- 0.9 config <- mock_config(0.5) - out <- filter_doublets(scdata_list, config, sample_1_id, cells_id) - expect_equal(ncol(out$data[[sample_1_id]]), 40) - expect_equal(out$new_ids[[sample_1_id]], 10:39) - expect_equal(out$new_ids[[sample_2_id]], 40:79) + out <- filter_doublets(scdata, config, "123abc", cells_id) + expect_equal(ncol(out$data), 80) + expect_equal(out$new_ids$`123abc`, 10:39) + expect_equal(out$new_ids$`123def`, 40:79) - out <- filter_doublets(out$data, config, sample_2_id, out$new_ids) - expect_equal(ncol(out$data[[sample_2_id]]), 40) - expect_equal(out$new_ids[[sample_1_id]], 10:39) - expect_equal(out$new_ids[[sample_2_id]], 40:69) + out <- filter_doublets(out$data, config, "123def", out$new_ids) + expect_equal(ncol(out$data), 80) + expect_equal(out$new_ids$`123abc`, 10:39) + expect_equal(out$new_ids$`123def`, 40:69) }) test_that("filter_doublets filters works with auto", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() # should filter first 10 cells config <- mock_config(0.001, auto = TRUE) - out <- filter_doublets(scdata_list, config, sample_1_id, cells_id) - expect_equal(out$new_ids[[sample_1_id]], 10:39) + out <- filter_doublets(scdata, config, "123abc", cells_id) + expect_equal(out$new_ids$`123abc`, 10:39) config <- mock_config(0.001, auto = FALSE) - out <- filter_doublets(scdata_list, config, sample_1_id, cells_id) - expect_equal(length(out$new_ids[[sample_1_id]]), 0) + out <- filter_doublets(scdata, config, "123abc", cells_id) + expect_equal(length(out$new_ids$`123abc`), 0) }) test_that("filter_doublets can be disabled", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() + scdata <- mock_scdata() cells_id <- mock_ids() config <- mock_config(0.5, enabled = FALSE) - out <- filter_doublets(scdata_list, config, sample_1_id, cells_id) - expect_equal(out$new_ids[[sample_1_id]], 0:39) - expect_equal(out$new_ids[[sample_2_id]], 40:79) + out <- filter_doublets(scdata, config, "123abc", cells_id) + expect_equal(out$new_ids$`123abc`, 0:39) + expect_equal(out$new_ids$`123def`, 40:79) config <- mock_config(0.5, enabled = TRUE) - out <- filter_doublets(scdata_list, config, sample_1_id, cells_id) - expect_equal(out$new_ids[[sample_1_id]], 10:39) - expect_equal(out$new_ids[[sample_2_id]], 40:79) + out <- filter_doublets(scdata, config, "123abc", cells_id) + expect_equal(out$new_ids$`123abc`, 10:39) + expect_equal(out$new_ids$`123def`, 40:79) }) diff --git a/pipeline-runner/tests/testthat/test-qc-6-integrate_scdata.R b/pipeline-runner/tests/testthat/test-qc-6-integrate_scdata.R index 07d6cf81..6612458c 100644 --- a/pipeline-runner/tests/testthat/test-qc-6-integrate_scdata.R +++ b/pipeline-runner/tests/testthat/test-qc-6-integrate_scdata.R @@ -1,43 +1,5 @@ human_cc_genes <- cc_genes[["human"]] -mock_prev_out <- function(samples = "sample_a", counts = NULL) { - if (is.null(counts)) { - counts <- DropletUtils:::simCounts() - colnames(counts) <- paste0("cell", seq_len(ncol(counts))) - } - - eout <- DropletUtils::emptyDrops(counts) - - counts_list <- list() - edrops <- list() - doublet_scores <- list() - - for (sample in samples) { - counts_list[[sample]] <- counts - edrops[[sample]] <- eout - doublet_scores[[sample]] <- mock_doublet_scores(counts) - } - - annot <- data.frame(name = row.names(counts), input = row.names(counts)) - - # as passed to create_seurat - prev_out <- list( - counts_list = counts_list, - edrops = edrops, - doublet_scores = doublet_scores, - annot = annot, - config = list(name = "project name") - ) - - # call create_seurat to get prev_out to pass to prepare_experiment - prev_out <- create_seurat(NULL, NULL, prev_out)$output - - - prev_out$scdata_list <- add_metadata_to_samples(prev_out$scdata_list, annot = annot, experiment_id = "expid") - - return(prev_out) -} - mock_scdata <- function(rename_genes = c(), n_rep = 1) { pbmc_raw <- read.table( file = system.file("extdata", "pbmc_raw.txt", package = "Seurat"), @@ -52,33 +14,12 @@ mock_scdata <- function(rename_genes = c(), n_rep = 1) { rownames(pbmc_raw)[some_genes] <- rename_genes } - sample_1_id <- "123abc" - sample_2_id <- "123def" - scdata <- Seurat::CreateSeuratObject(counts = pbmc_raw) # add samples - scdata$samples <- rep(c(sample_1_id, sample_2_id), each = ncol(scdata) / 2) + scdata$samples <- rep(c("123abc", "123def"), each = ncol(scdata) / 2) scdata$cells_id <- 0:(ncol(scdata) - 1) - scdata@misc$gene_annotations <- data.frame(input = rownames(scdata), name = paste("SYMBOL -", rownames(scdata))) - rownames(scdata@misc$gene_annotations) <- rownames(scdata) - - scdata_sample1 <- subset(scdata, samples == sample_1_id) - scdata_sample2 <- subset(scdata, samples == sample_2_id) - - - scdata_list <- list(scdata_sample1, scdata_sample2) - names(scdata_list) <- c(sample_1_id, sample_2_id) - - return(list(scdata_list, sample_1_id, sample_2_id)) - -} - -mock_ids <- function() { - return(list("123abc" = 0:39, "123def" = 40:79)) -} - -scdata_preprocessing <- function(scdata) { + scdata@misc$gene_annotations$input <- rownames(scdata) # scale and PCA scdata <- Seurat::NormalizeData(scdata, normalization.method = "LogNormalize", verbose = FALSE) @@ -88,101 +29,79 @@ scdata_preprocessing <- function(scdata) { scdata@misc[["active.reduction"]] <- "pca" return(scdata) - -} - -mock_doublet_scores <- function(counts) { - doublet_scores <- runif(ncol(counts)) - doublet_class <- ifelse(doublet_scores < 0.8, "singlet", "doublet") - - data.frame( - row.names = colnames(counts), - barcodes = colnames(counts), - doublet_class = doublet_class, - doublet_scores = doublet_scores - ) } test_that("Integrate scdata works", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - cells_id <- mock_ids() + scdata <- mock_scdata() + cells_id <- 0:79 config <- list( dimensionalityReduction = list(numPCs = 2), dataIntegration = list(method = "harmony", methodSettings = list(harmony = list(numGenes = 10, normalisation = "logNormalize"))) ) - integrated_scdata <- suppressWarnings(integrate_scdata(scdata_list, config, "", cells_id, task_name = "dataIntegration"))$data - expect_s4_class(integrated_scdata, "Seurat") - expect_equal(ncol(integrated_scdata), 80) + scdata <- suppressWarnings(integrate_scdata(scdata, config, "", cells_id, task_name = "dataIntegration"))$data + expect_s4_class(scdata, "Seurat") + expect_equal(ncol(scdata), 80) }) test_that("Integrate scdata filters out cells ids", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - cells_id <- mock_ids() - cells_id[[sample_1_id]] <- cells_id[[sample_1_id]][-c(23:31)] - cells_id[[sample_2_id]] <- cells_id[[sample_2_id]][-c(40:47)] - + scdata <- mock_scdata() + cells_id <- 0:40 config <- list( dimensionalityReduction = list(numPCs = 2), dataIntegration = list(method = "harmony", methodSettings = list(harmony = list(numGenes = 10, normalisation = "logNormalize"))) ) - scdata <- suppressWarnings(integrate_scdata(scdata_list, config, "", cells_id, task_name = "dataIntegration"))$data + scdata <- suppressWarnings(integrate_scdata(scdata, config, "", cells_id, task_name = "dataIntegration"))$data expect_lt(ncol(scdata), 80) }) test_that("harmony integration works", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata() config <- list( dimensionalityReduction = list(numPCs = 2), dataIntegration = list(method = "harmony", methodSettings = list(harmony = list(numGenes = 10, normalisation = "logNormalize"))) ) - integrated_scdata <- suppressWarnings(run_dataIntegration(merged_scdata, config)) - expect_s4_class(integrated_scdata, "Seurat") + scdata <- suppressWarnings(run_dataIntegration(scdata, config)) + expect_s4_class(scdata, "Seurat") }) test_that("SeuratV4 integration doesnt error out with small dataset", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata() config <- list( dimensionalityReduction = list(numPCs = 2, method = "rpca"), dataIntegration = list(method = "seuratv4", methodSettings = list(seuratv4 = list(numGenes = 1000, normalisation = "logNormalize"))) ) - integrated_scdata <- suppressWarnings(run_dataIntegration(merged_scdata, config)) - expect_s4_class(integrated_scdata, "Seurat") + scdata <- suppressWarnings(run_dataIntegration(scdata, config)) + expect_s4_class(scdata, "Seurat") }) test_that("Unisample integration works", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata() config <- list( dimensionalityReduction = list(numPCs = 2), dataIntegration = list(method = "unisample", methodSettings = list(unisample = list(numGenes = 1000, normalisation = "logNormalize"))) ) - integrated_scdata <- suppressWarnings(run_dataIntegration(merged_scdata, config)) - expect_s4_class(integrated_scdata, "Seurat") + scdata <- suppressWarnings(run_dataIntegration(scdata, config)) + expect_s4_class(scdata, "Seurat") }) test_that("FastMNN is not working", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata() config <- list( dimensionalityReduction = list(numPCs = 2), dataIntegration = list(method = "fastmnn", methodSettings = list(fastmnn = list(numGenes = 1000, normalisation = "logNormalize"))) ) - expect_error(suppressWarnings(run_dataIntegration(merged_scdata, config))) + expect_error(suppressWarnings(run_dataIntegration(scdata, config))) }) test_that("numPCs estimation works", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - merged_scdata <- create_scdata(scdata_list) - merged_scdata <- suppressWarnings(scdata_preprocessing(merged_scdata)) - npcs <- get_npcs(merged_scdata) + scdata <- suppressWarnings(mock_scdata()) + npcs <- get_npcs(scdata) expect_lte(npcs, 30) }) @@ -190,9 +109,8 @@ test_that("numPCs estimation works", { test_that("build_cc_gene_list correctly makes the list of cc genes when there are matches", { n_rename <- 10 some_cc_genes <- sample(human_cc_genes$symbol, n_rename) - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(rename_genes = some_cc_genes) - merged_scdata <- create_scdata(scdata_list) - all_genes <- merged_scdata@misc$gene_annotations$input + scdata <- suppressWarnings(mock_scdata(rename_genes = some_cc_genes)) + all_genes <- scdata@misc$gene_annotations$input expected_res <- match(some_cc_genes, all_genes) res <- build_cc_gene_list(all_genes) @@ -202,9 +120,8 @@ test_that("build_cc_gene_list correctly makes the list of cc genes when there ar test_that("build_cc_gene_list returns empty int vector when there aren't matches", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - merged_scdata <- create_scdata(scdata_list) - all_genes <- merged_scdata@misc$gene_annotations$input + scdata <- suppressWarnings(mock_scdata()) + all_genes <- scdata@misc$gene_annotations$input # empty integer vector expected_res <- integer() @@ -217,9 +134,8 @@ test_that("build_cc_gene_list returns empty int vector when there aren't matches test_that("list_exclude_genes adds custom genes to exclusion", { n_rename <- 10 some_cc_genes <- sample(human_cc_genes$symbol, n_rename) - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(rename_genes = some_cc_genes) - merged_scdata <- create_scdata(scdata_list) - all_genes <- merged_scdata@misc$gene_annotations$input + scdata <- suppressWarnings(mock_scdata(rename_genes = some_cc_genes)) + all_genes <- scdata@misc$gene_annotations$input exclude_custom <- sample(setdiff(all_genes, some_cc_genes), 7) exclude_custom_indices <- match(exclude_custom, all_genes) @@ -234,132 +150,108 @@ test_that("list_exclude_genes adds custom genes to exclusion", { test_that("remove_genes removes the correct genes when there are genes to remove", { n_rename <- 10 some_cc_genes <- sample(human_cc_genes$symbol, n_rename) - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata(rename_genes = some_cc_genes) - merged_scdata <- create_scdata(scdata_list) - all_genes <- merged_scdata@misc$gene_annotations$input + scdata <- suppressWarnings(mock_scdata(rename_genes = some_cc_genes)) + all_genes <- scdata@misc$gene_annotations$input - res <- remove_genes(merged_scdata, exclude_groups = list("cellCycle")) + res <- remove_genes(scdata, exclude_groups = list("cellCycle")) # only cc genes - expect_equal(nrow(res), nrow(merged_scdata) - 10) + expect_equal(nrow(res), nrow(scdata) - 10) expect_false(any(some_cc_genes %in% rownames(res))) exclude_custom <- sample(setdiff(all_genes, some_cc_genes), 7) # cc genes and custom - res <- remove_genes(merged_scdata, exclude_groups = list("cellCycle"), exclude_custom) + res <- remove_genes(scdata, exclude_groups = list("cellCycle"), exclude_custom) - expect_equal(nrow(res), nrow(merged_scdata) - 17) + expect_equal(nrow(res), nrow(scdata) - 17) expect_false(any(c(some_cc_genes, exclude_custom) %in% rownames(res))) }) test_that("remove_genes doesn't modify the object when there are no matches", { - c(scdata_list, sample_1_id, sample_2_id) %<-% mock_scdata() - merged_scdata <- create_scdata(scdata_list) + scdata <- suppressWarnings(mock_scdata()) + # empty integer vector - res <- remove_genes(merged_scdata, exclude_groups = "cellCycle") + res <- remove_genes(scdata, exclude_groups = "cellCycle") - expect_equal(res, merged_scdata) + expect_equal(res, scdata) }) test_that("SeuratV4 integration works", { # mock a bigger dataset to run Seurat v4 integration without skipping it - c(scdata_list, sample_1_id, sample_2_id) %<-% suppressWarnings(mock_scdata(n_rep = 3)) - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata(n_rep = 3) - merged_scdata <- suppressWarnings(scdata_preprocessing(merged_scdata)) - npcs <- get_npcs(merged_scdata) + npcs <- get_npcs(scdata) config <- list( dimensionalityReduction = list(numPCs = npcs, method = "rpca"), dataIntegration = list(method = "seuratv4", methodSettings = list(seuratv4 = list(numGenes = 1000, normalisation = "logNormalize"))) ) - merged_scdata <- suppressWarnings(run_dataIntegration(merged_scdata, config)) - expect_s4_class(merged_scdata, "Seurat") + scdata <- suppressWarnings(run_dataIntegration(scdata, config)) + expect_s4_class(scdata, "Seurat") }) test_that("PCA is computed when RPCA method is selected within SeuratV4 integration", { # mock a bigger dataset to run Seurat v4 integration without skipping it - c(scdata_list, sample_1_id, sample_2_id) %<-% suppressWarnings(mock_scdata(n_rep = 3)) - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata(n_rep = 3) - merged_scdata <- suppressWarnings(scdata_preprocessing(merged_scdata)) - npcs <- get_npcs(merged_scdata) + npcs <- get_npcs(scdata) config <- list( dimensionalityReduction = list(numPCs = npcs, method = "rpca"), dataIntegration = list(method = "seuratv4", methodSettings = list(seuratv4 = list(numGenes = 1000, normalisation = "logNormalize"))) ) - expect_message(run_dataIntegration(merged_scdata, config), "Running PCA") + expect_message(run_dataIntegration(scdata, config), "Running PCA") }) test_that("PCA is not computed when CCA method is selected within SeuratV4 integration", { # mock a bigger dataset to run Seurat v4 integration without skipping it - c(scdata_list, sample_1_id, sample_2_id) %<-% suppressWarnings(mock_scdata(n_rep = 3)) - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata(n_rep = 3) - merged_scdata <- suppressWarnings(scdata_preprocessing(merged_scdata)) - npcs <- get_npcs(merged_scdata) + npcs <- get_npcs(scdata) config <- list( dimensionalityReduction = list(numPCs = npcs, method = "cca"), dataIntegration = list(method = "seuratv4", methodSettings = list(seuratv4 = list(numGenes = 1000, normalisation = "logNormalize"))) ) - expect_message(run_dataIntegration(merged_scdata, config), "PCA is not running .*") + expect_message(run_dataIntegration(scdata, config), "PCA is not running .*") }) test_that("SeuratV4 integration finds integration anchors using RPCA method, if method in config is RPCA", { # mock a bigger dataset to run Seurat v4 integration without skipping it - c(scdata_list, sample_1_id, sample_2_id) %<-% suppressWarnings(mock_scdata(n_rep = 3)) - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata(n_rep = 3) - merged_scdata <- suppressWarnings(scdata_preprocessing(merged_scdata)) - npcs <- get_npcs(merged_scdata) + npcs <- get_npcs(scdata) config <- list( dimensionalityReduction = list(numPCs = npcs, method = "rpca"), dataIntegration = list(method = "seuratv4", methodSettings = list(seuratv4 = list(numGenes = 1000, normalisation = "logNormalize"))) ) - expect_message(merged_scdata <- suppressWarnings(run_dataIntegration(merged_scdata, config)), "Finding integration anchors using RPCA reduction") - expect_equal(merged_scdata@commands$FindIntegrationAnchors$reduction, "pca") + expect_message(scdata <- suppressWarnings(run_dataIntegration(scdata, config)), "Finding integration anchors using RPCA reduction") + expect_equal(scdata@commands$FindIntegrationAnchors$reduction, "pca") }) test_that("SeuratV4 integration finds integration anchors using CCA method, if method in config is CCA", { # mock a bigger dataset to run Seurat v4 integration without skipping it - c(scdata_list, sample_1_id, sample_2_id) %<-% suppressWarnings(mock_scdata(n_rep = 3)) - merged_scdata <- create_scdata(scdata_list) + scdata <- mock_scdata(n_rep = 3) - merged_scdata <- suppressWarnings(scdata_preprocessing(merged_scdata)) - npcs <- get_npcs(merged_scdata) + npcs <- get_npcs(scdata) config <- list( dimensionalityReduction = list(numPCs = npcs, method = "cca"), dataIntegration = list(method = "seuratv4", methodSettings = list(seuratv4 = list(numGenes = 1000, normalisation = "logNormalize"))) ) - expect_message(merged_scdata <- suppressWarnings(run_dataIntegration(merged_scdata, config)), "Finding integration anchors using CCA reduction") - expect_equal(merged_scdata@commands$FindIntegrationAnchors$reduction, "cca") + expect_message(scdata <- suppressWarnings(run_dataIntegration(scdata, config)), "Finding integration anchors using CCA reduction") + expect_equal(scdata@commands$FindIntegrationAnchors$reduction, "cca") }) - - -test_that("merge_scdata_list correctly merges seurat objects", { - prev_out <- mock_prev_out(samples = c("a", "b", "c")) - scdata_list <- prev_out$scdata_list - - - scdata <- suppressWarnings(merge_scdata_list(scdata_list)) - - expect_equal(sum(unlist(lapply(scdata_list, ncol))), ncol(scdata)) - expect_true(all(scdata$samples[1:ncol(scdata_list[[1]])] == "a")) - -}) \ No newline at end of file