Merge pull request #177 from biomage-ltd/separate-qc-steps

Separate qc steps
hms-dbmi-cellenics · Nov 4, 2021 · e019c19 · e019c19
2 parents fe6bded + 2b1e198
commit e019c19
Show file tree

Hide file tree

Showing 45 changed files with 1,486 additions and 1,202 deletions.
diff --git a/pipeline-runner/.gitignore b/pipeline-runner/.gitignore
@@ -1 +1,2 @@
 .Rproj.user
+tests/testthat/testthat-problems.rds
diff --git a/pipeline-runner/NAMESPACE b/pipeline-runner/NAMESPACE
@@ -9,8 +9,11 @@ export(filter_high_mito)
 export(filter_low_cellsize)
 export(generate_default_values_cellSizeDistribution)
 export(generate_default_values_classifier)
+export(generate_first_step_ids)
+export(getClusters)
 export(load_cellranger_files)
 export(prepare_experiment)
+export(runClusters)
 export(run_emptydrops)
 export(score_doublets)
 import(data.table)

diff --git a/pipeline-runner/R/gem2s-1-download_cellranger_files.R b/pipeline-runner/R/gem2s-1-download_cellranger_files.R
@@ -57,7 +57,8 @@ download_cellranger_files <- function(input, pipeline_config, prev_out = list())
   prev_out$config <- config
   res <- list(
     data = list(),
-    ouput = prev_out)
+    ouput = prev_out
+  )
 
   message("\nDownloading of cellranger files step complete.")
   return(res)

diff --git a/pipeline-runner/R/gem2s-2-load_cellranger_files.R b/pipeline-runner/R/gem2s-2-load_cellranger_files.R
@@ -9,9 +9,9 @@
 #' }
 #' @export
 #'
-load_cellranger_files <- function(input, pipeline_config, prev_out, input_dir = '/input') {
+load_cellranger_files <- function(input, pipeline_config, prev_out, input_dir = "/input") {
   message("Loading cellranger output ...")
-  check_prev_out(prev_out, 'config')
+  check_prev_out(prev_out, "config")
 
   # destructure previous output
   config <- prev_out$config
@@ -20,7 +20,8 @@ load_cellranger_files <- function(input, pipeline_config, prev_out, input_dir =
 
   res <- list(
     data = list(),
-    output = output)
+    output = output
+  )
 
   message("\nLoading of cellranger files step complete.")
   return(res)
@@ -44,15 +45,15 @@ call_read10x <- function(config, input_dir) {
   for (sample in samples) {
     sample_dir <- file.path(input_dir, sample)
     sample_fpaths <- list.files(sample_dir)
-    annot_fpath <- file.path(sample_dir, 'features.tsv.gz')
+    annot_fpath <- file.path(sample_dir, "features.tsv.gz")
 
     message("\nSample --> ", sample)
-    message("Reading files from ", sample_dir, ' --> ', paste(sample_fpaths, collapse = ' - '))
+    message("Reading files from ", sample_dir, " --> ", paste(sample_fpaths, collapse = " - "))
 
     counts <- Seurat::Read10X(sample_dir, gene.column = 1)
 
-    if(is(counts, 'list')) {
-      slot <- 'Gene Expression'
+    if (is(counts, "list")) {
+      slot <- "Gene Expression"
       # questionable: grab first slot if no gene expression
       if (!slot %in% names(counts)) slot <- names(counts)[1]
       counts <- counts[[slot]]
@@ -86,8 +87,8 @@ format_annot <- function(annot_list) {
   annot$original_name <- gname
   is.dup <- duplicated(gname) | duplicated(gname, fromLast = TRUE)
 
-  #We need to convert the gene inputs from _ to - bc when we create the Seurat object we do this, and the match would return NA values if any of the inputs still has _.
-  annot$input <- gsub('_', '-', annot$input)
+  # We need to convert the gene inputs from _ to - bc when we create the Seurat object we do this, and the match would return NA values if any of the inputs still has _.
+  annot$input <- gsub("_", "-", annot$input)
   annot$name[is.dup] <- paste(gname[is.dup], annot$input[is.dup], sep = " - ")
 
   annot <- annot[!duplicated(annot$input), ]

diff --git a/pipeline-runner/R/gem2s-3-run_emptydrops.R b/pipeline-runner/R/gem2s-3-run_emptydrops.R
@@ -10,7 +10,7 @@
 #'
 run_emptydrops <- function(input, pipeline_config, prev_out) {
   message("Testing if droplets are empty...")
-  check_prev_out(prev_out, c('config', 'counts_list', 'annot'))
+  check_prev_out(prev_out, c("config", "counts_list", "annot"))
 
   # destructure previous output
   counts_list <- prev_out$counts_list
@@ -25,7 +25,8 @@ run_emptydrops <- function(input, pipeline_config, prev_out) {
   prev_out$edrops <- edrops
   res <- list(
     data = list(),
-    output = prev_out)
+    output = prev_out
+  )
 
   message("\nRunning of emptydrops step complete.")
   return(res)

diff --git a/pipeline-runner/R/gem2s-4-score_doublets.R b/pipeline-runner/R/gem2s-4-score_doublets.R
@@ -12,7 +12,7 @@ score_doublets <- function(input, pipeline_config, prev_out) {
   message("Calculating probability of droplets being doublets...")
 
   # NOTE: edrops is not required
-  check_prev_out(prev_out, c('config', 'counts_list', 'annot'))
+  check_prev_out(prev_out, c("config", "counts_list", "annot"))
 
   edrops_list <- prev_out$edrops
   counts_list <- prev_out$counts_list
@@ -40,7 +40,8 @@ score_doublets <- function(input, pipeline_config, prev_out) {
   prev_out$doublet_scores <- scores
   res <- list(
     data = list(),
-    output = prev_out)
+    output = prev_out
+  )
 
   message("\nScoring doublets step complete.")
   return(res)
@@ -54,7 +55,6 @@ score_doublets <- function(input, pipeline_config, prev_out) {
 #' @return data.frame with doublet scores and assigned classes
 #'
 compute_sample_doublet_scores <- function(sample_counts) {
-
   set.seed(0)
   sce <- scDblFinder::scDblFinder(sample_counts)
   doublet_res <- data.frame(

diff --git a/pipeline-runner/R/gem2s-5-create_seurat.R b/pipeline-runner/R/gem2s-5-create_seurat.R
@@ -11,7 +11,7 @@ create_seurat <- function(input, pipeline_config, prev_out) {
   message("Creating Seurat Objects...")
 
   # NOTE: edrops can be empty list
-  check_names <- c('config', 'counts_list', 'annot', 'doublet_scores', 'edrops')
+  check_names <- c("config", "counts_list", "annot", "doublet_scores", "edrops")
   check_prev_out(prev_out, check_names)
 
   # destructure previous output: config, counts_list, annot, and doublet_scores
@@ -28,30 +28,32 @@ create_seurat <- function(input, pipeline_config, prev_out) {
       edrops_out = edrops[[sample]],
       sample = sample,
       annot = annot,
-      config = config)
+      config = config
+    )
   }
 
 
   prev_out$scdata_list <- scdata_list
   res <- list(
     data = list(),
-    output = prev_out)
+    output = prev_out
+  )
 
   message("\nCreation of Seurat objects step complete.")
   return(res)
 }
 
 # construct SeuratObject
 construct_scdata <- function(counts, doublet_score, edrops_out, sample, annot, config, min.cells = 3, min.features = 10) {
-
   metadata <- construct_metadata(counts, sample, config)
 
   scdata <- Seurat::CreateSeuratObject(
     counts,
     meta.data = metadata,
     project = config$name,
     min.cells = min.cells,
-    min.features = min.features)
+    min.features = min.features
+  )
 
   scdata <- scdata %>%
     add_mito(annot) %>%
@@ -86,7 +88,6 @@ construct_metadata <- function(counts, sample, config) {
 
 # add mitochondrial percent to SeuratObject
 add_mito <- function(scdata, annot) {
-
   if (any(grepl("^mt-", annot$name, ignore.case = TRUE))) {
     message("Adding MT information...")
     mt.features <- annot$input[grep("^mt-", annot$name, ignore.case = TRUE)]
@@ -102,7 +103,6 @@ add_mito <- function(scdata, annot) {
 
 # add emptyDrops result to SeuratObject
 add_edrops <- function(scdata, edout) {
-
   scdata@tools$flag_filtered <- is.null(edout)
 
   if (!scdata@tools$flag_filtered) {
@@ -120,7 +120,6 @@ add_edrops <- function(scdata, edout) {
     rownames(meta.data) <- meta.data$barcode
 
     scdata@meta.data <- meta.data
-
   } else {
     message("emptyDrops results not present, skipping...")
     scdata@meta.data$emptyDrops_FDR <- NA
Original file line number	Diff line number	Diff line change
		@@ -1 +1,2 @@
		.Rproj.user
		tests/testthat/testthat-problems.rds