Merge pull request #355 from biomage-org/adjust-doublet-score

Adjust doublet score calculation according to sequencing technology
hms-dbmi-cellenics · Jan 29, 2024 · dddacd0 · dddacd0
2 parents cc2d233 + dd8a236
commit dddacd0
Show file tree

Hide file tree

Showing 8 changed files with 101 additions and 15 deletions.
diff --git a/.github/workflows/pr_validate.yaml b/.github/workflows/pr_validate.yaml
@@ -63,7 +63,7 @@ jobs:
       - id: reach-staging
         if: steps.extract-staging.outputs.url != 'N/A'
         name: Attempt to reach staging environments
-        uses: nick-invision/retry@v2
+        uses: nick-fields/retry@v2
         with:
           timeout_seconds: 30
           max_attempts: 5

diff --git a/pipeline-runner/R/gem2s-4-score_doublets.R b/pipeline-runner/R/gem2s-4-score_doublets.R
@@ -14,6 +14,7 @@ score_doublets <- function(input, pipeline_config, prev_out) {
   # NOTE: edrops is not required
   check_prev_out(prev_out, c("config", "counts_list", "annot"))
 
+  technology <- input$input$type
   edrops_list <- prev_out$edrops
   counts_list <- prev_out$counts_list
   samples <- names(counts_list)
@@ -30,7 +31,8 @@ score_doublets <- function(input, pipeline_config, prev_out) {
       sample_counts <- sample_counts[, keep]
     }
 
-    scores[[sample]] <- get_doublet_scores(sample_counts)
+    # TODO: Pass also parse_kit when available from the UI
+    scores[[sample]] <- get_doublet_scores(sample_counts, technology = technology)
 
   }
 
@@ -48,12 +50,24 @@ score_doublets <- function(input, pipeline_config, prev_out) {
 #' Compute doublets scores per sample.
 #'
 #' @param sample_counts Sparse matrix with the counts for one sample.
+#' @param technology Technology used to generate the data (e.g. 10x, Parse).
+#' @param parse_kit Kit used to generate the data (specific to Parse data).
 #'
 #' @return data.frame with doublet scores and assigned classes
 #'
-compute_sample_doublet_scores <- function(sample_counts) {
+compute_sample_doublet_scores <- function(sample_counts, technology, parse_kit = "WT") {
   set.seed(RANDOM_SEED)
-  sce <- scDblFinder::scDblFinder(sample_counts)
+
+  dbr <- NULL
+  if (technology == "parse") {
+    if (parse_kit %in% names(DOUBLET_RATE_PARSE)) {
+      dbr <- DOUBLET_RATE_PARSE[[parse_kit]]
+    } else {
+      stop("Invalid parse kit value: ", parse_kit)
+    }
+  }
+  sce <- scDblFinder::scDblFinder(sample_counts, dbr = dbr)
+
   doublet_res <- data.frame(
     row.names = colnames(sce),
     barcodes = colnames(sce),
@@ -65,7 +79,7 @@ compute_sample_doublet_scores <- function(sample_counts) {
 }
 
 
-get_doublet_scores <- function(sample_counts, max_attempts = 5) {
+get_doublet_scores <- function(sample_counts, max_attempts = 5, technology = "10x") {
   # also filter low UMI as per scDblFinder:::.checkSCE()
   ntot <- Matrix::colSums(sample_counts)
 
@@ -77,7 +91,8 @@ get_doublet_scores <- function(sample_counts, max_attempts = 5) {
     # make the threshold stricter in every attempt
     empty_cells_mask <- ntot > (200 * attempt)
     try({
-      scores <- compute_sample_doublet_scores(sample_counts[, empty_cells_mask])
+      # TODO: Pass also parse_kit when available from the UI
+      scores <- compute_sample_doublet_scores(sample_counts[, empty_cells_mask], technology)
       retry <- "not null"
     })
     attempt <- attempt + 1

diff --git a/pipeline-runner/R/qc-5-filter_doublets.R b/pipeline-runner/R/qc-5-filter_doublets.R
@@ -28,7 +28,7 @@ filter_doublets <- function(scdata_list, config, sample_id, cells_id, task_name
 
   if ("recomputeDoubletScore" %in% names(config)) {
     if (config$recomputeDoubletScore) {
-    scores <- get_doublet_scores(sample_data@assays$RNA@counts)
+    scores <- get_doublet_scores(sample_data@assays$RNA@counts, technology = config$sampleTechnology)
     sample_data <- add_dblscore(sample_data, scores)
     # update doublet scores in original scdata
     scdata_list[[sample_id]] <- add_dblscore(scdata_list[[sample_id]], scores)

diff --git a/pipeline-runner/R/sysdata.rda b/pipeline-runner/R/sysdata.rda
diff --git a/pipeline-runner/data-raw/sysdata.R b/pipeline-runner/data-raw/sysdata.R
@@ -113,6 +113,9 @@ pipeline_version <- 2
 
 UNISAMPLE <- "unisample"
 
+# Parse kits empirical doublet rates
+DOUBLET_RATE_PARSE <- list(mini = 0.046, WT = 0.034, mega = 0.064)
+
 # pipeline error constants
 errors <- list(
   ERROR_SEURAT_RDS = 'ERROR_SEURAT_RDS',
@@ -148,6 +151,7 @@ usethis::use_data(
   RIBOSOMAL_REGEX,
   pipeline_version,
   UNISAMPLE,
+  DOUBLET_RATE_PARSE,
   errors,
   internal = TRUE,
   overwrite = TRUE

diff --git a/pipeline-runner/man/compute_sample_doublet_scores.Rd b/pipeline-runner/man/compute_sample_doublet_scores.Rd
diff --git a/pipeline-runner/tests/testthat/test-gem2s-4-score_doublets.R b/pipeline-runner/tests/testthat/test-gem2s-4-score_doublets.R
@@ -7,16 +7,22 @@ mock_counts <- function(...) {
   return(Matrix::Matrix(counts, sparse = T))
 }
 
+mock_input <- function(technology) {
+  input <- list()
+  input$input$type <- technology
+  return(input)
+}
 
 test_that("score_doublets returns expected columns", {
   counts <- mock_counts()
+  input <- mock_input("10x")
 
   prev_out <- list(
     counts_list = list(sample1 = counts),
     config = list(),
     annot = list()
   )
-  out <- score_doublets(NULL, NULL, prev_out)$output
+  out <- score_doublets(input, NULL, prev_out)$output
 
   expect_setequal(
     colnames(out$doublet_scores$sample1),
@@ -29,14 +35,16 @@ test_that("score_doublets filters cells to avoid warning of extremely low read c
   counts <- mock_counts(ncells = c(200, 300, 400, 200, 500, 300))
   counts <- round(counts / 2)
 
+  input <- mock_input("10x")
+
   expect_warning(scDblFinder:::.checkSCE(counts), "extremely low read counts")
 
   prev_out <- list(
     counts_list = list(sample1 = counts),
     config = list(),
     annot = list()
   )
-  expect_warning(score_doublets(NULL, NULL, prev_out), NA)
+  expect_warning(score_doublets(input, NULL, prev_out), NA)
 })
 
 
@@ -47,15 +55,69 @@ test_that("score_doublets fails if prev_out is missing 'config', 'counts_list',
     config = list(),
     annot = list()
   )
+  input <- mock_input("10x")
 
   prev_out$config <- NULL
-  expect_error(score_doublets(NULL, NULL, prev_out), "config is missing")
+  expect_error(score_doublets(input, NULL, prev_out), "config is missing")
 
   prev_out$config <- list()
   prev_out$annot <- NULL
-  expect_error(score_doublets(NULL, NULL, prev_out), "annot is missing")
+  expect_error(score_doublets(input, NULL, prev_out), "annot is missing")
 
   prev_out$annot <- data.frame()
   prev_out$counts_list <- NULL
-  expect_error(score_doublets(NULL, NULL, prev_out), "counts_list is missing")
+  expect_error(score_doublets(input, NULL, prev_out), "counts_list is missing")
+})
+
+
+test_that("compute_sample_doublet_scores handles technology 'parse' correctly", {
+  counts <- mock_counts()
+  input <- mock_input("parse")
+
+  prev_out <- list(
+    counts_list = list(sample1 = counts),
+    config = list(),
+    annot = list()
+  )
+  out <-suppressWarnings(score_doublets(input, NULL, prev_out)$output)
+
+  expect_setequal(
+    colnames(out$doublet_scores$sample1),
+    c("barcodes", "doublet_class", "doublet_scores")
+  )
+})
+
+
+test_that("compute_sample_doublet_scores uses correct dbr for different Parse kits", {
+  counts <- mock_counts()
+
+  dbr_mini <- DOUBLET_RATE_PARSE[["mini"]]
+  set.seed(RANDOM_SEED)
+  expected_sce_mini <- suppressWarnings(scDblFinder::scDblFinder(counts, dbr = dbr_mini))
+
+  dbr_wt <- DOUBLET_RATE_PARSE[["WT"]]
+  set.seed(RANDOM_SEED)
+  expected_sce_wt <- suppressWarnings(scDblFinder::scDblFinder(counts, dbr = dbr_wt))
+
+  dbr_mega <- DOUBLET_RATE_PARSE[["mega"]]
+  set.seed(RANDOM_SEED)
+  expected_sce_mega <- suppressWarnings(scDblFinder::scDblFinder(counts, dbr = dbr_mega))
+
+  observed_sce_mini <- suppressWarnings(compute_sample_doublet_scores(counts, technology = "parse", parse_kit = "mini"))
+  observed_sce_wt <- suppressWarnings(compute_sample_doublet_scores(counts, technology = "parse", parse_kit = "WT"))
+  observed_sce_mega <- suppressWarnings(compute_sample_doublet_scores(counts, technology = "parse", parse_kit = "mega"))
+
+  expect_identical(expected_sce_mini$scDblFinder.score, observed_sce_mini$doublet_scores)
+  expect_identical(expected_sce_wt$scDblFinder.score, observed_sce_wt$doublet_scores)
+  expect_identical(expected_sce_mega$scDblFinder.score, observed_sce_mega$doublet_scores)
+})
+
+
+test_that("compute_sample_doublet_scores stops with an error for invalid parse_kit values", {
+  counts <- mock_counts()
+
+  expect_error(
+    compute_sample_doublet_scores(counts, technology = "parse", parse_kit = "invalid_kit"),
+    "Invalid parse kit value"
+  )
 })
diff --git a/pipeline-runner/tests/testthat/test-qc-5-filter_doublets.R b/pipeline-runner/tests/testthat/test-qc-5-filter_doublets.R
@@ -4,14 +4,15 @@ mock_ids <- function() {
 }
 
 
-mock_config <- function(thr = 0.1, auto = FALSE, enabled = TRUE, recomputeDoubletScore = FALSE) {
+mock_config <- function(thr = 0.1, auto = FALSE, enabled = TRUE, recomputeDoubletScore = FALSE, sampleTechnology = "10x") {
   config <- list(
     auto = auto,
     enabled = enabled,
     filterSettings = list(
       probabilityThreshold = thr
     ),
-    recomputeDoubletScore = recomputeDoubletScore
+    recomputeDoubletScore = recomputeDoubletScore,
+    sampleTechnology = sampleTechnology
   )
   return(config)
 }