Skip to content

Commit

Permalink
Merge pull request hms-dbmi-cellenics#355 from biomage-org/adjust-dou…
Browse files Browse the repository at this point in the history
…blet-score

Adjust doublet score calculation according to sequencing technology
  • Loading branch information
saracastel authored Jan 29, 2024
2 parents cc2d233 + dd8a236 commit dddacd0
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 15 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pr_validate.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ jobs:
- id: reach-staging
if: steps.extract-staging.outputs.url != 'N/A'
name: Attempt to reach staging environments
uses: nick-invision/retry@v2
uses: nick-fields/retry@v2
with:
timeout_seconds: 30
max_attempts: 5
Expand Down
25 changes: 20 additions & 5 deletions pipeline-runner/R/gem2s-4-score_doublets.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ score_doublets <- function(input, pipeline_config, prev_out) {
# NOTE: edrops is not required
check_prev_out(prev_out, c("config", "counts_list", "annot"))

technology <- input$input$type
edrops_list <- prev_out$edrops
counts_list <- prev_out$counts_list
samples <- names(counts_list)
Expand All @@ -30,7 +31,8 @@ score_doublets <- function(input, pipeline_config, prev_out) {
sample_counts <- sample_counts[, keep]
}

scores[[sample]] <- get_doublet_scores(sample_counts)
# TODO: Pass also parse_kit when available from the UI
scores[[sample]] <- get_doublet_scores(sample_counts, technology = technology)

}

Expand All @@ -48,12 +50,24 @@ score_doublets <- function(input, pipeline_config, prev_out) {
#' Compute doublets scores per sample.
#'
#' @param sample_counts Sparse matrix with the counts for one sample.
#' @param technology Technology used to generate the data (e.g. 10x, Parse).
#' @param parse_kit Kit used to generate the data (specific to Parse data).
#'
#' @return data.frame with doublet scores and assigned classes
#'
compute_sample_doublet_scores <- function(sample_counts) {
compute_sample_doublet_scores <- function(sample_counts, technology, parse_kit = "WT") {
set.seed(RANDOM_SEED)
sce <- scDblFinder::scDblFinder(sample_counts)

dbr <- NULL
if (technology == "parse") {
if (parse_kit %in% names(DOUBLET_RATE_PARSE)) {
dbr <- DOUBLET_RATE_PARSE[[parse_kit]]
} else {
stop("Invalid parse kit value: ", parse_kit)
}
}
sce <- scDblFinder::scDblFinder(sample_counts, dbr = dbr)

doublet_res <- data.frame(
row.names = colnames(sce),
barcodes = colnames(sce),
Expand All @@ -65,7 +79,7 @@ compute_sample_doublet_scores <- function(sample_counts) {
}


get_doublet_scores <- function(sample_counts, max_attempts = 5) {
get_doublet_scores <- function(sample_counts, max_attempts = 5, technology = "10x") {
# also filter low UMI as per scDblFinder:::.checkSCE()
ntot <- Matrix::colSums(sample_counts)

Expand All @@ -77,7 +91,8 @@ get_doublet_scores <- function(sample_counts, max_attempts = 5) {
# make the threshold stricter in every attempt
empty_cells_mask <- ntot > (200 * attempt)
try({
scores <- compute_sample_doublet_scores(sample_counts[, empty_cells_mask])
# TODO: Pass also parse_kit when available from the UI
scores <- compute_sample_doublet_scores(sample_counts[, empty_cells_mask], technology)
retry <- "not null"
})
attempt <- attempt + 1
Expand Down
2 changes: 1 addition & 1 deletion pipeline-runner/R/qc-5-filter_doublets.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ filter_doublets <- function(scdata_list, config, sample_id, cells_id, task_name

if ("recomputeDoubletScore" %in% names(config)) {
if (config$recomputeDoubletScore) {
scores <- get_doublet_scores(sample_data@assays$RNA@counts)
scores <- get_doublet_scores(sample_data@assays$RNA@counts, technology = config$sampleTechnology)
sample_data <- add_dblscore(sample_data, scores)
# update doublet scores in original scdata
scdata_list[[sample_id]] <- add_dblscore(scdata_list[[sample_id]], scores)
Expand Down
Binary file modified pipeline-runner/R/sysdata.rda
Binary file not shown.
4 changes: 4 additions & 0 deletions pipeline-runner/data-raw/sysdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ pipeline_version <- 2

UNISAMPLE <- "unisample"

# Parse kits empirical doublet rates
DOUBLET_RATE_PARSE <- list(mini = 0.046, WT = 0.034, mega = 0.064)

# pipeline error constants
errors <- list(
ERROR_SEURAT_RDS = 'ERROR_SEURAT_RDS',
Expand Down Expand Up @@ -148,6 +151,7 @@ usethis::use_data(
RIBOSOMAL_REGEX,
pipeline_version,
UNISAMPLE,
DOUBLET_RATE_PARSE,
errors,
internal = TRUE,
overwrite = TRUE
Expand Down
6 changes: 5 additions & 1 deletion pipeline-runner/man/compute_sample_doublet_scores.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

72 changes: 67 additions & 5 deletions pipeline-runner/tests/testthat/test-gem2s-4-score_doublets.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,22 @@ mock_counts <- function(...) {
return(Matrix::Matrix(counts, sparse = T))
}

mock_input <- function(technology) {
input <- list()
input$input$type <- technology
return(input)
}

test_that("score_doublets returns expected columns", {
counts <- mock_counts()
input <- mock_input("10x")

prev_out <- list(
counts_list = list(sample1 = counts),
config = list(),
annot = list()
)
out <- score_doublets(NULL, NULL, prev_out)$output
out <- score_doublets(input, NULL, prev_out)$output

expect_setequal(
colnames(out$doublet_scores$sample1),
Expand All @@ -29,14 +35,16 @@ test_that("score_doublets filters cells to avoid warning of extremely low read c
counts <- mock_counts(ncells = c(200, 300, 400, 200, 500, 300))
counts <- round(counts / 2)

input <- mock_input("10x")

expect_warning(scDblFinder:::.checkSCE(counts), "extremely low read counts")

prev_out <- list(
counts_list = list(sample1 = counts),
config = list(),
annot = list()
)
expect_warning(score_doublets(NULL, NULL, prev_out), NA)
expect_warning(score_doublets(input, NULL, prev_out), NA)
})


Expand All @@ -47,15 +55,69 @@ test_that("score_doublets fails if prev_out is missing 'config', 'counts_list',
config = list(),
annot = list()
)
input <- mock_input("10x")

prev_out$config <- NULL
expect_error(score_doublets(NULL, NULL, prev_out), "config is missing")
expect_error(score_doublets(input, NULL, prev_out), "config is missing")

prev_out$config <- list()
prev_out$annot <- NULL
expect_error(score_doublets(NULL, NULL, prev_out), "annot is missing")
expect_error(score_doublets(input, NULL, prev_out), "annot is missing")

prev_out$annot <- data.frame()
prev_out$counts_list <- NULL
expect_error(score_doublets(NULL, NULL, prev_out), "counts_list is missing")
expect_error(score_doublets(input, NULL, prev_out), "counts_list is missing")
})


test_that("compute_sample_doublet_scores handles technology 'parse' correctly", {
counts <- mock_counts()
input <- mock_input("parse")

prev_out <- list(
counts_list = list(sample1 = counts),
config = list(),
annot = list()
)
out <-suppressWarnings(score_doublets(input, NULL, prev_out)$output)

expect_setequal(
colnames(out$doublet_scores$sample1),
c("barcodes", "doublet_class", "doublet_scores")
)
})


test_that("compute_sample_doublet_scores uses correct dbr for different Parse kits", {
counts <- mock_counts()

dbr_mini <- DOUBLET_RATE_PARSE[["mini"]]
set.seed(RANDOM_SEED)
expected_sce_mini <- suppressWarnings(scDblFinder::scDblFinder(counts, dbr = dbr_mini))

dbr_wt <- DOUBLET_RATE_PARSE[["WT"]]
set.seed(RANDOM_SEED)
expected_sce_wt <- suppressWarnings(scDblFinder::scDblFinder(counts, dbr = dbr_wt))

dbr_mega <- DOUBLET_RATE_PARSE[["mega"]]
set.seed(RANDOM_SEED)
expected_sce_mega <- suppressWarnings(scDblFinder::scDblFinder(counts, dbr = dbr_mega))

observed_sce_mini <- suppressWarnings(compute_sample_doublet_scores(counts, technology = "parse", parse_kit = "mini"))
observed_sce_wt <- suppressWarnings(compute_sample_doublet_scores(counts, technology = "parse", parse_kit = "WT"))
observed_sce_mega <- suppressWarnings(compute_sample_doublet_scores(counts, technology = "parse", parse_kit = "mega"))

expect_identical(expected_sce_mini$scDblFinder.score, observed_sce_mini$doublet_scores)
expect_identical(expected_sce_wt$scDblFinder.score, observed_sce_wt$doublet_scores)
expect_identical(expected_sce_mega$scDblFinder.score, observed_sce_mega$doublet_scores)
})


test_that("compute_sample_doublet_scores stops with an error for invalid parse_kit values", {
counts <- mock_counts()

expect_error(
compute_sample_doublet_scores(counts, technology = "parse", parse_kit = "invalid_kit"),
"Invalid parse kit value"
)
})
5 changes: 3 additions & 2 deletions pipeline-runner/tests/testthat/test-qc-5-filter_doublets.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ mock_ids <- function() {
}


mock_config <- function(thr = 0.1, auto = FALSE, enabled = TRUE, recomputeDoubletScore = FALSE) {
mock_config <- function(thr = 0.1, auto = FALSE, enabled = TRUE, recomputeDoubletScore = FALSE, sampleTechnology = "10x") {
config <- list(
auto = auto,
enabled = enabled,
filterSettings = list(
probabilityThreshold = thr
),
recomputeDoubletScore = recomputeDoubletScore
recomputeDoubletScore = recomputeDoubletScore,
sampleTechnology = sampleTechnology
)
return(config)
}
Expand Down

0 comments on commit dddacd0

Please sign in to comment.