Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adjust doublet score calculation according to sequencing technology #355

Merged
merged 17 commits into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pr_validate.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ jobs:
- id: reach-staging
if: steps.extract-staging.outputs.url != 'N/A'
name: Attempt to reach staging environments
uses: nick-invision/retry@v2
uses: nick-fields/retry@v2
with:
timeout_seconds: 30
max_attempts: 5
Expand Down
25 changes: 20 additions & 5 deletions pipeline-runner/R/gem2s-4-score_doublets.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ score_doublets <- function(input, pipeline_config, prev_out) {
# NOTE: edrops is not required
check_prev_out(prev_out, c("config", "counts_list", "annot"))

technology <- input$input$type
edrops_list <- prev_out$edrops
counts_list <- prev_out$counts_list
samples <- names(counts_list)
Expand All @@ -30,7 +31,8 @@ score_doublets <- function(input, pipeline_config, prev_out) {
sample_counts <- sample_counts[, keep]
}

scores[[sample]] <- get_doublet_scores(sample_counts)
# TODO: Pass also parse_kit when available from the UI
scores[[sample]] <- get_doublet_scores(sample_counts, technology = technology)

}

Expand All @@ -48,12 +50,24 @@ score_doublets <- function(input, pipeline_config, prev_out) {
#' Compute doublets scores per sample.
#'
#' @param sample_counts Sparse matrix with the counts for one sample.
#' @param technology Technology used to generate the data (e.g. 10x, Parse).
#' @param parse_kit Kit used to generate the data (specific to Parse data).
#'
#' @return data.frame with doublet scores and assigned classes
#'
compute_sample_doublet_scores <- function(sample_counts) {
compute_sample_doublet_scores <- function(sample_counts, technology, parse_kit = "WT") {
set.seed(RANDOM_SEED)
sce <- scDblFinder::scDblFinder(sample_counts)

dbr <- NULL
if (technology == "parse") {
if (parse_kit %in% names(DOUBLET_RATE_PARSE)) {
dbr <- DOUBLET_RATE_PARSE[[parse_kit]]
} else {
stop("Invalid parse kit value: ", parse_kit)
}
}
sce <- scDblFinder::scDblFinder(sample_counts, dbr = dbr)

doublet_res <- data.frame(
row.names = colnames(sce),
barcodes = colnames(sce),
Expand All @@ -65,7 +79,7 @@ compute_sample_doublet_scores <- function(sample_counts) {
}


get_doublet_scores <- function(sample_counts, max_attempts = 5) {
get_doublet_scores <- function(sample_counts, max_attempts = 5, technology = "10x") {
# also filter low UMI as per scDblFinder:::.checkSCE()
ntot <- Matrix::colSums(sample_counts)

Expand All @@ -77,7 +91,8 @@ get_doublet_scores <- function(sample_counts, max_attempts = 5) {
# make the threshold stricter in every attempt
empty_cells_mask <- ntot > (200 * attempt)
try({
scores <- compute_sample_doublet_scores(sample_counts[, empty_cells_mask])
# TODO: Pass also parse_kit when available from the UI
scores <- compute_sample_doublet_scores(sample_counts[, empty_cells_mask], technology)
retry <- "not null"
})
attempt <- attempt + 1
Expand Down
2 changes: 1 addition & 1 deletion pipeline-runner/R/qc-5-filter_doublets.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ filter_doublets <- function(scdata_list, config, sample_id, cells_id, task_name

if ("recomputeDoubletScore" %in% names(config)) {
if (config$recomputeDoubletScore) {
scores <- get_doublet_scores(sample_data@assays$RNA@counts)
scores <- get_doublet_scores(sample_data@assays$RNA@counts, technology = config$sampleTechnology)
sample_data <- add_dblscore(sample_data, scores)
# update doublet scores in original scdata
scdata_list[[sample_id]] <- add_dblscore(scdata_list[[sample_id]], scores)
Expand Down
Binary file modified pipeline-runner/R/sysdata.rda
Binary file not shown.
4 changes: 4 additions & 0 deletions pipeline-runner/data-raw/sysdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ pipeline_version <- 2

UNISAMPLE <- "unisample"

# Parse kits empirical doublet rates
DOUBLET_RATE_PARSE <- list(mini = 0.046, WT = 0.034, mega = 0.064)

# pipeline error constants
errors <- list(
ERROR_SEURAT_RDS = 'ERROR_SEURAT_RDS',
Expand Down Expand Up @@ -148,6 +151,7 @@ usethis::use_data(
RIBOSOMAL_REGEX,
pipeline_version,
UNISAMPLE,
DOUBLET_RATE_PARSE,
errors,
internal = TRUE,
overwrite = TRUE
Expand Down
6 changes: 5 additions & 1 deletion pipeline-runner/man/compute_sample_doublet_scores.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

72 changes: 67 additions & 5 deletions pipeline-runner/tests/testthat/test-gem2s-4-score_doublets.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,22 @@ mock_counts <- function(...) {
return(Matrix::Matrix(counts, sparse = T))
}

mock_input <- function(technology) {
input <- list()
input$input$type <- technology
return(input)
}

test_that("score_doublets returns expected columns", {
counts <- mock_counts()
input <- mock_input("10x")

prev_out <- list(
counts_list = list(sample1 = counts),
config = list(),
annot = list()
)
out <- score_doublets(NULL, NULL, prev_out)$output
out <- score_doublets(input, NULL, prev_out)$output

expect_setequal(
colnames(out$doublet_scores$sample1),
Expand All @@ -29,14 +35,16 @@ test_that("score_doublets filters cells to avoid warning of extremely low read c
counts <- mock_counts(ncells = c(200, 300, 400, 200, 500, 300))
counts <- round(counts / 2)

input <- mock_input("10x")

expect_warning(scDblFinder:::.checkSCE(counts), "extremely low read counts")

prev_out <- list(
counts_list = list(sample1 = counts),
config = list(),
annot = list()
)
expect_warning(score_doublets(NULL, NULL, prev_out), NA)
expect_warning(score_doublets(input, NULL, prev_out), NA)
})


Expand All @@ -47,15 +55,69 @@ test_that("score_doublets fails if prev_out is missing 'config', 'counts_list',
config = list(),
annot = list()
)
input <- mock_input("10x")

prev_out$config <- NULL
expect_error(score_doublets(NULL, NULL, prev_out), "config is missing")
expect_error(score_doublets(input, NULL, prev_out), "config is missing")

prev_out$config <- list()
prev_out$annot <- NULL
expect_error(score_doublets(NULL, NULL, prev_out), "annot is missing")
expect_error(score_doublets(input, NULL, prev_out), "annot is missing")

prev_out$annot <- data.frame()
prev_out$counts_list <- NULL
expect_error(score_doublets(NULL, NULL, prev_out), "counts_list is missing")
expect_error(score_doublets(input, NULL, prev_out), "counts_list is missing")
})


test_that("compute_sample_doublet_scores handles technology 'parse' correctly", {
counts <- mock_counts()
input <- mock_input("parse")

prev_out <- list(
counts_list = list(sample1 = counts),
config = list(),
annot = list()
)
out <-suppressWarnings(score_doublets(input, NULL, prev_out)$output)

expect_setequal(
colnames(out$doublet_scores$sample1),
c("barcodes", "doublet_class", "doublet_scores")
)
})


test_that("compute_sample_doublet_scores uses correct dbr for different Parse kits", {
counts <- mock_counts()

dbr_mini <- DOUBLET_RATE_PARSE[["mini"]]
set.seed(RANDOM_SEED)
expected_sce_mini <- suppressWarnings(scDblFinder::scDblFinder(counts, dbr = dbr_mini))

dbr_wt <- DOUBLET_RATE_PARSE[["WT"]]
set.seed(RANDOM_SEED)
expected_sce_wt <- suppressWarnings(scDblFinder::scDblFinder(counts, dbr = dbr_wt))

dbr_mega <- DOUBLET_RATE_PARSE[["mega"]]
set.seed(RANDOM_SEED)
expected_sce_mega <- suppressWarnings(scDblFinder::scDblFinder(counts, dbr = dbr_mega))

observed_sce_mini <- suppressWarnings(compute_sample_doublet_scores(counts, technology = "parse", parse_kit = "mini"))
observed_sce_wt <- suppressWarnings(compute_sample_doublet_scores(counts, technology = "parse", parse_kit = "WT"))
observed_sce_mega <- suppressWarnings(compute_sample_doublet_scores(counts, technology = "parse", parse_kit = "mega"))

expect_identical(expected_sce_mini$scDblFinder.score, observed_sce_mini$doublet_scores)
expect_identical(expected_sce_wt$scDblFinder.score, observed_sce_wt$doublet_scores)
expect_identical(expected_sce_mega$scDblFinder.score, observed_sce_mega$doublet_scores)
})


test_that("compute_sample_doublet_scores stops with an error for invalid parse_kit values", {
counts <- mock_counts()

expect_error(
compute_sample_doublet_scores(counts, technology = "parse", parse_kit = "invalid_kit"),
"Invalid parse kit value"
)
})
5 changes: 3 additions & 2 deletions pipeline-runner/tests/testthat/test-qc-5-filter_doublets.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ mock_ids <- function() {
}


mock_config <- function(thr = 0.1, auto = FALSE, enabled = TRUE, recomputeDoubletScore = FALSE) {
mock_config <- function(thr = 0.1, auto = FALSE, enabled = TRUE, recomputeDoubletScore = FALSE, sampleTechnology = "10x") {
config <- list(
auto = auto,
enabled = enabled,
filterSettings = list(
probabilityThreshold = thr
),
recomputeDoubletScore = recomputeDoubletScore
recomputeDoubletScore = recomputeDoubletScore,
sampleTechnology = sampleTechnology
)
return(config)
}
Expand Down
Loading