Skip to content

Commit

Permalink
Merge pull request #354 from biomage-org/splines-default-for-parse-data
Browse files Browse the repository at this point in the history
Improve support for parse data
  • Loading branch information
ogibson authored Jan 26, 2024
2 parents f513491 + 94343af commit cc2d233
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 28 deletions.
31 changes: 20 additions & 11 deletions pipeline-runner/R/gem2s-6-construct_qc_config.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,8 @@
#'
#' @return list of QC configuration parameters
#'
construct_qc_config <- function(scdata_list, unfiltered_samples) {
construct_qc_config <- function(scdata_list, unfiltered_samples, technology) {
samples <- names(scdata_list)

config_classifier <-
add_custom_config_per_sample(
customize_classifier_config,
Expand Down Expand Up @@ -42,6 +41,7 @@ construct_qc_config <- function(scdata_list, unfiltered_samples) {
customize_genes_vs_umis_config,
processing_config_template[["genes_vs_umis"]],
scdata_list,
technology = technology
)


Expand Down Expand Up @@ -77,7 +77,8 @@ customize_classifier_config <-
function(scdata,
config,
sample_name,
unfiltered_samples) {
unfiltered_samples,
technology) {
config$enabled <- sample_name %in% unfiltered_samples
config$prefiltered <- !(sample_name %in% unfiltered_samples)

Expand All @@ -89,7 +90,8 @@ customize_cellsize_config <-
function(scdata,
config,
sample_name,
unfiltered_samples) {
unfiltered_samples,
technology) {
minCellSize <- generate_default_values_cellSizeDistribution(scdata, config)
config$filterSettings$minCellSize <- minCellSize
return(config)
Expand All @@ -100,7 +102,8 @@ customize_mitochondrial_config <-
function(scdata,
config,
sample_name,
unfiltered_samples) {
unfiltered_samples,
technology) {
default_max_fraction <- generate_default_values_mitochondrialContent(scdata, config)
config$filterSettings$methodSettings$absoluteThreshold$maxFraction <-
default_max_fraction
Expand All @@ -113,23 +116,27 @@ customize_doublet_config <-
function(scdata,
config,
sample_name,
unfiltered_samples) {
unfiltered_samples,
technology) {
probabilityThreshold <- generate_default_values_doubletScores(scdata)
config$filterSettings$probabilityThreshold <- probabilityThreshold

return(config)
}


customize_genes_vs_umis_config <-
function(scdata,
config,
sample_name,
unfiltered_samples) {
unfiltered_samples,
technology) {
# Sensible values are based on the function "gene.vs.molecule.cell.filter"
# from the pagoda2 package
p.level <- min(0.001, 1 / ncol(scdata))
regression_type <- config$filterSettings$regressionType

regression_type <- ifelse( technology == "parse" , "spline" , config$filterSettings$regressionType)

config$filterSettings$regressionType <- regression_type
config$filterSettings$regressionTypeSettings[[regression_type]]$p.level <- p.level

return(config)
Expand Down Expand Up @@ -175,7 +182,8 @@ add_custom_config_per_sample <-
function(customize_template_config,
config_template,
scdata_list,
unfiltered_samples = NA) {
unfiltered_samples = NA,
technology = NA) {
config <- list()
for (sample_name in names(scdata_list)) {
# subset the Seurat object list to a single sample
Expand All @@ -187,7 +195,8 @@ add_custom_config_per_sample <-
sample_scdata,
config_template,
sample_name,
unfiltered_samples
unfiltered_samples,
technology
)

# update sample config thresholds
Expand Down
3 changes: 2 additions & 1 deletion pipeline-runner/R/gem2s-6-prepare_experiment.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ prepare_experiment <- function(input, pipeline_config, prev_out) {
# construct default QC config and update prev out
message("Constructing default QC configuration...")
unfiltered_samples <- names(prev_out$edrops[!is.null(prev_out$edrops)])
prev_out$default_qc_config <- construct_qc_config(scdata_list, unfiltered_samples)

prev_out$default_qc_config <- construct_qc_config(scdata_list, unfiltered_samples, input$input$type)

# If we received a qc_config (subset pipeline case) then
# we want to set that one as the custom config
Expand Down
2 changes: 1 addition & 1 deletion pipeline-runner/R/seurat-3-upload_seurat_to_aws.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ upload_seurat_to_aws <- function(input, pipeline_config, prev_out) {

# replicate qc config for simplicity
# could also create a 'seurat_config' column in experiment table and change the ui/api around more
qc_config <- construct_qc_config(list(one = scdata), unfiltered_samples = 'one')
qc_config <- construct_qc_config(list(one = scdata), unfiltered_samples = 'one', technology="seurat")
qc_config$configureEmbedding$embeddingSettings$useSaved <- TRUE
qc_config$configureEmbedding$embeddingSettings$method <- SeuratObject::DefaultDimReduc(scdata)

Expand Down
28 changes: 24 additions & 4 deletions pipeline-runner/tests/testthat/test-gem2s-6-construct_qc_config.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ mock_scdata_list <- function() {
test_that("cellsize filter is disabled by default and classifier is pre-filtered", {
scdata_list <- mock_scdata_list()
unfiltered_samples <- c("123abc")
qc_config <- construct_qc_config(scdata_list, unfiltered_samples = unfiltered_samples)
qc_config <- construct_qc_config(scdata_list, unfiltered_samples = unfiltered_samples, technology = "10X")

for (sample in names(scdata_list)) {
if (sample %in% unfiltered_samples) {
Expand All @@ -46,7 +46,7 @@ test_that("cellsize filter is disabled by default and classifier is pre-filtered
test_that("cellsize filter is disabled by default and classifier is not pre-filtered", {
scdata_list <- mock_scdata_list()
unfiltered_samples <- c()
qc_config <- construct_qc_config(scdata_list, unfiltered_samples = unfiltered_samples)
qc_config <- construct_qc_config(scdata_list, unfiltered_samples = unfiltered_samples, technology = "10X")

for (sample in names(scdata_list)) {
expect_false(qc_config$cellSizeDistribution[[sample]]$enabled)
Expand All @@ -63,7 +63,7 @@ test_that("cellsize filter is disabled by default and classifier is not pre-filt
test_that("customize_doublet_config sets threshold to 0 when there are no singlets", {
scdata_list <- mock_scdata_list()
unfiltered_samples <- c("123abc")
qc_config <- construct_qc_config(scdata_list, unfiltered_samples = unfiltered_samples)
qc_config <- construct_qc_config(scdata_list, unfiltered_samples = unfiltered_samples, technology = "10X")

for (sample in names(scdata_list)) {
scdata_list[[sample]]$doublet_class <- "doublet"
Expand All @@ -76,7 +76,7 @@ test_that("customize_doublet_config sets threshold to 0 when there are no single
test_that("classifier filter config is enabled for unfiltered samples and disabled for pre-filtered samples", {
scdata_list <- mock_scdata_list()
unfiltered_samples <- c("123abc")
qc_config <- construct_qc_config(scdata_list, unfiltered_samples = unfiltered_samples)
qc_config <- construct_qc_config(scdata_list, unfiltered_samples = unfiltered_samples, technology = "10X")

for (sample in names(scdata_list)) {
if (sample %in% unfiltered_samples) {
Expand All @@ -88,3 +88,23 @@ test_that("classifier filter config is enabled for unfiltered samples and disabl
}
}
})

# When the technology is "parse", the numGenesVsNumUmis filter config of
# every sample is expected to use "spline" as its regression type.
test_that("NumGenesVsUmis filter config has spline as default for Parse Datasets", {
  scdata_list <- mock_scdata_list()
  unfiltered_samples <- c("123abc")

  # pass technology by name, consistent with the other construct_qc_config calls
  qc_config <- construct_qc_config(
    scdata_list,
    unfiltered_samples = unfiltered_samples,
    technology = "parse"
  )

  for (sample in names(scdata_list)) {
    # expect_equal reports actual vs expected on failure,
    # unlike expect_true(x == "spline")
    expect_equal(
      qc_config$numGenesVsNumUmis[[sample]]$filterSettings$regressionType,
      "spline"
    )
  }
})

# For a non-Parse technology ("10X") the regression type is taken from the
# config template; this test expects that value to be "linear" for every sample.
test_that("NumGenesVsUmis filter config has linear as default for 10x datasets", {
  scdata_list <- mock_scdata_list()
  unfiltered_samples <- c("123abc")

  # pass technology by name, consistent with the other construct_qc_config calls
  qc_config <- construct_qc_config(
    scdata_list,
    unfiltered_samples = unfiltered_samples,
    technology = "10X"
  )

  for (sample in names(scdata_list)) {
    # expect_equal reports actual vs expected on failure,
    # unlike expect_true(x == "linear")
    expect_equal(
      qc_config$numGenesVsNumUmis[[sample]]$filterSettings$regressionType,
      "linear"
    )
  }
})
28 changes: 21 additions & 7 deletions pipeline-runner/tests/testthat/test-gem2s-6-prepare_experiment.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,20 +55,25 @@ mock_prev_out <- function(samples = "sample_a", counts = NULL, prev_out_config =
create_seurat(NULL, NULL, prev_out)$output
}

# Minimal pipeline input used by the prepare_experiment tests: a nested
# `input$type` of "10X" (the technology) and a fixed experiment id.
# Returns the list directly, so the value is visible and no unused local
# variable is created.
mock_input <- function() {
  list(
    input = list(type = "10X"),
    experimentId = "1234"
  )
}

test_that("prepare_experiment ensures gene_annotations are indexed correctly for each sample", {

samples <- c("a", "b", "c")
prev_out <- mock_prev_out(samples = samples)

input <- mock_input()

# remove some genes from each sample
prev_out$counts_list$a <- prev_out$counts_list$a[-c(1:9), ]
prev_out$counts_list$b <- prev_out$counts_list$b[-c(21:30), ]
prev_out$counts_list$c <- prev_out$counts_list$c[-c(5:25), ]

# re-create seurat object
prev_out <- create_seurat(NULL, NULL, prev_out)$output
scdata_list <- prepare_experiment(NULL, NULL, prev_out)$output$scdata
scdata_list <- prepare_experiment(input, NULL, prev_out)$output$scdata

# we expect that the input in gene_annotations is the same as the rownames of
# each sample seurat object
Expand Down Expand Up @@ -142,7 +147,7 @@ test_that("add_metadata_to_samples generated cell ids do not depend on sample or

test_that("prepare_experiment generates qc_config that matches snapshot", {
prev_out <- mock_prev_out()
input <- list(experimentId = "1234")
input <- mock_input()
task_out <- prepare_experiment(input, NULL, prev_out)$output

expect_snapshot(str(task_out$qc_config))
Expand All @@ -154,8 +159,9 @@ test_that("prepare_experiment creates a list of valid Seurat objects", {
samples <- c("a", "b", "c")
prev_out <- mock_prev_out(samples )
scdata_list <- prev_out$scdata_list
input <- mock_input()

task_out <- prepare_experiment(NULL, NULL, prev_out)$output
task_out <- prepare_experiment(input, NULL, prev_out)$output
scdata_list <- task_out$scdata_list


Expand All @@ -175,7 +181,9 @@ test_that("prepare_experiment properly populates the misc slot", {
prev_out <- mock_prev_out(samples )
scdata_list <- prev_out$scdata_list

task_out <- prepare_experiment(NULL, NULL, prev_out)$output
input <- mock_input()

task_out <- prepare_experiment(input, NULL, prev_out)$output
scdata_list <- task_out$scdata_list

for (sample in samples) {
Expand All @@ -197,7 +205,9 @@ test_that("prepare_experiment properly populates the metadata slot", {
prev_out <- mock_prev_out(samples)
scdata_list <- prev_out$scdata_list

task_out <- prepare_experiment(NULL, NULL, prev_out)$output
input <- mock_input()

task_out <- prepare_experiment(input, NULL, prev_out)$output
scdata_list <- task_out$scdata_list

for (sample in samples) {
Expand Down Expand Up @@ -230,7 +240,9 @@ test_that("Mitochondrial percentage is correct", {
prev_out <- mock_prev_out(samples)
scdata_list <- prev_out$scdata_list

task_out <- prepare_experiment(NULL, NULL, prev_out)$output
input <- mock_input()

task_out <- prepare_experiment(input, NULL, prev_out)$output
scdata_list <- task_out$scdata_list

for (sample in samples) {
Expand All @@ -254,9 +266,11 @@ test_that("Skips qc config creation if it is already created in prev_out", {
samples <- c("a", "b", "c")
prev_out <- mock_prev_out(samples = samples, prev_out_config = c('mocked'))

input <- mock_input()

# re-create seurat object
prev_out <- create_seurat(NULL, NULL, prev_out)$output
scdata_list <- prepare_experiment(NULL, NULL, prev_out)
scdata_list <- prepare_experiment(input, NULL, prev_out)

expect_true(prev_out$qc_config == c('mocked'))
})
7 changes: 5 additions & 2 deletions pipeline-runner/tests/testthat/test-gem2s-7-upload_to_aws.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ mock_scdata_list <- function(config) {
prev_out <- mock_prev_out(config)
scdata_list <- prev_out$scdata_list

task_out <- prepare_experiment(NULL, NULL, prev_out)$output
input <- mock_input()

task_out <- prepare_experiment(input, NULL, prev_out)$output
scdata_list <- task_out$scdata_list
}

Expand All @@ -26,7 +28,8 @@ mock_input <- function(metadata = NULL) {
sampleIds = list("123abc", "123def", "123ghi"),
metadata = metadata,
experimentId = "mock_experiment_id",
projectId = "mock_experiment_id"
projectId = "mock_experiment_id",
input = list( type= "10x")
)

return(input)
Expand Down
4 changes: 2 additions & 2 deletions pipeline-runner/tests/testthat/test-subset-1-subset_seurat.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ mock_scdata_list <- function(samples = rep("mock_sample_1_id", 80)) {
}

mock_input <- function(parent_experiment_id, cellset_keys, samples = rep("mock_sample_1_id", 80), sample_ids = c("mock_sample_1_id")) {
parentProcessingConfig <- construct_qc_config(mock_scdata_list(samples), unfiltered_samples = sample_ids)
parentProcessingConfig <- construct_qc_config(mock_scdata_list(samples), unfiltered_samples = sample_ids, technology = "10x")

list(
parentExperimentId = parent_experiment_id,
Expand Down Expand Up @@ -190,7 +190,7 @@ test_that("generate_subset_config works correctly", {
parent_sample_ids <- c("sample-id-1", "sample-id-2", "sample-id-3", "sample-id-4")
scdata_list <- mock_scdata_list(samples = rep(parent_sample_ids, 20))

parent_processing_config <- construct_qc_config(scdata_list, unfiltered_samples = parent_sample_ids)
parent_processing_config <- construct_qc_config(scdata_list, unfiltered_samples = parent_sample_ids, technology = "10x")

# Make some of the configs unique to each sample
# so we can check that the translation preserves the configs
Expand Down

0 comments on commit cc2d233

Please sign in to comment.