Skip to content

Commit

Permalink
Merge pull request #302 from hms-dbmi-cellenics/biomage-changes-4
Browse files Browse the repository at this point in the history
Changes 4
  • Loading branch information
alexvpickering authored May 30, 2023
2 parents e8233df + ec21a0a commit 6b1d8e8
Show file tree
Hide file tree
Showing 40 changed files with 3,874 additions and 251 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@ jobs:
EOF
)
UNCHECKED_BODY="${UNCHECKED_BODY//'%'/'%25'}"
UNCHECKED_BODY="${UNCHECKED_BODY//$'\n'/'%0A'}"
UNCHECKED_BODY="${UNCHECKED_BODY//$'\r'/'%0D'}"
echo "Unchecked PR body"
echo $UNCHECKED_BODY
echo "::set-output name=body::$UNCHECKED_BODY"
# This sets multiline strings into the output variable
# See https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-of-a-multiline-string
echo "body<<EOF" >> "$GITHUB_OUTPUT"
echo "$UNCHECKED_BODY" >> "$GITHUB_OUTPUT"
echo "EOF" >> "$GITHUB_OUTPUT"
- id: uncheck-integration-checkbox
Expand Down Expand Up @@ -288,7 +288,7 @@ jobs:
export CHART_REF="$GITHUB_SHA"
export KUBERNETES_ENV="production"
export IMAGE_NAME=$IMAGE_TAG-pipeline-runner
export REPLICA_COUNT="3"
export REPLICA_COUNT="1"
export VERSION_NUMBER=${REF_ID/refs-tags-/}
export IMAGE_PATTERN="^refs-tags-(?P<version>[0-9]+\.[0-9]+\.[0-9]+)-pipeline-runner$"
export IMAGE_EXTRACT='$version'
Expand Down
26 changes: 13 additions & 13 deletions .github/workflows/pr_validate.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,19 +68,19 @@ jobs:
echo "Is staging N/A?"
echo $IS_STAGING_NA
echo "::set-output name=is-staging-na::$IS_STAGING_NA"
echo "is-staging-na=$IS_STAGING_NA" >> $GITHUB_OUTPUT
echo "Full URL:"
echo $URL
echo "::set-output name=url::$URL"
echo "url=$URL" >> $GITHUB_OUTPUT
SANDBOX=$(pcregrep -o2 -M "$REGEX" <<\EOF
${{ github.event.pull_request.body }}
EOF
)
echo "Extracted sandbox:"
echo $SANDBOX
echo "::set-output name=sandbox::$SANDBOX"
echo "sandbox=$SANDBOX" >> $GITHUB_OUTPUT
- id: reach-staging
if: steps.extract-staging.outputs.url != 'N/A'
Expand Down Expand Up @@ -130,8 +130,8 @@ jobs:
fi
echo $SHOULD_RUN_E2E
echo "::set-output name=should-e2e-cancel::$SHOULD_CANCEL_E2E"
echo "::set-output name=should-e2e-run::$SHOULD_RUN_E2E"
echo "should-e2e-cancel=$SHOULD_CANCEL_E2E" >> $GITHUB_OUTPUT
echo "should-e2e-run=$SHOULD_RUN_E2E" >> $GITHUB_OUTPUT
- id: cancel
name: Cancel if staging is not N/A and box is not checked
Expand All @@ -151,12 +151,12 @@ jobs:
run: |-
echo "SHA of latest GitHub branch commit:"
echo ${{ github.event.pull_request.head.sha }}
echo "::set-output name=github_sha::${{ github.event.pull_request.head.sha }}"
echo "github_sha=${{ github.event.pull_request.head.sha }}" >> $GITHUB_OUTPUT
REPO_NAME=$(echo $GITHUB_REPOSITORY | awk -F '/' '{print $2}')
echo "Repo name:"
echo $REPO_NAME
echo "::set-output name=repo_name::$REPO_NAME"
echo "repo_name=$REPO_NAME" >> $GITHUB_OUTPUT
- id: extract-integration-test-ref
if: steps.check-e2e-run.outputs.should-e2e-run == 'true'
Expand All @@ -174,7 +174,7 @@ jobs:
)
echo "Ref given is $INTEGRATION_TEST_REF, setting it as is."
echo "::set-output name=ref::$INTEGRATION_TEST_REF"
echo "ref=$INTEGRATION_TEST_REF" >> $GITHUB_OUTPUT
- id: run-integration-test
if: steps.check-e2e-run.outputs.should-e2e-run == 'true'
Expand Down Expand Up @@ -203,14 +203,14 @@ jobs:
EOF
)
UNCHECKED_BODY="${UNCHECKED_BODY//'%'/'%25'}"
UNCHECKED_BODY="${UNCHECKED_BODY//$'\n'/'%0A'}"
UNCHECKED_BODY="${UNCHECKED_BODY//$'\r'/'%0D'}"
echo "Unchecked PR body"
echo $UNCHECKED_BODY
echo "::set-output name=body::$UNCHECKED_BODY"
# This sets multiline strings into the output variable
# See https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-of-a-multiline-string
echo "body<<EOF" >> "$GITHUB_OUTPUT"
echo "$UNCHECKED_BODY" >> "$GITHUB_OUTPUT"
echo "EOF" >> "$GITHUB_OUTPUT"
- id: uncheck-integration-checkbox
if: failure()
Expand Down
2 changes: 1 addition & 1 deletion pipeline-runner/.lintr
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
linters: with_defaults(
line_length_linter = line_length_linter(80),
line_length_linter = line_length_linter(100),
object_usage_linter = NULL
)
2 changes: 2 additions & 0 deletions pipeline-runner/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ export(add_new_sample_ids)
export(add_subset_metadata)
export(build_cc_gene_list)
export(build_metadata_cellsets)
export(build_mitochondrial_gene_list)
export(build_ribosomal_gene_list)
export(build_sample_cellsets)
export(build_scratchpad_cellsets)
export(cbind_cellset_type)
Expand Down
38 changes: 23 additions & 15 deletions pipeline-runner/R/gem2s-4-score_doublets.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,22 +30,8 @@ score_doublets <- function(input, pipeline_config, prev_out) {
sample_counts <- sample_counts[, keep]
}

# also filter low UMI as per scDblFinder:::.checkSCE()
ntot <- Matrix::colSums(sample_counts)
scores[[sample]] <- get_doublet_scores(sample_counts)

# retry increasing the minimum counts in case of low sparsity in the sample
r <- NULL
attempt <- 1
while (is.null(r) && attempt <= 5) {
message("\ntrying to score doublets, attempt: ", attempt)
# make the threshold stricter in every attempt
empty_cells_mask <- ntot > (200 * attempt)
try({
scores[[sample]] <- compute_sample_doublet_scores(sample_counts[, empty_cells_mask])
r <- "not null"
})
attempt <- attempt + 1
}
}

prev_out$doublet_scores <- scores
Expand Down Expand Up @@ -77,3 +63,25 @@ compute_sample_doublet_scores <- function(sample_counts) {

return(doublet_res)
}


#' Score doublets for one sample, retrying with stricter count thresholds
#'
#' Calls \code{compute_sample_doublet_scores} on the columns of
#' \code{sample_counts} whose total counts exceed \code{200 * attempt}. If the
#' call errors, the threshold is raised and the call is retried, up to
#' \code{max_attempts} times.
#'
#' @param sample_counts counts matrix; columns are filtered by their total
#'   counts (column sums)
#' @param max_attempts maximum number of attempts before giving up
#'
#' @return the value returned by \code{compute_sample_doublet_scores} on the
#'   first attempt that does not error. Stops with an informative error, which
#'   includes the last failure message, if no attempt succeeds.
#'
get_doublet_scores <- function(sample_counts, max_attempts = 5) {
  # also filter low UMI as per scDblFinder:::.checkSCE()
  ntot <- Matrix::colSums(sample_counts)

  last_error <- NULL

  # retry increasing the minimum counts in case of low sparsity in the sample
  for (attempt in seq_len(max_attempts)) {
    message("\nTrying to score doublets, attempt: ", attempt)

    # make the threshold stricter in every attempt
    keep_cells <- ntot > (200 * attempt)

    # drop = FALSE keeps a matrix even when a single column passes the filter
    result <- tryCatch(
      compute_sample_doublet_scores(sample_counts[, keep_cells, drop = FALSE]),
      error = function(e) e
    )

    if (!inherits(result, "error")) {
      return(result)
    }

    # keep the failure so it can be reported if every attempt fails
    last_error <- result
    message("Doublet scoring failed: ", conditionMessage(result))
  }

  # previously this path ended in an opaque "object 'scores' not found" error
  reason <- if (is.null(last_error)) {
    "no attempts were made"
  } else {
    conditionMessage(last_error)
  }
  stop(
    "Could not score doublets after ", max_attempts, " attempt(s): ", reason,
    call. = FALSE
  )
}
5 changes: 2 additions & 3 deletions pipeline-runner/R/gem2s-5-create_seurat.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,9 @@ construct_metadata <- function(counts, sample, config) {

# add mitochondrial percent to SeuratObject
add_mito <- function(scdata, annot) {
mt_regex <- "^mt[-:]"
if (any(grepl(mt_regex, annot$name, ignore.case = TRUE))) {
if (any(grepl(MITOCHONDRIAL_REGEX, annot$name, ignore.case = TRUE))) {
message("Adding MT information...")
mt.features <- annot$input[grep(mt_regex, annot$name, ignore.case = TRUE)]
mt.features <- annot$input[grep(MITOCHONDRIAL_REGEX, annot$name, ignore.case = TRUE)]
mt.features <- mt.features[mt.features %in% rownames(scdata)]
if (length(mt.features)) {
scdata <- Seurat::PercentageFeatureSet(scdata, features = mt.features, col.name = "percent.mt")
Expand Down
24 changes: 2 additions & 22 deletions pipeline-runner/R/gem2s-6-construct_qc_config.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,17 @@
#'
#' @param scdata_list list of Seurat objects
#' @param unfiltered_samples character vector of unfiltered sample ids
#' @param disable_qc_filters bool indicating if filters should be disabled.
#'
#' @return list of QC configuration parameters
#'
construct_qc_config <-
function(scdata_list,
disable_qc_filters,
unfiltered_samples) {
construct_qc_config <- function(scdata_list, unfiltered_samples) {
samples <- names(scdata_list)

config_classifier <-
add_custom_config_per_sample(
customize_classifier_config,
processing_config_template[["classifier"]],
scdata_list,
disable_qc_filters,
unfiltered_samples
)

Expand All @@ -33,23 +28,20 @@ construct_qc_config <-
customize_cellsize_config,
processing_config_template[["cell_size"]],
scdata_list,
disable_qc_filters
)

config_mitochondrial <-
add_custom_config_per_sample(
customize_mitochondrial_config,
processing_config_template[["mitochondrial"]],
scdata_list,
disable_qc_filters
)

config_genes_vs_umis <-
add_custom_config_per_sample(
customize_genes_vs_umis_config,
processing_config_template[["genes_vs_umis"]],
scdata_list,
disable_qc_filters
)


Expand All @@ -58,7 +50,6 @@ construct_qc_config <-
customize_doublet_config,
processing_config_template[["doublet"]],
scdata_list,
disable_qc_filters
)

config_data_integration <-
Expand Down Expand Up @@ -86,9 +77,8 @@ customize_classifier_config <-
function(scdata,
config,
sample_name,
disable_qc_filters,
unfiltered_samples) {
config$enabled <- sample_name %in% unfiltered_samples && !disable_qc_filters
config$enabled <- sample_name %in% unfiltered_samples
config$prefiltered <- !(sample_name %in% unfiltered_samples)

return(config)
Expand All @@ -99,7 +89,6 @@ customize_cellsize_config <-
function(scdata,
config,
sample_name,
disable_qc_filters,
unfiltered_samples) {
minCellSize <- generate_default_values_cellSizeDistribution(scdata, config)
config$filterSettings$minCellSize <- minCellSize
Expand All @@ -111,7 +100,6 @@ customize_mitochondrial_config <-
function(scdata,
config,
sample_name,
disable_qc_filters,
unfiltered_samples) {
default_max_fraction <- generate_default_values_mitochondrialContent(scdata, config)
config$filterSettings$methodSettings$absoluteThreshold$maxFraction <-
Expand All @@ -125,7 +113,6 @@ customize_doublet_config <-
function(scdata,
config,
sample_name,
disable_qc_filters,
unfiltered_samples) {
probabilityThreshold <- generate_default_values_doubletScores(scdata)
config$filterSettings$probabilityThreshold <- probabilityThreshold
Expand All @@ -138,7 +125,6 @@ customize_genes_vs_umis_config <-
function(scdata,
config,
sample_name,
disable_qc_filters,
unfiltered_samples) {
# Sensible values are based on the function "gene.vs.molecule.cell.filter"
# from the pagoda2 package
Expand Down Expand Up @@ -180,7 +166,6 @@ get_embedding_config <- function(scdata_list, config) {
#' @param customize_template_config function - step customization function
#' @param config_template list - template of step configuration parameters
#' @param scdata_list list - with Seurat objects
#' @param disable_qc_filters logical
#' @param unfiltered_samples character vector
#'
#' @return list of customized QC parameters for each sample
Expand All @@ -190,7 +175,6 @@ add_custom_config_per_sample <-
function(customize_template_config,
config_template,
scdata_list,
disable_qc_filters = FALSE,
unfiltered_samples = NA) {
config <- list()
for (sample_name in names(scdata_list)) {
Expand All @@ -203,13 +187,9 @@ add_custom_config_per_sample <-
sample_scdata,
config_template,
sample_name,
disable_qc_filters,
unfiltered_samples
)

sample_config$enabled <-
sample_config$enabled && !disable_qc_filters

# update sample config thresholds
config[[sample_name]] <- sample_config
}
Expand Down
17 changes: 14 additions & 3 deletions pipeline-runner/R/gem2s-6-prepare_experiment.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
#' @param prev_out 'output' slot from call to \code{create_seurat}
#'
#' @return prev_out \code{prev_out} with added slots 'scdata' containing merged
#' \code{SeuratObject} and 'qc_config' containing default config for QC steps.
#' \code{SeuratObject} and 'qc_config' containing default config for QC steps
#' if it doesn't already exist.
#'
#' @export
#'
Expand All @@ -23,7 +24,7 @@ prepare_experiment <- function(input, pipeline_config, prev_out) {

message("Total cells:", sum(sapply(scdata_list, ncol)))

# metadata is added to subset experiment at the subset step 1
# metadata is added to subset experiment at the subset step 1
if (!disable_qc_filters) {
scdata_list <-
add_metadata_to_samples(scdata_list, prev_out$annot, input$experimentId)
Expand All @@ -33,7 +34,17 @@ prepare_experiment <- function(input, pipeline_config, prev_out) {
# construct default QC config and update prev out
message("Constructing default QC configuration...")
unfiltered_samples <- names(prev_out$edrops[!is.null(prev_out$edrops)])
prev_out$qc_config <- construct_qc_config(scdata_list, disable_qc_filters, unfiltered_samples)
prev_out$default_qc_config <- construct_qc_config(scdata_list, unfiltered_samples)

# If we received a qc_config (subset pipeline case) then
# we want to set that one as the custom config
if ("qc_config" %in% names(prev_out)) {
message("Custom QC config received in prev_out, setting it as custom")
prev_out$qc_config <- prev_out$qc_config
} else {
message("No custom QC config, setting default instead")
prev_out$qc_config <- prev_out$default_qc_config
}

res <- list(
data = list(),
Expand Down
15 changes: 12 additions & 3 deletions pipeline-runner/R/gem2s-7-upload_to_aws.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ upload_to_aws <- function(input, pipeline_config, prev_out) {
scdata_list <- prev_out$scdata_list
config <- prev_out$config
qc_config <- prev_out$qc_config
default_qc_config <- prev_out$default_qc_config
disable_qc_filters <- prev_out$disable_qc_filters

# TODO: replace with subset_experiment flag when available
Expand All @@ -31,7 +32,6 @@ upload_to_aws <- function(input, pipeline_config, prev_out) {
cell_sets <- get_subset_cell_sets(scdata_list, input, prev_out, disable_qc_filters)
}


# cell sets file to s3
cell_sets_data <- RJSONIO::toJSON(cell_sets)

Expand Down Expand Up @@ -69,7 +69,8 @@ upload_to_aws <- function(input, pipeline_config, prev_out) {
organism = config$organism,
type = config$input$type
),
processingConfig = qc_config
processingConfig = qc_config,
defaultProcessingConfig = default_qc_config
)

res <- list(
Expand Down Expand Up @@ -330,8 +331,16 @@ get_subset_cell_sets <- function(scdata_list, input, prev_out, disable_qc_filter
if ("scratchpad" %in% unique(subset_cellsets$type)) {
message("adding custom cellsets to subset experiment")
scratchpad_cellsets <- build_scratchpad_cellsets(color_pool, subset_cellsets)
cell_sets <- c(cell_sets, list(scratchpad_cellsets))
} else {
scratchpad_cellsets <- list(
key = "scratchpad",
name = "Custom cell sets",
rootNode = TRUE,
children = list(),
type = "cellSets"
)
}
cell_sets <- c(cell_sets, list(scratchpad_cellsets))

cell_sets <- list(cellSets = cell_sets)

Expand Down
Loading

0 comments on commit 6b1d8e8

Please sign in to comment.