Merge pull request #302 from hms-dbmi-cellenics/biomage-changes-4

Changes 4
hms-dbmi-cellenics · May 30, 2023 · 6b1d8e8 · 6b1d8e8
2 parents e8233df + ec21a0a
commit 6b1d8e8
Show file tree

Hide file tree

Showing 40 changed files with 3,874 additions and 251 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -34,14 +34,14 @@ jobs:
           EOF
           )
 
-          UNCHECKED_BODY="${UNCHECKED_BODY//'%'/'%25'}"
-          UNCHECKED_BODY="${UNCHECKED_BODY//$'\n'/'%0A'}"
-          UNCHECKED_BODY="${UNCHECKED_BODY//$'\r'/'%0D'}"
-
           echo "Unchecked PR body"
           echo $UNCHECKED_BODY
 
-          echo "::set-output name=body::$UNCHECKED_BODY"
+          # This sets multiline strings into the output variable
+          # See https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-of-a-multiline-string
+          echo "body<<EOF" >> "$GITHUB_OUTPUT"
+          echo "$UNCHECKED_BODY" >> "$GITHUB_OUTPUT"
+          echo "EOF" >> "$GITHUB_OUTPUT"
 
 
       - id: uncheck-integration-checkbox
@@ -288,7 +288,7 @@ jobs:
             export CHART_REF="$GITHUB_SHA"
             export KUBERNETES_ENV="production"
             export IMAGE_NAME=$IMAGE_TAG-pipeline-runner
-            export REPLICA_COUNT="3"
+            export REPLICA_COUNT="1"
             export VERSION_NUMBER=${REF_ID/refs-tags-/}
             export IMAGE_PATTERN="^refs-tags-(?P<version>[0-9]+\.[0-9]+\.[0-9]+)-pipeline-runner$"
             export IMAGE_EXTRACT='$version'

diff --git a/.github/workflows/pr_validate.yaml b/.github/workflows/pr_validate.yaml
@@ -68,19 +68,19 @@ jobs:
 
           echo "Is staging N/A?"
           echo $IS_STAGING_NA
-          echo "::set-output name=is-staging-na::$IS_STAGING_NA"
+          echo "is-staging-na=$IS_STAGING_NA" >> $GITHUB_OUTPUT
 
           echo "Full URL:"
           echo $URL
-          echo "::set-output name=url::$URL"
+          echo "url=$URL" >> $GITHUB_OUTPUT
 
           SANDBOX=$(pcregrep -o2 -M "$REGEX" <<\EOF
           ${{ github.event.pull_request.body }}
           EOF
           )
           echo "Extracted sandbox:"
           echo $SANDBOX
-          echo "::set-output name=sandbox::$SANDBOX"
+          echo "sandbox=$SANDBOX" >> $GITHUB_OUTPUT
 
       - id: reach-staging
         if: steps.extract-staging.outputs.url != 'N/A'
@@ -130,8 +130,8 @@ jobs:
           fi
           echo $SHOULD_RUN_E2E
 
-          echo "::set-output name=should-e2e-cancel::$SHOULD_CANCEL_E2E"
-          echo "::set-output name=should-e2e-run::$SHOULD_RUN_E2E"
+          echo "should-e2e-cancel=$SHOULD_CANCEL_E2E" >> $GITHUB_OUTPUT
+          echo "should-e2e-run=$SHOULD_RUN_E2E" >> $GITHUB_OUTPUT
 
       - id: cancel
         name: Cancel if staging is not N/A and box is not checked
@@ -151,12 +151,12 @@ jobs:
         run: |-
           echo "SHA of latest GitHub branch commit:"
           echo ${{ github.event.pull_request.head.sha }}
-          echo "::set-output name=github_sha::${{ github.event.pull_request.head.sha }}"
+          echo "github_sha=${{ github.event.pull_request.head.sha }}" >> $GITHUB_OUTPUT
 
           REPO_NAME=$(echo $GITHUB_REPOSITORY | awk -F '/' '{print $2}')
           echo "Repo name:"
           echo $REPO_NAME
-          echo "::set-output name=repo_name::$REPO_NAME"
+          echo "repo_name=$REPO_NAME" >> $GITHUB_OUTPUT
 
       - id: extract-integration-test-ref
         if: steps.check-e2e-run.outputs.should-e2e-run == 'true'
@@ -174,7 +174,7 @@ jobs:
           )
 
           echo "Ref given is $INTEGRATION_TEST_REF, setting it as is."
-          echo "::set-output name=ref::$INTEGRATION_TEST_REF"
+          echo "ref=$INTEGRATION_TEST_REF" >> $GITHUB_OUTPUT
 
       - id: run-integration-test
         if: steps.check-e2e-run.outputs.should-e2e-run == 'true'
@@ -203,14 +203,14 @@ jobs:
           EOF
           )
 
-          UNCHECKED_BODY="${UNCHECKED_BODY//'%'/'%25'}"
-          UNCHECKED_BODY="${UNCHECKED_BODY//$'\n'/'%0A'}"
-          UNCHECKED_BODY="${UNCHECKED_BODY//$'\r'/'%0D'}"
-
           echo "Unchecked PR body"
           echo $UNCHECKED_BODY
 
-          echo "::set-output name=body::$UNCHECKED_BODY"
+          # This sets multiline strings into the output variable
+          # See https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-of-a-multiline-string
+          echo "body<<EOF" >> "$GITHUB_OUTPUT"
+          echo "$UNCHECKED_BODY" >> "$GITHUB_OUTPUT"
+          echo "EOF" >> "$GITHUB_OUTPUT"
 
       - id: uncheck-integration-checkbox
         if: failure()

diff --git a/pipeline-runner/.lintr b/pipeline-runner/.lintr
@@ -1,4 +1,4 @@
 linters: with_defaults(
-   line_length_linter = line_length_linter(80),
+   line_length_linter = line_length_linter(100),
    object_usage_linter = NULL
  )
diff --git a/pipeline-runner/NAMESPACE b/pipeline-runner/NAMESPACE
@@ -6,6 +6,8 @@ export(add_new_sample_ids)
 export(add_subset_metadata)
 export(build_cc_gene_list)
 export(build_metadata_cellsets)
+export(build_mitochondrial_gene_list)
+export(build_ribosomal_gene_list)
 export(build_sample_cellsets)
 export(build_scratchpad_cellsets)
 export(cbind_cellset_type)

diff --git a/pipeline-runner/R/gem2s-4-score_doublets.R b/pipeline-runner/R/gem2s-4-score_doublets.R
@@ -30,22 +30,8 @@ score_doublets <- function(input, pipeline_config, prev_out) {
       sample_counts <- sample_counts[, keep]
     }
 
-    # also filter low UMI as per scDblFinder:::.checkSCE()
-    ntot <- Matrix::colSums(sample_counts)
+    scores[[sample]] <- get_doublet_scores(sample_counts)
 
-    # retry increasing the minimum counts in case of low sparsity in the sample
-    r <- NULL
-    attempt <- 1
-    while (is.null(r) && attempt <= 5) {
-      message("\ntrying to score doublets, attempt: ", attempt)
-      # make the threshold stricter in every attempt
-      empty_cells_mask <- ntot > (200 * attempt)
-      try({
-        scores[[sample]] <- compute_sample_doublet_scores(sample_counts[, empty_cells_mask])
-        r <- "not null"
-      })
-      attempt <- attempt + 1
-    }
   }
 
   prev_out$doublet_scores <- scores
@@ -77,3 +63,25 @@ compute_sample_doublet_scores <- function(sample_counts) {
 
   return(doublet_res)
 }
+
+
+#' Score doublets for one sample, retrying with stricter count thresholds
+#'
+#' Calls \code{compute_sample_doublet_scores} on the columns of
+#' \code{sample_counts} whose total counts exceed \code{200 * attempt}. When
+#' the call errors, the threshold is raised and scoring is attempted again, up
+#' to \code{max_attempts} times.
+#'
+#' @param sample_counts count matrix; columns are summed and filtered, so they
+#'   are presumably cells (confirm against caller)
+#' @param max_attempts maximum number of scoring attempts before giving up
+#'
+#' @return value returned by \code{compute_sample_doublet_scores} on the first
+#'   attempt that does not error. Raises an error if every attempt fails.
+#'
+get_doublet_scores <- function(sample_counts, max_attempts = 5) {
+  # also filter low UMI as per scDblFinder:::.checkSCE()
+  ntot <- Matrix::colSums(sample_counts)
+
+  # retry increasing the minimum counts in case of low sparsity in the sample
+  scores <- NULL
+  succeeded <- FALSE
+  attempt <- 1
+  while (!succeeded && attempt <= max_attempts) {
+    message("\nTrying to score doublets, attempt: ", attempt)
+    # make the threshold stricter in every attempt
+    empty_cells_mask <- ntot > (200 * attempt)
+    succeeded <- tryCatch(
+      {
+        scores <- compute_sample_doublet_scores(sample_counts[, empty_cells_mask])
+        TRUE
+      },
+      error = function(e) {
+        # report why this attempt failed, then let the loop try again
+        message("Doublet scoring failed: ", conditionMessage(e))
+        FALSE
+      }
+    )
+    attempt <- attempt + 1
+  }
+
+  # without this guard, a total failure surfaced as "object 'scores' not found"
+  if (!succeeded) {
+    stop(
+      "Unable to score doublets after ", max_attempts, " attempts",
+      call. = FALSE
+    )
+  }
+
+  return(scores)
+}
diff --git a/pipeline-runner/R/gem2s-5-create_seurat.R b/pipeline-runner/R/gem2s-5-create_seurat.R
@@ -100,10 +100,9 @@ construct_metadata <- function(counts, sample, config) {
 
 # add mitochondrial percent to SeuratObject
 add_mito <- function(scdata, annot) {
-  mt_regex <- "^mt[-:]"
-  if (any(grepl(mt_regex, annot$name, ignore.case = TRUE))) {
+  if (any(grepl(MITOCHONDRIAL_REGEX, annot$name, ignore.case = TRUE))) {
     message("Adding MT information...")
-    mt.features <- annot$input[grep(mt_regex, annot$name, ignore.case = TRUE)]
+    mt.features <- annot$input[grep(MITOCHONDRIAL_REGEX, annot$name, ignore.case = TRUE)]
     mt.features <- mt.features[mt.features %in% rownames(scdata)]
     if (length(mt.features)) {
       scdata <- Seurat::PercentageFeatureSet(scdata, features = mt.features, col.name = "percent.mt")

diff --git a/pipeline-runner/R/gem2s-6-construct_qc_config.R b/pipeline-runner/R/gem2s-6-construct_qc_config.R
@@ -9,22 +9,17 @@
 #'
 #' @param scdata_list list of Seurat objects
 #' @param unfiltered_samples character vector of unfiltered sample ids
-#' @param disable_qc_filters bool indicating if filters should be disabled.
 #'
 #' @return list of QC configuration parameters
 #'
-construct_qc_config <-
-  function(scdata_list,
-           disable_qc_filters,
-           unfiltered_samples) {
+construct_qc_config <- function(scdata_list, unfiltered_samples) {
     samples <- names(scdata_list)
 
     config_classifier <-
       add_custom_config_per_sample(
         customize_classifier_config,
         processing_config_template[["classifier"]],
         scdata_list,
-        disable_qc_filters,
         unfiltered_samples
       )
 
@@ -33,23 +28,20 @@ construct_qc_config <-
         customize_cellsize_config,
         processing_config_template[["cell_size"]],
         scdata_list,
-        disable_qc_filters
       )
 
     config_mitochondrial <-
       add_custom_config_per_sample(
         customize_mitochondrial_config,
         processing_config_template[["mitochondrial"]],
         scdata_list,
-        disable_qc_filters
       )
 
     config_genes_vs_umis <-
       add_custom_config_per_sample(
         customize_genes_vs_umis_config,
         processing_config_template[["genes_vs_umis"]],
         scdata_list,
-        disable_qc_filters
       )
 
 
@@ -58,7 +50,6 @@ construct_qc_config <-
         customize_doublet_config,
         processing_config_template[["doublet"]],
         scdata_list,
-        disable_qc_filters
       )
 
     config_data_integration <-
@@ -86,9 +77,8 @@ customize_classifier_config <-
   function(scdata,
            config,
            sample_name,
-           disable_qc_filters,
            unfiltered_samples) {
-    config$enabled <- sample_name %in% unfiltered_samples && !disable_qc_filters
+    config$enabled <- sample_name %in% unfiltered_samples
     config$prefiltered <- !(sample_name %in% unfiltered_samples)
 
     return(config)
@@ -99,7 +89,6 @@ customize_cellsize_config <-
   function(scdata,
            config,
            sample_name,
-           disable_qc_filters,
            unfiltered_samples) {
     minCellSize <- generate_default_values_cellSizeDistribution(scdata, config)
     config$filterSettings$minCellSize <- minCellSize
@@ -111,7 +100,6 @@ customize_mitochondrial_config <-
   function(scdata,
            config,
            sample_name,
-           disable_qc_filters,
            unfiltered_samples) {
     default_max_fraction <- generate_default_values_mitochondrialContent(scdata, config)
     config$filterSettings$methodSettings$absoluteThreshold$maxFraction <-
@@ -125,7 +113,6 @@ customize_doublet_config <-
   function(scdata,
            config,
            sample_name,
-           disable_qc_filters,
            unfiltered_samples) {
     probabilityThreshold <- generate_default_values_doubletScores(scdata)
     config$filterSettings$probabilityThreshold <- probabilityThreshold
@@ -138,7 +125,6 @@ customize_genes_vs_umis_config <-
   function(scdata,
            config,
            sample_name,
-           disable_qc_filters,
            unfiltered_samples) {
     # Sensible values are based on the function "gene.vs.molecule.cell.filter"
     # from the pagoda2 package
@@ -180,7 +166,6 @@ get_embedding_config <- function(scdata_list, config) {
 #' @param customize_template_config function - step customization function
 #' @param config_template list - template of step configuration parameters
 #' @param scdata_list list - with Seurat objects
-#' @param disable_qc_filters logical
 #' @param unfiltered_samples character vector
 #'
 #' @return list of customized QC parameters for each sample
@@ -190,7 +175,6 @@ add_custom_config_per_sample <-
   function(customize_template_config,
            config_template,
            scdata_list,
-           disable_qc_filters = FALSE,
            unfiltered_samples = NA) {
     config <- list()
     for (sample_name in names(scdata_list)) {
@@ -203,13 +187,9 @@ add_custom_config_per_sample <-
           sample_scdata,
           config_template,
           sample_name,
-          disable_qc_filters,
           unfiltered_samples
         )
 
-      sample_config$enabled <-
-        sample_config$enabled && !disable_qc_filters
-
       # update sample config thresholds
       config[[sample_name]] <- sample_config
     }

diff --git a/pipeline-runner/R/gem2s-6-prepare_experiment.R b/pipeline-runner/R/gem2s-6-prepare_experiment.R
@@ -7,7 +7,8 @@
 #' @param prev_out  'output' slot from call to \code{create_seurat}
 #'
 #' @return prev_out \code{prev_out} with added slots 'scdata' containing merged
-#'   \code{SeuratObject} and 'qc_config' containing default config for QC steps.
+#'  \code{SeuratObject} and 'qc_config' containing default config for QC steps
+#'  if it doesn't already exist.
 #'
 #' @export
 #'
@@ -23,7 +24,7 @@ prepare_experiment <- function(input, pipeline_config, prev_out) {
 
   message("Total cells:", sum(sapply(scdata_list, ncol)))
 
-# metadata is added to subset experiment at the subset step 1
+  # metadata is added to subset experiment at the subset step 1
   if (!disable_qc_filters) {
     scdata_list <-
       add_metadata_to_samples(scdata_list, prev_out$annot, input$experimentId)
@@ -33,7 +34,17 @@ prepare_experiment <- function(input, pipeline_config, prev_out) {
   # construct default QC config and update prev out
   message("Constructing default QC configuration...")
   unfiltered_samples <- names(prev_out$edrops[!is.null(prev_out$edrops)])
-  prev_out$qc_config <- construct_qc_config(scdata_list, disable_qc_filters, unfiltered_samples)
+  prev_out$default_qc_config <- construct_qc_config(scdata_list, unfiltered_samples)
+
+  # If we received a qc_config (subset pipeline case) then
+  # we want to set that one as the custom config
+  if ("qc_config" %in% names(prev_out)) {
+    message("Custom QC config received in prev_out, setting it as custom")
+    prev_out$qc_config <- prev_out$qc_config
+  } else {
+    message("No custom QC config, setting default instead")
+    prev_out$qc_config <- prev_out$default_qc_config
+  }
 
   res <- list(
     data = list(),

diff --git a/pipeline-runner/R/gem2s-7-upload_to_aws.R b/pipeline-runner/R/gem2s-7-upload_to_aws.R
@@ -20,6 +20,7 @@ upload_to_aws <- function(input, pipeline_config, prev_out) {
   scdata_list <- prev_out$scdata_list
   config <- prev_out$config
   qc_config <- prev_out$qc_config
+  default_qc_config <- prev_out$default_qc_config
   disable_qc_filters <- prev_out$disable_qc_filters
 
   # TODO: replace with subset_experiment flag when available
@@ -31,7 +32,6 @@ upload_to_aws <- function(input, pipeline_config, prev_out) {
     cell_sets <- get_subset_cell_sets(scdata_list, input, prev_out, disable_qc_filters)
   }
 
-
   # cell sets file to s3
   cell_sets_data <- RJSONIO::toJSON(cell_sets)
 
@@ -69,7 +69,8 @@ upload_to_aws <- function(input, pipeline_config, prev_out) {
       organism = config$organism,
       type = config$input$type
     ),
-    processingConfig = qc_config
+    processingConfig = qc_config,
+    defaultProcessingConfig = default_qc_config
   )
 
   res <- list(
@@ -330,8 +331,16 @@ get_subset_cell_sets <- function(scdata_list, input, prev_out, disable_qc_filter
   if ("scratchpad" %in% unique(subset_cellsets$type)) {
     message("adding custom cellsets to subset experiment")
     scratchpad_cellsets <- build_scratchpad_cellsets(color_pool, subset_cellsets)
-    cell_sets <- c(cell_sets, list(scratchpad_cellsets))
+  } else {
+    scratchpad_cellsets <- list(
+      key = "scratchpad",
+      name = "Custom cell sets",
+      rootNode = TRUE,
+      children = list(),
+      type = "cellSets"
+    )
   }
+  cell_sets <- c(cell_sets, list(scratchpad_cellsets))
 
   cell_sets <- list(cellSets = cell_sets)