
Merge pull request #315 from hms-dbmi-cellenics/improve-seurat
add automated cluster detection for Seurat object uploads
alexvpickering authored Aug 28, 2023
2 parents e4e39eb + 2e6d24e commit 9f0d87d
Showing 5 changed files with 280 additions and 56 deletions.
9 changes: 6 additions & 3 deletions pipeline-runner/R/qc-7-embed_and_cluster.R
@@ -72,9 +72,12 @@ ensure_is_list_in_json <- function(vector) {
}
}

format_cell_sets_object <-
function(cell_sets, clustering_method, color_pool) {
name <- paste0(clustering_method, " clusters")
format_cell_sets_object <- function(
cell_sets,
clustering_method,
color_pool,
name = paste0(clustering_method, " clusters")
) {

# careful with capital l on type for the key.
cell_sets_object <-
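
For reference, a minimal sketch of how the updated signature can be called, mirroring its use in seurat-3-upload_seurat_to_aws.R below; the data frame and color pool are hypothetical stand-ins:

cell_sets_df <- data.frame(
  cluster = c(0, 0, 1, 1),   # cluster assignment per cell
  cell_ids = 0:3             # integer cell ids
)
color_pool <- c('#e377c2', '#8c564b')

# default name: "louvain clusters"
format_cell_sets_object(cell_sets_df, 'louvain', color_pool)

# name taken from a detected metadata column
format_cell_sets_object(cell_sets_df, 'cell_type', color_pool, name = 'cell_type')
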
78 changes: 44 additions & 34 deletions pipeline-runner/R/seurat-2-load_seurat.R
@@ -53,53 +53,75 @@ reconstruct_seurat <- function(dataset_fpath) {

# get counts
tryCatch({
SeuratObject::DefaultAssay(user_scdata) <- 'RNA'
counts <- user_scdata[['RNA']]@counts
test_user_sparse_mat(counts)
rns <- row.names(counts)
check_type_is_safe(rns)
},
error = function(e) {
message(e$message)
stop(errors$ERROR_SEURAT_COUNTS, call. = FALSE)
})

gene_annotations <- data.frame(input = rns, name = rns, original_name = rns, row.names = rns)
user_scdata@misc$gene_annotations <- gene_annotations

# add dispersions
tryCatch({
user_scdata <- add_dispersions(user_scdata, method = "default")
dispersions <- user_scdata@misc$gene_dispersion
test_user_df(dispersions)
},
error = function(e) {
message(e$message)
stop(errors$ERROR_SEURAT_HVFINFO, call. = FALSE)
stop(errors$ERROR_SEURAT_COUNTS, call. = FALSE)
})

# get meta data
tryCatch({
metadata <- user_scdata@meta.data
metadata$active.ident <- user_scdata@active.ident
test_user_df(metadata)
},
error = function(e) {
message(e$message)
stop(errors$ERROR_SEURAT_METADATA, call. = FALSE)
})

# check for clusters
if (!'seurat_clusters' %in% colnames(metadata))
stop(errors$ERROR_SEURAT_CLUSTERS, call. = FALSE)


# reconstruct seurat object
# reconstruct Seurat object
scdata <- SeuratObject::CreateSeuratObject(
counts,
meta.data = metadata,
)

# add dispersions and gene annotations to new scdata
scdata@misc$gene_dispersion <- dispersions
# add logcounts
tryCatch({
logcounts <- user_scdata[['RNA']]@data
test_user_sparse_mat(logcounts)

# shouldn't be raw counts
suspect.counts <- max(logcounts) > 100
if (suspect.counts) logcounts <- Seurat::NormalizeData(logcounts)

scdata[['RNA']]@data <- logcounts
},
error = function(e) {
message(e$message)
stop(errors$ERROR_SEURAT_LOGCOUNTS, call. = FALSE)
})


# add dispersions (need for gene list)
tryCatch({
dispersions <- Seurat::FindVariableFeatures(logcounts)
test_user_df(dispersions)
colnames(dispersions) <- gsub('^vst[.]', '', colnames(dispersions))

# keep the columns we use: same as `HVFInfo(scdata)`
dispersions <- dispersions[, c('mean', 'variance', 'variance.standardized')]
dispersions$SYMBOL <- dispersions$ENSEMBL <- row.names(dispersions)
scdata@misc$gene_dispersion <- dispersions
},
error = function(e) {
message(e$message)
stop(errors$ERROR_SEURAT_HVFINFO, call. = FALSE)
})

# add gene annotations
gene_annotations <- data.frame(
input = rns,
name = rns,
original_name = rns,
row.names = rns
)
scdata@misc$gene_annotations <- gene_annotations


@@ -140,21 +162,9 @@ reconstruct_seurat <- function(dataset_fpath) {
stop(errors$ERROR_SEURAT_REDUCTION, call. = FALSE)
})

# add logcounts
tryCatch({
data <- user_scdata[['RNA']]@data
test_user_sparse_mat(data)
scdata[['RNA']]@data <- data
},
error = function(e) {
message(e$message)
stop(errors$ERROR_SEURAT_LOGCOUNTS, call. = FALSE)
})

return(scdata)
}


test_user_df <- function(df) {
for (col in colnames(df)) {
check_type_is_safe(df[, col])
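
A minimal sketch of the two new steps above: the heuristic deciding whether the uploaded data slot still holds raw counts, and the recomputation of dispersions from logcounts. Here `mat` stands in for `user_scdata[['RNA']]@data`, and the column names returned by FindVariableFeatures (including the 'vst.' prefix) can vary with the Seurat version:

# raw UMI counts routinely exceed 100; log-normalized values do not
if (max(mat) > 100) {
  mat <- Seurat::NormalizeData(mat)  # log-normalize before storing as logcounts
}

# recompute dispersions from logcounts (needed for the gene list)
dispersions <- Seurat::FindVariableFeatures(mat)
colnames(dispersions) <- gsub('^vst[.]', '', colnames(dispersions))
dispersions <- dispersions[, c('mean', 'variance', 'variance.standardized')]
dispersions$SYMBOL <- dispersions$ENSEMBL <- row.names(dispersions)
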
101 changes: 91 additions & 10 deletions pipeline-runner/R/seurat-3-upload_seurat_to_aws.R
@@ -15,15 +15,28 @@ upload_seurat_to_aws <- function(input, pipeline_config, prev_out) {
scdata <- change_sample_names_to_ids(scdata, input)
cell_sets <- get_cell_sets(scdata, input)

# add louvain clusters
cluster_sets <- data.frame(
cluster = scdata$seurat_clusters,
cell_ids = scdata$cells_id
) %>%
format_cell_sets_object('louvain', scdata@misc$color_pool)
# detect cluster metadata
cluster_columns <- find_cluster_columns(scdata)
if (!length(cluster_columns))
stop(errors$ERROR_SEURAT_CLUSTERS, call. = FALSE)

# add clusters
cluster_sets <- list()
for (i in seq_along(cluster_columns)) {
col <- cluster_columns[i]

# first cluster column used as 'louvain' clusters
method <- ifelse(i == 1, 'louvain', col)

cluster_sets[[i]] <- data.frame(
cluster = scdata@meta.data[[col]],
cell_ids = scdata$cells_id
) |>
format_cell_sets_object(method, scdata@misc$color_pool, name = col)
}

# cell sets file to s3
cell_sets$cellSets <- c(list(cluster_sets), cell_sets$cellSets)
cell_sets$cellSets <- c(cluster_sets, cell_sets$cellSets)
cell_sets_data <- RJSONIO::toJSON(cell_sets)

put_object_in_s3(pipeline_config,
@@ -64,6 +77,69 @@ upload_seurat_to_aws <- function(input, pipeline_config, prev_out) {
return(res)
}

find_cluster_columns <- function(scdata) {

# exclude all group columns, including duplicates
group_cols <- find_group_columns(scdata, remove.dups = FALSE)
exclude_cols <- c(group_cols, 'samples')

# order meta to indicate preference for louvain clusters
meta <- scdata@meta.data
louvain_cols <- c('louvain', 'active.ident', 'seurat_clusters')
meta <- meta |> dplyr::relocate(dplyr::any_of(louvain_cols))

check_cols <- setdiff(colnames(meta), exclude_cols)

cluster_cols <- c()
for (check_col in check_cols) {
check_vals <- meta[[check_col]]

# skip if too few or way too many values
value_counts <- table(check_vals)
n.vals <- length(value_counts)
if (n.vals < 2) next()
if (n.vals > 1000) next()
cat("candidate cluster columns:", check_col, "--> nvals:", n.vals, '\n')

# skip if values are numeric but non-integer
if (is.numeric(check_vals) &&
!all(as.integer(check_vals) == check_vals)) next()

# skip if more than 1/3 of values are repeated fewer than 4 times
nreps_lt4 <- sum(value_counts < 4)
if (nreps_lt4 > n.vals/3) next()

# skip if col is same as samples or group column
is_sample_col <- FALSE
for (exclude_col in exclude_cols) {
exclude_vals <- meta[[exclude_col]]
if (test_groups_equal(check_vals, exclude_vals)) {
is_sample_col <- TRUE
break
}
}
if (is_sample_col) next()

# TODO: decide if should skip if values are boolean?
# add remaining column to cluster_cols
cluster_cols <- c(cluster_cols, check_col)
}

return(cluster_cols)
}

make_vals_numeric <- function(vals) {
as.numeric(factor(vals, levels = unique(vals)))
}

test_groups_equal <- function(vals1, vals2) {
vals1 <- make_vals_numeric(vals1)
vals2 <- make_vals_numeric(vals2)

all(vals1 == vals2)
}


add_samples_to_input <- function(scdata, input) {
samples <- unique(scdata$samples)
input$sampleNames <- samples
Expand Down Expand Up @@ -96,7 +172,7 @@ add_metadata_to_input <- function(scdata, input) {
}

# get column names that are consistent with sample groups
find_group_columns <- function(scdata) {
find_group_columns <- function(scdata, remove.dups = TRUE) {
meta <- scdata@meta.data

ndistinct_sample <- meta |>
@@ -118,11 +194,16 @@ find_group_columns <- function(scdata) {
one.per.sample <- apply(ndistinct_sample, 2, function(x) all(x == 1))
group.cols <- names(ndistinct)[!too.many & !too.few & one.per.sample]

# remove duplicated group columns
if (remove.dups) {
meta[group.cols] <- lapply(meta[group.cols], make_vals_numeric)
dups <- duplicated(as.list(meta))
group.cols <- group.cols[group.cols %in% colnames(meta)[!dups]]
}

return(group.cols)
}



# add 'cells_id'
# 'samples' must be already added
# current input$metadata not yet implemented
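
A minimal sketch of the grouping-equality check defined above, which find_cluster_columns uses to drop metadata columns that merely relabel the sample or group assignment; the toy vectors are hypothetical:

samples  <- c('s1', 's1', 's2', 's2')
relabel  <- c('a', 'a', 'b', 'b')      # same partition as samples, different labels
clusters <- c(0, 1, 0, 1)              # a genuinely different partition

test_groups_equal(samples, relabel)    # TRUE  -> column is excluded
test_groups_equal(samples, clusters)   # FALSE -> column stays a cluster candidate
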
54 changes: 45 additions & 9 deletions pipeline-runner/tests/testthat/test-seurat-2-load_seurat.R
@@ -45,8 +45,8 @@ test_that("load_seurat has the correct structure", {
# only the default and 'pca' reduction is present
expect_equal(c(SeuratObject::DefaultDimReduc(orig_scdata), 'pca'), names(scdata@reductions))

# all metadata is preserved
expect_equal(scdata@meta.data, orig_scdata@meta.data)
# all metadata is preserved and active.ident is added
expect_equal(colnames(scdata@meta.data), c(colnames(orig_scdata@meta.data), 'active.ident'))

# clean up
unlink(data_dir, recursive = TRUE)
@@ -93,7 +93,7 @@ test_that("load_seurat fails if there is no RNA assay", {
unlink(data_dir, recursive = TRUE)
})

test_that("load_seurat fails if there is no seurat_clusters column in metadata", {
test_that("load_seurat does not fail if there is no seurat_clusters column in metadata", {
# setup
input_dir <- tempdir()
data_dir <- file.path(input_dir, 'pbmc_small')
@@ -107,7 +107,7 @@ test_that("load_seurat fails if there is no seurat_clusters column in metadata",
prev_out <- list(config = list(samples = 'pbmc_small'))
expect_error(
load_seurat(input = NULL, pipeline_config = NULL, prev_out = prev_out, input_dir = input_dir),
regexp = 'ERROR_SEURAT_CLUSTERS'
NA
)

# clean up
@@ -157,6 +157,7 @@ test_that("load_seurat fails if there is no pca reduction", {
unlink(data_dir, recursive = TRUE)
})


test_that("load_seurat fails if there is inappropriate logcounts data", {
# setup
input_dir <- tempdir()
@@ -167,7 +168,6 @@ test_that("load_seurat fails if there is inappropriate logcounts data", {
# sparse matrix with wrong dimensions
bad_data <- Matrix::sparseMatrix(1, 1, x = 1)
expect_error(test_user_sparse_mat(bad_data), NA)

orig_scdata@assays$RNA@data <- bad_data
saveRDS(orig_scdata, file.path(data_dir, 'r.rds'))

@@ -181,7 +181,7 @@
unlink(data_dir, recursive = TRUE)
})

test_that("load_seurat fails if there if HVFInfo can't be called", {
test_that("load_seurat generates HVFInfo if it is not present", {
# setup
input_dir <- tempdir()
data_dir <- file.path(input_dir, 'pbmc_small')
@@ -195,9 +195,45 @@

prev_out <- list(config = list(samples = 'pbmc_small'))
expect_error(
load_seurat(input = NULL, pipeline_config = NULL, prev_out = prev_out, input_dir = input_dir),
regexp = 'ERROR_SEURAT_HVFINFO'
)
res <- load_seurat(input = NULL, pipeline_config = NULL, prev_out = prev_out, input_dir = input_dir),
NA)

scdata <- res$output$scdata
disp_colnames <- c('mean', 'variance', 'variance.standardized', 'ENSEMBL', 'SYMBOL')
expect_true(methods::is(scdata@misc$gene_dispersion, 'data.frame'))
expect_equal(colnames(scdata@misc$gene_dispersion), disp_colnames)

# clean up
unlink(data_dir, recursive = TRUE)
})

test_that("load_seurat identifies and log-transforms counts stored in data assay", {
# setup
input_dir <- tempdir()
data_dir <- file.path(input_dir, 'pbmc_small')
dir.create(data_dir)
orig_scdata <- mock_scdata(data_dir)


# replace logcounts with counts
orig_logcounts <- orig_scdata[['RNA']]@data
orig_counts <- orig_scdata[['RNA']]@counts
orig_scdata[['RNA']]@data <- orig_counts

# checking setup
expect_true(max(orig_counts) > 100)
expect_false(max(orig_logcounts) > 100)
expect_identical(orig_scdata[['RNA']]@data, orig_counts)

saveRDS(orig_scdata, file.path(data_dir, 'r.rds'))
prev_out <- list(config = list(samples = 'pbmc_small'))
res <- load_seurat(input = NULL, pipeline_config = NULL, prev_out = prev_out, input_dir = input_dir)
scdata <- res$output$scdata

# logcounts were log-transformed
counts <- scdata[['RNA']]@counts
expect_identical(orig_counts, counts)
expect_identical(scdata[['RNA']]@data, Seurat::NormalizeData(orig_counts))

# clean up
unlink(data_dir, recursive = TRUE)