
Merge pull request #315 from hms-dbmi-cellenics/improve-seurat
add automated cluster detection for Seurat object uploads
alexvpickering authored Aug 28, 2023
2 parents e4e39eb + 2e6d24e commit 9f0d87d
Showing 5 changed files with 280 additions and 56 deletions.
9 changes: 6 additions & 3 deletions pipeline-runner/R/qc-7-embed_and_cluster.R
@@ -72,9 +72,12 @@ ensure_is_list_in_json <- function(vector) {
}
}

format_cell_sets_object <-
function(cell_sets, clustering_method, color_pool) {
name <- paste0(clustering_method, " clusters")
format_cell_sets_object <- function(
cell_sets,
clustering_method,
color_pool,
name = paste0(clustering_method, " clusters")
) {

# careful with capital l on type for the key.
cell_sets_object <-
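
For reference, a minimal sketch of how the updated signature can be called, mirroring its use in seurat-3-upload_seurat_to_aws.R below; the data frame and color pool are hypothetical stand-ins:

cell_sets_df <- data.frame(
  cluster = c(0, 0, 1, 1),   # cluster assignment per cell
  cell_ids = 0:3             # integer cell ids
)
color_pool <- c('#e377c2', '#8c564b')

# default name: "louvain clusters"
format_cell_sets_object(cell_sets_df, 'louvain', color_pool)

# name taken from a detected metadata column
format_cell_sets_object(cell_sets_df, 'cell_type', color_pool, name = 'cell_type')
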
78 changes: 44 additions & 34 deletions pipeline-runner/R/seurat-2-load_seurat.R
@@ -53,53 +53,75 @@ reconstruct_seurat <- function(dataset_fpath) {

# get counts
tryCatch({
SeuratObject::DefaultAssay(user_scdata) <- 'RNA'
counts <- user_scdata[['RNA']]@counts
test_user_sparse_mat(counts)
rns <- row.names(counts)
check_type_is_safe(rns)
},
error = function(e) {
message(e$message)
stop(errors$ERROR_SEURAT_COUNTS, call. = FALSE)
})

gene_annotations <- data.frame(input = rns, name = rns, original_name = rns, row.names = rns)
user_scdata@misc$gene_annotations <- gene_annotations

# add dispersions
tryCatch({
user_scdata <- add_dispersions(user_scdata, method = "default")
dispersions <- user_scdata@misc$gene_dispersion
test_user_df(dispersions)
},
error = function(e) {
message(e$message)
stop(errors$ERROR_SEURAT_HVFINFO, call. = FALSE)
stop(errors$ERROR_SEURAT_COUNTS, call. = FALSE)
})

# get meta data
tryCatch({
metadata <- user_scdata@meta.data
metadata$active.ident <- user_scdata@active.ident
test_user_df(metadata)
},
error = function(e) {
message(e$message)
stop(errors$ERROR_SEURAT_METADATA, call. = FALSE)
})

# check for clusters
if (!'seurat_clusters' %in% colnames(metadata))
stop(errors$ERROR_SEURAT_CLUSTERS, call. = FALSE)


# reconstruct seurat object
# reconstruct Seurat object
scdata <- SeuratObject::CreateSeuratObject(
counts,
meta.data = metadata,
)

# add dispersions and gene annotations to new scdata
scdata@misc$gene_dispersion <- dispersions
# add logcounts
tryCatch({
logcounts <- user_scdata[['RNA']]@data
test_user_sparse_mat(logcounts)

# shouldn't be raw counts
suspect.counts <- max(logcounts) > 100
if (suspect.counts) logcounts <- Seurat::NormalizeData(logcounts)

scdata[['RNA']]@data <- logcounts
},
error = function(e) {
message(e$message)
stop(errors$ERROR_SEURAT_LOGCOUNTS, call. = FALSE)
})


# add dispersions (need for gene list)
tryCatch({
dispersions <- Seurat::FindVariableFeatures(logcounts)
test_user_df(dispersions)
colnames(dispersions) <- gsub('^vst[.]', '', colnames(dispersions))

# keep the columns we use: same as `HVFInfo(scdata)`
dispersions <- dispersions[, c('mean', 'variance', 'variance.standardized')]
dispersions$SYMBOL <- dispersions$ENSEMBL <- row.names(dispersions)
scdata@misc$gene_dispersion <- dispersions
},
error = function(e) {
message(e$message)
stop(errors$ERROR_SEURAT_HVFINFO, call. = FALSE)
})

# add gene annotations
gene_annotations <- data.frame(
input = rns,
name = rns,
original_name = rns,
row.names = rns
)
scdata@misc$gene_annotations <- gene_annotations


@@ -140,21 +162,9 @@ reconstruct_seurat <- function(dataset_fpath) {
stop(errors$ERROR_SEURAT_REDUCTION, call. = FALSE)
})

# add logcounts
tryCatch({
data <- user_scdata[['RNA']]@data
test_user_sparse_mat(data)
scdata[['RNA']]@data <- data
},
error = function(e) {
message(e$message)
stop(errors$ERROR_SEURAT_LOGCOUNTS, call. = FALSE)
})

return(scdata)
}


test_user_df <- function(df) {
for (col in colnames(df)) {
check_type_is_safe(df[, col])
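
A minimal sketch of the two new steps above: the heuristic deciding whether the uploaded data slot still holds raw counts, and the recomputation of dispersions from logcounts. Here `mat` stands in for `user_scdata[['RNA']]@data`, and the column names returned by FindVariableFeatures (including the 'vst.' prefix) can vary with the Seurat version:

# raw UMI counts routinely exceed 100; log-normalized values do not
if (max(mat) > 100) {
  mat <- Seurat::NormalizeData(mat)  # log-normalize before storing as logcounts
}

# recompute dispersions from logcounts (needed for the gene list)
dispersions <- Seurat::FindVariableFeatures(mat)
colnames(dispersions) <- gsub('^vst[.]', '', colnames(dispersions))
dispersions <- dispersions[, c('mean', 'variance', 'variance.standardized')]
dispersions$SYMBOL <- dispersions$ENSEMBL <- row.names(dispersions)
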
101 changes: 91 additions & 10 deletions pipeline-runner/R/seurat-3-upload_seurat_to_aws.R
@@ -15,15 +15,28 @@ upload_seurat_to_aws <- function(input, pipeline_config, prev_out) {
scdata <- change_sample_names_to_ids(scdata, input)
cell_sets <- get_cell_sets(scdata, input)

# add louvain clusters
cluster_sets <- data.frame(
cluster = scdata$seurat_clusters,
cell_ids = scdata$cells_id
) %>%
format_cell_sets_object('louvain', scdata@misc$color_pool)
# detect cluster metadata
cluster_columns <- find_cluster_columns(scdata)
if (!length(cluster_columns))
stop(errors$ERROR_SEURAT_CLUSTERS, call. = FALSE)

# add clusters
cluster_sets <- list()
for (i in seq_along(cluster_columns)) {
col <- cluster_columns[i]

# first cluster column used as 'louvain' clusters
method <- ifelse(i == 1, 'louvain', col)

cluster_sets[[i]] <- data.frame(
cluster = scdata@meta.data[[col]],
cell_ids = scdata$cells_id
) |>
format_cell_sets_object(method, scdata@misc$color_pool, name = col)
}

# cell sets file to s3
cell_sets$cellSets <- c(list(cluster_sets), cell_sets$cellSets)
cell_sets$cellSets <- c(cluster_sets, cell_sets$cellSets)
cell_sets_data <- RJSONIO::toJSON(cell_sets)

put_object_in_s3(pipeline_config,
@@ -64,6 +77,69 @@ upload_seurat_to_aws <- function(input, pipeline_config, prev_out) {
return(res)
}

find_cluster_columns <- function(scdata) {

# exclude all group columns, including duplicates
group_cols <- find_group_columns(scdata, remove.dups = FALSE)
exclude_cols <- c(group_cols, 'samples')

# order meta to indicate preference for louvain clusters
meta <- scdata@meta.data
louvain_cols <- c('louvain', 'active.ident', 'seurat_clusters')
meta <- meta |> dplyr::relocate(dplyr::any_of(louvain_cols))

check_cols <- setdiff(colnames(meta), exclude_cols)

cluster_cols <- c()
for (check_col in check_cols) {
check_vals <- meta[[check_col]]

# skip if too few or way too many values
value_counts <- table(check_vals)
n.vals <- length(value_counts)
if (n.vals < 2) next()
if (n.vals > 1000) next()
cat("candidate cluster columns:", check_col, "--> nvals:", n.vals, '\n')

# skip if values are numeric but non-integer
if (is.numeric(check_vals) &&
!all(as.integer(check_vals) == check_vals)) next()

# skip if more than 1/3 of values are repeated fewer than 4 times
nreps_lt4 <- sum(value_counts < 4)
if (nreps_lt4 > n.vals/3) next()

# skip if col is same as samples or group column
is_sample_col <- FALSE
for (exclude_col in exclude_cols) {
exclude_vals <- meta[[exclude_col]]
if (test_groups_equal(check_vals, exclude_vals)) {
is_sample_col <- TRUE
break
}
}
if (is_sample_col) next()

# TODO: decide if should skip if values are boolean?
# add remaining column to cluster_cols
cluster_cols <- c(cluster_cols, check_col)
}

return(cluster_cols)
}

make_vals_numeric <- function(vals) {
as.numeric(factor(vals, levels = unique(vals)))
}

test_groups_equal <- function(vals1, vals2) {
vals1 <- make_vals_numeric(vals1)
vals2 <- make_vals_numeric(vals2)

all(vals1 == vals2)
}


add_samples_to_input <- function(scdata, input) {
samples <- unique(scdata$samples)
input$sampleNames <- samples
Expand Down Expand Up @@ -96,7 +172,7 @@ add_metadata_to_input <- function(scdata, input) {
}

# get column names that are consistent with sample groups
find_group_columns <- function(scdata) {
find_group_columns <- function(scdata, remove.dups = TRUE) {
meta <- scdata@meta.data

ndistinct_sample <- meta |>
@@ -118,11 +194,16 @@ find_group_columns <- function(scdata) {
one.per.sample <- apply(ndistinct_sample, 2, function(x) all(x == 1))
group.cols <- names(ndistinct)[!too.many & !too.few & one.per.sample]

# remove duplicated group columns
if (remove.dups) {
meta[group.cols] <- lapply(meta[group.cols], make_vals_numeric)
dups <- duplicated(as.list(meta))
group.cols <- group.cols[group.cols %in% colnames(meta)[!dups]]
}

return(group.cols)
}



# add 'cells_id'
# 'samples' must be already added
# current input$metadata not yet implemented
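
A minimal sketch of the grouping-equality check defined above, which find_cluster_columns uses to drop metadata columns that merely relabel the sample or group assignment; the toy vectors are hypothetical:

samples  <- c('s1', 's1', 's2', 's2')
relabel  <- c('a', 'a', 'b', 'b')      # same partition as samples, different labels
clusters <- c(0, 1, 0, 1)              # a genuinely different partition

test_groups_equal(samples, relabel)    # TRUE  -> column is excluded
test_groups_equal(samples, clusters)   # FALSE -> column stays a cluster candidate
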
54 changes: 45 additions & 9 deletions pipeline-runner/tests/testthat/test-seurat-2-load_seurat.R
@@ -45,8 +45,8 @@ test_that("load_seurat has the correct structure", {
# only the default and 'pca' reduction is present
expect_equal(c(SeuratObject::DefaultDimReduc(orig_scdata), 'pca'), names(scdata@reductions))

# all metadata is preserved
expect_equal(scdata@meta.data, orig_scdata@meta.data)
# all metadata is preserved and active.ident is added
expect_equal(colnames(scdata@meta.data), c(colnames(orig_scdata@meta.data), 'active.ident'))

# clean up
unlink(data_dir, recursive = TRUE)
@@ -93,7 +93,7 @@ test_that("load_seurat fails if there is no RNA assay", {
unlink(data_dir, recursive = TRUE)
})

test_that("load_seurat fails if there is no seurat_clusters column in metadata", {
test_that("load_seurat does not fail if there is no seurat_clusters column in metadata", {
# setup
input_dir <- tempdir()
data_dir <- file.path(input_dir, 'pbmc_small')
@@ -107,7 +107,7 @@ test_that("load_seurat fails if there is no seurat_clusters column in metadata",
prev_out <- list(config = list(samples = 'pbmc_small'))
expect_error(
load_seurat(input = NULL, pipeline_config = NULL, prev_out = prev_out, input_dir = input_dir),
regexp = 'ERROR_SEURAT_CLUSTERS'
NA
)

# clean up
@@ -157,6 +157,7 @@ test_that("load_seurat fails if there is no pca reduction", {
unlink(data_dir, recursive = TRUE)
})


test_that("load_seurat fails if there is inappropriate logcounts data", {
# setup
input_dir <- tempdir()
@@ -167,7 +168,6 @@ test_that("load_seurat fails if there is inappropriate logcounts data", {
# sparse matrix with wrong dimensions
bad_data <- Matrix::sparseMatrix(1, 1, x = 1)
expect_error(test_user_sparse_mat(bad_data), NA)

orig_scdata@assays$RNA@data <- bad_data
saveRDS(orig_scdata, file.path(data_dir, 'r.rds'))

@@ -181,7 +181,7 @@
unlink(data_dir, recursive = TRUE)
})

test_that("load_seurat fails if there if HVFInfo can't be called", {
test_that("load_seurat generates HVFInfo if it is not present", {
# setup
input_dir <- tempdir()
data_dir <- file.path(input_dir, 'pbmc_small')
@@ -195,9 +195,45 @@

prev_out <- list(config = list(samples = 'pbmc_small'))
expect_error(
load_seurat(input = NULL, pipeline_config = NULL, prev_out = prev_out, input_dir = input_dir),
regexp = 'ERROR_SEURAT_HVFINFO'
)
res <- load_seurat(input = NULL, pipeline_config = NULL, prev_out = prev_out, input_dir = input_dir),
NA)

scdata <- res$output$scdata
disp_colnames <- c('mean', 'variance', 'variance.standardized', 'ENSEMBL', 'SYMBOL')
expect_true(methods::is(scdata@misc$gene_dispersion, 'data.frame'))
expect_equal(colnames(scdata@misc$gene_dispersion), disp_colnames)

# clean up
unlink(data_dir, recursive = TRUE)
})

test_that("load_seurat identifies and log-transforms counts stored in data assay", {
# setup
input_dir <- tempdir()
data_dir <- file.path(input_dir, 'pbmc_small')
dir.create(data_dir)
orig_scdata <- mock_scdata(data_dir)


# replace logcounts with counts
orig_logcounts <- orig_scdata[['RNA']]@data
orig_counts <- orig_scdata[['RNA']]@counts
orig_scdata[['RNA']]@data <- orig_counts

# checking setup
expect_true(max(orig_counts) > 100)
expect_false(max(orig_logcounts) > 100)
expect_identical(orig_scdata[['RNA']]@data, orig_counts)

saveRDS(orig_scdata, file.path(data_dir, 'r.rds'))
prev_out <- list(config = list(samples = 'pbmc_small'))
res <- load_seurat(input = NULL, pipeline_config = NULL, prev_out = prev_out, input_dir = input_dir)
scdata <- res$output$scdata

# logcounts were log-transformed
counts <- scdata[['RNA']]@counts
expect_identical(orig_counts, counts)
expect_identical(scdata[['RNA']]@data, Seurat::NormalizeData(orig_counts))

# clean up
unlink(data_dir, recursive = TRUE)