diff --git a/pipeline-runner/R/qc-7-embed_and_cluster.R b/pipeline-runner/R/qc-7-embed_and_cluster.R index b8073c9d..6c6c336f 100644 --- a/pipeline-runner/R/qc-7-embed_and_cluster.R +++ b/pipeline-runner/R/qc-7-embed_and_cluster.R @@ -72,9 +72,12 @@ ensure_is_list_in_json <- function(vector) { } } -format_cell_sets_object <- - function(cell_sets, clustering_method, color_pool) { - name <- paste0(clustering_method, " clusters") +format_cell_sets_object <- function( + cell_sets, + clustering_method, + color_pool, + name = paste0(clustering_method, " clusters") + ) { # careful with capital l on type for the key. cell_sets_object <- diff --git a/pipeline-runner/R/seurat-2-load_seurat.R b/pipeline-runner/R/seurat-2-load_seurat.R index f8783ea3..60324299 100644 --- a/pipeline-runner/R/seurat-2-load_seurat.R +++ b/pipeline-runner/R/seurat-2-load_seurat.R @@ -53,33 +53,22 @@ reconstruct_seurat <- function(dataset_fpath) { # get counts tryCatch({ + SeuratObject::DefaultAssay(user_scdata) <- 'RNA' counts <- user_scdata[['RNA']]@counts test_user_sparse_mat(counts) rns <- row.names(counts) check_type_is_safe(rns) - }, - error = function(e) { - message(e$message) - stop(errors$ERROR_SEURAT_COUNTS, call. = FALSE) - }) - - gene_annotations <- data.frame(input = rns, name = rns, original_name = rns, row.names = rns) - user_scdata@misc$gene_annotations <- gene_annotations - # add dispersions - tryCatch({ - user_scdata <- add_dispersions(user_scdata, method = "default") - dispersions <- user_scdata@misc$gene_dispersion - test_user_df(dispersions) }, error = function(e) { message(e$message) - stop(errors$ERROR_SEURAT_HVFINFO, call. = FALSE) + stop(errors$ERROR_SEURAT_COUNTS, call. = FALSE) }) # get meta data tryCatch({ metadata <- user_scdata@meta.data + metadata$active.ident <- user_scdata@active.ident test_user_df(metadata) }, error = function(e) { @@ -87,19 +76,52 @@ reconstruct_seurat <- function(dataset_fpath) { stop(errors$ERROR_SEURAT_METADATA, call. = FALSE) }) - # check for clusters - if (!'seurat_clusters' %in% colnames(metadata)) - stop(errors$ERROR_SEURAT_CLUSTERS, call. = FALSE) - - - # reconstruct seurat object + # reconstruct Seurat object scdata <- SeuratObject::CreateSeuratObject( counts, meta.data = metadata, ) - # add dispersions and gene annotations to new scdata - scdata@misc$gene_dispersion <- dispersions + # add logcounts + tryCatch({ + logcounts <- user_scdata[['RNA']]@data + test_user_sparse_mat(logcounts) + + # shouldn't be raw counts + suspect.counts <- max(logcounts) > 100 + if (suspect.counts) logcounts <- Seurat::NormalizeData(logcounts) + + scdata[['RNA']]@data <- logcounts + }, + error = function(e) { + message(e$message) + stop(errors$ERROR_SEURAT_LOGCOUNTS, call. = FALSE) + }) + + + # add dispersions (need for gene list) + tryCatch({ + dispersions <- Seurat::FindVariableFeatures(logcounts) + test_user_df(dispersions) + colnames(dispersions) <- gsub('^vst[.]', '', colnames(dispersions)) + + # keep columns that use: same as `HVFInfo(scdata)` + dispersions <- dispersions[, c('mean', 'variance', 'variance.standardized')] + dispersions$SYMBOL <- dispersions$ENSEMBL <- row.names(dispersions) + scdata@misc$gene_dispersion <- dispersions + }, + error = function(e) { + message(e$message) + stop(errors$ERROR_SEURAT_HVFINFO, call. = FALSE) + }) + + # add gene annotations + gene_annotations <- data.frame( + input = rns, + name = rns, + original_name = rns, + row.names = rns + ) scdata@misc$gene_annotations <- gene_annotations @@ -140,21 +162,9 @@ reconstruct_seurat <- function(dataset_fpath) { stop(errors$ERROR_SEURAT_REDUCTION, call. = FALSE) }) - # add logcounts - tryCatch({ - data <- user_scdata[['RNA']]@data - test_user_sparse_mat(data) - scdata[['RNA']]@data <- data - }, - error = function(e) { - message(e$message) - stop(errors$ERROR_SEURAT_LOGCOUNTS, call. = FALSE) - }) - return(scdata) } - test_user_df <- function(df) { for (col in colnames(df)) { check_type_is_safe(df[, col]) diff --git a/pipeline-runner/R/seurat-3-upload_seurat_to_aws.R b/pipeline-runner/R/seurat-3-upload_seurat_to_aws.R index 53819e2b..c5af0329 100644 --- a/pipeline-runner/R/seurat-3-upload_seurat_to_aws.R +++ b/pipeline-runner/R/seurat-3-upload_seurat_to_aws.R @@ -15,15 +15,28 @@ upload_seurat_to_aws <- function(input, pipeline_config, prev_out) { scdata <- change_sample_names_to_ids(scdata, input) cell_sets <- get_cell_sets(scdata, input) - # add louvain clusters - cluster_sets <- data.frame( - cluster = scdata$seurat_clusters, - cell_ids = scdata$cells_id - ) %>% - format_cell_sets_object('louvain', scdata@misc$color_pool) + # detect cluster metadata + cluster_columns <- find_cluster_columns(scdata) + if (!length(cluster_columns)) + stop(errors$ERROR_SEURAT_CLUSTERS, call. = FALSE) + + # add clusters + cluster_sets <- list() + for (i in seq_along(cluster_columns)) { + col <- cluster_columns[i] + + # first cluster column used as 'louvain' clusters + method <- ifelse(i == 1, 'louvain', col) + + cluster_sets[[i]] <- data.frame( + cluster = scdata@meta.data[[col]], + cell_ids = scdata$cells_id + ) |> + format_cell_sets_object(method, scdata@misc$color_pool, name = col) + } # cell sets file to s3 - cell_sets$cellSets <- c(list(cluster_sets), cell_sets$cellSets) + cell_sets$cellSets <- c(cluster_sets, cell_sets$cellSets) cell_sets_data <- RJSONIO::toJSON(cell_sets) put_object_in_s3(pipeline_config, @@ -64,6 +77,69 @@ upload_seurat_to_aws <- function(input, pipeline_config, prev_out) { return(res) } +find_cluster_columns <- function(scdata) { + + # exclude all group columns, including duplicates + group_cols <- find_group_columns(scdata, remove.dups = FALSE) + exclude_cols <- c(group_cols, 'samples') + + # order meta to indicate preference for louvain clusters + meta <- scdata@meta.data + louvain_cols <- c('louvain', 'active.ident', 'seurat_clusters') + meta <- meta |> dplyr::relocate(dplyr::any_of(louvain_cols)) + + check_cols <- setdiff(colnames(meta), exclude_cols) + + cluster_cols <- c() + for (check_col in check_cols) { + check_vals <- meta[[check_col]] + + # skip if too few or way too many values + value_counts <- table(check_vals) + n.vals <- length(value_counts) + if (n.vals < 2) next() + if (n.vals > 1000) next() + cat("candidate cluster columns:", check_col, "--> nvals:", n.vals, '\n') + + # skip if values are numeric but non-integer + if (is.numeric(check_vals) && + !all(as.integer(check_vals) == check_vals)) next() + + # skip if more than 1/3 of values are repeated fewer than 4 times + nreps_lt4 <- sum(value_counts < 4) + if (nreps_lt4 > n.vals/3) next() + + # skip if col is same as samples or group column + is_sample_col <- FALSE + for (exclude_col in exclude_cols) { + exclude_vals <- meta[[exclude_col]] + if (test_groups_equal(check_vals, exclude_vals)) { + is_sample_col <- TRUE + break + } + } + if (is_sample_col) next() + + # TODO: decide if should skip if values are boolean? + # add remains to cluster_cols + cluster_cols <- c(cluster_cols, check_col) + } + + return(cluster_cols) +} + +make_vals_numeric <- function(vals) { + as.numeric(factor(vals, levels = unique(vals))) +} + +test_groups_equal <- function(vals1, vals2) { + vals1 <- make_vals_numeric(vals1) + vals2 <- make_vals_numeric(vals2) + + all(vals1 == vals2) +} + + add_samples_to_input <- function(scdata, input) { samples <- unique(scdata$samples) input$sampleNames <- samples @@ -96,7 +172,7 @@ add_metadata_to_input <- function(scdata, input) { } # get column names that are consistent with sample groups -find_group_columns <- function(scdata) { +find_group_columns <- function(scdata, remove.dups = TRUE) { meta <- scdata@meta.data ndistinct_sample <- meta |> @@ -118,11 +194,16 @@ find_group_columns <- function(scdata) { one.per.sample <- apply(ndistinct_sample, 2, function(x) all(x == 1)) group.cols <- names(ndistinct)[!too.many & !too.few & one.per.sample] + # remove duplicated group columns + if (remove.dups) { + meta[group.cols] <- lapply(meta[group.cols], make_vals_numeric) + dups <- duplicated(as.list(meta)) + group.cols <- group.cols[group.cols %in% colnames(meta)[!dups]] + } + return(group.cols) } - - # add 'cells_id' # 'samples' must be already added # current input$metadata not yet implemented diff --git a/pipeline-runner/tests/testthat/test-seurat-2-load_seurat.R b/pipeline-runner/tests/testthat/test-seurat-2-load_seurat.R index 62170919..82e22af6 100644 --- a/pipeline-runner/tests/testthat/test-seurat-2-load_seurat.R +++ b/pipeline-runner/tests/testthat/test-seurat-2-load_seurat.R @@ -45,8 +45,8 @@ test_that("load_seurat has the correct structure", { # only the default and 'pca' reduction is present expect_equal(c(SeuratObject::DefaultDimReduc(orig_scdata), 'pca'), names(scdata@reductions)) - # all metadata is preserved - expect_equal(scdata@meta.data, orig_scdata@meta.data) + # all metadata is preserved and active.ident is added + expect_equal(colnames(scdata@meta.data), c(colnames(orig_scdata@meta.data), 'active.ident')) # clean up unlink(data_dir, recursive = TRUE) @@ -93,7 +93,7 @@ test_that("load_seurat fails if there is no RNA assay", { unlink(data_dir, recursive = TRUE) }) -test_that("load_seurat fails if there is no seurat_clusters column in metadata", { +test_that("load_seurat does not fail if there is no seurat_clusters column in metadata", { # setup input_dir <- tempdir() data_dir <- file.path(input_dir, 'pbmc_small') @@ -107,7 +107,7 @@ test_that("load_seurat fails if there is no seurat_clusters column in metadata", prev_out <- list(config = list(samples = 'pbmc_small')) expect_error( load_seurat(input = NULL, pipeline_config = NULL, prev_out = prev_out, input_dir = input_dir), - regexp = 'ERROR_SEURAT_CLUSTERS' + NA ) # clean up @@ -157,6 +157,7 @@ test_that("load_seurat fails if there is no pca reduction", { unlink(data_dir, recursive = TRUE) }) + test_that("load_seurat fails if there is inappropriate logcounts data", { # setup input_dir <- tempdir() @@ -167,7 +168,6 @@ test_that("load_seurat fails if there is inappropriate logcounts data", { # sparse matrix with wrong dimensions bad_data <- Matrix::sparseMatrix(1, 1, x = 1) expect_error(test_user_sparse_mat(bad_data), NA) - orig_scdata@assays$RNA@data <- bad_data saveRDS(orig_scdata, file.path(data_dir, 'r.rds')) @@ -181,7 +181,7 @@ test_that("load_seurat fails if there is inappropriate logcounts data", { unlink(data_dir, recursive = TRUE) }) -test_that("load_seurat fails if there if HVFInfo can't be called", { +test_that("load_seurat generates HVFInfo if it is not present", { # setup input_dir <- tempdir() data_dir <- file.path(input_dir, 'pbmc_small') @@ -195,9 +195,45 @@ test_that("load_seurat fails if there if HVFInfo can't be called", { prev_out <- list(config = list(samples = 'pbmc_small')) expect_error( - load_seurat(input = NULL, pipeline_config = NULL, prev_out = prev_out, input_dir = input_dir), - regexp = 'ERROR_SEURAT_HVFINFO' - ) + res <- load_seurat(input = NULL, pipeline_config = NULL, prev_out = prev_out, input_dir = input_dir), + NA) + + scdata <- res$output$scdata + disp_colnames <- c('mean', 'variance', 'variance.standardized', 'ENSEMBL', 'SYMBOL') + expect_true(methods::is(scdata@misc$gene_dispersion, 'data.frame')) + expect_equal(colnames(scdata@misc$gene_dispersion), disp_colnames) + + # clean up + unlink(data_dir, recursive = TRUE) +}) + +test_that("load_seurat identifies and log-transforms counts stored in data assay", { + # setup + input_dir <- tempdir() + data_dir <- file.path(input_dir, 'pbmc_small') + dir.create(data_dir) + orig_scdata <- mock_scdata(data_dir) + + + # replace logcounts with counts + orig_logcounts <- orig_scdata[['RNA']]@data + orig_counts <- orig_scdata[['RNA']]@counts + orig_scdata[['RNA']]@data <- orig_counts + + # checking setup + expect_true(max(orig_counts) > 100) + expect_false(max(orig_logcounts) > 100) + expect_identical(orig_scdata[['RNA']]@data, orig_counts) + + saveRDS(orig_scdata, file.path(data_dir, 'r.rds')) + prev_out <- list(config = list(samples = 'pbmc_small')) + res <- load_seurat(input = NULL, pipeline_config = NULL, prev_out = prev_out, input_dir = input_dir) + scdata <- res$output$scdata + + # logcounts were log-transformed + counts <- scdata[['RNA']]@counts + expect_identical(orig_counts, counts) + expect_identical(scdata[['RNA']]@data, Seurat::NormalizeData(orig_counts)) # clean up unlink(data_dir, recursive = TRUE) diff --git a/pipeline-runner/tests/testthat/test-seurat-3-upload_seurat_to_aws.R b/pipeline-runner/tests/testthat/test-seurat-3-upload_seurat_to_aws.R index 6af3b585..fa7b7719 100644 --- a/pipeline-runner/tests/testthat/test-seurat-3-upload_seurat_to_aws.R +++ b/pipeline-runner/tests/testthat/test-seurat-3-upload_seurat_to_aws.R @@ -129,6 +129,13 @@ test_that("find_group_columns finds all group columns that are superset of sampl scdata$group1 <- ifelse(samples %in% c('A', 'B'), 'AB', 'CD') expect_equal(find_group_columns(scdata), 'group1') + # ignores column that has same grouping as another group column + scdata$group1_copy <- ifelse(samples %in% c('A', 'B'), 'group_ab', 'group_cd') + expect_equal(find_group_columns(scdata), 'group1') + + # keeps column with same grouping as another if remove.dups is FALSE + expect_equal(find_group_columns(scdata, remove.dups = FALSE), c('group1', 'group1_copy')) + # finds column that has three samples in one group scdata$group2 <- ifelse(samples %in% c('A', 'B', 'C'), 'ABC', 'D') expect_equal(find_group_columns(scdata), c('group1', 'group2')) @@ -142,6 +149,16 @@ test_that("find_group_columns finds all group columns that are superset of sampl expect_equal(find_group_columns(scdata), c('group1', 'group2')) }) +test_that("make_vals_numeric turns equivalent groups into identical vectors", { + + # works when differently ordered + vals1 <- c('a', 'a', 'a', 'a', 'b', 'b', 'b') + vals2 <- c('y', 'y', 'y', 'y', 'x', 'x', 'x') + expect_identical(make_vals_numeric(vals1), make_vals_numeric(vals2)) + + bad_make_vals_numeric <- function(vals) as.numeric(factor(vals)) + expect_failure(expect_identical(make_vals_numeric(vals1), bad_make_vals_numeric(vals2))) +}) test_that("add_metadata_to_input adds group metadata to input list", { @@ -191,3 +208,80 @@ test_that("change_sample_names_to_ids substitutes samples column for values in s scdata <- change_sample_names_to_ids(scdata, input) expect_equal(scdata@meta.data$samples, rev(tolower(samples))) }) + +test_that("find_cluster_columns finds cluster columns", { + + expected_cols <- c('seurat_clusters', 'RNA_snn_res.0.8', 'letter.idents', 'groups', 'RNA_snn_res.1') + + scdata <- mock_scdata() + sample_names <- c('A', 'B', 'C', 'D') + samples <- rep(sample_names, each = ncol(scdata)/4) + scdata$samples <- samples + + scdata$seurat_clusters <- scdata$RNA_snn_res.0.8 + + cluster_cols <- find_cluster_columns(scdata) + expect_setequal(cluster_cols, expected_cols) +}) + + +test_that("find_cluster_columns skips column with single value", { + + expected_cols <- c('seurat_clusters', 'RNA_snn_res.0.8', 'letter.idents', 'groups', 'RNA_snn_res.1') + + scdata <- mock_scdata() + sample_names <- c('A', 'B', 'C', 'D') + samples <- rep(sample_names, each = ncol(scdata)/4) + scdata$samples <- samples + + scdata$seurat_clusters <- scdata$RNA_snn_res.0.8 + scdata$blah <- 'blah' + + cluster_cols <- find_cluster_columns(scdata) + expect_setequal(cluster_cols, expected_cols) +}) + +test_that("find_cluster_columns skips columns with non-integer numeric values", { + + expected_cols <- c('seurat_clusters', 'RNA_snn_res.0.8', 'letter.idents', 'groups', 'RNA_snn_res.1') + + scdata <- mock_scdata() + sample_names <- c('A', 'B', 'C', 'D') + samples <- rep(sample_names, each = ncol(scdata)/4) + scdata$samples <- samples + + scdata$seurat_clusters <- scdata$RNA_snn_res.0.8 + scdata$blah <- rnorm(ncol(scdata)) + + cluster_cols <- find_cluster_columns(scdata) + expect_setequal(cluster_cols, expected_cols) +}) + +test_that("find_cluster_columns skips columns where repeated values are too infrequent", { + + expected_cols <- c('seurat_clusters', 'RNA_snn_res.0.8', 'letter.idents', 'groups', 'RNA_snn_res.1') + + scdata <- mock_scdata() + sample_names <- c('A', 'B', 'C', 'D') + samples <- rep(sample_names, each = ncol(scdata)/4) + scdata$samples <- samples + + scdata$seurat_clusters <- scdata$RNA_snn_res.0.8 + scdata$blah <- rep(c(letters[1:20], LETTERS[1:20]), each = 2) + + cluster_cols <- find_cluster_columns(scdata) + expect_setequal(cluster_cols, expected_cols) +}) + +test_that("find_cluster_columns puts 'louvain' column first if exists", { + + scdata <- mock_scdata() + sample_names <- c('A', 'B', 'C', 'D') + samples <- rep(sample_names, each = ncol(scdata)/4) + scdata$samples <- samples + + scdata$louvain <- scdata$RNA_snn_res.0.8 + + cluster_cols <- find_cluster_columns(scdata) + expect_equal(cluster_cols[1], 'louvain') +})