diff --git a/pipeline-runner/NAMESPACE b/pipeline-runner/NAMESPACE index e8e02bbf..3f3d2715 100644 --- a/pipeline-runner/NAMESPACE +++ b/pipeline-runner/NAMESPACE @@ -61,6 +61,8 @@ export(prepare_experiment) export(prepare_sct_integration) export(read_10x_annotations) export(read_10x_h5_file) +export(read_parse_annotations) +export(read_parse_files) export(reconstruct_seurat) export(remove_filtered_cells) export(remove_genes) diff --git a/pipeline-runner/R/gem2s-1-download_user_files.R b/pipeline-runner/R/gem2s-1-download_user_files.R index c15bee97..ffdcbff4 100644 --- a/pipeline-runner/R/gem2s-1-download_user_files.R +++ b/pipeline-runner/R/gem2s-1-download_user_files.R @@ -30,11 +30,11 @@ download_s3_files <- function(input, originals_bucket, input_dir, s3) { unlink(input_dir, recursive = TRUE) for (sample_id in sample_ids) { - for (file_type in file_types_by_technology[[technology]]) { s3_path <- sample_s3_paths[[sample_id]][[file_type]] local_fpath <- file.path(input_dir, sample_id, file_names[[file_type]]) + download_and_store(originals_bucket, s3_path, local_fpath, s3) } } diff --git a/pipeline-runner/R/gem2s-2-load_user_files.R b/pipeline-runner/R/gem2s-2-load_user_files.R index 50074a7d..4d1e21b5 100644 --- a/pipeline-runner/R/gem2s-2-load_user_files.R +++ b/pipeline-runner/R/gem2s-2-load_user_files.R @@ -20,12 +20,13 @@ load_user_files <- function(input, pipeline_config, prev_out, input_dir = INPUT_ # destructure previous output config <- prev_out$config - technology <- ifelse(config$input$type %in% c("rhapsody","10x_h5"),config$input$type , "10x") + technology <- ifelse(config$input$type %in% c("rhapsody", "10x_h5", "parse"), config$input$type, "10x") read_fun <- switch(technology, "10x" = read_10x_files, "rhapsody" = read_rhapsody_files, - "10x_h5" = read_10x_h5_file + "10x_h5" = read_10x_h5_file, + "parse" = read_parse_files ) message( @@ -45,6 +46,66 @@ load_user_files <- function(input, pipeline_config, prev_out, input_dir = INPUT_ return(res) } +#' Read Parse files. Calls readMTX +#' +#' @param config experiment settings +#' @param input_dir +#' +#' @return +#' @export +#' +read_parse_files <- function(config, input_dir) { + counts_list <- list() + annot_list <- list() + feature_types_list <- list() + + samples <- config$samples + + for (sample in samples) { + sample_dir <- file.path(input_dir, sample) + sample_fpaths <- list.files(sample_dir) + annot_fpath <- file.path(sample_dir, "all_genes.csv.gz") + + message("\nSample --> ", sample) + message( + "Reading files from ", + sample_dir, + " --> ", + paste(sample_fpaths, collapse = " - ") + ) + + annotations <- read_parse_annotations(annot_fpath, sample) + + mtx_path <- file.path(sample_dir, "DGE.mtx.gz") + barcodes_path <- file.path(sample_dir, "cell_metadata.csv.gz") + features_path <- file.path(sample_dir, "all_genes.csv.gz") + + + # We use readMtx instead of Seurat::ReadParse because the feature.column needs to be 1 instead of 2. + counts <- Seurat::ReadMtx( + mtx = mtx_path, cells = barcodes_path, features = features_path, + cell.column = 1, feature.column = 1, cell.sep = ",", + feature.sep = ",", skip.cell = 1, skip.feature = 1, mtx.transpose = TRUE + ) + + message( + sprintf( + "Sample %s has %s genes and %s barcodes", + sample, nrow(counts), ncol(counts) + ) + ) + + c(counts, annotations) %<-% filter_unnamed_features(counts, annotations, sample) + + counts_list[[sample]] <- counts + annot_list[[sample]] <- annotations + } + + annot <- format_annot(annot_list) + + return(list(counts_list = counts_list, annot = annot)) +} + #' Read h5 file #' #' Calls read10x_h5 @@ -67,13 +128,16 @@ read_10x_h5_file <- function(config, input_dir) { sample_counts_path <- file.path(sample_dir, sample_fpaths[[1]]) message("\nSample --> ", sample) - message("Reading files from ", - sample_dir, - " --> ", - paste(sample_fpaths, collapse = " - ")) + message( + "Reading files from ", + sample_dir, + " --> ", + paste(sample_fpaths, collapse = " - ") + ) - if (length(sample_fpaths) > 1) + if (length(sample_fpaths) > 1) { stop("Only one h5 expected. More files detected.") + } ungzipped_counts_path <- R.utils::gunzip(sample_counts_path) @@ -81,7 +145,7 @@ read_10x_h5_file <- function(config, input_dir) { counts <- Seurat::Read10X_h5(ungzipped_counts_path, use.names = FALSE) # use Gene Expression modality if multiple - if (methods::is(counts, 'list')) { + if (methods::is(counts, "list")) { counts_names <- counts_names$`Gene Expression` counts <- counts$`Gene Expression` } @@ -168,7 +232,6 @@ read_10x_files <- function(config, input_dir) { #' } #' read_rhapsody_files <- function(config, input_dir) { - # if we add support for other rhapsody file types (csv matrices) we should # check filetypes here and dispatch accordingly. @@ -274,6 +337,30 @@ parse_rhapsody_matrix <- function(config, input_dir) { return(list(counts_list = counts_list, annot = annot)) } +#' Loads annotations and formats the annotation file with the column names required by format_annot +#' +#' @param annot_fpath Path to annotations file +#' @param sample Sample name +#' +#' @return +#' @export +#' +read_parse_annotations <- function(annot_fpath, sample) { + annot <- read.delim(annot_fpath, header = TRUE, sep = ",") + + # Make the names in annot the same as the ones in the Read10x generated count matrix + # Since Seurat uses makes.unique, we need to as well. + # Only for the first column, at this stage column 1 are the counts matrix rownames. + annot[, 1] <- make.unique(annot[, 1]) + + # Equalizing number of columns + # We are removing the genome column, so far it's not used. + annot <- annot[, c(1, 2)] + colnames(annot) <- c("input", "name") + + return(annot) +} + #' Load and parse annotations from the feature files #' #' This function reads the features file into a data.frame, and takes steps @@ -574,4 +661,3 @@ filter_unnamed_features <- function(counts, annotations, sample) { return(list("counts" = counts, "annotations" = annotations)) } - diff --git a/pipeline-runner/R/qc-4-filter_gene_umi_outlier.R b/pipeline-runner/R/qc-4-filter_gene_umi_outlier.R index c0e6b46a..25f49c6d 100644 --- a/pipeline-runner/R/qc-4-filter_gene_umi_outlier.R +++ b/pipeline-runner/R/qc-4-filter_gene_umi_outlier.R @@ -55,7 +55,7 @@ filter_gene_umi_outlier <- function(scdata_list, config, sample_id, cells_id, ta if (safeTRUE(config$auto) || is.null(config$filterSettings$predictionInterval)) p_level <- min(0.001, 1 / ncol(sample_data)) else - p_level <- config$filterSettings$regressionTypeSettings[[type]]$p.level + p_level <- 1 - as.numeric(config$filterSettings$predictionInterval) p_level <- suppressWarnings(as.numeric(p_level)) if(is.na(p_level)) stop("p_level couldn't be interpreted as a number.") diff --git a/pipeline-runner/R/sysdata.rda b/pipeline-runner/R/sysdata.rda index 1de37724..24ac32ec 100644 Binary files a/pipeline-runner/R/sysdata.rda and b/pipeline-runner/R/sysdata.rda differ diff --git a/pipeline-runner/data-raw/sysdata.R b/pipeline-runner/data-raw/sysdata.R index f06838da..29cbcaab 100644 --- a/pipeline-runner/data-raw/sysdata.R +++ b/pipeline-runner/data-raw/sysdata.R @@ -77,7 +77,8 @@ file_types_by_technology <- list( "10x" = list("barcodes10x", "features10x", "matrix10x"), "seurat" = list("seurat"), "rhapsody" = list("rhapsody"), - "10x_h5" = list("10XH5") + "10x_h5" = list("10XH5"), + "parse" = list("barcodesParse", "featuresParse", "matrixParse") ) file_names <- list( @@ -86,7 +87,10 @@ file_names <- list( matrix10x = "matrix.mtx.gz", seurat = "r.rds", rhapsody = "expression_data.st.gz", - "10XH5" = "matrix.h5.gz" + "10XH5" = "matrix.h5.gz", + barcodesParse = "cell_metadata.csv.gz", + featuresParse = "all_genes.csv.gz", + matrixParse = "DGE.mtx.gz" ) MITOCHONDRIAL_REGEX <- "^mt[-:]" diff --git a/pipeline-runner/man/read_parse_annotations.Rd b/pipeline-runner/man/read_parse_annotations.Rd new file mode 100644 index 00000000..2f09a4ed --- /dev/null +++ b/pipeline-runner/man/read_parse_annotations.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gem2s-2-load_user_files.R +\name{read_parse_annotations} +\alias{read_parse_annotations} +\title{Loads annotations and formats the annotation file with the column names required by format_annot} +\usage{ +read_parse_annotations(annot_fpath, sample) +} +\arguments{ +\item{annot_fpath}{Path to annotations file} + +\item{sample}{Sample name} +} +\description{ +Loads annotations and formats the annotation file with the column names required by format_annot +} diff --git a/pipeline-runner/man/read_parse_files.Rd b/pipeline-runner/man/read_parse_files.Rd new file mode 100644 index 00000000..33a8404e --- /dev/null +++ b/pipeline-runner/man/read_parse_files.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gem2s-2-load_user_files.R +\name{read_parse_files} +\alias{read_parse_files} +\title{Read Parse files. Calls readMTX} +\usage{ +read_parse_files(config, input_dir) +} +\arguments{ +\item{config}{experiment settings} + +\item{input_dir}{} +} +\description{ +Read Parse files. Calls readMTX +} diff --git a/pipeline-runner/tests/testthat/mock-gem2s-input-files.R b/pipeline-runner/tests/testthat/mock-gem2s-input-files.R new file mode 100644 index 00000000..4a86f872 --- /dev/null +++ b/pipeline-runner/tests/testthat/mock-gem2s-input-files.R @@ -0,0 +1,34 @@ +mock_files <- function(counts, features, sample_dir, type = "10x") { + features_fnames <- list("10x" = "features.tsv", "parse" = "all_genes.csv") + barcodes_fnames <- list("10x" = "barcodes.tsv", "parse" = "cell_metadata.csv") + matrix_fnames <- list("10x" = "matrix.mtx", "parse" = "DGE.mtx") + file_sep <- list("10x" = "\t", "parse" = ",") + # Parse features and barcodes files include the header, so we need to emulate it. + col_names <- list("10x" = FALSE, "parse" = TRUE) + + # save features + features_path <- file.path(sample_dir, features_fnames[[type]]) + write.table(features, features_path, col.names = col_names[[type]], quote = FALSE, sep = file_sep[[type]], row.names = FALSE) + R.utils::gzip(features_path) + + # save barcodes + barcodes <- data.frame(barcodes = colnames(counts)) + barcodes_path <- file.path(sample_dir, barcodes_fnames[[type]]) + # writeLines(barcodes, barcodes_path) + write.table(barcodes, barcodes_path, col.names = col_names[[type]], quote = FALSE, sep = file_sep[[type]], row.names = FALSE) + R.utils::gzip(barcodes_path) + + # save Matrix + if (is(counts, "data.frame")) counts <- as.matrix(counts) + sparse.mat <- Matrix::Matrix(counts, sparse = TRUE) + matrix_path <- file.path(sample_dir, matrix_fnames[[type]]) + Matrix::writeMM(sparse.mat, matrix_path) + R.utils::gzip(matrix_path) +} + +local_experiment <- function(counts, features, experiment_dir, sample_dir, env = parent.frame(), type = "10x") { + sample_path <- file.path(experiment_dir, sample_dir) + dir.create(sample_path, recursive = T) + mock_files(counts, features, sample_path, type) + withr::defer(unlink(experiment_dir, recursive = T), envir = env) +} diff --git a/pipeline-runner/tests/testthat/test-gem2s-2-load_user_files.R b/pipeline-runner/tests/testthat/test-gem2s-2-load_user_files.R index 67e3b426..18881b5c 100644 --- a/pipeline-runner/tests/testthat/test-gem2s-2-load_user_files.R +++ b/pipeline-runner/tests/testthat/test-gem2s-2-load_user_files.R @@ -1,24 +1,4 @@ -mock_cellranger_files <- function(counts, features, sample_dir) { - - # save features - features_path <- file.path(sample_dir, "features.tsv") - write.table(features, features_path, col.names = FALSE, quote = FALSE, sep = "\t", row.names = FALSE) - R.utils::gzip(features_path) - - # save barcodes - barcodes <- colnames(counts) - barcodes_path <- file.path(sample_dir, "barcodes.tsv") - writeLines(barcodes, barcodes_path) - R.utils::gzip(barcodes_path) - - # save Matrix - if (is(counts, "data.frame")) counts <- as.matrix(counts) - sparse.mat <- Matrix::Matrix(counts, sparse = TRUE) - matrix_path <- file.path(sample_dir, "matrix.mtx") - Matrix::writeMM(sparse.mat, matrix_path) - R.utils::gzip(matrix_path) -} - +source("mock-gem2s-input-files.R") mock_counts <- function() { pbmc_raw <- read.table( @@ -46,22 +26,15 @@ mock_annotations <- function(counts, multiome = "no") { name = paste0("PEAKFAKE", seq_len(10)) ) annot <- rbind(annot, peaks) - annot$type <- c(rep("Gene Expression", nrow(counts)), - rep("Peaks", 10)) + annot$type <- c( + rep("Gene Expression", nrow(counts)), + rep("Peaks", 10) + ) } return(list("annot" = annot)) } -local_cellranger_experiment <- function(counts, features, experiment_dir, sample_dir, env = parent.frame()) { - - sample_path <- file.path(experiment_dir, sample_dir) - dir.create(sample_path, recursive = T) - mock_cellranger_files(counts, features, sample_path) - withr::defer(unlink(experiment_dir, recursive = T), envir = env) - -} - mock_lists <- function() { counts <- mock_counts() symbols <- rownames(counts) @@ -123,7 +96,7 @@ test_that("load_user_files loads a 10x count matrix", { experiment_dir <- "./experiment_1" sample <- "sample_a" - local_cellranger_experiment(counts, features, experiment_dir, sample) + local_experiment(counts, features, experiment_dir, sample) prev_out <- list(config = list(samples = sample, input = list(type = "10x"))) @@ -146,7 +119,7 @@ test_that("load_user_files generates feature annotation for 10x data", { experiment_dir <- "./experiment_1" sample <- "sample_a" - local_cellranger_experiment(counts, features, experiment_dir, sample) + local_experiment(counts, features, experiment_dir, sample) prev_out <- list(config = list(samples = sample, input = list(type = "10x"))) out <- load_user_files(NULL, NULL, prev_out, experiment_dir)$output @@ -155,7 +128,6 @@ test_that("load_user_files generates feature annotation for 10x data", { expect_true( all(c("input", "name", "original_name") %in% colnames(out$annot)) ) - }) @@ -173,7 +145,7 @@ test_that("load_user_files deduplicates gene symbols for 10x data", { experiment_dir <- "./experiment_1" sample <- "sample_a" - local_cellranger_experiment(counts, features, experiment_dir, sample) + local_experiment(counts, features, experiment_dir, sample) prev_out <- list(config = list(samples = sample, input = list(type = "10x"))) annot <- load_user_files(NULL, NULL, prev_out, experiment_dir)$output$annot @@ -198,7 +170,7 @@ test_that("load_user_files uses appropriate feature columns for 10x data", { experiment_dir <- "./experiment_1" sample <- "sample_a" - local_cellranger_experiment(counts, features, experiment_dir, sample) + local_experiment(counts, features, experiment_dir, sample) prev_out <- list(config = list(samples = sample, input = list(type = "10x"))) out <- load_user_files(NULL, NULL, prev_out, experiment_dir)$output @@ -214,7 +186,6 @@ test_that("load_user_files uses appropriate feature columns for 10x data", { # symbols are in column 'name' of annot expect_equal(out$annot$name, symbols) - }) test_that("load_user_files uses first column if no Gene Expression column present in features file", { @@ -226,20 +197,19 @@ test_that("load_user_files uses first column if no Gene Expression column presen features <- data.frame( ensid = paste0("ENSFAKE", seq_len(nrow(counts))), symbol = symbols, - type = c(rep("bla", 42), rep("not this one", nrow(counts)-42)) + type = c(rep("bla", 42), rep("not this one", nrow(counts) - 42)) ) experiment_dir <- "./experiment_1" sample <- "sample_a" - local_cellranger_experiment(counts, features, experiment_dir, sample) + local_experiment(counts, features, experiment_dir, sample) prev_out <- list(config = list(samples = sample, input = list(type = "10x"))) out <- load_user_files(NULL, NULL, prev_out, experiment_dir)$output # expect we keep only the rows corresponding to the first slot expect_equal(nrow(out$counts_list$sample_a), 42) - }) @@ -262,8 +232,8 @@ test_that("load_user_files loads 10x multisample experiments", { experiment_dir <- "./experiment_1" samples <- c("sample_a", "sample_b") - local_cellranger_experiment(counts, features, experiment_dir, samples[1]) - local_cellranger_experiment(counts, features2, experiment_dir, samples[2]) + local_experiment(counts, features, experiment_dir, samples[1]) + local_experiment(counts, features2, experiment_dir, samples[2]) prev_out <- list(config = list(samples = samples, input = list(type = "10x"))) @@ -281,7 +251,6 @@ test_that("load_user_files loads 10x multisample experiments", { length(unique_ensids), length(c(ensids, ensids2)) ) - }) test_that("read_10x_files returns error if files missing", { @@ -295,7 +264,7 @@ test_that("read_10x_files returns error if files missing", { sample <- "sample_a" sample_dir <- file.path(experiment_dir, sample) - local_cellranger_experiment(counts, features, experiment_dir, sample) + local_experiment(counts, features, experiment_dir, sample) prev_out <- list(config = list(samples = sample, input = list(type = "10x"))) @@ -313,11 +282,9 @@ test_that("read_10x_files returns error if files missing", { file.rename(file.path(sample_dir, file), file.path(sample_dir, "blah")) expect_error(supressWarnings(load_user_files(NULL, NULL, prev_out, experiment_dir), "cannot open the connection")) file.rename(file.path(sample_dir, "blah"), file.path(sample_dir, file)) - }) test_that("read_10x_annotations inverts columns if Gene Expression in second position", { - counts <- mock_counts() # inverted symbol and colums features <- data.frame( @@ -329,7 +296,7 @@ test_that("read_10x_annotations inverts columns if Gene Expression in second pos sample <- "sample_a" annot_fpath <- file.path(experiment_dir, sample, "features.tsv.gz") - local_cellranger_experiment(counts, features, experiment_dir, sample) + local_experiment(counts, features, experiment_dir, sample) res <- read_10x_annotations(annot_fpath, sample) @@ -339,7 +306,6 @@ test_that("read_10x_annotations inverts columns if Gene Expression in second pos expect_equal(get_feature_types(res$annot), IDS_SYM) expect_equal(res$feature_types, IDS_SYM) expect_equal(res$gene_column, 2) - }) test_that("read_10x_annotations duplicates column if there's only one column in features file", { @@ -354,14 +320,13 @@ test_that("read_10x_annotations duplicates column if there's only one column in sample <- "sample_a" annot_fpath <- file.path(experiment_dir, sample, "features.tsv.gz") - local_cellranger_experiment(counts, features, experiment_dir, sample) + local_experiment(counts, features, experiment_dir, sample) res <- read_10x_annotations(annot_fpath, sample) expect_equal(ncol(res$annot), 2) expect_equal(res$annot$name, res$annot$input) expect_equal(res$annot$name, features$ensid) - }) test_that("get_feature_types properly determines types", { @@ -380,38 +345,38 @@ test_that("get_feature_types properly determines types", { test_that("get_feature_types identifies mixed columns", { annot_list <- list( - sample1 = data.frame(ENSID = c(paste0("ENS", 1:5),paste0("gene",1:5)), SYMBOL = c(paste0("ENS", 1:4),paste0("gene",1:6))), - sample2 = data.frame(ENSID = c(paste0("ENS", 1:4),paste0("gene",1:6)), SYMBOL = paste0("gene", 1:10)) + sample1 = data.frame(ENSID = c(paste0("ENS", 1:5), paste0("gene", 1:5)), SYMBOL = c(paste0("ENS", 1:4), paste0("gene", 1:6))), + sample2 = data.frame(ENSID = c(paste0("ENS", 1:4), paste0("gene", 1:6)), SYMBOL = paste0("gene", 1:10)) ) - expect_equal(get_feature_types(annot_list[["sample1"]]),IDS_SYM) - expect_true(get_feature_types(annot_list[["sample2"]])==SYM_SYM) + expect_equal(get_feature_types(annot_list[["sample1"]]), IDS_SYM) + expect_true(get_feature_types(annot_list[["sample2"]]) == SYM_SYM) }) -test_that("normalize_annotation_types does nothing if all annotation types are the same",{ +test_that("normalize_annotation_types does nothing if all annotation types are the same", { input <- mock_lists() - features_types_list <- list(sample1=get_feature_types(input$annot_list$sample1),sample2=get_feature_types(input$annot_list$sample2)) + features_types_list <- list(sample1 = get_feature_types(input$annot_list$sample1), sample2 = get_feature_types(input$annot_list$sample2)) counts_list <- input$counts_list annot_list <- input$annot_list - res <- normalize_annotation_types(annot_list,counts_list,features_types_list, samples) + res <- normalize_annotation_types(annot_list, counts_list, features_types_list, samples) - expect_equal(res[[1]],counts_list) - expect_equal(res[[2]],annot_list) + expect_equal(res[[1]], counts_list) + expect_equal(res[[2]], annot_list) }) -test_that("normalize_annotation_types does nothing if there are no samples with annotations",{ +test_that("normalize_annotation_types does nothing if there are no samples with annotations", { input <- mock_lists() - features_types_list <- list(sample1=get_feature_types(input$annot_list$sample1),sample2=get_feature_types(input$annot_list$sample2)) + features_types_list <- list(sample1 = get_feature_types(input$annot_list$sample1), sample2 = get_feature_types(input$annot_list$sample2)) - samples <- c("sample1","sample2") - for(sample in samples){ + samples <- c("sample1", "sample2") + for (sample in samples) { input$annot_list[[sample]]$input <- input$annot_list[[sample]]$name - rownames(input$counts_list[[sample]]) <- input$annot_list[[sample]]$input + rownames(input$counts_list[[sample]]) <- input$annot_list[[sample]]$input } counts_list <- input$counts_list @@ -419,12 +384,12 @@ test_that("normalize_annotation_types does nothing if there are no samples with res <- normalize_annotation_types(annot_list, counts_list, features_types_list, samples) - expect_equal(res[[1]],counts_list) - expect_equal(res[[2]],annot_list) + expect_equal(res[[1]], counts_list) + expect_equal(res[[2]], annot_list) }) -test_that("normalize_annotation_types infers gene ids from symbols and corrects counts rownames",{ +test_that("normalize_annotation_types infers gene ids from symbols and corrects counts rownames", { input <- mock_lists() sample2_annot <- input$annot_list$sample2 @@ -432,15 +397,15 @@ test_that("normalize_annotation_types infers gene ids from symbols and corrects input$annot_list$sample2 <- sample2_annot rownames(input$counts_list$sample2) <- sample2_annot$input - features_types_list <- list(sample1=get_feature_types(input$annot_list$sample1),sample2=get_feature_types(input$annot_list$sample2)) + features_types_list <- list(sample1 = get_feature_types(input$annot_list$sample1), sample2 = get_feature_types(input$annot_list$sample2)) - res <-normalize_annotation_types(input$annot_list,input$counts_list,features_types_list,samples=list("sample1","sample2")) + res <- normalize_annotation_types(input$annot_list, input$counts_list, features_types_list, samples = list("sample1", "sample2")) expect_equal(res$annot_list$sample2, input$annot_list$sample1) expect_equal(rownames(res$counts_list$sample2), res$annot_list$sample2$input) }) -test_that("normalize_annotation_types infers gene symbols from ids",{ +test_that("normalize_annotation_types infers gene symbols from ids", { input <- mock_lists() sample2_annot <- input$annot_list$sample2 @@ -448,59 +413,58 @@ test_that("normalize_annotation_types infers gene symbols from ids",{ input$annot_list$sample2 <- sample2_annot rownames(input$counts_list$sample2) <- sample2_annot$input - features_types_list <- list(sample1=get_feature_types(input$annot_list$sample1),sample2=get_feature_types(input$annot_list$sample2)) + features_types_list <- list(sample1 = get_feature_types(input$annot_list$sample1), sample2 = get_feature_types(input$annot_list$sample2)) - res <- normalize_annotation_types(input$annot_list,input$counts_list,features_types_list,samples=list("sample1","sample2")) + res <- normalize_annotation_types(input$annot_list, input$counts_list, features_types_list, samples = list("sample1", "sample2")) - expect_equal(res$annot_list$sample2,input$annot_list$sample1) - expect_equal(rownames(res$counts_list$sample2),input$annot_list$sample2$input) + expect_equal(res$annot_list$sample2, input$annot_list$sample1) + expect_equal(rownames(res$counts_list$sample2), input$annot_list$sample2$input) }) -test_that("normalize_annotation_types infers ids with incomplete match",{ +test_that("normalize_annotation_types infers ids with incomplete match", { input <- mock_lists() sample2_annot <- input$annot_list$sample2 - sample2_annot$name[1:nrow(sample2_annot)%%2==1] <- paste0("gene",(1:(nrow(sample2_annot)/2))) + sample2_annot$name[1:nrow(sample2_annot) %% 2 == 1] <- paste0("gene", (1:(nrow(sample2_annot) / 2))) sample2_annot$input <- sample2_annot$name rownames(input$counts_list$sample2) <- sample2_annot$input input$annot_list$sample2 <- sample2_annot - features_types_list <- list(sample1=get_feature_types(input$annot_list$sample1),sample2=get_feature_types(input$annot_list$sample2)) + features_types_list <- list(sample1 = get_feature_types(input$annot_list$sample1), sample2 = get_feature_types(input$annot_list$sample2)) - res <- normalize_annotation_types(input$annot_list,input$counts_list,features_types_list,samples=list("sample1","sample2")) + res <- normalize_annotation_types(input$annot_list, input$counts_list, features_types_list, samples = list("sample1", "sample2")) expected_annot <- input$annot_list$sample1 - expected_annot$input[1:nrow(expected_annot)%%2==1] <- paste0("gene",(1:(nrow(expected_annot)/2))) - expected_annot$name[1:nrow(expected_annot)%%2==1] <- paste0("gene",(1:(nrow(expected_annot)/2))) + expected_annot$input[1:nrow(expected_annot) %% 2 == 1] <- paste0("gene", (1:(nrow(expected_annot) / 2))) + expected_annot$name[1:nrow(expected_annot) %% 2 == 1] <- paste0("gene", (1:(nrow(expected_annot) / 2))) - expect_equal(res$annot_list$sample2,expected_annot) - expect_equal(rownames(res$counts_list$sample2),res$annot_list$sample2$input) + expect_equal(res$annot_list$sample2, expected_annot) + expect_equal(rownames(res$counts_list$sample2), res$annot_list$sample2$input) }) -test_that("normalize_annotation_types infers symbols with incomplete match and doesnt modify ids",{ +test_that("normalize_annotation_types infers symbols with incomplete match and doesnt modify ids", { input <- mock_lists() sample2_annot <- input$annot_list$sample2 sample2_annot$name <- sample2_annot$input - sample2_annot$input[1:nrow(sample2_annot)%%2==1] <- paste0("gene",(1:(nrow(sample2_annot)/2))) + sample2_annot$input[1:nrow(sample2_annot) %% 2 == 1] <- paste0("gene", (1:(nrow(sample2_annot) / 2))) rownames(input$counts_list$sample2) <- sample2_annot$input input$annot_list$sample2 <- sample2_annot - features_types_list <- list(sample1=get_feature_types(input$annot_list$sample1),sample2=get_feature_types(input$annot_list$sample2)) + features_types_list <- list(sample1 = get_feature_types(input$annot_list$sample1), sample2 = get_feature_types(input$annot_list$sample2)) - res <- normalize_annotation_types(input$annot_list,input$counts_list,features_types_list,samples=list("sample1","sample2")) + res <- normalize_annotation_types(input$annot_list, input$counts_list, features_types_list, samples = list("sample1", "sample2")) expected_annot <- input$annot_list$sample1 - expected_annot$input[1:nrow(expected_annot)%%2==1] <- paste0("gene",(1:(nrow(expected_annot)/2))) - expected_annot$name[1:nrow(expected_annot)%%2==1] <- input$annot_list$sample1$input[1:nrow(expected_annot)%%2==1] + expected_annot$input[1:nrow(expected_annot) %% 2 == 1] <- paste0("gene", (1:(nrow(expected_annot) / 2))) + expected_annot$name[1:nrow(expected_annot) %% 2 == 1] <- input$annot_list$sample1$input[1:nrow(expected_annot) %% 2 == 1] - expect_equal(res$annot_list$sample2,expected_annot) - expect_equal(rownames(res$counts_list$sample2),input$annot_list$sample2$input) - expect_equal(res$annot_list$sample2$input,input$annot_list$sample2$input) + expect_equal(res$annot_list$sample2, expected_annot) + expect_equal(rownames(res$counts_list$sample2), input$annot_list$sample2$input) + expect_equal(res$annot_list$sample2$input, input$annot_list$sample2$input) }) -test_that("normalize_annotation_types properly infers ids with more than 2 samples",{ - +test_that("normalize_annotation_types properly infers ids with more than 2 samples", { input <- mock_lists() annot <- input$annot_list$sample1 @@ -538,12 +502,13 @@ test_that("normalize_annotation_types properly infers ids with more than 2 sampl paste0("gene", (1:(nrow(expected_annot) / 2))) expect_equal(res$annot_list$sample2, expected_annot) - expect_equal(rownames(res$counts_list$sample2), - res$annot_list$sample2$input) + expect_equal( + rownames(res$counts_list$sample2), + res$annot_list$sample2$input + ) }) test_that("normalize_annotation_types throws with incompatible feature types", { - input <- mock_lists() # sample 1 with ids only @@ -567,75 +532,75 @@ test_that("normalize_annotation_types throws with incompatible feature types", { input$counts_list, feature_types_list, samples = list("sample1", "sample2") - ), "Incompatible features detected.") - + ), "Incompatible features detected." + ) }) -test_that("duplicated genes dont lead to any rowname duplication",{ +test_that("duplicated genes dont lead to any rowname duplication", { input <- mock_lists() annot <- input$annot_list$sample1 counts <- input$counts_list$sample1 sample2_annot <- input$annot_list$sample2 - sample2_annot$name[1:nrow(sample2_annot)%%2==1] <- paste0("gene",(1:(nrow(sample2_annot)/2))) + sample2_annot$name[1:nrow(sample2_annot) %% 2 == 1] <- paste0("gene", (1:(nrow(sample2_annot) / 2))) sample2_annot$input <- sample2_annot$name - sample2_annot[1,1] <- sample2_annot[2,1] - sample2_annot[1,1] <- annot[2,1] + sample2_annot[1, 1] <- sample2_annot[2, 1] + sample2_annot[1, 1] <- annot[2, 1] rownames(input$counts_list$sample2) <- sample2_annot$input input$annot_list$sample2 <- sample2_annot - features_types_list <- list(sample1=get_feature_types(input$annot_list$sample1),sample2=get_feature_types(input$annot_list$sample2)) + features_types_list <- list(sample1 = get_feature_types(input$annot_list$sample1), sample2 = get_feature_types(input$annot_list$sample2)) - res <- normalize_annotation_types(input$annot_list,input$counts_list,features_types_list,samples=list("sample1","sample2")) + res <- normalize_annotation_types(input$annot_list, input$counts_list, features_types_list, samples = list("sample1", "sample2")) expected_annot <- annot - expected_annot$input[1:nrow(expected_annot)%%2==1] <- paste0("gene",(1:(nrow(expected_annot)/2))) - expected_annot$name[1:nrow(expected_annot)%%2==1] <- paste0("gene",(1:(nrow(expected_annot)/2))) + expected_annot$input[1:nrow(expected_annot) %% 2 == 1] <- paste0("gene", (1:(nrow(expected_annot) / 2))) + expected_annot$name[1:nrow(expected_annot) %% 2 == 1] <- paste0("gene", (1:(nrow(expected_annot) / 2))) expected_annot$input[[1]] <- "ENSFAKE2" expected_annot$input[[2]] <- "ENSFAKE2.1" - expect_equal(res$annot_list$sample2,expected_annot) - expect_equal(rownames(res$counts_list$sample2),expected_annot$input) + expect_equal(res$annot_list$sample2, expected_annot) + expect_equal(rownames(res$counts_list$sample2), expected_annot$input) }) -test_that("Mislabeling of features types to symbols results in no changes",{ +test_that("Mislabeling of features types to symbols results in no changes", { input <- mock_lists() sample2_annot <- input$annot_list$sample2 - sample2_annot$input[1:(nrow(sample2_annot)/2+1)] <- paste0("ab",1:(nrow(sample2_annot)/2+1)) - sample2_annot$name[1:(nrow(sample2_annot)/2+1)] <- paste0("ab",1:(nrow(sample2_annot)/2+1)) + sample2_annot$input[1:(nrow(sample2_annot) / 2 + 1)] <- paste0("ab", 1:(nrow(sample2_annot) / 2 + 1)) + sample2_annot$name[1:(nrow(sample2_annot) / 2 + 1)] <- paste0("ab", 1:(nrow(sample2_annot) / 2 + 1)) input$annot_list$sample2 <- sample2_annot rownames(input$counts_list$sample2) <- sample2_annot$input - features_types_list <- list(sample1=get_feature_types(input$annot_list$sample1),sample2=get_feature_types(input$annot_list$sample2)) + features_types_list <- list(sample1 = get_feature_types(input$annot_list$sample1), sample2 = get_feature_types(input$annot_list$sample2)) - res <- normalize_annotation_types(input$annot_list,input$counts_list,features_types_list,samples=list("sample1","sample2")) + res <- normalize_annotation_types(input$annot_list, input$counts_list, features_types_list, samples = list("sample1", "sample2")) - expect_equal(res$annot_list$sample2,input$annot_list$sample2) - expect_equal(rownames(res$counts_list$sample2),input$annot_list$sample2$input) + expect_equal(res$annot_list$sample2, input$annot_list$sample2) + expect_equal(rownames(res$counts_list$sample2), input$annot_list$sample2$input) }) -test_that("Mislabeling of features types to ids results in no changes",{ +test_that("Mislabeling of features types to ids results in no changes", { input <- mock_lists() sample2_annot <- input$annot_list$sample2 - sample2_annot$input[1:(nrow(sample2_annot)/2+1)] <- paste0("ENS",1:(nrow(sample2_annot)/2+1)) - sample2_annot$name[1:(nrow(sample2_annot)/2+1)] <- paste0("ENS",1:(nrow(sample2_annot)/2+1)) + sample2_annot$input[1:(nrow(sample2_annot) / 2 + 1)] <- paste0("ENS", 1:(nrow(sample2_annot) / 2 + 1)) + sample2_annot$name[1:(nrow(sample2_annot) / 2 + 1)] <- paste0("ENS", 1:(nrow(sample2_annot) / 2 + 1)) input$annot_list$sample2 <- sample2_annot rownames(input$counts_list$sample2) <- sample2_annot$input - features_types_list <- list(sample1=get_feature_types(input$annot_list$sample1),sample2=get_feature_types(input$annot_list$sample2)) + features_types_list <- list(sample1 = get_feature_types(input$annot_list$sample1), sample2 = get_feature_types(input$annot_list$sample2)) - res <- normalize_annotation_types(input$annot_list,input$counts_list,features_types_list,samples=list("sample1","sample2")) + res <- normalize_annotation_types(input$annot_list, input$counts_list, features_types_list, samples = list("sample1", "sample2")) - expect_equal(res$annot_list$sample2,input$annot_list$sample2) - expect_equal(rownames(res$counts_list$sample2),input$annot_list$sample2$input) + expect_equal(res$annot_list$sample2, input$annot_list$sample2) + expect_equal(rownames(res$counts_list$sample2), input$annot_list$sample2$input) }) @@ -655,12 +620,10 @@ test_that("filter_unnamed_features removes one row without annotations", { expect_equal(nrow(res$counts), nrow(counts) - 1) expect_equal(nrow(res_annot), nrow(annotations$annot) - 1) - }) test_that("filter_unnamed_features removes many rows with empty annotations", { - counts <- mock_counts() nameless_genes <- sample(1:nrow(counts), size = 10) @@ -700,7 +663,6 @@ test_that("filter_unnamed_features doesn't remove anything if no empty rownames }) test_that("filter_unnamed_features replaces annotations if available", { - unnamed_pat <- "^\\.[0-9]+$|^$" counts <- mock_counts() @@ -726,7 +688,6 @@ test_that("filter_unnamed_features replaces annotations if available", { expect_equal(nrow(res$counts), nrow(counts) - 5) expect_equal(nrow(res_annot), nrow(annotations$annot) - 5) - }) test_that("read_10x_annotations removes features different from Gene Expression", { @@ -738,7 +699,7 @@ test_that("read_10x_annotations removes features different from Gene Expression" sample <- "sample_a" annot_fpath <- file.path(experiment_dir, sample, "features.tsv.gz") - local_cellranger_experiment(counts, annotations, experiment_dir, sample) + local_experiment(counts, annotations, experiment_dir, sample) res <- read_10x_annotations(annot_fpath, sample) diff --git a/pipeline-runner/tests/testthat/test-gem2s-2-load_user_files_parse.R b/pipeline-runner/tests/testthat/test-gem2s-2-load_user_files_parse.R new file mode 100644 index 00000000..797053d3 --- /dev/null +++ b/pipeline-runner/tests/testthat/test-gem2s-2-load_user_files_parse.R @@ -0,0 +1,156 @@ +source("mock-gem2s-input-files.R") + +# Function to generate generic gene IDs and names +mock_features <- function(num_rows) { + # Generate generic gene IDs and names + gene_ids <- paste0("ENSMUSG", seq(1:num_rows)) + gene_names <- paste0("Gene", seq(1:num_rows)) + + # Create the mock data frame + annotations <- data.frame( + gene_id = gene_ids, + gene_name = gene_names, + genome = rep("mm38", num_rows) + ) + + return(annotations) +} + +mock_counts <- function(num_genes, num_cells) { + counts <- + Matrix::Matrix( + rnbinom( + num_genes * num_cells, + mu = 0.3, + size = 5 + ), + ncol = num_cells, + byrow = TRUE + ) + + colnames(counts) <- make.unique(paste0(sample(1:96, num_cells, replace = TRUE), sample(1:96, num_cells, replace = TRUE), "_", sample(1:96, num_cells, replace = TRUE))) + + return(counts) +} + +test_that("load_user_files loads a parse count matrix", { + counts <- mock_counts(100, 100) + features <- mock_features(100) + + experiment_dir <- "./experiment_1" + sample <- "sample_a" + + local_experiment(counts, features, experiment_dir, sample, type = "parse") + + prev_out <- list(config = list(samples = sample, input = list(type = "parse"))) + out <- load_user_files(NULL, NULL, prev_out, experiment_dir)$output + + expect_true("counts_list" %in% names(out)) + expect_true(sample %in% names(out$counts_list)) + + expect_s4_class(out$counts_list[[1]], "dgCMatrix") +}) + +test_that("load_user_files generates feature annotation for parse data", { + counts <- mock_counts(100, 100) + features <- mock_features(100) + + experiment_dir <- "./experiment_1" + sample <- "sample_a" + + local_experiment(counts, features, experiment_dir, sample, type = "parse") + + prev_out <- list(config = list(samples = sample, input = list(type = "parse"))) + out <- load_user_files(NULL, NULL, prev_out, experiment_dir)$output + + expect_true("annot" %in% names(out)) + expect_true( + all(c("input", "name", "original_name") %in% colnames(out$annot)) + ) +}) + +test_that("load_user_files uses appropriate feature columns for parse data", { + counts <- mock_counts(100, 100) + features <- mock_features(100) + + experiment_dir <- "./experiment_1" + sample <- "sample_a" + + local_experiment(counts, features, experiment_dir, sample, type = "parse") + + + prev_out <- list(config = list(samples = sample, input = list(type = "parse"))) + out <- load_user_files(NULL, NULL, prev_out, experiment_dir)$output + + # ensembl ids are counts row names + expect_equal( + features$gene_id, + row.names(out$counts_list[[1]]) + ) + + # ensembl ids are in column 'input' of annot + expect_equal(out$annot$input, features$gene_id) + + # symbols are in column 'name' of annot + expect_equal(out$annot$name, features$gene_name) +}) + +test_that("load_user_files loads 10x multisample experiments", { + features <- mock_features(100) + + experiment_dir <- "./experiment_1" + samples <- c("sample_a", "sample_b") + + local_experiment(mock_counts(100, 100), features, experiment_dir, samples[1], type = "parse") + local_experiment(mock_counts(100, 100), features, experiment_dir, samples[2], type = "parse") + + + prev_out <- list(config = list(samples = samples, input = list(type = "parse"))) + out <- load_user_files(NULL, NULL, prev_out, experiment_dir)$output + + # loaded both + expect_equal(names(out$counts_list), samples) + + expect_equal(out$annot$name, features$gene_name) +}) + +test_that("read_parse_files returns error if files missing", { + counts <- mock_counts(100, 100) + features <- mock_features(100) + + experiment_dir <- "./experiment_1" + sample <- "sample_a" + + local_experiment(counts, features, experiment_dir, sample, type = "parse") + + prev_out <- list(config = list(samples = sample, input = list(type = "parse"))) + + files <- c("cell_metadata.csv.gz", "DGE.mtx.gz", "all_genes.csv.gz") + + sample_dir <- file.path(experiment_dir, sample) + + # remove files one by one renaming + for (file in files) { + file.rename(file.path(sample_dir, file), file.path(sample_dir, "blah")) + expect_error(supressWarnings(load_user_files(NULL, NULL, prev_out, experiment_dir), "Cannot find")) + file.rename(file.path(sample_dir, "blah"), file.path(sample_dir, file)) + } +}) + +test_that("read_parse_annotations makes features unique", { + counts <- mock_counts(100, 100) + features <- mock_features(100) + + features$gene_id[[2]] <- features$gene_id[[1]] + + experiment_dir <- "./experiment_1" + sample <- "sample_a" + + local_experiment(counts, features, experiment_dir, sample, type = "parse") + + prev_out <- list(config = list(samples = sample, input = list(type = "parse"))) + + out <- load_user_files(NULL, NULL, prev_out, experiment_dir)$output + + expect_true(out$annot$input[[2]] != out$annot$input[[1]] & rownames(out$counts_list$sample_a)[[2]] != rownames(out$counts_list$sample_a)[[1]]) +}) diff --git a/pipeline-runner/tests/testthat/test-qc-4-filter_gene_umi_outlier.R b/pipeline-runner/tests/testthat/test-qc-4-filter_gene_umi_outlier.R index 78d09a2c..e7457304 100644 --- a/pipeline-runner/tests/testthat/test-qc-4-filter_gene_umi_outlier.R +++ b/pipeline-runner/tests/testthat/test-qc-4-filter_gene_umi_outlier.R @@ -164,6 +164,7 @@ test_that("Gene UMI filter works if input is a a float-interpretable string", { cells_id <- mock_ids() nstart <- ncol(scdata_list[[sample_2_id]]) config$auto <- FALSE + config$filterSettings$predictionInterval <- "0.3" out_number <- filter_gene_umi_outlier(scdata_list, config, sample_2_id, cells_id)