Merge pull request #350 from biomage-org/support-parse-data

Support parse data
hms-dbmi-cellenics · Dec 21, 2023 · 993ebc4 · 993ebc4
2 parents 10a4bea + 309ad4e
commit 993ebc4
Show file tree

Hide file tree

Showing 10 changed files with 416 additions and 141 deletions.
diff --git a/pipeline-runner/NAMESPACE b/pipeline-runner/NAMESPACE
@@ -61,6 +61,8 @@ export(prepare_experiment)
 export(prepare_sct_integration)
 export(read_10x_annotations)
 export(read_10x_h5_file)
+export(read_parse_annotations)
+export(read_parse_files)
 export(reconstruct_seurat)
 export(remove_filtered_cells)
 export(remove_genes)

diff --git a/pipeline-runner/R/gem2s-1-download_user_files.R b/pipeline-runner/R/gem2s-1-download_user_files.R
@@ -30,11 +30,11 @@ download_s3_files <- function(input, originals_bucket, input_dir, s3) {
   unlink(input_dir, recursive = TRUE)
 
   for (sample_id in sample_ids) {
-
     for (file_type in file_types_by_technology[[technology]]) {
       s3_path <- sample_s3_paths[[sample_id]][[file_type]]
 
       local_fpath <- file.path(input_dir, sample_id, file_names[[file_type]])
+
       download_and_store(originals_bucket, s3_path, local_fpath, s3)
     }
   }

diff --git a/pipeline-runner/R/gem2s-2-load_user_files.R b/pipeline-runner/R/gem2s-2-load_user_files.R
@@ -20,12 +20,13 @@ load_user_files <- function(input, pipeline_config, prev_out, input_dir = INPUT_
   # destructure previous output
   config <- prev_out$config
 
-  technology <- ifelse(config$input$type %in% c("rhapsody","10x_h5"),config$input$type , "10x")
+  technology <- ifelse(config$input$type %in% c("rhapsody", "10x_h5", "parse"), config$input$type, "10x")
 
   read_fun <- switch(technology,
     "10x" = read_10x_files,
     "rhapsody" = read_rhapsody_files,
-    "10x_h5" = read_10x_h5_file
+    "10x_h5" = read_10x_h5_file,
+    "parse" = read_parse_files
   )
 
   message(
@@ -45,6 +46,66 @@ load_user_files <- function(input, pipeline_config, prev_out, input_dir = INPUT_
   return(res)
 }
 
+#' Read Parse files. Calls readMTX
+#'
+#' @param config experiment settings
+#' @param input_dir
+#'
+#' @return
+#' @export
+#'
+read_parse_files <- function(config, input_dir) {
+  counts_list <- list()
+  annot_list <- list()
+  feature_types_list <- list()
+
+  samples <- config$samples
+
+  for (sample in samples) {
+    sample_dir <- file.path(input_dir, sample)
+    sample_fpaths <- list.files(sample_dir)
+    annot_fpath <- file.path(sample_dir, "all_genes.csv.gz")
+
+    message("\nSample --> ", sample)
+    message(
+      "Reading files from ",
+      sample_dir,
+      " --> ",
+      paste(sample_fpaths, collapse = " - ")
+    )
+
+    annotations <- read_parse_annotations(annot_fpath, sample)
+
+    mtx_path <- file.path(sample_dir, "DGE.mtx.gz")
+    barcodes_path <- file.path(sample_dir, "cell_metadata.csv.gz")
+    features_path <- file.path(sample_dir, "all_genes.csv.gz")
+
+
+    # We use readMtx instead of Seurat::ReadParse because the feature.column needs to be 1 instead of 2.
+    counts <- Seurat::ReadMtx(
+      mtx = mtx_path, cells = barcodes_path, features = features_path,
+      cell.column = 1, feature.column = 1, cell.sep = ",",
+      feature.sep = ",", skip.cell = 1, skip.feature = 1, mtx.transpose = TRUE
+    )
+
+    message(
+      sprintf(
+        "Sample %s has %s genes and %s barcodes",
+        sample, nrow(counts), ncol(counts)
+      )
+    )
+
+    c(counts, annotations) %<-% filter_unnamed_features(counts, annotations, sample)
+
+    counts_list[[sample]] <- counts
+    annot_list[[sample]] <- annotations
+  }
+
+  annot <- format_annot(annot_list)
+
+  return(list(counts_list = counts_list, annot = annot))
+}
+
 #' Read h5 file
 #'
 #' Calls read10x_h5
@@ -67,21 +128,24 @@ read_10x_h5_file <- function(config, input_dir) {
     sample_counts_path <- file.path(sample_dir, sample_fpaths[[1]])
 
     message("\nSample --> ", sample)
-    message("Reading files from ",
-            sample_dir,
-            " --> ",
-            paste(sample_fpaths, collapse = " - "))
+    message(
+      "Reading files from ",
+      sample_dir,
+      " --> ",
+      paste(sample_fpaths, collapse = " - ")
+    )
 
-    if (length(sample_fpaths) > 1)
+    if (length(sample_fpaths) > 1) {
       stop("Only one h5 expected. More files detected.")
+    }
 
     ungzipped_counts_path <- R.utils::gunzip(sample_counts_path)
 
     counts_names <- Seurat::Read10X_h5(ungzipped_counts_path)
     counts <- Seurat::Read10X_h5(ungzipped_counts_path, use.names = FALSE)
 
     # use Gene Expression modality if multiple
-    if (methods::is(counts, 'list')) {
+    if (methods::is(counts, "list")) {
       counts_names <- counts_names$`Gene Expression`
       counts <- counts$`Gene Expression`
     }
@@ -168,7 +232,6 @@ read_10x_files <- function(config, input_dir) {
 #'   }
 #'
 read_rhapsody_files <- function(config, input_dir) {
-
   # if we add support for other rhapsody file types (csv matrices) we should
   # check filetypes here and dispatch accordingly.
 
@@ -274,6 +337,30 @@ parse_rhapsody_matrix <- function(config, input_dir) {
   return(list(counts_list = counts_list, annot = annot))
 }
 
+#' Loads annotations and formats the annotation file with the column names required by format_annot
+#'
+#' @param annot_fpath Path to annotations file
+#' @param sample Sample name
+#'
+#' @return
+#' @export
+#'
+read_parse_annotations <- function(annot_fpath, sample) {
+  annot <- read.delim(annot_fpath, header = TRUE, sep = ",")
+
+  # Make the names in annot the same as the ones in the Read10x generated count matrix
+  # Since Seurat uses makes.unique, we need to as well.
+  # Only for the first column, at this stage column 1 are the counts matrix rownames.
+  annot[, 1] <- make.unique(annot[, 1])
+
+  # Equalizing number of columns
+  # We are removing the genome column, so far it's not used.
+  annot <- annot[, c(1, 2)]
+  colnames(annot) <- c("input", "name")
+
+  return(annot)
+}
+
 #' Load and parse annotations from the feature files
 #'
 #' This function reads the features file into a data.frame, and takes steps
@@ -574,4 +661,3 @@ filter_unnamed_features <- function(counts, annotations, sample) {
 
   return(list("counts" = counts, "annotations" = annotations))
 }
-
diff --git a/pipeline-runner/R/sysdata.rda b/pipeline-runner/R/sysdata.rda
diff --git a/pipeline-runner/data-raw/sysdata.R b/pipeline-runner/data-raw/sysdata.R
@@ -77,7 +77,8 @@ file_types_by_technology <- list(
   "10x" = list("barcodes10x", "features10x", "matrix10x"),
   "seurat" = list("seurat"),
   "rhapsody" = list("rhapsody"),
-  "10x_h5" = list("10XH5")
+  "10x_h5" = list("10XH5"),
+  "parse" = list("barcodesParse", "featuresParse", "matrixParse")
 )
 
 file_names <- list(
@@ -86,7 +87,10 @@ file_names <- list(
   matrix10x = "matrix.mtx.gz",
   seurat = "r.rds",
   rhapsody = "expression_data.st.gz",
-  "10XH5" = "matrix.h5.gz"
+  "10XH5" = "matrix.h5.gz",
+  barcodesParse = "cell_metadata.csv.gz",
+  featuresParse = "all_genes.csv.gz",
+  matrixParse = "DGE.mtx.gz"
 )
 
 MITOCHONDRIAL_REGEX <- "^mt[-:]"

diff --git a/pipeline-runner/man/read_parse_annotations.Rd b/pipeline-runner/man/read_parse_annotations.Rd
diff --git a/pipeline-runner/man/read_parse_files.Rd b/pipeline-runner/man/read_parse_files.Rd
diff --git a/pipeline-runner/tests/testthat/mock-gem2s-input-files.R b/pipeline-runner/tests/testthat/mock-gem2s-input-files.R
@@ -0,0 +1,34 @@
+mock_files <- function(counts, features, sample_dir, type = "10x") {
+  features_fnames <- list("10x" = "features.tsv", "parse" = "all_genes.csv")
+  barcodes_fnames <- list("10x" = "barcodes.tsv", "parse" = "cell_metadata.csv")
+  matrix_fnames <- list("10x" = "matrix.mtx", "parse" = "DGE.mtx")
+  file_sep <- list("10x" = "\t", "parse" = ",")
+  # Parse features and barcodes files include the header, so we need to emulate it.
+  col_names <- list("10x" = FALSE, "parse" = TRUE)
+
+  # save features
+  features_path <- file.path(sample_dir, features_fnames[[type]])
+  write.table(features, features_path, col.names = col_names[[type]], quote = FALSE, sep = file_sep[[type]], row.names = FALSE)
+  R.utils::gzip(features_path)
+
+  # save barcodes
+  barcodes <- data.frame(barcodes = colnames(counts))
+  barcodes_path <- file.path(sample_dir, barcodes_fnames[[type]])
+  # writeLines(barcodes, barcodes_path)
+  write.table(barcodes, barcodes_path, col.names = col_names[[type]], quote = FALSE, sep = file_sep[[type]], row.names = FALSE)
+  R.utils::gzip(barcodes_path)
+
+  # save Matrix
+  if (is(counts, "data.frame")) counts <- as.matrix(counts)
+  sparse.mat <- Matrix::Matrix(counts, sparse = TRUE)
+  matrix_path <- file.path(sample_dir, matrix_fnames[[type]])
+  Matrix::writeMM(sparse.mat, matrix_path)
+  R.utils::gzip(matrix_path)
+}
+
+local_experiment <- function(counts, features, experiment_dir, sample_dir, env = parent.frame(), type = "10x") {
+  sample_path <- file.path(experiment_dir, sample_dir)
+  dir.create(sample_path, recursive = T)
+  mock_files(counts, features, sample_path, type)
+  withr::defer(unlink(experiment_dir, recursive = T), envir = env)
+}