Skip to content

Commit

Permalink
Merge pull request hms-dbmi-cellenics#350 from biomage-org/support-pa…
Browse files Browse the repository at this point in the history
…rse-data

Support parse data
  • Loading branch information
cosa65 authored Dec 21, 2023
2 parents 10a4bea + 309ad4e commit 993ebc4
Show file tree
Hide file tree
Showing 10 changed files with 416 additions and 141 deletions.
2 changes: 2 additions & 0 deletions pipeline-runner/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ export(prepare_experiment)
export(prepare_sct_integration)
export(read_10x_annotations)
export(read_10x_h5_file)
export(read_parse_annotations)
export(read_parse_files)
export(reconstruct_seurat)
export(remove_filtered_cells)
export(remove_genes)
Expand Down
2 changes: 1 addition & 1 deletion pipeline-runner/R/gem2s-1-download_user_files.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@ download_s3_files <- function(input, originals_bucket, input_dir, s3) {
unlink(input_dir, recursive = TRUE)

for (sample_id in sample_ids) {

for (file_type in file_types_by_technology[[technology]]) {
s3_path <- sample_s3_paths[[sample_id]][[file_type]]

local_fpath <- file.path(input_dir, sample_id, file_names[[file_type]])

download_and_store(originals_bucket, s3_path, local_fpath, s3)
}
}
Expand Down
106 changes: 96 additions & 10 deletions pipeline-runner/R/gem2s-2-load_user_files.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,13 @@ load_user_files <- function(input, pipeline_config, prev_out, input_dir = INPUT_
# destructure previous output
config <- prev_out$config

technology <- ifelse(config$input$type %in% c("rhapsody","10x_h5"),config$input$type , "10x")
technology <- ifelse(config$input$type %in% c("rhapsody", "10x_h5", "parse"), config$input$type, "10x")

read_fun <- switch(technology,
"10x" = read_10x_files,
"rhapsody" = read_rhapsody_files,
"10x_h5" = read_10x_h5_file
"10x_h5" = read_10x_h5_file,
"parse" = read_parse_files
)

message(
Expand All @@ -45,6 +46,66 @@ load_user_files <- function(input, pipeline_config, prev_out, input_dir = INPUT_
return(res)
}

#' Read Parse files. Calls readMTX
#'
#' @param config experiment settings
#' @param input_dir
#'
#' @return
#' @export
#'
read_parse_files <- function(config, input_dir) {
counts_list <- list()
annot_list <- list()
feature_types_list <- list()

samples <- config$samples

for (sample in samples) {
sample_dir <- file.path(input_dir, sample)
sample_fpaths <- list.files(sample_dir)
annot_fpath <- file.path(sample_dir, "all_genes.csv.gz")

message("\nSample --> ", sample)
message(
"Reading files from ",
sample_dir,
" --> ",
paste(sample_fpaths, collapse = " - ")
)

annotations <- read_parse_annotations(annot_fpath, sample)

mtx_path <- file.path(sample_dir, "DGE.mtx.gz")
barcodes_path <- file.path(sample_dir, "cell_metadata.csv.gz")
features_path <- file.path(sample_dir, "all_genes.csv.gz")


# We use readMtx instead of Seurat::ReadParse because the feature.column needs to be 1 instead of 2.
counts <- Seurat::ReadMtx(
mtx = mtx_path, cells = barcodes_path, features = features_path,
cell.column = 1, feature.column = 1, cell.sep = ",",
feature.sep = ",", skip.cell = 1, skip.feature = 1, mtx.transpose = TRUE
)

message(
sprintf(
"Sample %s has %s genes and %s barcodes",
sample, nrow(counts), ncol(counts)
)
)

c(counts, annotations) %<-% filter_unnamed_features(counts, annotations, sample)

counts_list[[sample]] <- counts
annot_list[[sample]] <- annotations
}

annot <- format_annot(annot_list)

return(list(counts_list = counts_list, annot = annot))
}

#' Read h5 file
#'
#' Calls read10x_h5
Expand All @@ -67,21 +128,24 @@ read_10x_h5_file <- function(config, input_dir) {
sample_counts_path <- file.path(sample_dir, sample_fpaths[[1]])

message("\nSample --> ", sample)
message("Reading files from ",
sample_dir,
" --> ",
paste(sample_fpaths, collapse = " - "))
message(
"Reading files from ",
sample_dir,
" --> ",
paste(sample_fpaths, collapse = " - ")
)

if (length(sample_fpaths) > 1)
if (length(sample_fpaths) > 1) {
stop("Only one h5 expected. More files detected.")
}

ungzipped_counts_path <- R.utils::gunzip(sample_counts_path)

counts_names <- Seurat::Read10X_h5(ungzipped_counts_path)
counts <- Seurat::Read10X_h5(ungzipped_counts_path, use.names = FALSE)

# use Gene Expression modality if multiple
if (methods::is(counts, 'list')) {
if (methods::is(counts, "list")) {
counts_names <- counts_names$`Gene Expression`
counts <- counts$`Gene Expression`
}
Expand Down Expand Up @@ -168,7 +232,6 @@ read_10x_files <- function(config, input_dir) {
#' }
#'
read_rhapsody_files <- function(config, input_dir) {

# if we add support for other rhapsody file types (csv matrices) we should
# check filetypes here and dispatch accordingly.

Expand Down Expand Up @@ -274,6 +337,30 @@ parse_rhapsody_matrix <- function(config, input_dir) {
return(list(counts_list = counts_list, annot = annot))
}

#' Loads annotations and formats the annotation file with the column names required by format_annot
#'
#' @param annot_fpath Path to annotations file
#' @param sample Sample name
#'
#' @return
#' @export
#'
read_parse_annotations <- function(annot_fpath, sample) {
annot <- read.delim(annot_fpath, header = TRUE, sep = ",")

# Make the names in annot the same as the ones in the Read10x generated count matrix
# Since Seurat uses makes.unique, we need to as well.
# Only for the first column, at this stage column 1 are the counts matrix rownames.
annot[, 1] <- make.unique(annot[, 1])

# Equalizing number of columns
# We are removing the genome column, so far it's not used.
annot <- annot[, c(1, 2)]
colnames(annot) <- c("input", "name")

return(annot)
}

#' Load and parse annotations from the feature files
#'
#' This function reads the features file into a data.frame, and takes steps
Expand Down Expand Up @@ -574,4 +661,3 @@ filter_unnamed_features <- function(counts, annotations, sample) {

return(list("counts" = counts, "annotations" = annotations))
}

Binary file modified pipeline-runner/R/sysdata.rda
Binary file not shown.
8 changes: 6 additions & 2 deletions pipeline-runner/data-raw/sysdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ file_types_by_technology <- list(
"10x" = list("barcodes10x", "features10x", "matrix10x"),
"seurat" = list("seurat"),
"rhapsody" = list("rhapsody"),
"10x_h5" = list("10XH5")
"10x_h5" = list("10XH5"),
"parse" = list("barcodesParse", "featuresParse", "matrixParse")
)

file_names <- list(
Expand All @@ -86,7 +87,10 @@ file_names <- list(
matrix10x = "matrix.mtx.gz",
seurat = "r.rds",
rhapsody = "expression_data.st.gz",
"10XH5" = "matrix.h5.gz"
"10XH5" = "matrix.h5.gz",
barcodesParse = "cell_metadata.csv.gz",
featuresParse = "all_genes.csv.gz",
matrixParse = "DGE.mtx.gz"
)

MITOCHONDRIAL_REGEX <- "^mt[-:]"
Expand Down
16 changes: 16 additions & 0 deletions pipeline-runner/man/read_parse_annotations.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 16 additions & 0 deletions pipeline-runner/man/read_parse_files.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

34 changes: 34 additions & 0 deletions pipeline-runner/tests/testthat/mock-gem2s-input-files.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
mock_files <- function(counts, features, sample_dir, type = "10x") {
features_fnames <- list("10x" = "features.tsv", "parse" = "all_genes.csv")
barcodes_fnames <- list("10x" = "barcodes.tsv", "parse" = "cell_metadata.csv")
matrix_fnames <- list("10x" = "matrix.mtx", "parse" = "DGE.mtx")
file_sep <- list("10x" = "\t", "parse" = ",")
# Parse features and barcodes files include the header, so we need to emulate it.
col_names <- list("10x" = FALSE, "parse" = TRUE)

# save features
features_path <- file.path(sample_dir, features_fnames[[type]])
write.table(features, features_path, col.names = col_names[[type]], quote = FALSE, sep = file_sep[[type]], row.names = FALSE)
R.utils::gzip(features_path)

# save barcodes
barcodes <- data.frame(barcodes = colnames(counts))
barcodes_path <- file.path(sample_dir, barcodes_fnames[[type]])
# writeLines(barcodes, barcodes_path)
write.table(barcodes, barcodes_path, col.names = col_names[[type]], quote = FALSE, sep = file_sep[[type]], row.names = FALSE)
R.utils::gzip(barcodes_path)

# save Matrix
if (is(counts, "data.frame")) counts <- as.matrix(counts)
sparse.mat <- Matrix::Matrix(counts, sparse = TRUE)
matrix_path <- file.path(sample_dir, matrix_fnames[[type]])
Matrix::writeMM(sparse.mat, matrix_path)
R.utils::gzip(matrix_path)
}

local_experiment <- function(counts, features, experiment_dir, sample_dir, env = parent.frame(), type = "10x") {
sample_path <- file.path(experiment_dir, sample_dir)
dir.create(sample_path, recursive = T)
mock_files(counts, features, sample_path, type)
withr::defer(unlink(experiment_dir, recursive = T), envir = env)
}
Loading

0 comments on commit 993ebc4

Please sign in to comment.