Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "Revert "Revert "[BIOMAGE-2004] Merging in QC""" #279

Merged
merged 1 commit into from
Aug 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,12 @@ endif
#--------------------------------------------------
# Targets
#--------------------------------------------------
install:
install:
@echo "Installing local runner"
@(cd ./local-runner && npm install)
@echo "Installing renv packages"
@echo "Installing R env packages"
@(cd ./pipeline-runner && R -e "renv::restore()")
@(cd ./pipeline-runner && R -e "renv::install()")
build:
build:
# regenerate sysdata.rda env file
@(cd ./pipeline-runner && Rscript data-raw/sysdata.R)
@(cd ./local-runner && npm run build)
Expand Down
4 changes: 2 additions & 2 deletions local-runner/cf-local-container-launcher.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ Resources:

docker_command = ' '.join([
"docker run --rm -t",
f"--name {event['name']}-{random_string(10)}",
f"--name {event['name']}-{random_string(10)}",
f"{'-d' if event['detached'] else ''}",
f"--env ACTIVITY_ARN={event.get('activityArn', '')}",
f"--env HOST_IP=__HOST_IP__",
Expand All @@ -47,7 +47,7 @@ Resources:
Timeout: 25
QualityControlActivity:
Type: AWS::StepFunctions::Activity
Properties:
Properties:
Name: biomage-qc-activity-development
RemovePreviousPipelineContainers:
Type: "AWS::Lambda::Function"
Expand Down
5 changes: 1 addition & 4 deletions pipeline-runner/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,11 @@ FROM common AS dev
# also install watchdog to automatically restart
# when source files change
RUN pip install -U jedi radian PyYAML watchdog[watchmedo]
RUN apt update && apt -y install git
RUN pip install memory_profiler

# add R package files and runner
ADD R ./R
ADD tests ./tests
COPY DESCRIPTION NAMESPACE init.R ./

# start app
COPY start.sh start.sh
ENTRYPOINT ["./start.sh"]
ENTRYPOINT ["Rscript", "init.R"]
11 changes: 1 addition & 10 deletions pipeline-runner/NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,37 +1,28 @@
# Generated by roxygen2: do not edit by hand

export(add_metadata)
export(build_cc_gene_list)
export(build_metadata_cellsets)
export(create_scdata)
export(create_seurat)
export(download_user_files)
export(filter_doublets)
export(filter_emptydrops)
export(filter_gene_umi_outlier)
export(filter_high_mito)
export(filter_low_cellsize)
export(filter_unnamed_features)
export(generate_default_values_cellSizeDistribution)
export(generate_default_values_classifier)
export(generate_first_step_ids)
export(getClusters)
export(get_feature_types)
export(get_gem2s_file)
export(ids_to_sym)
export(list_exclude_genes)
export(load_user_files)
export(make_annot_with_ids)
export(merge_scdata_list)
export(normalize_annotation_types)
export(merge_scdatas)
export(prepare_experiment)
export(read_10x_annotations)
export(remove_genes)
export(runClusters)
export(run_emptydrops)
export(score_doublets)
export(subset_ids)
export(subset_safe)
export(sym_to_ids)
import(data.table)
importFrom(magrittr,"%>%")
7 changes: 4 additions & 3 deletions pipeline-runner/R/gem2s-2-load_user_files.R
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ parse_rhapsody_matrix <- function(config, input_dir) {
#' @return list of annotations data.frame
#' @export
#'
#' @examples
read_10x_annotations <- function(annot_fpath, sample) {
gene_column <- 1

Expand Down Expand Up @@ -408,7 +409,7 @@ make_annot_with_ids <- function(annot_list, feature_types_list) {
#' @param sample_annot data.frame of annotations
#' @param annot_with_ids data.frame of annotations with IDs. Key data.frame
#'
#' @return data.frame of annotations
#' @return
#' @export
#'
sym_to_ids <- function(sample_annot, annot_with_ids) {
Expand Down Expand Up @@ -436,7 +437,7 @@ sym_to_ids <- function(sample_annot, annot_with_ids) {
#' @param sample_annot data.frame of annotations
#' @param annot_with_ids data.frame of annotations with IDs. Key data.frame
#'
#' @return data.frame of annotations
#' @return
#' @export
#'
ids_to_sym <- function(sample_annot, annot_with_ids) {
Expand Down Expand Up @@ -464,7 +465,7 @@ ids_to_sym <- function(sample_annot, annot_with_ids) {
#' @param annotations list of annotations data.frame, feature types and gene_column
#' @param sample character specifying current sample
#'
#' @return list of counts and annotations
#' @return
#' @export
#'
filter_unnamed_features <- function(counts, annotations, sample) {
Expand Down
2 changes: 1 addition & 1 deletion pipeline-runner/R/gem2s-4-score_doublets.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ score_doublets <- function(input, pipeline_config, prev_out) {
#' @return data.frame with doublet scores and assigned classes
#'
compute_sample_doublet_scores <- function(sample_counts) {
set.seed(RANDOM_SEED)
set.seed(gem2s$random.seed)
sce <- scDblFinder::scDblFinder(sample_counts)
doublet_res <- data.frame(
row.names = colnames(sce),
Expand Down
39 changes: 14 additions & 25 deletions pipeline-runner/R/gem2s-5-create_seurat.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,7 @@ create_seurat <- function(input, pipeline_config, prev_out) {
check_prev_out(prev_out, check_names)

# destructure previous output: config, counts_list, annot, doublet_scores, and edrops
config <- prev_out$config
counts_list <- prev_out$counts_list
annot <- prev_out$annot
doublet_scores <- prev_out$doublet_scores
edrops <- prev_out$edrops
list2env(prev_out, envir = environment())

samples <- names(counts_list)
scdata_list <- list()
Expand Down Expand Up @@ -68,33 +64,26 @@ construct_scdata <- function(counts, doublet_score, edrops_out, sample, annot, c
}


#' Construct metadata for each SeuratObject
#'
#' This function creates a `data.frame` with the barcodes, sampleIDs and user supplied
#' metadata that corresponds to each one.
#'
#' @param counts count matrix
#' @param sample character sample ID
#' @param config list containing experiment config
#'
#' @return data.frame of sample metadata
#'

# NOTE: any changes here must be reflected in meta_sets

# construct metadata for each SeuratObject
construct_metadata <- function(counts, sample, config) {
message("Constructing metadata data.frame...")
metadata_df <- data.frame(row.names = colnames(counts), samples = rep(sample, ncol(counts)))
message("Constructing metadata df...")
metadata <- data.frame(row.names = colnames(counts), samples = rep(sample, ncol(counts)))

# Add "metadata" if it exists in config
user_metadata <- config$metadata
if (!is.null(user_metadata)) {
user_metadata <- lapply(user_metadata, unlist)
user_metadata <- data.frame(user_metadata, row.names = config$samples, check.names = FALSE)
metadata_df[names(user_metadata)] <- user_metadata[sample, ]
rest <- config$metadata
if (!is.null(rest)) {
rest <- lapply(rest, unlist)
rest <- data.frame(rest, row.names = config$samples, check.names = FALSE)
metadata[names(rest)] <- rest[sample, ]
}

# make syntactically valid column names
colnames(metadata_df) <- make.names(colnames(metadata_df), unique = TRUE)
colnames(metadata) <- make.names(colnames(metadata), unique = TRUE)

return(metadata_df)
return(metadata)
}

# add mitochondrial percent to SeuratObject
Expand Down
62 changes: 27 additions & 35 deletions pipeline-runner/R/gem2s-6-construct_qc_config.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,6 @@
#' Constructs default QC configuration
#'
#' This function returns the default parameters used during QC as a nested list.
#' It is sent to the API, which in turn saves it as a jsonb object in the PostgreSQL
#' database.
#'
#' @param scdata_list list of seurat objects
#' @param any_filtered boolean indicating if barcodes were filtered by the emptyDrops.
#'
#' @return list of QC configuration parameters
#'
construct_qc_config <- function(scdata_list, any_filtered) {
samples <- names(scdata_list)
# constructs default QC configuration for merged SeuratObject
construct_qc_config <- function(scdata, any_filtered) {
samples <- scdata$samples

# classifier
config.classifier <- list(
Expand All @@ -36,7 +26,7 @@ construct_qc_config <- function(scdata_list, any_filtered) {
filterSettings = list(minCellSize = 1080, binStep = 200)
)

config.cellSizeDistribution <- add_custom_config_per_sample(get_cellsize_config, config.cellSizeDistribution, scdata_list)
config.cellSizeDistribution <- add_custom_config_per_sample(get_cellsize_config, config.cellSizeDistribution, scdata)

# mito
config.mitochondrialContent <- list(
Expand All @@ -53,7 +43,7 @@ construct_qc_config <- function(scdata_list, any_filtered) {
)
)

config.mitochondrialContent <- add_custom_config_per_sample(get_sample_mitochondrial_config, config.mitochondrialContent, scdata_list)
config.mitochondrialContent <- add_custom_config_per_sample(get_sample_mitochondrial_config, config.mitochondrialContent, scdata)

# ngenes vs umis
config.numGenesVsNumUmis <- list(
Expand All @@ -68,7 +58,7 @@ construct_qc_config <- function(scdata_list, any_filtered) {
)
)

config.numGenesVsNumUmis <- add_custom_config_per_sample(get_gene_umi_config, config.numGenesVsNumUmis, scdata_list)
config.numGenesVsNumUmis <- add_custom_config_per_sample(get_gene_umi_config, config.numGenesVsNumUmis, scdata)


# doublet scores
Expand All @@ -81,7 +71,7 @@ construct_qc_config <- function(scdata_list, any_filtered) {
)
)

config.doubletScores <- add_custom_config_per_sample(get_dblscore_config, config.doubletScores, scdata_list)
config.doubletScores <- add_custom_config_per_sample(get_dblscore_config, config.doubletScores, scdata)

# data integration
config.dataIntegration <- list(
Expand Down Expand Up @@ -112,8 +102,8 @@ construct_qc_config <- function(scdata_list, any_filtered) {
distanceMetric = "cosine"
),
tsne = list(
perplexity = min(30, ncol(scdata_list) / 100),
learningRate = max(200, ncol(scdata_list) / 12)
perplexity = min(30, ncol(scdata) / 100),
learningRate = max(200, ncol(scdata) / 12)
)
)
),
Expand All @@ -138,13 +128,13 @@ construct_qc_config <- function(scdata_list, any_filtered) {
}


get_cellsize_config <- function(scdata_list, config) {
minCellSize <- generate_default_values_cellSizeDistribution(scdata_list, config)
get_cellsize_config <- function(scdata, config) {
minCellSize <- generate_default_values_cellSizeDistribution(scdata, config)
config$filterSettings$minCellSize <- minCellSize
return(config)
}

get_sample_mitochondrial_config <- function(scdata_list.sample, config) {
get_sample_mitochondrial_config <- function(scdata.sample, config) {

config.sample <- list(
auto = TRUE,
Expand All @@ -155,32 +145,32 @@ get_sample_mitochondrial_config <- function(scdata_list.sample, config) {
)

config.sample$filterSettings$methodSettings$absoluteThreshold <- list(
maxFraction = generate_default_values_mitochondrialContent(scdata_list.sample, config.sample),
maxFraction = generate_default_values_mitochondrialContent(scdata.sample, config.sample),
binStep = 0.3
)

return(config.sample)
}


# threshold for doublet score is the max score given to a singlet (above score => doublets)
get_dblscore_config <- function(scdata_list, config) {
probabilityThreshold <- max(scdata_list$doublet_scores[scdata_list$doublet_class == "singlet"], na.rm = TRUE)
get_dblscore_config <- function(scdata, config) {
probabilityThreshold <- max(scdata$doublet_scores[scdata$doublet_class == "singlet"], na.rm = TRUE)
config$filterSettings$probabilityThreshold <- probabilityThreshold

return(config)
}


get_gene_umi_config <- function(scdata_list, config) {
get_gene_umi_config <- function(scdata, config) {
# Sensible values are based on the function "gene.vs.molecule.cell.filter" from the pagoda2 package
p.level <- min(0.001, 1 / ncol(scdata_list))
p.level <- min(0.001, 1 / ncol(scdata))
config$filterSettings$regressionTypeSettings[[config$filterSettings$regressionType]]$p.level <- p.level

return(config)
}



duplicate_config_per_sample <- function(step_config, config, samples) {
for (sample in unique(samples)) {
config[[sample]] <- step_config
Expand All @@ -190,23 +180,25 @@ duplicate_config_per_sample <- function(step_config, config, samples) {
return(config)
}

add_custom_config_per_sample <- function(generate_sample_config, config, scdata) {

add_custom_config_per_sample <- function(generate_sample_config, config, scdata_list) {
# config is updated inside the loop, so keep a copy to retain access to the raw config
raw_config <- config
config.raw <- config

for (sample in names(scdata_list)) {
samples <- scdata$samples

for (sample in unique(samples)) {
# subset the Seurat object to a single sample
sample_data <- scdata_list[[sample]]
scdata.sample <- scdata[, samples %in% sample]

# run the function to generate config for a sample
sample_config <- generate_sample_config(sample_data, raw_config)
config.sample <- generate_sample_config(scdata.sample, config.raw)

# update sample config thresholds
config[[sample]] <- sample_config
config[[sample]] <- config.sample

# add auto settings
config[[sample]]$defaultFilterSettings <- sample_config$filterSettings
config[[sample]]$defaultFilterSettings <- config.sample$filterSettings
}

return(config)
Expand Down
Loading