Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Init filter stats #76

Merged
merged 13 commits into from
Jun 2, 2021
42 changes: 26 additions & 16 deletions pipeline-runner/src/cellSizeDistribution.r
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@ generate_default_values_cellSizeDistribution <- function(seurat_obj, config, thr
# Returns Seurat object with a new list in the `tools` slot,
# `CalculateBarcodeInflections` including inflection point calculatio
seurat_obj_tmp <- CalculateBarcodeInflections(
object = seurat_obj,
barcode.column = "nCount_RNA",
group.column = "orig.ident",
# [HARDCODED]
threshold.low = threshold.low,
threshold.high = NULL
object = seurat_obj,
barcode.column = "nCount_RNA",
group.column = "orig.ident",
# [HARDCODED]
threshold.low = threshold.low,
threshold.high = NULL
)
# extracting the inflection point value which can serve as minCellSize threshold
# all other side effects to the scdate object will be discarded
Expand Down Expand Up @@ -68,10 +68,11 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
obj_metadata <- [email protected]
barcode_names_this_sample <- rownames(obj_metadata[grep(tmp_sample, rownames(obj_metadata)),])
if(length(barcode_names_this_sample)==0){
plots <- list()
plots[generate_plotuuid(sample_id, task_name, 0)] <- list()
plots[generate_plotuuid(sample_id, task_name, 1)] <- list()
return(list(data = seurat_obj,config = config,plotData = plots))
guidata <- list()
guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list()
guidata[generate_gui_uuid(sample_id, task_name, 1)] <- list()
return(list(data = seurat_obj,config = config,plotData = guidata))

}
sample_subset <- subset(seurat_obj, cells = barcode_names_this_sample)

Expand Down Expand Up @@ -142,16 +143,25 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp

# update config
config$filterSettings$minCellSize <- minCellSize
#Populate plots list
plots <-list()
plots[generate_plotuuid(sample_id, task_name, 0)] <- list(plot1_data)
plots[generate_plotuuid(sample_id, task_name, 1)] <- list(plot2_data)
#Populate data for UI
guidata <-list()
guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list(plot1_data)
guidata[generate_gui_uuid(sample_id, task_name, 1)] <- list(plot2_data)

# Populate with filter statistics
filter_stats <- list(
before = calc_filter_stats(seurat_obj, tmp_sample),
after = calc_filter_stats(seurat_obj.filtered, tmp_sample)
)
guidata[generate_gui_uuid(sample_id, task_name, 3)] <- filter_stats
print("Filter statistics for sample before/after filter:")
str(filter_stats)

result <- list(
data = seurat_obj.filtered,
config = config,
plotData = plots
plotData = guidata
)

return(result)
}
}
34 changes: 23 additions & 11 deletions pipeline-runner/src/classifier.r
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
# sample-WT1
# we need to get only the last part, in order to grep the object.
tmp_sample <- sub("sample-","",sample_id)

# get filter stats before filtering
before <- calc_filter_stats(seurat_obj, tmp_sample)

import::here(map2, .from = purrr)

Expand All @@ -84,16 +87,16 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
print("Classify is enabled but has no classify data available: will dissable it: no filtering!")
# should this be json, i.e. "false" instead?
config$enabled <- FALSE
plots <- list()
guidata <- list()
} else { # enabled and good data:
print(paste0("Classify is enabled but classify data available: all good for filtering with FDR=", FDR))
obj_metadata <- [email protected]
# extract plotting data of original data to return to plot slot later
barcode_names_this_sample <- rownames(obj_metadata[grep(tmp_sample, rownames(obj_metadata)),])
if(length(barcode_names_this_sample)==0){
plots <- list()
plots[generate_plotuuid(sample_id, task_name, 0)] <- list()
return(list(data = seurat_obj,config = config,plotData = plots))
guidata <- list()
guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list()
return(list(data = seurat_obj,config = config,plotData = guidata))
}
sample_subset <- subset(seurat_obj, cells = barcode_names_this_sample)
print("Info: empty-drops table of FDR threshold categories (# UMIs for a given threshold interval")
Expand Down Expand Up @@ -130,22 +133,31 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
cells_position_to_keep <- sort(cells_position_to_keep)
plot1_data <- plot1_data[cells_position_to_keep]

#Populate plots list
plots <-list()
plots[generate_plotuuid(sample_id, task_name, 0)] <- list(plot1_data)
#Populate guidata list
guidata <-list()
guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list(plot1_data)
}
} else {
print("filter disabled: data not filtered!")
plots<-list()
plots[generate_plotuuid(sample_id, task_name, 0)] <- list()
guidata <- list()
guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list()
seurat_obj <- seurat_obj
}
print(paste0("Cells per sample after filter for sample ", sample_id))
print(table(seurat_obj$orig.ident, useNA="ifany"))
# > head(plots[[1]])
# > head(guidata[[1]])
# [[1]]
# FDR log_u
# 0.000000 3.554852

# get filter stats after filtering
filter_stats <- list(
before = before,
after = calc_filter_stats(seurat_obj, tmp_sample)
)
guidata[generate_gui_uuid(sample_id, task_name, 1)] <- filter_stats
print("Filter statistics for sample before/after filter:")
str(filter_stats)

# [[2]]
# FDR log_u
Expand All @@ -154,7 +166,7 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
result <- list(
data = seurat_obj,
config = config,
plotData = plots
plotData = guidata
)

return(result)
Expand Down
8 changes: 4 additions & 4 deletions pipeline-runner/src/configureEmbedding.r
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,10 @@ task <- function(scdata, config, task_name, sample_id){
)

plots <- list()
plots[generate_plotuuid("", task_name, 0)] <- list(embeddingPreviewByCellSets)
plots[generate_plotuuid("", task_name, 1)] <- list(embeddingPreviewBySamples)
plots[generate_plotuuid("", task_name, 2)] <- list(embeddingPreviewMitochondrialContent)
plots[generate_plotuuid("", task_name, 3)] <- list(embeddingPreviewDoubletScore)
plots[generate_gui_uuid("", task_name, 0)] <- list(embeddingPreviewByCellSets)
plots[generate_gui_uuid("", task_name, 1)] <- list(embeddingPreviewBySamples)
plots[generate_gui_uuid("", task_name, 2)] <- list(embeddingPreviewMitochondrialContent)
plots[generate_gui_uuid("", task_name, 3)] <- list(embeddingPreviewDoubletScore)

# the result object will have to conform to this format: {data, config, plotData : {plot1, plot2}}
result <- list(
Expand Down
4 changes: 2 additions & 2 deletions pipeline-runner/src/dataIntegration.r
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ task <- function(scdata, config,task_name,sample_id){
plot2_data <- unname(purrr::map2(1:50,varExplained,function(x,y){c("PC"=x,"percentVariance"=y)}))

plots <- list()
plots[generate_plotuuid("", task_name, 0)] <- list(plot1_data)
plots[generate_plotuuid("", task_name, 1)] <- list(plot2_data)
plots[generate_gui_uuid("", task_name, 0)] <- list(plot1_data)
plots[generate_gui_uuid("", task_name, 1)] <- list(plot2_data)

# For now config is not updated, since there is not new changes
# config <- ...
Expand Down
24 changes: 17 additions & 7 deletions pipeline-runner/src/doubletScores.r
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
obj_metadata <- [email protected]
barcode_names_this_sample <- rownames(obj_metadata[grep(tmp_sample, rownames(obj_metadata)),])
if(length(barcode_names_this_sample)==0){
plots <- list()
plots[generate_plotuuid(sample_id, task_name, 0)] <- list()
return(list(data = seurat_obj,config = config,plotData = plots))
guidata <- list()
guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list()
return(list(data = seurat_obj,config = config,plotData = guidata))
}
sample_subset <- subset(seurat_obj, cells = barcode_names_this_sample)

Expand Down Expand Up @@ -98,20 +98,30 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
cells_position_to_keep <- sort(cells_position_to_keep)
plot1_data <- plot1_data[cells_position_to_keep]

plots <-list()
guidata <-list()

# plot 1: histgram of doublet scores
# plot 1: histogram of doublet scores
# [0.161, 0.198, 0.284, ...]
plots[generate_plotuuid(sample_id, task_name, 0)] <- list(plot1_data)
guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list(plot1_data)

print(paste0("Cells per sample after filter for sample ", sample_id))
print(table(seurat_obj.filtered$orig.ident))

# populate with filter statistics
filter_stats <- list(
before = calc_filter_stats(seurat_obj, tmp_sample),
after = calc_filter_stats(seurat_obj.filtered, tmp_sample)
)

guidata[generate_gui_uuid(sample_id, task_name, 1)] <- filter_stats
print("Filter statistics for sample before/after filter:")
str(filter_stats)

# the result object will have to conform to this format: {data, config, plotData : {plot1, plot2}}
result <- list(
data = seurat_obj.filtered,
config = config,
plotData = plots
plotData = guidata
)
return(result)

Expand Down
26 changes: 18 additions & 8 deletions pipeline-runner/src/mitochondrialContent.r
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,10 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
obj_metadata <- [email protected]
barcode_names_this_sample <- rownames(obj_metadata[grep(tmp_sample, rownames(obj_metadata)),])
if (length(barcode_names_this_sample)==0){
plots <- list()
plots[generate_plotuuid(sample_id, task_name, 0)] <- list()
plots[generate_plotuuid(sample_id, task_name, 1)] <- list()
return(list(data = seurat_obj,config = config,plotData = plots))
guidata <- list()
guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list()
guidata[generate_gui_uuid(sample_id, task_name, 1)] <- list()
return(list(data = seurat_obj,config = config,plotData = guidata))
}
sample_subset <- subset(seurat_obj, cells = barcode_names_this_sample)

Expand Down Expand Up @@ -95,12 +95,12 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
plot1_data <- plot1_data[cells_position_to_keep]
plot2_data <- plot2_data[cells_position_to_keep]

plots <- list()
guidata <- list()

# plot 1: histgram of MT-content
# AAACCCAAGCGCCCAT-1 AAACCCAAGGTTCCGC-1 AAACCCACAGAGTTGG-1
# 0.161 0.198 0.284 ...
plots[generate_plotuuid(sample_id, task_name, 0)] <- list(plot1_data)
guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list(plot1_data)

# plot 2: There are two alternavitive:
# - Scatter plot with UMIs in the x-axis and MT-content in the y-axis
Expand All @@ -111,16 +111,26 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
# We have decided to use the scatter plot, but I temporaly leave the other option in the comments.
# Q: Should we return from the R side the cells that are going to be removed? For this plot it is interesting to color the
# cells that are going to be excluded.
plots[generate_plotuuid(sample_id, task_name, 1)] <- list(plot2_data)
guidata[generate_gui_uuid(sample_id, task_name, 1)] <- list(plot2_data)

# some tests:
print(paste0("Cells per sample after filter for sample ", sample_id))
print(table(seurat_obj.filtered$orig.ident))

# populate with filter statistics
filter_stats <- list(
before = calc_filter_stats(seurat_obj, tmp_sample),
after = calc_filter_stats(seurat_obj.filtered, tmp_sample)
)

guidata[generate_gui_uuid(sample_id, task_name, 1)] <- filter_stats
print("Filter statistics for sample before/after filter:")
str(filter_stats)

result <- list(
data = seurat_obj.filtered, # seurat_obj filter
config = config,
plotData = plots
plotData = guidata
)
return(result)
}
22 changes: 16 additions & 6 deletions pipeline-runner/src/numGenesVsNumUmis.r
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,9 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
obj_metadata <- [email protected]
barcode_names_this_sample <- rownames(obj_metadata[grep(tmp_sample, rownames(obj_metadata)),])
if(length(barcode_names_this_sample)==0){
plots <- list()
plots[generate_plotuuid(sample_id, task_name, 0)] <- list()
return(list(data = seurat_obj,config = config,plotData = plots))
guidata <- list()
guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list()
return(list(data = seurat_obj,config = config,plotData = guidata))
}
sample_subset <- subset(seurat_obj, cells = barcode_names_this_sample)
# We regress the molecules vs the genes. This information are stored in nCount_RNA and nFeature_RNA respectively
Expand Down Expand Up @@ -117,17 +117,27 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
# bands that are conformed with the upper_cutoff and the lower_cutoff. We can print a band or dotted lines.
# Q: Should we return the point out the cells that are going to be excluded from the R side or this task can be done in
# the UI side.
plots <- list()
plots[generate_plotuuid(sample_id, task_name, 0)] <- list(plot1_data)
guidata <- list()
guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list(plot1_data)

print(paste0("Cells per sample after filter for sample ", sample_id))
print(table(seurat_obj.filtered$orig.ident))

# Populate with filter statistics
filter_stats <- list(
before = calc_filter_stats(seurat_obj, tmp_sample),
after = calc_filter_stats(seurat_obj.filtered, tmp_sample)
)

guidata[generate_gui_uuid(sample_id, task_name, 1)] <- filter_stats
print("Filter statistics for sample before/after filter:")
str(filter_stats)

# the result object will have to conform to this format: {data, config, plotData : {plot1}}
result <- list(
data = seurat_obj.filtered,
config = config,
plotData = plots
plotData = guidata
)

return(result)
Expand Down
40 changes: 34 additions & 6 deletions pipeline-runner/src/utils.r
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
#
# Generate plot uuid as required by the UI
# Generate GUI plots and data uuid as required by the UI
#

generate_plotuuid <- function(sample_uuid, task_name, plot_idx) {
generate_gui_uuid <- function(sample_uuid, task_name, item_idx) {

if(sample_uuid != "") {
return(paste(sample_uuid, task_name, plot_idx, sep="-"))
return(paste(sample_uuid, task_name, item_idx, sep="-"))
}

return(paste(task_name, plot_idx, sep="-"))
return(paste(task_name, item_idx, sep="-"))

}

Expand Down Expand Up @@ -54,4 +53,33 @@ handle_debug <- function(scdata, config, task_name, sample_id, debug_config) {
save(seurat_obj, config, task_name, sample_id, num_cells_to_downsample, file = fpath_cont)
message(sprintf("⚠️ RUN load('%s') to restore environment.", fpath_host))
}
}
}


#' Calculates statistics before/after filter step
#'
#' @param seurat_obj \code{SeuratObject}
#' @param tmp_sample sample name in \code{[email protected]} to compute statistics for
#'
#' @return list with \itemize{
#' \item{"num_cells"}{Number of cells in sample}
#' \item{"total_genes"}{Number of detected genes in sample}
#' \item{"median_genes"}{Median number of genes detected per cell}
#' \item{"median_umis"}{Median number of counts per cell}
#' }
#'
calc_filter_stats <- function(seurat_obj, tmp_sample) {

# subset to current sample
scdata <- seurat_obj[, seurat_obj$orig.ident == tmp_sample]

# number of counts per gene
ncount <- Matrix::rowSums(scdata[['RNA']]@counts)

list(
num_cells = ncol(scdata),
total_genes = sum(ncount > 0),
median_genes = median(scdata$nFeature_RNA),
median_umis = median(scdata$nCount_RNA)
)
}