hms-dbmi-cellenics · alexvpickering · Jun 2, 2021 · May 25, 2021 · May 25, 2021 · May 25, 2021
diff --git a/pipeline-runner/src/cellSizeDistribution.r b/pipeline-runner/src/cellSizeDistribution.r
@@ -31,12 +31,12 @@ generate_default_values_cellSizeDistribution <- function(seurat_obj, config, thr
   # Returns Seurat object with a new list in the `tools` slot,
   # `CalculateBarcodeInflections` including inflection point calculatio
   seurat_obj_tmp <- CalculateBarcodeInflections(
-                                        object = seurat_obj,
-                                        barcode.column = "nCount_RNA",
-                                        group.column = "orig.ident",
-                                        # [HARDCODED]
-                                        threshold.low = threshold.low,
-                                        threshold.high = NULL
+    object = seurat_obj,
+    barcode.column = "nCount_RNA",
+    group.column = "orig.ident",
+    # [HARDCODED]
+    threshold.low = threshold.low,
+    threshold.high = NULL
   )
   # extracting the inflection point value which can serve as minCellSize threshold
   # all other side effects to the scdate object will be discarded
@@ -68,10 +68,11 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
     obj_metadata <- [email protected]
     barcode_names_this_sample <- rownames(obj_metadata[grep(tmp_sample, rownames(obj_metadata)),]) 
     if(length(barcode_names_this_sample)==0){
-        plots <- list()
-        plots[generate_plotuuid(sample_id, task_name, 0)] <- list()
-        plots[generate_plotuuid(sample_id, task_name, 1)] <- list()
-        return(list(data = seurat_obj,config = config,plotData = plots)) 
+        guidata <- list()
+        guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list()
+        guidata[generate_gui_uuid(sample_id, task_name, 1)] <- list()
+        return(list(data = seurat_obj,config = config,plotData = guidata)) 
+
     }
     sample_subset <- subset(seurat_obj, cells = barcode_names_this_sample)
 
@@ -142,16 +143,25 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
 
     # update config
     config$filterSettings$minCellSize <- minCellSize
-    #Populate plots list
-    plots <-list()
-    plots[generate_plotuuid(sample_id, task_name, 0)] <- list(plot1_data)
-    plots[generate_plotuuid(sample_id, task_name, 1)] <- list(plot2_data)
+    #Populate data for UI
+    guidata <-list()
+    guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list(plot1_data)
+    guidata[generate_gui_uuid(sample_id, task_name, 1)] <- list(plot2_data)
+
+    # Populate with filter statistics
+    filter_stats <- list(
+        before = calc_filter_stats(seurat_obj, tmp_sample),
+        after = calc_filter_stats(seurat_obj.filtered, tmp_sample)
+    )
+    guidata[generate_gui_uuid(sample_id, task_name, 3)] <- filter_stats
+    print("Filter statistics for sample before/after filter:")
+    str(filter_stats)
 
     result <- list(
         data = seurat_obj.filtered,
         config = config,
-        plotData = plots
+        plotData = guidata
     )
 
     return(result)
-}
+}
diff --git a/pipeline-runner/src/classifier.r b/pipeline-runner/src/classifier.r
@@ -65,6 +65,9 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
   # sample-WT1
   # we need to get only the last part, in order to grep the object.
   tmp_sample <- sub("sample-","",sample_id)
+
+  # get filter stats before filtering
+  before <- calc_filter_stats(seurat_obj, tmp_sample)
 
   import::here(map2, .from = purrr)
 
@@ -84,16 +87,16 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
       print("Classify is enabled but has no classify data available: will dissable it: no filtering!")      
       # should this be json, i.e. "false" instead?
       config$enabled <- FALSE
-      plots <- list()
+      guidata <- list()
     } else { # enabled and good data:
       print(paste0("Classify is enabled but classify data available: all good for filtering with FDR=", FDR))
       obj_metadata <- [email protected]
       # extract plotting data of original data to return to plot slot later
       barcode_names_this_sample <- rownames(obj_metadata[grep(tmp_sample, rownames(obj_metadata)),]) 
       if(length(barcode_names_this_sample)==0){
-          plots <- list()
-          plots[generate_plotuuid(sample_id, task_name, 0)] <- list()
-          return(list(data = seurat_obj,config = config,plotData = plots)) 
+          guidata <- list()
+          guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list()
+          return(list(data = seurat_obj,config = config,plotData = guidata)) 
       }
       sample_subset <- subset(seurat_obj, cells = barcode_names_this_sample)
       print("Info: empty-drops table of FDR threshold categories (# UMIs for a given threshold interval")
@@ -130,22 +133,31 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
       cells_position_to_keep <- sort(cells_position_to_keep)
       plot1_data <- plot1_data[cells_position_to_keep]
 
-      #Populate plots list
-      plots <-list()
-      plots[generate_plotuuid(sample_id, task_name, 0)] <- list(plot1_data)
+      #Populate guidata list
+      guidata <-list()
+      guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list(plot1_data)
     }
   } else {
     print("filter disabled: data not filtered!")
-    plots<-list()
-    plots[generate_plotuuid(sample_id, task_name, 0)] <- list()
+    guidata <- list()
+    guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list()
     seurat_obj <- seurat_obj
   }
   print(paste0("Cells per sample after filter for sample ", sample_id))
   print(table(seurat_obj$orig.ident, useNA="ifany"))
-  # > head(plots[[1]])
+  # > head(guidata[[1]])
   # [[1]]
   # FDR    log_u
   # 0.000000 3.554852
+
+  # get filter stats after filtering
+  filter_stats <- list(
+    before = before,
+    after = calc_filter_stats(seurat_obj, tmp_sample)
+  )
+  guidata[generate_gui_uuid(sample_id, task_name, 1)] <- filter_stats
+  print("Filter statistics for sample before/after filter:")
+  str(filter_stats)
 
   # [[2]]
   # FDR        log_u
@@ -154,7 +166,7 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
   result <- list(
                  data = seurat_obj,
                  config = config,
-                 plotData = plots
+                 plotData = guidata
   )
 
   return(result)

diff --git a/pipeline-runner/src/configureEmbedding.r b/pipeline-runner/src/configureEmbedding.r
@@ -94,10 +94,10 @@ task <- function(scdata, config, task_name, sample_id){
     )
 
     plots <- list()
-    plots[generate_plotuuid("", task_name, 0)] <- list(embeddingPreviewByCellSets)
-    plots[generate_plotuuid("", task_name, 1)] <- list(embeddingPreviewBySamples)
-    plots[generate_plotuuid("", task_name, 2)] <- list(embeddingPreviewMitochondrialContent)
-    plots[generate_plotuuid("", task_name, 3)] <- list(embeddingPreviewDoubletScore)
+    plots[generate_gui_uuid("", task_name, 0)] <- list(embeddingPreviewByCellSets)
+    plots[generate_gui_uuid("", task_name, 1)] <- list(embeddingPreviewBySamples)
+    plots[generate_gui_uuid("", task_name, 2)] <- list(embeddingPreviewMitochondrialContent)
+    plots[generate_gui_uuid("", task_name, 3)] <- list(embeddingPreviewDoubletScore)
 
     # the result object will have to conform to this format: {data, config, plotData : {plot1, plot2}}
     result <- list(

diff --git a/pipeline-runner/src/dataIntegration.r b/pipeline-runner/src/dataIntegration.r
@@ -63,8 +63,8 @@ task <- function(scdata, config,task_name,sample_id){
     plot2_data <- unname(purrr::map2(1:50,varExplained,function(x,y){c("PC"=x,"percentVariance"=y)}))
 
     plots <- list()
-    plots[generate_plotuuid("", task_name, 0)] <- list(plot1_data)
-    plots[generate_plotuuid("", task_name, 1)] <- list(plot2_data)    
+    plots[generate_gui_uuid("", task_name, 0)] <- list(plot1_data)
+    plots[generate_gui_uuid("", task_name, 1)] <- list(plot2_data)    
 
     # For now config is not updated, since there is not new changes
     # config <- ...

diff --git a/pipeline-runner/src/doubletScores.r b/pipeline-runner/src/doubletScores.r
@@ -58,9 +58,9 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
     obj_metadata <- [email protected]
     barcode_names_this_sample <- rownames(obj_metadata[grep(tmp_sample, rownames(obj_metadata)),]) 
     if(length(barcode_names_this_sample)==0){
-        plots <- list()
-        plots[generate_plotuuid(sample_id, task_name, 0)] <- list()
-        return(list(data = seurat_obj,config = config,plotData = plots)) 
+        guidata <- list()
+        guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list()
+        return(list(data = seurat_obj,config = config,plotData = guidata)) 
     }
     sample_subset <- subset(seurat_obj, cells = barcode_names_this_sample)
 
@@ -98,20 +98,30 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
     cells_position_to_keep <- sort(cells_position_to_keep)
     plot1_data <- plot1_data[cells_position_to_keep]
 
-    plots <-list()
+    guidata <-list()
 
-    # plot 1: histgram of doublet scores
+    # plot 1: histogram of doublet scores
     #              [0.161,              0.198,              0.284,  ...]
-    plots[generate_plotuuid(sample_id, task_name, 0)] <- list(plot1_data)
+    guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list(plot1_data)
 
     print(paste0("Cells per sample after filter for sample ", sample_id))
     print(table(seurat_obj.filtered$orig.ident))
+
+    # populate with filter statistics
+    filter_stats <- list(
+        before = calc_filter_stats(seurat_obj, tmp_sample),
+        after = calc_filter_stats(seurat_obj.filtered, tmp_sample)
+    )
+
+    guidata[generate_gui_uuid(sample_id, task_name, 1)] <- filter_stats
+    print("Filter statistics for sample before/after filter:")
+    str(filter_stats)
 
     # the result object will have to conform to this format: {data, config, plotData : {plot1, plot2}}
     result <- list(
         data = seurat_obj.filtered,
         config = config,
-        plotData = plots
+        plotData = guidata
     )
     return(result)
 

diff --git a/pipeline-runner/src/mitochondrialContent.r b/pipeline-runner/src/mitochondrialContent.r
@@ -54,10 +54,10 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
     obj_metadata <- [email protected]
     barcode_names_this_sample <- rownames(obj_metadata[grep(tmp_sample, rownames(obj_metadata)),]) 
     if (length(barcode_names_this_sample)==0){
-        plots <- list()
-        plots[generate_plotuuid(sample_id, task_name, 0)] <- list()
-        plots[generate_plotuuid(sample_id, task_name, 1)] <- list()
-        return(list(data = seurat_obj,config = config,plotData = plots)) 
+        guidata <- list()
+        guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list()
+        guidata[generate_gui_uuid(sample_id, task_name, 1)] <- list()
+        return(list(data = seurat_obj,config = config,plotData = guidata)) 
     }
     sample_subset <- subset(seurat_obj, cells = barcode_names_this_sample)
 
@@ -95,12 +95,12 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
     plot1_data <- plot1_data[cells_position_to_keep]
     plot2_data <- plot2_data[cells_position_to_keep]
 
-    plots <- list()
+    guidata <- list()
 
     # plot 1: histgram of MT-content
     # AAACCCAAGCGCCCAT-1 AAACCCAAGGTTCCGC-1 AAACCCACAGAGTTGG-1
     #              0.161              0.198              0.284  ...
-    plots[generate_plotuuid(sample_id, task_name, 0)] <- list(plot1_data)
+    guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list(plot1_data)
 
     # plot 2: There are two alternavitive:
     #           - Scatter plot with UMIs in the x-axis and MT-content in the y-axis
@@ -111,16 +111,26 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
     # We have decided to use the scatter plot, but I temporaly leave the other option in the comments. 
     # Q: Should we return from the R side the cells that are going to be removed? For this plot it is interesting to color the
     # cells that are going to be excluded. 
-    plots[generate_plotuuid(sample_id, task_name, 1)] <- list(plot2_data)
+    guidata[generate_gui_uuid(sample_id, task_name, 1)] <- list(plot2_data)
 
     # some tests:
     print(paste0("Cells per sample after filter for sample ", sample_id))
     print(table(seurat_obj.filtered$orig.ident))
+
+    # populate with filter statistics
+    filter_stats <- list(
+      before = calc_filter_stats(seurat_obj, tmp_sample),
+      after = calc_filter_stats(seurat_obj.filtered, tmp_sample)
+    )
+
+    guidata[generate_gui_uuid(sample_id, task_name, 1)] <- filter_stats
+    print("Filter statistics for sample before/after filter:")
+    str(filter_stats)
 
     result <- list( 
         data = seurat_obj.filtered, # seurat_obj filter
         config = config,
-        plotData = plots
+        plotData = guidata
     )
     return(result)
 }
diff --git a/pipeline-runner/src/numGenesVsNumUmis.r b/pipeline-runner/src/numGenesVsNumUmis.r
@@ -63,9 +63,9 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
             obj_metadata <- [email protected]
             barcode_names_this_sample <- rownames(obj_metadata[grep(tmp_sample, rownames(obj_metadata)),]) 
             if(length(barcode_names_this_sample)==0){
-                plots <- list()
-                plots[generate_plotuuid(sample_id, task_name, 0)] <- list()
-                return(list(data = seurat_obj,config = config,plotData = plots)) 
+                guidata <- list()
+                guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list()
+                return(list(data = seurat_obj,config = config,plotData = guidata)) 
             }
             sample_subset <- subset(seurat_obj, cells = barcode_names_this_sample)
             # We regress the molecules vs the genes. This information are stored in nCount_RNA and nFeature_RNA respectively
@@ -117,17 +117,27 @@ task <- function(seurat_obj, config, task_name, sample_id, num_cells_to_downsamp
     # bands that are conformed with the upper_cutoff and the lower_cutoff. We can print a band or dotted lines. 
     # Q: Should we return the point out the cells that are going to be excluded from the R side or this task can be done in 
     # the UI side.  
-    plots <- list()
-    plots[generate_plotuuid(sample_id, task_name, 0)] <- list(plot1_data)
+    guidata <- list()
+    guidata[generate_gui_uuid(sample_id, task_name, 0)] <- list(plot1_data)
 
     print(paste0("Cells per sample after filter for sample ", sample_id))
     print(table(seurat_obj.filtered$orig.ident))
 
+    # Populate with filter statistics
+    filter_stats <- list(
+      before = calc_filter_stats(seurat_obj, tmp_sample),
+      after = calc_filter_stats(seurat_obj.filtered, tmp_sample)
+    )
+
+    guidata[generate_gui_uuid(sample_id, task_name, 1)] <- filter_stats
+    print("Filter statistics for sample before/after filter:")
+    str(filter_stats)
+
     # the result object will have to conform to this format: {data, config, plotData : {plot1}}
     result <- list(
         data = seurat_obj.filtered,
         config = config,
-        plotData = plots
+        plotData = guidata
     )
 
     return(result)

diff --git a/pipeline-runner/src/utils.r b/pipeline-runner/src/utils.r
@@ -1,14 +1,13 @@
 #
-# Generate plot uuid as required by the UI
+# Generate GUI plots and data uuid as required by the UI
 # 
-
-generate_plotuuid <- function(sample_uuid, task_name, plot_idx) {
+generate_gui_uuid <- function(sample_uuid, task_name, item_idx) {
 
   if(sample_uuid != "") {
-    return(paste(sample_uuid, task_name, plot_idx, sep="-"))
+    return(paste(sample_uuid, task_name, item_idx, sep="-"))
   }
 
-  return(paste(task_name, plot_idx, sep="-"))
+  return(paste(task_name, item_idx, sep="-"))
 
 }
 
@@ -54,4 +53,33 @@ handle_debug <- function(scdata, config, task_name, sample_id, debug_config) {
         save(seurat_obj, config, task_name, sample_id, num_cells_to_downsample, file = fpath_cont)
         message(sprintf("⚠️ RUN load('%s') to restore environment.", fpath_host))
     }
-}
+}
+
+
+#' Calculates statistics before/after filter step
+#'
+#' @param seurat_obj \code{SeuratObject}
+#' @param tmp_sample sample name in \code{[email protected]} to compute statistics for
+#'
+#' @return list with \itemize{
+#'   \item{"num_cells"}{Number of cells in sample}
+#'   \item{"total_genes"}{Number of detected genes in sample}
+#'   \item{"median_genes"}{Median number of genes detected per cell}
+#'   \item{"median_umis"}{Median number of counts per cell}
+#' }
+#' 
+calc_filter_stats <- function(seurat_obj, tmp_sample) {
+
+  # subset to current sample
+  scdata <- seurat_obj[, seurat_obj$orig.ident == tmp_sample]
+
+  # number of counts per gene
+  ncount <- Matrix::rowSums(scdata[['RNA']]@counts)
+
+  list(
+    num_cells = ncol(scdata),
+    total_genes = sum(ncount > 0),
+    median_genes = median(scdata$nFeature_RNA),
+    median_umis = median(scdata$nCount_RNA)
+  )
+}