checked for a suitable binarization cutoff one

moi-taiga · May 1, 2024 · 8b5ee8b · 8b5ee8b
1 parent 8430821
commit 8b5ee8b
Showing 1 changed file with 364 additions and 0 deletions.
diff --git a/vignettes/blastocyst.rmd b/vignettes/blastocyst.rmd
@@ -0,0 +1,364 @@
+---
+title: "blastocyst"
+author: "Moi T. Nicholas"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{blastocyst}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+```
+
+This vignette will take you through running PPR. \
+The data used here is an Intregrated dataset of multiple blastocyst datsets [Here](http://). \
+\
+\
+
+
+# Data Pre-procesing
+
+#### Load neccecary packages
+```{r eval=FALSE}
+library(Seurat)
+library(ggplot2)
+library(slingshot)
+library(RColorBrewer)
+library(GeneSwitches)
+# library(GeneSwitchesScorer)
+ library(SingleCellExperiment)
+```
+
+## SEURAT
+
+### Here we use this Reprogramming dataset as the reference 
+```{r eval=FALSE}
+seu <- readRDS("/mainfs/ddnb/PathPinpointR/blastocyst/blastocyst.rds")
+```
+
+#### View the reference UMAP
+```{r eval=FALSE}
+DimPlot(object = seu, reduction = "umap", group.by = "orig.ident" ,label = T) +
+  ggtitle("Reference")
+DimPlot(object = seu, reduction = "umap", group.by = "seurat_clusters" ,label = T) +
+  ggtitle("Reference")
+```
+
+### QC
+```{r eval=FALSE}
+# 
+#seu[["percent.mt"]] <- PercentageFeatureSet(seu, pattern = "MT-")
+
+# Visualize QC metrics as a violin plot
+#VlnPlot(seu, features = c("nFeature_RNA", "nCount_RNA", "pctMT"), ncol = 3)
+
+
+```
+
+### QC
+```{r}
+#seu <- subset(seu, subset = nFeature_RNA > 2500 & pctMT < 10 & nCount_RNA < 1e+07)
+
+# Visualize QC metrics as a violin plot
+#VlnPlot(seu, features = c("nFeature_RNA", "nCount_RNA", "pctMT"), ncol = 3)
+
+```
+
+### We use subsets of the Reprogramming dataset as queries.
+```{r eval=FALSE}
+Mole21a.seu <- subset(x = seu, subset = orig.ident %in% "Mole21a")
+```
+
+
+### Label transfer & Re-integration
+####### *When doing this analysis with your own data you would more than likely need to label transfer with your referenece datatset.*
+####### *As we are using subsets of our reference data as "samples" this wont be necessary.*
+,
+
+## SingeCellExperiment
+
+
+### Convert objects to SingleCellExperiment objects
+```{r eval=FALSE}
+sce    <- SingleCellExperiment(assays = list(expdata = seu@assays$RNA$counts))
+colData(sce) <- DataFrame([email protected])
+reducedDims(sce)$UMAP <- seu@[email protected]
+Mole21a.sce   <- SingleCellExperiment(assays = list(expdata = Mole21a.seu@assays$RNA$counts))
+```
+
+
+## Slingshot
+
+### Run slingshot on the reference data to produce a reprogramming trajectory.
+```{r eval=FALSE}
+sce  <- slingshot(sce,
+                            clusterLabels = "seurat_clusters",
+                            start.clus  = "2",
+                            end.clus = "1",
+                            reducedDim = "UMAP")
+```
+
+### Plot the slingshot trajectory.
+```{r eval=FALSE}
+colors <- colorRampPalette(brewer.pal(11,'Spectral')[-6])(100)
+plotcol <- colors[cut(sce$slingPseudotime_1, breaks=100)]
+plot(reducedDims(sce)$UMAP, col = plotcol, pch=16, asp = 1)
+lines(SlingshotDataSet(sce), lwd=2, col='black')
+```
+
+
+## GeneSwitches
+###  Choose a Binerization cutoff
+###### **this would be good to automate, 
+###### or learn how to use the other method of binarizing
+```{r eval=FALSE}
+# hist(as.matrix(assays(sce)$expdata),
+#            breaks = 1e6)
+#  hist(as.matrix(seu@assays$RNA$counts), breaks = 1e6)
+#  
+#  hist(as.matrix(assays(sce)$expdata),
+#            breaks = 1e6,
+#            xlim = c(0, 50))
+#  
+#  hist(as.matrix(seu@assays$RNA$counts), breaks = 1e6, xlim =c(0, 50))
+#  
+#  trimmed_expdata <- assays(sce)$expdata[assays(sce)$expdata < 10]
+#  hist(as.matrix(trimmed_expdata))
+#  abline(v=1, col = "blue")
+#  
+#  trimmed_expdata <- seu@assays$RNA$counts[seu@assays$RNA$counts < 10]
+#  hist(as.matrix(trimmed_expdata))
+#  abline(v=1, col = "blue")
+#cutoff =1
+ 
+```
+
+### Binarize
+```{r eval=FALSE}
+sce    <- binarize_exp(sce, fix_cutoff = TRUE, binarize_cutoff = 1, ncores = 64)
+Mole21a.sce   <- binarize_exp(Mole21a.sce, fix_cutoff = TRUE, binarize_cutoff = 1, ncores = 64)
+#sce <- binarize_exp(sce, ncores = 64)
+#Mole21a.sce   <- binarize_exp(Mole21a.sce, ncores = 64)
+```
+
+### CHECKPOINT
+##### **with how long binerizing takes you may want to choose to save objects for future use.
+```{r eval=FALSE}
+ saveRDS(sce   , "~/R Packages/mini_data/Binerized/reference.rds")
+ saveRDS(Mole21a.sce  , "~/R Packages/mini_data/Binerized/fibroblast.rds")
+
+ sce    <- readRDS("~/R Packages/mini_data/Binerized/reference.rds")
+ Mole21a.sce   <- readRDS("~/R Packages/mini_data/Binerized/fibroblast.rds")
+ 
+```
+
+### fit logistic regression and find the switching pseudo-time point for each gene
+```{r eval=FALSE}
+sce <- find_switch_logistic_fastglm(sce, downsample = FALSE, show_warning = FALSE)
+```
+
+#### **This is another time consuming process, may want to choose to save objects for future use.**
+```{r eval=FALSE}
+saveRDS(reference_glm.gs   , "~/R Packages/mini_data/Binerized/reference_glm.rds")
+sce    <- readRDS("~/R Packages/mini_data/Binerized/reference_glm.rds")
+```
+
+### Filter to only include Switching Genes
+```{r eval=FALSE}
+reference.sg <- filter_switchgenes(sce, allgenes = TRUE,r2cutoff = 0.03)
+```
+
+##### View all of the switching genes
+```{r eval=FALSE}
+plot_timeline_ggplot(reference.sg, timedata = colData(reference_glm.gs)$Pseudotime, txtsize = 3)
+```
+
+# Using GeneSwitchesScorer
+
+### Filter for an even distribution
+```{r eval=FALSE}
+gss.sg <- select_evenly_distributed_switching_genes(reference.sg, min_time_spacing = 5)
+```
+
+##### View the selected switching genes
+```{r eval=FALSE}
+plot_timeline_ggplot(gss.sg, timedata = colData(reference_glm.gs)$Pseudotime, txtsize = 3)
+```
+
+### Reduce the binary counts matricies of the query data to only include the selection of evenly distributed genes from the refernence.
+```{r eval=FALSE}
+fibroblast_reduced      <- filter_gene_expression_for_switching_genes(Mole21a.sce@assays@data@listData$binary   , gss.sg)
+mixed_reduced           <- filter_gene_expression_for_switching_genes(mixed.gs@assays@data@listData$binary        , gss.sg)
+early_primed_reduced    <- filter_gene_expression_for_switching_genes(early_primed.gs@assays@data@listData$binary , gss.sg)
+primed_reduced          <- filter_gene_expression_for_switching_genes(primed.gs@assays@data@listData$binary       , gss.sg)
+reference_reduced       <- filter_gene_expression_for_switching_genes(sce@assays@data@listData$binary    , gss.sg)
+```
+
+### Produce an estimate for the position on trajectory of each gene in each cell of a sample.
+```{r eval=FALSE}
+Mole21a.sces   <- create_racing_lines(fibroblast_reduced  , gss.sg)
+mixed.gss        <- create_racing_lines(mixed_reduced       , gss.sg)
+early_primed.gss <- create_racing_lines(early_primed_reduced, gss.sg)
+primed.gss       <- create_racing_lines(primed_reduced      , gss.sg)
+sces    <- create_racing_lines(reference_reduced   , gss.sg)
+```
+
+### Accuracy
+#### *As our samples are subsets of the reference dataset we can calculate the accuracy of GSS*
+```{r eval=FALSE}
+fibroblast_accuracy   <- score_gss_accuracy(Mole21a.sces, sce)
+mixed_accuracy        <- score_gss_accuracy(mixed.gss, sce)
+early_primed_accuracy <- score_gss_accuracy(early_primed.gss, sce)
+primed_accuracy       <- score_gss_accuracy(primed.gss, sce)
+reference_accuracy    <- score_gss_accuracy(sces, sce)
+```
+
+## Plotting
+
+### Plot the predicted position of each sample:
+#### *Optional: include the switching point of some genes of interest:*
+```{r eval=FALSE}
+gss_output_plot(Mole21a.sces, col = "red", overlay=FALSE, label = "fibroblast", genes_of_interest = c("ENSG00000248605","ENSG00000272168"))
+gss_output_plot(mixed.gss, col = "peachpuff4", overlay=TRUE, label = "mixed")
+gss_output_plot(early_primed.gss, col = "purple", overlay=TRUE, label = "early primed")
+gss_output_plot(primed.gss, col = "darkgreen", overlay=TRUE, label = "primed" , genes_of_interest = c("ENSG00000236924"))
+```
+
+## Further analysis of the results
+
+### Investigate the predicted position of indervidual cells:
+
+#### Plot the preducted position of all cells in a Sample
+###### *only run this on a selection of cells or a small sample*
+```{r eval=FALSE}
+for (c in 1:nrow(sc.gss$cells_flat)) {
+  plot(x = 1:100, y = sc.gss$cells_flat[c,], type = "h",
+       xlab = "Pseudotime Index", ylab = "Cell Position Likelihood",
+       main = paste("Trajectory Progress of", rownames(sc.gss$cells_flat)[c]))
+  segments(which.max(sc.gss$cells_flat[c,]),
+           0.0,
+           which.max(sc.gss$cells_flat[c,]),
+           as.numeric(sc.gss$cells_flat[c,which.max(sc.gss$cells_flat[c,])]),
+           lwd=2,
+           col ="red")
+
+}
+```
+
+#### Plot the "racing lines" for indervidual cells.
+```{r eval=FALSE}
+racinglines_timeline(reference.sg, reference_reduced, cell_idx = 1)
+```
+
+### Plot the distribution of inaccuracy of each sample:
+```{r eval=FALSE}
+hist(fibroblast_accuracy$inaccuracy  , breaks = 100)
+hist(mixed_accuracy$inaccuracy       , breaks = 100)
+hist(early_primed_accuracy$inaccuracy, breaks = 100)
+hist(primed_accuracy$inaccuracy      , breaks = 100)
+hist(reference_accuracy$inaccuracy   , breaks = 100)
+```
+
+### Investigate the predicted position of indervidual cells:
+```{r eval=FALSE}
+#
+plot(fibroblast_accuracy$true_position_of_cells_timeIDX, fibroblast_accuracy$inaccuracy,
+     xlab = "True Position", ylab = "Inaccuracy", main = "Inaccuracy by True Positions")
+
+#
+boxplot(fibroblast_accuracy$inaccuracy ~ fibroblast_accuracy$true_position_of_cells_timeIDX,
+        xlab = "True Position", ylab = "Inaccuracy", main = "Boxplots of Inaccuracy by True Positions")
+true_position_bin <- cut(fibroblast_accuracy$true_position_of_cells_timeIDX,
+                             breaks = c(0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80.85,90,95,100))
+boxplot(fibroblast_accuracy$inaccuracy ~ true_position_bin,
+        xlab = "True Position", ylab = "Inaccuracy", main = "Boxplots of Inaccuracy by True Positions")
+
+#
+plot(fibroblast_accuracy$true_position_of_cells_timeIDX,fibroblast_accuracy$predicted_position_of_cells_timeIDX,
+     xlab = "True Position", ylab = "Predicted Position", main = "Predicted Positions by True Positions")
+segments(0,0,100,100, lwd=2, col ="green")
+segments(0,10,100,110, lwd=2, col ="blue")
+segments(0,-10,100,90, lwd=2, col ="blue")
+segments(0,40,100,140, lwd=2, col ="red")
+segments(0,-40,100,60, lwd=2, col ="red")
+
+#
+boxplot(fibroblast_accuracy$predicted_position_of_cells_timeIDX ~ fibroblast_accuracy$true_position_of_cells_timeIDX,
+        xlab = "True Position", ylab = "Predicted Position", main = "Boxplots of Predicted Positions by True Positions")
+segments(0,0,100,100, lwd=2, col ="green")
+segments(0,10,100,110, lwd=2, col ="blue")
+segments(0,-10,100,90, lwd=2, col ="blue")
+segments(0,40,100,140, lwd=2, col ="red")
+segments(0,-40,100,60, lwd=2, col ="red")
+```
+
+### Investigate the precision of min_time_spacing values:
+```{r eval=FALSE}
+precision <- data.frame(
+  min_time_spacing = 0:100,
+  n_genes          = NA,                                                   # Number of genes after min_time_spacing
+  accuracy_min     = NA,                                                   # Placeholder for accuracy minimum values
+  accuracy_1       = NA,
+  accuracy_median  = NA,
+  accuracy_mean    = NA,
+  accuracy_3       = NA,
+  accuracy_max     = NA
+  )
+
+
+
+for (i in 0:100){
+# Filter for an even distribution
+gss_genes <- select_evenly_distributed_switching_genes(sce, min_time_spacing = i)
+
+# Reduce the binary counts matricies of the query data to only include the selection of evenly distributed genes from the refernence.
+sample_reduced      <- filter_gene_expression_for_switching_genes(sample.gs@assays@data@listData$binary   , gss_genes)
+
+#
+sample.gss <- create_racing_lines(sample_reduced, gss_genes)
+
+#
+accuracy <- score_gss_accuracy(sample.gss, sce)
+
+#
+precision$n_genes[i+1]          <- dim(gss_genes)[1]
+precision$accuracy_min[i+1]     <- summary(accuracy$accuracy)[1]
+precision$accuracy_1[i+1]       <- summary(accuracy$accuracy)[2]
+precision$accuracy_median[i+1]  <- summary(accuracy$accuracy)[3]
+precision$accuracy_mean[i+1]    <- summary(accuracy$accuracy)[4]
+precision$accuracy_3[i+1]       <- summary(accuracy$accuracy)[5]
+precision$accuracy_max[i+1]     <- summary(accuracy$accuracy)[6]
+
+#
+cat(i," done \n")
+}
+
+
+ggplot(data = precision, mapping = aes(x = min_time_spacing)) +
+  geom_point(aes(y = accuracy_min), color = "lightblue") +
+  geom_point(aes(y = accuracy_1), color = "lightpink") +
+  geom_point(aes(y = accuracy_median), color = "red") +
+  geom_point(aes(y = accuracy_mean), color = "blue") +
+  geom_point(aes(y = accuracy_3), color = "darkgreen") +
+  geom_point(aes(y = accuracy_max), color = "lightblue") +
+  ylab("innacuracy")
+
+ggplot(data = precision, mapping = aes(x = n_genes)) +
+  geom_point(aes(y = accuracy_min), color = "lightblue") +
+  geom_point(aes(y = accuracy_1), color = "lightpink") +
+  geom_point(aes(y = accuracy_median), color = "red") +
+  geom_point(aes(y = accuracy_mean), color = "blue") +
+  geom_point(aes(y = accuracy_3), color = "darkgreen") +
+  geom_point(aes(y = accuracy_max), color = "lightblue") +
+  ylab("innacuracy") + scale_x_log10()
+
+
+ggplot(data = precision, mapping = aes(x = min_time_spacing)) +
+  geom_point(aes(y = n_genes), color = "red") +
+  scale_y_log10()
+```