-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
checked for a suitable binarization cutoff one
- Loading branch information
Showing
1 changed file
with
364 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,364 @@ | ||
--- | ||
title: "blastocyst" | ||
author: "Moi T. Nicholas" | ||
output: rmarkdown::html_vignette | ||
vignette: > | ||
%\VignetteIndexEntry{blastocyst} | ||
%\VignetteEngine{knitr::rmarkdown} | ||
%\VignetteEncoding{UTF-8} | ||
--- | ||
|
||
```{r, include = FALSE} | ||
knitr::opts_chunk$set( | ||
collapse = TRUE, | ||
comment = "#>" | ||
) | ||
``` | ||
|
||
This vignette will take you through running PPR. \ | ||
The data used here is an Intregrated dataset of multiple blastocyst datsets [Here](http://). \ | ||
\ | ||
\ | ||
|
||
|
||
# Data Pre-procesing | ||
|
||
#### Load neccecary packages | ||
```{r eval=FALSE} | ||
library(Seurat) | ||
library(ggplot2) | ||
library(slingshot) | ||
library(RColorBrewer) | ||
library(GeneSwitches) | ||
# library(GeneSwitchesScorer) | ||
library(SingleCellExperiment) | ||
``` | ||
|
||
## SEURAT | ||
|
||
### Here we use this Reprogramming dataset as the reference | ||
```{r eval=FALSE} | ||
seu <- readRDS("/mainfs/ddnb/PathPinpointR/blastocyst/blastocyst.rds") | ||
``` | ||
|
||
#### View the reference UMAP | ||
```{r eval=FALSE} | ||
DimPlot(object = seu, reduction = "umap", group.by = "orig.ident" ,label = T) + | ||
ggtitle("Reference") | ||
DimPlot(object = seu, reduction = "umap", group.by = "seurat_clusters" ,label = T) + | ||
ggtitle("Reference") | ||
``` | ||
|
||
### QC | ||
```{r eval=FALSE} | ||
# | ||
#seu[["percent.mt"]] <- PercentageFeatureSet(seu, pattern = "MT-") | ||
# Visualize QC metrics as a violin plot | ||
#VlnPlot(seu, features = c("nFeature_RNA", "nCount_RNA", "pctMT"), ncol = 3) | ||
``` | ||
|
||
### QC | ||
```{r} | ||
#seu <- subset(seu, subset = nFeature_RNA > 2500 & pctMT < 10 & nCount_RNA < 1e+07) | ||
# Visualize QC metrics as a violin plot | ||
#VlnPlot(seu, features = c("nFeature_RNA", "nCount_RNA", "pctMT"), ncol = 3) | ||
``` | ||
|
||
### We use subsets of the Reprogramming dataset as queries. | ||
```{r eval=FALSE} | ||
Mole21a.seu <- subset(x = seu, subset = orig.ident %in% "Mole21a") | ||
``` | ||
|
||
|
||
### Label transfer & Re-integration | ||
####### *When doing this analysis with your own data you would more than likely need to label transfer with your referenece datatset.* | ||
####### *As we are using subsets of our reference data as "samples" this wont be necessary.* | ||
, | ||
|
||
## SingeCellExperiment | ||
|
||
|
||
### Convert objects to SingleCellExperiment objects | ||
```{r eval=FALSE} | ||
sce <- SingleCellExperiment(assays = list(expdata = seu@assays$RNA$counts)) | ||
colData(sce) <- DataFrame([email protected]) | ||
reducedDims(sce)$UMAP <- seu@[email protected] | ||
Mole21a.sce <- SingleCellExperiment(assays = list(expdata = Mole21a.seu@assays$RNA$counts)) | ||
``` | ||
|
||
|
||
## Slingshot | ||
|
||
### Run slingshot on the reference data to produce a reprogramming trajectory. | ||
```{r eval=FALSE} | ||
sce <- slingshot(sce, | ||
clusterLabels = "seurat_clusters", | ||
start.clus = "2", | ||
end.clus = "1", | ||
reducedDim = "UMAP") | ||
``` | ||
|
||
### Plot the slingshot trajectory. | ||
```{r eval=FALSE} | ||
colors <- colorRampPalette(brewer.pal(11,'Spectral')[-6])(100) | ||
plotcol <- colors[cut(sce$slingPseudotime_1, breaks=100)] | ||
plot(reducedDims(sce)$UMAP, col = plotcol, pch=16, asp = 1) | ||
lines(SlingshotDataSet(sce), lwd=2, col='black') | ||
``` | ||
|
||
|
||
## GeneSwitches | ||
### Choose a Binerization cutoff | ||
###### **this would be good to automate, | ||
###### or learn how to use the other method of binarizing | ||
```{r eval=FALSE} | ||
# hist(as.matrix(assays(sce)$expdata), | ||
# breaks = 1e6) | ||
# hist(as.matrix(seu@assays$RNA$counts), breaks = 1e6) | ||
# | ||
# hist(as.matrix(assays(sce)$expdata), | ||
# breaks = 1e6, | ||
# xlim = c(0, 50)) | ||
# | ||
# hist(as.matrix(seu@assays$RNA$counts), breaks = 1e6, xlim =c(0, 50)) | ||
# | ||
# trimmed_expdata <- assays(sce)$expdata[assays(sce)$expdata < 10] | ||
# hist(as.matrix(trimmed_expdata)) | ||
# abline(v=1, col = "blue") | ||
# | ||
# trimmed_expdata <- seu@assays$RNA$counts[seu@assays$RNA$counts < 10] | ||
# hist(as.matrix(trimmed_expdata)) | ||
# abline(v=1, col = "blue") | ||
#cutoff =1 | ||
``` | ||
|
||
### Binarize | ||
```{r eval=FALSE} | ||
sce <- binarize_exp(sce, fix_cutoff = TRUE, binarize_cutoff = 1, ncores = 64) | ||
Mole21a.sce <- binarize_exp(Mole21a.sce, fix_cutoff = TRUE, binarize_cutoff = 1, ncores = 64) | ||
#sce <- binarize_exp(sce, ncores = 64) | ||
#Mole21a.sce <- binarize_exp(Mole21a.sce, ncores = 64) | ||
``` | ||
|
||
### CHECKPOINT | ||
##### **with how long binerizing takes you may want to choose to save objects for future use. | ||
```{r eval=FALSE} | ||
saveRDS(sce , "~/R Packages/mini_data/Binerized/reference.rds") | ||
saveRDS(Mole21a.sce , "~/R Packages/mini_data/Binerized/fibroblast.rds") | ||
sce <- readRDS("~/R Packages/mini_data/Binerized/reference.rds") | ||
Mole21a.sce <- readRDS("~/R Packages/mini_data/Binerized/fibroblast.rds") | ||
``` | ||
|
||
### fit logistic regression and find the switching pseudo-time point for each gene | ||
```{r eval=FALSE} | ||
sce <- find_switch_logistic_fastglm(sce, downsample = FALSE, show_warning = FALSE) | ||
``` | ||
|
||
#### **This is another time consuming process, may want to choose to save objects for future use.** | ||
```{r eval=FALSE} | ||
saveRDS(reference_glm.gs , "~/R Packages/mini_data/Binerized/reference_glm.rds") | ||
sce <- readRDS("~/R Packages/mini_data/Binerized/reference_glm.rds") | ||
``` | ||
|
||
### Filter to only include Switching Genes | ||
```{r eval=FALSE} | ||
reference.sg <- filter_switchgenes(sce, allgenes = TRUE,r2cutoff = 0.03) | ||
``` | ||
|
||
##### View all of the switching genes | ||
```{r eval=FALSE} | ||
plot_timeline_ggplot(reference.sg, timedata = colData(reference_glm.gs)$Pseudotime, txtsize = 3) | ||
``` | ||
|
||
# Using GeneSwitchesScorer | ||
|
||
### Filter for an even distribution | ||
```{r eval=FALSE} | ||
gss.sg <- select_evenly_distributed_switching_genes(reference.sg, min_time_spacing = 5) | ||
``` | ||
|
||
##### View the selected switching genes | ||
```{r eval=FALSE} | ||
plot_timeline_ggplot(gss.sg, timedata = colData(reference_glm.gs)$Pseudotime, txtsize = 3) | ||
``` | ||
|
||
### Reduce the binary counts matricies of the query data to only include the selection of evenly distributed genes from the refernence. | ||
```{r eval=FALSE} | ||
fibroblast_reduced <- filter_gene_expression_for_switching_genes(Mole21a.sce@assays@data@listData$binary , gss.sg) | ||
mixed_reduced <- filter_gene_expression_for_switching_genes(mixed.gs@assays@data@listData$binary , gss.sg) | ||
early_primed_reduced <- filter_gene_expression_for_switching_genes(early_primed.gs@assays@data@listData$binary , gss.sg) | ||
primed_reduced <- filter_gene_expression_for_switching_genes(primed.gs@assays@data@listData$binary , gss.sg) | ||
reference_reduced <- filter_gene_expression_for_switching_genes(sce@assays@data@listData$binary , gss.sg) | ||
``` | ||
|
||
### Produce an estimate for the position on trajectory of each gene in each cell of a sample. | ||
```{r eval=FALSE} | ||
Mole21a.sces <- create_racing_lines(fibroblast_reduced , gss.sg) | ||
mixed.gss <- create_racing_lines(mixed_reduced , gss.sg) | ||
early_primed.gss <- create_racing_lines(early_primed_reduced, gss.sg) | ||
primed.gss <- create_racing_lines(primed_reduced , gss.sg) | ||
sces <- create_racing_lines(reference_reduced , gss.sg) | ||
``` | ||
|
||
### Accuracy | ||
#### *As our samples are subsets of the reference dataset we can calculate the accuracy of GSS* | ||
```{r eval=FALSE} | ||
fibroblast_accuracy <- score_gss_accuracy(Mole21a.sces, sce) | ||
mixed_accuracy <- score_gss_accuracy(mixed.gss, sce) | ||
early_primed_accuracy <- score_gss_accuracy(early_primed.gss, sce) | ||
primed_accuracy <- score_gss_accuracy(primed.gss, sce) | ||
reference_accuracy <- score_gss_accuracy(sces, sce) | ||
``` | ||
|
||
## Plotting | ||
|
||
### Plot the predicted position of each sample: | ||
#### *Optional: include the switching point of some genes of interest:* | ||
```{r eval=FALSE} | ||
gss_output_plot(Mole21a.sces, col = "red", overlay=FALSE, label = "fibroblast", genes_of_interest = c("ENSG00000248605","ENSG00000272168")) | ||
gss_output_plot(mixed.gss, col = "peachpuff4", overlay=TRUE, label = "mixed") | ||
gss_output_plot(early_primed.gss, col = "purple", overlay=TRUE, label = "early primed") | ||
gss_output_plot(primed.gss, col = "darkgreen", overlay=TRUE, label = "primed" , genes_of_interest = c("ENSG00000236924")) | ||
``` | ||
|
||
## Further analysis of the results | ||
|
||
### Investigate the predicted position of indervidual cells: | ||
|
||
#### Plot the preducted position of all cells in a Sample | ||
###### *only run this on a selection of cells or a small sample* | ||
```{r eval=FALSE} | ||
for (c in 1:nrow(sc.gss$cells_flat)) { | ||
plot(x = 1:100, y = sc.gss$cells_flat[c,], type = "h", | ||
xlab = "Pseudotime Index", ylab = "Cell Position Likelihood", | ||
main = paste("Trajectory Progress of", rownames(sc.gss$cells_flat)[c])) | ||
segments(which.max(sc.gss$cells_flat[c,]), | ||
0.0, | ||
which.max(sc.gss$cells_flat[c,]), | ||
as.numeric(sc.gss$cells_flat[c,which.max(sc.gss$cells_flat[c,])]), | ||
lwd=2, | ||
col ="red") | ||
} | ||
``` | ||
|
||
#### Plot the "racing lines" for indervidual cells. | ||
```{r eval=FALSE} | ||
racinglines_timeline(reference.sg, reference_reduced, cell_idx = 1) | ||
``` | ||
|
||
### Plot the distribution of inaccuracy of each sample: | ||
```{r eval=FALSE} | ||
hist(fibroblast_accuracy$inaccuracy , breaks = 100) | ||
hist(mixed_accuracy$inaccuracy , breaks = 100) | ||
hist(early_primed_accuracy$inaccuracy, breaks = 100) | ||
hist(primed_accuracy$inaccuracy , breaks = 100) | ||
hist(reference_accuracy$inaccuracy , breaks = 100) | ||
``` | ||
|
||
### Investigate the predicted position of indervidual cells: | ||
```{r eval=FALSE} | ||
# | ||
plot(fibroblast_accuracy$true_position_of_cells_timeIDX, fibroblast_accuracy$inaccuracy, | ||
xlab = "True Position", ylab = "Inaccuracy", main = "Inaccuracy by True Positions") | ||
# | ||
boxplot(fibroblast_accuracy$inaccuracy ~ fibroblast_accuracy$true_position_of_cells_timeIDX, | ||
xlab = "True Position", ylab = "Inaccuracy", main = "Boxplots of Inaccuracy by True Positions") | ||
true_position_bin <- cut(fibroblast_accuracy$true_position_of_cells_timeIDX, | ||
breaks = c(0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80.85,90,95,100)) | ||
boxplot(fibroblast_accuracy$inaccuracy ~ true_position_bin, | ||
xlab = "True Position", ylab = "Inaccuracy", main = "Boxplots of Inaccuracy by True Positions") | ||
# | ||
plot(fibroblast_accuracy$true_position_of_cells_timeIDX,fibroblast_accuracy$predicted_position_of_cells_timeIDX, | ||
xlab = "True Position", ylab = "Predicted Position", main = "Predicted Positions by True Positions") | ||
segments(0,0,100,100, lwd=2, col ="green") | ||
segments(0,10,100,110, lwd=2, col ="blue") | ||
segments(0,-10,100,90, lwd=2, col ="blue") | ||
segments(0,40,100,140, lwd=2, col ="red") | ||
segments(0,-40,100,60, lwd=2, col ="red") | ||
# | ||
boxplot(fibroblast_accuracy$predicted_position_of_cells_timeIDX ~ fibroblast_accuracy$true_position_of_cells_timeIDX, | ||
xlab = "True Position", ylab = "Predicted Position", main = "Boxplots of Predicted Positions by True Positions") | ||
segments(0,0,100,100, lwd=2, col ="green") | ||
segments(0,10,100,110, lwd=2, col ="blue") | ||
segments(0,-10,100,90, lwd=2, col ="blue") | ||
segments(0,40,100,140, lwd=2, col ="red") | ||
segments(0,-40,100,60, lwd=2, col ="red") | ||
``` | ||
|
||
### Investigate the precision of min_time_spacing values: | ||
```{r eval=FALSE} | ||
precision <- data.frame( | ||
min_time_spacing = 0:100, | ||
n_genes = NA, # Number of genes after min_time_spacing | ||
accuracy_min = NA, # Placeholder for accuracy minimum values | ||
accuracy_1 = NA, | ||
accuracy_median = NA, | ||
accuracy_mean = NA, | ||
accuracy_3 = NA, | ||
accuracy_max = NA | ||
) | ||
for (i in 0:100){ | ||
# Filter for an even distribution | ||
gss_genes <- select_evenly_distributed_switching_genes(sce, min_time_spacing = i) | ||
# Reduce the binary counts matricies of the query data to only include the selection of evenly distributed genes from the refernence. | ||
sample_reduced <- filter_gene_expression_for_switching_genes(sample.gs@assays@data@listData$binary , gss_genes) | ||
# | ||
sample.gss <- create_racing_lines(sample_reduced, gss_genes) | ||
# | ||
accuracy <- score_gss_accuracy(sample.gss, sce) | ||
# | ||
precision$n_genes[i+1] <- dim(gss_genes)[1] | ||
precision$accuracy_min[i+1] <- summary(accuracy$accuracy)[1] | ||
precision$accuracy_1[i+1] <- summary(accuracy$accuracy)[2] | ||
precision$accuracy_median[i+1] <- summary(accuracy$accuracy)[3] | ||
precision$accuracy_mean[i+1] <- summary(accuracy$accuracy)[4] | ||
precision$accuracy_3[i+1] <- summary(accuracy$accuracy)[5] | ||
precision$accuracy_max[i+1] <- summary(accuracy$accuracy)[6] | ||
# | ||
cat(i," done \n") | ||
} | ||
ggplot(data = precision, mapping = aes(x = min_time_spacing)) + | ||
geom_point(aes(y = accuracy_min), color = "lightblue") + | ||
geom_point(aes(y = accuracy_1), color = "lightpink") + | ||
geom_point(aes(y = accuracy_median), color = "red") + | ||
geom_point(aes(y = accuracy_mean), color = "blue") + | ||
geom_point(aes(y = accuracy_3), color = "darkgreen") + | ||
geom_point(aes(y = accuracy_max), color = "lightblue") + | ||
ylab("innacuracy") | ||
ggplot(data = precision, mapping = aes(x = n_genes)) + | ||
geom_point(aes(y = accuracy_min), color = "lightblue") + | ||
geom_point(aes(y = accuracy_1), color = "lightpink") + | ||
geom_point(aes(y = accuracy_median), color = "red") + | ||
geom_point(aes(y = accuracy_mean), color = "blue") + | ||
geom_point(aes(y = accuracy_3), color = "darkgreen") + | ||
geom_point(aes(y = accuracy_max), color = "lightblue") + | ||
ylab("innacuracy") + scale_x_log10() | ||
ggplot(data = precision, mapping = aes(x = min_time_spacing)) + | ||
geom_point(aes(y = n_genes), color = "red") + | ||
scale_y_log10() | ||
``` |