Skip to content

Commit

Permalink
checked for a suitable binarization cutoff one
Browse files Browse the repository at this point in the history
  • Loading branch information
moi-taiga committed May 1, 2024
1 parent 8430821 commit 8b5ee8b
Showing 1 changed file with 364 additions and 0 deletions.
364 changes: 364 additions & 0 deletions vignettes/blastocyst.rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,364 @@
---
title: "blastocyst"
author: "Moi T. Nicholas"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{blastocyst}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
```

This vignette will take you through running PPR. \
The data used here is an Intregrated dataset of multiple blastocyst datsets [Here](http://). \
\
\


# Data Pre-procesing

#### Load neccecary packages
```{r eval=FALSE}
library(Seurat)
library(ggplot2)
library(slingshot)
library(RColorBrewer)
library(GeneSwitches)
# library(GeneSwitchesScorer)
library(SingleCellExperiment)
```

## SEURAT

### Here we use this Reprogramming dataset as the reference
```{r eval=FALSE}
seu <- readRDS("/mainfs/ddnb/PathPinpointR/blastocyst/blastocyst.rds")
```

#### View the reference UMAP
```{r eval=FALSE}
DimPlot(object = seu, reduction = "umap", group.by = "orig.ident" ,label = T) +
ggtitle("Reference")
DimPlot(object = seu, reduction = "umap", group.by = "seurat_clusters" ,label = T) +
ggtitle("Reference")
```

### QC
```{r eval=FALSE}
#
#seu[["percent.mt"]] <- PercentageFeatureSet(seu, pattern = "MT-")
# Visualize QC metrics as a violin plot
#VlnPlot(seu, features = c("nFeature_RNA", "nCount_RNA", "pctMT"), ncol = 3)
```

### QC
```{r}
#seu <- subset(seu, subset = nFeature_RNA > 2500 & pctMT < 10 & nCount_RNA < 1e+07)
# Visualize QC metrics as a violin plot
#VlnPlot(seu, features = c("nFeature_RNA", "nCount_RNA", "pctMT"), ncol = 3)
```

### We use subsets of the Reprogramming dataset as queries.
```{r eval=FALSE}
Mole21a.seu <- subset(x = seu, subset = orig.ident %in% "Mole21a")
```


### Label transfer & Re-integration
####### *When doing this analysis with your own data you would more than likely need to label transfer with your referenece datatset.*
####### *As we are using subsets of our reference data as "samples" this wont be necessary.*
,

## SingeCellExperiment


### Convert objects to SingleCellExperiment objects
```{r eval=FALSE}
sce <- SingleCellExperiment(assays = list(expdata = seu@assays$RNA$counts))
colData(sce) <- DataFrame([email protected])
reducedDims(sce)$UMAP <- seu@[email protected]
Mole21a.sce <- SingleCellExperiment(assays = list(expdata = Mole21a.seu@assays$RNA$counts))
```


## Slingshot

### Run slingshot on the reference data to produce a reprogramming trajectory.
```{r eval=FALSE}
sce <- slingshot(sce,
clusterLabels = "seurat_clusters",
start.clus = "2",
end.clus = "1",
reducedDim = "UMAP")
```

### Plot the slingshot trajectory.
```{r eval=FALSE}
colors <- colorRampPalette(brewer.pal(11,'Spectral')[-6])(100)
plotcol <- colors[cut(sce$slingPseudotime_1, breaks=100)]
plot(reducedDims(sce)$UMAP, col = plotcol, pch=16, asp = 1)
lines(SlingshotDataSet(sce), lwd=2, col='black')
```


## GeneSwitches
### Choose a Binerization cutoff
###### **this would be good to automate,
###### or learn how to use the other method of binarizing
```{r eval=FALSE}
# hist(as.matrix(assays(sce)$expdata),
# breaks = 1e6)
# hist(as.matrix(seu@assays$RNA$counts), breaks = 1e6)
#
# hist(as.matrix(assays(sce)$expdata),
# breaks = 1e6,
# xlim = c(0, 50))
#
# hist(as.matrix(seu@assays$RNA$counts), breaks = 1e6, xlim =c(0, 50))
#
# trimmed_expdata <- assays(sce)$expdata[assays(sce)$expdata < 10]
# hist(as.matrix(trimmed_expdata))
# abline(v=1, col = "blue")
#
# trimmed_expdata <- seu@assays$RNA$counts[seu@assays$RNA$counts < 10]
# hist(as.matrix(trimmed_expdata))
# abline(v=1, col = "blue")
#cutoff =1
```

### Binarize
```{r eval=FALSE}
sce <- binarize_exp(sce, fix_cutoff = TRUE, binarize_cutoff = 1, ncores = 64)
Mole21a.sce <- binarize_exp(Mole21a.sce, fix_cutoff = TRUE, binarize_cutoff = 1, ncores = 64)
#sce <- binarize_exp(sce, ncores = 64)
#Mole21a.sce <- binarize_exp(Mole21a.sce, ncores = 64)
```

### CHECKPOINT
##### **with how long binerizing takes you may want to choose to save objects for future use.
```{r eval=FALSE}
saveRDS(sce , "~/R Packages/mini_data/Binerized/reference.rds")
saveRDS(Mole21a.sce , "~/R Packages/mini_data/Binerized/fibroblast.rds")
sce <- readRDS("~/R Packages/mini_data/Binerized/reference.rds")
Mole21a.sce <- readRDS("~/R Packages/mini_data/Binerized/fibroblast.rds")
```

### fit logistic regression and find the switching pseudo-time point for each gene
```{r eval=FALSE}
sce <- find_switch_logistic_fastglm(sce, downsample = FALSE, show_warning = FALSE)
```

#### **This is another time consuming process, may want to choose to save objects for future use.**
```{r eval=FALSE}
saveRDS(reference_glm.gs , "~/R Packages/mini_data/Binerized/reference_glm.rds")
sce <- readRDS("~/R Packages/mini_data/Binerized/reference_glm.rds")
```

### Filter to only include Switching Genes
```{r eval=FALSE}
reference.sg <- filter_switchgenes(sce, allgenes = TRUE,r2cutoff = 0.03)
```

##### View all of the switching genes
```{r eval=FALSE}
plot_timeline_ggplot(reference.sg, timedata = colData(reference_glm.gs)$Pseudotime, txtsize = 3)
```

# Using GeneSwitchesScorer

### Filter for an even distribution
```{r eval=FALSE}
gss.sg <- select_evenly_distributed_switching_genes(reference.sg, min_time_spacing = 5)
```

##### View the selected switching genes
```{r eval=FALSE}
plot_timeline_ggplot(gss.sg, timedata = colData(reference_glm.gs)$Pseudotime, txtsize = 3)
```

### Reduce the binary counts matricies of the query data to only include the selection of evenly distributed genes from the refernence.
```{r eval=FALSE}
fibroblast_reduced <- filter_gene_expression_for_switching_genes(Mole21a.sce@assays@data@listData$binary , gss.sg)
mixed_reduced <- filter_gene_expression_for_switching_genes(mixed.gs@assays@data@listData$binary , gss.sg)
early_primed_reduced <- filter_gene_expression_for_switching_genes(early_primed.gs@assays@data@listData$binary , gss.sg)
primed_reduced <- filter_gene_expression_for_switching_genes(primed.gs@assays@data@listData$binary , gss.sg)
reference_reduced <- filter_gene_expression_for_switching_genes(sce@assays@data@listData$binary , gss.sg)
```

### Produce an estimate for the position on trajectory of each gene in each cell of a sample.
```{r eval=FALSE}
Mole21a.sces <- create_racing_lines(fibroblast_reduced , gss.sg)
mixed.gss <- create_racing_lines(mixed_reduced , gss.sg)
early_primed.gss <- create_racing_lines(early_primed_reduced, gss.sg)
primed.gss <- create_racing_lines(primed_reduced , gss.sg)
sces <- create_racing_lines(reference_reduced , gss.sg)
```

### Accuracy
#### *As our samples are subsets of the reference dataset we can calculate the accuracy of GSS*
```{r eval=FALSE}
fibroblast_accuracy <- score_gss_accuracy(Mole21a.sces, sce)
mixed_accuracy <- score_gss_accuracy(mixed.gss, sce)
early_primed_accuracy <- score_gss_accuracy(early_primed.gss, sce)
primed_accuracy <- score_gss_accuracy(primed.gss, sce)
reference_accuracy <- score_gss_accuracy(sces, sce)
```

## Plotting

### Plot the predicted position of each sample:
#### *Optional: include the switching point of some genes of interest:*
```{r eval=FALSE}
gss_output_plot(Mole21a.sces, col = "red", overlay=FALSE, label = "fibroblast", genes_of_interest = c("ENSG00000248605","ENSG00000272168"))
gss_output_plot(mixed.gss, col = "peachpuff4", overlay=TRUE, label = "mixed")
gss_output_plot(early_primed.gss, col = "purple", overlay=TRUE, label = "early primed")
gss_output_plot(primed.gss, col = "darkgreen", overlay=TRUE, label = "primed" , genes_of_interest = c("ENSG00000236924"))
```

## Further analysis of the results

### Investigate the predicted position of indervidual cells:

#### Plot the preducted position of all cells in a Sample
###### *only run this on a selection of cells or a small sample*
```{r eval=FALSE}
for (c in 1:nrow(sc.gss$cells_flat)) {
plot(x = 1:100, y = sc.gss$cells_flat[c,], type = "h",
xlab = "Pseudotime Index", ylab = "Cell Position Likelihood",
main = paste("Trajectory Progress of", rownames(sc.gss$cells_flat)[c]))
segments(which.max(sc.gss$cells_flat[c,]),
0.0,
which.max(sc.gss$cells_flat[c,]),
as.numeric(sc.gss$cells_flat[c,which.max(sc.gss$cells_flat[c,])]),
lwd=2,
col ="red")
}
```

#### Plot the "racing lines" for indervidual cells.
```{r eval=FALSE}
racinglines_timeline(reference.sg, reference_reduced, cell_idx = 1)
```

### Plot the distribution of inaccuracy of each sample:
```{r eval=FALSE}
hist(fibroblast_accuracy$inaccuracy , breaks = 100)
hist(mixed_accuracy$inaccuracy , breaks = 100)
hist(early_primed_accuracy$inaccuracy, breaks = 100)
hist(primed_accuracy$inaccuracy , breaks = 100)
hist(reference_accuracy$inaccuracy , breaks = 100)
```

### Investigate the predicted position of indervidual cells:
```{r eval=FALSE}
#
plot(fibroblast_accuracy$true_position_of_cells_timeIDX, fibroblast_accuracy$inaccuracy,
xlab = "True Position", ylab = "Inaccuracy", main = "Inaccuracy by True Positions")
#
boxplot(fibroblast_accuracy$inaccuracy ~ fibroblast_accuracy$true_position_of_cells_timeIDX,
xlab = "True Position", ylab = "Inaccuracy", main = "Boxplots of Inaccuracy by True Positions")
true_position_bin <- cut(fibroblast_accuracy$true_position_of_cells_timeIDX,
breaks = c(0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80.85,90,95,100))
boxplot(fibroblast_accuracy$inaccuracy ~ true_position_bin,
xlab = "True Position", ylab = "Inaccuracy", main = "Boxplots of Inaccuracy by True Positions")
#
plot(fibroblast_accuracy$true_position_of_cells_timeIDX,fibroblast_accuracy$predicted_position_of_cells_timeIDX,
xlab = "True Position", ylab = "Predicted Position", main = "Predicted Positions by True Positions")
segments(0,0,100,100, lwd=2, col ="green")
segments(0,10,100,110, lwd=2, col ="blue")
segments(0,-10,100,90, lwd=2, col ="blue")
segments(0,40,100,140, lwd=2, col ="red")
segments(0,-40,100,60, lwd=2, col ="red")
#
boxplot(fibroblast_accuracy$predicted_position_of_cells_timeIDX ~ fibroblast_accuracy$true_position_of_cells_timeIDX,
xlab = "True Position", ylab = "Predicted Position", main = "Boxplots of Predicted Positions by True Positions")
segments(0,0,100,100, lwd=2, col ="green")
segments(0,10,100,110, lwd=2, col ="blue")
segments(0,-10,100,90, lwd=2, col ="blue")
segments(0,40,100,140, lwd=2, col ="red")
segments(0,-40,100,60, lwd=2, col ="red")
```

### Investigate the precision of min_time_spacing values:
```{r eval=FALSE}
precision <- data.frame(
min_time_spacing = 0:100,
n_genes = NA, # Number of genes after min_time_spacing
accuracy_min = NA, # Placeholder for accuracy minimum values
accuracy_1 = NA,
accuracy_median = NA,
accuracy_mean = NA,
accuracy_3 = NA,
accuracy_max = NA
)
for (i in 0:100){
# Filter for an even distribution
gss_genes <- select_evenly_distributed_switching_genes(sce, min_time_spacing = i)
# Reduce the binary counts matricies of the query data to only include the selection of evenly distributed genes from the refernence.
sample_reduced <- filter_gene_expression_for_switching_genes(sample.gs@assays@data@listData$binary , gss_genes)
#
sample.gss <- create_racing_lines(sample_reduced, gss_genes)
#
accuracy <- score_gss_accuracy(sample.gss, sce)
#
precision$n_genes[i+1] <- dim(gss_genes)[1]
precision$accuracy_min[i+1] <- summary(accuracy$accuracy)[1]
precision$accuracy_1[i+1] <- summary(accuracy$accuracy)[2]
precision$accuracy_median[i+1] <- summary(accuracy$accuracy)[3]
precision$accuracy_mean[i+1] <- summary(accuracy$accuracy)[4]
precision$accuracy_3[i+1] <- summary(accuracy$accuracy)[5]
precision$accuracy_max[i+1] <- summary(accuracy$accuracy)[6]
#
cat(i," done \n")
}
ggplot(data = precision, mapping = aes(x = min_time_spacing)) +
geom_point(aes(y = accuracy_min), color = "lightblue") +
geom_point(aes(y = accuracy_1), color = "lightpink") +
geom_point(aes(y = accuracy_median), color = "red") +
geom_point(aes(y = accuracy_mean), color = "blue") +
geom_point(aes(y = accuracy_3), color = "darkgreen") +
geom_point(aes(y = accuracy_max), color = "lightblue") +
ylab("innacuracy")
ggplot(data = precision, mapping = aes(x = n_genes)) +
geom_point(aes(y = accuracy_min), color = "lightblue") +
geom_point(aes(y = accuracy_1), color = "lightpink") +
geom_point(aes(y = accuracy_median), color = "red") +
geom_point(aes(y = accuracy_mean), color = "blue") +
geom_point(aes(y = accuracy_3), color = "darkgreen") +
geom_point(aes(y = accuracy_max), color = "lightblue") +
ylab("innacuracy") + scale_x_log10()
ggplot(data = precision, mapping = aes(x = min_time_spacing)) +
geom_point(aes(y = n_genes), color = "red") +
scale_y_log10()
```

0 comments on commit 8b5ee8b

Please sign in to comment.