velocity/05.create_seurat_sample.Rmd

---
title: "RNA velocity analysis - seurat4sample"
author: "Sergey Naumenko"
date: "`r Sys.Date()`"
output:
    html_document:
        code_folding: hide
        df_print: paged
        highlight: tango
        theme: default
        number_sections: true
        toc: true
        toc_float:
            collapsed: true
            smooth_scroll: false
params:
    npcs: 30
    sample: N6_3_4
---


```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(readxl)
library(writexl)

library(BUSpaRse)
library(Seurat)
library(SeuratWrappers)
#library(BSgenome.Mmusculus.UCSC.mm10)
library(AnnotationHub)
library(zeallot) # For %<-% that unpacks lists in the Python manner
library(DropletUtils) # barcodeRanks
library(GGally) # For ggpairs
library(velocyto.R)
#library(SingleR)
library(scales)
library(plotly)

# Use a light ggplot2 theme with a larger base font for every plot in the report.
ggplot2::theme_set(ggplot2::theme_light(base_size = 14))

# Default knitr options applied to every chunk that follows.
knitr::opts_chunk$set(
  cache = FALSE,
  dev = c("png", "pdf"),
  error = TRUE,
  highlight = TRUE,
  message = FALSE,
  prompt = FALSE,
  tidy = FALSE,
  warning = FALSE
)
```

```{r, rows.print = 25}
# Draw a knee plot (barcode rank vs. total UMIs, both on log10 axes) for one
# or more barcode-rank results.
#
# bc_ranks: a named list; each element must expose `rank` and `total` columns
#   and carry an `inflection` entry in its metadata(). The list names are used
#   as the `dataset` label that colours the curves.
# Returns a ggplot object with one curve per dataset, plus dashed lines at each
#   dataset's inflection total and at the largest rank whose total exceeds it.
knee_plot <- function(bc_ranks) {
  dataset_names <- names(bc_ranks)
  inflection_of <- function(ranks) metadata(ranks)[["inflection"]]

  # Columns are extracted with explicit functions because the purrr pluck
  # shorthand does not work on an S4Vectors DataFrame.
  curve_df <- tibble(
    rank = map(bc_ranks, function(ranks) ranks[["rank"]]),
    total = map(bc_ranks, function(ranks) ranks[["total"]]),
    dataset = dataset_names
  )
  curve_df <- unnest(curve_df, cols = c(rank, total))
  curve_df <- distinct(curve_df)
  curve_df <- dplyr::filter(curve_df, total > 0)

  # One row per dataset: the inflection total and the matching rank cutoff.
  cutoff_df <- tibble(
    inflection = map_dbl(bc_ranks, inflection_of),
    rank_cutoff = map_dbl(bc_ranks, function(ranks) {
      max(ranks$rank[ranks$total > inflection_of(ranks)])
    }),
    dataset = dataset_names
  )

  ggplot(curve_df, aes(rank, total, color = dataset)) +
    geom_line() +
    geom_hline(
      aes(yintercept = inflection, color = dataset),
      data = cutoff_df, linetype = 2
    ) +
    geom_vline(
      aes(xintercept = rank_cutoff, color = dataset),
      data = cutoff_df, linetype = 2
    ) +
    scale_x_log10() +
    scale_y_log10() +
    labs(x = "Rank", y = "Total UMIs")
}

# Per-sample paths: the Seurat object written at the end of this chunk and the
# directory holding this sample's spliced/unspliced matrices.
seurat_file <- paste0("data/velocity/", params$sample, ".seurat.RDS")
velocity_dir <- paste0("data/velocity/", params$sample)

# Please note that read_velocity_output ignores the working directory:
# the script should be in the same directory as data/velocity.
# The returned pair is unpacked into `spliced` and `unspliced` with zeallot's %<-%.
c(spliced, unspliced) %<-% read_velocity_output(spliced_dir = velocity_dir,
                                                spliced_name = "spliced",
                                                unspliced_dir = velocity_dir,
                                                unspliced_name = "unspliced")

# Fraction of all counts that are unspliced (sums taken over the @x slots of
# the two matrices).
sum(unspliced@x) / (sum(unspliced@x) + sum(spliced@x))

# Dimensions of the unfiltered matrices. There are far more columns (barcodes)
# than real cells because empty droplets have not been removed yet.
dim(spliced)
dim(unspliced)

# Total spliced count per barcode; most barcodes have very few counts.
tot_count <- Matrix::colSums(spliced)
summary(tot_count)

# Barcode rank statistics (DropletUtils) for each matrix. The inflection point
# stored in the metadata of bc_rank is used further down to select barcodes.
bc_rank <- barcodeRanks(spliced)
bc_uns <- barcodeRanks(unspliced)

# Optional knee plot of both rank curves (disabled):
#knee_plot(list(spliced = bc_rank,
#               unspliced = bc_uns)) +
#  coord_flip()

# Compile bc_ranks2(x, y, x_grid, y_grid) from inline C++ via Rcpp.
#
# For every grid pair (x_grid[i], y_grid[j]) the function counts how many
# positions k satisfy both x[k] >= x_grid[i] and y[k] >= y_grid[j], and stores
# that count in out(i, j). The result is a numeric matrix with
# length(x_grid) rows and length(y_grid) columns.
#
# The work is a full double loop over the grid with a vector comparison in the
# inner step, so it can take a long time on large inputs. A progress object
# from RcppProgress is incremented once per x_grid value, and
# checkUserInterrupt() is called on each outer iteration.
#
# NOTE(review): the Rcpp::depends attribute and the #include lines live inside
# the code string rather than in cppFunction()'s `depends` / `includes`
# arguments - confirm this compiles as intended with the installed Rcpp.
library(Rcpp)
cppFunction('
//[[Rcpp::depends(RcppProgress)]]
#include <progress.hpp>
#include <progress_bar.hpp>
#include <Rcpp.h>
using namespace Rcpp;

//[[Rcpp::export]]
NumericMatrix bc_ranks2(NumericVector x, NumericVector y, 
                        NumericVector x_grid, NumericVector y_grid) {
  NumericMatrix out(x_grid.size(), y_grid.size());
  Progress p(x_grid.size(), true);
  for (int i = 0; i < x_grid.size(); i++) {
    checkUserInterrupt();
    for (int j = 0; j < y_grid.size(); j++) {
      out(i,j) = sum((x_grid[i] <= x) & (y_grid[j] <= y));
    }
    p.increment();
  }
  return(out);
}')

# Joint barcode-rank surface ----
# Only barcodes present in both matrices have both a spliced and an unspliced
# total, so everything below is restricted to the shared barcodes.
bcs_inter <- intersect(colnames(spliced), colnames(unspliced))
# Matrix::colSums is called explicitly (as for tot_count) so the sparse-matrix
# method is used regardless of which attached package masks colSums.
# drop = FALSE keeps a matrix even if only one barcode is shared.
s <- Matrix::colSums(spliced[, bcs_inter, drop = FALSE])
u <- Matrix::colSums(unspliced[, bcs_inter, drop = FALSE])
# Grid points: the unique totals after rounding log(total) to two decimals,
# which merges near-identical totals and keeps the grid small.
sr <- sort(unique(exp(round(log(s) * 100) / 100)))
ur <- sort(unique(exp(round(log(u) * 100) / 100)))

# bc_ranks2() is slow, so its result is cached on disk per sample.
bc2_file <- paste0("data/velocity/", params$sample, ".bc2.RDS")
bc2 <- if (file.exists(bc2_file)) readRDS(bc2_file) else NULL
# Recompute when there is no cache, or when the cached matrix does not match
# the current grid (e.g. the input matrices changed since it was written).
if (is.null(bc2) || !identical(dim(bc2), c(length(sr), length(ur)))) {
  bc2 <- bc_ranks2(s, u, sr, ur)
  saveRDS(bc2, bc2_file)
}

# log10 of the joint rank; grid cells with a count of zero become NA instead
# of -Inf.
z_use <- log10(bc2)
z_use[is.infinite(z_use)] <- NA
# Optional interactive surface plot (disabled):
#plot_ly(x = sr, y = ur, z = z_use) %>% add_surface() %>%
#    layout(scene = list(xaxis = list(title = "Total spliced UMIs", type = "log"),
#                  yaxis = list(title = "Total unspliced UMIs", type = "log"),
#                  zaxis = list(title = "Rank (log10)")))

# Barcode and gene filtering ----
# Keep barcodes whose total spliced count exceeds the inflection point of the
# spliced barcode-rank curve.
bcs_use <- colnames(spliced)[tot_count > metadata(bc_rank)$inflection]
# The same barcodes are used to subset `unspliced`, so drop any that are
# missing there; otherwise the subsetting fails with "subscript out of bounds".
bcs_use <- intersect(bcs_use, colnames(unspliced))
# Remove genes that aren't detected in the spliced matrix, and (for the same
# reason as above) genes that are absent from the unspliced matrix.
tot_genes <- Matrix::rowSums(spliced)
genes_use <- rownames(spliced)[tot_genes > 0]
genes_use <- intersect(genes_use, rownames(unspliced))
sf <- spliced[genes_use, bcs_use]
uf <- unspliced[genes_use, bcs_use]
dim(sf)
dim(uf)
# Strip the first ".<digits>" run from each gene ID
# (presumably the Ensembl version suffix - verify against the annotation used).
rownames(sf) <- str_remove(rownames(sf), "\\.\\d+")
rownames(uf) <- str_remove(rownames(uf), "\\.\\d+")

# Seurat object ----
# Filtered spliced counts go into assay "sf" and the matching unspliced counts
# into assay "uf"; the object is then saved for the downstream steps.
seurat <- CreateSeuratObject(sf, assay = "sf")
seurat[["uf"]] <- CreateAssayObject(uf)
saveRDS(seurat, seurat_file)
```

# R session information
```{r}
# Print the R version and the attached/loaded package versions for reproducibility.
sessionInfo()
```