Skip to content

Commit

Permalink
scrnaseq updates (#35)
Browse files Browse the repository at this point in the history
* scRNAseq changes
*. multiple updates to plots
  • Loading branch information
marcjwilliams1 authored Jan 20, 2022
1 parent 52e0f10 commit 3875523
Show file tree
Hide file tree
Showing 22 changed files with 801 additions and 165 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ jobs:
run: brew install --cask xquartz
- name: Install pandoc
run: brew install pandoc
- name: Install cairo
run: brew install cairo
- name: Install dependencies
run: |
install.packages(c("remotes", "rcmdcheck"))
Expand Down
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ Suggests:
covr,
ggrastr,
VIBER,
Seurat
Seurat,
plyranges
Remotes:
VPetukhov/ggrastr,
caravagn/VIBER,
caravagn/pio,
caravagn/easypar,
Expand Down
11 changes: 10 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ export(HaplotypeHMM)
export(add_states)
export(alleleHMM)
export(assignHaplotypeHMM)
export(assign_bins_haplotypes)
export(assign_haplotype_label)
export(assign_label_persnp)
export(assign_states_dp)
export(assign_states_noprior)
export(assignalleleHMM)
Expand All @@ -26,6 +29,7 @@ export(createSNVmatrix)
export(create_cntransitions)
export(create_segments)
export(createbreakpointmatrix)
export(filter_segments)
export(filterbycells)
export(filtercn)
export(fix_assignments)
Expand All @@ -39,13 +43,17 @@ export(hc_clustering)
export(is.ascn)
export(is.hscn)
export(is.hscnrna)
export(make_copynumber_legend)
export(map_to_segments)
export(missegregations)
export(missegregations_vscell)
export(orderdf)
export(per_arm_baf_mat)
export(per_chr_baf)
export(per_chr_baf_plot)
export(per_chr_cn)
export(per_chrarm_cn)
export(per_segment_baf_mat)
export(per_segment_baf_plot)
export(phase_haplotypes)
export(phase_haplotypes_bychr)
export(phase_haplotypes_rna)
Expand Down Expand Up @@ -73,6 +81,7 @@ export(scCN_cols)
export(scCNminorallele_cols)
export(scCNphase_cols)
export(scCNstate_cols)
export(segments_to_bins)
export(simulate_cell)
export(simulate_cells)
export(simulate_data_cohort)
Expand Down
9 changes: 6 additions & 3 deletions R/callHSCN.R
Original file line number Diff line number Diff line change
Expand Up @@ -811,8 +811,11 @@ callHaplotypeSpecificCN <- function(CNbins,

if (fillmissing){
out[["data"]] <- dplyr::left_join(CNbins, out[["data"]], by = c("chr", "start", "end", "copy", "state", "cell_id"))
out[["data"]] <- tidyr::fill(out[["data"]], c("state_min", "Maj", "Min", "state_phase", "state_AS",
"state_AS_phased", "LOH", "state_BAF", "phase"), .direction = "downup")
out[["data"]] <- out[["data"]] %>%
dplyr::group_by(chr, cell_id) %>%
tidyr::fill( c("state_min", "Maj", "Min", "state_phase", "state_AS",
"state_AS_phased", "LOH", "state_BAF", "phase"), .direction = "downup") %>%
dplyr::ungroup()
out[["data"]] <- as.data.frame(out[["data"]])
}

Expand Down Expand Up @@ -1156,7 +1159,7 @@ phasing_LOH <- function(cndat, chromosomes, cutoff = 0.9, ncells = 1) {
by = "cell_id"
] %>%
.[order(LOH, decreasing = TRUE)] %>%
.[LOH > 0.9 & abs(mBAF) < 0.05] %>%
.[LOH > cutoff & abs(mBAF) < 0.05] %>%
.$cell_id

if (length(cells) >= ncells) {
Expand Down
129 changes: 92 additions & 37 deletions R/callHSCNrna.R
Original file line number Diff line number Diff line change
Expand Up @@ -153,12 +153,13 @@ assign_states_noprior <- function(haps,
}

#' @export
assign_states_dp <- function(bafperchr,
assign_states_dp <- function(bafpersegment,
haplotypes_rna = NULL,
samples = 10,
alpha_0 = 1,
K = 20,
most_variable_chr = TRUE,
top_nchr = 5,
most_variable_segment = TRUE,
top_nseg = 5,
overwrite_chr = NULL,
removechr = c("chrX"),
filtercounts = 0,
Expand All @@ -172,10 +173,10 @@ assign_states_dp <- function(bafperchr,
}

message("Generating count matrices...")
baf_total <- bafperchr %>%
dplyr::select(cell_id, chrarm, total) %>%
baf_total <- bafpersegment %>%
dplyr::select(cell_id, segid, total) %>%
tidyr::pivot_wider(
names_from = "chrarm",
names_from = "segid",
values_from = "total",
values_fill = list(total = 0),
names_prefix = "chr"
Expand All @@ -185,10 +186,10 @@ assign_states_dp <- function(bafperchr,
baf_total <- subset(baf_total, select = -c(cell_id))

# make chr ~ cell_id matrix for B allele counts
baf_counts <- bafperchr %>%
dplyr::select(cell_id, chrarm, alleleB) %>%
baf_counts <- bafpersegment %>%
dplyr::select(cell_id, segid, alleleB) %>%
tidyr::pivot_wider(
names_from = "chrarm",
names_from = "segid",
values_from = "alleleB",
values_fill = list(alleleB = 0),
names_prefix = "chr"
Expand All @@ -204,11 +205,12 @@ assign_states_dp <- function(bafperchr,
baf_total <- baf_total[filtered_cells, ]
}

if (most_variable_chr) {
if (most_variable_segment) {
# keepchrs <- sort(sapply(baf_counts / baf_total, function(x) var(x, na.rm = TRUE)), decreasing = TRUE)[1:top_nchr]
chr_names <- names(baf_counts)
baf_counts_temp <- baf_counts[, setdiff(chr_names, removechr)]
keepchrs <- sort(sapply(
seg_names <- names(baf_counts)
#baf_counts_temp <- baf_counts[, setdiff(chr_names, removechr)]
baf_counts_temp <- baf_counts
keepsegs <- sort(sapply(
names(baf_counts_temp),
function(x) {
matrixStats::weightedVar(baf_counts[, x] / baf_total[, x],
Expand All @@ -217,18 +219,18 @@ assign_states_dp <- function(bafperchr,
}
),
decreasing = TRUE
)[1:top_nchr]
message(paste0("Top ", top_nchr, " most variable chromosomes are: ", paste0(names(keepchrs), collapse = ", ")))
message("Using these chromosomes for clustering")
keepchrs <- names(keepchrs)
} else if (!is.null(overwrite_chr)) {
keepchrs <- overwrite_chr
)[1:top_nseg]
message(paste0("Top ", top_nseg, " most variable segments are: ", paste0(names(keepsegs), collapse = ", ")))
message("Using these segments for clustering")
keepsegs <- names(keepsegs)
} else if (!is.null(overwrite_segs)) {
keepsegs <- overwrite_segs
} else {
keepchrs <- paste0("chr", unique(bafperchr$chrarm))
keepsegs <- paste0("chr", unique(bafpersegment$chrarm))
}

baf_counts <- baf_counts[, keepchrs]
baf_total <- baf_total[, keepchrs]
baf_counts <- baf_counts[, keepsegs]
baf_total <- baf_total[, keepsegs]
print(dim(baf_counts))

message("Fitting mixture model using VIBER...")
Expand All @@ -250,11 +252,10 @@ assign_states_dp <- function(bafperchr,

message("Extract cluster means from VIBER object...")
theta <- as.data.frame(fit_filt$theta_k)
theta$chrarm <- row.names(theta)
theta$segid <- row.names(theta)
row.names(theta) <- NULL
theta <- theta %>%
tidyr::pivot_longer(-chrarm, names_to = "clone_id", values_to = "theta") %>%
dplyr::mutate(chrarm = gsub("chr", "", chrarm))
tidyr::pivot_longer(-segid, names_to = "clone_id", values_to = "theta") #%>%dplyr::mutate(chrarm = gsub("chr", "", chrarm))

message("Generating dataframe mapping cell_id to clone_id...")
x <- data.frame(
Expand All @@ -263,17 +264,17 @@ assign_states_dp <- function(bafperchr,
)

message("Assign states to clones and chromosomes...")
states <- bafperchr %>%
states <- bafpersegment %>%
dplyr::left_join(x, by = "cell_id") %>%
dplyr::group_by(chrarm, clone_id) %>%
dplyr::group_by(segid, clone_id) %>%
dplyr::summarise(
BAF = median(BAF),
alleleA = sum(alleleA),
alleleB = sum(alleleB),
total = sum(total)
) %>%
dplyr::ungroup() %>%
dplyr::left_join(theta, by = c("clone_id", "chrarm")) %>%
dplyr::left_join(theta, by = c("clone_id", "segid")) %>%
dplyr::mutate(rounded = round(BAF / 0.25) * 0.25) %>%
dplyr::mutate(state_phase = dplyr::case_when(
rounded == 0.0 ~ "A-Hom",
Expand All @@ -282,25 +283,79 @@ assign_states_dp <- function(bafperchr,
rounded == 0.75 ~ "B-Gained",
rounded == 1.0 ~ "B-Hom"
)) %>%
dplyr::select(chrarm, clone_id, state_phase)
dplyr::select(segid, clone_id, state_phase)

message("Format final dataframe...")
bafperchr_new <- bafperchr %>%
dplyr::select(chr, arm, chrarm, cell_id, alleleA, alleleB, total, BAF) %>%
bafpersegment_new <- bafpersegment %>%
dplyr::select(chr, segid, cell_id, alleleA, alleleB, total, BAF) %>%
dplyr::left_join(x, by = "cell_id") %>%
dplyr::left_join(states, by = c("chrarm", "clone_id")) %>%
dplyr::left_join(states, by = c("segid", "clone_id")) %>%
dplyr::select(-clone_id) %>%
dplyr::left_join(hg19chrom_coordinates) %>%
dplyr::left_join(x) %>%
dplyr::mutate(start = start + 1)
dplyr::left_join(x) %>%
tidyr::separate(segid, c("chr2", "start", "end"), remove = FALSE, convert = TRUE) %>%
dplyr::select(segid, chr, start, end, -chr2, dplyr::everything()) %>%
as.data.frame()

# Output
out <- list()
class(out) <- "hscnrna"
out[["viber_fit"]] <- fit_filt
out[["clusters"]] <- x
out[["hscn"]] <- bafperchr_new
out[["chromosomes_fit"]] <- keepchrs
out[["hscn"]] <- bafpersegment_new %>% as.data.frame()
out[["segments_fits"]] <- keepsegs
out[["haplotypecounts"]] <- haplotypes_rna$snp_counts
out[["segments"]] <- dplyr::distinct(bafpersegment_new, chr, start, end) %>% as.data.frame()

return(out)
}


getCNstatefunc <- function(segments, bins, ncores = 1){
bins_g <- plyranges::as_granges(dlpbins %>% dplyr::rename(seqnames = chr))
segments_g <- plyranges::as_granges(segments %>% dplyr::rename(seqnames = chr), keep_mcols = TRUE)

bins <- plyranges::find_overlaps(bins_g, segments_g) %>%
as.data.frame() %>%
dplyr::rename(chr = seqnames) %>%
dplyr::select(-width, -strand) %>%
dplyr::select(cell_id, chr, start, end, dplyr::everything()) %>%
orderdf()

return(bins)
}

#' @export
segments_to_bins <- function(segments, binsize = 0.5e6, ncores = 1){

if (!requireNamespace("plyranges", quietly = TRUE)) {
stop("Package \"plyranges\" needed to use this function. Please install it.",
call. = FALSE
)
}

bins <- schnapps::getBins(binsize = binsize)

if (ncores == 1) {
df <- data.table::rbindlist(lapply(unique(segments$cell_id),
function(x){
segments %>% dplyr::filter(cell_id == x) %>%
getCNstatefunc(., bins = bins)
})) %>%
dplyr::group_by(chr, start, end, cell_id) %>%
dplyr::filter(dplyr::row_number() == 1) %>%
as.data.frame() %>%
dplyr::filter(!is.na(cell_id))
} else{
df <- data.table::rbindlist(parallel::mclapply(unique(segments$cell_id),
function(x){
segments %>% dplyr::filter(cell_id == x) %>%
getCNstatefunc(., bins = bins)
}, mc.cores = ncores)) %>%
dplyr::group_by(chr, start, end, cell_id) %>%
dplyr::filter(dplyr::row_number() == 1) %>%
as.data.frame() %>%
dplyr::filter(!is.na(cell_id))
}

return(df)
}
27 changes: 20 additions & 7 deletions R/combinedata_rna.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,6 @@ format_haplotypes_rna <- function(haplotypes,
message("Phasing based on distribution across all cells")
phased_haplotypes <- phase_haplotypes_rna(haplotypes)
}
# } else {
# phased_haplotypes <- computehaplotypecounts(haplotypes, ...)
# }
message("Join phased haplotypes...")
haplotypes <- as.data.table(haplotypes)[phased_haplotypes,
on = .(chr, hap_label)
Expand All @@ -45,22 +42,38 @@ format_haplotypes_rna <- function(haplotypes,
.[, totalcounts := alleleA + alleleB] %>%
.[totalcounts > filtern, BAF := alleleB / totalcounts] %>%
.[, c("allele1", "allele0", "phase") := NULL]

haplotype_snp_counts <- haplotypes %>%
.[, list(
alleleA = sum(alleleA),
alleleB = sum(alleleB)
), by = c("cell_id", "chr", "position", "hap_label")] %>%
.[, totalcounts := alleleA + alleleB] %>%
.[totalcounts > filtern, BAF := alleleB / totalcounts] %>%
dplyr::rename(start = position) %>%
dplyr::select(cell_id, chr, start, hap_label, alleleA, alleleB, totalcounts, BAF)

haplotypes <- haplotypes %>%
.[, list(
alleleA = sum(alleleA),
alleleB = sum(alleleB),
start = min(position), end = max(position)
), by = c("cell_id", "chr", "sample", "patient", "hap_label")] %>%
), by = c("cell_id", "chr", "hap_label")] %>%
.[, totalcounts := alleleA + alleleB] %>%
.[totalcounts > filtern, BAF := alleleB / totalcounts] %>%
dplyr::select(cell_id, chr, start, end, hap_label, alleleA, alleleB, totalcounts, BAF, sample, patient)
dplyr::select(cell_id, chr, start, end, hap_label, alleleA, alleleB, totalcounts, BAF)

if (create_cell_id) {
haplotypes <- dplyr::mutate(cell_id = paste(patient, sample, cell_id, sep = "-")) %>%
haplotypes <- haplotypes %>%
dplyr::mutate(cell_id = paste(patient, sample, cell_id, sep = "-")) %>%
as.data.frame() %>%
orderdf()

haplotype_snp_counts <- haplotype_snp_counts %>%
dplyr::mutate(cell_id = paste(patient, sample, cell_id, sep = "-")) %>%
as.data.frame() %>%
orderdf()
}

return(haplotypes)
return(list(block_counts = haplotypes, snp_counts = haplotype_snp_counts))
}
34 changes: 34 additions & 0 deletions R/convert.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#' @export
assign_haplotype_label <- function(haplotypes, hapbinsize = 50e3){
colnames(haplotypes) <- c("chr", "pos", "cell_id", "allele0", "allele1")
haplotypes$chr <- as.character(haplotypes$chr)
haplotypes$pos2 <- haplotypes$pos

bins <- getBins(binsize = hapbinsize) %>%
dplyr::rename(start_bins = start, end_bins = end, chr_bins = chr) %>%
dplyr::select(-width) %>%
as.data.table() %>%
.[, hap_label := 1:.N]

haplotypes <- haplotypes[bins, on = .(chr == chr_bins, pos > start_bins, pos < end_bins)]
haplotypes <- na.omit(haplotypes)

hap_labels <- dplyr::distinct(haplotypes, chr, pos2, hap_label) %>% dplyr::rename(position = pos2)
return(hap_label)
}

#' @export
assign_label_persnp <- function(haplotypes, hapbinsize = 50e3){
snpdf <- as.data.table(haplotypes)[, hap_label := .GRP, by = list(chr, position)]
return(snpdf)
}

#' @export
assign_bins_haplotypes <- function(haplotypes, binsize = 0.5e6){
binshaps <- haplotypes %>%
.[, start := floor(position / binsize) * binsize + 1] %>%
.[, end := start + binsize - 1] %>%
.[, position := NULL]

return(binshaps)
}
Loading

0 comments on commit 3875523

Please sign in to comment.