Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scrnaseq updates #35

Merged
merged 15 commits into from
Jan 20, 2022
2 changes: 2 additions & 0 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ jobs:
run: brew install --cask xquartz
- name: Install pandoc
run: brew install pandoc
- name: Install cairo
run: brew install cairo
- name: Install dependencies
run: |
install.packages(c("remotes", "rcmdcheck"))
Expand Down
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ Suggests:
covr,
ggrastr,
VIBER,
Seurat
Seurat,
plyranges
Remotes:
VPetukhov/ggrastr,
caravagn/VIBER,
caravagn/pio,
caravagn/easypar,
Expand Down
11 changes: 10 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ export(HaplotypeHMM)
export(add_states)
export(alleleHMM)
export(assignHaplotypeHMM)
export(assign_bins_haplotypes)
export(assign_haplotype_label)
export(assign_label_persnp)
export(assign_states_dp)
export(assign_states_noprior)
export(assignalleleHMM)
Expand All @@ -26,6 +29,7 @@ export(createSNVmatrix)
export(create_cntransitions)
export(create_segments)
export(createbreakpointmatrix)
export(filter_segments)
export(filterbycells)
export(filtercn)
export(fix_assignments)
Expand All @@ -39,13 +43,17 @@ export(hc_clustering)
export(is.ascn)
export(is.hscn)
export(is.hscnrna)
export(make_copynumber_legend)
export(map_to_segments)
export(missegregations)
export(missegregations_vscell)
export(orderdf)
export(per_arm_baf_mat)
export(per_chr_baf)
export(per_chr_baf_plot)
export(per_chr_cn)
export(per_chrarm_cn)
export(per_segment_baf_mat)
export(per_segment_baf_plot)
export(phase_haplotypes)
export(phase_haplotypes_bychr)
export(phase_haplotypes_rna)
Expand Down Expand Up @@ -73,6 +81,7 @@ export(scCN_cols)
export(scCNminorallele_cols)
export(scCNphase_cols)
export(scCNstate_cols)
export(segments_to_bins)
export(simulate_cell)
export(simulate_cells)
export(simulate_data_cohort)
Expand Down
9 changes: 6 additions & 3 deletions R/callHSCN.R
Original file line number Diff line number Diff line change
Expand Up @@ -811,8 +811,11 @@ callHaplotypeSpecificCN <- function(CNbins,

if (fillmissing){
out[["data"]] <- dplyr::left_join(CNbins, out[["data"]], by = c("chr", "start", "end", "copy", "state", "cell_id"))
out[["data"]] <- tidyr::fill(out[["data"]], c("state_min", "Maj", "Min", "state_phase", "state_AS",
"state_AS_phased", "LOH", "state_BAF", "phase"), .direction = "downup")
out[["data"]] <- out[["data"]] %>%
dplyr::group_by(chr, cell_id) %>%
tidyr::fill( c("state_min", "Maj", "Min", "state_phase", "state_AS",
"state_AS_phased", "LOH", "state_BAF", "phase"), .direction = "downup") %>%
dplyr::ungroup()
out[["data"]] <- as.data.frame(out[["data"]])
}

Expand Down Expand Up @@ -1156,7 +1159,7 @@ phasing_LOH <- function(cndat, chromosomes, cutoff = 0.9, ncells = 1) {
by = "cell_id"
] %>%
.[order(LOH, decreasing = TRUE)] %>%
.[LOH > 0.9 & abs(mBAF) < 0.05] %>%
.[LOH > cutoff & abs(mBAF) < 0.05] %>%
.$cell_id

if (length(cells) >= ncells) {
Expand Down
129 changes: 92 additions & 37 deletions R/callHSCNrna.R
Original file line number Diff line number Diff line change
Expand Up @@ -153,12 +153,13 @@ assign_states_noprior <- function(haps,
}

#' @export
assign_states_dp <- function(bafperchr,
assign_states_dp <- function(bafpersegment,
haplotypes_rna = NULL,
samples = 10,
alpha_0 = 1,
K = 20,
most_variable_chr = TRUE,
top_nchr = 5,
most_variable_segment = TRUE,
top_nseg = 5,
overwrite_chr = NULL,
removechr = c("chrX"),
filtercounts = 0,
Expand All @@ -172,10 +173,10 @@ assign_states_dp <- function(bafperchr,
}

message("Generating count matrices...")
baf_total <- bafperchr %>%
dplyr::select(cell_id, chrarm, total) %>%
baf_total <- bafpersegment %>%
dplyr::select(cell_id, segid, total) %>%
tidyr::pivot_wider(
names_from = "chrarm",
names_from = "segid",
values_from = "total",
values_fill = list(total = 0),
names_prefix = "chr"
Expand All @@ -185,10 +186,10 @@ assign_states_dp <- function(bafperchr,
baf_total <- subset(baf_total, select = -c(cell_id))

# make chr ~ cell_id matrix for B allele counts
baf_counts <- bafperchr %>%
dplyr::select(cell_id, chrarm, alleleB) %>%
baf_counts <- bafpersegment %>%
dplyr::select(cell_id, segid, alleleB) %>%
tidyr::pivot_wider(
names_from = "chrarm",
names_from = "segid",
values_from = "alleleB",
values_fill = list(alleleB = 0),
names_prefix = "chr"
Expand All @@ -204,11 +205,12 @@ assign_states_dp <- function(bafperchr,
baf_total <- baf_total[filtered_cells, ]
}

if (most_variable_chr) {
if (most_variable_segment) {
# keepchrs <- sort(sapply(baf_counts / baf_total, function(x) var(x, na.rm = TRUE)), decreasing = TRUE)[1:top_nchr]
chr_names <- names(baf_counts)
baf_counts_temp <- baf_counts[, setdiff(chr_names, removechr)]
keepchrs <- sort(sapply(
seg_names <- names(baf_counts)
#baf_counts_temp <- baf_counts[, setdiff(chr_names, removechr)]
baf_counts_temp <- baf_counts
keepsegs <- sort(sapply(
names(baf_counts_temp),
function(x) {
matrixStats::weightedVar(baf_counts[, x] / baf_total[, x],
Expand All @@ -217,18 +219,18 @@ assign_states_dp <- function(bafperchr,
}
),
decreasing = TRUE
)[1:top_nchr]
message(paste0("Top ", top_nchr, " most variable chromosomes are: ", paste0(names(keepchrs), collapse = ", ")))
message("Using these chromosomes for clustering")
keepchrs <- names(keepchrs)
} else if (!is.null(overwrite_chr)) {
keepchrs <- overwrite_chr
)[1:top_nseg]
message(paste0("Top ", top_nseg, " most variable segments are: ", paste0(names(keepsegs), collapse = ", ")))
message("Using these segments for clustering")
keepsegs <- names(keepsegs)
} else if (!is.null(overwrite_segs)) {
keepsegs <- overwrite_segs
} else {
keepchrs <- paste0("chr", unique(bafperchr$chrarm))
keepsegs <- paste0("chr", unique(bafpersegment$chrarm))
}

baf_counts <- baf_counts[, keepchrs]
baf_total <- baf_total[, keepchrs]
baf_counts <- baf_counts[, keepsegs]
baf_total <- baf_total[, keepsegs]
print(dim(baf_counts))

message("Fitting mixture model using VIBER...")
Expand All @@ -250,11 +252,10 @@ assign_states_dp <- function(bafperchr,

message("Extract cluster means from VIBER object...")
theta <- as.data.frame(fit_filt$theta_k)
theta$chrarm <- row.names(theta)
theta$segid <- row.names(theta)
row.names(theta) <- NULL
theta <- theta %>%
tidyr::pivot_longer(-chrarm, names_to = "clone_id", values_to = "theta") %>%
dplyr::mutate(chrarm = gsub("chr", "", chrarm))
tidyr::pivot_longer(-segid, names_to = "clone_id", values_to = "theta") #%>%dplyr::mutate(chrarm = gsub("chr", "", chrarm))

message("Generating dataframe mapping cell_id to clone_id...")
x <- data.frame(
Expand All @@ -263,17 +264,17 @@ assign_states_dp <- function(bafperchr,
)

message("Assign states to clones and chromosomes...")
states <- bafperchr %>%
states <- bafpersegment %>%
dplyr::left_join(x, by = "cell_id") %>%
dplyr::group_by(chrarm, clone_id) %>%
dplyr::group_by(segid, clone_id) %>%
dplyr::summarise(
BAF = median(BAF),
alleleA = sum(alleleA),
alleleB = sum(alleleB),
total = sum(total)
) %>%
dplyr::ungroup() %>%
dplyr::left_join(theta, by = c("clone_id", "chrarm")) %>%
dplyr::left_join(theta, by = c("clone_id", "segid")) %>%
dplyr::mutate(rounded = round(BAF / 0.25) * 0.25) %>%
dplyr::mutate(state_phase = dplyr::case_when(
rounded == 0.0 ~ "A-Hom",
Expand All @@ -282,25 +283,79 @@ assign_states_dp <- function(bafperchr,
rounded == 0.75 ~ "B-Gained",
rounded == 1.0 ~ "B-Hom"
)) %>%
dplyr::select(chrarm, clone_id, state_phase)
dplyr::select(segid, clone_id, state_phase)

message("Format final dataframe...")
bafperchr_new <- bafperchr %>%
dplyr::select(chr, arm, chrarm, cell_id, alleleA, alleleB, total, BAF) %>%
bafpersegment_new <- bafpersegment %>%
dplyr::select(chr, segid, cell_id, alleleA, alleleB, total, BAF) %>%
dplyr::left_join(x, by = "cell_id") %>%
dplyr::left_join(states, by = c("chrarm", "clone_id")) %>%
dplyr::left_join(states, by = c("segid", "clone_id")) %>%
dplyr::select(-clone_id) %>%
dplyr::left_join(hg19chrom_coordinates) %>%
dplyr::left_join(x) %>%
dplyr::mutate(start = start + 1)
dplyr::left_join(x) %>%
tidyr::separate(segid, c("chr2", "start", "end"), remove = FALSE, convert = TRUE) %>%
dplyr::select(segid, chr, start, end, -chr2, dplyr::everything()) %>%
as.data.frame()

# Output
out <- list()
class(out) <- "hscnrna"
out[["viber_fit"]] <- fit_filt
out[["clusters"]] <- x
out[["hscn"]] <- bafperchr_new
out[["chromosomes_fit"]] <- keepchrs
out[["hscn"]] <- bafpersegment_new %>% as.data.frame()
out[["segments_fits"]] <- keepsegs
out[["haplotypecounts"]] <- haplotypes_rna$snp_counts
out[["segments"]] <- dplyr::distinct(bafpersegment_new, chr, start, end) %>% as.data.frame()

return(out)
}


getCNstatefunc <- function(segments, bins, ncores = 1){
bins_g <- plyranges::as_granges(dlpbins %>% dplyr::rename(seqnames = chr))
segments_g <- plyranges::as_granges(segments %>% dplyr::rename(seqnames = chr), keep_mcols = TRUE)

bins <- plyranges::find_overlaps(bins_g, segments_g) %>%
as.data.frame() %>%
dplyr::rename(chr = seqnames) %>%
dplyr::select(-width, -strand) %>%
dplyr::select(cell_id, chr, start, end, dplyr::everything()) %>%
orderdf()

return(bins)
}

#' @export
segments_to_bins <- function(segments, binsize = 0.5e6, ncores = 1){

if (!requireNamespace("plyranges", quietly = TRUE)) {
stop("Package \"plyranges\" needed to use this function. Please install it.",
call. = FALSE
)
}

bins <- schnapps::getBins(binsize = binsize)

if (ncores == 1) {
df <- data.table::rbindlist(lapply(unique(segments$cell_id),
function(x){
segments %>% dplyr::filter(cell_id == x) %>%
getCNstatefunc(., bins = bins)
})) %>%
dplyr::group_by(chr, start, end, cell_id) %>%
dplyr::filter(dplyr::row_number() == 1) %>%
as.data.frame() %>%
dplyr::filter(!is.na(cell_id))
} else{
df <- data.table::rbindlist(parallel::mclapply(unique(segments$cell_id),
function(x){
segments %>% dplyr::filter(cell_id == x) %>%
getCNstatefunc(., bins = bins)
}, mc.cores = ncores)) %>%
dplyr::group_by(chr, start, end, cell_id) %>%
dplyr::filter(dplyr::row_number() == 1) %>%
as.data.frame() %>%
dplyr::filter(!is.na(cell_id))
}

return(df)
}
27 changes: 20 additions & 7 deletions R/combinedata_rna.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,6 @@ format_haplotypes_rna <- function(haplotypes,
message("Phasing based on distribution across all cells")
phased_haplotypes <- phase_haplotypes_rna(haplotypes)
}
# } else {
# phased_haplotypes <- computehaplotypecounts(haplotypes, ...)
# }
message("Join phased haplotypes...")
haplotypes <- as.data.table(haplotypes)[phased_haplotypes,
on = .(chr, hap_label)
Expand All @@ -45,22 +42,38 @@ format_haplotypes_rna <- function(haplotypes,
.[, totalcounts := alleleA + alleleB] %>%
.[totalcounts > filtern, BAF := alleleB / totalcounts] %>%
.[, c("allele1", "allele0", "phase") := NULL]

haplotype_snp_counts <- haplotypes %>%
.[, list(
alleleA = sum(alleleA),
alleleB = sum(alleleB)
), by = c("cell_id", "chr", "position", "hap_label")] %>%
.[, totalcounts := alleleA + alleleB] %>%
.[totalcounts > filtern, BAF := alleleB / totalcounts] %>%
dplyr::rename(start = position) %>%
dplyr::select(cell_id, chr, start, hap_label, alleleA, alleleB, totalcounts, BAF)

haplotypes <- haplotypes %>%
.[, list(
alleleA = sum(alleleA),
alleleB = sum(alleleB),
start = min(position), end = max(position)
), by = c("cell_id", "chr", "sample", "patient", "hap_label")] %>%
), by = c("cell_id", "chr", "hap_label")] %>%
.[, totalcounts := alleleA + alleleB] %>%
.[totalcounts > filtern, BAF := alleleB / totalcounts] %>%
dplyr::select(cell_id, chr, start, end, hap_label, alleleA, alleleB, totalcounts, BAF, sample, patient)
dplyr::select(cell_id, chr, start, end, hap_label, alleleA, alleleB, totalcounts, BAF)

if (create_cell_id) {
haplotypes <- dplyr::mutate(cell_id = paste(patient, sample, cell_id, sep = "-")) %>%
haplotypes <- haplotypes %>%
dplyr::mutate(cell_id = paste(patient, sample, cell_id, sep = "-")) %>%
as.data.frame() %>%
orderdf()

haplotype_snp_counts <- haplotype_snp_counts %>%
dplyr::mutate(cell_id = paste(patient, sample, cell_id, sep = "-")) %>%
as.data.frame() %>%
orderdf()
}

return(haplotypes)
return(list(block_counts = haplotypes, snp_counts = haplotype_snp_counts))
}
34 changes: 34 additions & 0 deletions R/convert.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#' @export
assign_haplotype_label <- function(haplotypes, hapbinsize = 50e3){
colnames(haplotypes) <- c("chr", "pos", "cell_id", "allele0", "allele1")
haplotypes$chr <- as.character(haplotypes$chr)
haplotypes$pos2 <- haplotypes$pos

bins <- getBins(binsize = hapbinsize) %>%
dplyr::rename(start_bins = start, end_bins = end, chr_bins = chr) %>%
dplyr::select(-width) %>%
as.data.table() %>%
.[, hap_label := 1:.N]

haplotypes <- haplotypes[bins, on = .(chr == chr_bins, pos > start_bins, pos < end_bins)]
haplotypes <- na.omit(haplotypes)

hap_labels <- dplyr::distinct(haplotypes, chr, pos2, hap_label) %>% dplyr::rename(position = pos2)
return(hap_label)
}

#' @export
assign_label_persnp <- function(haplotypes, hapbinsize = 50e3){
snpdf <- as.data.table(haplotypes)[, hap_label := .GRP, by = list(chr, position)]
return(snpdf)
}

#' @export
assign_bins_haplotypes <- function(haplotypes, binsize = 0.5e6){
binshaps <- haplotypes %>%
.[, start := floor(position / binsize) * binsize + 1] %>%
.[, end := start + binsize - 1] %>%
.[, position := NULL]

return(binshaps)
}
Loading