diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index a0156ea..aeb4123 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -24,6 +24,8 @@ jobs: run: brew install --cask xquartz - name: Install pandoc run: brew install pandoc + - name: Install cairo + run: brew install cairo - name: Install dependencies run: | install.packages(c("remotes", "rcmdcheck")) diff --git a/DESCRIPTION b/DESCRIPTION index b45d1d4..4dae287 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,9 +50,9 @@ Suggests: covr, ggrastr, VIBER, - Seurat + Seurat, + plyranges Remotes: - VPetukhov/ggrastr, caravagn/VIBER, caravagn/pio, caravagn/easypar, diff --git a/NAMESPACE b/NAMESPACE index 91c6e90..95c06fd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,9 @@ export(HaplotypeHMM) export(add_states) export(alleleHMM) export(assignHaplotypeHMM) +export(assign_bins_haplotypes) +export(assign_haplotype_label) +export(assign_label_persnp) export(assign_states_dp) export(assign_states_noprior) export(assignalleleHMM) @@ -26,6 +29,7 @@ export(createSNVmatrix) export(create_cntransitions) export(create_segments) export(createbreakpointmatrix) +export(filter_segments) export(filterbycells) export(filtercn) export(fix_assignments) @@ -39,13 +43,17 @@ export(hc_clustering) export(is.ascn) export(is.hscn) export(is.hscnrna) +export(make_copynumber_legend) +export(map_to_segments) export(missegregations) export(missegregations_vscell) export(orderdf) export(per_arm_baf_mat) -export(per_chr_baf) +export(per_chr_baf_plot) export(per_chr_cn) export(per_chrarm_cn) +export(per_segment_baf_mat) +export(per_segment_baf_plot) export(phase_haplotypes) export(phase_haplotypes_bychr) export(phase_haplotypes_rna) @@ -73,6 +81,7 @@ export(scCN_cols) export(scCNminorallele_cols) export(scCNphase_cols) export(scCNstate_cols) +export(segments_to_bins) export(simulate_cell) export(simulate_cells) export(simulate_data_cohort) diff --git a/R/callHSCN.R b/R/callHSCN.R index d629f36..8c94c82 100644 --- a/R/callHSCN.R +++ b/R/callHSCN.R @@ -811,8 +811,11 @@ callHaplotypeSpecificCN <- function(CNbins, if (fillmissing){ out[["data"]] <- dplyr::left_join(CNbins, out[["data"]], by = c("chr", "start", "end", "copy", "state", "cell_id")) - out[["data"]] <- tidyr::fill(out[["data"]], c("state_min", "Maj", "Min", "state_phase", "state_AS", - "state_AS_phased", "LOH", "state_BAF", "phase"), .direction = "downup") + out[["data"]] <- out[["data"]] %>% + dplyr::group_by(chr, cell_id) %>% + tidyr::fill( c("state_min", "Maj", "Min", "state_phase", "state_AS", + "state_AS_phased", "LOH", "state_BAF", "phase"), .direction = "downup") %>% + dplyr::ungroup() out[["data"]] <- as.data.frame(out[["data"]]) } @@ -1156,7 +1159,7 @@ phasing_LOH <- function(cndat, chromosomes, cutoff = 0.9, ncells = 1) { by = "cell_id" ] %>% .[order(LOH, decreasing = TRUE)] %>% - .[LOH > 0.9 & abs(mBAF) < 0.05] %>% + .[LOH > cutoff & abs(mBAF) < 0.05] %>% .$cell_id if (length(cells) >= ncells) { diff --git a/R/callHSCNrna.R b/R/callHSCNrna.R index a718eea..d5bf334 100644 --- a/R/callHSCNrna.R +++ b/R/callHSCNrna.R @@ -153,12 +153,13 @@ assign_states_noprior <- function(haps, } #' @export -assign_states_dp <- function(bafperchr, +assign_states_dp <- function(bafpersegment, + haplotypes_rna = NULL, samples = 10, alpha_0 = 1, K = 20, - most_variable_chr = TRUE, - top_nchr = 5, + most_variable_segment = TRUE, + top_nseg = 5, overwrite_chr = NULL, removechr = c("chrX"), filtercounts = 0, @@ -172,10 +173,10 @@ assign_states_dp <- function(bafperchr, } message("Generating count matrices...") - baf_total <- bafperchr %>% - dplyr::select(cell_id, chrarm, total) %>% + baf_total <- bafpersegment %>% + dplyr::select(cell_id, segid, total) %>% tidyr::pivot_wider( - names_from = "chrarm", + names_from = "segid", values_from = "total", values_fill = list(total = 0), names_prefix = "chr" @@ -185,10 +186,10 @@ assign_states_dp <- function(bafperchr, baf_total <- subset(baf_total, select = -c(cell_id)) # make chr ~ cell_id matrix for B allele counts - baf_counts <- bafperchr %>% - dplyr::select(cell_id, chrarm, alleleB) %>% + baf_counts <- bafpersegment %>% + dplyr::select(cell_id, segid, alleleB) %>% tidyr::pivot_wider( - names_from = "chrarm", + names_from = "segid", values_from = "alleleB", values_fill = list(alleleB = 0), names_prefix = "chr" @@ -204,11 +205,12 @@ assign_states_dp <- function(bafperchr, baf_total <- baf_total[filtered_cells, ] } - if (most_variable_chr) { + if (most_variable_segment) { # keepchrs <- sort(sapply(baf_counts / baf_total, function(x) var(x, na.rm = TRUE)), decreasing = TRUE)[1:top_nchr] - chr_names <- names(baf_counts) - baf_counts_temp <- baf_counts[, setdiff(chr_names, removechr)] - keepchrs <- sort(sapply( + seg_names <- names(baf_counts) + #baf_counts_temp <- baf_counts[, setdiff(chr_names, removechr)] + baf_counts_temp <- baf_counts + keepsegs <- sort(sapply( names(baf_counts_temp), function(x) { matrixStats::weightedVar(baf_counts[, x] / baf_total[, x], @@ -217,18 +219,18 @@ assign_states_dp <- function(bafperchr, } ), decreasing = TRUE - )[1:top_nchr] - message(paste0("Top ", top_nchr, " most variable chromosomes are: ", paste0(names(keepchrs), collapse = ", "))) - message("Using these chromosomes for clustering") - keepchrs <- names(keepchrs) - } else if (!is.null(overwrite_chr)) { - keepchrs <- overwrite_chr + )[1:top_nseg] + message(paste0("Top ", top_nseg, " most variable segments are: ", paste0(names(keepsegs), collapse = ", "))) + message("Using these segments for clustering") + keepsegs <- names(keepsegs) + } else if (!is.null(overwrite_segs)) { + keepsegs <- overwrite_segs } else { - keepchrs <- paste0("chr", unique(bafperchr$chrarm)) + keepsegs <- paste0("chr", unique(bafpersegment$chrarm)) } - baf_counts <- baf_counts[, keepchrs] - baf_total <- baf_total[, keepchrs] + baf_counts <- baf_counts[, keepsegs] + baf_total <- baf_total[, keepsegs] print(dim(baf_counts)) message("Fitting mixture model using VIBER...") @@ -250,11 +252,10 @@ assign_states_dp <- function(bafperchr, message("Extract cluster means from VIBER object...") theta <- as.data.frame(fit_filt$theta_k) - theta$chrarm <- row.names(theta) + theta$segid <- row.names(theta) row.names(theta) <- NULL theta <- theta %>% - tidyr::pivot_longer(-chrarm, names_to = "clone_id", values_to = "theta") %>% - dplyr::mutate(chrarm = gsub("chr", "", chrarm)) + tidyr::pivot_longer(-segid, names_to = "clone_id", values_to = "theta") #%>%dplyr::mutate(chrarm = gsub("chr", "", chrarm)) message("Generating dataframe mapping cell_id to clone_id...") x <- data.frame( @@ -263,9 +264,9 @@ assign_states_dp <- function(bafperchr, ) message("Assign states to clones and chromosomes...") - states <- bafperchr %>% + states <- bafpersegment %>% dplyr::left_join(x, by = "cell_id") %>% - dplyr::group_by(chrarm, clone_id) %>% + dplyr::group_by(segid, clone_id) %>% dplyr::summarise( BAF = median(BAF), alleleA = sum(alleleA), @@ -273,7 +274,7 @@ assign_states_dp <- function(bafperchr, total = sum(total) ) %>% dplyr::ungroup() %>% - dplyr::left_join(theta, by = c("clone_id", "chrarm")) %>% + dplyr::left_join(theta, by = c("clone_id", "segid")) %>% dplyr::mutate(rounded = round(BAF / 0.25) * 0.25) %>% dplyr::mutate(state_phase = dplyr::case_when( rounded == 0.0 ~ "A-Hom", @@ -282,25 +283,79 @@ assign_states_dp <- function(bafperchr, rounded == 0.75 ~ "B-Gained", rounded == 1.0 ~ "B-Hom" )) %>% - dplyr::select(chrarm, clone_id, state_phase) + dplyr::select(segid, clone_id, state_phase) message("Format final dataframe...") - bafperchr_new <- bafperchr %>% - dplyr::select(chr, arm, chrarm, cell_id, alleleA, alleleB, total, BAF) %>% + bafpersegment_new <- bafpersegment %>% + dplyr::select(chr, segid, cell_id, alleleA, alleleB, total, BAF) %>% dplyr::left_join(x, by = "cell_id") %>% - dplyr::left_join(states, by = c("chrarm", "clone_id")) %>% + dplyr::left_join(states, by = c("segid", "clone_id")) %>% dplyr::select(-clone_id) %>% - dplyr::left_join(hg19chrom_coordinates) %>% - dplyr::left_join(x) %>% - dplyr::mutate(start = start + 1) + dplyr::left_join(x) %>% + tidyr::separate(segid, c("chr2", "start", "end"), remove = FALSE, convert = TRUE) %>% + dplyr::select(segid, chr, start, end, -chr2, dplyr::everything()) %>% + as.data.frame() # Output out <- list() class(out) <- "hscnrna" out[["viber_fit"]] <- fit_filt out[["clusters"]] <- x - out[["hscn"]] <- bafperchr_new - out[["chromosomes_fit"]] <- keepchrs + out[["hscn"]] <- bafpersegment_new %>% as.data.frame() + out[["segments_fits"]] <- keepsegs + out[["haplotypecounts"]] <- haplotypes_rna$snp_counts + out[["segments"]] <- dplyr::distinct(bafpersegment_new, chr, start, end) %>% as.data.frame() return(out) } + + +getCNstatefunc <- function(segments, bins, ncores = 1){ + bins_g <- plyranges::as_granges(dlpbins %>% dplyr::rename(seqnames = chr)) + segments_g <- plyranges::as_granges(segments %>% dplyr::rename(seqnames = chr), keep_mcols = TRUE) + + bins <- plyranges::find_overlaps(bins_g, segments_g) %>% + as.data.frame() %>% + dplyr::rename(chr = seqnames) %>% + dplyr::select(-width, -strand) %>% + dplyr::select(cell_id, chr, start, end, dplyr::everything()) %>% + orderdf() + + return(bins) +} + +#' @export +segments_to_bins <- function(segments, binsize = 0.5e6, ncores = 1){ + + if (!requireNamespace("plyranges", quietly = TRUE)) { + stop("Package \"plyranges\" needed to use this function. Please install it.", + call. = FALSE + ) + } + + bins <- schnapps::getBins(binsize = binsize) + + if (ncores == 1) { + df <- data.table::rbindlist(lapply(unique(segments$cell_id), + function(x){ + segments %>% dplyr::filter(cell_id == x) %>% + getCNstatefunc(., bins = bins) + })) %>% + dplyr::group_by(chr, start, end, cell_id) %>% + dplyr::filter(dplyr::row_number() == 1) %>% + as.data.frame() %>% + dplyr::filter(!is.na(cell_id)) + } else{ + df <- data.table::rbindlist(parallel::mclapply(unique(segments$cell_id), + function(x){ + segments %>% dplyr::filter(cell_id == x) %>% + getCNstatefunc(., bins = bins) + }, mc.cores = ncores)) %>% + dplyr::group_by(chr, start, end, cell_id) %>% + dplyr::filter(dplyr::row_number() == 1) %>% + as.data.frame() %>% + dplyr::filter(!is.na(cell_id)) + } + + return(df) +} diff --git a/R/combinedata_rna.R b/R/combinedata_rna.R index 497f4fb..258e995 100644 --- a/R/combinedata_rna.R +++ b/R/combinedata_rna.R @@ -21,9 +21,6 @@ format_haplotypes_rna <- function(haplotypes, message("Phasing based on distribution across all cells") phased_haplotypes <- phase_haplotypes_rna(haplotypes) } - # } else { - # phased_haplotypes <- computehaplotypecounts(haplotypes, ...) - # } message("Join phased haplotypes...") haplotypes <- as.data.table(haplotypes)[phased_haplotypes, on = .(chr, hap_label) @@ -45,22 +42,38 @@ format_haplotypes_rna <- function(haplotypes, .[, totalcounts := alleleA + alleleB] %>% .[totalcounts > filtern, BAF := alleleB / totalcounts] %>% .[, c("allele1", "allele0", "phase") := NULL] + + haplotype_snp_counts <- haplotypes %>% + .[, list( + alleleA = sum(alleleA), + alleleB = sum(alleleB) + ), by = c("cell_id", "chr", "position", "hap_label")] %>% + .[, totalcounts := alleleA + alleleB] %>% + .[totalcounts > filtern, BAF := alleleB / totalcounts] %>% + dplyr::rename(start = position) %>% + dplyr::select(cell_id, chr, start, hap_label, alleleA, alleleB, totalcounts, BAF) haplotypes <- haplotypes %>% .[, list( alleleA = sum(alleleA), alleleB = sum(alleleB), start = min(position), end = max(position) - ), by = c("cell_id", "chr", "sample", "patient", "hap_label")] %>% + ), by = c("cell_id", "chr", "hap_label")] %>% .[, totalcounts := alleleA + alleleB] %>% .[totalcounts > filtern, BAF := alleleB / totalcounts] %>% - dplyr::select(cell_id, chr, start, end, hap_label, alleleA, alleleB, totalcounts, BAF, sample, patient) + dplyr::select(cell_id, chr, start, end, hap_label, alleleA, alleleB, totalcounts, BAF) if (create_cell_id) { - haplotypes <- dplyr::mutate(cell_id = paste(patient, sample, cell_id, sep = "-")) %>% + haplotypes <- haplotypes %>% + dplyr::mutate(cell_id = paste(patient, sample, cell_id, sep = "-")) %>% + as.data.frame() %>% + orderdf() + + haplotype_snp_counts <- haplotype_snp_counts %>% + dplyr::mutate(cell_id = paste(patient, sample, cell_id, sep = "-")) %>% as.data.frame() %>% orderdf() } - return(haplotypes) + return(list(block_counts = haplotypes, snp_counts = haplotype_snp_counts)) } diff --git a/R/convert.R b/R/convert.R new file mode 100644 index 0000000..5c24402 --- /dev/null +++ b/R/convert.R @@ -0,0 +1,34 @@ +#' @export +assign_haplotype_label <- function(haplotypes, hapbinsize = 50e3){ + colnames(haplotypes) <- c("chr", "pos", "cell_id", "allele0", "allele1") + haplotypes$chr <- as.character(haplotypes$chr) + haplotypes$pos2 <- haplotypes$pos + + bins <- getBins(binsize = hapbinsize) %>% + dplyr::rename(start_bins = start, end_bins = end, chr_bins = chr) %>% + dplyr::select(-width) %>% + as.data.table() %>% + .[, hap_label := 1:.N] + + haplotypes <- haplotypes[bins, on = .(chr == chr_bins, pos > start_bins, pos < end_bins)] + haplotypes <- na.omit(haplotypes) + + hap_labels <- dplyr::distinct(haplotypes, chr, pos2, hap_label) %>% dplyr::rename(position = pos2) + return(hap_label) +} + +#' @export +assign_label_persnp <- function(haplotypes, hapbinsize = 50e3){ + snpdf <- as.data.table(haplotypes)[, hap_label := .GRP, by = list(chr, position)] + return(snpdf) +} + +#' @export +assign_bins_haplotypes <- function(haplotypes, binsize = 0.5e6){ + binshaps <- haplotypes %>% + .[, start := floor(position / binsize) * binsize + 1] %>% + .[, end := start + binsize - 1] %>% + .[, position := NULL] + + return(binshaps) +} \ No newline at end of file diff --git a/R/heatmap_plot.R b/R/heatmap_plot.R index 5d5b38a..72477be 100644 --- a/R/heatmap_plot.R +++ b/R/heatmap_plot.R @@ -12,6 +12,64 @@ cn_colours_minorallele <- scCNminorallele_colors cn_colours_phase <- scCNphase_colors cn_colours_bafstate <- scBAFstate_colors +#' @export +make_copynumber_legend <- function(font_size = 12, ncolcn = 2, ncolas = 1, gainloss = FALSE, cnonly = FALSE, cntitle = "Copy\nNumber", hscntitle = "HSCN State", ...) { + cn_lgd <- ComplexHeatmap::Legend( + title = cntitle, + labels = stringr::str_remove(names(scCN_colors), "CN"), + legend_gp = grid::gpar(fill = as.vector(scCN_colors)), + labels_gp = grid::gpar(fontsize = font_size), + title_gp = grid::gpar(fontsize = font_size), + title_gap = grid::unit(1, "mm"), + grid_height = grid::unit(3, "mm"), grid_width = grid::unit(2.5, "mm"), + ncol = ncolcn + ) + + hscn_lgd <- ComplexHeatmap::Legend( + title = hscntitle, + labels = names(scCNphase_colors), + legend_gp = grid::gpar(fill = as.vector(scCNphase_colors)), + labels_gp = grid::gpar(fontsize = font_size), + title_gp = grid::gpar(fontsize = font_size), + title_gap = grid::unit(1, "mm"), + grid_height = grid::unit(3, "mm"), grid_width = grid::unit(2.5, "mm"), + ncol = ncolas + ) + + gain_loss_lgd <- ComplexHeatmap::Legend( + title = "Δ", + labels = c("Gain", "Loss"), + legend_gp = grid::gpar(fill = c("#E34A33", "#3182BD")), + labels_gp = grid::gpar(fontsize = font_size), + title_gp = grid::gpar(fontsize = font_size), + title_gap = grid::unit(1, "mm"), + grid_height = grid::unit(3, "mm"), grid_width = grid::unit(2.5, "mm"), + ncol = 1 + ) + + if (gainloss){ + lgd <- ComplexHeatmap::packLegend( + gain_loss_lgd, cn_lgd, hscn_lgd, + row_gap = grid::unit(4, "mm"), + column_gap = grid::unit(4, "mm"), + ... + ) + } else { + lgd <- ComplexHeatmap::packLegend( + cn_lgd, hscn_lgd, + row_gap = grid::unit(4, "mm"), + column_gap = grid::unit(4, "mm"), + ... + ) + } + + if (cnonly){ + lgd <- cn_lgd + } + + grid::grid.grabExpr(ComplexHeatmap::draw(lgd)) +} + snv_colours <- structure( names = c(0, 1), c("#7EA5EA", "#9A2E1C") @@ -172,7 +230,7 @@ make_discrete_palette <- function(pal_name, levels) { format_copynumber_values <- function(copynumber, plotcol = "state") { # copynumber[copynumber > 11] <- 11 - if (plotcol %in% c("BAF", "copy")) { + if (plotcol %in% c("BAF", "copy", "other")) { for (col in colnames(copynumber)) { values <- copynumber[, col] copynumber[, col] <- values @@ -344,6 +402,8 @@ make_left_annot <- function(copynumber, clone_pal = NULL, idx = 1, show_legend = TRUE, + annofontsize = 14, + anno_width = 0.4, str_to_remove = NULL) { annot_colours <- list() @@ -383,6 +443,7 @@ make_left_annot <- function(copynumber, y_pos <- 1 - unlist(clone_label_pos) / nrow(clones) grid::grid.text( names(clone_label_pos), 0.5, y_pos, + gp = grid::gpar(fontsize = annofontsize - 1), just = c("centre", "centre") ) } @@ -398,10 +459,12 @@ make_left_annot <- function(copynumber, clone_label = clone_label_generator, Sample = library_labels, col = annot_colours, show_annotation_name = c(TRUE, FALSE, TRUE), - which = "row", annotation_width = grid::unit(rep(0.4, 3), "cm"), + which = "row", annotation_width = grid::unit(rep(anno_width, 3), "cm"), + annotation_name_gp = grid::gpar(fontsize = annofontsize - 1), annotation_legend_param = list( Cluster = list(nrow = clone_legend_rows, direction = "horizontal"), - Sample = list(nrow = library_legend_rows, direction = "horizontal") + Sample = list(nrow = library_legend_rows, direction = "horizontal"), + labels_gp = grid::gpar(fontsize = annofontsize) ), show_legend = show_legend ) @@ -410,9 +473,11 @@ make_left_annot <- function(copynumber, Cluster = clones$clone_label, clone_label = clone_label_generator, col = annot_colours, show_annotation_name = c(TRUE, FALSE), - which = "row", annotation_width = grid::unit(rep(0.4, 2), "cm"), + which = "row", annotation_width = grid::unit(rep(anno_width, 2), "cm"), + annotation_name_gp = grid::gpar(fontsize = annofontsize - 1), annotation_legend_param = list( - Cluster = list(nrow = clone_legend_rows) + Cluster = list(nrow = clone_legend_rows), + labels_gp = grid::gpar(fontsize = annofontsize) ), show_legend = show_legend ) @@ -421,19 +486,23 @@ make_left_annot <- function(copynumber, Cluster = clones$clone_label, Sample = library_labels, col = annot_colours, show_annotation_name = c(TRUE, FALSE, TRUE), - which = "row", annotation_width = grid::unit(rep(0.4, 3), "cm"), + which = "row", annotation_width = grid::unit(rep(anno_width, 3), "cm"), + annotation_name_gp = grid::gpar(fontsize = annofontsize - 1), annotation_legend_param = list( Cluster = list(nrow = clone_legend_rows, direction = "horizontal"), - Sample = list(nrow = library_legend_rows, direction = "horizontal") + Sample = list(nrow = library_legend_rows, direction = "horizontal"), + labels_gp = grid::gpar(fontsize = annofontsize) ), show_legend = show_legend ) } else if (show_library_label == TRUE & show_clone_label == FALSE) { left_annot <- ComplexHeatmap::HeatmapAnnotation( Sample = library_labels, col = annot_colours, - which = "row", simple_anno_size = grid::unit(0.4, "cm"), + which = "row", simple_anno_size = grid::unit(anno_width, "cm"), + annotation_name_gp = grid::gpar(fontsize = annofontsize - 1), annotation_legend_param = list( - Sample = list(nrow = library_legend_rows) + Sample = list(nrow = library_legend_rows), + labels_gp = grid::gpar(fontsize = annofontsize) ), show_legend = show_legend ) @@ -441,9 +510,12 @@ make_left_annot <- function(copynumber, left_annot <- ComplexHeatmap::HeatmapAnnotation( Cluster = clones$clone_label, col = annot_colours, show_annotation_name = c(TRUE, FALSE), - which = "row", annotation_width = grid::unit(rep(0.4, 2), "cm"), + which = "row", annotation_width = grid::unit(rep(anno_width, 2), "cm"), + simple_anno_size = grid::unit(anno_width * 2, "cm"), + annotation_name_gp = grid::gpar(fontsize = annofontsize - 1), annotation_legend_param = list( - Cluster = list(nrow = clone_legend_rows) + Cluster = list(nrow = clone_legend_rows), + labels_gp = grid::gpar(fontsize = annofontsize) ), show_legend = show_legend ) @@ -452,6 +524,7 @@ make_left_annot <- function(copynumber, left_annot <- ComplexHeatmap::HeatmapAnnotation( Sample = library_labels, col = annot_colours, which = "row", simple_anno_size = grid::unit(0.4, "cm"), + annotation_name_gp = grid::gpar(fontsize = annofontsize - 1), annotation_legend_param = list( Sample = list(nrow = library_legend_rows) ), @@ -536,6 +609,7 @@ make_bottom_annot <- function(copynumber, bottom_annot <- ComplexHeatmap::HeatmapAnnotation(chrom_labels = ComplexHeatmap::anno_mark( at = as.vector(unlist(chrom_label_pos)), labels = names(chrom_label_pos), + link_height = grid::unit(linkheight, "mm"), side = "bottom", labels_gp = grid::gpar(fontsize = annofontsize), padding = grid::unit(1.25, "mm"), extend = 0.01, labels_rot = 0 @@ -720,15 +794,19 @@ make_copynumber_heatmap <- function(copynumber, na_col = "white", linkheight = 5, str_to_remove = NULL, + anno_width = 0.4, + rasterquality = 15, ...) { if (class(colvals) == "function"){ leg_params <- list(nrow = 3, - direction = "vertical") + direction = "vertical", + legend_gp = grid::gpar(fontsize = annofontsize)) } else { leg_params <- list(nrow = 3, direction = "vertical", - at = names(colvals)) + at = names(colvals), + legend_gp = grid::gpar(fontsize = annofontsize)) } copynumber_hm <- ComplexHeatmap::Heatmap( @@ -741,9 +819,9 @@ make_copynumber_heatmap <- function(copynumber, cluster_columns = FALSE, show_column_names = FALSE, bottom_annotation = make_bottom_annot(copynumber, chrlabels = chrlabels, nticks = nticks, annotation_height = annotation_height, annofontsize = annofontsize, linkheight = linkheight), - left_annotation = make_left_annot(copynumber, clones, + left_annotation = make_left_annot(copynumber, clones, anno_width = anno_width, library_mapping = library_mapping, clone_pal = clone_pal, show_clone_label = show_clone_label, show_clone_text = show_clone_text, - idx = sample_label_idx, show_legend = show_legend, show_library_label = show_library_label, + idx = sample_label_idx, show_legend = show_legend, show_library_label = show_library_label,annofontsize = annofontsize, str_to_remove = str_to_remove ), heatmap_legend_param = leg_params, @@ -752,7 +830,7 @@ make_copynumber_heatmap <- function(copynumber, plotfrequency = plotfrequency, plotcol = plotcol, SV = SV ), use_raster = TRUE, - raster_quality = 10, + raster_quality = rasterquality, ... ) return(copynumber_hm) @@ -804,6 +882,9 @@ getSVlegend <- function(include = NULL) { #' @param linkheight height of x-axis ticks #' @param newlegendname overwrite default legend name #' @param str_to_remove string to remove from cell_id's when plotting labels +#' @param maxCNcol max value for color scale when plotting raw data +#' @param anno_width width of left annotations +#' @param rasterquality default = 15 #' #' If clusters are set to NULL then the function will compute clusters using UMAP and HDBSCAN. #' @@ -851,6 +932,9 @@ plotHeatmap <- function(cn, linkheight = 5, newlegendname = NULL, str_to_remove = NULL, + maxCNcol = 11, + anno_width = 0.4, + rasterquality = 15, ...) { if (is.hscn(cn) | is.ascn(cn)) { CNbins <- cn$data @@ -882,7 +966,7 @@ plotHeatmap <- function(cn, orderdf(.) } - if (!plotcol %in% c("state", "state_BAF", "state_phase", "state_AS", "state_min", "copy", "BAF", "Min", "Maj")) { + if (!plotcol %in% c("state", "state_BAF", "state_phase", "state_AS", "state_min", "copy", "BAF", "Min", "Maj", "other")) { stop(paste0("Column name - ", plotcol, " not available for plotting, please use one of state, copy, BAF, state_BAF, state_phase, state_AS, Min or Maj")) } @@ -919,6 +1003,11 @@ plotHeatmap <- function(cn, colvals <- circlize::colorRamp2(seq(0, 11, 1), scCN_colors) legendname <- "Copy" } + + if (plotcol == "other") { + colvals <- circlize::colorRamp2(c(0, maxCNcol / 2, maxCNcol), c(scCN_colors["CN0"], scCN_colors["CN3"], scCN_colors["CN11"])) + legendname <- "Copy" + } if (plotcol == "state_AS") { colvals <- cn_colours_loh @@ -1064,6 +1153,8 @@ plotHeatmap <- function(cn, na_col = na_col, linkheight = linkheight, str_to_remove = str_to_remove, + anno_width = anno_width, + rasterquality = rasterquality, ... ) if (plottree == TRUE) { diff --git a/R/plotting.R b/R/plotting.R index 7d1bdf3..d076654 100644 --- a/R/plotting.R +++ b/R/plotting.R @@ -1,4 +1,4 @@ -plottinglist <- function(CNbins, xaxis_order = "genome_position", maxCN = 20) { +plottinglist <- function(CNbins, xaxis_order = "genome_position", maxCN = 20, tickwidth = 50, chrstart = NULL, chrend = NULL) { # arrange segments in order, generate segment index and reorder CN state factor if (!xaxis_order %in% c("bin", "genome_position")) { @@ -51,7 +51,6 @@ plottinglist <- function(CNbins, xaxis_order = "genome_position", maxCN = 20) { dplyr::filter(chr %in% unique(CNbins$chr)) %>% dplyr::mutate(idx = 1:dplyr::n()) - CNbins <- dplyr::full_join(bins, CNbins) %>% dplyr::filter(!is.na(copy)) %>% dplyr::filter(!is.na(state)) %>% @@ -70,21 +69,39 @@ plottinglist <- function(CNbins, xaxis_order = "genome_position", maxCN = 20) { dplyr::pull(idx) # get ticks - median bin of each chromosome - chrticks <- bins %>% - dplyr::filter(chr %in% unique(CNbins$chr)) %>% - dplyr::group_by(chr) %>% - dplyr::summarise(idx = round(median(idx))) %>% - dplyr::pull(idx) + if (length(unique(CNbins$chr)) == 1){ + tickwidth <- (tickwidth * 1e6) / binsize + chrticks <- seq(tickwidth, dim(bins)[1], tickwidth) + chrlabels <- paste0((chrticks * binsize) / 1e6, " Mb") + } else { + chrticks <- bins %>% + dplyr::filter(chr %in% unique(CNbins$chr)) %>% + dplyr::group_by(chr) %>% + dplyr::summarise(idx = round(median(idx))) %>% + dplyr::pull(idx) + chrlabels <- gtools::mixedsort(unique(CNbins$chr)) + } + + if (length(unique(CNbins$chr)) == 1 & (!is.null(chrstart) | !is.null(chrstart))){ + idxstart <- ifelse(is.null(chrstart), min(CNbins$idx), (chrstart * 1e6) / binsize) + idxend <- ifelse(is.null(chrend), max(CNbins$idx), (chrend * 1e6) / binsize) + CNbins <- CNbins %>% dplyr::filter(idx >= idxstart) + CNbins <- CNbins %>% dplyr::filter(idx <= idxend) + chrticks <- seq(idxstart, idxend, tickwidth) + chrlabels <- paste0((chrticks * binsize) / 1e6, " Mb") + minidx <- ifelse(is.null(chrstart), min(bins$idx), idxstart) + maxidx <- ifelse(is.null(chrend), max(bins$idx), idxend) + } else{ + minidx <- min(bins$idx) + maxidx <- max(bins$idx) + } - chrlabels <- gtools::mixedsort(unique(CNbins$chr)) - minidx <- min(bins$idx) - maxidx <- max(bins$idx) } return(list(CNbins = CNbins, bins = bins, chrbreaks = chrbreaks, chrticks = chrticks, chrlabels = chrlabels, minidx = minidx, maxidx = maxidx)) } -plottinglistSV <- function(breakpoints, binsize = 0.5e6, chrfilt = NULL) { +plottinglistSV <- function(breakpoints, binsize = 0.5e6, chrfilt = NULL, chrstart = NULL, chrend = NULL) { breakpoints$chromosome_1 <- as.character(breakpoints$chromosome_1) breakpoints$chromosome_2 <- as.character(breakpoints$chromosome_2) @@ -137,8 +154,13 @@ plottinglistSV <- function(breakpoints, binsize = 0.5e6, chrfilt = NULL) { dplyr::pull(idx) chrlabels <- gtools::mixedsort(unique(bins$chr)) - minidx <- min(bins$idx) - maxidx <- max(bins$idx) + if (length(chrfilt) == 1 & (!is.null(chrstart) | !is.null(chrstart))){ + minidx <- ifelse(is.null(chrstart), min(bins$idx), (chrstart * 1e6) / binsize) + maxidx <- ifelse(is.null(chrend), max(bins$idx), (chrend * 1e6) / binsize) + } else{ + minidx <- min(bins$idx) + maxidx <- max(bins$idx) + } return(list(breakpoints = breakpoints, bins = bins, chrbreaks = chrbreaks, chrticks = chrticks, chrlabels = chrlabels, minidx = minidx, maxidx = maxidx)) } @@ -223,8 +245,10 @@ plotSV2 <- function(breakpoints, ylims = c(0, 2), legend.position = "bottom", svwidth = 1.0, + chrstart = NULL, + chrend = NULL, ...) { - pl <- plottinglistSV(breakpoints, chrfilt = chrfilt) + pl <- plottinglistSV(breakpoints, chrfilt = chrfilt, chrstart = chrstart, chrend = chrend) pl$breakpoints <- pl$breakpoints %>% dplyr::mutate(curve = ifelse((abs(idx_1 - idx_2) < 2) & (chromosome_1 == chromosome_2), FALSE, TRUE)) @@ -331,10 +355,21 @@ squashy_trans <- function() { scales::trans_new( "squashy", function(x) tanh(0.075 * x), - function(x) atanh(x) / 0.075 + function(x) { + nnan <- 1 + while (nnan > 0){ + suppressWarnings(y <- atanh(x) / 0.075) + nnan <- sum(is.nan(y)) + x[is.nan(y)] <- x[is.nan(y)] - 0.000001 + } + return(y) + } + ) } +#create_squashy_trans + get_bezier_df_2 <- function(sv) { set.seed(123) @@ -477,6 +512,10 @@ get_bezier_df <- function(sv, cn, maxCN, homolog = FALSE) { #' @param SV Default is NULL. If a dataframe with structural variant position is passed it will add rearrangement links between bins. #' @param svalpha the alpha scaling of the SV lines, default = 0.5 #' @param genes vector of genes to annotate, will add a dashed vertical line and label +#' @param tickwidth Spacing of ticks (in Mb) when only 1 chromosome is plotted +#' @chrstart Start of region (in Mb) when plotting a single chromosome +#' @chrend End of region (in Mb) when plotting a single chromosome +#' @shape shape for plotting #' #' @return ggplot2 plot #' @@ -503,7 +542,12 @@ plotCNprofile <- function(CNbins, svalpha = 0.5, svwidth = 1.0, adj = 0.03, - genes = NULL, ...) { + genes = NULL, + tickwidth = 50, + chrstart = NULL, + chrend = NULL, + shape = 16, + ...) { if (!xaxis_order %in% c("bin", "genome_position")) { stop("xaxis_order must be either 'bin' or 'genome_position'") } @@ -513,11 +557,16 @@ plotCNprofile <- function(CNbins, } if (y_axis_trans == "squashy") { - maxCN <- min(c(24, maxCN)) - ybreaks <- c(0, 2, 5, 10, 20) + ybreaks <- c(0, 2, 5, 10, maxCN) } else { ybreaks <- seq(0, maxCN, 2) } + + if (length(chrfilt) == 1){ + xlab <- paste0('Chromosome ', chrfilt) + } else{ + xlab <- 'Chromosome' + } statecolpal <- scCNstate_cols() @@ -530,7 +579,7 @@ plotCNprofile <- function(CNbins, pl <- CNbins %>% dplyr::filter(cell_id == cellid) %>% - plottinglist(., xaxis_order = xaxis_order, maxCN = maxCN) + plottinglist(., xaxis_order = xaxis_order, maxCN = maxCN, tickwidth = tickwidth, chrstart = chrstart, chrend = chrend) if (raster == TRUE) { if (!requireNamespace("ggrastr", quietly = TRUE)) { @@ -543,7 +592,7 @@ plotCNprofile <- function(CNbins, dplyr::mutate(state = factor(paste0(state), levels = c(paste0(seq(0, 10, 1)), "11+"))) %>% ggplot2::ggplot(ggplot2::aes(x = idx, y = copy)) + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - ggrastr::geom_point_rast(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval) + + ggrastr::geom_point_rast(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval, shape = shape) + ggplot2::scale_color_manual( name = "Copy number", breaks = names(statecolpal), @@ -559,7 +608,7 @@ plotCNprofile <- function(CNbins, ) + ggplot2::scale_x_continuous(breaks = pl$chrticks, labels = pl$chrlabels, expand = c(0, 0), limits = c(pl$minidx, pl$maxidx), guide = ggplot2::guide_axis(check.overlap = TRUE)) + ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(0, maxCN), trans = y_axis_trans) + - ggplot2::xlab("Chromosome") + + ggplot2::xlab(xlab) + ggplot2::ylab("Copy Number") + cowplot::theme_cowplot(...) + ggplot2::guides(colour = ggplot2::guide_legend( @@ -573,7 +622,7 @@ plotCNprofile <- function(CNbins, dplyr::mutate(state = factor(paste0(state), levels = c(paste0(seq(0, 10, 1)), "11+"))) %>% ggplot2::ggplot(ggplot2::aes(x = idx, y = copy)) + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - ggplot2::geom_point(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval) + + ggplot2::geom_point(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval, shape = 16) + ggplot2::scale_color_manual( name = "Allele Specific CN", breaks = names(statecolpal), @@ -589,7 +638,7 @@ plotCNprofile <- function(CNbins, ) + ggplot2::scale_x_continuous(breaks = pl$chrticks, labels = pl$chrlabels, expand = c(0, 0), limits = c(pl$minidx, pl$maxidx), guide = ggplot2::guide_axis(check.overlap = TRUE)) + ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(0, maxCN), trans = y_axis_trans) + - ggplot2::xlab("Chromosome") + + ggplot2::xlab(xlab) + ggplot2::ylab("Copy Number") + cowplot::theme_cowplot(...) + ggplot2::guides(colour = ggplot2::guide_legend( @@ -664,6 +713,10 @@ plotCNprofileBAFhomolog <- function(cn, plotdata = TRUE, offset = NULL, linewidth = 0.6, + tickwidth = 50, + chrstart = NULL, + chrend = NULL, + shape = 16, ...) { if (!xaxis_order %in% c("bin", "genome_position")) { stop("xaxis_order must be either 'bin' or 'genome_position'") @@ -674,13 +727,18 @@ plotCNprofileBAFhomolog <- function(cn, } else { CNbins <- cn } + + if (length(chrfilt) == 1){ + xlab <- paste0('Chromosome ', chrfilt) + } else{ + xlab <- 'Chromosome' + } CNbins <- CNbins %>% dplyr::mutate(Bcopy = BAF * copy, Acopy = (1 - BAF) * copy) if (y_axis_trans == "squashy") { - maxCN <- min(c(24, maxCN)) - ybreaks <- c(0, 2, 5, 10, 20) + ybreaks <- c(0, 2, 5, 10, maxCN) } else { ybreaks <- seq(0, maxCN, 2) } @@ -706,7 +764,8 @@ plotCNprofileBAFhomolog <- function(cn, dplyr::filter(cell_id == cellid) %>% dplyr::mutate(Acopy = ifelse(Acopy > maxCN, maxCN - 0.001, Acopy)) %>% dplyr::mutate(Bcopy = ifelse(Bcopy > maxCN, maxCN - 0.001, Bcopy)) %>% - plottinglist(., xaxis_order = xaxis_order, maxCN = maxCN) + plottinglist(., xaxis_order = xaxis_order, maxCN = maxCN, tickwidth = tickwidth, chrstart = chrstart, chrend = chrend) + pl_for_sv <- pl if (plotdata == TRUE){ pl$CNbins <- pl$CNbins %>% @@ -728,7 +787,7 @@ plotCNprofileBAFhomolog <- function(cn, dplyr::mutate(state_min = paste0(state_min)) %>% ggplot2::ggplot(ggplot2::aes(x = idx)) + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - ggrastr::geom_point_rast(ggplot2::aes(y = value, col = name), size = pointsize, alpha = alphaval) + + ggrastr::geom_point_rast(ggplot2::aes(y = value, col = name), size = pointsize, alpha = alphaval, shape = shape) + ggplot2::scale_color_manual( name = "", labels = c("Homolog A", "Homolog B"), @@ -743,7 +802,7 @@ plotCNprofileBAFhomolog <- function(cn, ) + ggplot2::scale_x_continuous(breaks = pl$chrticks, labels = pl$chrlabels, expand = c(0, 0), limits = c(pl$minidx, pl$maxidx), guide = ggplot2::guide_axis(check.overlap = TRUE)) + ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(0, maxCN), trans = y_axis_trans) + - ggplot2::xlab("Chromosome") + + ggplot2::xlab(xlab) + ggplot2::ylab("Copy Number") + cowplot::theme_cowplot(...) + ggplot2::guides(colour = ggplot2::guide_legend( @@ -758,7 +817,7 @@ plotCNprofileBAFhomolog <- function(cn, dplyr::mutate(state_min = paste0(state_min)) %>% ggplot2::ggplot(ggplot2::aes(x = idx)) + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - ggplot2::geom_point(ggplot2::aes(y = value, col = name), size = pointsize, alpha = alphaval) + + ggplot2::geom_point(ggplot2::aes(y = value, col = name), size = pointsize, alpha = alphaval, shape = shape) + ggplot2::scale_color_manual( name = "", labels = c("Homolog A", "Homolog B"), @@ -773,7 +832,7 @@ plotCNprofileBAFhomolog <- function(cn, ) + ggplot2::scale_x_continuous(breaks = pl$chrticks, labels = pl$chrlabels, expand = c(0, 0), limits = c(pl$minidx, pl$maxidx), guide = ggplot2::guide_axis(check.overlap = TRUE)) + ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(0, maxCN), trans = y_axis_trans) + - ggplot2::xlab("Chromosome") + + ggplot2::xlab(xlab) + ggplot2::ylab("Copy Number") + cowplot::theme_cowplot(...) + ggplot2::guides(colour = ggplot2::guide_legend( @@ -785,18 +844,25 @@ plotCNprofileBAFhomolog <- function(cn, } else{ pl$CNbins <- pl$CNbins %>% - dplyr::mutate(idx_start = idx, idx_end = lead(idx_start)) + dplyr::mutate(rl = data.table::rleid(state_AS_phased)) %>% + dplyr::group_by(chr, rl, Min, Maj) %>% + dplyr::summarize(idx_start = dplyr::first(idx), idx_end = dplyr::last(idx)) pl$CNbins <- pl$CNbins %>% tidyr::pivot_longer(cols = c("Maj", "Min")) if (is.null(offset)) { offset <- maxCN * 1.1 * 0.03 - print(offset) } - pl$CNbins <- pl$CNbins %>% - dplyr::mutate(value = ifelse(name == "Maj", value + offset, value - offset)) + if (y_axis_trans == "squashy"){ + pl$CNbins <- pl$CNbins %>% + dplyr::mutate(value = ifelse(name == "Maj", value + squashy_trans()$transform(2 * value * offset), + value - squashy_trans()$transform(2 * value * offset))) + } else{ + pl$CNbins <- pl$CNbins %>% + dplyr::mutate(value = ifelse(name == "Maj", value + offset, value - offset)) + } if (shuffle){ pl$CNbins <- dplyr::sample_frac(pl$CNbins, 1L) @@ -810,9 +876,9 @@ plotCNprofileBAFhomolog <- function(cn, } gCN <- pl$CNbins %>% - ggplot2::ggplot(ggplot2::aes(x = idx)) + + ggplot2::ggplot() + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - geom_linerange(aes(xmin = idx_start, xmax = idx_end, y = value, colour = name), size = linewidth) + + ggplot2::geom_linerange(aes(xmin = idx_start, xmax = idx_end, y = value, colour = name), size = linewidth) + ggplot2::scale_color_manual( name = "", labels = c("Homolog A", "Homolog B"), @@ -827,7 +893,7 @@ plotCNprofileBAFhomolog <- function(cn, ) + ggplot2::scale_x_continuous(breaks = pl$chrticks, labels = pl$chrlabels, expand = c(0, 0), limits = c(pl$minidx, pl$maxidx), guide = ggplot2::guide_axis(check.overlap = TRUE)) + ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(0 - offset, maxCN + offset), trans = y_axis_trans) + - ggplot2::xlab("Chromosome") + + ggplot2::xlab(xlab) + ggplot2::ylab("Copy Number") + cowplot::theme_cowplot(...) + ggplot2::guides(colour = ggplot2::guide_legend( @@ -837,9 +903,9 @@ plotCNprofileBAFhomolog <- function(cn, ggplot2::theme(legend.title = ggplot2::element_blank(), legend.position = legend.position) } else { gCN <- pl$CNbins %>% - ggplot2::ggplot(ggplot2::aes(x = idx)) + + ggplot2::ggplot() + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - geom_linerange(aes(xmin = idx_start, xmax = idx_end, y = value, colour = name), size = linewidth) + + ggplot2::geom_linerange(aes(xmin = idx_start, xmax = idx_end, y = value, colour = name), size = linewidth) + ggplot2::scale_color_manual( name = "", labels = c("Homolog A", "Homolog B"), @@ -854,7 +920,7 @@ plotCNprofileBAFhomolog <- function(cn, ) + ggplot2::scale_x_continuous(breaks = pl$chrticks, labels = pl$chrlabels, expand = c(0, 0), limits = c(pl$minidx, pl$maxidx), guide = ggplot2::guide_axis(check.overlap = TRUE)) + ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(0 - offset, maxCN + offset), trans = y_axis_trans) + - ggplot2::xlab("Chromosome") + + ggplot2::xlab(xlab) + ggplot2::ylab("Copy Number") + cowplot::theme_cowplot(...) + ggplot2::guides(colour = ggplot2::guide_legend( @@ -868,7 +934,7 @@ plotCNprofileBAFhomolog <- function(cn, if (!is.null(SV)) { svpl <- plottinglistSV(SV, chrfilt = chrfilt) - bezdf <- get_bezier_df(svpl, pl, maxCN, homolog = TRUE) + bezdf <- get_bezier_df(svpl, pl_for_sv, maxCN, homolog = TRUE) bezdf <- bezdf %>% dplyr::filter((position_1 != position_2) | rearrangement_type == "foldback") gCN <- gCN + @@ -965,6 +1031,10 @@ plotCNprofileBAF <- function(cn, plotdata = TRUE, offset = NULL, my_title = NULL, + tickwidth = 50, + chrstart = NULL, + chrend = NULL, + shape = 16, ...) { if (homolog == TRUE) { ghomolog <- plotCNprofileBAFhomolog(cn, @@ -987,6 +1057,10 @@ plotCNprofileBAF <- function(cn, svwidth = svwidth, plotdata = plotdata, offset = offset, + tickwidth = tickwidth, + chrstart = chrstart, + chrend = chrend, + shape = shape, ... ) return(ghomolog) @@ -1003,8 +1077,7 @@ plotCNprofileBAF <- function(cn, } if (y_axis_trans == "squashy") { - maxCN <- min(c(24, maxCN)) - ybreaks <- c(0, 2, 5, 10, 20) + ybreaks <- c(0, 2, 5, 10, maxCN) } else { ybreaks <- seq(0, maxCN, 2) } @@ -1012,8 +1085,12 @@ plotCNprofileBAF <- function(cn, if (is.null(my_title)){ my_title <- cellid } - - + + if (length(chrfilt) == 1){ + xlab <- paste0('Chromosome ', chrfilt) + } else{ + xlab <- 'Chromosome' + } if (is.null(cellid)) { cellid <- unique(CNbins$cell_id)[min(cellidx, length(unique(CNbins$cell_id)))] @@ -1046,7 +1123,7 @@ plotCNprofileBAF <- function(cn, pl <- CNbins %>% dplyr::filter(cell_id == cellid) %>% - plottinglist(., xaxis_order = xaxis_order, maxCN = maxCN) + plottinglist(., xaxis_order = xaxis_order, maxCN = maxCN, tickwidth = tickwidth, chrstart = chrstart, chrend = chrend) if (raster == TRUE) { if (!requireNamespace("ggrastr", quietly = TRUE)) { @@ -1058,12 +1135,13 @@ plotCNprofileBAF <- function(cn, dplyr::mutate(state_min = paste0(state_min)) %>% ggplot2::ggplot(ggplot2::aes(x = idx, y = BAF)) + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - ggrastr::geom_point_rast(ggplot2::aes_string(col = BAFcol), size = pointsize, alpha = alphaval) + + ggrastr::geom_point_rast(ggplot2::aes_string(col = BAFcol), size = pointsize, alpha = alphaval, shape = shape) + ggplot2::scale_color_manual( name = "CN", breaks = names(BAFcolpal), labels = names(BAFcolpal), - values = BAFcolpal + values = BAFcolpal, + drop = FALSE ) + ggplot2::theme( axis.title.y = ggplot2::element_blank(), @@ -1073,7 +1151,7 @@ plotCNprofileBAF <- function(cn, ) + ggplot2::scale_x_continuous(breaks = pl$chrticks, labels = pl$chrlabels, expand = c(0, 0), limits = c(pl$minidx, pl$maxidx), guide = ggplot2::guide_axis(check.overlap = TRUE)) + ggplot2::scale_y_continuous(breaks = c(0.0, 0.25, 0.5, 0.75, 1.0), limits = c(0, 1.0)) + - ggplot2::xlab("Chromosome") + + ggplot2::xlab(xlab) + ggplot2::ylab("BAF") + ggplot2::ggtitle(my_title) + cowplot::theme_cowplot(...) + @@ -1092,7 +1170,7 @@ plotCNprofileBAF <- function(cn, dplyr::mutate(state_min = paste0(state_min)) %>% ggplot2::ggplot(ggplot2::aes(x = idx, y = copy)) + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - ggrastr::geom_point_rast(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval) + + ggrastr::geom_point_rast(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval, shape = shape) + ggplot2::scale_color_manual( name = "Allele Specific CN", breaks = names(statecolpal), @@ -1108,7 +1186,7 @@ plotCNprofileBAF <- function(cn, ) + ggplot2::scale_x_continuous(breaks = pl$chrticks, labels = pl$chrlabels, expand = c(0, 0), limits = c(pl$minidx, pl$maxidx), guide = ggplot2::guide_axis(check.overlap = TRUE)) + ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(0, maxCN), trans = y_axis_trans) + - ggplot2::xlab("Chromosome") + + ggplot2::xlab(xlab) + ggplot2::ylab("Copy Number") + cowplot::theme_cowplot(...) + ggplot2::guides(colour = ggplot2::guide_legend( @@ -1121,12 +1199,13 @@ plotCNprofileBAF <- function(cn, dplyr::mutate(state_min = paste0(state_min)) %>% ggplot2::ggplot(ggplot2::aes(x = idx, y = BAF)) + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - ggplot2::geom_point(ggplot2::aes_string(col = BAFcol), size = pointsize, alpha = alphaval) + + ggplot2::geom_point(ggplot2::aes_string(col = BAFcol), size = pointsize, alpha = alphaval, shape = shape) + ggplot2::scale_color_manual( name = "CN", breaks = names(BAFcolpal), labels = names(BAFcolpal), - values = BAFcolpal + values = BAFcolpal, + drop = FALSE ) + ggplot2::theme( axis.title.y = ggplot2::element_blank(), @@ -1155,7 +1234,7 @@ plotCNprofileBAF <- function(cn, dplyr::mutate(state_min = paste0(state_min)) %>% ggplot2::ggplot(ggplot2::aes(x = idx, y = copy)) + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - ggplot2::geom_point(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval) + + ggplot2::geom_point(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval, shape = shape) + ggplot2::scale_color_manual( name = "Allele Specific CN", breaks = names(statecolpal), @@ -1171,7 +1250,7 @@ plotCNprofileBAF <- function(cn, ) + ggplot2::scale_x_continuous(breaks = pl$chrticks, labels = pl$chrlabels, expand = c(0, 0), limits = c(pl$minidx, pl$maxidx), guide = ggplot2::guide_axis(check.overlap = TRUE)) + ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(0, maxCN), trans = y_axis_trans) + - ggplot2::xlab("Chromosome") + + ggplot2::xlab(xlab) + ggplot2::ylab("Copy Number") + cowplot::theme_cowplot(...) + ggplot2::guides(colour = ggplot2::guide_legend( @@ -1232,7 +1311,7 @@ plotCNprofileBAF <- function(cn, #' @export -plotCNBAF <- function(cn, nfilt = 10^5, plottitle = "5Mb", pointsize = 0.1, ...) { +plotCNBAF <- function(cn, nfilt = 10^5, plottitle = "5Mb", pointsize = 0.1, shape = 16, ...) { if (is.hscn(cn) | is.ascn(cn)) { CNbins <- cn$data } else { @@ -1259,7 +1338,7 @@ plotCNBAF <- function(cn, nfilt = 10^5, plottitle = "5Mb", pointsize = 0.1, ...) g <- CNbins %>% ggplot2::ggplot(ggplot2::aes(y = BAF, x = copy, col = paste0("CN", state))) + - ggplot2::geom_point(size = pointsize, alpha = 0.2) + + ggplot2::geom_point(size = pointsize, alpha = 0.2, shape = shape) + ggplot2::xlab("Corrected read counts") + cowplot::theme_cowplot(...) + ggplot2::ggtitle(plottitle) + diff --git a/R/plotting_rna.R b/R/plotting_rna.R index 113eba0..85563c7 100644 --- a/R/plotting_rna.R +++ b/R/plotting_rna.R @@ -1,5 +1,5 @@ #' @export -per_chr_baf <- function(haps, filtern = 9, perarm = FALSE, labelclones = FALSE) { +per_chr_baf_plot <- function(haps, filtern = 9, perarm = FALSE, labelclones = FALSE) { if (labelclones) { if (perarm) { chridx <- data.frame(chrarm = paste0(rep(c(paste0(seq(1:22)), "X", "Y"), each = 2), rep(c("p", "q"), 24))) %>% @@ -109,6 +109,64 @@ per_chr_baf <- function(haps, filtern = 9, perarm = FALSE, labelclones = FALSE) return(hst) } +#' @export +per_segment_baf_plot <- function(hscn, filtern = 9, labelclones = FALSE) { + + if (is.hscnrna(hscn)) { + cndat <- hscn$hscn + } else { + cndat <- hscn + } + + mixedrank = function(x) order(gtools::mixedorder(x)) + + if (labelclones) { + + chridx <- dplyr::distinct(cndat, chr, segid) %>% + dplyr::arrange(mixedrank(segid)) %>% + dplyr::mutate(idx = 1:n()) + + hst <- cndat %>% + dplyr::filter(total > filtern) %>% + dplyr::left_join(chridx) %>% + dplyr::mutate(segidf = stringr::str_replace_all(segid, "_", "\n")) %>% + dplyr::mutate(segidf = forcats::fct_reorder(as.factor(segidf), idx)) %>% + ggplot2::ggplot(ggplot2::aes(x = BAF, fill = clone_id)) + + ggplot2::geom_histogram(bins = 30, alpha = 0.5) + + ggplot2::facet_wrap(~segidf, scales = "free_y") + + cowplot::theme_cowplot(font_size = 8) + + ggplot2::scale_x_continuous(breaks = c(0.0, 0.5, 1.0), limits = c(-0.05, 1.05)) + + cowplot::panel_border() + + ggplot2::geom_vline(xintercept = 0.5, lty = 2, size = 0.5, col = "firebrick4") + + ggplot2::theme( + axis.title.y = ggplot2::element_blank(), + axis.text.y = ggplot2::element_blank(), + axis.ticks.y = ggplot2::element_blank() + ) + } else { + hst <- cndat %>% + dplyr::filter(total > filtern) %>% + dplyr::left_join(chridx) %>% + dplyr::mutate(segidf = stringr::str_replace_all(segid, "_", "\n")) %>% + dplyr::mutate(segidf = forcats::fct_reorder(as.factor(segidf), idx)) %>% + ggplot2::ggplot(ggplot2::aes(x = BAF)) + + ggplot2::geom_histogram(bins = 30, alpha = 0.5) + + ggplot2::facet_wrap(~segidf, scales = "free_y") + + cowplot::theme_cowplot(font_size = 8) + + ggplot2::scale_x_continuous(breaks = c(0.0, 0.5, 1.0), limits = c(-0.05, 1.05)) + + cowplot::panel_border() + + ggplot2::geom_vline(xintercept = 0.5, lty = 2, size = 0.5, col = "firebrick4") + + ggplot2::theme( + axis.title.y = ggplot2::element_blank(), + axis.text.y = ggplot2::element_blank(), + axis.ticks.y = ggplot2::element_blank() + ) + } + + return(hst) +} + + #' @export plot_proportions <- function(hscn_dna_arm, hscn_rna_arm, perarm = FALSE) { prop_rna <- hscn_rna_arm %>% diff --git a/R/util.R b/R/util.R index 1d92308..58d62b4 100644 --- a/R/util.R +++ b/R/util.R @@ -338,6 +338,7 @@ widen_bins <- function(CNbins, return(as.data.frame(widerCNbins)) } + #' @export snv_states <- function(SNV, CNbins) { CN <- CNbins %>% @@ -360,6 +361,19 @@ snv_states <- function(SNV, CNbins) { return(as.data.frame(mappedSNVs)) } +#' @export +map_to_segments <- function(regions, segments){ + segments <- dplyr::mutate(segments, seg_start = start, seg_end = end) + regions <- dplyr::mutate(regions, pos = start) + mapped_segs <- as.data.table(regions)[as.data.table(segments), + on = .(chr == chr, start > start, start < end) + ] %>% + .[, start := pos] %>% + .[, pos := NULL] %>% + .[, start.1 := NULL] %>% + na.omit() +} + #' Genomic coordinate to chromosome arm #' @@ -683,6 +697,129 @@ per_arm_baf_mat <- function(haps, return(list(bafperchr = baf, bafperchrmat = baf_mat)) } +mixedrank = function(x) order(gtools::mixedorder(x)) + +#' @export +filter_segments <- function(segs, binwidth = 5e6){ + data("hg19chrom_coordinates", envir = environment()) + chrin <- unique(segs$chr) + + w <- 0 + while (w < 5e6){ + segs <- segs %>% + dplyr::mutate(w = end-start) %>% + dplyr::filter(w > binwidth) %>% + dplyr::group_by(chr, cell_id) %>% + dplyr::mutate(end2 = lead(start) - 1) %>% + dplyr::mutate(end = ifelse(is.na(end2), end, end2)) %>% + dplyr::ungroup() %>% + dplyr::select(chr, start, end, w) %>% + dplyr::arrange(mixedrank(chr), start) + w <- min(segs$w) + } + + segs <- dplyr::select(segs, -w) + + #make sure whole chromosome is covered + segs <- dplyr::left_join(segs, hg19chrom_coordinates %>% dplyr::filter(arm == "") %>% dplyr::mutate(start = start + 1), by = "chr") %>% + dplyr::group_by(chr) %>% + dplyr::mutate(start = ifelse(dplyr::row_number() == 1, start.y, start.x), + end = ifelse(dplyr::row_number() == dplyr::n(), end.y, end.x)) %>% + dplyr::select(chr, start, end) + + w <- 0 + while (w < 5e6){ + segs <- segs %>% + dplyr::mutate(w = end-start) %>% + dplyr::filter(w > binwidth) %>% + dplyr::group_by(chr) %>% + dplyr::mutate(end2 = lead(start) - 1) %>% + dplyr::mutate(end = ifelse(is.na(end2), end, end2)) %>% + dplyr::ungroup() %>% + dplyr::select(chr, start, end, w) %>% + dplyr::arrange(mixedrank(chr), start) + w <- min(segs$w) + } + + segs <- dplyr::select(segs, -w) + + if (all(chrin %in% unique(segs$chr)) == FALSE){ + warning("Not all chromosomes present in output adding whole chromosomes") + chroms <- chrin[which(!chrin %in% unique(segs$chr))] + to_add <- hg19chrom_coordinates %>% + dplyr::filter(chr %in% chroms & arm == "") %>% + dplyr::mutate(start = start + 1) %>% + dplyr::select(chr, start, end) + segs <- dplyr::bind_rows(segs, to_add) %>% + dplyr::arrange(mixedrank(chr), start) + } + + return(segs %>% as.data.frame()) +} + +#' @export +per_segment_baf_mat <- function(haps, + segments = NULL, + mergelowcounts = TRUE, + mincounts = 10, + arms = NULL) { + + options(scipen=999) + + if (is.null(segments)){ + message("No segments provided, using chromosome arms as segments") + segments <- hg19chrom_coordinates %>% + dplyr::filter(arm != "") %>% + dplyr::select(-arm, -chrarm) %>% + dplyr::mutate(start = start + 1) + } + + mapped_segs <- map_to_segments(haps, segments) + + mapped_segs$segid <- paste(mapped_segs$chr, mapped_segs$seg_start, mapped_segs$seg_end, sep = "_") + + baf <- mapped_segs %>% + dplyr::group_by(chr, segid, cell_id) %>% + dplyr::summarise( + alleleA = sum(alleleA, na.rm = TRUE), + alleleB = sum(alleleB, na.rm = TRUE) + ) %>% + dplyr::ungroup() %>% + dplyr::mutate( + BAF = alleleB / (alleleA + alleleB), + total = alleleB + alleleA + ) + + ord <- dplyr::distinct(baf, chr, segid) %>% + as.data.table() %>% + .[gtools::mixedorder(chr)] %>% + .[, idx := 1:.N] %>% + dplyr::as_tibble() + + baf <- dplyr::left_join(baf, ord) + + baf_mat <- baf %>% + dplyr::select(cell_id, segid, BAF) %>% + tidyr::pivot_wider(names_from = "segid", values_from = "BAF") %>% + as.data.frame() + + row.names(baf_mat) <- baf_mat$cell_id + baf_mat <- subset(baf_mat, select = -c(cell_id)) + + idx <- dplyr::distinct(baf, chr, segid, idx) %>% dplyr::arrange(idx) + baf_mat <- baf_mat[, idx$segid] + + counts <- baf %>% + dplyr::group_by(chr, segid) %>% + dplyr::summarise(median = median(total)) %>% + dplyr::ungroup() %>% + dplyr::filter(median < mincounts) + + warning(paste0("The following segments have on average ", mincounts, " or fewer counts: ", paste(counts$segid, collapse = ", "))) + + return(list(bafpersegment = baf, bafpersegmentmat = baf_mat)) +} + #' @export per_chrarm_cn <- function(hscn, arms = NULL) { data("hg19chrom_coordinates", envir = environment()) @@ -833,18 +970,20 @@ add_states <- function(df) { } #' @export -createBAFassay <- function(seur, rna_ascn) { +createBAFassay <- function(seur, rna_ascn, ref = "hg19") { if (!requireNamespace("Seurat", quietly = TRUE)) { stop("Package \"Seurat\" is needed for this function. Please install it.", call. = FALSE ) } + + data("gene_locations", envir = environment()) message("Add BAF to Seurat object") x <- tidyr::pivot_wider(rna_ascn$hscn %>% - dplyr::select(cell_id, BAF, chrarm) %>% - dplyr::mutate(chrarm = paste0("BAF-", chrarm)), - names_from = "chrarm", + dplyr::select(cell_id, BAF, segid) %>% + dplyr::mutate(segid = paste0("BAF-", stringr::str_replace_all(segid, "_", "-"))), + names_from = "segid", values_from = c("BAF") ) %>% as.data.frame() @@ -854,12 +993,13 @@ createBAFassay <- function(seur, rna_ascn) { cM <- colMeans(x, na.rm = TRUE) indx <- which(is.na(x), arr.ind = TRUE) x[indx] <- cM[indx[, 2]] - seur[["BAF"]] <- Seurat::CreateAssayObject(data = t(x)) + baf_assay <- Seurat::CreateAssayObject(data = t(x)) + seur[["segBAF"]] <- Seurat::CreateAssayObject(data = t(x)) message("Add allele specific state to Seurat Object") x <- tidyr::pivot_wider(rna_ascn$hscn %>% - dplyr::select(cell_id, state_phase, chrarm), - names_from = "chrarm", + dplyr::select(cell_id, state_phase, segid), + names_from = "segid", values_from = c("state_phase") ) %>% as.data.frame() @@ -879,10 +1019,75 @@ createBAFassay <- function(seur, rna_ascn) { metadata = clonesvec, col.name = "DP_cloneid" ) + + if (!is.null(rna_ascn$haplotypecounts)){ + message("Add gene allelic imbalance") + snps_to_genes <- map_to_segments(rna_ascn$haplotypecounts, + gene_locations[[ref]] %>% dplyr::filter(!stringr::str_detect(chr, "_"))) + gene_baf <- snps_to_genes %>% + .[, list(BAF = mean(BAF)), by = c("cell_id", "ensembl_gene_symbol")] %>% + .[, cell_id := as.factor(cell_id)] %>% + .[, ensembl_gene_symbol := as.factor(paste0("BAF-",ensembl_gene_symbol))] + x <- tidyr::pivot_wider(gene_baf %>% + dplyr::select(cell_id, ensembl_gene_symbol, BAF), + names_from = "ensembl_gene_symbol", + values_from = c("BAF") + ) %>% + as.data.frame() + row.names(x) <- x$cell_id + x <- as.matrix(subset(x, select = -cell_id)) + seur[["gBAF"]] <- Seurat::CreateAssayObject(data = t(x)) + meta <- gene_locations[[ref]] %>% + dplyr::filter(!stringr::str_detect(chr, "_")) %>% + dplyr::group_by(ensembl_gene_symbol) %>% + dplyr::filter(dplyr::row_number() == 1) %>% #hack, take first chr if there are > 1 + as.data.frame() %>% + dplyr::mutate(arm = paste0(chr, coord_to_arm(chr, start, assembly = ref))) + row.names(meta) <- meta$ensembl_gene_symbol + meta <- meta[stringr::str_remove(rownames(seur[["gBAF"]]), "BAF-"), ] + meta <- merge(meta, snps_to_genes[, list(totalsnpcounts = sum(totalcounts)), by = "ensembl_gene_symbol"]) + row.names(meta) <- meta$ensembl_gene_symbol + seur[["gBAF"]]@meta.features <- meta + + gene_counts <- snps_to_genes %>% + .[, list(totalcounts = sum(totalcounts)), by = c("cell_id", "ensembl_gene_symbol")] %>% + .[, cell_id := as.factor(cell_id)] %>% + .[, ensembl_gene_symbol := as.factor(paste0("counts-",ensembl_gene_symbol))] + x <- tidyr::pivot_wider(gene_counts %>% + dplyr::select(cell_id, ensembl_gene_symbol, totalcounts), + names_from = "ensembl_gene_symbol", + values_from = c("totalcounts") + ) %>% + as.data.frame() + row.names(x) <- x$cell_id + x <- as.matrix(subset(x, select = -cell_id)) + seur[["gCounts"]] <- Seurat::CreateAssayObject(data = t(x)) + } + + # x <- with(gene_baf, Matrix::sparseMatrix(i=as.numeric(ensembl_gene_symbol), + # j=as.numeric(cell_id), + # x=as.numeric(BAF), + # dimnames=list(levels(ensembl_gene_symbol), levels(cell_id)))) + #seur[["gBAF"]] <- Seurat::CreateAssayObject(data = x) return(seur) } +#' export +add_gene_locations_to_seurat <- function(obj, ref = "hg19"){ + meta <- gene_locations[[ref]] %>% + dplyr::filter(!stringr::str_detect(chr, "_")) %>% + dplyr::group_by(ensembl_gene_symbol) %>% + dplyr::filter(dplyr::row_number() == 1) %>% #hack, take first chr if there are > 1 + as.data.frame() %>% + dplyr::mutate(arm = paste0(chr, coord_to_arm(chr, start, assembly = ref))) + row.names(meta) <- meta$ensembl_gene_symbol + meta <- meta[rownames(obj[["RNA"]]), ] + row.names(meta) <- row.names(obj[["RNA"]]@meta.features) + obj[["RNA"]]@meta.features <- cbind(obj[["RNA"]]@meta.features, meta) + return(obj) +} + #' @export consensuscopynumber <- function(hscn, cl = NULL) { if (!is.null(cl)) { @@ -897,11 +1102,11 @@ consensuscopynumber <- function(hscn, cl = NULL) { .[, .( state = as.double(round(median(state, na.rm = TRUE))), copy = as.double(median(copy, na.rm = TRUE)), - Maj = as.double(floor(median(Maj))), - alleleA = sum(alleleA), - alleleB = sum(alleleB), - totalcounts = sum(totalcounts), - BAF = median(BAF) + Maj = as.double(floor(median(Maj, na.rm = TRUE))), + alleleA = sum(alleleA, na.rm = TRUE), + alleleB = sum(alleleB, na.rm = TRUE), + totalcounts = sum(totalcounts, na.rm = TRUE), + BAF = median(BAF, na.rm = TRUE) ), by = .(chr, start, end, clone_id)] %>% .[, Min := state - Maj] %>% add_states() %>% diff --git a/data/gene_locations.rda b/data/gene_locations.rda new file mode 100644 index 0000000..e68e26c Binary files /dev/null and b/data/gene_locations.rda differ diff --git a/man/add_gene_locations_to_seurat.Rd b/man/add_gene_locations_to_seurat.Rd new file mode 100644 index 0000000..b2f10d8 --- /dev/null +++ b/man/add_gene_locations_to_seurat.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/util.R +\name{add_gene_locations_to_seurat} +\alias{add_gene_locations_to_seurat} +\title{export} +\usage{ +add_gene_locations_to_seurat(obj, ref = "hg19") +} +\description{ +export +} diff --git a/man/callAlleleSpecificCN.Rd b/man/callAlleleSpecificCN.Rd index 3689de2..2eed72f 100644 --- a/man/callAlleleSpecificCN.Rd +++ b/man/callAlleleSpecificCN.Rd @@ -7,7 +7,7 @@ callAlleleSpecificCN( CNbins, haplotypes, - eps = 1e-12, + eps = 0.000000000001, loherror = 0.02, maxCN = NULL, selftransitionprob = 0.95, diff --git a/man/callAlleleSpecificCNfromHSCN.Rd b/man/callAlleleSpecificCNfromHSCN.Rd index c8de454..5bb110d 100644 --- a/man/callAlleleSpecificCNfromHSCN.Rd +++ b/man/callAlleleSpecificCNfromHSCN.Rd @@ -6,11 +6,12 @@ \usage{ callAlleleSpecificCNfromHSCN( hscn, - eps = 1e-12, + eps = 0.000000000001, maxCN = NULL, selftransitionprob = 0.95, progressbar = TRUE, - ncores = 1 + ncores = 1, + fillmissing = TRUE ) } \arguments{ diff --git a/man/callHaplotypeSpecificCN.Rd b/man/callHaplotypeSpecificCN.Rd index 9aec8b6..ae2d7fd 100644 --- a/man/callHaplotypeSpecificCN.Rd +++ b/man/callHaplotypeSpecificCN.Rd @@ -7,7 +7,7 @@ callHaplotypeSpecificCN( CNbins, haplotypes, - eps = 1e-12, + eps = 0.000000000001, loherror = 0.02, maxCN = NULL, selftransitionprob = 0.95, diff --git a/man/getBins.Rd b/man/getBins.Rd index 6d29365..9290306 100644 --- a/man/getBins.Rd +++ b/man/getBins.Rd @@ -4,7 +4,7 @@ \alias{getBins} \title{Make fixed-width bins} \usage{ -getBins(chrom.lengths = hg19_chrlength, binsize = 1e+06, chromosomes = NULL) +getBins(chrom.lengths = hg19_chrlength, binsize = 1000000, chromosomes = NULL) } \arguments{ \item{chrom.lengths}{A named character vector with chromosome lengths. Names correspond to chromosomes.} diff --git a/man/plotCNprofile.Rd b/man/plotCNprofile.Rd index 2d9bbf5..fbd3dd4 100644 --- a/man/plotCNprofile.Rd +++ b/man/plotCNprofile.Rd @@ -24,6 +24,9 @@ plotCNprofile( svwidth = 1, adj = 0.03, genes = NULL, + tickwidth = 50, + chrstart = NULL, + chrend = NULL, ... ) } @@ -61,6 +64,8 @@ plotCNprofile( \item{svalpha}{the alpha scaling of the SV lines, default = 0.5} \item{genes}{vector of genes to annotate, will add a dashed vertical line and label} + +\item{tickwidth}{Spacing of ticks (in Mb) when only 1 chromosome is plotted} } \value{ ggplot2 plot diff --git a/man/plotCNprofileBAF.Rd b/man/plotCNprofileBAF.Rd index 5a4245a..5adbeff 100644 --- a/man/plotCNprofileBAF.Rd +++ b/man/plotCNprofileBAF.Rd @@ -26,6 +26,12 @@ plotCNprofileBAF( adj = 0.03, svalpha = 0.5, svwidth = 1, + plotdata = TRUE, + offset = NULL, + my_title = NULL, + tickwidth = 50, + chrstart = NULL, + chrend = NULL, ... ) } @@ -67,6 +73,10 @@ plotCNprofileBAF( \item{svalpha}{the alpha scaling of the SV lines, default = 0.5} \item{svwidth}{width of rearrangement connections} + +\item{plotdata}{Binary value whether to plot raw data or inferred states in homolog plot} + +\item{offest}{to use when plotting inferred states in homolog plot} } \value{ ggplot2 plot diff --git a/man/plotHeatmap.Rd b/man/plotHeatmap.Rd index 6f3eb7c..3e7d50b 100644 --- a/man/plotHeatmap.Rd +++ b/man/plotHeatmap.Rd @@ -26,6 +26,7 @@ plotHeatmap( show_legend = TRUE, show_library_label = TRUE, show_clone_label = TRUE, + show_clone_text = TRUE, widenarm = FALSE, umapmetric = "euclidean", chrlabels = TRUE, @@ -39,6 +40,8 @@ plotHeatmap( linkheight = 5, newlegendname = NULL, str_to_remove = NULL, + maxCNcol = 11, + anno_width = 0.4, ... ) } @@ -99,7 +102,11 @@ plotHeatmap( \item{newlegendname}{overwrite default legend name} -\item{str_to_remove}{string to remove from cell_id's when plotting labels +\item{str_to_remove}{string to remove from cell_id's when plotting labels} + +\item{maxCNcol}{max value for color scale when plotting raw data} + +\item{anno_width}{width of left annotations If clusters are set to NULL then the function will compute clusters using UMAP and HDBSCAN. diff --git a/tests/testthat/test-util.R b/tests/testthat/test-util.R index 50393fd..9e16c12 100644 --- a/tests/testthat/test-util.R +++ b/tests/testthat/test-util.R @@ -41,3 +41,18 @@ segs <- create_segments(sim_data_bb$CNbins) test_that("Test create segments", { expect_equal(dim(segs)[1], 24 * sum(clones_dist)) }) + +segs_cn <- create_segments(consensuscopynumber(CNbins)) +segs_filt <- filter_segments(segs_cn, binwidth = 10e6) +test_that("Test filtering segments", { + expect_lt(dim(segs_filt)[1], dim(segs_cn)[1]) + expect_true(segs_filt %>% dplyr::mutate(w = end - start) %>% dplyr::pull(w) %>% min() > 10e6) +}) + +bins <- segments_to_bins(segs_cn, binsize = 0.5e6) +bin_cell_id <- paste(bins$cell_id, bins$chr, bins$start, bins$end, sep = "_") +test_that("Test segments to bins", { + expect_lt(dim(segs_cn)[1], dim(bins)[1]) + expect_equal(median((bins$end - bins$start) + 1), 0.5e6) + expect_true(all(!duplicated(bin_cell_id))) #no duplicates +}) diff --git a/vignettes/ASCN-RNA.Rmd b/vignettes/ASCN-RNA.Rmd index da3e3c9..9fca23b 100644 --- a/vignettes/ASCN-RNA.Rmd +++ b/vignettes/ASCN-RNA.Rmd @@ -24,7 +24,7 @@ library(ggplot2) ``` ## Background -This vignette illustrates how to perform allele specific copy number inference in scRNAseq data. scRNAseq data is much more sparse than scDNAseq, so we perform inference on a chromosome arm basis by merging counts across arms. +This vignette illustrates how to perform allele specific copy number inference in scRNAseq data. scRNAseq data is much more sparse than scDNAseq, so we perform inference on a chromosome arm basis by merging counts across segments or chromosome arms. ## Data @@ -57,54 +57,65 @@ haplotypes_rna <- format_haplotypes_rna(haplotypes_rna, phased_haplotypes = hscn$haplotype_phasing) ``` -As a first look at the data we can calculate the BAF per arm in each cell and plot these distributions using `per_chr_baf`. Notice on this dataset that the BAFs in chromosomes `3p`, `2p`, and `17` in almost all cells are very left skewed towards 0 suggesting clonal LOH events in these chromosomes. We can also see other chromosome arms such as 4p and 2q that have multimodal distributions, suggesting different clusters of cells. +As a first look at the data we can calculate the BAF per arm in each cell and plot these distributions using `per_chr_baf_plot`. Notice on this dataset that the BAFs in chromosomes `3p`, `2p`, and `17` in almost all cells are very left skewed towards 0 suggesting clonal LOH events in these chromosomes. We can also see other chromosome arms such as 4p and 2q that have multimodal distributions, suggesting different clusters of cells. ```{r, fig.width = 6, fig.height=6} -per_chr_baf(haplotypes_rna, perarm = TRUE) +per_chr_baf_plot(haplotypes_rna$block_counts, perarm = TRUE) ``` For comparison we can plot a similar distribution from our scDNAseq dataset. Here we notice similar trends but note that data from the scRNAseq is much more dispersed due to the lower number of counts per cell. scRNAseq will have 1-2 orders of magnitude lower reads so our ability to identify SNPs is reduced. ```{r, fig.width = 6, fig.height=6} -per_chr_baf(hscn$data, perarm = TRUE) +per_chr_baf_plot(hscn$data, perarm = TRUE) ``` -We can also plot heatmaps of the 2 modalities as shown below. +We can also plot heatmaps of the 2 modalities as shown below, rows are clustered by hierarchical clustering. ```{r, fig.width=7} -plotHeatmapBAF(haplotypes_rna) +plotHeatmapBAF(haplotypes_rna$block_counts) ``` ```{r, fig.width=7} plotHeatmapBAF(hscn$data) ``` -We'll now move on to inferring the allele specific copy number in the scRNAseq. We use dirichilet process clustering using [Viber](https://github.com/caravagn/VIBER) to cluster cells that share similar BAF distributions across chromosomes. +We'll now move on to inferring the allele specific copy number in the scRNAseq. Rather than use chromosome arms, we first use the single cell DNA sequencing data to identify segments that have different allele specific copy number. We compute consensus copy number across all cells, identify segments where the allele specific copy number phased state is consistent and then finally merge any segments that are smaller than 10Mb. ```{r} -haplotypes_per_arm <- per_arm_baf_mat(haplotypes_rna) +consensus <- consensuscopynumber(hscn$data) +segments <- create_segments(consensus, field = "state_phase") +segments <- filter_segments(segments, binwidth = 10e6) +``` + +Note, that if you have some other ways to obtain segments such as bulk whole genome sequencing then that would also work. The `segments` data frame needs to have the columns `chr`, `start` and `end`. Alternatively, if you don't have any data that can be used to identify segments a priori you can just use chromosome arms as the segmentation. To do this just leave the segments option in any of the following function empty and the functions will revert to the default of using chromosome arms. + +Our inference scheme is based on using dirichilet process clustering using [Viber](https://github.com/caravagn/VIBER) to cluster cells that share similar BAF distributions across segments. We first compute BAF frequencies across each segment in each cell. + +```{r} +haplotypes_per_segment <- per_segment_baf_mat(haplotypes_rna$block_counts, segments = segments) -ascn_rna_dp <- assign_states_dp(haplotypes_per_arm$bafperchr) +ascn_rna_dp <- assign_states_dp(haplotypes_per_segment$bafpersegment, haplotypes_rna) ascn_rna_dp ``` -Here, we found 2 clusters. We can visualize the separation of clusters using the same plot as before. +Here, we found 2 clusters. We can visualize the separation of clusters using a similar plot to before. Setting `lableclones = TRUE` will colour each cell by their clone assignment. ```{r, fig.width = 6, fig.height=6} -per_chr_baf(ascn_rna_dp$hscn, perarm = TRUE, labelclones = TRUE) +per_segment_baf_plot(ascn_rna_dp$hscn, labelclones = TRUE) ``` ### Heatmap -Finally we can plot heatmaps of the resulting outputs. +Finally we can plot heatmaps of the resulting outputs. Firstly using the scRNA. ```{r, fig.width=8.5} -plotHeatmap(ascn_rna_dp$hscn, +bins <- segments_to_bins(ascn_rna_dp$hscn) + +plotHeatmap(bins, clusters = ascn_rna_dp$clusters, reorderclusters = TRUE, spacer_cols = 15, plotcol = "BAF", plottree = FALSE, - widenarm = TRUE, #make sure this is true show_legend = FALSE) ``` @@ -112,15 +123,17 @@ plotHeatmap(ascn_rna_dp$hscn, We can also do this for the scDNAseq data. For more straightforward comparison we'll perform the dirichilet process arm level inference as we do for the scRNAseq. We can see that the scRNAseq has much lower resolution that scDNAseq, but we are still able to recover the events shared across all cells and clones that are at a reasonably high frequency. ```{r, fig.width=8.5} -hscn_dna_arm <- per_arm_baf_mat(hscn$data) -ascn_dna_dp <- assign_states_dp(hscn_dna_arm$bafperchr) -plotHeatmap(ascn_dna_dp$hscn, +hscn_dna_segment <- per_segment_baf_mat(hscn$data, segments) +ascn_dna_dp <- assign_states_dp(hscn_dna_segment$bafpersegment) + +bins_dna <- segments_to_bins(ascn_dna_dp$hscn) + +plotHeatmap(bins_dna, clusters = ascn_dna_dp$clusters, reorderclusters = TRUE, spacer_cols = 15, plotcol = "BAF", plottree = FALSE, - widenarm = TRUE, #make sure this is true show_legend = FALSE) ``` @@ -145,7 +158,7 @@ x <- RunUMAP(object = x, dims = 1:20, reduction = "pca") Then we can use `createBAFassay` to add a `BAF` assay and add `clone_id` to the metadata. ```{r} -x <- createBAFassay(x, ascn_rna_dp) +x <- createBAFassay(x, ascn_rna_dp, "hg19") ``` This means all the Seurat functionality is available to analyse the allele specific copy number. For example we can plot low dimensional embeddings of BAF values per cell per chromosome or coloured by clone as shown below. @@ -153,15 +166,40 @@ This means all the Seurat functionality is available to analyse the allele speci ```{r} DimPlot(x, group.by = "DP_cloneid") ``` + +```{r} +rownames(x@assays$segBAF) +FeaturePlot(x, "BAF-3-75500001-192300000") + scale_color_gradientn(colors = scBAFstate_cols()) +``` + ```{r} -FeaturePlot(x, "BAF-3q") + scale_color_gradientn(colors = scBAFstate_cols()) +FeaturePlot(x, "BAF-10-1-97000000") + scale_color_gradientn(colors = scBAFstate_cols()) ``` ```{r} -FeaturePlot(x, "BAF-10p") + scale_color_gradientn(colors = scBAFstate_cols()) +FeaturePlot(x, "BAF-2-99000001-225500000") + scale_color_gradientn(colors = scBAFstate_cols()) ``` ```{r} -VlnPlot(x,"BAF-10p", group.by = "DP_cloneid") +VlnPlot(x,"BAF-2-99000001-225500000", group.by = "DP_cloneid") + ylab("BAF") ``` + +We can also pull out the allelic imbalance in individual genes. Let's look at some genes on 4p, where one group of cells is homozygous. Most genes have very few counts, so we'll pull out genes with a large number of genotyped SNPs to take a look at. + + +```{r} +# chr4pgenes <- x[["gBAF"]]@meta.features %>% filter(arm == "4p") %>% +# arrange(desc(totalsnpcounts)) +# +# mygene <- chr4pgenes$ensembl_gene_symbol[2] +# +# cowplot::plot_grid( +# FeaturePlot(x, paste0("BAF-", mygene)) + scale_color_gradientn(colors = scBAFstate_cols()), +# FeaturePlot(x, mygene), +# DimPlot(x, group.by = "DP_cloneid"), +# VlnPlot(x, paste0("BAF-", mygene), group.by = "DP_cloneid") + ylab("BAF")) + +``` + +Note that even in this example where we have been able to identify lots of SNP counts in this gene, most individual cells will still have very few counts. This explains why in cluster C2, the violin plot shows that there are lots of cells with BAF of 0.0 or 1.0, suggesting these cells are homozygous. This however is likely just because the number of counts is very small, even for a heterozygous SNP we're very likely to see cells where we have only sampled the reference or the alternate allele. The key here, is that in cluster C1 we *only* see cells with BAF = 0.0 which is unlikely to be down to sampling alone. In contrast in C2 we see cells across a range of BAF, consistent with expression of of both alleles + random sampling. \ No newline at end of file