From 4a710da4e4561b83b646b42a1658b06f78a04234 Mon Sep 17 00:00:00 2001 From: Marc Williams Date: Tue, 10 Oct 2023 14:08:54 +0100 Subject: [PATCH] plotting updates (#52) * working ideogram plots * some updates to plotting * update README * update umap_clustering due to bug, could be due to updates uwot? --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/callHSCN.R | 4 + R/clustering.R | 56 ++++++++++--- R/col_palettes.R | 14 ++++ R/heatmap_plot.R | 25 +++++- R/plotting.R | 122 ++++++++++++++++++++++++---- R/util.R | 22 ++++- README.md | 12 +-- man/callAlleleSpecificCN.Rd | 2 +- man/callAlleleSpecificCNfromHSCN.Rd | 2 +- man/callHaplotypeSpecificCN.Rd | 2 +- man/getBins.Rd | 2 +- man/plotCNprofile.Rd | 15 ++++ man/plotCNprofileBAF.Rd | 3 + man/plotHeatmap.Rd | 2 +- tests/testthat/test-hscn.R | 1 + 17 files changed, 242 insertions(+), 45 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index cdb81751..b7bee05d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -41,7 +41,7 @@ Imports: grid, ggforce, ggtree -RoxygenNote: 7.2.0 +RoxygenNote: 7.2.3 Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index d2b0a037..335f446a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -37,6 +37,7 @@ export(fixjitter) export(format_haplotypes) export(format_haplotypes_dlp) export(format_haplotypes_rna) +export(format_tree_labels) export(getBins) export(get_clone_label_pos) export(getphase) diff --git a/R/callHSCN.R b/R/callHSCN.R index f26bf0eb..3151252c 100644 --- a/R/callHSCN.R +++ b/R/callHSCN.R @@ -920,6 +920,10 @@ callHaplotypeSpecificCN <- function(CNbins, return(out) } +# #TODO This is horrible and needs to be re-written! +# Basic ideas is to remove singletons (single bins with copy number that is different from their neighbours) +# this is done by checking whether the log-likelihood of read counts in bin i better supports +# the copy number in bin i-1 or bin i+1 #' @export fix_assignments <- function(hscn) { if (hscn$likelihood$likelihood == "binomial") { diff --git a/R/clustering.R b/R/clustering.R index d9fb3e08..bc9b72e7 100644 --- a/R/clustering.R +++ b/R/clustering.R @@ -51,17 +51,53 @@ umap_clustering <- function(CNbins, pca <- NULL fast_sgd <- FALSE } - umapresults <- uwot::umap(cnmatrix, - metric = umapmetric, - n_neighbors = n_neighbors, - n_components = 2, - min_dist = min_dist, - ret_model = TRUE, - ret_nn = TRUE, - pca = pca, - fast_sgd = fast_sgd - ) + # umapresults <- uwot::umap(cnmatrix, + # metric = umapmetric, + # n_neighbors = n_neighbors, + # n_components = 2, + # min_dist = min_dist, + # ret_model = TRUE, + # ret_nn = TRUE, + # pca = pca, + # fast_sgd = fast_sgd + # ) + #TODO find out why umap gives an error for some cases, seems to be a new bug + umapresults <- tryCatch( + { + umapresults <- uwot::umap(cnmatrix, + metric = umapmetric, + n_neighbors = n_neighbors, + n_components = 2, + min_dist = min_dist, + ret_model = TRUE, + ret_nn = TRUE, + pca = pca, + fast_sgd = fast_sgd) + }, + error = function(e) { + # Handle error by rerunning UMAP with different parameters + message("An error occurred in umap calculation: ", e$message) + message("Rerunning UMAP after adding small jitter to data points...") + + mat <- cnmatrix + matrix(runif(nrow(cnmatrix) * ncol(cnmatrix), + min=-0.005, max=0.005), + nrow=nrow(cnmatrix), ncol=ncol(cnmatrix)) + + umapresults <- uwot::umap(mat, + metric = umapmetric, + n_neighbors = n_neighbors, + n_components = 2, + min_dist = min_dist, + ret_model = TRUE, + ret_nn = TRUE, + pca = pca, + fast_sgd = fast_sgd) + } + ) + + + dfumap <- data.frame( umap1 = umapresults$embedding[, 1], umap2 = umapresults$embedding[, 2], diff --git a/R/col_palettes.R b/R/col_palettes.R index d3c94dfa..620cb281 100644 --- a/R/col_palettes.R +++ b/R/col_palettes.R @@ -13,6 +13,20 @@ scCN_colors <- c( `CN11` = "#D4B9DA" ) +cyto_colors = c( + 'gpos100'= rgb(0/255.0,0/255.0,0/255.0), + 'gpos' = rgb(0/255.0,0/255.0,0/255.0), + 'gpos75' = rgb(130/255.0,130/255.0,130/255.0), + 'gpos66' = rgb(160/255.0,160/255.0,160/255.0), + 'gpos50' = rgb(200/255.0,200/255.0,200/255.0), + 'gpos33' = rgb(210/255.0,210/255.0,210/255.0), + 'gpos25' = rgb(200/255.0,200/255.0,200/255.0), + 'gvar' = rgb(220/255.0,220/255.0,220/255.0), + 'gneg' = rgb(255/255.0,255/255.0,255/255.0), + 'acen' = rgb(217/255.0,47/255.0,39/255.0), + 'stalk' = rgb(100/255.0,127/255.0,164/255.0) +) + scCNstate_colors <- c( `0` = "#3182BD", `1` = "#9ECAE1", diff --git a/R/heatmap_plot.R b/R/heatmap_plot.R index bac107f7..79ec8b57 100644 --- a/R/heatmap_plot.R +++ b/R/heatmap_plot.R @@ -966,7 +966,7 @@ plotHeatmap <- function(cn, annotation_height = NULL, annofontsize = 10, na_col = "white", - linkheight = 5, + linkheight = 2.5, newlegendname = NULL, str_to_remove = NULL, maxCNcol = 11, @@ -1066,12 +1066,26 @@ plotHeatmap <- function(cn, } ncells <- length(unique(CNbins$cell_id)) + + if (!is.null(clusters) & !is.null(tree)) { + cells_clusters <- unique(clusters$cell_id) + cells_data <- unique(CNbins$cell_id) + cells_tree <- unique(tree$tip.label) + check_cells <- all(c(length(cells_tree),length(cells_clusters),length(cells_data)) == length(cells_tree)) + if (check_cells == FALSE){ + warning("Trees, clusters and copy number data have different numbers of cells, removing non-overlapping cells.") + cells_to_keep <- intersect(intersect(cells_clusters, cells_data), cells_tree) + CNbins <- dplyr::filter(CNbins, cell_id %in% cells_to_keep) + clusters <- dplyr::filter(clusters, cell_id %in% cells_to_keep) + cells_to_remove <- setdiff(cells_tree, cells_to_keep) + tree <- ape::drop.tip(tree, cells_to_remove, collapse.singles = FALSE, trim.internal = FALSE) + tree <- format_tree_labels(tree) + } + } if (is.null(clusters) & !is.null(tree)) { ordered_cell_ids <- paste0(unique(CNbins$cell_id)) clusters <- data.frame(cell_id = unique(CNbins$cell_id), clone_id = "0") - } else { - ordered_cell_ids <- paste0(clusters$cell_id) } if (is.null(tree) & is.null(clusters)) { @@ -1097,7 +1111,10 @@ plotHeatmap <- function(cn, cells_clusters <- length(unique(clusters$cell_id)) cells_data <- length(unique(CNbins$cell_id)) if (cells_data != cells_clusters){ - warning("Number of cells in clusters dataframe != number of cells in the bins data!") + warning("Number of cells in clusters dataframe != number of cells in the bins data! Removing some cells") + cells_to_keep <- intersect(cells_clusters, cells_data) + CNbins <- dplyr::filter(CNbins, cell_id %in% cells_to_keep) + clusters <- dplyr::filter(clusters, cell_id %in% cells_to_keep) } if (!"clone_id" %in% names(clusters)) { stop("No clone_id columns in clusters dataframe, you might need to rename your clusters") diff --git a/R/plotting.R b/R/plotting.R index c94a9d0b..96c0ade0 100644 --- a/R/plotting.R +++ b/R/plotting.R @@ -1,3 +1,12 @@ +removexaxis <- ggplot2::theme(axis.line.x=ggplot2::element_blank(), + axis.title.x=ggplot2::element_blank(), + axis.text.x=ggplot2::element_blank(), + axis.ticks.x=ggplot2::element_blank()) + +removeyaxis <- ggplot2::theme(axis.line.y=ggplot2::element_blank(), + axis.title.y=ggplot2::element_blank(), + axis.text.y=ggplot2::element_blank(), + axis.ticks.y=ggplot2::element_blank()) plottinglist <- function(CNbins, xaxis_order = "genome_position", @@ -12,6 +21,10 @@ plottinglist <- function(CNbins, stop("xaxis_order must be either 'bin' or 'genome_position'") } + if (nrow(CNbins) == 0){ + stop("Data is empty!") + } + binsize <- CNbins$end[1] - CNbins$start[1] + 1 tickwidth <- (tickwidth * 1e6) / binsize @@ -64,11 +77,11 @@ plottinglist <- function(CNbins, tickwidth <- tickwidth / 2 } - CNbins <- dplyr::full_join(bins, CNbins) %>% + CNbins <- dplyr::full_join(bins, CNbins, by = c("chr", "start", "end")) %>% dplyr::filter(!is.na(copy)) %>% dplyr::filter(!is.na(state)) %>% dplyr::mutate(copy = ifelse(copy > maxCN, maxCN, copy)) %>% - dplyr::mutate(state = ifelse(state > maxCN, maxCN, state)) %>% + #dplyr::mutate(state = ifelse(state > maxCN, maxCN, state)) %>% dplyr::mutate(idxs = forcats::fct_reorder(factor(idx), idx)) %>% dplyr::mutate(CNs = forcats::fct_reorder(ifelse(is.na(state), NA, paste0("CN", state) @@ -162,9 +175,14 @@ plottinglistSV <- function(breakpoints, binsize = 0.5e6, chrfilt = NULL, chrstar breakpoints <- breakpoints %>% dplyr::mutate( - position_1 = binsize * floor(position_1 / binsize) + 1, - position_2 = binsize * floor(position_2 / binsize) + 1 + position_1 = ifelse(position_1 < position_2, + binsize * floor(position_1 / binsize) + 1, + binsize * ceiling(position_1 / binsize) + 1), + position_2 = ifelse(position_2 < position_1, + binsize * floor(position_2 / binsize) + 1, + binsize * ceiling(position_2 / binsize) + 1) ) %>% + dplyr::mutate(position_2 = ifelse(abs(position_1- position_2) <= binsize, position_1, position_2)) %>% dplyr::left_join(bins %>% dplyr::rename(chromosome_1 = chr, position_1 = start, idx_1 = idx, maxidx_1 = maxidx, minidx_1 = minidx), by = c("chromosome_1", "position_1")) %>% dplyr::left_join(bins %>% dplyr::rename(chromosome_2 = chr, position_2 = start, idx_2 = idx, maxidx_2 = maxidx, minidx_2 = minidx), by = c("chromosome_2", "position_2")) @@ -460,6 +478,9 @@ get_bezier_df <- function(sv, cn, maxCN, homolog = FALSE) { maxidx <- max(cn$CNbins$idx, na.rm = TRUE) minidx <- min(cn$CNbins$idx, na.rm = TRUE) idxrange <- 0.05 * maxidx + + cn$CNbins <- cn$CNbins %>% + tidyr::fill( c("state", "copy"), .direction = "downup") svcn1 <- dplyr::left_join(sv$breakpoints, cn$CNbins %>% dplyr::rename(chromosome_1 = chr, position_1 = start, copy_1 = copy) %>% @@ -541,6 +562,7 @@ get_bezier_df <- function(sv, cn, maxCN, homolog = FALSE) { #' @param xaxis_order Default is "genome_position" #' @param legend.position Where to place the legend, default is "bottom" #' @param annotateregions Dataframe with chr start and end positions to annotate, will draw a dashed vertical line at this position +#' @param annotateregions_linetype linetype for region annotation, default = 2 (dashed) #' @param SV Default is NULL. If a dataframe with structural variant position is passed it will add rearrangement links between bins. #' @param SVcol Default is TRUE. Colour SVs or not #' @param svalpha the alpha scaling of the SV lines, default = 0.5 @@ -552,7 +574,10 @@ get_bezier_df <- function(sv, cn, maxCN, homolog = FALSE) { #' @param chrend End of region (in Mb) when plotting a single chromosome #' @param shape shape for plotting, default = 16 #' @param positionticks set to TRUE to use position ticks rather than chromosome ticks -#' +#' @param genome genome to use, default = "hg19" (only used for ideogram) +#' @param ideogram plot ideogram at the top, default = TRUE +#' @param ideogram_height height of the ideogram +#' #' @return ggplot2 plot #' #' @examples @@ -570,10 +595,12 @@ plotCNprofile <- function(CNbins, statecol = "state", returnlist = FALSE, raster = FALSE, + genome = "hg19", y_axis_trans = "identity", xaxis_order = "genome_position", legend.position = "bottom", annotateregions = NULL, + annotateregions_linetype = 2, SV = NULL, SVcol = TRUE, svalpha = 0.5, @@ -585,11 +612,13 @@ plotCNprofile <- function(CNbins, chrend = NULL, shape = 16, positionticks = FALSE, + ideogram = FALSE, + overwrite_color = NULL, ...) { if (!xaxis_order %in% c("bin", "genome_position")) { stop("xaxis_order must be either 'bin' or 'genome_position'") } - + if (is.null(cellid)) { cellid <- unique(CNbins$cell_id)[min(cellidx, length(unique(CNbins$cell_id)))] } @@ -619,7 +648,13 @@ plotCNprofile <- function(CNbins, dplyr::filter(cell_id == cellid) %>% plottinglist(., xaxis_order = xaxis_order, maxCN = maxCN, positionticks = positionticks, tickwidth = tickwidth, chrstart = chrstart, chrend = chrend) - + + if (ideogram == TRUE){ + miny <- -0.5 + } else{ + miny <- 0 + } + if (raster == TRUE) { if (!requireNamespace("ggrastr", quietly = TRUE)) { stop("Package \"ggrastr\" needed for this function to work. Please install it.", @@ -646,7 +681,7 @@ plotCNprofile <- function(CNbins, legend.position = "none" ) + ggplot2::scale_x_continuous(breaks = pl$chrticks, labels = pl$chrlabels, expand = c(0, 0), limits = c(pl$minidx, pl$maxidx), guide = ggplot2::guide_axis(check.overlap = TRUE)) + - ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(0, maxCN), trans = y_axis_trans) + + ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(miny, maxCN), trans = y_axis_trans) + ggplot2::xlab(xlab) + ggplot2::ylab("Copy Number") + cowplot::theme_cowplot(...) + @@ -676,7 +711,7 @@ plotCNprofile <- function(CNbins, legend.position = "none" ) + ggplot2::scale_x_continuous(breaks = pl$chrticks, labels = pl$chrlabels, expand = c(0, 0), limits = c(pl$minidx, pl$maxidx), guide = ggplot2::guide_axis(check.overlap = TRUE)) + - ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(0, maxCN), trans = y_axis_trans) + + ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(miny, maxCN), trans = y_axis_trans) + ggplot2::xlab(xlab) + ggplot2::ylab("Copy Number") + cowplot::theme_cowplot(...) + @@ -688,6 +723,10 @@ plotCNprofile <- function(CNbins, } if (!is.null(genes)) { + yplace <- maxCN + if (ideogram == TRUE){ + yplace <- yplace - 2 + } binsize <- pl$CNbins$end[1] - pl$CNbins$start[1] + 1 gene_idx <- get_gene_idx(genes, chr = chrfilt, binsize = binsize) npoints <- dim(pl$CNbins)[1] @@ -697,13 +736,16 @@ plotCNprofile <- function(CNbins, } if (!is.null(annotateregions)) { + binsize <- pl$CNbins$end[1] - pl$CNbins$start[1] + 1 + annotateregions <- dplyr::mutate(annotateregions, start = round(start / binsize) * binsize + 1) datidx <- dplyr::inner_join(annotateregions, pl$bins %>% dplyr::select(chr, start, idx)) %>% dplyr::distinct(.) + datidx <- dplyr::mutate(datidx, idx = ifelse(idx > pl$maxidx, pl$maxidx, idx)) gCN <- gCN + - ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = 2, size = 0.3, alpha = 0.5) + ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = annotateregions_linetype, size = 0.3, alpha = 0.5) } - if (!is.null(SV)){ - SV <- dplyr::filter(SV, chromosome_1 %in% chrfilt | chromosome_2 %in% chrfilt) + if (!is.null(SV) & !is.null(chrfilt)){ + SV <- dplyr::filter(SV, chromosome_1 %in% chrfilt & chromosome_2 %in% chrfilt) } if (!is.null(SV) && nrow(SV) > 0) { @@ -712,7 +754,8 @@ plotCNprofile <- function(CNbins, pl$CNbins <- dplyr::left_join(getBins(binsize = binsize), pl$CNbins) bezdf <- get_bezier_df(svpl, pl, maxCN) bezdf <- bezdf %>% - dplyr::mutate(samebin = (position_1 == position_2) | rearrangement_type == "foldback") + dplyr::mutate(samebin = (position_1 == position_2) | + rearrangement_type == "foldback") bezdf$rearrangement_type = unlist(lapply(bezdf$rearrangement_type, CapStr)) rearrangement_types <- unique(bezdf %>% pull(rearrangement_type)) @@ -749,12 +792,51 @@ plotCNprofile <- function(CNbins, ) } } + + if (!is.null(overwrite_color)){ + gCN <- gCN + + ggplot2::scale_color_manual(values = overwrite_color) + + ggplot2::theme(legend.position = "none") + } + + if (ideogram == TRUE){ + binsize <- pl$CNbins$end[1] - pl$CNbins$start[1] + 1 + ideogram_dat <- cytoband_map[[genome]] + names(ideogram_dat) <- c("chr", "start", "end", "band", "colval") + ideogram_dat <- ideogram_dat %>% + dplyr::mutate(chr = stringr::str_remove(chr, "chr")) %>% + dplyr::mutate(start = round(start / binsize) * binsize + 1, + end = round(end / binsize) * binsize + 1) + + #create a dataframe that has the index of the start and end position + cnbin_idx_start <- pl$bins %>% + dplyr::select(chr, start, idx) %>% + dplyr::rename(idx_start = idx) + cnbin_idx_end <- pl$bins %>% + dplyr::select(chr, start, idx) %>% + dplyr::rename(end = start) %>% + dplyr::rename(idx_end = idx) + ideogram_dat <- dplyr::inner_join(ideogram_dat, + cnbin_idx_start, by = c("chr", "start")) %>% + dplyr::inner_join(cnbin_idx_end, by = c("chr", "end")) + + gCN <- gCN + + ggplot2::geom_rect(data = ideogram_dat, + ggplot2::aes(xmin = idx_start, + y = NULL, + x = NULL, + xmax = idx_end, + ymin = -0.5, + ymax = -0.15, fill = colval)) + + ggplot2::scale_fill_manual(values = cyto_colors) + + ggplot2::theme(legend.position = "none") + + } if (returnlist == TRUE) { gCN <- list(CN = gCN, plist = pl) } - return(gCN) } @@ -772,6 +854,7 @@ plotCNprofileBAFhomolog <- function(cn, legend.position = "bottom", genes = NULL, annotateregions = NULL, + annotateregions_linetype = 2, homolog = FALSE, SV = NULL, adj = 0.03, @@ -1032,9 +1115,11 @@ plotCNprofileBAFhomolog <- function(cn, } if (!is.null(annotateregions)) { + binsize <- pl$CNbins$end[1] - pl$CNbins$start[1] + 1 + annotateregions <- dplyr::mutate(annotateregions, start = round(start / binsize) * binsize + 1) datidx <- dplyr::inner_join(annotateregions, pl$bins %>% dplyr::select(chr, start, idx)) %>% dplyr::distinct(.) gCN <- gCN + - ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = 2, size = 0.3, alpha = 0.5) + ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = annotateregions_linetype, size = 0.3, alpha = 0.5) } return(gCN) @@ -1056,6 +1141,7 @@ plotCNprofileBAFhomolog <- function(cn, #' @param xaxis_order Default is "genome_position" #' @param legend.position Where to place the legend, default is "bottom" #' @param annotateregions Dataframe with chr start and end positions to annotate, will draw a dashed vertical line at this position +#' @param annotateregions_linetype Linetype for region annotation, default = 2 (dashed) #' @param SV Default is NULL. If a dataframe with structural variant position is passed it will add a track on the top showin rearrangement links #' @param svalpha the alpha scaling of the SV lines, default = 0.5 #' @param svwidth width of rearrangement connections @@ -1101,6 +1187,7 @@ plotCNprofileBAF <- function(cn, legend.position = "bottom", genes = NULL, annotateregions = NULL, + annotateregions_linetype = 2, homolog = FALSE, SV = NULL, adj = 0.03, @@ -1130,6 +1217,7 @@ plotCNprofileBAF <- function(cn, legend.position = legend.position, genes = genes, annotateregions = annotateregions, + annotateregions_linetype = annotateregions_linetype, SV = SV, adj = adj, svalpha = svalpha, @@ -1375,9 +1463,9 @@ plotCNprofileBAF <- function(cn, if (!is.null(annotateregions)) { datidx <- dplyr::inner_join(annotateregions, pl$bins %>% dplyr::select(chr, start, idx)) %>% dplyr::distinct(.) gBAF <- gBAF + - ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = 2, size = 0.3, alpha = 0.5) + ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = annotateregions_linetype, size = 0.3, alpha = 0.5) gCN <- gCN + - ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = 2, size = 0.3, alpha = 0.5) + ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = annotateregions_linetype, size = 0.3, alpha = 0.5) } g <- cowplot::plot_grid(gBAF, gCN, align = "v", ncol = 1, rel_heights = c(1, 1.2)) diff --git a/R/util.R b/R/util.R index a225a945..1b2953c1 100644 --- a/R/util.R +++ b/R/util.R @@ -453,7 +453,10 @@ create_segments <- function(CNbins, field = "state") { } #' @export -create_cntransitions <- function(CNbins, field = "state", add_orientation = TRUE) { +create_cntransitions <- function(CNbins, + field = "state", + add_orientation = TRUE, + binsize = 0.5e6) { newsegs <- CNbins %>% data.table::as.data.table() %>% @@ -461,7 +464,7 @@ create_cntransitions <- function(CNbins, field = "state", add_orientation = TRUE .[, rlid := data.table::rleid(get(field)), by = cell_id] %>% .[, list( start = min(start), - end = min(start) + 0.5e6 - 1 + end = min(start) + binsize - 1 ), by = .(cell_id, chr, get(field), rlid)] %>% .[order(cell_id, chr, start)] %>% dplyr::group_by(cell_id, chr) %>% @@ -478,7 +481,7 @@ create_cntransitions <- function(CNbins, field = "state", add_orientation = TRUE .[, rlid := data.table::rleid(get(field)), by = cell_id] %>% .[, list( start = min(start), - end = min(start) + 0.5e6 - 1 + end = min(start) + binsize - 1 ), by = .(cell_id, chr, state, rlid)] %>% .[order(cell_id, chr, start)] %>% dplyr::group_by(cell_id, chr) %>% @@ -1209,3 +1212,16 @@ filterbycells <- function(cn, cells){ return(cn) } + +#' @export +format_tree_labels <- function(tree, removeloci = TRUE, internal_node_string = 'locus|internal'){ + if (removeloci){ + tip.loci <- grep(internal_node_string, tree$tip.label, value = T) + while (length(tip.loci) > 0) { + tree <- ape::drop.tip(tree, tip.loci, trim.internal = FALSE, collapse.singles = FALSE) + tip.loci <- grep(internal_node_string, tree$tip.label, value = T) + } + } + tree$tip.label <- str_remove(tree$tip.label, "cell_") + return(tree) +} diff --git a/README.md b/README.md index bd65e973..391f8605 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ A docker image is available [here](https://hub.docker.com/repository/docker/marc ## Input data -`signals` was developed to work with Direct Library Preperation + (DLP+) data. A high throughput single cell whole genome sequencing workflow, described in [Laks et al.](https://www.sciencedirect.com/science/article/pii/S0092867419311766). As such it works using the output of the the pipeline developed to process this type of data, available [here](https://github.com/shahcompbio/single_cell_pipeline). Despite being developed with this type of data and pipeline in mind, it should work well with other single cell whole genome technologies. We have run it successfully with 10X CNV data for example. The required inputs are total copy number estimates in bins across the genome and haplotype block counts per cell (SNP can also work). See the test datasets provided with the package for example inputs. If you have a different type of technology and would like some advice or help running signals please open an issue. We describe in more detail the necessary input below. +`signals` was developed to work with Direct Library Preperation + (DLP+) data. A high throughput single cell whole genome sequencing workflow, described in [Laks et al.](https://www.sciencedirect.com/science/article/pii/S0092867419311766). As such it works using the output of the the mondiran pipeline developed to process this type of data, available [here](https://github.com/mondrian-scwgs). Despite being developed with this type of data and pipeline in mind, it should work well with other single cell whole genome technologies. We have run it successfully with 10X CNV data for example. The required inputs are total copy number estimates in bins across the genome and haplotype block counts per cell (SNPs can also work). See the test datasets provided with the package for example inputs. If you have a different type of technology and would like some advice or help running signals please open an issue. We describe in more detail the necessary input below. ### DLP+ data @@ -33,7 +33,7 @@ You will need the HMM copy results table (`CNbins`) with the following columns: ### Other technologies -Other technologies and software should also be compatible with signals. For example, we have used 10X data successfully. If you have single cell bam files or fastq files see the detailed documentation for running our single cell pipeline [here](https://github.com/shahcompbio/single_cell_pipeline/blob/master/docs/source/install.md). Alternatively, we provide a lightweight snakemake pipeline with the key steps [here](https://github.com/marcjwilliams1/hscn_pipeline). Also included there are some scripts to demultiplex 10X CNV bams. +Other technologies and software should also be compatible with signals. For example, we have used 10X data successfully. If you have single cell bam files or fastq files see the detailed documentation for running our single cell pipeline [here](https://mondrian-scwgs.github.io/mondrian/#/). Alternatively, we provide a lightweight snakemake pipeline with the key steps [here](https://github.com/marcjwilliams1/hscn_pipeline). Also included there are some scripts to demultiplex 10X CNV bams. If you total copy number calls from other software such as 10X cellranger or [scope](https://github.com/rujinwang/SCOPE), these may also work but not something we have tried. Feel free to open an issue if you need some advice. @@ -45,13 +45,13 @@ Some of the plotting tools assume that the `cell_id`'s conform to the following {sample}-{library}-{R*}-{C*} ``` -Here R and C refer the rows and columns on the chip. If you're using another technology and your cells are named differently, we would reccomend renaming your cell's for easy compatability. For example, if you have 10X data where cell_id's are barcodes that look like `CCGTACTTCACGGTAT-1` something like this would work +Here R and C refer the rows and columns of the chip used in the DLP protocol. If you're using another technology and your cells are named differently, we would reccomend renaming your cell's for easy compatibility. For example, if you have 10X data where cell_id's are barcodes that look like `CCGTACTTCACGGTAT-1` something like this would work ```{r} new_cell_id <- paste("mysample", "mylibrary", "CCGTACTTCACGGTAT-1", sep = "-") ``` -It is imortant to have 4 string's seperated by "-", but the unique cell identifier should be one of the last 2 strings for the heatmap labelling to format nicely. +It is imortant to have 4 string's seperated by "-", but the unique cell identifier should be one of the last 2 strings for the heatmap labeling to format nicely. ## Example @@ -85,6 +85,8 @@ plotHeatmap(hscn, plotcol = "state_BAF") ``` This will cluster the cells using umap and hdbscan. +## Utilities + `signals` includes a number of other utilities for analysing single cell (haplotype-specific) copy number including the following: * integration with scRNAseq using [seurat](https://satijalab.org/seurat/index.html) @@ -93,7 +95,7 @@ This will cluster the cells using umap and hdbscan. * clustering * utilities such as consensus copy number in cell clusters and arm level changes etc -Please see the vignettes for more information. +Even if you haven't used `signals` for allele specific copy number calling, if you format your dataframe to match `hscn` then most functions should work. Even if you just have total copy number, many of the utilities should also work. Please see the vignettes for more information. ## Outputs diff --git a/man/callAlleleSpecificCN.Rd b/man/callAlleleSpecificCN.Rd index 143ea1c8..6c0b3cf3 100644 --- a/man/callAlleleSpecificCN.Rd +++ b/man/callAlleleSpecificCN.Rd @@ -7,7 +7,7 @@ callAlleleSpecificCN( CNbins, haplotypes, - eps = 1e-12, + eps = 0.000000000001, loherror = 0.02, maxCN = NULL, selftransitionprob = 0.95, diff --git a/man/callAlleleSpecificCNfromHSCN.Rd b/man/callAlleleSpecificCNfromHSCN.Rd index 71478e61..449b50bc 100644 --- a/man/callAlleleSpecificCNfromHSCN.Rd +++ b/man/callAlleleSpecificCNfromHSCN.Rd @@ -6,7 +6,7 @@ \usage{ callAlleleSpecificCNfromHSCN( hscn, - eps = 1e-12, + eps = 0.000000000001, maxCN = NULL, selftransitionprob = 0.95, progressbar = TRUE, diff --git a/man/callHaplotypeSpecificCN.Rd b/man/callHaplotypeSpecificCN.Rd index 1e8ac40c..d3bffc6f 100644 --- a/man/callHaplotypeSpecificCN.Rd +++ b/man/callHaplotypeSpecificCN.Rd @@ -7,7 +7,7 @@ callHaplotypeSpecificCN( CNbins, haplotypes, - eps = 1e-12, + eps = 0.000000000001, maskedbins = NULL, loherror = 0.02, maxCN = NULL, diff --git a/man/getBins.Rd b/man/getBins.Rd index 6d293656..92903060 100644 --- a/man/getBins.Rd +++ b/man/getBins.Rd @@ -4,7 +4,7 @@ \alias{getBins} \title{Make fixed-width bins} \usage{ -getBins(chrom.lengths = hg19_chrlength, binsize = 1e+06, chromosomes = NULL) +getBins(chrom.lengths = hg19_chrlength, binsize = 1000000, chromosomes = NULL) } \arguments{ \item{chrom.lengths}{A named character vector with chromosome lengths. Names correspond to chromosomes.} diff --git a/man/plotCNprofile.Rd b/man/plotCNprofile.Rd index 40afcc0d..a7119720 100644 --- a/man/plotCNprofile.Rd +++ b/man/plotCNprofile.Rd @@ -15,11 +15,14 @@ plotCNprofile( statecol = "state", returnlist = FALSE, raster = FALSE, + genome = "hg19", y_axis_trans = "identity", xaxis_order = "genome_position", legend.position = "bottom", annotateregions = NULL, + annotateregions_linetype = 2, SV = NULL, + SVcol = TRUE, svalpha = 0.5, svwidth = 1, adj = 0.03, @@ -29,6 +32,8 @@ plotCNprofile( chrend = NULL, shape = 16, positionticks = FALSE, + ideogram = FALSE, + overwrite_color = NULL, ... ) } @@ -53,6 +58,8 @@ plotCNprofile( \item{raster}{use ggrastr or not, default = FALSE} +\item{genome}{genome to use, default = "hg19" (only used for ideogram)} + \item{y_axis_trans}{What transformation to use on the y-axis, default is identity, the other option is "squashy" which uses a tanh transformation} \item{xaxis_order}{Default is "genome_position"} @@ -61,8 +68,12 @@ plotCNprofile( \item{annotateregions}{Dataframe with chr start and end positions to annotate, will draw a dashed vertical line at this position} +\item{annotateregions_linetype}{linetype for region annotation, default = 2 (dashed)} + \item{SV}{Default is NULL. If a dataframe with structural variant position is passed it will add rearrangement links between bins.} +\item{SVcol}{Default is TRUE. Colour SVs or not} + \item{svalpha}{the alpha scaling of the SV lines, default = 0.5} \item{svwidth}{Width of SV width curves, default = 1.0} @@ -80,6 +91,10 @@ plotCNprofile( \item{shape}{shape for plotting, default = 16} \item{positionticks}{set to TRUE to use position ticks rather than chromosome ticks} + +\item{ideogram}{plot ideogram at the top, default = TRUE} + +\item{ideogram_height}{height of the ideogram} } \value{ ggplot2 plot diff --git a/man/plotCNprofileBAF.Rd b/man/plotCNprofileBAF.Rd index 86876fe5..2e57f888 100644 --- a/man/plotCNprofileBAF.Rd +++ b/man/plotCNprofileBAF.Rd @@ -21,6 +21,7 @@ plotCNprofileBAF( legend.position = "bottom", genes = NULL, annotateregions = NULL, + annotateregions_linetype = 2, homolog = FALSE, SV = NULL, adj = 0.03, @@ -70,6 +71,8 @@ plotCNprofileBAF( \item{annotateregions}{Dataframe with chr start and end positions to annotate, will draw a dashed vertical line at this position} +\item{annotateregions_linetype}{Linetype for region annotation, default = 2 (dashed)} + \item{homolog}{Rather than plot the BAF and CN seperately this will plot the 2 homologs on the same track} \item{SV}{Default is NULL. If a dataframe with structural variant position is passed it will add a track on the top showin rearrangement links} diff --git a/man/plotHeatmap.Rd b/man/plotHeatmap.Rd index ee257ae3..181b0a1e 100644 --- a/man/plotHeatmap.Rd +++ b/man/plotHeatmap.Rd @@ -39,7 +39,7 @@ plotHeatmap( annotation_height = NULL, annofontsize = 10, na_col = "white", - linkheight = 5, + linkheight = 2.5, newlegendname = NULL, str_to_remove = NULL, maxCNcol = 11, diff --git a/tests/testthat/test-hscn.R b/tests/testthat/test-hscn.R index e7d6e3a5..e4e58bb5 100644 --- a/tests/testthat/test-hscn.R +++ b/tests/testthat/test-hscn.R @@ -87,3 +87,4 @@ for (i in 1:dim(mymat)[1]) { test_that("Test rephasing by minimizing number of events", { expect_true(isTRUE(all.equal(trueA$x, newA)) | isTRUE(all.equal(trueB$x, newA))) }) +