diff --git a/DESCRIPTION b/DESCRIPTION index cdb8175..b7bee05 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -41,7 +41,7 @@ Imports: grid, ggforce, ggtree -RoxygenNote: 7.2.0 +RoxygenNote: 7.2.3 Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index d2b0a03..335f446 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -37,6 +37,7 @@ export(fixjitter) export(format_haplotypes) export(format_haplotypes_dlp) export(format_haplotypes_rna) +export(format_tree_labels) export(getBins) export(get_clone_label_pos) export(getphase) diff --git a/R/callHSCN.R b/R/callHSCN.R index f26bf0e..3151252 100644 --- a/R/callHSCN.R +++ b/R/callHSCN.R @@ -920,6 +920,10 @@ callHaplotypeSpecificCN <- function(CNbins, return(out) } +# #TODO This is horrible and needs to be re-written! +# Basic ideas is to remove singletons (single bins with copy number that is different from their neighbours) +# this is done by checking whether the log-likelihood of read counts in bin i better supports +# the copy number in bin i-1 or bin i+1 #' @export fix_assignments <- function(hscn) { if (hscn$likelihood$likelihood == "binomial") { diff --git a/R/clustering.R b/R/clustering.R index d9fb3e0..bc9b72e 100644 --- a/R/clustering.R +++ b/R/clustering.R @@ -51,17 +51,53 @@ umap_clustering <- function(CNbins, pca <- NULL fast_sgd <- FALSE } - umapresults <- uwot::umap(cnmatrix, - metric = umapmetric, - n_neighbors = n_neighbors, - n_components = 2, - min_dist = min_dist, - ret_model = TRUE, - ret_nn = TRUE, - pca = pca, - fast_sgd = fast_sgd - ) + # umapresults <- uwot::umap(cnmatrix, + # metric = umapmetric, + # n_neighbors = n_neighbors, + # n_components = 2, + # min_dist = min_dist, + # ret_model = TRUE, + # ret_nn = TRUE, + # pca = pca, + # fast_sgd = fast_sgd + # ) + #TODO find out why umap gives an error for some cases, seems to be a new bug + umapresults <- tryCatch( + { + umapresults <- uwot::umap(cnmatrix, + metric = umapmetric, + n_neighbors = n_neighbors, + n_components = 2, + min_dist = min_dist, + ret_model = TRUE, + ret_nn = TRUE, + pca = pca, + fast_sgd = fast_sgd) + }, + error = function(e) { + # Handle error by rerunning UMAP with different parameters + message("An error occurred in umap calculation: ", e$message) + message("Rerunning UMAP after adding small jitter to data points...") + + mat <- cnmatrix + matrix(runif(nrow(cnmatrix) * ncol(cnmatrix), + min=-0.005, max=0.005), + nrow=nrow(cnmatrix), ncol=ncol(cnmatrix)) + + umapresults <- uwot::umap(mat, + metric = umapmetric, + n_neighbors = n_neighbors, + n_components = 2, + min_dist = min_dist, + ret_model = TRUE, + ret_nn = TRUE, + pca = pca, + fast_sgd = fast_sgd) + } + ) + + + dfumap <- data.frame( umap1 = umapresults$embedding[, 1], umap2 = umapresults$embedding[, 2], diff --git a/R/col_palettes.R b/R/col_palettes.R index d3c94df..620cb28 100644 --- a/R/col_palettes.R +++ b/R/col_palettes.R @@ -13,6 +13,20 @@ scCN_colors <- c( `CN11` = "#D4B9DA" ) +cyto_colors = c( + 'gpos100'= rgb(0/255.0,0/255.0,0/255.0), + 'gpos' = rgb(0/255.0,0/255.0,0/255.0), + 'gpos75' = rgb(130/255.0,130/255.0,130/255.0), + 'gpos66' = rgb(160/255.0,160/255.0,160/255.0), + 'gpos50' = rgb(200/255.0,200/255.0,200/255.0), + 'gpos33' = rgb(210/255.0,210/255.0,210/255.0), + 'gpos25' = rgb(200/255.0,200/255.0,200/255.0), + 'gvar' = rgb(220/255.0,220/255.0,220/255.0), + 'gneg' = rgb(255/255.0,255/255.0,255/255.0), + 'acen' = rgb(217/255.0,47/255.0,39/255.0), + 'stalk' = rgb(100/255.0,127/255.0,164/255.0) +) + scCNstate_colors <- c( `0` = "#3182BD", `1` = "#9ECAE1", diff --git a/R/heatmap_plot.R b/R/heatmap_plot.R index bac107f..79ec8b5 100644 --- a/R/heatmap_plot.R +++ b/R/heatmap_plot.R @@ -966,7 +966,7 @@ plotHeatmap <- function(cn, annotation_height = NULL, annofontsize = 10, na_col = "white", - linkheight = 5, + linkheight = 2.5, newlegendname = NULL, str_to_remove = NULL, maxCNcol = 11, @@ -1066,12 +1066,26 @@ plotHeatmap <- function(cn, } ncells <- length(unique(CNbins$cell_id)) + + if (!is.null(clusters) & !is.null(tree)) { + cells_clusters <- unique(clusters$cell_id) + cells_data <- unique(CNbins$cell_id) + cells_tree <- unique(tree$tip.label) + check_cells <- all(c(length(cells_tree),length(cells_clusters),length(cells_data)) == length(cells_tree)) + if (check_cells == FALSE){ + warning("Trees, clusters and copy number data have different numbers of cells, removing non-overlapping cells.") + cells_to_keep <- intersect(intersect(cells_clusters, cells_data), cells_tree) + CNbins <- dplyr::filter(CNbins, cell_id %in% cells_to_keep) + clusters <- dplyr::filter(clusters, cell_id %in% cells_to_keep) + cells_to_remove <- setdiff(cells_tree, cells_to_keep) + tree <- ape::drop.tip(tree, cells_to_remove, collapse.singles = FALSE, trim.internal = FALSE) + tree <- format_tree_labels(tree) + } + } if (is.null(clusters) & !is.null(tree)) { ordered_cell_ids <- paste0(unique(CNbins$cell_id)) clusters <- data.frame(cell_id = unique(CNbins$cell_id), clone_id = "0") - } else { - ordered_cell_ids <- paste0(clusters$cell_id) } if (is.null(tree) & is.null(clusters)) { @@ -1097,7 +1111,10 @@ plotHeatmap <- function(cn, cells_clusters <- length(unique(clusters$cell_id)) cells_data <- length(unique(CNbins$cell_id)) if (cells_data != cells_clusters){ - warning("Number of cells in clusters dataframe != number of cells in the bins data!") + warning("Number of cells in clusters dataframe != number of cells in the bins data! Removing some cells") + cells_to_keep <- intersect(cells_clusters, cells_data) + CNbins <- dplyr::filter(CNbins, cell_id %in% cells_to_keep) + clusters <- dplyr::filter(clusters, cell_id %in% cells_to_keep) } if (!"clone_id" %in% names(clusters)) { stop("No clone_id columns in clusters dataframe, you might need to rename your clusters") diff --git a/R/plotting.R b/R/plotting.R index c94a9d0..96c0ade 100644 --- a/R/plotting.R +++ b/R/plotting.R @@ -1,3 +1,12 @@ +removexaxis <- ggplot2::theme(axis.line.x=ggplot2::element_blank(), + axis.title.x=ggplot2::element_blank(), + axis.text.x=ggplot2::element_blank(), + axis.ticks.x=ggplot2::element_blank()) + +removeyaxis <- ggplot2::theme(axis.line.y=ggplot2::element_blank(), + axis.title.y=ggplot2::element_blank(), + axis.text.y=ggplot2::element_blank(), + axis.ticks.y=ggplot2::element_blank()) plottinglist <- function(CNbins, xaxis_order = "genome_position", @@ -12,6 +21,10 @@ plottinglist <- function(CNbins, stop("xaxis_order must be either 'bin' or 'genome_position'") } + if (nrow(CNbins) == 0){ + stop("Data is empty!") + } + binsize <- CNbins$end[1] - CNbins$start[1] + 1 tickwidth <- (tickwidth * 1e6) / binsize @@ -64,11 +77,11 @@ plottinglist <- function(CNbins, tickwidth <- tickwidth / 2 } - CNbins <- dplyr::full_join(bins, CNbins) %>% + CNbins <- dplyr::full_join(bins, CNbins, by = c("chr", "start", "end")) %>% dplyr::filter(!is.na(copy)) %>% dplyr::filter(!is.na(state)) %>% dplyr::mutate(copy = ifelse(copy > maxCN, maxCN, copy)) %>% - dplyr::mutate(state = ifelse(state > maxCN, maxCN, state)) %>% + #dplyr::mutate(state = ifelse(state > maxCN, maxCN, state)) %>% dplyr::mutate(idxs = forcats::fct_reorder(factor(idx), idx)) %>% dplyr::mutate(CNs = forcats::fct_reorder(ifelse(is.na(state), NA, paste0("CN", state) @@ -162,9 +175,14 @@ plottinglistSV <- function(breakpoints, binsize = 0.5e6, chrfilt = NULL, chrstar breakpoints <- breakpoints %>% dplyr::mutate( - position_1 = binsize * floor(position_1 / binsize) + 1, - position_2 = binsize * floor(position_2 / binsize) + 1 + position_1 = ifelse(position_1 < position_2, + binsize * floor(position_1 / binsize) + 1, + binsize * ceiling(position_1 / binsize) + 1), + position_2 = ifelse(position_2 < position_1, + binsize * floor(position_2 / binsize) + 1, + binsize * ceiling(position_2 / binsize) + 1) ) %>% + dplyr::mutate(position_2 = ifelse(abs(position_1- position_2) <= binsize, position_1, position_2)) %>% dplyr::left_join(bins %>% dplyr::rename(chromosome_1 = chr, position_1 = start, idx_1 = idx, maxidx_1 = maxidx, minidx_1 = minidx), by = c("chromosome_1", "position_1")) %>% dplyr::left_join(bins %>% dplyr::rename(chromosome_2 = chr, position_2 = start, idx_2 = idx, maxidx_2 = maxidx, minidx_2 = minidx), by = c("chromosome_2", "position_2")) @@ -460,6 +478,9 @@ get_bezier_df <- function(sv, cn, maxCN, homolog = FALSE) { maxidx <- max(cn$CNbins$idx, na.rm = TRUE) minidx <- min(cn$CNbins$idx, na.rm = TRUE) idxrange <- 0.05 * maxidx + + cn$CNbins <- cn$CNbins %>% + tidyr::fill( c("state", "copy"), .direction = "downup") svcn1 <- dplyr::left_join(sv$breakpoints, cn$CNbins %>% dplyr::rename(chromosome_1 = chr, position_1 = start, copy_1 = copy) %>% @@ -541,6 +562,7 @@ get_bezier_df <- function(sv, cn, maxCN, homolog = FALSE) { #' @param xaxis_order Default is "genome_position" #' @param legend.position Where to place the legend, default is "bottom" #' @param annotateregions Dataframe with chr start and end positions to annotate, will draw a dashed vertical line at this position +#' @param annotateregions_linetype linetype for region annotation, default = 2 (dashed) #' @param SV Default is NULL. If a dataframe with structural variant position is passed it will add rearrangement links between bins. #' @param SVcol Default is TRUE. Colour SVs or not #' @param svalpha the alpha scaling of the SV lines, default = 0.5 @@ -552,7 +574,10 @@ get_bezier_df <- function(sv, cn, maxCN, homolog = FALSE) { #' @param chrend End of region (in Mb) when plotting a single chromosome #' @param shape shape for plotting, default = 16 #' @param positionticks set to TRUE to use position ticks rather than chromosome ticks -#' +#' @param genome genome to use, default = "hg19" (only used for ideogram) +#' @param ideogram plot ideogram at the top, default = TRUE +#' @param ideogram_height height of the ideogram +#' #' @return ggplot2 plot #' #' @examples @@ -570,10 +595,12 @@ plotCNprofile <- function(CNbins, statecol = "state", returnlist = FALSE, raster = FALSE, + genome = "hg19", y_axis_trans = "identity", xaxis_order = "genome_position", legend.position = "bottom", annotateregions = NULL, + annotateregions_linetype = 2, SV = NULL, SVcol = TRUE, svalpha = 0.5, @@ -585,11 +612,13 @@ plotCNprofile <- function(CNbins, chrend = NULL, shape = 16, positionticks = FALSE, + ideogram = FALSE, + overwrite_color = NULL, ...) { if (!xaxis_order %in% c("bin", "genome_position")) { stop("xaxis_order must be either 'bin' or 'genome_position'") } - + if (is.null(cellid)) { cellid <- unique(CNbins$cell_id)[min(cellidx, length(unique(CNbins$cell_id)))] } @@ -619,7 +648,13 @@ plotCNprofile <- function(CNbins, dplyr::filter(cell_id == cellid) %>% plottinglist(., xaxis_order = xaxis_order, maxCN = maxCN, positionticks = positionticks, tickwidth = tickwidth, chrstart = chrstart, chrend = chrend) - + + if (ideogram == TRUE){ + miny <- -0.5 + } else{ + miny <- 0 + } + if (raster == TRUE) { if (!requireNamespace("ggrastr", quietly = TRUE)) { stop("Package \"ggrastr\" needed for this function to work. Please install it.", @@ -646,7 +681,7 @@ plotCNprofile <- function(CNbins, legend.position = "none" ) + ggplot2::scale_x_continuous(breaks = pl$chrticks, labels = pl$chrlabels, expand = c(0, 0), limits = c(pl$minidx, pl$maxidx), guide = ggplot2::guide_axis(check.overlap = TRUE)) + - ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(0, maxCN), trans = y_axis_trans) + + ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(miny, maxCN), trans = y_axis_trans) + ggplot2::xlab(xlab) + ggplot2::ylab("Copy Number") + cowplot::theme_cowplot(...) + @@ -676,7 +711,7 @@ plotCNprofile <- function(CNbins, legend.position = "none" ) + ggplot2::scale_x_continuous(breaks = pl$chrticks, labels = pl$chrlabels, expand = c(0, 0), limits = c(pl$minidx, pl$maxidx), guide = ggplot2::guide_axis(check.overlap = TRUE)) + - ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(0, maxCN), trans = y_axis_trans) + + ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(miny, maxCN), trans = y_axis_trans) + ggplot2::xlab(xlab) + ggplot2::ylab("Copy Number") + cowplot::theme_cowplot(...) + @@ -688,6 +723,10 @@ plotCNprofile <- function(CNbins, } if (!is.null(genes)) { + yplace <- maxCN + if (ideogram == TRUE){ + yplace <- yplace - 2 + } binsize <- pl$CNbins$end[1] - pl$CNbins$start[1] + 1 gene_idx <- get_gene_idx(genes, chr = chrfilt, binsize = binsize) npoints <- dim(pl$CNbins)[1] @@ -697,13 +736,16 @@ plotCNprofile <- function(CNbins, } if (!is.null(annotateregions)) { + binsize <- pl$CNbins$end[1] - pl$CNbins$start[1] + 1 + annotateregions <- dplyr::mutate(annotateregions, start = round(start / binsize) * binsize + 1) datidx <- dplyr::inner_join(annotateregions, pl$bins %>% dplyr::select(chr, start, idx)) %>% dplyr::distinct(.) + datidx <- dplyr::mutate(datidx, idx = ifelse(idx > pl$maxidx, pl$maxidx, idx)) gCN <- gCN + - ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = 2, size = 0.3, alpha = 0.5) + ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = annotateregions_linetype, size = 0.3, alpha = 0.5) } - if (!is.null(SV)){ - SV <- dplyr::filter(SV, chromosome_1 %in% chrfilt | chromosome_2 %in% chrfilt) + if (!is.null(SV) & !is.null(chrfilt)){ + SV <- dplyr::filter(SV, chromosome_1 %in% chrfilt & chromosome_2 %in% chrfilt) } if (!is.null(SV) && nrow(SV) > 0) { @@ -712,7 +754,8 @@ plotCNprofile <- function(CNbins, pl$CNbins <- dplyr::left_join(getBins(binsize = binsize), pl$CNbins) bezdf <- get_bezier_df(svpl, pl, maxCN) bezdf <- bezdf %>% - dplyr::mutate(samebin = (position_1 == position_2) | rearrangement_type == "foldback") + dplyr::mutate(samebin = (position_1 == position_2) | + rearrangement_type == "foldback") bezdf$rearrangement_type = unlist(lapply(bezdf$rearrangement_type, CapStr)) rearrangement_types <- unique(bezdf %>% pull(rearrangement_type)) @@ -749,12 +792,51 @@ plotCNprofile <- function(CNbins, ) } } + + if (!is.null(overwrite_color)){ + gCN <- gCN + + ggplot2::scale_color_manual(values = overwrite_color) + + ggplot2::theme(legend.position = "none") + } + + if (ideogram == TRUE){ + binsize <- pl$CNbins$end[1] - pl$CNbins$start[1] + 1 + ideogram_dat <- cytoband_map[[genome]] + names(ideogram_dat) <- c("chr", "start", "end", "band", "colval") + ideogram_dat <- ideogram_dat %>% + dplyr::mutate(chr = stringr::str_remove(chr, "chr")) %>% + dplyr::mutate(start = round(start / binsize) * binsize + 1, + end = round(end / binsize) * binsize + 1) + + #create a dataframe that has the index of the start and end position + cnbin_idx_start <- pl$bins %>% + dplyr::select(chr, start, idx) %>% + dplyr::rename(idx_start = idx) + cnbin_idx_end <- pl$bins %>% + dplyr::select(chr, start, idx) %>% + dplyr::rename(end = start) %>% + dplyr::rename(idx_end = idx) + ideogram_dat <- dplyr::inner_join(ideogram_dat, + cnbin_idx_start, by = c("chr", "start")) %>% + dplyr::inner_join(cnbin_idx_end, by = c("chr", "end")) + + gCN <- gCN + + ggplot2::geom_rect(data = ideogram_dat, + ggplot2::aes(xmin = idx_start, + y = NULL, + x = NULL, + xmax = idx_end, + ymin = -0.5, + ymax = -0.15, fill = colval)) + + ggplot2::scale_fill_manual(values = cyto_colors) + + ggplot2::theme(legend.position = "none") + + } if (returnlist == TRUE) { gCN <- list(CN = gCN, plist = pl) } - return(gCN) } @@ -772,6 +854,7 @@ plotCNprofileBAFhomolog <- function(cn, legend.position = "bottom", genes = NULL, annotateregions = NULL, + annotateregions_linetype = 2, homolog = FALSE, SV = NULL, adj = 0.03, @@ -1032,9 +1115,11 @@ plotCNprofileBAFhomolog <- function(cn, } if (!is.null(annotateregions)) { + binsize <- pl$CNbins$end[1] - pl$CNbins$start[1] + 1 + annotateregions <- dplyr::mutate(annotateregions, start = round(start / binsize) * binsize + 1) datidx <- dplyr::inner_join(annotateregions, pl$bins %>% dplyr::select(chr, start, idx)) %>% dplyr::distinct(.) gCN <- gCN + - ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = 2, size = 0.3, alpha = 0.5) + ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = annotateregions_linetype, size = 0.3, alpha = 0.5) } return(gCN) @@ -1056,6 +1141,7 @@ plotCNprofileBAFhomolog <- function(cn, #' @param xaxis_order Default is "genome_position" #' @param legend.position Where to place the legend, default is "bottom" #' @param annotateregions Dataframe with chr start and end positions to annotate, will draw a dashed vertical line at this position +#' @param annotateregions_linetype Linetype for region annotation, default = 2 (dashed) #' @param SV Default is NULL. If a dataframe with structural variant position is passed it will add a track on the top showin rearrangement links #' @param svalpha the alpha scaling of the SV lines, default = 0.5 #' @param svwidth width of rearrangement connections @@ -1101,6 +1187,7 @@ plotCNprofileBAF <- function(cn, legend.position = "bottom", genes = NULL, annotateregions = NULL, + annotateregions_linetype = 2, homolog = FALSE, SV = NULL, adj = 0.03, @@ -1130,6 +1217,7 @@ plotCNprofileBAF <- function(cn, legend.position = legend.position, genes = genes, annotateregions = annotateregions, + annotateregions_linetype = annotateregions_linetype, SV = SV, adj = adj, svalpha = svalpha, @@ -1375,9 +1463,9 @@ plotCNprofileBAF <- function(cn, if (!is.null(annotateregions)) { datidx <- dplyr::inner_join(annotateregions, pl$bins %>% dplyr::select(chr, start, idx)) %>% dplyr::distinct(.) gBAF <- gBAF + - ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = 2, size = 0.3, alpha = 0.5) + ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = annotateregions_linetype, size = 0.3, alpha = 0.5) gCN <- gCN + - ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = 2, size = 0.3, alpha = 0.5) + ggplot2::geom_vline(data = datidx, ggplot2::aes(xintercept = idx), lty = annotateregions_linetype, size = 0.3, alpha = 0.5) } g <- cowplot::plot_grid(gBAF, gCN, align = "v", ncol = 1, rel_heights = c(1, 1.2)) diff --git a/R/util.R b/R/util.R index a225a94..1b2953c 100644 --- a/R/util.R +++ b/R/util.R @@ -453,7 +453,10 @@ create_segments <- function(CNbins, field = "state") { } #' @export -create_cntransitions <- function(CNbins, field = "state", add_orientation = TRUE) { +create_cntransitions <- function(CNbins, + field = "state", + add_orientation = TRUE, + binsize = 0.5e6) { newsegs <- CNbins %>% data.table::as.data.table() %>% @@ -461,7 +464,7 @@ create_cntransitions <- function(CNbins, field = "state", add_orientation = TRUE .[, rlid := data.table::rleid(get(field)), by = cell_id] %>% .[, list( start = min(start), - end = min(start) + 0.5e6 - 1 + end = min(start) + binsize - 1 ), by = .(cell_id, chr, get(field), rlid)] %>% .[order(cell_id, chr, start)] %>% dplyr::group_by(cell_id, chr) %>% @@ -478,7 +481,7 @@ create_cntransitions <- function(CNbins, field = "state", add_orientation = TRUE .[, rlid := data.table::rleid(get(field)), by = cell_id] %>% .[, list( start = min(start), - end = min(start) + 0.5e6 - 1 + end = min(start) + binsize - 1 ), by = .(cell_id, chr, state, rlid)] %>% .[order(cell_id, chr, start)] %>% dplyr::group_by(cell_id, chr) %>% @@ -1209,3 +1212,16 @@ filterbycells <- function(cn, cells){ return(cn) } + +#' @export +format_tree_labels <- function(tree, removeloci = TRUE, internal_node_string = 'locus|internal'){ + if (removeloci){ + tip.loci <- grep(internal_node_string, tree$tip.label, value = T) + while (length(tip.loci) > 0) { + tree <- ape::drop.tip(tree, tip.loci, trim.internal = FALSE, collapse.singles = FALSE) + tip.loci <- grep(internal_node_string, tree$tip.label, value = T) + } + } + tree$tip.label <- str_remove(tree$tip.label, "cell_") + return(tree) +} diff --git a/README.md b/README.md index bd65e97..391f860 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ A docker image is available [here](https://hub.docker.com/repository/docker/marc ## Input data -`signals` was developed to work with Direct Library Preperation + (DLP+) data. A high throughput single cell whole genome sequencing workflow, described in [Laks et al.](https://www.sciencedirect.com/science/article/pii/S0092867419311766). As such it works using the output of the the pipeline developed to process this type of data, available [here](https://github.com/shahcompbio/single_cell_pipeline). Despite being developed with this type of data and pipeline in mind, it should work well with other single cell whole genome technologies. We have run it successfully with 10X CNV data for example. The required inputs are total copy number estimates in bins across the genome and haplotype block counts per cell (SNP can also work). See the test datasets provided with the package for example inputs. If you have a different type of technology and would like some advice or help running signals please open an issue. We describe in more detail the necessary input below. +`signals` was developed to work with Direct Library Preperation + (DLP+) data. A high throughput single cell whole genome sequencing workflow, described in [Laks et al.](https://www.sciencedirect.com/science/article/pii/S0092867419311766). As such it works using the output of the the mondiran pipeline developed to process this type of data, available [here](https://github.com/mondrian-scwgs). Despite being developed with this type of data and pipeline in mind, it should work well with other single cell whole genome technologies. We have run it successfully with 10X CNV data for example. The required inputs are total copy number estimates in bins across the genome and haplotype block counts per cell (SNPs can also work). See the test datasets provided with the package for example inputs. If you have a different type of technology and would like some advice or help running signals please open an issue. We describe in more detail the necessary input below. ### DLP+ data @@ -33,7 +33,7 @@ You will need the HMM copy results table (`CNbins`) with the following columns: ### Other technologies -Other technologies and software should also be compatible with signals. For example, we have used 10X data successfully. If you have single cell bam files or fastq files see the detailed documentation for running our single cell pipeline [here](https://github.com/shahcompbio/single_cell_pipeline/blob/master/docs/source/install.md). Alternatively, we provide a lightweight snakemake pipeline with the key steps [here](https://github.com/marcjwilliams1/hscn_pipeline). Also included there are some scripts to demultiplex 10X CNV bams. +Other technologies and software should also be compatible with signals. For example, we have used 10X data successfully. If you have single cell bam files or fastq files see the detailed documentation for running our single cell pipeline [here](https://mondrian-scwgs.github.io/mondrian/#/). Alternatively, we provide a lightweight snakemake pipeline with the key steps [here](https://github.com/marcjwilliams1/hscn_pipeline). Also included there are some scripts to demultiplex 10X CNV bams. If you total copy number calls from other software such as 10X cellranger or [scope](https://github.com/rujinwang/SCOPE), these may also work but not something we have tried. Feel free to open an issue if you need some advice. @@ -45,13 +45,13 @@ Some of the plotting tools assume that the `cell_id`'s conform to the following {sample}-{library}-{R*}-{C*} ``` -Here R and C refer the rows and columns on the chip. If you're using another technology and your cells are named differently, we would reccomend renaming your cell's for easy compatability. For example, if you have 10X data where cell_id's are barcodes that look like `CCGTACTTCACGGTAT-1` something like this would work +Here R and C refer the rows and columns of the chip used in the DLP protocol. If you're using another technology and your cells are named differently, we would reccomend renaming your cell's for easy compatibility. For example, if you have 10X data where cell_id's are barcodes that look like `CCGTACTTCACGGTAT-1` something like this would work ```{r} new_cell_id <- paste("mysample", "mylibrary", "CCGTACTTCACGGTAT-1", sep = "-") ``` -It is imortant to have 4 string's seperated by "-", but the unique cell identifier should be one of the last 2 strings for the heatmap labelling to format nicely. +It is imortant to have 4 string's seperated by "-", but the unique cell identifier should be one of the last 2 strings for the heatmap labeling to format nicely. ## Example @@ -85,6 +85,8 @@ plotHeatmap(hscn, plotcol = "state_BAF") ``` This will cluster the cells using umap and hdbscan. +## Utilities + `signals` includes a number of other utilities for analysing single cell (haplotype-specific) copy number including the following: * integration with scRNAseq using [seurat](https://satijalab.org/seurat/index.html) @@ -93,7 +95,7 @@ This will cluster the cells using umap and hdbscan. * clustering * utilities such as consensus copy number in cell clusters and arm level changes etc -Please see the vignettes for more information. +Even if you haven't used `signals` for allele specific copy number calling, if you format your dataframe to match `hscn` then most functions should work. Even if you just have total copy number, many of the utilities should also work. Please see the vignettes for more information. ## Outputs diff --git a/man/callAlleleSpecificCN.Rd b/man/callAlleleSpecificCN.Rd index 143ea1c..6c0b3cf 100644 --- a/man/callAlleleSpecificCN.Rd +++ b/man/callAlleleSpecificCN.Rd @@ -7,7 +7,7 @@ callAlleleSpecificCN( CNbins, haplotypes, - eps = 1e-12, + eps = 0.000000000001, loherror = 0.02, maxCN = NULL, selftransitionprob = 0.95, diff --git a/man/callAlleleSpecificCNfromHSCN.Rd b/man/callAlleleSpecificCNfromHSCN.Rd index 71478e6..449b50b 100644 --- a/man/callAlleleSpecificCNfromHSCN.Rd +++ b/man/callAlleleSpecificCNfromHSCN.Rd @@ -6,7 +6,7 @@ \usage{ callAlleleSpecificCNfromHSCN( hscn, - eps = 1e-12, + eps = 0.000000000001, maxCN = NULL, selftransitionprob = 0.95, progressbar = TRUE, diff --git a/man/callHaplotypeSpecificCN.Rd b/man/callHaplotypeSpecificCN.Rd index 1e8ac40..d3bffc6 100644 --- a/man/callHaplotypeSpecificCN.Rd +++ b/man/callHaplotypeSpecificCN.Rd @@ -7,7 +7,7 @@ callHaplotypeSpecificCN( CNbins, haplotypes, - eps = 1e-12, + eps = 0.000000000001, maskedbins = NULL, loherror = 0.02, maxCN = NULL, diff --git a/man/getBins.Rd b/man/getBins.Rd index 6d29365..9290306 100644 --- a/man/getBins.Rd +++ b/man/getBins.Rd @@ -4,7 +4,7 @@ \alias{getBins} \title{Make fixed-width bins} \usage{ -getBins(chrom.lengths = hg19_chrlength, binsize = 1e+06, chromosomes = NULL) +getBins(chrom.lengths = hg19_chrlength, binsize = 1000000, chromosomes = NULL) } \arguments{ \item{chrom.lengths}{A named character vector with chromosome lengths. Names correspond to chromosomes.} diff --git a/man/plotCNprofile.Rd b/man/plotCNprofile.Rd index 40afcc0..a711972 100644 --- a/man/plotCNprofile.Rd +++ b/man/plotCNprofile.Rd @@ -15,11 +15,14 @@ plotCNprofile( statecol = "state", returnlist = FALSE, raster = FALSE, + genome = "hg19", y_axis_trans = "identity", xaxis_order = "genome_position", legend.position = "bottom", annotateregions = NULL, + annotateregions_linetype = 2, SV = NULL, + SVcol = TRUE, svalpha = 0.5, svwidth = 1, adj = 0.03, @@ -29,6 +32,8 @@ plotCNprofile( chrend = NULL, shape = 16, positionticks = FALSE, + ideogram = FALSE, + overwrite_color = NULL, ... ) } @@ -53,6 +58,8 @@ plotCNprofile( \item{raster}{use ggrastr or not, default = FALSE} +\item{genome}{genome to use, default = "hg19" (only used for ideogram)} + \item{y_axis_trans}{What transformation to use on the y-axis, default is identity, the other option is "squashy" which uses a tanh transformation} \item{xaxis_order}{Default is "genome_position"} @@ -61,8 +68,12 @@ plotCNprofile( \item{annotateregions}{Dataframe with chr start and end positions to annotate, will draw a dashed vertical line at this position} +\item{annotateregions_linetype}{linetype for region annotation, default = 2 (dashed)} + \item{SV}{Default is NULL. If a dataframe with structural variant position is passed it will add rearrangement links between bins.} +\item{SVcol}{Default is TRUE. Colour SVs or not} + \item{svalpha}{the alpha scaling of the SV lines, default = 0.5} \item{svwidth}{Width of SV width curves, default = 1.0} @@ -80,6 +91,10 @@ plotCNprofile( \item{shape}{shape for plotting, default = 16} \item{positionticks}{set to TRUE to use position ticks rather than chromosome ticks} + +\item{ideogram}{plot ideogram at the top, default = TRUE} + +\item{ideogram_height}{height of the ideogram} } \value{ ggplot2 plot diff --git a/man/plotCNprofileBAF.Rd b/man/plotCNprofileBAF.Rd index 86876fe..2e57f88 100644 --- a/man/plotCNprofileBAF.Rd +++ b/man/plotCNprofileBAF.Rd @@ -21,6 +21,7 @@ plotCNprofileBAF( legend.position = "bottom", genes = NULL, annotateregions = NULL, + annotateregions_linetype = 2, homolog = FALSE, SV = NULL, adj = 0.03, @@ -70,6 +71,8 @@ plotCNprofileBAF( \item{annotateregions}{Dataframe with chr start and end positions to annotate, will draw a dashed vertical line at this position} +\item{annotateregions_linetype}{Linetype for region annotation, default = 2 (dashed)} + \item{homolog}{Rather than plot the BAF and CN seperately this will plot the 2 homologs on the same track} \item{SV}{Default is NULL. If a dataframe with structural variant position is passed it will add a track on the top showin rearrangement links} diff --git a/man/plotHeatmap.Rd b/man/plotHeatmap.Rd index ee257ae..181b0a1 100644 --- a/man/plotHeatmap.Rd +++ b/man/plotHeatmap.Rd @@ -39,7 +39,7 @@ plotHeatmap( annotation_height = NULL, annofontsize = 10, na_col = "white", - linkheight = 5, + linkheight = 2.5, newlegendname = NULL, str_to_remove = NULL, maxCNcol = 11, diff --git a/tests/testthat/test-hscn.R b/tests/testthat/test-hscn.R index e7d6e3a..e4e58bb 100644 --- a/tests/testthat/test-hscn.R +++ b/tests/testthat/test-hscn.R @@ -87,3 +87,4 @@ for (i in 1:dim(mymat)[1]) { test_that("Test rephasing by minimizing number of events", { expect_true(isTRUE(all.equal(trueA$x, newA)) | isTRUE(all.equal(trueB$x, newA))) }) +