shahcompbio · marcjwilliams1 · Jan 20, 2022 · Nov 14, 2021 · Dec 1, 2021 · Dec 6, 2021
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -24,6 +24,8 @@ jobs:
         run: brew install --cask xquartz
       - name: Install pandoc
         run: brew install pandoc
+      - name: Install cairo
+        run: brew install cairo
       - name: Install dependencies
         run: |
           install.packages(c("remotes", "rcmdcheck"))

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -50,9 +50,9 @@ Suggests:
     covr,
     ggrastr,
     VIBER,
-    Seurat
+    Seurat,
+    plyranges
 Remotes:
-  VPetukhov/ggrastr,
   caravagn/VIBER,
   caravagn/pio,
   caravagn/easypar,

diff --git a/NAMESPACE b/NAMESPACE
@@ -9,6 +9,9 @@ export(HaplotypeHMM)
 export(add_states)
 export(alleleHMM)
 export(assignHaplotypeHMM)
+export(assign_bins_haplotypes)
+export(assign_haplotype_label)
+export(assign_label_persnp)
 export(assign_states_dp)
 export(assign_states_noprior)
 export(assignalleleHMM)
@@ -26,6 +29,7 @@ export(createSNVmatrix)
 export(create_cntransitions)
 export(create_segments)
 export(createbreakpointmatrix)
+export(filter_segments)
 export(filterbycells)
 export(filtercn)
 export(fix_assignments)
@@ -39,13 +43,17 @@ export(hc_clustering)
 export(is.ascn)
 export(is.hscn)
 export(is.hscnrna)
+export(make_copynumber_legend)
+export(map_to_segments)
 export(missegregations)
 export(missegregations_vscell)
 export(orderdf)
 export(per_arm_baf_mat)
-export(per_chr_baf)
+export(per_chr_baf_plot)
 export(per_chr_cn)
 export(per_chrarm_cn)
+export(per_segment_baf_mat)
+export(per_segment_baf_plot)
 export(phase_haplotypes)
 export(phase_haplotypes_bychr)
 export(phase_haplotypes_rna)
@@ -73,6 +81,7 @@ export(scCN_cols)
 export(scCNminorallele_cols)
 export(scCNphase_cols)
 export(scCNstate_cols)
+export(segments_to_bins)
 export(simulate_cell)
 export(simulate_cells)
 export(simulate_data_cohort)

diff --git a/R/callHSCN.R b/R/callHSCN.R
@@ -811,8 +811,11 @@ callHaplotypeSpecificCN <- function(CNbins,
 
   if (fillmissing){
     out[["data"]] <- dplyr::left_join(CNbins, out[["data"]], by = c("chr", "start", "end", "copy", "state", "cell_id"))
-    out[["data"]] <- tidyr::fill(out[["data"]], c("state_min", "Maj", "Min", "state_phase", "state_AS", 
-                                          "state_AS_phased", "LOH", "state_BAF", "phase"), .direction = "downup")
+    out[["data"]] <- out[["data"]] %>% 
+      dplyr::group_by(chr, cell_id) %>% 
+      tidyr::fill( c("state_min", "Maj", "Min", "state_phase", "state_AS", 
+                    "state_AS_phased", "LOH", "state_BAF", "phase"), .direction = "downup") %>% 
+      dplyr::ungroup()
     out[["data"]] <- as.data.frame(out[["data"]])
   }
 
@@ -1156,7 +1159,7 @@ phasing_LOH <- function(cndat, chromosomes, cutoff = 0.9, ncells = 1) {
       by = "cell_id"
       ] %>%
       .[order(LOH, decreasing = TRUE)] %>%
-      .[LOH > 0.9 & abs(mBAF) < 0.05] %>%
+      .[LOH > cutoff & abs(mBAF) < 0.05] %>%
       .$cell_id
 
     if (length(cells) >= ncells) {

diff --git a/R/callHSCNrna.R b/R/callHSCNrna.R
@@ -153,12 +153,13 @@ assign_states_noprior <- function(haps,
 }
 
 #' @export
-assign_states_dp <- function(bafperchr,
+assign_states_dp <- function(bafpersegment,
+                             haplotypes_rna = NULL,
                              samples = 10,
                              alpha_0 = 1,
                              K = 20,
-                             most_variable_chr = TRUE,
-                             top_nchr = 5,
+                             most_variable_segment = TRUE,
+                             top_nseg = 5,
                              overwrite_chr = NULL,
                              removechr = c("chrX"),
                              filtercounts = 0,
@@ -172,10 +173,10 @@ assign_states_dp <- function(bafperchr,
   }
 
   message("Generating count matrices...")
-  baf_total <- bafperchr %>%
-    dplyr::select(cell_id, chrarm, total) %>%
+  baf_total <- bafpersegment %>%
+    dplyr::select(cell_id, segid, total) %>%
     tidyr::pivot_wider(
-      names_from = "chrarm",
+      names_from = "segid",
       values_from = "total",
       values_fill = list(total = 0),
       names_prefix = "chr"
@@ -185,10 +186,10 @@ assign_states_dp <- function(bafperchr,
   baf_total <- subset(baf_total, select = -c(cell_id))
 
   # make chr ~ cell_id matrix for B allele counts
-  baf_counts <- bafperchr %>%
-    dplyr::select(cell_id, chrarm, alleleB) %>%
+  baf_counts <- bafpersegment %>%
+    dplyr::select(cell_id, segid, alleleB) %>%
     tidyr::pivot_wider(
-      names_from = "chrarm",
+      names_from = "segid",
       values_from = "alleleB",
       values_fill = list(alleleB = 0),
       names_prefix = "chr"
@@ -204,11 +205,12 @@ assign_states_dp <- function(bafperchr,
     baf_total <- baf_total[filtered_cells, ]
   }
 
-  if (most_variable_chr) {
+  if (most_variable_segment) {
     # keepchrs <- sort(sapply(baf_counts / baf_total, function(x) var(x, na.rm = TRUE)), decreasing = TRUE)[1:top_nchr]
-    chr_names <- names(baf_counts)
-    baf_counts_temp <- baf_counts[, setdiff(chr_names, removechr)]
-    keepchrs <- sort(sapply(
+    seg_names <- names(baf_counts)
+    #baf_counts_temp <- baf_counts[, setdiff(chr_names, removechr)]
+    baf_counts_temp <- baf_counts
+    keepsegs <- sort(sapply(
       names(baf_counts_temp),
       function(x) {
         matrixStats::weightedVar(baf_counts[, x] / baf_total[, x],
@@ -217,18 +219,18 @@ assign_states_dp <- function(bafperchr,
       }
     ),
     decreasing = TRUE
-    )[1:top_nchr]
-    message(paste0("Top ", top_nchr, " most variable chromosomes are: ", paste0(names(keepchrs), collapse = ", ")))
-    message("Using these chromosomes for clustering")
-    keepchrs <- names(keepchrs)
-  } else if (!is.null(overwrite_chr)) {
-    keepchrs <- overwrite_chr
+    )[1:top_nseg]
+    message(paste0("Top ", top_nseg, " most variable segments are: ", paste0(names(keepsegs), collapse = ", ")))
+    message("Using these segments for clustering")
+    keepsegs <- names(keepsegs)
+  } else if (!is.null(overwrite_segs)) {
+    keepsegs <- overwrite_segs
   } else {
-    keepchrs <- paste0("chr", unique(bafperchr$chrarm))
+    keepsegs <- paste0("chr", unique(bafpersegment$chrarm))
   }
 
-  baf_counts <- baf_counts[, keepchrs]
-  baf_total <- baf_total[, keepchrs]
+  baf_counts <- baf_counts[, keepsegs]
+  baf_total <- baf_total[, keepsegs]
   print(dim(baf_counts))
 
   message("Fitting mixture model using VIBER...")
@@ -250,11 +252,10 @@ assign_states_dp <- function(bafperchr,
 
   message("Extract cluster means from VIBER object...")
   theta <- as.data.frame(fit_filt$theta_k)
-  theta$chrarm <- row.names(theta)
+  theta$segid <- row.names(theta)
   row.names(theta) <- NULL
   theta <- theta %>%
-    tidyr::pivot_longer(-chrarm, names_to = "clone_id", values_to = "theta") %>%
-    dplyr::mutate(chrarm = gsub("chr", "", chrarm))
+    tidyr::pivot_longer(-segid, names_to = "clone_id", values_to = "theta") #%>%dplyr::mutate(chrarm = gsub("chr", "", chrarm))
 
   message("Generating dataframe mapping cell_id to clone_id...")
   x <- data.frame(
@@ -263,17 +264,17 @@ assign_states_dp <- function(bafperchr,
   )
 
   message("Assign states to clones and chromosomes...")
-  states <- bafperchr %>%
+  states <- bafpersegment %>%
     dplyr::left_join(x, by = "cell_id") %>%
-    dplyr::group_by(chrarm, clone_id) %>%
+    dplyr::group_by(segid, clone_id) %>%
     dplyr::summarise(
       BAF = median(BAF),
       alleleA = sum(alleleA),
       alleleB = sum(alleleB),
       total = sum(total)
     ) %>%
     dplyr::ungroup() %>%
-    dplyr::left_join(theta, by = c("clone_id", "chrarm")) %>%
+    dplyr::left_join(theta, by = c("clone_id", "segid")) %>%
     dplyr::mutate(rounded = round(BAF / 0.25) * 0.25) %>%
     dplyr::mutate(state_phase = dplyr::case_when(
       rounded == 0.0 ~ "A-Hom",
@@ -282,25 +283,79 @@ assign_states_dp <- function(bafperchr,
       rounded == 0.75 ~ "B-Gained",
       rounded == 1.0 ~ "B-Hom"
     )) %>%
-    dplyr::select(chrarm, clone_id, state_phase)
+    dplyr::select(segid, clone_id, state_phase)
 
   message("Format final dataframe...")
-  bafperchr_new <- bafperchr %>%
-    dplyr::select(chr, arm, chrarm, cell_id, alleleA, alleleB, total, BAF) %>%
+  bafpersegment_new <- bafpersegment %>%
+    dplyr::select(chr, segid, cell_id, alleleA, alleleB, total, BAF) %>%
     dplyr::left_join(x, by = "cell_id") %>%
-    dplyr::left_join(states, by = c("chrarm", "clone_id")) %>%
+    dplyr::left_join(states, by = c("segid", "clone_id")) %>%
     dplyr::select(-clone_id) %>%
-    dplyr::left_join(hg19chrom_coordinates) %>%
-    dplyr::left_join(x) %>%
-    dplyr::mutate(start = start + 1)
+    dplyr::left_join(x) %>% 
+    tidyr::separate(segid, c("chr2", "start", "end"), remove = FALSE, convert = TRUE) %>% 
+    dplyr::select(segid, chr, start, end, -chr2, dplyr::everything()) %>% 
+    as.data.frame()
 
   # Output
   out <- list()
   class(out) <- "hscnrna"
   out[["viber_fit"]] <- fit_filt
   out[["clusters"]] <- x
-  out[["hscn"]] <- bafperchr_new
-  out[["chromosomes_fit"]] <- keepchrs
+  out[["hscn"]] <- bafpersegment_new %>% as.data.frame()
+  out[["segments_fits"]] <- keepsegs
+  out[["haplotypecounts"]] <- haplotypes_rna$snp_counts
+  out[["segments"]] <- dplyr::distinct(bafpersegment_new, chr, start, end) %>% as.data.frame()
 
   return(out)
 }
+
+
+getCNstatefunc <- function(segments, bins, ncores = 1){
+  bins_g <- plyranges::as_granges(dlpbins %>% dplyr::rename(seqnames = chr))
+  segments_g <- plyranges::as_granges(segments %>% dplyr::rename(seqnames = chr), keep_mcols = TRUE)
+
+  bins <- plyranges::find_overlaps(bins_g, segments_g) %>%
+    as.data.frame() %>%
+    dplyr::rename(chr = seqnames) %>%
+    dplyr::select(-width, -strand) %>%
+    dplyr::select(cell_id, chr, start, end, dplyr::everything()) %>%
+    orderdf()
+
+  return(bins)
+}
+
+#' @export
+segments_to_bins <- function(segments, binsize = 0.5e6, ncores = 1){
+
+  if (!requireNamespace("plyranges", quietly = TRUE)) {
+    stop("Package \"plyranges\" needed to use this function. Please install it.",
+         call. = FALSE
+    )
+  }
+
+  bins <- schnapps::getBins(binsize = binsize)
+
+  if (ncores == 1) {
+    df <- data.table::rbindlist(lapply(unique(segments$cell_id),
+                                       function(x){
+                                         segments %>% dplyr::filter(cell_id == x) %>% 
+                                           getCNstatefunc(., bins = bins)
+                                       })) %>%
+      dplyr::group_by(chr, start, end, cell_id) %>% 
+      dplyr::filter(dplyr::row_number() == 1) %>% 
+      as.data.frame() %>%
+      dplyr::filter(!is.na(cell_id))
+  } else{
+    df <- data.table::rbindlist(parallel::mclapply(unique(segments$cell_id),
+                                                   function(x){
+                                                     segments %>% dplyr::filter(cell_id == x) %>%
+                                                       getCNstatefunc(., bins = bins)
+                                                   }, mc.cores = ncores)) %>%
+      dplyr::group_by(chr, start, end, cell_id) %>% 
+      dplyr::filter(dplyr::row_number() == 1) %>% 
+      as.data.frame() %>%
+      dplyr::filter(!is.na(cell_id))
+  }
+
+  return(df)
+}
diff --git a/R/combinedata_rna.R b/R/combinedata_rna.R
@@ -21,9 +21,6 @@ format_haplotypes_rna <- function(haplotypes,
       message("Phasing based on distribution across all cells")
       phased_haplotypes <- phase_haplotypes_rna(haplotypes)
     }
-    # } else {
-    #   phased_haplotypes <- computehaplotypecounts(haplotypes, ...)
-    # }
     message("Join phased haplotypes...")
     haplotypes <- as.data.table(haplotypes)[phased_haplotypes,
       on = .(chr, hap_label)
@@ -45,22 +42,38 @@ format_haplotypes_rna <- function(haplotypes,
     .[, totalcounts := alleleA + alleleB] %>%
     .[totalcounts > filtern, BAF := alleleB / totalcounts] %>%
     .[, c("allele1", "allele0", "phase") := NULL]
+
+  haplotype_snp_counts <-  haplotypes %>%
+    .[, list(
+      alleleA = sum(alleleA),
+      alleleB = sum(alleleB)
+    ), by = c("cell_id", "chr", "position", "hap_label")] %>%
+    .[, totalcounts := alleleA + alleleB] %>%
+    .[totalcounts > filtern, BAF := alleleB / totalcounts] %>%
+    dplyr::rename(start = position) %>% 
+    dplyr::select(cell_id, chr, start, hap_label, alleleA, alleleB, totalcounts, BAF)
 
   haplotypes <- haplotypes %>%
     .[, list(
       alleleA = sum(alleleA),
       alleleB = sum(alleleB),
       start = min(position), end = max(position)
-    ), by = c("cell_id", "chr", "sample", "patient", "hap_label")] %>%
+    ), by = c("cell_id", "chr", "hap_label")] %>%
     .[, totalcounts := alleleA + alleleB] %>%
     .[totalcounts > filtern, BAF := alleleB / totalcounts] %>%
-    dplyr::select(cell_id, chr, start, end, hap_label, alleleA, alleleB, totalcounts, BAF, sample, patient)
+    dplyr::select(cell_id, chr, start, end, hap_label, alleleA, alleleB, totalcounts, BAF)
 
   if (create_cell_id) {
-    haplotypes <- dplyr::mutate(cell_id = paste(patient, sample, cell_id, sep = "-")) %>%
+    haplotypes <- haplotypes %>% 
+      dplyr::mutate(cell_id = paste(patient, sample, cell_id, sep = "-")) %>%
+      as.data.frame() %>%
+      orderdf()
+
+    haplotype_snp_counts <- haplotype_snp_counts %>% 
+      dplyr::mutate(cell_id = paste(patient, sample, cell_id, sep = "-")) %>%
       as.data.frame() %>%
       orderdf()
   }
 
-  return(haplotypes)
+  return(list(block_counts = haplotypes, snp_counts = haplotype_snp_counts))
 }
diff --git a/R/convert.R b/R/convert.R
@@ -0,0 +1,34 @@
+#' @export
+assign_haplotype_label <- function(haplotypes, hapbinsize = 50e3){
+  colnames(haplotypes) <- c("chr", "pos", "cell_id", "allele0", "allele1")
+  haplotypes$chr <- as.character(haplotypes$chr)
+  haplotypes$pos2 <- haplotypes$pos
+
+  bins <- getBins(binsize = hapbinsize) %>%
+    dplyr::rename(start_bins = start, end_bins = end, chr_bins = chr) %>%
+    dplyr::select(-width) %>%
+    as.data.table() %>%
+    .[, hap_label := 1:.N]
+
+  haplotypes <- haplotypes[bins, on = .(chr == chr_bins, pos > start_bins, pos < end_bins)]
+  haplotypes <- na.omit(haplotypes)
+
+  hap_labels <- dplyr::distinct(haplotypes, chr, pos2, hap_label) %>% dplyr::rename(position = pos2)
+  return(hap_label)
+}
+
+#' @export
+assign_label_persnp <- function(haplotypes, hapbinsize = 50e3){
+  snpdf <- as.data.table(haplotypes)[, hap_label := .GRP, by = list(chr, position)]
+  return(snpdf)
+}
+
+#' @export
+assign_bins_haplotypes <- function(haplotypes, binsize = 0.5e6){
+  binshaps <- haplotypes %>%
+    .[, start := floor(position / binsize) * binsize + 1] %>%
+    .[, end := start + binsize - 1] %>%
+    .[, position := NULL]
+
+  return(binshaps)
+}