From f0b45380e6b76ed3dd0cd946f33cf7a88e71c9a9 Mon Sep 17 00:00:00 2001 From: Kent Riemondy Date: Mon, 16 Oct 2023 13:48:16 -0600 Subject: [PATCH] Bioc updates (#113) * address requested content changes * improve formatting issues --- DESCRIPTION | 2 +- R/annot_rse.R | 54 ++- R/calc_AEI.R | 149 ++++--- R/calc_scAEI.R | 148 ++++--- R/differential_editing.R | 263 ++++++------ R/filter_rse.R | 81 ++-- R/pileup.R | 172 ++++---- R/sc-pileup.R | 164 ++++---- R/utils.R | 99 +++-- README.Rmd | 5 +- inst/extdata/README | 3 - {man/figures => inst/misc}/raer-logo.ai | 0 inst/scripts/10x-data-processing.Rmd | 123 ++++++ inst/scripts/GSE99249-data-processing.Rmd | 230 +++++++++++ inst/scripts/README | 1 + inst/scripts/dbases/get_databases.sh | 31 ++ man/annot_from_gr.Rd | 7 +- man/calc_AEI.Rd | 16 +- man/calc_confidence.Rd | 19 +- man/calc_edit_frequency.Rd | 16 +- man/calc_scAEI.Rd | 44 +- man/correct_strand.Rd | 4 +- man/filter_clustered_variants.Rd | 12 +- man/filter_splice_variants.Rd | 1 - man/find_de_sites.Rd | 10 +- man/find_mispriming_sites.Rd | 31 +- man/find_scde_sites.Rd | 23 +- man/make_de_object.Rd | 12 +- man/mock_rse.Rd | 6 +- man/pileup_cells.Rd | 48 ++- man/pileup_sites.Rd | 62 +-- man/read_sparray.Rd | 2 +- tests/testthat/helper-astyle.R | 48 +-- tests/testthat/test_aei.R | 3 +- tests/testthat/test_de.R | 5 +- tests/testthat/test_pileup_cells.R | 128 +++--- tests/testthat/test_pileup_sites.R | 5 +- tests/testthat/test_rse.R | 27 +- tests/testthat/test_scaei.R | 15 +- tests/testthat/test_utils.R | 17 +- vignettes/raer.Rmd | 475 ++++++++++++++++------ 41 files changed, 1678 insertions(+), 883 deletions(-) delete mode 100644 inst/extdata/README rename {man/figures => inst/misc}/raer-logo.ai (100%) create mode 100644 inst/scripts/10x-data-processing.Rmd create mode 100644 inst/scripts/GSE99249-data-processing.Rmd create mode 100644 inst/scripts/README create mode 100644 inst/scripts/dbases/get_databases.sh diff --git a/DESCRIPTION b/DESCRIPTION index 3a5df88..86da057 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: raer Type: Package Title: RNA editing tools in R -Version: 0.99.14 +Version: 0.99.15 Authors@R: c( person("Kent", "Riemondy", , "kent.riemondy@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-0750-1273")), diff --git a/R/annot_rse.R b/R/annot_rse.R index 0b345b1..0151d8e 100644 --- a/R/annot_rse.R +++ b/R/annot_rse.R @@ -63,11 +63,15 @@ annot_snps.GRanges <- function(obj, RLE = TRUE, ...) 
{ if (!is(dbsnp, "ODLT_SNPlocs")) { - cli::cli_abort("supplied dbSNP not valid SNP package, please install SNP db") + cli::cli_abort( + "supplied dbSNP not valid SNP package, please install SNP db" + ) } if (any(col_to_aggr %in% colnames(mcols(obj)))) { - cli::cli_abort("supplied column name in col_to_aggr already exists in input") + cli::cli_abort( + "supplied column name in col_to_aggr already exists in input" + ) } if (!is.null(chrom)) { @@ -108,16 +112,16 @@ annot_snps.GRanges <- function(obj, drop = FALSE )$snp - if(RLE) { - mcols(sites)[col_to_aggr] <- S4Vectors::Rle(mcols(sites)[[col_to_aggr]]) + if (RLE) { + col_rle <- S4Vectors::Rle(mcols(sites)[[col_to_aggr]]) + mcols(sites)[col_to_aggr] <- col_rle } - } else { cols_exist <- any(c("snp_ref_allele", "snp_alt_alleles") %in% colnames(mcols(obj))) if (cols_exist) { cli::cli_abort( - c("snp_ref_allele or snp_alt_alleles columns already exist in input") + "snp_ref/alt_allele columns(s) already exist in input" ) } @@ -125,8 +129,9 @@ annot_snps.GRanges <- function(obj, alt_alleles <- ref_allele <- NULL snps$alt_alleles <- vapply(snps$alt_alleles, - function(x) paste0(unique(x), collapse = ","), - FUN.VALUE = character(1)) + function(x) paste0(unique(x), collapse = ","), + FUN.VALUE = character(1) + ) snp_info <- aggregate( snps, @@ -138,21 +143,25 @@ annot_snps.GRanges <- function(obj, ) snp_info$grouping <- NULL - snp_cols <- c(col_to_aggr, - "snp_ref_allele", - "snp_alt_alleles") + snp_cols <- c( + col_to_aggr, + "snp_ref_allele", + "snp_alt_alleles" + ) colnames(snp_info) <- snp_cols mcols(sites) <- cbind(mcols(sites), snp_info) - if("ALT" %in% colnames(mcols(sites))) { + if ("ALT" %in% colnames(mcols(sites))) { snp_cols <- c(snp_cols, "snp_matches_site") mcols(sites)$snp_matches_site <- check_snp_match(sites) } - if(RLE) { - mcols(sites)[snp_cols] <- lapply(mcols(sites)[snp_cols], - S4Vectors::Rle) + if (RLE) { + mcols(sites)[snp_cols] <- lapply( + mcols(sites)[snp_cols], + S4Vectors::Rle + ) } } @@ -178,16 +187,17 @@ annot_snps.SummarizedExperiment <- function(obj, ...) { #' Annotate sites using GRanges object #' #' @description Utility function to map annotations from GRanges to rowData of -#' SummarizedExperiment or to mcols of GRanges object. If multiple features overlap then -#' they will be concatenated with the specified separtor string . +#' SummarizedExperiment or to mcols of GRanges object. If multiple features +#' overlap then they will be concatenated with the specified separtor string. #' #' @param obj RangedSummarizedExperiment or GRanges object #' @param gr GRanges with annotations to map to obj -#' @param cols_to_map character vector of columns from GRanges to map to SummarizedExperiment. +#' @param cols_to_map character vector of columns from GRanges to map to +#' SummarizedExperiment. #' If the vector has names, the names will be the column names in the output. #' @param RLE If TRUE, columns added will returned as [S4Vectors::Rle()] vectors #' to reduce memory -#' @param sep separator string, defaults to comma. +#' @param sep separator string, defaults to comma. #' @param ... additional arguments to pass to [GenomicRanges::findOverlaps()] #' #' @return Either a SummarizedExperiment or GRanges object with additional @@ -250,9 +260,11 @@ annot_from_gr <- function(obj, gr, cols_to_map, RLE = TRUE, sep = ",", ...) 
{ mcols(gr_sites)[[col_id]] <- x$tmp } - if(RLE) { + if (RLE) { col_nms <- names(cols_to_map) - rls_clms <- lapply(col_nms, function(x) S4Vectors::Rle(mcols(gr_sites)[[x]])) + rls_clms <- lapply(col_nms, function(x) { + S4Vectors::Rle(mcols(gr_sites)[[x]]) + }) mcols(gr_sites)[col_nms] <- rls_clms } diff --git a/R/calc_AEI.R b/R/calc_AEI.R index 7ece5ba..39bbc0a 100644 --- a/R/calc_AEI.R +++ b/R/calc_AEI.R @@ -6,7 +6,7 @@ #' ALU elements in the human genome, and these regions have a high A-to-I #' editing signal compared to other regions such as coding exons. This #' function will perform pileup at specified repeat regions and return a -#' summary AEI metric. +#' summary AEI metric. #' #' @references Roth, S.H., Levanon, E.Y. & Eisenberg, E. Genome-wide #' quantification of ADAR adenosine-to-inosine RNA editing activity. Nat Methods @@ -17,17 +17,18 @@ #' @param fasta fasta filename #' @param alu_ranges [GRanges] with regions to query for #' calculating the AEI, typically ALU repeats. -#' @param txdb A [TxDb] object, if supplied, will be used to subset the alu_ranges -#' to those found overlapping genes. Alternatively a [GRanges] object with gene -#' coordinates. If the `library_type`, specified by +#' @param txdb A [TxDb] object, if supplied, will be used to subset the +#' alu_ranges to those found overlapping genes. Alternatively a [GRanges] +#' object with gene coordinates. If the `library_type`, specified by #' `FilterParam`, is `unstranded` then the [TxDb] will -#' be used to correct the strandness relative to the reference and is a required +#' be used to correct the strandness relative to the reference and is a +#' required #' parameter. #' @param snp_db either a [SNPlocs], [GPos], or [GRanges] object. If supplied, #' will be used to exclude polymorphic positions prior to calculating the AEI. #' If `calc_AEI()` will be used many times, one will save time by first -#' identifying SNPs that overlap the supplied `alu_ranges`, and passing these as -#' a [GRanges] to `snp_db` rather than supplying all known SNPs (see +#' identifying SNPs that overlap the supplied `alu_ranges`, and passing these +#' as a [GRanges] to `snp_db` rather than supplying all known SNPs (see #' [get_overlapping_snps()]). #' @param param object of class [FilterParam()] which specify various #' filters to apply to reads and sites during pileup. @@ -38,7 +39,8 @@ #' @returns A named list containing: #' - `AEI`: a matrix of AEI index values computed for all allelic #' combinations, one row for each supplied bam file. 
-#' - `AEI_per_chrom`: a data.frame containing values computed for each chromosome +#' - `AEI_per_chrom`: a data.frame containing values computed for each +#' chromosome #' #' @examples #' suppressPackageStartupMessages(library(Rsamtools)) @@ -71,7 +73,6 @@ calc_AEI <- function(bamfiles, param = FilterParam(), BPPARAM = SerialParam(), verbose = FALSE) { - if (!is(bamfiles, "BamFileList")) { if (is.null(names(bamfiles))) { names(bamfiles) <- bamfiles @@ -112,10 +113,17 @@ calc_AEI <- function(bamfiles, } else { genes_gr <- txdb } - alu_ranges <- subsetByOverlaps(alu_ranges, genes_gr, ignore.strand = TRUE) + alu_ranges <- subsetByOverlaps(alu_ranges, + genes_gr, + ignore.strand = TRUE + ) alu_ranges <- reduce(alu_ranges) } - chroms <- intersect(chroms, as.character(unique(seqnames(alu_ranges)))) + chroms <- intersect( + chroms, + as.character(unique(seqnames(alu_ranges))) + ) + alu_ranges <- alu_ranges[seqnames(alu_ranges) %in% chroms] } else { cli::cli_abort("unrecognized format for alu_ranges") @@ -148,7 +156,7 @@ calc_AEI <- function(bamfiles, } } else { cli::cli_abort( - "removing snps using a SNPloc package requires alu_ranges to be supplied " + "removing snps using a SNPloc package requires alu_ranges" ) } } else { @@ -156,7 +164,7 @@ calc_AEI <- function(bamfiles, } snp_lst <- split(snps, seqnames(snps)) missing_chroms <- setdiff(chroms, names(snp_lst)) - if(length(missing_chroms) > 0){ + if (length(missing_chroms) > 0) { missing_snp_lst <- lapply(missing_chroms, function(x) GRanges()) names(missing_snp_lst) <- missing_chroms snp_lst <- c(snp_lst, missing_snp_lst) @@ -165,15 +173,17 @@ calc_AEI <- function(bamfiles, snps <- snp_lst[chroms] } res <- list() - for(i in seq_along(bamfiles)) { + for (i in seq_along(bamfiles)) { bam_fn <- bamfiles[i] - aei <- .AEI_per_bam(bam_fn = bam_fn, fasta = fasta, chroms = chroms, - alu_ranges = alu_ranges, snps = snps, param = param, - genes_gr = genes_gr, verbose = verbose, BPPARAM = BPPARAM) + aei <- .AEI_per_bam( + bam_fn = bam_fn, fasta = fasta, chroms = chroms, + alu_ranges = alu_ranges, snps = snps, param = param, + genes_gr = genes_gr, verbose = verbose, BPPARAM = BPPARAM + ) res[[path(bam_fn)]] <- aei } - aei <- do.call(rbind, lapply(res, '[[', 1)) + aei <- do.call(rbind, lapply(res, "[[", 1)) rownames(aei) <- names(bamfiles) aei_per_chrom <- lapply(seq_along(res), function(i) { @@ -186,42 +196,42 @@ calc_AEI <- function(bamfiles, } -.AEI_per_bam <- function(bam_fn, fasta, chroms, alu_ranges, snps, param, - genes_gr, verbose, - BPPARAM = BiocParallel::SerialParam()) { - +.AEI_per_bam <- function( + bam_fn, fasta, chroms, alu_ranges, snps, param, + genes_gr, verbose, + BPPARAM = BiocParallel::SerialParam()) { if (is.null(snps)) { aei <- bpmapply(.calc_AEI_per_chrom, - chroms, - MoreArgs = list( - bam_fn = bam_fn, - fasta = fasta, - alu_sites = alu_ranges, - param = param, - snp_gr = NULL, - genes_gr = genes_gr, - verbose = verbose - ), - BPPARAM = BPPARAM, - SIMPLIFY = FALSE + chroms, + MoreArgs = list( + bam_fn = bam_fn, + fasta = fasta, + alu_sites = alu_ranges, + param = param, + snp_gr = NULL, + genes_gr = genes_gr, + verbose = verbose + ), + BPPARAM = BPPARAM, + SIMPLIFY = FALSE ) } else { if (length(chroms) != length(snps)) { cli::cli_abort("issue subsetting snpDb and chromosomes") } aei <- bpmapply(.calc_AEI_per_chrom, - chroms, - snps, - MoreArgs = list( - bam_fn = bam_fn, - fasta = fasta, - alu_sites = alu_ranges, - param = param, - genes_gr = genes_gr, - verbose = verbose - ), - BPPARAM = BPPARAM, - SIMPLIFY = FALSE + chroms, + snps, 
+ MoreArgs = list( + bam_fn = bam_fn, + fasta = fasta, + alu_sites = alu_ranges, + param = param, + genes_gr = genes_gr, + verbose = verbose + ), + BPPARAM = BPPARAM, + SIMPLIFY = FALSE ) } bpstop(BPPARAM) @@ -231,9 +241,11 @@ calc_AEI <- function(bamfiles, vals <- aei[[x]] id <- names(aei)[x] xx <- as.data.frame(t(do.call(data.frame, vals))) - xx <- cbind(chrom = id, - allele = rownames(xx), - xx) + xx <- cbind( + chrom = id, + allele = rownames(xx), + xx + ) rownames(xx) <- NULL xx }) @@ -241,12 +253,14 @@ calc_AEI <- function(bamfiles, aei <- do.call(rbind, aei) aei <- split(aei, aei$allele) - aei_summary <- lapply(aei, function(x) 100 * (sum(x$alt) / (sum(x$ref) + sum(x$alt)))) + aei_summary <- lapply(aei, function(x) { + 100 * (sum(x$alt) / (sum(x$ref) + sum(x$alt))) + }) aei_summary <- t(do.call(rbind, aei_summary)) aei_per_chrom <- do.call(rbind, aei) - if(is.null(names(bam_fn))){ + if (is.null(names(bam_fn))) { rownames(aei_summary) <- bam_fn aei_per_chrom$bam_file <- bam_fn } else { @@ -258,7 +272,7 @@ calc_AEI <- function(bamfiles, } .calc_AEI_per_chrom <- function(bam_fn, fasta, alu_sites, chrom, param, - snp_gr, genes_gr, verbose) { + snp_gr, genes_gr, verbose) { if (verbose) { start <- Sys.time() cli::cli_progress_step("working on: {chrom}") @@ -349,7 +363,11 @@ get_overlapping_snps <- function(gr, mcols(xx) <- NULL if (!is.null(output_file)) { - rtracklayer::export(xx, output_file, append = TRUE, ignore.strand = TRUE) + rtracklayer::export(xx, + output_file, + append = TRUE, + ignore.strand = TRUE + ) } else { alu_snps[[i]] <- xx } @@ -370,8 +388,8 @@ get_overlapping_snps <- function(gr, #' object. Sites with no-overlap, or overlapping features with conflicting #' strands (+ and -) will be removed. #' -#' @param rse RangedSummarizedExperiment object containing editing sites processed with -#' "unstranded" setting +#' @param rse RangedSummarizedExperiment object containing editing sites +#' processed with "unstranded" setting #' @param genes_gr GRanges object containing reference features to annotate the #' strand of the editing sites. #' @@ -419,7 +437,7 @@ correct_strand <- function(rse, genes_gr) { rowData(rse)$REF[flip_rows] <- comp_bases(rowData(rse)$REF[flip_rows]) - if("ALT" %in% colnames(rowData(rse))) { + if ("ALT" %in% colnames(rowData(rse))) { rowData(rse)$ALT[flip_rows] <- comp_bases(rowData(rse)$ALT[flip_rows]) } @@ -447,7 +465,8 @@ correct_strand <- function(rse, genes_gr) { } -# complement the ALT assay matrix, including multiallelics (e.g. c("A", "A,T", "C)) +# complement the ALT assay matrix, +# including multiallelics (e.g. c("A", "A,T", "C")) complement_variant_matrix <- function(alt_mat) { stopifnot(all(!is.na(alt_mat))) @@ -463,17 +482,19 @@ complement_variant_matrix <- function(alt_mat) { # split and complement multiallelics multialts <- alts[n_alleles > 0] - multialts <- vapply(strsplit(multialts, ","), function(y) { - paste0(comp_bases(y), collapse = ",") - }, - character(1) + multialts <- vapply( + strsplit(multialts, ","), function(y) { + paste0(comp_bases(y), collapse = ",") + }, + character(1) ) comp_alts[n_alleles > 0] <- multialts # get back to a matrix comp_alts <- matrix(comp_alts, - nrow = nrow(alt_mat), - ncol = ncol(alt_mat), - dimnames = dimnames(alt_mat)) + nrow = nrow(alt_mat), + ncol = ncol(alt_mat), + dimnames = dimnames(alt_mat) + ) comp_alts } diff --git a/R/calc_scAEI.R b/R/calc_scAEI.R index 9e6c0f8..c1defe7 100644 --- a/R/calc_scAEI.R +++ b/R/calc_scAEI.R @@ -5,21 +5,23 @@ #' bases) observed at A positions. 
The vast majority A-to-I editing occurs in #' ALU elements in the human genome, and these regions have a high A-to-I #' editing signal compared to other regions such as coding exons. This -#' function will examine enumerate edited and non-edited base counts at the supplied -#' sites and return summary AEI metric per cell. Potential editing sites within -#' repeat regions can be generated using `get_scAEI_sites()`. +#' function will examine enumerate edited and non-edited base counts at the +#' supplied sites and return summary AEI metric per cell. Potential editing +#' sites within repeat regions can be generated using `get_scAEI_sites()`. #' #' @references Roth, S.H., Levanon, E.Y. & Eisenberg, E. Genome-wide #' quantification of ADAR adenosine-to-inosine RNA editing activity. Nat Methods #' 16, 1131–1138 (2019). https://doi.org/10.1038/s41592-019-0610-9 #' -#' @param bamfiles a path to a BAM file (for 10x libraries), or a vector of paths -#' to BAM files (smart-seq2). Can be supplied as a character vector, `BamFile`, or -#' `BamFileList`. -#' @param sites a GRanges object produced by [get_scAEI_sites()] containing sites to process. -#' @param cell_barcodes A character vector of single cell barcodes to process. If -#' processing multiple BAM files (e.g. smart-seq-2), provide a character vector -#' of unique identifiers for each input BAM, to name each BAM file in the output files. +#' @param bamfiles a path to a BAM file (for 10x libraries), or a vector of +#' paths to BAM files (smart-seq2). Can be supplied as a character vector, +#' `BamFile`, or `BamFileList`. +#' @param sites a GRanges object produced by [get_scAEI_sites()] containing +#' sites to process. +#' @param cell_barcodes A character vector of single cell barcodes to process. +#' If processing multiple BAM files (e.g. smart-seq-2), provide a character +#' vector of unique identifiers for each input BAM, to name each BAM file in the +#' output files. #' @param param object of class [FilterParam()] which specify various filters to #' apply to reads and sites during pileup. #' @param edit_from This should correspond to the base @@ -35,9 +37,9 @@ #' @param ... additional arguments to [pileup_cells()] #' #' @returns A `DataFrame` containing computed `AEI` values, -#' count of editing events (`n_alt`), and count of reference events (`n_ref`) per cell. -#' If `return_sce` is `TRUE`, then a `SingleCellExperiment` is returned with the -#' AEI values stored in the `colData`. +#' count of editing events (`n_alt`), and count of reference events (`n_ref`) +#' per cell. If `return_sce` is `TRUE`, then a `SingleCellExperiment` is +#' returned with the AEI values stored in the `colData`. #' #' @examples #' suppressPackageStartupMessages(library(Rsamtools)) @@ -57,7 +59,7 @@ #' )) #' #' # alu intervals -#' alus_gr <- GRanges(c( +#' alus_gr <- GRanges(c( #' "2:110-380", #' "2:510-600", #' "2:610-670" @@ -69,18 +71,19 @@ #' # get positions of potential A -> G changes in alus #' sites <- get_scAEI_sites(fa_fn, genes_gr, alus_gr) #' -#' fp <- FilterParam(library_type = "fr-second-strand", -#' min_mapq = 255) +#' fp <- FilterParam( +#' library_type = "fr-second-strand", +#' min_mapq = 255 +#' ) #' calc_scAEI(bam_fn, sites, cbs, fp) #' #' @rdname calc_scAEI #' @export calc_scAEI <- function(bamfiles, sites, cell_barcodes, param = FilterParam(), - edit_from = "A", edit_to = "G", - output_dir = NULL, return_sce = FALSE, - ...) { - - if(is.null(output_dir)) { + edit_from = "A", edit_to = "G", + output_dir = NULL, return_sce = FALSE, + ...) 
{ + if (is.null(output_dir)) { output_dir <- tempdir() outfns <- c("counts.mtx.gz", "sites.txt.gz", "barcodes.txt.gz") outfns <- file.path(output_dir, outfns) @@ -90,7 +93,7 @@ calc_scAEI <- function(bamfiles, sites, cell_barcodes, param = FilterParam(), # if unstranded, only query w.r.t + strand is_unstranded <- !param@library_type %in% c(1, 2) is_minus <- strand(sites) == "-" - if(is_unstranded && sum(is_minus) > 0) { + if (is_unstranded && sum(is_minus) > 0) { sites[is_minus]$REF <- comp_bases(edit_from) sites[is_minus]$ALT <- comp_bases(edit_to) strand(sites[is_minus]) <- "+" @@ -106,24 +109,28 @@ calc_scAEI <- function(bamfiles, sites, cell_barcodes, param = FilterParam(), ... ) - if(nrow(aei_sce) == 0) { - cli::cli_abort(c("no sites returned from pileup_cells", - "check input sites and filterParams")) + if (nrow(aei_sce) == 0) { + cli::cli_abort(c( + "no sites returned from pileup_cells", + "check input sites and filterParams" + )) } - if(is_unstranded) { + if (is_unstranded) { aei_sce <- resolve_aei_regions(aei_sce) } n_alt <- Matrix::colSums(assay(aei_sce, "nAlt")) n_ref <- Matrix::colSums(assay(aei_sce, "nRef")) aei <- 100 * (n_alt / (n_alt + n_ref)) - res <- DataFrame(row.names = colnames(aei_sce), - AEI = aei, - n_alt = n_alt, - n_ref = n_ref) + res <- DataFrame( + row.names = colnames(aei_sce), + AEI = aei, + n_alt = n_alt, + n_ref = n_ref + ) - if(return_sce) { + if (return_sce) { colData(aei_sce) <- res return(aei_sce) } @@ -131,11 +138,11 @@ calc_scAEI <- function(bamfiles, sites, cell_barcodes, param = FilterParam(), } #' @param fasta Path to a genome fasta file -#' @param genes A `GRanges` object with gene coordinates.Alternatively a `TxDb` object, -#' which if supplied, will be used extract gene coordinates. +#' @param genes A `GRanges` object with gene coordinates.Alternatively a `TxDb` +#' object, which if supplied, will be used extract gene coordinates. #' @param alus `GRanges` with repeat regions to query for calculating the AEI, -#' typically ALU repeats. The strand of the supplied intervals will be ignored for -#' defining repeat regions. +#' typically ALU repeats. The strand of the supplied intervals will be ignored +#' for defining repeat regions. #' @param edit_from This should correspond to the base #' (`A`, `C`, `G`, `T`) you expect in the reference. Ex. for A to I #' editing events, this would be `A`. 
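The hunks above wire together the per-cell AEI workflow: get_scAEI_sites() enumerates candidate A positions within Alu intervals that overlap genes, and calc_scAEI() then summarizes edited (n_alt) and unedited (n_ref) counts per cell barcode. A minimal sketch of how the two calls chain together on user data follows; the fasta, GTF, Alu BED, BAM, and barcode paths are hypothetical placeholders, and the FilterParam settings simply mirror the 10x-style values used in the roxygen example above.

library(raer)

# hypothetical inputs for a droplet scRNA-seq library
fa_fn   <- "genome.fa"                                      # genome fasta used for alignment
txdb    <- GenomicFeatures::makeTxDbFromGFF("genes.gtf")    # gene models for the same build
alus_gr <- rtracklayer::import("alu_repeats.bed")           # repeat intervals (e.g. Alu)
bam_fn  <- "possorted_genome_bam.bam"                       # 10x-style BAM
cbs     <- readLines("barcodes.txt")                        # cell barcodes to report

# candidate A positions within Alu intervals overlapping genes
sites <- get_scAEI_sites(fa_fn, txdb, alus_gr)

# per-cell AEI, with edited (n_alt) and unedited (n_ref) counts
aei <- calc_scAEI(bam_fn, sites, cbs,
    param = FilterParam(library_type = "fr-second-strand", min_mapq = 255)
)
head(aei)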
@@ -145,29 +152,33 @@ calc_scAEI <- function(bamfiles, sites, cell_barcodes, param = FilterParam(), #' #' @rdname calc_scAEI #' @export -get_scAEI_sites <- function(fasta, - genes, - alus, - edit_from = "A", - edit_to = "G") { - if(is(genes, "TxDb")) { +get_scAEI_sites <- function( + fasta, + genes, + alus, + edit_from = "A", + edit_to = "G") { + if (is(genes, "TxDb")) { genes <- GenomicFeatures::genes(genes) } - if(!is(genes, "GRanges")) { + if (!is(genes, "GRanges")) { cli::cli_abort("genes must be a GRanges or TxDb object") } - if(!is(alus, "GRanges")) { + if (!is(alus, "GRanges")) { cli::cli_abort("alus must be a GRanges or TxDb object") } mcols(alus) <- NULL - # store an integer id to allow user to track sites corresponding to supplied alu + # store an integer id to allow user to track sites corresponding to + # supplied alu alus$id <- seq_along(alus) gn_alus <- prep_genic_alu_regions(genes, alus) - aei_sites <- get_aei_site_positions(gn_alus, - fasta, - edit_from) + aei_sites <- get_aei_site_positions( + gn_alus, + fasta, + edit_from + ) aei_sites$ALT <- edit_to @@ -179,7 +190,7 @@ get_scAEI_sites <- function(fasta, prep_genic_alu_regions <- function(genes_gr, alu_gr) { # annotate gene region strandedness - gns_ovl <- disjoin(genes_gr, with.revmap=TRUE, ignore.strand=TRUE) + gns_ovl <- disjoin(genes_gr, with.revmap = TRUE, ignore.strand = TRUE) gn_strands <- unique(extractList(strand(genes_gr), gns_ovl$revmap)) gn_strands[lengths(gn_strands) > 1] <- "*" strand(gns_ovl) <- unlist(gn_strands) @@ -220,7 +231,7 @@ aei_site_positions <- function(base, seqs, gr) { } get_aei_site_positions <- function(gr, fasta, base) { - if(any(strand(gr) == "*")) { + if (any(strand(gr) == "*")) { cli::cli_abort("strand must be set to + or -") } alu_seqs <- Rsamtools::scanFa(fasta, gr) @@ -234,9 +245,9 @@ get_aei_site_positions <- function(gr, fasta, base) { # figure out which strand should be used for each ALU region -# follows approach described by Roth et al @ https://doi.org/10.1038/s41592-019-0610-9 +# follows approach described by Roth et al +# https://doi.org/10.1038/s41592-019-0610-9 resolve_aei_regions <- function(sce) { - # general approach for regions with overlapping annotations # if there are mismatches, select strand with most mismatches # if not, average the counts from the two strands @@ -252,33 +263,36 @@ resolve_aei_regions <- function(sce) { dat <- data.frame(rid = rownames(ud_sce), nr, na, v, id) sums <- rowsum(data.frame(nr, na), - decode(id), - reorder = FALSE) + decode(id), + reorder = FALSE + ) sites_to_average <- rownames(sums[sums$na == 0, ]) subset_dat <- dat[!dat$id %in% sites_to_average, ] sdat <- split(subset_dat, subset_dat$id) # select strand based on most mismatches - vdat <- vapply(sdat, function(x){ + vdat <- vapply(sdat, function(x) { rs <- rowsum(x$na, x$v) rownames(rs)[which.max(rs)] }, FUN.VALUE = character(1)) vdat <- unlist(vdat) - sites_to_keep <- subset_dat[subset_dat$v == vdat[as.character(subset_dat$id)], ] + vvals <- vdat[as.character(subset_dat$id)] + sites_to_keep <- subset_dat[subset_dat$v == vvals, ] res <- d_sce - if(length(sites_to_keep > 0)) { + if (length(sites_to_keep > 0)) { res_sce <- ud_sce[sites_to_keep$rid, ] res <- rbind(res, res_sce) } - if(length(sites_to_average) > 0 ){ + if (length(sites_to_average) > 0) { sce_to_avg <- ud_sce[rowData(ud_sce)$id %in% sites_to_average, ] anr <- rowsum(assay(sce_to_avg, "nRef"), - paste0(rowData(sce_to_avg)$var, "_", rowData(sce_to_avg)$id), - reorder = FALSE) + paste0(rowData(sce_to_avg)$var, "_", 
rowData(sce_to_avg)$id), + reorder = FALSE + ) ids <- unlist(lapply(strsplit(rownames(anr), "_"), "[", 2)) ns <- lengths(split(ids, ids)) @@ -298,16 +312,20 @@ resolve_aei_regions <- function(sce) { avg_nalt <- avg_nalt[unique(as.character(rr$id)), , drop = FALSE] srr <- split(rr, rr$id) new_ranges <- GRanges(unlist(unique(seqnames(srr))), - IRanges(min(start(srr)), - max(end(srr))), - id = names(srr)) + IRanges( + min(start(srr)), + max(end(srr)) + ), + id = names(srr) + ) names(new_ranges) <- paste0("range_mean_", seq_along(new_ranges)) - avg_sce_res <- SingleCellExperiment(list(nRef = avg_nref, - nAlt = avg_nalt)) + avg_sce_res <- SingleCellExperiment(list( + nRef = avg_nref, + nAlt = avg_nalt + )) rowRanges(avg_sce_res) <- new_ranges res <- rbind(res, avg_sce_res) } res } - diff --git a/R/differential_editing.R b/R/differential_editing.R index 1a96c47..0342899 100644 --- a/R/differential_editing.R +++ b/R/differential_editing.R @@ -6,19 +6,19 @@ #' for each site (`edit_freq`), depth of coverage computed #' using the indicated edited nucleotides (`depth`) and new `colData` #' columns with the number of edited sites (`n_sites`) and the -#' fraction of edits (`edit_idx`) is returned. +#' fraction of edits (`edit_idx`) is returned. #' #' @param rse A [RangedSummarizedExperiment] object created by [pileup_sites()] #' @param edit_from This should correspond to a nucleotide or assay -#' (`A`, `C`, `G`, `T`, `Ref`, or `Alt`) you expect in the reference. Ex. for A to I -#' editing events, this would be `A`. +#' (`A`, `C`, `G`, `T`, `Ref`, or `Alt`) you expect in the reference. Ex. for +#' A to I editing events, this would be `A`. #' @param edit_to This should correspond to a nucleotide or assay -#' (`A`, `C`, `G`, `T`, `Ref`, or `Alt`) you expect in the editing site. Ex. for A -#' to I editing events, this would be `G`. -#' @param drop If `TRUE`, the [RangedSummarizedExperiment] returned will only retain sites -#' matching the specified `edit_from` and `edit_to` bases. -#' @param replace_na If `TRUE`, `NA` and `NaN` editing frequencies will be coerced to -#' `0`. +#' (`A`, `C`, `G`, `T`, `Ref`, or `Alt`) you expect in the editing site. Ex. +#' for A to I editing events, this would be `G`. +#' @param drop If `TRUE`, the [RangedSummarizedExperiment] returned will only +#' retain sites matching the specified `edit_from` and `edit_to` bases. +#' @param replace_na If `TRUE`, `NA` and `NaN` editing frequencies will be +#' coerced to `0`. #' @param edit_frequency The edit frequency cutoff used when calculating the #' number of sites. Set to `0` to require any non-zero editing frequency. The #' number of sites is stored as `n_sites` in the `colData`. 
@@ -44,12 +44,13 @@ calc_edit_frequency <- function(rse, replace_na = TRUE, edit_frequency = 0, min_count = 1) { - valid_assays <- c("A", "T", "C", "G", "Ref", "Alt") if (!(edit_from %in% valid_assays) | !(edit_to %in% valid_assays)) { - cli::cli_abort("`edit_to` and `edit_from` must be one of: ", - "'A', 'T', 'C', 'G', 'Ref', or 'Alt'") + cli::cli_abort( + "`edit_to` and `edit_from` must be one of: ", + "'A', 'T', 'C', 'G', 'Ref', or 'Alt'" + ) } from_col <- paste0("n", edit_from) @@ -61,8 +62,8 @@ calc_edit_frequency <- function(rse, if ("depth" %in% names(assays(rse))) { cli::cli_alert_info( - "depth has been overwritten with sum of {to_col} and {from_col} assays" - ) + "depth overwritten with sum of {to_col} and {from_col} assays" + ) } assay(rse, "depth") <- assay(rse, to_col) + assay(rse, from_col) @@ -130,8 +131,9 @@ calc_edit_frequency <- function(rse, #' @importFrom Matrix colSums #' @noRd #' @keywords internal -count_edits <- function(se, edit_frequency = 0.01, min_count = 10, - edit_from = NULL, edit_to = NULL) { +count_edits <- function( + se, edit_frequency = 0.01, min_count = 10, + edit_from = NULL, edit_to = NULL) { n_pass_filter <- Matrix::colSums((assay(se, "edit_freq") > edit_frequency) & ((assay(se, paste0("n", edit_from)) + assay(se, paste0("n", edit_to))) >= @@ -157,15 +159,15 @@ count_edits <- function(se, edit_frequency = 0.01, min_count = 10, #' #' @param rse A [RangedSummarizedExperiment] object #' @param edit_from This should correspond to a nucleotide or assay -#' (`A`, `C`, `G`, `T`, `Ref`, or `Alt`) you expect in the reference. Ex. for A to I -#' editing events, this would be `A`. +#' (`A`, `C`, `G`, `T`, `Ref`, or `Alt`) you expect in the reference. +#' Ex. for A to I editing events, this would be `A`. #' @param edit_to This should correspond to a nucleotide or assay -#' (`A`, `C`, `G`, `T`, `Ref`, or `Alt`) you expect in the editing site. Ex. for A -#' to I editing events, this would be `G`. +#' (`A`, `C`, `G`, `T`, `Ref`, or `Alt`) you expect in the editing site. +#' Ex. for A to I editing events, this would be `G`. #' @param min_prop The minimum required proportion of reads edited at a site. At #' least `min_samples` need to pass this to keep the site. -#' @param max_prop The maximum allowable proportion of reads edited at a site. At -#' least `min_samples` need to pass this to keep the site. +#' @param max_prop The maximum allowable proportion of reads edited at a site. +#' At least `min_samples` need to pass this to keep the site. #' @param min_samples The minimum number of samples passing the `min_prop` and #' `max_prop` cutoffs to keep a site. 
#' @@ -189,17 +191,21 @@ make_de_object <- function(rse, min_prop = 0.0, max_prop = 1.0, min_samples = 1) { - valid_assays <- c("A", "T", "C", "G", "Ref", "Alt") if (!(edit_from %in% valid_assays) | !(edit_to %in% valid_assays)) { - cli::cli_abort("`edit_to` and `edit_from` must be one of: ", - "'A', 'T', 'C', 'G', 'Ref', or 'Alt'") + cli::cli_abort( + "`edit_to` and `edit_from` must be one of: ", + "'A', 'T', 'C', 'G', 'Ref', or 'Alt'" + ) } # Only keep locations that pass cutoffs in a certain number of samples - if(!"edit_freq" %in% assayNames(rse)) { - cli::cli_abort("`edit_freq` not present in assay, please run calc_edit_frequency()") + if (!"edit_freq" %in% assayNames(rse)) { + cli::cli_abort(c("`edit_freq` not present in assay", + "please run calc_edit_frequency()", + collapse = "," + )) } pass_cutoff <- (assay(rse, "edit_freq") >= min_prop) & (assay(rse, "edit_freq") <= max_prop) @@ -235,7 +241,8 @@ make_de_object <- function(rse, #' #' @description Use `edgeR` or `DESeq2` to perform differential editing #' analysis. This will work for designs that have 1 treatment and 1 -#' control group. For more complex designs, we suggest you perform your own modeling. +#' control group. For more complex designs, we suggest you perform your own +#' modeling. #' #' @param deobj A [RangedSummarizedExperiment] object prepared for differential #' editing analysis by [make_de_object()] @@ -268,23 +275,28 @@ make_de_object <- function(rse, #' #' rse <- calc_edit_frequency(rse) #' dse <- make_de_object(rse) -#' res <- find_de_sites(dse, condition_control = "WT", condition_treatment = "KO") +#' res <- find_de_sites(dse, +#' condition_control = "WT", +#' condition_treatment = "KO" +#' ) #' res$sig_results[1:3, ] #' #' @returns A named list: -#' - `de_obj`: The `edgeR` or `deseq` object used for differential editing analysis +#' - `de_obj`: The `edgeR` or `deseq` object used for differential editing +#' analysis #' - `results_full`: Unfiltered differential editing results #' - `sig_results`: Filtered differential editing (FDR < 0.05) #' - `model_matrix`: The model matrix used for generating DE results #' #' @importFrom stats model.matrix #' @export -find_de_sites <- function(deobj, - test = c("edgeR", "DESeq2"), - sample_col = "sample", - condition_col = "condition", - condition_control = NULL, - condition_treatment = NULL) { +find_de_sites <- function( + deobj, + test = c("edgeR", "DESeq2"), + sample_col = "sample", + condition_col = "condition", + condition_control = NULL, + condition_treatment = NULL) { # Make sure all variables are present if (!sample_col %in% colnames(colData(deobj))) { cli::cli_abort( @@ -350,7 +362,7 @@ find_de_sites <- function(deobj, cli::cli_abort( c( "condition_control must be a column in your deobj colData.", - "'{condition_control}' not found in the levels of the condition", + "'{condition_control}' not found in levels of the condition", " column of colData(deobj). Possible", " options from your experiment are: {cond_options}" ) @@ -362,7 +374,7 @@ find_de_sites <- function(deobj, cli::cli_abort( c( "condition_treatment must be a column in your deobj colData ", - "'{condition_treatment}' not found in the levels of the condition", + "'{condition_treatment}' not found in levels of the condition", " column of colData(deobj)! Possible", " options from your experiment are: {cond_options}" ) @@ -390,7 +402,8 @@ find_de_sites <- function(deobj, # complex designs, we suggest you perform your own. It will test if your # sample column makes the model matrix not full rank. 
If that happens, the # model matrix will be modified to be full rank. This is not intended to be -# called directly by the user, instead, this should be called by `find_de_sites` +# called directly by the user, instead, this should be called by +# `find_de_sites` # # At the moment, this function will only find editing events specific to the # treatment, but it will be pretty straight forward to add other possible @@ -403,8 +416,9 @@ find_de_sites <- function(deobj, # a variable in your condition_col of colData(deobj). # # -run_deseq2 <- function(deobj, condition_control = NULL, - condition_treatment = NULL) { +run_deseq2 <- function( + deobj, condition_control = NULL, + condition_treatment = NULL) { if (!requireNamespace("DESeq2", quietly = TRUE)) { cli::cli_abort("Package \"DESeq2\" needed for differential editing.") } @@ -496,8 +510,9 @@ run_deseq2 <- function(deobj, condition_control = NULL, # variable in your condition_col of colData(deobj). No default provided. # @param condition_treatment The name of the treatment condition. This must be # a variable in your condition_col of colData(deobj). -run_edger <- function(deobj, condition_control = NULL, - condition_treatment = NULL) { +run_edger <- function( + deobj, condition_control = NULL, + condition_treatment = NULL) { if (!requireNamespace("edgeR", quietly = TRUE)) { cli::cli_abort("Package \"edgeR\" needed to run differential analysis.") } @@ -515,7 +530,8 @@ run_edger <- function(deobj, condition_control = NULL, } dge <- edgeR::DGEList(assay(deobj, "counts"), - lib.size = rep(1, ncol(deobj))) + lib.size = rep(1, ncol(deobj)) + ) dge <- edgeR::estimateDisp(dge) fit <- edgeR::glmFit(dge, design) @@ -531,10 +547,10 @@ run_edger <- function(deobj, condition_control = NULL, deobj$count == "ref", ]) treatment_vs_control <- edgeR::glmLRT(fit, - contrast = ( - alt_treatment - ref_treatment) - - (alt_control - ref_control) - ) + contrast = ( + alt_treatment - ref_treatment) - + (alt_control - ref_control) + ) treatment_vs_control <- edgeR::topTags(treatment_vs_control, n = nrow(treatment_vs_control) @@ -552,26 +568,29 @@ run_edger <- function(deobj, condition_control = NULL, )) } -#' Identify sites with differential editing between cells in single cell datasets +#' Identify sites with differential editing between cells in single cell +#' datasets #' #' @description #' Compare editing frequencies between clusters or celltypes. REF and ALT counts -#' from each cluster are pooled to create pseudobulk estimates. Each pair of clusters -#' are compared using fisher exact tests. Statistics are aggregated across each pairwise +#' from each cluster are pooled to create pseudobulk estimates. Each pair of +#' clusters are compared using fisher exact tests. Statistics are aggregated +#' across each pairwise #' comparison using [scran::combineMarkers]. #' #' @param sce [SingleCellExperiment] object with `nRef` and `nAlt` assays. #' @param group column name from colData used to define groups to compare. -#' @param rowData if TRUE, [rowData] from the input [SingleCellExperiment] will be -#' included in the output DataFrames -#' @param BPPARAM BiocParallel backend for control how paralllel computations are -#' performed. +#' @param rowData if TRUE, [rowData] from the input [SingleCellExperiment] will +#' be included in the output DataFrames +#' @param BPPARAM BiocParallel backend for control how parallel computations +#' are performed. #' @param ... 
Additional arguments passed to [scran::combineMarkers] #' #' @returns -#' A named list of [DataFrame]s containing results for each cluster specified by `group`. -#' The difference in editing frequencies between cluster pairs are denoted as `dEF`. -#' See [scran::combineMarkers] for a description of additional output fields. +#' A named list of [DataFrame]s containing results for each cluster specified by +#' `group`. The difference in editing frequencies between cluster pairs are +#' denoted as `dEF`. See [scran::combineMarkers] for a description of additional +#' output fields. #' #' #' @@ -602,13 +621,11 @@ run_edger <- function(deobj, condition_control = NULL, #' res <- find_scde_sites(sce, "clusters") #' res[[1]] #' @export -find_scde_sites <- function( - sce, - group, - rowData = FALSE, - BPPARAM = SerialParam(), - ...) { - +find_scde_sites <- function(sce, + group, + rowData = FALSE, + BPPARAM = SerialParam(), + ...) { if (!requireNamespace("scran", quietly = TRUE)) { cli::cli_abort("Package \"scran\" needed for differential editing.") } @@ -617,87 +634,98 @@ find_scde_sites <- function( cli::cli_abort("Package \"scran\" needed for differential editing.") } - if(!is(sce, "SingleCellExperiment")) { + if (!is(sce, "SingleCellExperiment")) { cli::cli_abort("sce must be a SingleCellExperiment object") } - if(!all(c("nRef", "nAlt") %in% assayNames(sce))) { + if (!all(c("nRef", "nAlt") %in% assayNames(sce))) { cli::cli_abort("sce must contain nRef and nAlt assays") } - if(length(group) != 1) { + if (length(group) != 1) { cli::cli_abort("group must be a single value") } - if(!(group %in% colnames(colData(sce)))){ + if (!(group %in% colnames(colData(sce)))) { cli::cli_abort("{group} not found in colData") } - if(any(is.na(sce[[group]]))) { + if (any(is.na(sce[[group]]))) { cli::cli_abort("NA values not allowed in group column") } - if(!"depth" %in% names(assays(sce))) { + if (!"depth" %in% names(assays(sce))) { assay(sce, "depth") <- assay(sce, "nRef") + assay(sce, "nAlt") } - no_depth_cells <- colSums(assay(sce, "depth")) == 0 - if(sum(no_depth_cells) > 0) { - cli::cli_alert(c("{sum(no_depth_cells)} cells had no REF or ALT counts ", - "and were excluded from the analysis")) + no_depth_cells <- colSums(assay(sce, "depth")) == 0 + if (sum(no_depth_cells) > 0) { + cli::cli_alert(c( + "{sum(no_depth_cells)} cells had no REF or ALT counts ", + "and were excluded from the analysis" + )) sce <- sce[, !no_depth_cells] } assay(sce, "depth") <- scuttle::normalizeCounts(sce, - assay.type = "depth", - BPPARAM = BPPARAM) + assay.type = "depth", + BPPARAM = BPPARAM + ) nref <- scuttle::summarizeAssayByGroup(sce, - sce[[group]], - statistics = c("sum", "prop.detected"), - assay.type = "nRef", - BPPARAM = BPPARAM) + sce[[group]], + statistics = c("sum", "prop.detected"), + assay.type = "nRef", + BPPARAM = BPPARAM + ) nalt <- scuttle::summarizeAssayByGroup(sce, - sce[[group]], - statistics = c("sum", "prop.detected"), - assay.type = "nAlt", - BPPARAM = BPPARAM) - + sce[[group]], + statistics = c("sum", "prop.detected"), + assay.type = "nAlt", + BPPARAM = BPPARAM + ) + grp_pairs <- group_combinations(sce[[group]]) - + stats <- BiocParallel::bplapply(seq_len(nrow(grp_pairs)), - function(i) { - gp <- grp_pairs[i, , drop = TRUE] - ref <- assay(nref, "sum")[, c(gp$first, gp$second)] - alt <- assay(nalt, "sum")[, c(gp$first, gp$second)] - pvals <- calc_fisher_exact(ref, alt) - ef <- alt / (ref + alt) - d_editing_frequency <- ef[, 1] - ef[, 2] - - res <- data.frame(p.value = pvals, - dEF = d_editing_frequency, - 
row.names = rownames(ref)) - res - }, BPPARAM = BPPARAM) + function(i) { + gp <- grp_pairs[i, , drop = TRUE] + ref <- assay(nref, "sum")[, c(gp$first, gp$second)] + alt <- assay(nalt, "sum")[, c(gp$first, gp$second)] + pvals <- calc_fisher_exact(ref, alt) + ef <- alt / (ref + alt) + d_editing_frequency <- ef[, 1] - ef[, 2] + + res <- data.frame( + p.value = pvals, + dEF = d_editing_frequency, + row.names = rownames(ref) + ) + res + }, + BPPARAM = BPPARAM + ) res <- scran::combineMarkers(stats, - grp_pairs, - effect.field = "dEF", - BPPARAM = BPPARAM, - ...) + grp_pairs, + effect.field = "dEF", + BPPARAM = BPPARAM, + ... + ) depth_summary <- scran::summaryMarkerStats(sce, - sce[[group]], - assay.type = "depth", - BPPARAM = BPPARAM) + sce[[group]], + assay.type = "depth", + BPPARAM = BPPARAM + ) - for(nm in names(res)) { + for (nm in names(res)) { de_stats <- res[[nm]] out_rows <- rownames(de_stats) depths <- depth_summary[[nm]][out_rows, ] de_stats <- cbind(depths, de_stats) - - if(rowData) { + + if (rowData) { rd <- rowData(sce)[out_rows, ] de_stats <- cbind(de_stats, rd) } @@ -709,13 +737,13 @@ find_scde_sites <- function( calc_fisher_exact <- function(ref, alt) { stopifnot(ncol(ref) == 2L) stopifnot(ncol(alt) == 2L) - + vals <- cbind(ref, alt)[, c(1, 3, 2, 4), drop = FALSE] vals <- t(vals) - + mode(vals) <- "integer" - if(any(is.na(vals))) { + if (any(is.na(vals))) { cli::cli_abort("NA values not supported in fisher test") } @@ -724,16 +752,16 @@ calc_fisher_exact <- function(ref, alt) { } group_combinations <- function(x) { - if(length(x) < 2) { + if (length(x) < 2) { cli::cli_abort("At least 2 groups must be present") } - - if(is.factor(x)) { + + if (is.factor(x)) { grps <- levels(droplevels(x)) } else { grps <- levels(as.factor(x)) } - + grp_pairs <- expand.grid(grps, grps, stringsAsFactors = FALSE) colnames(grp_pairs) <- c("first", "second") grp_pairs <- grp_pairs[grp_pairs[, 1] != grp_pairs[, 2], ] @@ -741,4 +769,3 @@ group_combinations <- function(x) { rownames(grp_pairs) <- NULL grp_pairs } - diff --git a/R/filter_rse.R b/R/filter_rse.R index 9d851d4..877b5c8 100644 --- a/R/filter_rse.R +++ b/R/filter_rse.R @@ -27,9 +27,15 @@ filter_multiallelic <- function(se) { !grepl(",", x) }) se <- se[which(is_not_multiallelic), ] - rowData(se)$ALT <- apply(assay(se, "ALT"), 1, function(x) unique(x[x != "-"])) + rowData(se)$ALT <- apply( + assay(se, "ALT"), + 1, + function(x) unique(x[x != "-"]) + ) - n_filt <- sum(c(is.na(is_not_multiallelic), !is_not_multiallelic), na.rm = TRUE) + n_filt <- sum(c(is.na(is_not_multiallelic), !is_not_multiallelic), + na.rm = TRUE + ) cli::cli_alert_info( c( "{.fun filter_multiallelic}: removed {.val {n_filt}} sites", @@ -102,7 +108,7 @@ get_splice_sites <- function(txdb, slop = 4) { #' @examples #' library(GenomicFeatures) #' rse_adar_ifn <- mock_rse() -#' +#' #' # mock up a txdb with genes #' gr <- GRanges(c( #' "DHFR:310-330:-", @@ -117,9 +123,8 @@ get_splice_sites <- function(txdb, slop = 4) { #' gr$gene_id <- c(1, 1, 2, 2) #' gr$transcript_id <- rep(c("1.1", "2.1"), each = 2) #' txdb <- makeTxDbFromGRanges(gr) -#' +#' #' filter_splice_variants(rse_adar_ifn, txdb) -#' #' #' @returns `SummarizedExperiment::SummarizedExperiment` with sites #' adjacent to splice sites removed. 
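The filtering helpers documented in this file are typically chained on a pileup_sites() result before downstream analysis. A short sketch of that chain is shown here, assuming "aligned.bam", "genome.fa", and "genes.gtf" are placeholder paths to a BAM file and a matching genome and transcript annotation.

library(raer)
suppressPackageStartupMessages(library(GenomicFeatures))

rse  <- pileup_sites("aligned.bam", "genome.fa")   # hypothetical BAM + matching fasta
txdb <- makeTxDbFromGFF("genes.gtf")               # hypothetical annotation, same build

# drop multiallelic sites, then sites adjacent to splice junctions, then
# clusters of mixed variant types suggestive of alignment artifacts
rse <- filter_multiallelic(rse)
rse <- filter_splice_variants(rse, txdb)
rse <- filter_clustered_variants(rse, txdb, variant_dist = 100)   # window is illustrative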
@@ -152,7 +157,8 @@ filter_splice_variants <- function(rse, txdb, n_filt <- length(to_keep) cli::cli_alert_info( c( - "{.fun filter_splice_variants}: removed {.val {n_in - n_filt}} sites", + "{.fun filter_splice_variants}: ", + " removed {.val {n_in - n_filt}} sites", " from {.val {n_in}} ({.val {n_filt}} remain)" ) ) @@ -162,12 +168,14 @@ filter_splice_variants <- function(rse, txdb, #' Filter out clustered sequence variants #' -#' @description Sequence variants of multiple allele types (e.g., `A -> G`, `A -> C`) -#' proximal to a putative editing site can be indicative of a region prone to mis-alignment -#' artifacts. Sites will be removed if variants of multiple allele types are present -#' within a given distance in genomic or transcriptome coordinate space. +#' @description Sequence variants of multiple allele types (e.g., `A -> G`, +#' `A -> C`) proximal to a putative editing site can be indicative of a region +#' prone to mis-alignment artifacts. Sites will be removed if variants of +#' multiple allele types are present within a given distance in genomic or +#' transcriptome coordinate space. #' -#' @param rse `SummarizedExperiment::SummarizedExperiment` containing editing sites +#' @param rse `SummarizedExperiment::SummarizedExperiment` containing editing +#' sites #' @param txdb `GenomicFeatures::TxDb` #' @param regions One of `transcript` or `genome`, specifying the coordinate #' system for calculating distances between variants. @@ -176,10 +184,10 @@ filter_splice_variants <- function(rse, txdb, #' #' @examples #' library(GenomicFeatures) -#' +#' #' rse_adar_ifn <- mock_rse() #' rse <- rse_adar_ifn[seqnames(rse_adar_ifn) == "SPCS3"] -#' +#' #' # mock up a txdb with genes #' gr <- GRanges(c( #' "SPCS3:100-120:-", @@ -192,7 +200,7 @@ filter_splice_variants <- function(rse, txdb, #' gr$gene_id <- c(1, 2) #' gr$transcript_id <- c("1.1", "2.1") #' txdb <- makeTxDbFromGRanges(gr) -#' +#' #' rse <- filter_multiallelic(rse) #' filter_clustered_variants(rse, txdb, variant_dist = 10) #' @@ -211,7 +219,9 @@ filter_clustered_variants <- function(rse, txdb, } if (length(setdiff(regions, c("transcript", "genome"))) > 0) { - cli::cli_abort("only transcript and/or genome are valid arguments for region") + cli::cli_abort( + "only transcript and/or genome are valid arguments for region" + ) } n_in <- nrow(rse) @@ -278,7 +288,8 @@ filter_clustered_variants <- function(rse, txdb, cli::cli_alert_info( c( - "{.fun filter_clustered_variants}: removed {.val {n_in - n_out}} sites", + "{.fun filter_clustered_variants}: ", + " removed {.val {n_in - n_out}} sites", " from {.val {n_in}} ({.val {n_out}} remain)" ) ) @@ -289,17 +300,18 @@ filter_clustered_variants <- function(rse, txdb, #' Calculate confidence score for observing editing #' -#' @description Calculate a confidence score based on a Bayesian inverse probability -#' model as described by Washburn et al. Cell Reports. 2015, and implemented -#' in the SAILOR pipeline. +#' @description Calculate a confidence score based on a Bayesian inverse +#' probability model as described by Washburn et al. Cell Reports. 2015, and +#' implemented in the SAILOR pipeline. #' -#' @param se `SummarizedExperiment::SummarizedExperiment` containing editing sites +#' @param se `SummarizedExperiment::SummarizedExperiment` containing editing +#' sites #' @param edit_to edited base #' @param edit_from non-edited base #' @param per_sample if TRUE, calculate confidence per sample, otherwise edited #' and non-edited counts will be summed across all samples. 
-#' @param exp_fraction Numeric value between 0 and 1, specifying the expected error -#' rate +#' @param exp_fraction Numeric value between 0 and 1, specifying the expected +#' error rate #' @param alpha Pseudo-count to add to non-edited base counts #' @param beta Pseudo-count to add to edited base counts #' @@ -313,26 +325,31 @@ filter_clustered_variants <- function(rse, txdb, #' calculated `per_sample`. #' #' @references -#' Washburn MC, Kakaradov B, Sundararaman B, Wheeler E, Hoon S, Yeo GW, Hundley HA. The dsRBP and inactive editor ADR-1 utilizes dsRNA binding to regulate A-to-I RNA editing across the C. elegans transcriptome. Cell Rep. 2014 Feb 27;6(4):599-607. doi: 10.1016/j.celrep.2014.01.011. Epub 2014 Feb 6. PMID: 24508457; PMCID: PMC3959997. +#' Washburn MC, Kakaradov B, Sundararaman B, Wheeler E, Hoon S, Yeo GW, Hundley +#' HA. The dsRBP and inactive editor ADR-1 utilizes dsRNA binding to regulate +#' A-to-I RNA editing across the C. elegans transcriptome. Cell Rep. 2014 +#' Feb 27;6(4):599-607. doi: 10.1016/j.celrep.2014.01.011. Epub 2014 Feb 6. +#' PMID: 24508457; PMCID: PMC3959997. #' #' SAILOR pipeline: https://github.com/YeoLab/sailor #' @importFrom stats pbeta #' @export -calc_confidence <- function(se, - edit_to = "G", - edit_from = "A", - per_sample = FALSE, - exp_fraction = 0.01, - alpha = 0L, - beta = 0L) { +calc_confidence <- function( + se, + edit_to = "G", + edit_from = "A", + per_sample = FALSE, + exp_fraction = 0.01, + alpha = 0L, + beta = 0L) { if (length(exp_fraction) != 1 || (exp_fraction < 0 || exp_fraction > 1)) { cli::cli_abort("exp_fraction must be numeric(1) and between 0 and 1") } - + if (length(alpha) != 1 || length(beta) != 1) { cli::cli_abort("alpha and beta must be length 1") } - + edit_to <- paste0("n", edit_to) edit_from <- paste0("n", edit_from) alt <- assay(se, edit_to) + as.integer(beta) diff --git a/R/pileup.R b/R/pileup.R index 4cc7f64..007bbac 100644 --- a/R/pileup.R +++ b/R/pileup.R @@ -1,23 +1,24 @@ #' Generate base counts using pileup -#' -#' @description This function uses a pileup routine to examine numerate base counts from -#' alignments at specified sites, regions, or across all read alignments, from one or -#' more BAM files. Alignment and site filtering -#' options are controlled by the `FilterParam` class.A [RangedSummarizedExperiment] object -#' is returned, populated with base count statistics for each supplied BAM file. -#' -#' @param bamfiles a character vector, [BamFile] or [BamFileList] indicating 1 or -#' more BAM files to process. If named, the names will be included in the [colData] -#' of the [RangedSummarizedExperiment] as a `sample` column, otherwise the names will -#' be taken from the basename of the BAM file. -#' @param fasta path to genome fasta file used for read alignment. Can be provided in -#' compressed gzip or bgzip format. +#' +#' @description This function uses a pileup routine to examine numerate base +#' counts from alignments at specified sites, regions, or across all read +#' alignments, from one or more BAM files. Alignment and site filtering +#' options are controlled by the `FilterParam` class. A +#' [RangedSummarizedExperiment] object is returned, populated with base count +#' statistics for each supplied BAM file. +#' +#' @param bamfiles a character vector, [BamFile] or [BamFileList] indicating 1 +#' or more BAM files to process. 
If named, the names will be included in the +#' [colData] of the [RangedSummarizedExperiment] as a `sample` column, otherwise +#' the names will be taken from the basename of the BAM file. +#' @param fasta path to genome fasta file used for read alignment. Can be +#' provided in compressed gzip or bgzip format. #' @param sites a [GRanges] object containing regions or sites to process. -#' @param region samtools region query string (i.e. `chr1:100-1000`). Can be combined -#' with sites, in which case sites will be filtered to keep only sites within the -#' region. -#' @param chroms chromosomes to process, provided as a character vector. Not to be used with the -#' region parameter. +#' @param region samtools region query string (i.e. `chr1:100-1000`). Can be +#' combined with sites, in which case sites will be filtered to keep only sites +#' within the region. +#' @param chroms chromosomes to process, provided as a character vector. Not to +#' be used with the region parameter. #' @param param object of class [FilterParam()] which specify various #' filters to apply to reads and sites during pileup. #' @param umi_tag The BAM tag containing a UMI sequence. If supplied, multiple @@ -27,7 +28,8 @@ #' region. #' @param verbose if TRUE, then report progress and warnings. #' -#' @returns A [RangedSummarizedExperiment] object populated with multiple assays: +#' @returns A [RangedSummarizedExperiment] object populated with +#' multiple assays: #' * `ALT`: Alternate base(s) found at each position #' * `nRef`: # of reads supporting the reference base #' * `nAlt`: # of reads supporting an alternate base @@ -110,7 +112,6 @@ pileup_sites <- function(bamfiles, BPPARAM = SerialParam(), umi_tag = NULL, verbose = FALSE) { - if (!is(bamfiles, "BamFileList")) { bamfiles <- BamFileList(bamfiles) } @@ -124,7 +125,9 @@ pileup_sites <- function(bamfiles, if (!is.null(sites) && !is(sites, "GRanges")) { cl <- class(sites) - cli::cli_abort("invalid object passed to sites, expecting GRanges found {cl}") + cli::cli_abort( + "invalid object passed to sites, expecting GRanges found {cl}" + ) } bf_exists <- file.exists(path(bamfiles)) @@ -198,12 +201,11 @@ pileup_sites <- function(bamfiles, res <- res[2:length(res)] res <- lists_to_grs(res, contigs) res <- merge_pileups(res, - sample_names = sample_ids) + sample_names = sample_ids + ) rowData(res) <- cbind(rowData(res), rdat) - } else { res <- bplapply(chroms_to_process, function(ctig) { - if (is.null(ctig)) { ctig <- character() } @@ -268,15 +270,15 @@ setup_valid_regions <- function(bam, chroms, region = NULL, fasta = NULL) { chroms_to_process[!chroms_to_process %in% names(contig_info)] if (length(missing_chroms) > 0) { - msg <- paste(missing_chroms, collapse = "\n") - cli::cli_warn(c( - "the following chromosomes are not present in the bamfile(s):", - "{msg}" - )) + msg <- paste(missing_chroms, collapse = "\n") + cli::cli_warn(c( + "the following chromosomes are not present in the bamfile(s):", + "{msg}" + )) chroms_to_process <- setdiff(chroms_to_process, missing_chroms) } - if(!is.null(fasta)){ + if (!is.null(fasta)) { chroms_in_fa <- seqnames(Rsamtools::scanFaIndex(fasta)) missing_chroms <- chroms_to_process[!chroms_to_process %in% levels(chroms_in_fa)] @@ -298,19 +300,21 @@ setup_valid_regions <- function(bam, chroms, region = NULL, fasta = NULL) { cli::cli_abort("There are no valid chromosomes to process") } - list(chroms = chroms_to_process, - all_contigs = contigs, - region = region) + list( + chroms = chroms_to_process, + all_contigs = contigs, + region = 
region + ) } # IRanges/GRanges are limited to this max int MAX_INT <- 536870912 defaultBulkBamFlags <- Rsamtools::scanBamFlag( - isSecondaryAlignment = FALSE, - isNotPassingQualityCont = FALSE, - isDuplicate = FALSE, - isSupplementaryAlignment = FALSE + isSecondaryAlignment = FALSE, + isNotPassingQualityCont = FALSE, + isDuplicate = FALSE, + isSupplementaryAlignment = FALSE ) @@ -344,11 +348,9 @@ get_region <- function(region) { homopolymer_len = "integer", max_mismatch_type = "integer", # length 2 min_variant_reads = "integer", - only_keep_variants = "logical", # variable length report_multiallelic = "logical", remove_overlaps = "logical", - ftrim_5p = "numeric", ftrim_3p = "numeric", read_bqual = "numeric", # length 2 @@ -364,10 +366,12 @@ setMethod(show, "FilterParam", function(object) { }) -encode_libtype <- function(library_type = c("unstranded", - "fr-first-strand", - "fr-second-strand"), - n_files) { +encode_libtype <- function(library_type = c( + "unstranded", + "fr-first-strand", + "fr-second-strand" + ), + n_files) { # encode libtype as integer # 0 = unstranded all reads on + strand # 1 = fr-first-strand strand based on R1/antisense, R2/sense @@ -428,7 +432,7 @@ adjustParams <- function(filterParam, nFiles) { c_args_FilterParam <- function(x, ...) { - fp <-as_list_FilterParam(x) + fp <- as_list_FilterParam(x) # consistent length args are populated into vectors # note that unlisting will increase vector size greater than number of args @@ -494,7 +498,8 @@ cfilterParam <- function(param, nfiles) { #' type will be reported with variants w.r.t the + strand. Values for each #' input BAM file can be provided as a vector. #' @param only_keep_variants if TRUE, then only variant sites will be reported -#' (FALSE by default). Values for each input BAM file can be provided as a vector. +#' (FALSE by default). Values for each input BAM file can be provided as a +#' vector. #' @param bam_flags bam flags to filter or keep, use [Rsamtools::scanBamFlag()] #' to generate. #' @param trim_5p Bases to trim from 5' end of read alignments @@ -515,18 +520,19 @@ cfilterParam <- function(param, nfiles) { #' c(X, Y). #' @param read_bqual Exclude read if more than X percent of the bases have #' base qualities less than Y. Numeric vector of length 2. e.g. c(0.25, 20) -#' @param min_variant_reads Required number of reads containing a variant for a site -#' to be reported. Calculated per bam file, such that if 1 bam file has >= min_variant_reads, -#' then the site will be reported. -#' @param min_allelic_freq minimum allelic frequency required for a variant to be -#' reported in ALT assay. -#' @param report_multiallelic if TRUE, report sites with multiple variants passing -#' filters. If FALSE, site will not be reported. -#' @param remove_overlaps if TRUE, enable read pair overlap detection, which will count only -#' 1 read in regions where read pairs overlap using the htslib algorithm. In brief -#' for each overlapping base pair the base quality of the base with the lower quality -#' is set to 0, which discards it from being counted. -#' +#' @param min_variant_reads Required number of reads containing a variant for a +#' site to be reported. Calculated per bam file, such that if 1 bam file has >= +#' min_variant_reads, then the site will be reported. +#' @param min_allelic_freq minimum allelic frequency required for a variant to +#' be reported in ALT assay. +#' @param report_multiallelic if TRUE, report sites with multiple variants +#' passing filters. If FALSE, site will not be reported. 
+#' @param remove_overlaps if TRUE, enable read pair overlap detection, which +#' will count only 1 read in regions where read pairs overlap using the htslib +#' algorithm. In brief for each overlapping base pair the base quality of the +#' base with the lower quality is set to 0, which discards it from being +#' counted. +#' #' @rdname pileup_sites #' @export FilterParam <- @@ -574,11 +580,12 @@ FilterParam <- stopifnot(ftrim_5p >= 0 && ftrim_5p <= 1) stopifnot(ftrim_3p >= 0 && ftrim_3p <= 1) - stopifnot(length(max_mismatch_type) == 2 && !any(is.na(max_mismatch_type))) + stopifnot(length(max_mismatch_type) == 2 && + !any(is.na(max_mismatch_type))) stopifnot(length(read_bqual) == 2 && !any(is.na(read_bqual))) stopifnot(isTRUEorFALSE(report_multiallelic)) stopifnot(isTRUEorFALSE(remove_overlaps)) - + # variable length depending on n_files stopifnot(is.character(library_type)) stopifnot(is.integer(min_mapq)) @@ -588,8 +595,11 @@ FilterParam <- # defaults to allowing all reads bam_flags <- Rsamtools::scanBamFlag() } else { - if (length(bam_flags) != 2 || !all(names(bam_flags) == c("keep0", "keep1"))) { - stop("bam_flags must be generated using Rsamtools::scanBamFlag()") + if (length(bam_flags) != 2 || + !all(names(bam_flags) == c("keep0", "keep1"))) { + stop( + "bam_flags must be generated using Rsamtools::scanBamFlag()" + ) } } @@ -597,17 +607,27 @@ FilterParam <- ## creation .FilterParam( - max_depth = max_depth, min_base_quality = min_base_quality, - min_mapq = min_mapq, min_depth = min_depth, - library_type = library_type, only_keep_variants = only_keep_variants, - trim_5p = trim_5p, trim_3p = trim_3p, indel_dist = indel_dist, - splice_dist = splice_dist, homopolymer_len = homopolymer_len, - max_mismatch_type = max_mismatch_type, read_bqual = read_bqual, + max_depth = max_depth, + min_base_quality = min_base_quality, + min_mapq = min_mapq, + min_depth = min_depth, + library_type = library_type, + only_keep_variants = only_keep_variants, + trim_5p = trim_5p, + trim_3p = trim_3p, + indel_dist = indel_dist, + splice_dist = splice_dist, + homopolymer_len = homopolymer_len, + max_mismatch_type = max_mismatch_type, + read_bqual = read_bqual, min_splice_overhang = min_splice_overhang, min_variant_reads = min_variant_reads, - ftrim_5p = ftrim_5p, ftrim_3p = ftrim_3p, - min_allelic_freq = min_allelic_freq, report_multiallelic = report_multiallelic, - bam_flags = bam_flags, remove_overlaps = remove_overlaps + ftrim_5p = ftrim_5p, + ftrim_3p = ftrim_3p, + min_allelic_freq = min_allelic_freq, + report_multiallelic = report_multiallelic, + bam_flags = bam_flags, + remove_overlaps = remove_overlaps ) } @@ -663,9 +683,9 @@ gr_to_cregions <- function(gr) { #' @description Create a `RangedSummarizedExperiment` from a single or list of #' pileups (e.g., for different samples) generated by `pileup_sites()`. #' -#' @param plps results from running [pileup_sites()], can be one result, a list of -#' results, or a named list of results. If a named list is given, the colData -#' will be named using the names in the list. +#' @param plps results from running [pileup_sites()], can be one result, a +#' list of results, or a named list of results. If a named list is given, the +#' colData will be named using the names in the list. #' @param assay_cols character vector of columns to store as assays #' @param sample_names A list of names to be added to the SE object. 
If no #' sample names are given and plps is not a named list, then default names (ie @@ -702,7 +722,8 @@ merge_pileups <- function(plps, } else { if (length(plps) != length(sample_names)) { cli::cli_abort(c( - "You must provide the same number of sample names as pileup results", + "You must provide the same number of sample names ", + "as pileup results.", "You supplied {length(plps)} pileup results but supplied ", "{length(sample_names} sample names." )) @@ -860,7 +881,7 @@ site_names <- function(gr, allele = FALSE) { if (length(gr) == 0) { return(NULL) } - if(allele) { + if (allele) { stopifnot(all(c("ALT", "REF") %in% colnames(mcols(gr)))) res <- paste0( "site", "_", @@ -880,5 +901,4 @@ site_names <- function(gr, allele = FALSE) { ) } res - } diff --git a/R/sc-pileup.R b/R/sc-pileup.R index 053170b..78036c5 100644 --- a/R/sc-pileup.R +++ b/R/sc-pileup.R @@ -1,10 +1,10 @@ #' Generate base counts per cell #' -#' @description This function processes scRNA-seq library to enumerate base counts -#' for Reference (unedited) or Alternate ( -#' edited) bases at specified sites in single cells. `pileup_cells` can process -#' droplet scRNA-seq libraries, from a BAM file containing a cell-barcode and UMI, -#' or well-based libraries that do not contain cell-barcodes. +#' @description This function processes scRNA-seq library to enumerate base +#' counts for Reference (unedited) or Alternate (edited) bases at specified +#' sites in single cells. `pileup_cells` can process droplet scRNA-seq +#' libraries, from a BAM file containing a cell-barcode and UMI, or well-based +#' libraries that do not contain cell-barcodes. #' #' The `sites` parameter specifies sites to quantify. This must be a [GRanges] #' object with 1 base intervals, a strand (+ or -), and supplemented with @@ -15,26 +15,28 @@ #' each ref and alt base enumerated for each cell-barcode present. A single #' base will be counted once for each UMI sequence present in each cell. #' -#' @param bamfiles a path to a BAM file (for droplet scRNA-seq), or a vector of paths -#' to BAM files (Smart-seq2). Can be supplied as a character vector, [BamFile], or -#' [BamFileList]. +#' @param bamfiles a path to a BAM file (for droplet scRNA-seq), or a vector of +#' paths to BAM files (Smart-seq2). Can be supplied as a character vector, +#' [BamFile], or [BamFileList]. #' @param sites a GRanges object containing sites to process. See examples for #' valid formatting. -#' @param output_directory Output directory for output matrix files. The directory -#' will be generated if it doesn't exist. +#' @param output_directory Output directory for output matrix files. The +#' directory will be generated if it doesn't exist. #' @param chroms A character vector of chromosomes to process. If supplied, only #' sites present in the listed chromosomes will be processed -#' @param cell_barcodes A character vector of single cell barcodes to process. If -#' processing multiple BAM files (e.g. Smart-seq2), provide a character vector -#' of unique identifiers for each input BAM, to name each BAM file in the output files. +#' @param cell_barcodes A character vector of single cell barcodes to process. +#' If processing multiple BAM files (e.g. Smart-seq2), provide a character +#' vector of unique identifiers for each input BAM, to name each BAM file in the +#' output files. #' @param param object of class [FilterParam()] which specify various filters to #' apply to reads and sites during pileup. 
Note that the `min_depth` and #' `min_variant_reads` parameters if set > 0 specify the number of reads -#' from any cell required in order to report a site. E.g. if `min_variant_reads` is -#' set to 2, then at least 2 reads (from any cell) must have a variant in order -#' to report the site. Setting `min_depth` and `min_variant_reads` to 0 reports -#' all sites present in the `sites` object. The following options are not enabled -#' for pileup_cells(): `max_mismatch_type`, `homopolymer_len`, and `min_allelic_freq`. +#' from any cell required in order to report a site. E.g. if +#' `min_variant_reads` is set to 2, then at least 2 reads (from any cell) must +#' have a variant in order to report the site. Setting `min_depth` and +#' `min_variant_reads` to 0 reports all sites present in the `sites` object. +#' The following options are not enabled for pileup_cells(): +#' `max_mismatch_type`, `homopolymer_len`, and `min_allelic_freq`. #' @param umi_tag tag in BAM containing the UMI sequence #' @param cb_tag tag in BAM containing the cell-barcode sequence #' @param return_sce if `TRUE`, data is returned as a SingleCellExperiment, if @@ -50,8 +52,8 @@ #' for the reference and alternate alleles. The [rowRanges()] will contain the #' genomic interval for each site, along with `REF` and `ALT` columns. The #' rownames will be populated with the format -#' `site_[seqnames]_[position(1-based)]_[strand]_[allele]`, with `strand` being encoded -#' as 1 = +, 2 = -, and 3 = *, and allele being `REF` + `ALT`. +#' `site_[seqnames]_[position(1-based)]_[strand]_[allele]`, with `strand` +#' being encoded as 1 = +, 2 = -, and 3 = *, and allele being `REF` + `ALT`. #' #' If `return_sce` is `FALSE` then a character vector of paths to the #' sparseMatrix files (`barcodes.txt.gz`, `sites.txt.gz`, `counts.mtx.gz`), @@ -80,10 +82,12 @@ #' #' many_small_bams <- rep(bam_fn, 10) #' bam_ids <- LETTERS[1:10] -#' -#' fp <- FilterParam(library_type = "unstranded", -#' remove_overlaps = TRUE) -#' +#' +#' fp <- FilterParam( +#' library_type = "unstranded", +#' remove_overlaps = TRUE +#' ) +#' #' pileup_cells(many_small_bams, #' sites = gr, #' cell_barcodes = bam_ids, @@ -112,8 +116,7 @@ pileup_cells <- function(bamfiles, BPPARAM = SerialParam(), return_sce = TRUE, verbose = FALSE) { - - if(!is(bamfiles, "BamFileList")) { + if (!is(bamfiles, "BamFileList")) { bamfiles <- BamFileList(bamfiles) } @@ -164,9 +167,12 @@ pileup_cells <- function(bamfiles, umi_tag <- check_tag(umi_tag) valid_regions <- setup_valid_regions(bamfiles[[1]], chroms) - chroms_to_process <- intersect(valid_regions$chroms, unique(seqnames(sites))) + chroms_to_process <- intersect( + valid_regions$chroms, + unique(seqnames(sites)) + ) sites <- sites[seqnames(sites) %in% chroms_to_process, ] - + if (verbose) cli::cli_alert("Beginning pileup") bf <- path.expand(path(bamfiles)) bfi <- path.expand(index(bamfiles)) @@ -257,9 +263,9 @@ defaultScBamFlags <- Rsamtools::scanBamFlag( ) get_sc_pileup <- function(bamfn, index, id, sites, barcodes, - outfile_prefix, chrom, - umi_tag, cb_tag, param, - verbose) { + outfile_prefix, chrom, + umi_tag, cb_tag, param, + verbose) { if (length(chrom) > 0) { sites <- sites[seqnames(sites) %in% chrom, ] } @@ -302,7 +308,8 @@ get_sc_pileup <- function(bamfn, index, id, sites, barcodes, # alternatively can regenerate gr regions using data in files, however # using the indexes is likely less error prone sce <- read_sparray(plp_outfns[1], plp_outfns[2], plp_outfns[3], - site_format = "index") + site_format = "index" + ) ridx <- 
as.integer(rownames(sce)) rowRanges(sce) <- sites[ridx] @@ -346,7 +353,7 @@ get_sc_pileup <- function(bamfn, index, id, sites, barcodes, #' bai <- indexBam(bam_fn) #' #' fp <- FilterParam(library_type = "fr-second-strand") -#' mtx_fns <- pileup_cells(bam_fn, gr, cbs, outdir, return_sce = FALSE) +#' mtx_fns <- pileup_cells(bam_fn, gr, cbs, outdir, return_sce = FALSE) #' sce <- read_sparray(mtx_fns[1], mtx_fns[2], mtx_fns[3]) #' sce #' @@ -358,30 +365,35 @@ get_sc_pileup <- function(bamfn, index, id, sites, barcodes, #' @importFrom R.utils gzip #' @export read_sparray <- function(mtx_fn, sites_fn, bc_fn, - site_format = c("coordinate", "index")) { - + site_format = c("coordinate", "index")) { if (!file.size(sites_fn) > 0) { return(SingleCellExperiment::SingleCellExperiment()) } rnames <- data.table::fread(sites_fn, - sep = "\t", - col.names = c("index", "seqnames", "start", - "strand", "REF", "ALT"), - colClasses = c("integer", "character", "integer", - "integer", "character", "character"), - data.table = FALSE) + sep = "\t", + col.names = c( + "index", "seqnames", "start", + "strand", "REF", "ALT" + ), + colClasses = c( + "integer", "character", "integer", + "integer", "character", "character" + ), + data.table = FALSE + ) site_format <- match.arg(site_format) # reconstruct rowRanges using index value or - if(site_format == "index") { + if (site_format == "index") { rnames <- rnames$index } else if (site_format == "coordinate") { rnames <- rnames[, -1] rnames$strand <- c("+", "-")[rnames$strand] gr <- makeGRangesFromDataFrame(rnames, - end.field = "start", - keep.extra.columns = TRUE) + end.field = "start", + keep.extra.columns = TRUE + ) rnames <- site_names(gr, allele = TRUE) } @@ -391,12 +403,13 @@ read_sparray <- function(mtx_fn, sites_fn, bc_fn, sp_mtx_names <- c("nRef", "nAlt") n_sp_cols <- 2 + length(sp_mtx_names) - if(file.size(mtx_fn) > 0) { + if (file.size(mtx_fn) > 0) { dt <- data.table::fread(mtx_fn, - sep = " ", - colClasses = "integer", - skip = n_skip, - header = FALSE) + sep = " ", + colClasses = "integer", + skip = n_skip, + header = FALSE + ) if (ncol(dt) != n_sp_cols) cli::cli_abort("malformed sparseMatrix") @@ -413,17 +426,17 @@ read_sparray <- function(mtx_fn, sites_fn, bc_fn, # handle case where sites were queried, but no counts sps <- lapply(seq_along(sp_mtx_names), function(x) { sparseMatrix(integer(0), - integer(0), - x = 0L, - dims = c(length(rnames), length(cnames))) + integer(0), + x = 0L, + dims = c(length(rnames), length(cnames)) + ) }) - } names(sps) <- sp_mtx_names res <- SingleCellExperiment::SingleCellExperiment(sps) colnames(res) <- cnames - if(site_format == "coordinate") { + if (site_format == "coordinate") { rowRanges(res) <- gr } rownames(res) <- rnames @@ -431,42 +444,46 @@ read_sparray <- function(mtx_fn, sites_fn, bc_fn, } write_sparray <- function(sce, mtx_fn, sites_fn, bc_fn) { - if(!all(c("nRef", "nAlt") %in% assayNames(sce))) { + if (!all(c("nRef", "nAlt") %in% assayNames(sce))) { cli::cli_abort("missing required asssays nRef or nAlt") } nref <- assay(sce, "nRef") nalt <- assay(sce, "nAlt") - if(!is(nref, 'sparseMatrix')) { + if (!is(nref, "sparseMatrix")) { cli::cli_abort("nRef must be a sparseMatrix") } - if(!is(nalt, 'sparseMatrix')) { + if (!is(nalt, "sparseMatrix")) { cli::cli_abort("nAlt must be a sparseMatrix") } nref_trpl <- summary(nref) nalt_trpl <- summary(nalt) conforms <- identical(dim(nref_trpl), dim(nalt_trpl)) - if(!conforms){ + if (!conforms) { cli::cli_abort("nRef and nAlt sparseMatrices triplet dimensions differ") } 
writeLines( - c("%%% raer MatrixMarket-like matrix coordinate integer general", - paste("%%% ", nref@Dim[1], nref@Dim[2], length(nref@x)), - "%%% x y nRef nAlt"), - gzfile(mtx_fn)) + c( + "%%% raer MatrixMarket-like matrix coordinate integer general", + paste("%%% ", nref@Dim[1], nref@Dim[2], length(nref@x)), + "%%% x y nRef nAlt" + ), + gzfile(mtx_fn) + ) mtx <- matrix(0L, nrow = dim(nref_trpl)[1], ncol = 4L) mtx <- cbind(nref_trpl, nalt = nalt_trpl$x) data.table::fwrite(mtx, mtx_fn, - append = TRUE, - sep = " ", - row.names = FALSE, - col.names = FALSE, - showProgress = FALSE) + append = TRUE, + sep = " ", + row.names = FALSE, + col.names = FALSE, + showProgress = FALSE + ) sites <- data.frame( seq_along(sce), @@ -478,10 +495,11 @@ write_sparray <- function(sce, mtx_fn, sites_fn, bc_fn) { ) data.table::fwrite(sites, sites_fn, - sep = "\t", - row.names = FALSE, - col.names = FALSE, - showProgress = FALSE) + sep = "\t", + row.names = FALSE, + col.names = FALSE, + showProgress = FALSE + ) writeLines(colnames(sce), gzfile(bc_fn)) } @@ -499,7 +517,9 @@ gr_to_regions <- function(gr) { } if (any(strand(gr) == "*")) { - cli::cli_alert_warning("missing strand not found in input, coercing strand to '+'") + cli::cli_alert_warning( + "missing strand not found in input, coercing strand to '+'" + ) strand(gr) <- "+" } gr$idx <- seq(1, nr) # is one-based index diff --git a/R/utils.R b/R/utils.R index fcdd1cb..2cda9f9 100644 --- a/R/utils.R +++ b/R/utils.R @@ -14,9 +14,9 @@ raer_example <- function(path) { #' Generate a small RangedSummarizedExperiment object for tests and examples #' #' @description -#' A RangedSummarizedExperiment containing a subset of data from an RNA-seq experiment -#' to measure the effects of IFN treatment of cell lines with wild-type -#' or ADAR1-KO. +#' A RangedSummarizedExperiment containing a subset of data from an RNA-seq +#' experiment to measure the effects of IFN treatment of cell lines with +#' wild-type or ADAR1-KO. #' @examples #' mock_rse() #' @@ -59,17 +59,19 @@ chunk_vec <- function(x, n) { #' Find regions with oligodT mispriming #' -#' @description OligodT will prime at A-rich regions in an RNA. Reverse transcription -#' from these internal priming sites will install an oligodT sequence at the 3' end -#' of the cDNA. Sequence variants within these internal priming sites are enriched -#' for variants converting the genomic sequence to the A encoded by the oligodT primer. -#' Trimming poly(A) from the 3' ends of reads reduces but does not eliminate these signals +#' @description OligodT will prime at A-rich regions in an RNA. Reverse +#' transcription from these internal priming sites will install an oligodT +#' sequence at the 3' end of the cDNA. Sequence variants within these internal +#' priming sites are enriched for variants converting the genomic sequence to +#' the A encoded by the oligodT primer. Trimming poly(A) from the 3' ends of +#' reads reduces but does not eliminate these signals #' -#' This function will identify regions that are enriched for mispriming events. Reads -#' that were trimmed to remove poly(A) (encoded in the pa tag by 10x genomics) are -#' identified. The aligned 3' positions of these reads are counted, and sites passing -#' thresholds (at least 2 reads) are retained as possible sites of mispriming. Be default -#' regions 5 bases upstream and 20 bases downstream of these putative mispriming sites +#' This function will identify regions that are enriched for mispriming events. 
+#' Reads that were trimmed to remove poly(A) (encoded in the pa tag by +#' 10x Genomics) are identified. The aligned 3' positions of these reads are +#' counted, and sites passing thresholds (at least 2 reads) are retained as +#' possible sites of mispriming. Be default regions 5 bases upstream and +#' 20 bases downstream of these putative mispriming sites #' are returned. #' #' @param bamfile path to bamfile @@ -85,10 +87,11 @@ chunk_vec <- function(x, n) { #' #' @returns A GenomicsRanges containing regions enriched for putative mispriming #' events. The `n_reads` column specifies the number of polyA trimmed reads -#' overlapping the mispriming region. `mean_pal` indicates the mean length of polyA -#' sequence trimmed from reads overlapping the region. The `n_regions` column specifies the number -#' overlapping independent regions found in each chunk (dictated by `n_reads_per_chunk`). -#' The `A_freq` column indicates the frequency of A bases within the region. +#' overlapping the mispriming region. `mean_pal` indicates the mean length of +#' polyA sequence trimmed from reads overlapping the region. The `n_regions` +#' column specifies the number overlapping independent regions found in each +#' chunk (dictated by `n_reads_per_chunk`). The `A_freq` column indicates the +#' frequency of A bases within the region. #' #' @import GenomicRanges S4Vectors IRanges #' @importFrom Rsamtools ScanBamParam BamFile @@ -100,16 +103,15 @@ chunk_vec <- function(x, n) { #' #' @export find_mispriming_sites <- function(bamfile, fasta, pos_5p = 5, pos_3p = 20, - min_reads = 2, tag = "pa", tag_values = 3:300, - n_reads_per_chunk = 1e6, verbose = TRUE){ - - if(pos_5p < 0 || pos_3p < 0) { + min_reads = 2, tag = "pa", tag_values = 3:300, + n_reads_per_chunk = 1e6, verbose = TRUE) { + if (pos_5p < 0 || pos_3p < 0) { cli::cli_abort("pos_5p and pos_3p must be positive integers") } tg_lst <- list(tag_values) names(tg_lst) <- tag - sbp <- Rsamtools::ScanBamParam(tagFilter = tg_lst,tag = tag) + sbp <- Rsamtools::ScanBamParam(tagFilter = tg_lst, tag = tag) bf <- Rsamtools::BamFile(bamfile, yieldSize = n_reads_per_chunk) open(bf) pa_pks <- GRanges() @@ -119,7 +121,7 @@ find_mispriming_sites <- function(bamfile, fasta, pos_5p = 5, pos_3p = 20, if (length(galn) == 0) break gr <- as(galn, "GRanges") - if(verbose) { + if (verbose) { s_ivl <- gr[1] e_ivl <- gr[length(gr)] message("working on ", s_ivl, " to ", e_ivl) @@ -134,14 +136,15 @@ find_mispriming_sites <- function(bamfile, fasta, pos_5p = 5, pos_3p = 20, mean_pal <- n_reads <- NULL ans <- reduce(pa_pks, with.revmap = TRUE) mcols(ans) <- aggregate(pa_pks, - mcols(ans)$revmap, - mean_pal = mean(mean_pal), - n_reads = sum(n_reads), - drop = FALSE) + mcols(ans)$revmap, + mean_pal = mean(mean_pal), + n_reads = sum(n_reads), + drop = FALSE + ) # keep reads above threshold, slop, and merge adjacent misprimed regions ans <- ans[ans$n_reads >= min_reads] - if(length(ans) == 0) { + if (length(ans) == 0) { return(empty_mispriming_record()) } @@ -151,10 +154,11 @@ find_mispriming_sites <- function(bamfile, fasta, pos_5p = 5, pos_3p = 20, res <- reduce(ans, with.revmap = TRUE) mcols(res) <- aggregate(ans, - mcols(res)$revmap, - mean_pal = mean(mean_pal), - n_reads = sum(n_reads), - drop = FALSE) + mcols(res)$revmap, + mean_pal = mean(mean_pal), + n_reads = sum(n_reads), + drop = FALSE + ) res$n_regions <- IRanges::grouplengths(res$grouping) res$grouping <- NULL res <- pa_seq_context(res, fasta) @@ -183,19 +187,21 @@ merge_pa_peaks <- function(gr) { pa <- NULL ans <- reduce(gr, 
with.revmap = TRUE) mcols(ans) <- aggregate(gr, - mcols(ans)$revmap, - mean_pal = mean(pa), - drop = FALSE) + mcols(ans)$revmap, + mean_pal = mean(pa), + drop = FALSE + ) mcols(ans)$n_reads <- grouplengths(ans$grouping) ans } #' @importFrom Rsamtools FaFile scanFa #' @importFrom Biostrings letterFrequency reverseComplement -pa_seq_context <- function(gr, fasta){ +pa_seq_context <- function(gr, fasta) { fa <- Rsamtools::FaFile(fasta) seqs <- Rsamtools::scanFa(fa, gr) - seqs[strand(gr) == "-"] <- Biostrings::reverseComplement(seqs[strand(gr) == "-"]) + rvcomp <- Biostrings::reverseComplement(seqs[strand(gr) == "-"]) + seqs[strand(gr) == "-"] <- rvcomp a_prop <- Biostrings::letterFrequency(seqs, "A") / width(gr) mcols(gr)$A_freq <- a_prop[, 1] gr @@ -213,7 +219,8 @@ seqinfo_from_header <- function(bam) { stopifnot(length(bam) == 1) stopifnot(is(bam, "BamFile")) ctigs <- Rsamtools::scanBamHeader(path(bam), - index = index(bam))[[1]]$targets + index = index(bam) + )[[1]]$targets GenomeInfoDb::Seqinfo(names(ctigs), ctigs) } @@ -226,19 +233,22 @@ comp_bases <- function(x) { # Check is ALT allele matches snpDB allele check_snp_match <- function(x, snp_col = "snp_alt_alleles", stranded = TRUE) { stopifnot(all(c(snp_col, "ALT") %in% - colnames(mcols(x)))) + colnames(mcols(x)))) alt <- mcols(x)$ALT snp_seq_str <- mcols(x)[[snp_col]] snp_seqs <- strsplit(snp_seq_str, ",") - # convert ALT to + strand representation to match SNP sequence representation - if(stranded) { + # convert ALT to + strand representation to match SNP sequence + # representation + if (stranded) { is_minus <- as.logical(strand(x) == "-") alt[is_minus] <- comp_bases(alt[is_minus]) } - res <- mapply(function(x, y) {x %in% y}, decode(alt), snp_seqs, USE.NAMES = FALSE) + res <- mapply(function(x, y) { + x %in% y + }, decode(alt), snp_seqs, USE.NAMES = FALSE) # set sites without a SNP to NA res[snp_seq_str == ""] <- NA res @@ -248,11 +258,12 @@ check_tag <- function(tag) { if (!is.null(tag)) { if (length(tag) != 1 && nchar(tag) != 2) { tag_variable <- as.list(match.call())$tag - cli::cli_abort("{tag_variable} must be a character(1) with nchar of 2 ") + cli::cli_abort( + "{tag_variable} must be a character(1) with nchar of 2" + ) } } else { tag <- character() } tag } - diff --git a/README.Rmd b/README.Rmd index b024d68..a9ea956 100644 --- a/README.Rmd +++ b/README.Rmd @@ -28,11 +28,12 @@ raer facilitates analysis of RNA adenosine editing in the You can install the development version of raer from [GitHub](https://github.com/rnabioco/raer) with: ```{r, eval = FALSE} -if (!require("BiocManager", quietly = TRUE)) +if (!require("BiocManager", quietly = TRUE)) { install.packages("BiocManager") +} # The following initializes usage of Bioc devel -BiocManager::install(version='devel') +BiocManager::install(version = "devel") BiocManager::install("rnabioco/raer") ``` diff --git a/inst/extdata/README b/inst/extdata/README deleted file mode 100644 index beeb4c3..0000000 --- a/inst/extdata/README +++ /dev/null @@ -1,3 +0,0 @@ -Data files here were generated using code in the raerdata package from these Rmarkdown files: -https://github.com/rnabioco/raerdata/blob/devel/inst/scripts/10x-data-processing.Rmd -https://github.com/rnabioco/raerdata/blob/devel/inst/scripts/GSE99249-data-processing.Rmd diff --git a/man/figures/raer-logo.ai b/inst/misc/raer-logo.ai similarity index 100% rename from man/figures/raer-logo.ai rename to inst/misc/raer-logo.ai diff --git a/inst/scripts/10x-data-processing.Rmd b/inst/scripts/10x-data-processing.Rmd new file mode 
100644 index 0000000..58d1e9b --- /dev/null +++ b/inst/scripts/10x-data-processing.Rmd @@ -0,0 +1,123 @@ +--- +title: "10x Genomics datasets" +output: html_document +date: "2023-05-22" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +```{r} +library(R.utils) +library(SingleCellExperiment) +library(DropletUtils) +library(rtracklayer) +``` + +## Raer package data + +### Mouse 10x single cell RNA-seq + +Make a tiny bam file from a 10x experiment, in this case a mouse brain single cell RNA-seq experiment. + +First, make a tiny fasta file from a few regions with editing sites. Note that database files used here were downloaded using the script provided at `inst/scripts/dbases/get_databases.sh`. + +```{r} +regions <- c("chr2:116032569-116033683", + "chr6:48079510-48079909", + "chr11:75300212-75300704", + "chr8:65639891-65640286") +regions <- GRanges(regions) +strand(regions) <- "+" + +seqs <- getSeq(FaFile("dbases/GRCm38.primary_assembly.genome.fa"), regions) +names(seqs) <- sub("chr", "", names(seqs)) +export(seqs, "../extdata/mouse_tiny.fasta") + + +gtf_lines <- c( + '2\tunknown\texon\t1\t1115\t.\t-\t.\tgene_id Meis2; transcript_id Meis2; gene_name Meis2; gene_biotype "protein_coding";', + '6\tunknown\texon\t1\t400\t.\t-\t.\tgene_id Zfp746; transcript_id Zfp746; gene_name Zfp746; gene_biotype "protein_coding";', + '10\tunknown\texon\t1\t616\t.\t+\t.\tgene_id Ppfia2; transcript_id Ppfia2; gene_name Ppfia2; gene_biotype "protein_coding";', + '11\tunknown\texon\t1\t493\t.\t-\t.\tgene_id Rpa1; transcript_id Rpa1; gene_name Rpa1; gene_biotype "protein_coding";') +writeLines(gtf_lines, "../extdata/mouse_tiny.gtf") +``` + +Next, build a cellranger reference for the tiny fasta + +```{bash} +mkdir -p mouse_5k_neuron +cd mouse_5k_neuron + +cellranger mkref --genome=mm10_tiny --fasta=../extdata/mouse_tiny.fasta --genes=../extdata/mouse_tiny.gtf +``` + +Download bam file, downsample to 1%, convert to fastqs, and requantify using cellranger count. + +```{bash} +cd mouse_5k_neuron +wget https://cg.10xgenomics.com/samples/cell-exp/3.0.2/5k_neuron_v3_nextgem/5k_neuron_v3_nextgem_possorted_genome_bam.bam +wget https://cg.10xgenomics.com/samples/cell-exp/3.0.2/5k_neuron_v3_nextgem/5k_neuron_v3_nextgem_possorted_genome_bam.bam.bai + +# downsample +samtools view -@ 8 -b -s 42.01 5k_neuron_v3_nextgem_possorted_genome_bam.bam > 5k_neuron_v3_nextgem_possorted_genome_bam_0.1.bam + +# bamtofastq from 10x genomics +~/bin/bamtofastq-1.3.2 5k_neuron_v3_nextgem_possorted_genome_bam_0.1.bam ./fastq/ + +# cellranger from 10x genomics +cellranger count \ + --id 5k_neuron_tiny \ + --transcriptome=mouse_5k_neuron/mm10_tiny \ + --fastqs fastq/5k_neuron_v3_nextgem_0_1_HKF7VDSXX \ + --sample bamtofastq \ + --jobmode local \ + --localcores 6 \ + --localmem 4 \ + --chemistry SC3Pv3 +``` + +Next extract a small subset of reads. 
+ +```{bash} +cd mouse_5k_neuron/5k_neuron_tiny/outs + +# find UMIs duplicated in file +samtools view -f 1024 possorted_genome_bam.bam 2 \ + | egrep -o "UB:Z:[A-Z]+" \ + | sed 's/UB:Z://'g > umi_tags_to_keep.txt + +# get reads from UMIs +samtools view -b -D UB:umi_tags_to_keep.txt possorted_genome_bam.bam > tmp.bam + +# get all other reads from cbs with these reads +samtools view tmp.bam | cut -f 1 | sort | uniq > tmp_reads.txt + +# and a few random reads +samtools view -s 42.01 possorted_genome_bam.bam 2 | head -n 500 >> tmp_reads.txt +cat tmp_reads.txt | sort | uniq > tmp_uniq_reads.txt + +samtools view -b -N tmp_uniq_reads.txt possorted_genome_bam.bam > tmp2.bam +samtools merge -f -o 5k_neuron_mouse_possort.bam tmp.bam tmp2.bam +samtools index 5k_neuron_mouse_possort.bam + +cd ../../../ + +mv 5k_neuron_mouse_possort.bam* ../extdata/ +``` + + +clean up + +```{bash} +rm -rf mouse_5k_neuron extdata/mouse_tiny.gtf +``` + + +
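
A minimal sanity check, assuming the files moved into `../extdata` above have been installed with the package and are therefore visible to `raer_example()` (file names follow the outputs written by the steps above):

```r
# Sanity check of the generated test data; assumes the files written to
# inst/extdata above are installed with raer and visible to raer_example().
library(raer)
library(Rsamtools)

bam <- raer_example("5k_neuron_mouse_possort.bam")
fa <- raer_example("mouse_tiny.fasta")

# per-contig counts of the retained reads (should be small but non-zero)
idxstatsBam(bam)

# contigs in the BAM header should match the records in the tiny fasta
names(scanBamHeader(bam)[[1]]$targets)
names(scanFa(fa))
```

If the names disagree, check the `sub("chr", "", ...)` renaming applied to the fasta records above.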
Show session info + +```{r code} +sessionInfo() +``` +
diff --git a/inst/scripts/GSE99249-data-processing.Rmd b/inst/scripts/GSE99249-data-processing.Rmd new file mode 100644 index 0000000..1a46cd7 --- /dev/null +++ b/inst/scripts/GSE99249-data-processing.Rmd @@ -0,0 +1,230 @@ +--- +title: "GSE99249: ADAR1 KO 293FT" +author: Kent Riemondy +date: "2023-05-22" +output: + BiocStyle::html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +realign_data <- TRUE +``` + +```{r} +library(glue) +library(rtracklayer) +library(Rsamtools) +library(raer) + +output_dir <- "../extdata" + +sra_ids <- c("SRR5564269", + "SRR5564277") +``` + +## Obtaining fastq files + +FASTQ files will be downloaded using the fasterq-dump command-line tool from the sratoolkit. + +```{r, eval = realign_data} + +fq_suffixes <- paste0("_", 1:2, ".fastq") +fq_files <- list() + +for(i in seq_along(sra_ids)) { + sra_id <- sra_ids[i] + dir.create(file.path("GSE99249", sra_id), recursive = TRUE, showWarnings = FALSE) + outfiles <- file.path("GSE99249", sra_id, paste0(sra_id, fq_suffixes)) + fq_files[[sra_id]] <- outfiles + + if(all(file.exists(outfiles))) next; + + message("downloading sra id: ", sra_id) + rc <- system(paste("fasterq-dump -S -O", file.path("GSE99249", sra_id), sra_id)) + stopifnot(rc == 0L) +} +``` + + +## Read alignment + +Reads are next aligned to the genome using STAR supplemented with splice junctions defined for the hg38 genome. The STAR index and gene annotations in GTF format were generated using the script provided at `inst/scripts/dbases/get_databases.sh`. + +```{r, eval = realign_data} +star_idx <- "dbases/star/GRCh38" +gtf <- "dbases/gencode.v37.annotation.gtf" +star_bam_files <- list() + +for(i in seq_along(sra_ids)){ + + sra_id <- sra_ids[[i]] + r1 <- fq_files[[sra_id]][1] + r2 <- fq_files[[sra_id]][2] + out_prefix <- file.path("GSE99249", sra_id, paste0(sra_id, "_")) + tmp_bam_file <- paste0(out_prefix, "Aligned.out.bam") + srted_bam_file <- paste0(out_prefix, "sorted") + out_bam <- paste0(srted_bam_file, ".bam") + + if(file.exists(out_bam)) { + star_bam_files[[sra_id]] <- out_bam + next; + } + + message("aligning sra_id: ", sra_id) + + star_cmd <- glue("STAR", + "--genomeDir {star_idx}", + "--sjdbGTFfile {gtf}", + "--runThreadN 12", + "--readFilesIn {r1} {r2}", + "--outFileNamePrefix {out_prefix}", + "--outSAMattributes NH HI AS nM MD", + "--outSAMtype BAM Unsorted", + "--outFilterType BySJout", + "--alignSJoverhangMin 8", + "--alignSJDBoverhangMin 2", + "--outFilterMismatchNoverLmax 0.04", + "--alignIntronMin 20", + "--alignIntronMax 1000000", + "--alignMatesGapMax 1000000", + .sep = " ") + rc <- system(star_cmd) + stopifnot(rc == 0L) + + out_bam <- sortBam(tmp_bam_file, srted_bam_file) + unlink(tmp_bam_file) + + star_bam_files[[sra_id]] <- out_bam +} +``` + +# Identify duplicate reads + +Duplicate reads are next identified using `MarkDuplicates` from picard, and bam file index files generated from final output bam files. 
+ +```{r, eval = realign_data} +for(i in seq_along(sra_ids)){ + sra_id <- sra_ids[[i]] + inbam <- star_bam_files[[sra_id]] + outbam <- file.path("GSE99249", paste0(sra_id, ".bam")) + if(file.exists(outbam)) next; + + dedup_log <- file.path("GSE99249", sra_id, "dedup.log") + picard_cmd <- glue("picard MarkDuplicates", + "-Xms2g -Xmx8g -XX:ParallelGCThreads=2", + "-I {inbam}", + "-M {dedup_log}", + "--MAX_FILE_HANDLES_FOR_READ_ENDS_MAP 1000", + "-O {outbam}", + "--REMOVE_DUPLICATES false", + "--CREATE_INDEX false", + "--VALIDATION_STRINGENCY SILENT", + .sep = " ") + rc <- system(picard_cmd) + stopifnot(rc == 0L) + indexBam(outbam) +} +``` + +# Make tiny dataset for testing package + +These files will be included directly in the `raer` package to be used for tests and examples. + +```{r} +regions <- c("chr3:156540241-156540769", + "chr4:176330847-176331494", + "chr5:80627251-80627768") +regions <- GRanges(regions) +strand(regions) <- "+" + +seqs <- BSgenome::getSeq(FaFile(fasta_fn), regions) +names(seqs) <- c("SSR3", "SPCS3", "DHFR") +export(seqs, file.path(output_dir, "human.fasta")) + +strand(regions) <- c("-", "+", "-") +regions$name <- names(seqs) +regions$score <- 0L +export(regions, file.path(output_dir, "regions.bed")) +``` + +```{bash} +mkdir -p GSE99249/temp_data + +## build star index +STAR --runMode genomeGenerate --genomeDir GSE99249/star/ --genomeFastaFiles ../extdata/human.fasta --genomeSAindexNbases 4 + +regions=../extdata/regions.bed + +## extract reads in fasta file regions +samtools view -M -L $regions GSE99249/SRR5564269.bam \ + | cut -f 1 \ + | uniq > GSE99249/temp_data/reads_to_get.txt + +samtools view -b -N GSE99249/temp_data/reads_to_get.txt GSE99249/SRR5564269.bam \ + | samtools sort -n \ + | samtools fastq \ + -1 GSE99249/temp_data/SRR5564269_1.fastq.gz \ + -2 GSE99249/temp_data/SRR5564269_2.fastq.gz + +samtools view -M -L $regions GSE99249/SRR5564277.bam \ + | cut -f 1 \ + | uniq > GSE99249/temp_data/reads_to_get.txt + +samtools view -b -N GSE99249/temp_data/reads_to_get.txt GSE99249/SRR5564277.bam \ + | samtools sort -n \ + | samtools fastq \ + -1 GSE99249/temp_data/SRR5564277_1.fastq.gz \ + -2 GSE99249/temp_data/SRR5564277_2.fastq.gz + +## remap + +STAR \ + --readMapNumber 500 \ + --outSAMtype BAM SortedByCoordinate \ + --outSAMunmapped Within \ + --outSAMmode Full \ + --outSAMattributes NH HI AS nM MD \ + --outFileNamePrefix ../extdata/SRR5564277_ \ + --readFilesCommand "gunzip -c" \ + --genomeDir GSE99249/star/ \ + --readFilesIn \ + GSE99249/temp_data/SRR5564277_1.fastq.gz \ + GSE99249/temp_data/SRR5564277_2.fastq.gz + +samtools index ../extdata/SRR5564277_Aligned.sortedByCoord.out.bam + +STAR \ + --readMapNumber 500 \ + --outSAMtype BAM SortedByCoordinate \ + --outSAMunmapped Within \ + --outSAMmode Full \ + --outSAMattributes NH HI AS nM MD \ + --outFileNamePrefix ../extdata/SRR5564269_ \ + --readFilesCommand "gunzip -c" \ + --genomeDir GSE99249/star/ \ + --readFilesIn \ + GSE99249/temp_data/SRR5564269_1.fastq.gz \ + GSE99249/temp_data/SRR5564269_2.fastq.gz + +samtools index ../extdata/SRR5564269_Aligned.sortedByCoord.out.bam + + +rm -rf GSE99249/temp_data GSE99249/star GSE99249/test_data_regions.bed +``` + +# Clean up + +Unneeded temporary files are next deleted, keeping only the bam and bam index files. + +```{r, eval = realign_data} +unlink(file.path("GSE99249", sra_ids), recursive = TRUE) +``` + + +
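
As a final, illustrative check, the subsetted alignments can be run through the package's bulk pileup. This is a sketch that assumes the BAM, index, and fasta files written to `../extdata` above are installed with the package and accessible via `raer_example()`:

```r
# Pile up the subsetted BAMs over the tiny SSR3/SPCS3/DHFR contigs.
# File names follow the outputs written to ../extdata above.
library(raer)

bams <- c(
    raer_example("SRR5564269_Aligned.sortedByCoord.out.bam"),
    raer_example("SRR5564277_Aligned.sortedByCoord.out.bam")
)
fafn <- raer_example("human.fasta")

# report only variant-containing positions; other filters (e.g. library_type,
# min_depth) can be tuned as needed, see ?FilterParam
fp <- FilterParam(only_keep_variants = TRUE)
rse <- pileup_sites(bams, fasta = fafn, param = fp)
rse
```

A small RangedSummarizedExperiment restricted to the SSR3, SPCS3, and DHFR contigs should be returned if the subsetting above worked as intended.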
Show session info + +```{r code} +sessionInfo() +``` +
diff --git a/inst/scripts/README b/inst/scripts/README new file mode 100644 index 0000000..6ad41ac --- /dev/null +++ b/inst/scripts/README @@ -0,0 +1 @@ +Data files in inst/extdata were generated using scripts included here. \ No newline at end of file diff --git a/inst/scripts/dbases/get_databases.sh b/inst/scripts/dbases/get_databases.sh new file mode 100644 index 0000000..9ccc3ba --- /dev/null +++ b/inst/scripts/dbases/get_databases.sh @@ -0,0 +1,31 @@ +#! /usr/bin/env bash + +# download genome sequences and transcript annotations + +wget http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/GRCh38.primary_assembly.genome.fa.gz +gunzip GRCh38.primary_assembly.genome.fa.gz +samtools faidx GRCh38.primary_assembly.genome.fa + +wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.primary_assembly.genome.fa.gz +gunzip GRCm38.primary_assembly.genome.fa.gz +samtools faidx GRCm38.primary_assembly.genome.fa + +wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gtf.gz +gunzip gencode.v37.annotation.gtf.gz + +wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gtf.gz +gunzip gencode.vM25.annotation.gtf.gz + +# make STAR indexes + +STAR \ + --runMode genomeGenerate \ + --runThreadN 12 \ + --genomeDir star/GRCm38 \ + --genomeFastaFiles GRCm38.primary_assembly.genome.fa + +STAR \ + --runMode genomeGenerate \ + --runThreadN 12 \ + --genomeDir star/GRCh38 \ + --genomeFastaFiles GRCh38.primary_assembly.genome.fa diff --git a/man/annot_from_gr.Rd b/man/annot_from_gr.Rd index e8a0002..e33a29f 100644 --- a/man/annot_from_gr.Rd +++ b/man/annot_from_gr.Rd @@ -11,7 +11,8 @@ annot_from_gr(obj, gr, cols_to_map, RLE = TRUE, sep = ",", ...) \item{gr}{GRanges with annotations to map to obj} -\item{cols_to_map}{character vector of columns from GRanges to map to SummarizedExperiment. +\item{cols_to_map}{character vector of columns from GRanges to map to +SummarizedExperiment. If the vector has names, the names will be the column names in the output.} \item{RLE}{If TRUE, columns added will returned as \code{\link[S4Vectors:Rle-class]{S4Vectors::Rle()}} vectors @@ -27,8 +28,8 @@ annotations provided by the supplied GRanges object. } \description{ Utility function to map annotations from GRanges to rowData of -SummarizedExperiment or to mcols of GRanges object. If multiple features overlap then -they will be concatenated with the specified separtor string . +SummarizedExperiment or to mcols of GRanges object. If multiple features +overlap then they will be concatenated with the specified separtor string. } \examples{ library(SummarizedExperiment) diff --git a/man/calc_AEI.Rd b/man/calc_AEI.Rd index 889f075..8b1afcf 100644 --- a/man/calc_AEI.Rd +++ b/man/calc_AEI.Rd @@ -24,18 +24,19 @@ character vector is supplied the names will be used in the output.} \item{alu_ranges}{\link{GRanges} with regions to query for calculating the AEI, typically ALU repeats.} -\item{txdb}{A \link{TxDb} object, if supplied, will be used to subset the alu_ranges -to those found overlapping genes. Alternatively a \link{GRanges} object with gene -coordinates. If the \code{library_type}, specified by +\item{txdb}{A \link{TxDb} object, if supplied, will be used to subset the +alu_ranges to those found overlapping genes. Alternatively a \link{GRanges} +object with gene coordinates. 
If the \code{library_type}, specified by \code{FilterParam}, is \code{unstranded} then the \link{TxDb} will -be used to correct the strandness relative to the reference and is a required +be used to correct the strandness relative to the reference and is a +required parameter.} \item{snp_db}{either a \link{SNPlocs}, \link{GPos}, or \link{GRanges} object. If supplied, will be used to exclude polymorphic positions prior to calculating the AEI. If \code{calc_AEI()} will be used many times, one will save time by first -identifying SNPs that overlap the supplied \code{alu_ranges}, and passing these as -a \link{GRanges} to \code{snp_db} rather than supplying all known SNPs (see +identifying SNPs that overlap the supplied \code{alu_ranges}, and passing these +as a \link{GRanges} to \code{snp_db} rather than supplying all known SNPs (see \code{\link[=get_overlapping_snps]{get_overlapping_snps()}}).} \item{param}{object of class \code{\link[=FilterParam]{FilterParam()}} which specify various @@ -51,7 +52,8 @@ A named list containing: \itemize{ \item \code{AEI}: a matrix of AEI index values computed for all allelic combinations, one row for each supplied bam file. -\item \code{AEI_per_chrom}: a data.frame containing values computed for each chromosome +\item \code{AEI_per_chrom}: a data.frame containing values computed for each +chromosome } } \description{ diff --git a/man/calc_confidence.Rd b/man/calc_confidence.Rd index 435c599..5a8bfb8 100644 --- a/man/calc_confidence.Rd +++ b/man/calc_confidence.Rd @@ -15,7 +15,8 @@ calc_confidence( ) } \arguments{ -\item{se}{\code{SummarizedExperiment::SummarizedExperiment} containing editing sites} +\item{se}{\code{SummarizedExperiment::SummarizedExperiment} containing editing +sites} \item{edit_to}{edited base} @@ -24,8 +25,8 @@ calc_confidence( \item{per_sample}{if TRUE, calculate confidence per sample, otherwise edited and non-edited counts will be summed across all samples.} -\item{exp_fraction}{Numeric value between 0 and 1, specifying the expected error -rate} +\item{exp_fraction}{Numeric value between 0 and 1, specifying the expected +error rate} \item{alpha}{Pseudo-count to add to non-edited base counts} @@ -37,9 +38,9 @@ or rowData column named "confidence" depending on whether confidence is calculated \code{per_sample}. } \description{ -Calculate a confidence score based on a Bayesian inverse probability -model as described by Washburn et al. Cell Reports. 2015, and implemented -in the SAILOR pipeline. +Calculate a confidence score based on a Bayesian inverse +probability model as described by Washburn et al. Cell Reports. 2015, and +implemented in the SAILOR pipeline. } \examples{ rse_adar_ifn <- mock_rse() @@ -48,7 +49,11 @@ calc_confidence(rse_adar_ifn, per_sample = TRUE) } \references{ -Washburn MC, Kakaradov B, Sundararaman B, Wheeler E, Hoon S, Yeo GW, Hundley HA. The dsRBP and inactive editor ADR-1 utilizes dsRNA binding to regulate A-to-I RNA editing across the C. elegans transcriptome. Cell Rep. 2014 Feb 27;6(4):599-607. doi: 10.1016/j.celrep.2014.01.011. Epub 2014 Feb 6. PMID: 24508457; PMCID: PMC3959997. +Washburn MC, Kakaradov B, Sundararaman B, Wheeler E, Hoon S, Yeo GW, Hundley +HA. The dsRBP and inactive editor ADR-1 utilizes dsRNA binding to regulate +A-to-I RNA editing across the C. elegans transcriptome. Cell Rep. 2014 +Feb 27;6(4):599-607. doi: 10.1016/j.celrep.2014.01.011. Epub 2014 Feb 6. +PMID: 24508457; PMCID: PMC3959997. 
SAILOR pipeline: https://github.com/YeoLab/sailor } diff --git a/man/calc_edit_frequency.Rd b/man/calc_edit_frequency.Rd index ad1802c..def9d91 100644 --- a/man/calc_edit_frequency.Rd +++ b/man/calc_edit_frequency.Rd @@ -18,18 +18,18 @@ calc_edit_frequency( \item{rse}{A \link{RangedSummarizedExperiment} object created by \code{\link[=pileup_sites]{pileup_sites()}}} \item{edit_from}{This should correspond to a nucleotide or assay -(\code{A}, \code{C}, \code{G}, \code{T}, \code{Ref}, or \code{Alt}) you expect in the reference. Ex. for A to I -editing events, this would be \code{A}.} +(\code{A}, \code{C}, \code{G}, \code{T}, \code{Ref}, or \code{Alt}) you expect in the reference. Ex. for +A to I editing events, this would be \code{A}.} \item{edit_to}{This should correspond to a nucleotide or assay -(\code{A}, \code{C}, \code{G}, \code{T}, \code{Ref}, or \code{Alt}) you expect in the editing site. Ex. for A -to I editing events, this would be \code{G}.} +(\code{A}, \code{C}, \code{G}, \code{T}, \code{Ref}, or \code{Alt}) you expect in the editing site. Ex. +for A to I editing events, this would be \code{G}.} -\item{drop}{If \code{TRUE}, the \link{RangedSummarizedExperiment} returned will only retain sites -matching the specified \code{edit_from} and \code{edit_to} bases.} +\item{drop}{If \code{TRUE}, the \link{RangedSummarizedExperiment} returned will only +retain sites matching the specified \code{edit_from} and \code{edit_to} bases.} -\item{replace_na}{If \code{TRUE}, \code{NA} and \code{NaN} editing frequencies will be coerced to -\code{0}.} +\item{replace_na}{If \code{TRUE}, \code{NA} and \code{NaN} editing frequencies will be +coerced to \code{0}.} \item{edit_frequency}{The edit frequency cutoff used when calculating the number of sites. Set to \code{0} to require any non-zero editing frequency. The diff --git a/man/calc_scAEI.Rd b/man/calc_scAEI.Rd index 2c93f24..00f6f67 100644 --- a/man/calc_scAEI.Rd +++ b/man/calc_scAEI.Rd @@ -20,15 +20,17 @@ calc_scAEI( get_scAEI_sites(fasta, genes, alus, edit_from = "A", edit_to = "G") } \arguments{ -\item{bamfiles}{a path to a BAM file (for 10x libraries), or a vector of paths -to BAM files (smart-seq2). Can be supplied as a character vector, \code{BamFile}, or -\code{BamFileList}.} +\item{bamfiles}{a path to a BAM file (for 10x libraries), or a vector of +paths to BAM files (smart-seq2). Can be supplied as a character vector, +\code{BamFile}, or \code{BamFileList}.} -\item{sites}{a GRanges object produced by \code{\link[=get_scAEI_sites]{get_scAEI_sites()}} containing sites to process.} +\item{sites}{a GRanges object produced by \code{\link[=get_scAEI_sites]{get_scAEI_sites()}} containing +sites to process.} -\item{cell_barcodes}{A character vector of single cell barcodes to process. If -processing multiple BAM files (e.g. smart-seq-2), provide a character vector -of unique identifiers for each input BAM, to name each BAM file in the output files.} +\item{cell_barcodes}{A character vector of single cell barcodes to process. +If processing multiple BAM files (e.g. 
smart-seq-2), provide a character
+vector of unique identifiers for each input BAM, to name each BAM file in the
+output files.}
 
 \item{param}{object of class \code{\link[=FilterParam]{FilterParam()}} which specify various
 filters to apply to reads and sites during pileup.}
@@ -51,18 +53,18 @@ If NULL, a temporary directory will be used.}
 
 \item{fasta}{Path to a genome fasta file}
 
-\item{genes}{A \code{GRanges} object with gene coordinates.Alternatively a \code{TxDb} object,
-which if supplied, will be used extract gene coordinates.}
+\item{genes}{A \code{GRanges} object with gene coordinates. Alternatively a \code{TxDb}
+object, which if supplied, will be used to extract gene coordinates.}
 
 \item{alus}{\code{GRanges} with repeat regions to query for calculating the AEI,
-typically ALU repeats. The strand of the supplied intervals will be ignored for
-defining repeat regions.}
+typically ALU repeats. The strand of the supplied intervals will be ignored
+for defining repeat regions.}
 }
 \value{
 A \code{DataFrame} containing computed \code{AEI} values,
-count of editing events (\code{n_alt}), and count of reference events (\code{n_ref}) per cell.
-If \code{return_sce} is \code{TRUE}, then a \code{SingleCellExperiment} is returned with the
-AEI values stored in the \code{colData}.
+count of editing events (\code{n_alt}), and count of reference events (\code{n_ref})
+per cell. If \code{return_sce} is \code{TRUE}, then a \code{SingleCellExperiment} is
+returned with the AEI values stored in the \code{colData}.
 }
 \description{
 The Adenosine Editing Index describes the magnitude of A-to-I
@@ -70,9 +72,9 @@ editing in a sample. The index is a weighted average of editing events
 (G bases) observed at A positions. The vast majority A-to-I editing occurs
 in ALU elements in the human genome, and these regions have a high A-to-I
 editing signal compared to other regions such as coding exons. This
-function will examine enumerate edited and non-edited base counts at the supplied
-sites and return summary AEI metric per cell. Potential editing sites within
-repeat regions can be generated using \code{get_scAEI_sites()}.
+function will enumerate edited and non-edited base counts at the supplied
+sites and return a summary AEI metric per cell. Potential editing sites
+within repeat regions can be generated using \code{get_scAEI_sites()}.
} \examples{ suppressPackageStartupMessages(library(Rsamtools)) @@ -92,7 +94,7 @@ genes_gr <- GRanges(c( )) # alu intervals -alus_gr <- GRanges(c( +alus_gr <- GRanges(c( "2:110-380", "2:510-600", "2:610-670" @@ -104,8 +106,10 @@ fa_fn <- raer_example("mouse_tiny.fasta") # get positions of potential A -> G changes in alus sites <- get_scAEI_sites(fa_fn, genes_gr, alus_gr) -fp <- FilterParam(library_type = "fr-second-strand", - min_mapq = 255) +fp <- FilterParam( + library_type = "fr-second-strand", + min_mapq = 255 +) calc_scAEI(bam_fn, sites, cbs, fp) } diff --git a/man/correct_strand.Rd b/man/correct_strand.Rd index e891a49..86b5b1a 100644 --- a/man/correct_strand.Rd +++ b/man/correct_strand.Rd @@ -7,8 +7,8 @@ correct_strand(rse, genes_gr) } \arguments{ -\item{rse}{RangedSummarizedExperiment object containing editing sites processed with -"unstranded" setting} +\item{rse}{RangedSummarizedExperiment object containing editing sites +processed with "unstranded" setting} \item{genes_gr}{GRanges object containing reference features to annotate the strand of the editing sites.} diff --git a/man/filter_clustered_variants.Rd b/man/filter_clustered_variants.Rd index 8dc09df..f3add80 100644 --- a/man/filter_clustered_variants.Rd +++ b/man/filter_clustered_variants.Rd @@ -12,7 +12,8 @@ filter_clustered_variants( ) } \arguments{ -\item{rse}{\code{SummarizedExperiment::SummarizedExperiment} containing editing sites} +\item{rse}{\code{SummarizedExperiment::SummarizedExperiment} containing editing +sites} \item{txdb}{\code{GenomicFeatures::TxDb}} @@ -27,10 +28,11 @@ variants} object dependent on filtering applied. } \description{ -Sequence variants of multiple allele types (e.g., \code{A -> G}, \code{A -> C}) -proximal to a putative editing site can be indicative of a region prone to mis-alignment -artifacts. Sites will be removed if variants of multiple allele types are present -within a given distance in genomic or transcriptome coordinate space. +Sequence variants of multiple allele types (e.g., \code{A -> G}, +\code{A -> C}) proximal to a putative editing site can be indicative of a region +prone to mis-alignment artifacts. Sites will be removed if variants of +multiple allele types are present within a given distance in genomic or +transcriptome coordinate space. } \examples{ library(GenomicFeatures) diff --git a/man/filter_splice_variants.Rd b/man/filter_splice_variants.Rd index ac03a04..33b5a4b 100644 --- a/man/filter_splice_variants.Rd +++ b/man/filter_splice_variants.Rd @@ -45,7 +45,6 @@ txdb <- makeTxDbFromGRanges(gr) filter_splice_variants(rse_adar_ifn, txdb) - } \seealso{ Other se-filters: diff --git a/man/find_de_sites.Rd b/man/find_de_sites.Rd index 66a51e6..098346c 100644 --- a/man/find_de_sites.Rd +++ b/man/find_de_sites.Rd @@ -37,7 +37,8 @@ a variable in your \code{condition_col} of \code{colData(deobj)}.} \value{ A named list: \itemize{ -\item \code{de_obj}: The \code{edgeR} or \code{deseq} object used for differential editing analysis +\item \code{de_obj}: The \code{edgeR} or \code{deseq} object used for differential editing +analysis \item \code{results_full}: Unfiltered differential editing results \item \code{sig_results}: Filtered differential editing (FDR < 0.05) \item \code{model_matrix}: The model matrix used for generating DE results @@ -46,7 +47,8 @@ A named list: \description{ Use \code{edgeR} or \code{DESeq2} to perform differential editing analysis. This will work for designs that have 1 treatment and 1 -control group. 
For more complex designs, we suggest you perform your own modeling.
+control group. For more complex designs, we suggest you perform your own
+modeling.
 }
 \examples{
 library(SummarizedExperiment)
@@ -64,7 +66,9 @@ rse$condition <- substr(rse$sample, 1, 2)
 rse <- calc_edit_frequency(rse)
 dse <- make_de_object(rse)
 
-res <- find_de_sites(dse, condition_control = "WT", condition_treatment = "KO")
+res <- find_de_sites(dse,
+    condition_control = "WT",
+    condition_treatment = "KO")
 res$sig_results[1:3, ]
 }
 
diff --git a/man/find_mispriming_sites.Rd b/man/find_mispriming_sites.Rd
index f8a188d..4c9d6b7 100644
--- a/man/find_mispriming_sites.Rd
+++ b/man/find_mispriming_sites.Rd
@@ -39,23 +39,26 @@ find_mispriming_sites(
 \value{
 A GenomicsRanges containing regions enriched for putative mispriming
 events. The \code{n_reads} column specifies the number of polyA trimmed reads
-overlapping the mispriming region. \code{mean_pal} indicates the mean length of polyA
-sequence trimmed from reads overlapping the region. The \code{n_regions} column specifies the number
-overlapping independent regions found in each chunk (dictated by \code{n_reads_per_chunk}).
-The \code{A_freq} column indicates the frequency of A bases within the region.
+overlapping the mispriming region. \code{mean_pal} indicates the mean length of
+polyA sequence trimmed from reads overlapping the region. The \code{n_regions}
+column specifies the number of overlapping independent regions found in each
+chunk (dictated by \code{n_reads_per_chunk}). The \code{A_freq} column indicates the
+frequency of A bases within the region.
 }
 \description{
-OligodT will prime at A-rich regions in an RNA. Reverse transcription
-from these internal priming sites will install an oligodT sequence at the 3' end
-of the cDNA. Sequence variants within these internal priming sites are enriched
-for variants converting the genomic sequence to the A encoded by the oligodT primer.
-Trimming poly(A) from the 3' ends of reads reduces but does not eliminate these signals
+OligodT will prime at A-rich regions in an RNA. Reverse
+transcription from these internal priming sites will install an oligodT
+sequence at the 3' end of the cDNA. Sequence variants within these internal
+priming sites are enriched for variants converting the genomic sequence to
+the A encoded by the oligodT primer. Trimming poly(A) from the 3' ends of
+reads reduces but does not eliminate these signals.
 
-This function will identify regions that are enriched for mispriming events. Reads
-that were trimmed to remove poly(A) (encoded in the pa tag by 10x genomics) are
-identified. The aligned 3' positions of these reads are counted, and sites passing
-thresholds (at least 2 reads) are retained as possible sites of mispriming. Be default
-regions 5 bases upstream and 20 bases downstream of these putative mispriming sites
+This function will identify regions that are enriched for mispriming events.
+Reads that were trimmed to remove poly(A) (encoded in the pa tag by
+10x Genomics) are identified. The aligned 3' positions of these reads are
+counted, and sites passing thresholds (at least 2 reads) are retained as
+possible sites of mispriming. By default, regions 5 bases upstream and
+20 bases downstream of these putative mispriming sites
 are returned.
} \examples{ diff --git a/man/find_scde_sites.Rd b/man/find_scde_sites.Rd index 7f46923..6f619c9 100644 --- a/man/find_scde_sites.Rd +++ b/man/find_scde_sites.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/differential_editing.R \name{find_scde_sites} \alias{find_scde_sites} -\title{Identify sites with differential editing between cells in single cell datasets} +\title{Identify sites with differential editing between cells in single cell +datasets} \usage{ find_scde_sites(sce, group, rowData = FALSE, BPPARAM = SerialParam(), ...) } @@ -11,23 +12,25 @@ find_scde_sites(sce, group, rowData = FALSE, BPPARAM = SerialParam(), ...) \item{group}{column name from colData used to define groups to compare.} -\item{rowData}{if TRUE, \link{rowData} from the input \link{SingleCellExperiment} will be -included in the output DataFrames} +\item{rowData}{if TRUE, \link{rowData} from the input \link{SingleCellExperiment} will +be included in the output DataFrames} -\item{BPPARAM}{BiocParallel backend for control how paralllel computations are -performed.} +\item{BPPARAM}{BiocParallel backend for control how parallel computations +are performed.} \item{...}{Additional arguments passed to \link[scran:combineMarkers]{scran::combineMarkers}} } \value{ -A named list of \link{DataFrame}s containing results for each cluster specified by \code{group}. -The difference in editing frequencies between cluster pairs are denoted as \code{dEF}. -See \link[scran:combineMarkers]{scran::combineMarkers} for a description of additional output fields. +A named list of \link{DataFrame}s containing results for each cluster specified by +\code{group}. The difference in editing frequencies between cluster pairs are +denoted as \code{dEF}. See \link[scran:combineMarkers]{scran::combineMarkers} for a description of additional +output fields. } \description{ Compare editing frequencies between clusters or celltypes. REF and ALT counts -from each cluster are pooled to create pseudobulk estimates. Each pair of clusters -are compared using fisher exact tests. Statistics are aggregated across each pairwise +from each cluster are pooled to create pseudobulk estimates. Each pair of +clusters are compared using fisher exact tests. Statistics are aggregated +across each pairwise comparison using \link[scran:combineMarkers]{scran::combineMarkers}. } \examples{ diff --git a/man/make_de_object.Rd b/man/make_de_object.Rd index 4d4aaa0..502b33c 100644 --- a/man/make_de_object.Rd +++ b/man/make_de_object.Rd @@ -17,18 +17,18 @@ make_de_object( \item{rse}{A \link{RangedSummarizedExperiment} object} \item{edit_from}{This should correspond to a nucleotide or assay -(\code{A}, \code{C}, \code{G}, \code{T}, \code{Ref}, or \code{Alt}) you expect in the reference. Ex. for A to I -editing events, this would be \code{A}.} +(\code{A}, \code{C}, \code{G}, \code{T}, \code{Ref}, or \code{Alt}) you expect in the reference. +Ex. for A to I editing events, this would be \code{A}.} \item{edit_to}{This should correspond to a nucleotide or assay -(\code{A}, \code{C}, \code{G}, \code{T}, \code{Ref}, or \code{Alt}) you expect in the editing site. Ex. for A -to I editing events, this would be \code{G}.} +(\code{A}, \code{C}, \code{G}, \code{T}, \code{Ref}, or \code{Alt}) you expect in the editing site. +Ex. for A to I editing events, this would be \code{G}.} \item{min_prop}{The minimum required proportion of reads edited at a site. 
At least \code{min_samples} need to pass this to keep the site.} -\item{max_prop}{The maximum allowable proportion of reads edited at a site. At -least \code{min_samples} need to pass this to keep the site.} +\item{max_prop}{The maximum allowable proportion of reads edited at a site. +At least \code{min_samples} need to pass this to keep the site.} \item{min_samples}{The minimum number of samples passing the \code{min_prop} and \code{max_prop} cutoffs to keep a site.} diff --git a/man/mock_rse.Rd b/man/mock_rse.Rd index df851ae..51a430c 100644 --- a/man/mock_rse.Rd +++ b/man/mock_rse.Rd @@ -13,9 +13,9 @@ mock_rse() RangedSummarizedExperiment populated with pileup data } \description{ -A RangedSummarizedExperiment containing a subset of data from an RNA-seq experiment -to measure the effects of IFN treatment of cell lines with wild-type -or ADAR1-KO. +A RangedSummarizedExperiment containing a subset of data from an RNA-seq +experiment to measure the effects of IFN treatment of cell lines with +wild-type or ADAR1-KO. } \examples{ mock_rse() diff --git a/man/pileup_cells.Rd b/man/pileup_cells.Rd index b6e787c..4964916 100644 --- a/man/pileup_cells.Rd +++ b/man/pileup_cells.Rd @@ -19,19 +19,20 @@ pileup_cells( ) } \arguments{ -\item{bamfiles}{a path to a BAM file (for droplet scRNA-seq), or a vector of paths -to BAM files (Smart-seq2). Can be supplied as a character vector, \link{BamFile}, or -\link{BamFileList}.} +\item{bamfiles}{a path to a BAM file (for droplet scRNA-seq), or a vector of +paths to BAM files (Smart-seq2). Can be supplied as a character vector, +\link{BamFile}, or \link{BamFileList}.} \item{sites}{a GRanges object containing sites to process. See examples for valid formatting.} -\item{cell_barcodes}{A character vector of single cell barcodes to process. If -processing multiple BAM files (e.g. Smart-seq2), provide a character vector -of unique identifiers for each input BAM, to name each BAM file in the output files.} +\item{cell_barcodes}{A character vector of single cell barcodes to process. +If processing multiple BAM files (e.g. Smart-seq2), provide a character +vector of unique identifiers for each input BAM, to name each BAM file in the +output files.} -\item{output_directory}{Output directory for output matrix files. The directory -will be generated if it doesn't exist.} +\item{output_directory}{Output directory for output matrix files. The +directory will be generated if it doesn't exist.} \item{chroms}{A character vector of chromosomes to process. If supplied, only sites present in the listed chromosomes will be processed} @@ -43,11 +44,12 @@ sites present in the listed chromosomes will be processed} \item{param}{object of class \code{\link[=FilterParam]{FilterParam()}} which specify various filters to apply to reads and sites during pileup. Note that the \code{min_depth} and \code{min_variant_reads} parameters if set > 0 specify the number of reads -from any cell required in order to report a site. E.g. if \code{min_variant_reads} is -set to 2, then at least 2 reads (from any cell) must have a variant in order -to report the site. Setting \code{min_depth} and \code{min_variant_reads} to 0 reports -all sites present in the \code{sites} object. The following options are not enabled -for pileup_cells(): \code{max_mismatch_type}, \code{homopolymer_len}, and \code{min_allelic_freq}.} +from any cell required in order to report a site. E.g. if +\code{min_variant_reads} is set to 2, then at least 2 reads (from any cell) must +have a variant in order to report the site. 
Setting \code{min_depth} and +\code{min_variant_reads} to 0 reports all sites present in the \code{sites} object. +The following options are not enabled for pileup_cells(): +\code{max_mismatch_type}, \code{homopolymer_len}, and \code{min_allelic_freq}.} \item{BPPARAM}{BiocParallel instance. Parallel computation occurs across chromosomes.} @@ -65,19 +67,19 @@ populated with two assays, \code{nRef} and \code{nAlt}, which represent base cou for the reference and alternate alleles. The \code{\link[=rowRanges]{rowRanges()}} will contain the genomic interval for each site, along with \code{REF} and \code{ALT} columns. The rownames will be populated with the format -\verb{site_[seqnames]_[position(1-based)]_[strand]_[allele]}, with \code{strand} being encoded -as 1 = +, 2 = -, and 3 = *, and allele being \code{REF} + \code{ALT}. +\verb{site_[seqnames]_[position(1-based)]_[strand]_[allele]}, with \code{strand} +being encoded as 1 = +, 2 = -, and 3 = *, and allele being \code{REF} + \code{ALT}. If \code{return_sce} is \code{FALSE} then a character vector of paths to the sparseMatrix files (\code{barcodes.txt.gz}, \code{sites.txt.gz}, \code{counts.mtx.gz}), will be returned. These files can be imported using \code{\link[=read_sparray]{read_sparray()}}. } \description{ -This function processes scRNA-seq library to enumerate base counts -for Reference (unedited) or Alternate ( -edited) bases at specified sites in single cells. \code{pileup_cells} can process -droplet scRNA-seq libraries, from a BAM file containing a cell-barcode and UMI, -or well-based libraries that do not contain cell-barcodes. +This function processes scRNA-seq library to enumerate base +counts for Reference (unedited) or Alternate (edited) bases at specified +sites in single cells. \code{pileup_cells} can process droplet scRNA-seq +libraries, from a BAM file containing a cell-barcode and UMI, or well-based +libraries that do not contain cell-barcodes. The \code{sites} parameter specifies sites to quantify. This must be a \link{GRanges} object with 1 base intervals, a strand (+ or -), and supplemented with @@ -112,8 +114,10 @@ sce many_small_bams <- rep(bam_fn, 10) bam_ids <- LETTERS[1:10] -fp <- FilterParam(library_type = "unstranded", - remove_overlaps = TRUE) +fp <- FilterParam( + library_type = "unstranded", + remove_overlaps = TRUE +) pileup_cells(many_small_bams, sites = gr, diff --git a/man/pileup_sites.Rd b/man/pileup_sites.Rd index 1b139c9..5021619 100644 --- a/man/pileup_sites.Rd +++ b/man/pileup_sites.Rd @@ -42,22 +42,22 @@ FilterParam( ) } \arguments{ -\item{bamfiles}{a character vector, \link{BamFile} or \link{BamFileList} indicating 1 or -more BAM files to process. If named, the names will be included in the \link{colData} -of the \link{RangedSummarizedExperiment} as a \code{sample} column, otherwise the names will -be taken from the basename of the BAM file.} +\item{bamfiles}{a character vector, \link{BamFile} or \link{BamFileList} indicating 1 +or more BAM files to process. If named, the names will be included in the +\link{colData} of the \link{RangedSummarizedExperiment} as a \code{sample} column, otherwise +the names will be taken from the basename of the BAM file.} -\item{fasta}{path to genome fasta file used for read alignment. Can be provided in -compressed gzip or bgzip format.} +\item{fasta}{path to genome fasta file used for read alignment. 
Can be +provided in compressed gzip or bgzip format.} \item{sites}{a \link{GRanges} object containing regions or sites to process.} -\item{region}{samtools region query string (i.e. \code{chr1:100-1000}). Can be combined -with sites, in which case sites will be filtered to keep only sites within the -region.} +\item{region}{samtools region query string (i.e. \code{chr1:100-1000}). Can be +combined with sites, in which case sites will be filtered to keep only sites +within the region.} -\item{chroms}{chromosomes to process, provided as a character vector. Not to be used with the -region parameter.} +\item{chroms}{chromosomes to process, provided as a character vector. Not to +be used with the region parameter.} \item{param}{object of class \code{\link[=FilterParam]{FilterParam()}} which specify various filters to apply to reads and sites during pileup.} @@ -89,7 +89,8 @@ input BAM file can be provided as a vector.} to generate.} \item{only_keep_variants}{if TRUE, then only variant sites will be reported -(FALSE by default). Values for each input BAM file can be provided as a vector.} +(FALSE by default). Values for each input BAM file can be provided as a +vector.} \item{trim_5p}{Bases to trim from 5' end of read alignments} @@ -119,23 +120,25 @@ c(X, Y).} \item{read_bqual}{Exclude read if more than X percent of the bases have base qualities less than Y. Numeric vector of length 2. e.g. c(0.25, 20)} -\item{min_variant_reads}{Required number of reads containing a variant for a site -to be reported. Calculated per bam file, such that if 1 bam file has >= min_variant_reads, -then the site will be reported.} +\item{min_variant_reads}{Required number of reads containing a variant for a +site to be reported. Calculated per bam file, such that if 1 bam file has >= +min_variant_reads, then the site will be reported.} -\item{min_allelic_freq}{minimum allelic frequency required for a variant to be -reported in ALT assay.} +\item{min_allelic_freq}{minimum allelic frequency required for a variant to +be reported in ALT assay.} -\item{report_multiallelic}{if TRUE, report sites with multiple variants passing -filters. If FALSE, site will not be reported.} +\item{report_multiallelic}{if TRUE, report sites with multiple variants +passing filters. If FALSE, site will not be reported.} -\item{remove_overlaps}{if TRUE, enable read pair overlap detection, which will count only -1 read in regions where read pairs overlap using the htslib algorithm. In brief -for each overlapping base pair the base quality of the base with the lower quality -is set to 0, which discards it from being counted.} +\item{remove_overlaps}{if TRUE, enable read pair overlap detection, which +will count only 1 read in regions where read pairs overlap using the htslib +algorithm. In brief for each overlapping base pair the base quality of the +base with the lower quality is set to 0, which discards it from being +counted.} } \value{ -A \link{RangedSummarizedExperiment} object populated with multiple assays: +A \link{RangedSummarizedExperiment} object populated with +multiple assays: \itemize{ \item \code{ALT}: Alternate base(s) found at each position \item \code{nRef}: # of reads supporting the reference base @@ -162,11 +165,12 @@ The rownames will be populated with the format as 1 = +, 2 = -, and 3 = *. } \description{ -This function uses a pileup routine to examine numerate base counts from -alignments at specified sites, regions, or across all read alignments, from one or -more BAM files. 
Alignment and site filtering -options are controlled by the \code{FilterParam} class.A \link{RangedSummarizedExperiment} object -is returned, populated with base count statistics for each supplied BAM file. +This function uses a pileup routine to examine numerate base +counts from alignments at specified sites, regions, or across all read +alignments, from one or more BAM files. Alignment and site filtering +options are controlled by the \code{FilterParam} class. A +\link{RangedSummarizedExperiment} object is returned, populated with base count +statistics for each supplied BAM file. } \examples{ library(SummarizedExperiment) diff --git a/man/read_sparray.Rd b/man/read_sparray.Rd index 235ee2a..9f11dda 100644 --- a/man/read_sparray.Rd +++ b/man/read_sparray.Rd @@ -50,7 +50,7 @@ outdir <- tempdir() bai <- indexBam(bam_fn) fp <- FilterParam(library_type = "fr-second-strand") -mtx_fns <- pileup_cells(bam_fn, gr, cbs, outdir, return_sce = FALSE) +mtx_fns <- pileup_cells(bam_fn, gr, cbs, outdir, return_sce = FALSE) sce <- read_sparray(mtx_fns[1], mtx_fns[2], mtx_fns[3]) sce diff --git a/tests/testthat/helper-astyle.R b/tests/testthat/helper-astyle.R index a2d63bc..b6bdfac 100644 --- a/tests/testthat/helper-astyle.R +++ b/tests/testthat/helper-astyle.R @@ -1,31 +1,31 @@ vcapply <- function(X, FUN, ..., USE.NAMES = TRUE) { - vapply(X = X, FUN = FUN, FUN.VALUE = character(1L), ..., USE.NAMES = USE.NAMES) + vapply(X = X, FUN = FUN, FUN.VALUE = character(1L), ..., USE.NAMES = USE.NAMES) } astyle <- function(extra_args = character()) { - astyle_cmd <- "astyle" - if (Sys.which(astyle_cmd) == "") { - testthat::skip("astyle not found") - } + astyle_cmd <- "astyle" + if (Sys.which(astyle_cmd) == "") { + testthat::skip("astyle not found") + } - astyle_args <- c( - "-n", - "--indent=spaces=2", - "--indent-namespaces", - "--indent-preproc-block", - "--unpad-paren", - "--pad-header", - "--min-conditional-indent=0", - "--align-pointer=type", - "--align-reference=type" - ) + astyle_args <- c( + "-n", + "--indent=spaces=2", + "--indent-namespaces", + "--indent-preproc-block", + "--unpad-paren", + "--pad-header", + "--min-conditional-indent=0", + "--align-pointer=type", + "--align-reference=type" + ) - src_path <- normalizePath(vcapply(c("../../src"), testthat::test_path)) - src_files <- dir(src_path, "[.](?:cpp|h|c)$", recursive = TRUE, full.names = TRUE) - astyle_files <- grep("(?:src/ext|init.c)", src_files, value = TRUE, invert = TRUE) - output <- system2(astyle_cmd, c(astyle_args, astyle_files, extra_args), stdout = TRUE, stderr = TRUE) - unchanged <- grepl("^Unchanged", output) - if (any(!unchanged)) { - warning(paste(output[!unchanged], collapse = "\n")) - } + src_path <- normalizePath(vcapply(c("../../src"), testthat::test_path)) + src_files <- dir(src_path, "[.](?:cpp|h|c)$", recursive = TRUE, full.names = TRUE) + astyle_files <- grep("(?:src/ext|init.c)", src_files, value = TRUE, invert = TRUE) + output <- system2(astyle_cmd, c(astyle_args, astyle_files, extra_args), stdout = TRUE, stderr = TRUE) + unchanged <- grepl("^Unchanged", output) + if (any(!unchanged)) { + warning(paste(output[!unchanged], collapse = "\n")) + } } diff --git a/tests/testthat/test_aei.R b/tests/testthat/test_aei.R index ee8a0c2..1fb8b25 100644 --- a/tests/testthat/test_aei.R +++ b/tests/testthat/test_aei.R @@ -17,7 +17,7 @@ mock_snps <- GRanges("SPCS3", IRanges(matchPattern("A", scanFa(fafn)[["SPCS3"]]) mock_snps_gpos <- as(mock_snps, "GPos") mock_genes <- mock_alu_ranges -strand(mock_genes) <- c("-","+", "-") 
+strand(mock_genes) <- c("-", "+", "-") test_that("calc_aei basic options work", { fp <- FilterParam(library_type = "fr-first-strand") @@ -61,7 +61,6 @@ test_that("calc_aei basic options work", { us_aei <- calc_AEI(bams, fafn, mock_alu_ranges, mock_genes, param = fp) expect_true(all(us_aei$AEI[, "A_G"] > us_aei$AEI[, "T_C"])) - }) test_that("BamFile class works", { diff --git a/tests/testthat/test_de.R b/tests/testthat/test_de.R index 2131da7..f610d8f 100644 --- a/tests/testthat/test_de.R +++ b/tests/testthat/test_de.R @@ -23,8 +23,9 @@ test_that("calc_edit_frequency works", { # depth already exists expect_message(calc_edit_frequency(rse)) expect_error(calc_edit_frequency(rse, - edit_from = "garbage-in", - edit_to = "garbage-out")) + edit_from = "garbage-in", + edit_to = "garbage-out" + )) }) test_that("make_de_object works", { diff --git a/tests/testthat/test_pileup_cells.R b/tests/testthat/test_pileup_cells.R index 58aa593..0c288bd 100644 --- a/tests/testthat/test_pileup_cells.R +++ b/tests/testthat/test_pileup_cells.R @@ -44,15 +44,17 @@ test_that("basic functionality works", { expect_warning(pileup_cells(bam_fn, gr, c("non", "existent", "barcodes"), outdir, - param = FilterParam(library_type = "fr-second-strand", - min_depth = 1) + param = FilterParam( + library_type = "fr-second-strand", + min_depth = 1 + ) )) # returns empty matrices sce <- pileup_cells(bam_fn, gr, - c("non", "existent", "barcodes"), - outdir, - param = fp + c("non", "existent", "barcodes"), + outdir, + param = fp ) expect_equal(sum(assay(sce, "nRef")) + sum(assay(sce, "nAlt")), 0L) @@ -144,12 +146,16 @@ test_that("if multiple bams are supplied, require nbam = # of barcodes", { test_that("output files are generated", { sce <- pileup_cells(bam_fn, gr, cbs, outdir) - mtx_fns <- file.path(outdir, - c("counts.mtx.gz", - "sites.txt.gz", - "barcodes.txt.gz")) + mtx_fns <- file.path( + outdir, + c( + "counts.mtx.gz", + "sites.txt.gz", + "barcodes.txt.gz" + ) + ) expect_true(all(file.exists(mtx_fns))) - are_gzipped <- lapply(mtx_fns, function(x){ + are_gzipped <- lapply(mtx_fns, function(x) { fo <- file(x) xx <- summary(fo)$class == "gzfile" close(fo) @@ -161,10 +167,14 @@ test_that("output files are generated", { test_that("read_sparray reconstructs rse from output files", { sce <- pileup_cells(bam_fn, gr, cbs, outdir, param = FilterParam(min_depth = 0)) - mtx_fns <- file.path(outdir, - c("counts.mtx.gz", - "sites.txt.gz", - "barcodes.txt.gz")) + mtx_fns <- file.path( + outdir, + c( + "counts.mtx.gz", + "sites.txt.gz", + "barcodes.txt.gz" + ) + ) sp_sce <- read_sparray(mtx_fns[1], mtx_fns[2], mtx_fns[3], "coordinate") expect_true(identical(sce, sp_sce)) @@ -183,7 +193,6 @@ test_that("BamFile and BamFileList input work", { sce <- pileup_cells(bfl, gr, cbs, outdir) expect_true(is(sce, "SingleCellExperiment")) expect_equal(dim(sce), c(4, 556)) - }) test_that("custom indexes work", { @@ -194,12 +203,12 @@ test_that("custom indexes work", { bfl <- BamFileList(rep(bam_fn, 2), c(bai_2, bai_2)) sce <- pileup_cells(bfl, - sites = gr, - cell_barcodes = LETTERS[1:2], - cb_tag = NULL, - umi_tag = NULL, - outdir, - param = FilterParam(min_depth = 0) + sites = gr, + cell_barcodes = LETTERS[1:2], + cb_tag = NULL, + umi_tag = NULL, + outdir, + param = FilterParam(min_depth = 0) ) expect_true(all(gr == rowRanges(sce))) unlink(c(bai_1, bai_2)) @@ -208,15 +217,19 @@ test_that("custom indexes work", { # get larger set of sites to query fa_fn <- raer_example("mouse_tiny.fasta") -bulkfp <- FilterParam(min_mapq = 255L, - min_variant_reads = 1, - 
min_allelic_freq = 0.01, - only_keep_variants = TRUE, - report_multiallelic = FALSE, - library_type = "fr-second-strand", - bam_flags = scanBamFlag(isSecondaryAlignment = FALSE, - isSupplementaryAlignment = FALSE, - isNotPassingQualityControls = FALSE)) +bulkfp <- FilterParam( + min_mapq = 255L, + min_variant_reads = 1, + min_allelic_freq = 0.01, + only_keep_variants = TRUE, + report_multiallelic = FALSE, + library_type = "fr-second-strand", + bam_flags = scanBamFlag( + isSecondaryAlignment = FALSE, + isSupplementaryAlignment = FALSE, + isNotPassingQualityControls = FALSE + ) +) rse <- pileup_sites(bam_fn, fa_fn, chrom = "2", param = bulkfp) rowData(rse)$ALT <- assay(rse, "ALT")[, 1] sites <- rowRanges(rse) @@ -230,9 +243,10 @@ test_that("rownames are not duplicated", { test_that("multiple alleles per site can be queried", { fp <- FilterParam(library_type = "fr-second-strand", min_depth = 0) ol_sites <- GRanges(rep(c("2:261", "2:262", "2:279"), each = 2), - strand = rep(c("+", "-"), 3), - REF = c("G", "C", "C", "G", "G", "C"), - ALT = c("T", "A", "T", "A", "T", "A")) + strand = rep(c("+", "-"), 3), + REF = c("G", "C", "C", "G", "G", "C"), + ALT = c("T", "A", "T", "A", "T", "A") + ) sce <- pileup_cells(bam_fn, ol_sites, cbs, outdir, param = fp) expect_true(nrow(sce) == 6) expect_true(all(Matrix::rowSums(assay(sce, "nAlt")) > 0)) @@ -245,7 +259,8 @@ mouse_sam_hdr <- c( "@SQ\tSN:2\tLN:1115", "@SQ\tSN:6\tLN:400", "@SQ\tSN:11\tLN:493", - "@SQ\tSN:8\tLN:396") + "@SQ\tSN:8\tLN:396" +) change_base <- function(aln, pos, new_base) { rec <- strsplit(aln, "\t")[[1]] @@ -275,13 +290,16 @@ test_that("UMI consensus base selection works", { fp <- FilterParam(library_type = "fr-first-strand", min_depth = 0) ol_sites <- GRanges(c("2:20", "2:645", "2:646"), - strand = "+", - REF = "A", - ALT = "G") + strand = "+", + REF = "A", + ALT = "G" + ) # 1 reads support ref 1 alt sce <- pileup_cells(tbam, ol_sites, - "AGGGAGTAGGCTATCT-1", - outdir, param = fp) + "AGGGAGTAGGCTATCT-1", + outdir, + param = fp + ) expect_true(all(assay(sce, "nRef")[, 1] == c(0, 0, 1))) expect_true(all(assay(sce, "nAlt")[, 1] == c(0, 1, 0))) @@ -290,8 +308,10 @@ test_that("UMI consensus base selection works", { tbam <- asBam(tf, overwrite = TRUE) tbai <- indexBam(tbam) sce <- pileup_cells(tbam, ol_sites, - "AGGGAGTAGGCTATCT-1", - outdir, param = fp) + "AGGGAGTAGGCTATCT-1", + outdir, + param = fp + ) expect_true(all(assay(sce, "nRef")[, 1] == c(0, 1, 1))) expect_true(all(assay(sce, "nAlt")[, 1] == c(0, 0, 0))) @@ -302,17 +322,20 @@ test_that("UMI consensus base selection works", { tbai <- indexBam(tbam) sce <- pileup_cells(tbam, ol_sites, - "AGGGAGTAGGCTATCT-1", - outdir, param = fp) + "AGGGAGTAGGCTATCT-1", + outdir, + param = fp + ) expect_true(all(assay(sce, "nRef")[, 1] == c(0, 0, 1))) expect_true(all(assay(sce, "nAlt")[, 1] == c(0, 1, 0))) # no UMI tag disables consensus fp <- FilterParam(library_type = "fr-first-strand", min_base_quality = 10, min_depth = 0) sce <- pileup_cells(tbam, ol_sites, - "AGGGAGTAGGCTATCT-1", - umi_tag = NULL, - outdir, param = fp) + "AGGGAGTAGGCTATCT-1", + umi_tag = NULL, + outdir, param = fp + ) expect_true(all(assay(sce, "nRef")[, 1] == c(0, 2, 3))) expect_true(all(assay(sce, "nAlt")[, 1] == c(0, 1, 0))) @@ -323,15 +346,10 @@ test_that("UMI consensus base selection works", { tbai <- indexBam(tbam) sce <- pileup_cells(tbam, ol_sites, - "AGGGAGTAGGCTATCT-1", - outdir, param = fp) + "AGGGAGTAGGCTATCT-1", + outdir, + param = fp + ) expect_true(all(assay(sce, "nRef")[, 1] == c(0, 0, 1))) 
expect_true(all(assay(sce, "nAlt")[, 1] == c(0, 0, 0))) }) - - - - - - - diff --git a/tests/testthat/test_pileup_sites.R b/tests/testthat/test_pileup_sites.R index 9dc1008..28c1b57 100644 --- a/tests/testthat/test_pileup_sites.R +++ b/tests/testthat/test_pileup_sites.R @@ -189,8 +189,8 @@ check_nRef_calc <- function(input, nts_in = nts) { test_that("pileup check nRef and nAlt", { strds <- c( - "fr-first-strand", "fr-second-strand","unstranded" - ) + "fr-first-strand", "fr-second-strand", "unstranded" + ) for (strd in strds) { res <- pileup_sites(bamfn, fafn, @@ -618,7 +618,6 @@ test_that("BamFile and BamFileList input work", { res <- pileup_sites(bfl, fafn, sites) expect_equal(length(rowData(res)$REF), 182) expect_equal(length(assays(res)), 7) - }) test_that("custom indexes work", { diff --git a/tests/testthat/test_rse.R b/tests/testthat/test_rse.R index 88e82e7..eafac33 100644 --- a/tests/testthat/test_rse.R +++ b/tests/testthat/test_rse.R @@ -54,17 +54,24 @@ test_that("annot_snps works", { # check for snp matching # first 5 are SNPS, 6th is not - gr <- GRanges(c("1:14653", "1:14677", "1:16495", - "1:99166", "1:99180", "1:100000"), - strand = c(rep("+", 3), rep("-", 3)), - seqinfo = seqinfo(SNPlocs.Hsapiens.dbSNP144.GRCh38), - ALT = c("T", "T", "C", "A", "C", "C")) + gr <- GRanges( + c( + "1:14653", "1:14677", "1:16495", + "1:99166", "1:99180", "1:100000" + ), + strand = c(rep("+", 3), rep("-", 3)), + seqinfo = seqinfo(SNPlocs.Hsapiens.dbSNP144.GRCh38), + ALT = c("T", "T", "C", "A", "C", "C") + ) res <- annot_snps(gr, - SNPlocs.Hsapiens.dbSNP144.GRCh38, - genome = BSgenome.Hsapiens.NCBI.GRCh38) - expect_true("snp_matches_site" %in% colnames(mcols(res))) - expect_equal(decode(res$snp_matches_site), - c(TRUE, FALSE, TRUE, TRUE, FALSE, NA)) + SNPlocs.Hsapiens.dbSNP144.GRCh38, + genome = BSgenome.Hsapiens.NCBI.GRCh38 + ) + expect_true("snp_matches_site" %in% colnames(mcols(res))) + expect_equal( + decode(res$snp_matches_site), + c(TRUE, FALSE, TRUE, TRUE, FALSE, NA) + ) } } }) diff --git a/tests/testthat/test_scaei.R b/tests/testthat/test_scaei.R index 2bce289..49cc99d 100644 --- a/tests/testthat/test_scaei.R +++ b/tests/testthat/test_scaei.R @@ -17,7 +17,7 @@ genes_gr <- GRanges(c( )) # alu intervals -alus_gr <- GRanges(c( +alus_gr <- GRanges(c( "2:110-380", "2:510-600", "2:610-670" @@ -30,8 +30,10 @@ test_that("calc_scaei basic functions work", { s[strand(sites) == "-"] <- reverseComplement(s[strand(sites) == "-"]) expect_true(all(s == "A")) - fp <- FilterParam(library_type = "fr-second-strand", - min_mapq = 255) + fp <- FilterParam( + library_type = "fr-second-strand", + min_mapq = 255 + ) res <- calc_scAEI(bam_fn, sites, cbs, fp) expect_equal(nrow(res), 3L) expect_true(is(res, "DataFrame")) @@ -49,12 +51,13 @@ test_that("guard against no sites in output when processing unstranded", { sites <- get_scAEI_sites(fa_fn, genes_gr, alus_gr) - fp <- FilterParam(library_type = "unstranded", - min_mapq = 255) + fp <- FilterParam( + library_type = "unstranded", + min_mapq = 255 + ) res <- calc_scAEI(bam_fn, sites, cbs, fp) expect_equal(nrow(res), 3L) sites <- GRanges(seqnames = "2", IRanges(start = 1:10, end = 1:10), strand = "+", id = 1, gene_strand = "defined", REF = "A", ALT = "G") expect_error(expect_warning(calc_scAEI(bam_fn, sites, cbs, fp))) }) - diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R index 3179262..91d4ba9 100644 --- a/tests/testthat/test_utils.R +++ b/tests/testthat/test_utils.R @@ -2,8 +2,8 @@ test_that("fisher exact test works", { set.seed(42) x <- 
sample(0:1e4, 400, replace = TRUE) a_ref <- x[1:100] - a_alt <- x[101:200] - b_ref <- x[201:300] + a_alt <- x[101:200] + b_ref <- x[201:300] b_alt <- x[301:400] a <- cbind(a_ref, a_alt) b <- cbind(b_ref, b_alt) @@ -52,10 +52,11 @@ test_that("filter mispriming works", { expect_error(find_mispriming_sites(bam_fn, fa_fn, tag_values = "more-nonsense")) res <- find_mispriming_sites(bam_fn, fa_fn, - min_reads = 1, - pos_5p = 1, - pos_3p = 1, - verbose = FALSE) + min_reads = 1, + pos_5p = 1, + pos_3p = 1, + verbose = FALSE + ) expect_true(width(res) == 3L) }) @@ -92,7 +93,7 @@ test_that("correct_strand works", { ans <- correct_strand(rse, genes) alts <- assay(ans, "ALT")[, 1] - alts <- alts[alts != "-" & as.vector(strand(ans)) == "-"] + alts <- alts[alts != "-" & as.vector(strand(ans)) == "-"] og_alts <- assay(rse[names(alts), ], "ALT")[, 1] expect_true(all(og_alts != alts)) expect_true(all(nchar(og_alts) == nchar(alts))) @@ -107,7 +108,7 @@ test_that("correct_strand works", { expect_true(all(strand(minus_rse) == "-")) alts <- assay(minus_rse, "ALT")[, 1] - alts <- alts[alts != "-" & as.vector(strand(minus_rse)) == "-"] + alts <- alts[alts != "-" & as.vector(strand(minus_rse)) == "-"] og_alts <- assay(rse[names(alts), ], "ALT")[, 1] expect_true(all(comp_bases(alts) == og_alts)) }) diff --git a/vignettes/raer.Rmd b/vignettes/raer.Rmd index 68691b6..62696c1 100644 --- a/vignettes/raer.Rmd +++ b/vignettes/raer.Rmd @@ -24,20 +24,56 @@ knitr::opts_chunk$set(message = FALSE, warning = FALSE) # Introduction -The *raer* (RNA Adenosine editing in R) package provides tools to characterize A-to-I editing in single cell and bulk RNA-sequencing datasets. Both novel and known editing sites can be detected and quantified beginning with BAM alignment files. At it's core the *raer* package uses the pileup routines from the HTSlib C library (@Bonfield2021-yo) to identify candidate RNA editing sites, and leverages the annotation resources in the Bioconductor ecosystem to further characterize and identify high-confidence RNA editing sites. +The *raer* (RNA Adenosine editing in R) package provides tools to characterize +A-to-I editing in single cell and bulk RNA-sequencing datasets. Both novel and +known editing sites can be detected and quantified beginning with BAM alignment +files. At it's core the *raer* package uses the pileup routines from the HTSlib +C library (@Bonfield2021-yo) to identify candidate RNA editing sites, and +leverages the annotation resources in the Bioconductor ecosystem to further +characterize and identify high-confidence RNA editing sites. -Here we demonstrate how to use the *raer* package to a) quantify RNA editing sites in droplet scRNA-seq dataset, b) identify editing sites with condition specific editing in bulk RNA-seq data, and c) predict novel editing sites from bulk RNA-seq. +Here we demonstrate how to use the *raer* package to a) quantify RNA editing +sites in droplet scRNA-seq dataset, b) identify editing sites with condition +specific editing in bulk RNA-seq data, and c) predict novel editing sites from +bulk RNA-seq. + +# Installation + +The `raer` package can be installed from Bioconductor using `BiocManager`. + +```{r, eval = FALSE} +if (!require("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} + +# The following initializes usage of Bioc devel +BiocManager::install(version = "devel") + +BiocManager::install("raer") +``` + +Alternatively `raer` can be installed from github using +`BiocManager::install("rnabioco/raer")`. 
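A minimal sketch of that GitHub route, not evaluated, and assuming `BiocManager` is already installed:

```r
# Not run: install the development version from GitHub
# (assumes BiocManager is already installed).
BiocManager::install("rnabioco/raer")
```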
# Characterizing RNA editing sites in scRNA-seq data -Here we will use the *raer* package to examine RNA editing in droplet-based single cell RNA-seq data. `pileup_cells()` enables quantification of edited and non-edited bases at specified sites from scRNA-seq data. +Here we will use the *raer* package to examine RNA editing in droplet-based +single cell RNA-seq data. `pileup_cells()` enables quantification of edited and +non-edited bases at specified sites from scRNA-seq data. -For this example we will examine a scRNA-seq dataset from human PBMC cells provided by [10x Genomics](https://www.10xgenomics.com/resources/datasets/10k-human-pbmcs-3-v3-1-chromium-x-with-intronic-reads-3-1-high). The single cell data was aligned and processed using the 10x Genomics cellranger pipeline. +For this example we will examine a scRNA-seq dataset from human PBMC cells +provided by [10x Genomics](https://www.10xgenomics.com/resources/datasets/10k-human-pbmcs-3-v3-1-chromium-x-with-intronic-reads-3-1-high). The single cell data was aligned and processed using the +10x Genomics cellranger pipeline. The PBMC scRNA-seq dataset from 10x Genomics, along with other -needed files will downloaded and cached using `pbmc_10x()`from the *raerdata* ExperimentHub package. For this vignette, the BAM file was subset to retain 2 million alignments that overlap human RNA editing sites on chromosome 16. +needed files will be downloaded and cached using `pbmc_10x()` from the *raerdata* +ExperimentHub package. For this vignette, the BAM file was subset to retain +2 million alignments that overlap human RNA editing sites on chromosome 16. -`pbmc_10x()` returns a list containing a `BamFile` object, a `GRanges` object with known RNA editing sites from the `REDIportal` database @Mansi2021-za, and a `SingleCellExperiment` populated with the gene expression data and cell type annotations. +`pbmc_10x()` returns a list containing a `BamFile` object, a `GRanges` object +with known RNA editing sites from the `REDIportal` database @Mansi2021-za, and +a `SingleCellExperiment` populated with the gene expression data and cell type +annotations. ```{r message=FALSE} library(raer) @@ -60,14 +96,21 @@ plotUMAP(sce, colour_by = "celltype") ``` ## Specifying sites to quantify -Next we'll select editing sites to quantify. For this analysis we will use RNA editing sites cataloged in the REDIportal database @Mansi2021-za. +Next we'll select editing sites to quantify. For this analysis we will use RNA +editing sites cataloged in the REDIportal database @Mansi2021-za. ```{r} editing_sites ``` -The sites to quantify are specified using a custom formatted GRanges object with 1 base intervals, a strand (+ or -), and supplemented with metadata columns named `REF` and `ALT` containing the reference and alternate base to query. In this case we are -only interested in A->I editing, so we set the ref and alt to `A` and `G`. Note that the `REF` and `ALT` bases are in reference to strand. For a `-` strand interval the bases should be the complement of the `+` strand bases. Also note that these bases can be stored as traditional character vectors or as `Rle()` objects to save memory. +The sites to quantify are specified using a custom formatted GRanges object +with 1 base intervals, a strand (+ or -), and supplemented with metadata +columns named `REF` and `ALT` containing the reference and alternate base to +query. In this case we are only interested in A->I editing, so we set the ref +and alt to `A` and `G`.
Note that the `REF` and `ALT` bases are in reference to +strand. For a `-` strand interval the bases should be the complement of the `+` +strand bases. Also note that these bases can be stored as traditional character +vectors or as `Rle()` objects to save memory. ```{r} editing_sites$REF <- Rle("A") @@ -77,19 +120,46 @@ editing_sites ## Quantifying sites in single cells using *pileup_cells* -`pileup_cells()` quantifies edited and non-edited UMI counts per cell barcode, then organizes the site counts into a `SingleCellExperiment` object. `pileup_cells()` accepts a `FilterParam()` object that specifies parameters for multiple read-level and site-level filtering and processing options. Note that `pileup_cells()` is strand sensitive by default, so it is important to ensure that the strand of the input sites is correctly annotated, and that the `library-type` is set correctly for the strandedness of the sequencing library. For 10x Genomics data, the library type is set to `fr-second-strand`, indicating that the strand of the BAM alignments is the same strand as the RNA. See [quantifying Smart-seq2 scRNA-seq libraries](#ss2) for an example of using *pileup_cells()* to handle unstranded data and data from libraries that produce 1 BAM file for each cell. - -To exclude duplicate reads derived from PCR, `pileup_cells()` can use a UMI sequence, supplied via the `umi_tag` argument, to only count 1 read for each CB-UMI pair at each editing site position. Note however that by default the `bam_flags` argument for the `FilterParam` class is set to **include** duplicate reads when using `pileup_cells()`. Droplet single cell libraries produce multiple cDNA fragments from a single reverse transcription event. The cDNA fragments have different alignment positions due to fragmentation despite being derived from a single RNA molecule. scRNA-seq data processed by cellranger from 10x Genomics will set the "Not primary alignment" BAM flag for every read except one read for each UMI. If duplicates are removed based on this BAM flag, then only 1 representative fragment for a single UMI will be examined, which will exclude many valid regions. - -To reduce processing time many functions in the *raer* package operate in parallel across multiple chromosomes. To enable parallel processing, a `BiocParallel` backend can be supplied via the `BPPARAM` argument (e.g. `MultiCoreParam()`). +`pileup_cells()` quantifies edited and non-edited UMI counts per cell barcode, +then organizes the site counts into a `SingleCellExperiment` object. +`pileup_cells()` accepts a `FilterParam()` object that specifies parameters for +multiple read-level and site-level filtering and processing options. Note that +`pileup_cells()` is strand sensitive by default, so it is important to ensure +that the strand of the input sites is correctly annotated, and that the +`library-type` is set correctly for the strandedness of the sequencing library. +For 10x Genomics data, the library type is set to `fr-second-strand`, +indicating that the strand of the BAM alignments is the same strand as the RNA. +See [quantifying Smart-seq2 scRNA-seq libraries](#ss2) for an example of using +*pileup_cells()* to handle unstranded data and data from libraries that produce +1 BAM file for each cell. + +To exclude duplicate reads derived from PCR, `pileup_cells()` can use a UMI +sequence, supplied via the `umi_tag` argument, to only count 1 read for each +CB-UMI pair at each editing site position. 
Note however that by default the +`bam_flags` argument for the `FilterParam` class is set to **include** duplicate +reads when using `pileup_cells()`. Droplet single cell libraries produce +multiple cDNA fragments from a single reverse transcription event. The cDNA +fragments have different alignment positions due to fragmentation despite being +derived from a single RNA molecule. scRNA-seq data processed by cellranger from +10x Genomics will set the "Not primary alignment" BAM flag for every read except +one read for each UMI. If duplicates are removed based on this BAM flag, then +only 1 representative fragment for a single UMI will be examined, which will +exclude many valid regions. + +To reduce processing time many functions in the *raer* package operate in +parallel across multiple chromosomes. To enable parallel processing, a +`BiocParallel` backend can be supplied via the `BPPARAM` argument (e.g. +`MultiCoreParam()`). ```{r pileup_cells} outdir <- file.path(tempdir(), "sc_edits") cbs <- colnames(sce) -params <- FilterParam(min_mapq = 255, # required alignment MAPQ score - library_type = "fr-second-strand", #library type - min_variant_reads = 1) +params <- FilterParam( + min_mapq = 255, # required alignment MAPQ score + library_type = "fr-second-strand", # library type + min_variant_reads = 1 +) e_sce <- pileup_cells( bamfile = pbmc_bam, @@ -97,44 +167,59 @@ e_sce <- pileup_cells( cell_barcodes = cbs, output_directory = outdir, cb_tag = "CB", - umi_tag = "UB", + umi_tag = "UB", param = params ) e_sce ``` -The outputs from `pileup_cells()` are a `SingleCellExperiment` object populated with -`nRef` and `nAlt` assays containing the base counts for the reference (unedited) and alternate (edited) alleles at each position. +The outputs from `pileup_cells()` are a `SingleCellExperiment` object populated +with `nRef` and `nAlt` assays containing the base counts for the reference +(unedited) and alternate (edited) alleles at each position. -The sparseMatrices are also written to files, at a directory specified by `output_directory`, which can be loaded into R using the `read_sparray()` function. +The sparseMatrices are also written to files, at a directory specified by +`output_directory`, which can be loaded into R using the `read_sparray()` +function. ```{r} dir(outdir) -read_sparray(file.path(outdir, "counts.mtx.gz"), - file.path(outdir, "sites.txt.gz"), - file.path(outdir, "barcodes.txt.gz")) +read_sparray( + file.path(outdir, "counts.mtx.gz"), + file.path(outdir, "sites.txt.gz"), + file.path(outdir, "barcodes.txt.gz") +) ``` -Next we'll filter the single cell editing dataset to find sites with an editing event in at least 5 cells and add the editing counts to the gene expression SingleCellExperiment as an `altExp()`. +Next we'll filter the single cell editing dataset to find sites with an editing +event in at least 5 cells and add the editing counts to the gene expression +SingleCellExperiment as an `altExp()`. ```{r} e_sce <- e_sce[rowSums(assays(e_sce)$nAlt > 0) >= 5, ] -e_sce <- calc_edit_frequency(e_sce, edit_from = "Ref", edit_to = "Alt", replace_na = FALSE) +e_sce <- calc_edit_frequency(e_sce, + edit_from = "Ref", + edit_to = "Alt", + replace_na = FALSE +) altExp(sce) <- e_sce[, colnames(sce)] ``` -With the editing sites added to the gene expression SingleCellExperiment we can use plotting and other methods previously developed for single cell analysis. Here we'll visualize editing sites with the highest edited read counts. 
+With the editing sites added to the gene expression SingleCellExperiment we can +use plotting and other methods previously developed for single cell analysis. +Here we'll visualize editing sites with the highest edited read counts. ```{r} -to_plot <- rownames(altExp(sce))[order(rowSums(assay(altExp(sce), "nAlt")), decreasing = TRUE)] +to_plot <- rownames(altExp(sce))[order(rowSums(assay(altExp(sce), "nAlt")), + decreasing = TRUE +)] lapply(to_plot[1:5], function(x) { plotUMAP(sce, colour_by = x, by_exprs_values = "nAlt") }) ``` -Alternatively we can view these top edited sites as a Heatmap, showing the average -number of edited reads per site in each cell type. +Alternatively we can view these top edited sites as a Heatmap, showing the +average number of edited reads per site in each cell type. ```{r, fig.height = 7} altExp(sce)$celltype <- sce$celltype @@ -144,87 +229,132 @@ plotGroupedHeatmap(altExp(sce), group = "celltype", exprs_values = "nAlt" ) - ``` *raer* provides additional tools to examine cell type specific editing. -- `find_scde_sites()` will perform statistical testing to identify sites with different editing frequencies between clusters/cell types. +- `find_scde_sites()` will perform statistical testing to identify sites with +different editing frequencies between clusters/cell types. -- `calc_scAEI()` will calculate the Alu Editing Index (`AEI`) metric in single cells. +- `calc_scAEI()` will calculate the Alu Editing Index (`AEI`) metric in single +cells. -**If the editing sites of interest are not known, we recommend the following approach. First, treat the single cell data as a bulk RNA-seq experiment, and follow approaches described in the [Novel editing site detection](#novel) to identify putative editing sites. Then query these sites in single cell mode using `pileup_cells()`** +**If the editing sites of interest are not known, we recommend the following +approach. First, treat the single cell data as a bulk RNA-seq experiment, and +follow approaches described in the [Novel editing site detection](#novel) to +identify putative editing sites. Then query these sites in single cell mode +using `pileup_cells()`** ## Quantifying sites in Smart-seq2 libaries {#ss2} -`pileup_cells()` can also process Smart-seq2 single cell libraries. These datasets typically store data from each cell in separate BAM files and the library type for these alignments are generally unstranded. To process these datasets the `library-type` should be set to `unstranded`, and the reference editing sites need to be reported all on the `+` strand. +`pileup_cells()` can also process Smart-seq2 single cell libraries. These +datasets typically store data from each cell in separate BAM files and the +library type for these alignments are generally unstranded. To process these +datasets the `library-type` should be set to `unstranded`, and the reference +editing sites need to be reported all on the `+` strand. -For example, the editing sites on the minus strand will need to be complemented (set as T -> C rather than A -> G). Additionally the `umi_tag` and `cb_tag` arguments should be set as follows to disable UMI and cell barcode detection. +For example, the editing sites on the minus strand will need to be complemented +(set as T -> C rather than A -> G). Additionally the `umi_tag` and `cb_tag` +arguments should be set as follows to disable UMI and cell barcode detection. 
-```{r, eval = FALSE} -# Note these steps are not run +To illustrate this functionality, we will reprocess the 10x Genomics PBMC +dataset, treating the data as mock Smart-seq2 data from 3 cells. + +```{r} is_minus <- strand(editing_sites) == "-" -editing_sites[is_minus]$REF <- "T" +editing_sites[is_minus]$REF <- "T" editing_sites[is_minus]$ALT <- "C" strand(editing_sites[is_minus]) <- "+" -vector_of_bam_files <- c("cell1.bam", "cell2.bam", "cell3.bam") -vector_of_cell_ids <- c("cell1", "cell2", "cell3") +fp <- FilterParam( + library_type = "unstranded", + min_mapq = 255, + min_variant_reads = 1 +) + +ss2_bams <- c(pbmc_bam, pbmc_bam, pbmc_bam) +cell_ids <- c("cell1", "cell2", "cell3") -pileup_cells(bamfiles = vector_of_bam_files, - cell_barcodes = vector_of_cell_ids, - sites = editing_sites, - umi_tag = NULL, # no UMI tag in most Smart-seq2 libraries - cb_tag = NULL) # no cell barcode in most Smart-seq2 libraries +pileup_cells( + bamfiles = ss2_bams, + cell_barcodes = cell_ids, + sites = editing_sites, + umi_tag = NULL, # no UMI tag in most Smart-seq2 libraries + cb_tag = NULL, # no cell barcode tag + param = fp, + output_directory = outdir +) ``` # Quantifying RNA editing sites in bulk RNA-Seq {#bulk} -Next we will perform a reanalysis of a published bulk RNA-seq dataset using the *raer* package. The `pileup_sites()` function enable quantification of base counts from bulk RNA-seq data and can be used to identify novel sites (see [Novel editing site detection](#novel)). +Next we will perform a reanalysis of a published bulk RNA-seq dataset using the +*raer* package. The `pileup_sites()` function enables quantification of base +counts from bulk RNA-seq data and can be used to identify novel sites +(see [Novel editing site detection](#novel)). -For this reanalysis, we will examine a bulk RNA-seq dataset from accession [GSE99249](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE99249), which consists of RNA-seq data from ADAR1 mutants and control human cell lines, conditionally treated with Interferon-Beta. We will examine data from two genotypes, ADAR1 WT and KO, both treated with Interferon-B, with triplicate samples. +For this reanalysis, we will examine a bulk RNA-seq dataset from accession [GSE99249](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE99249), which +consists of RNA-seq data from ADAR1 mutants and control human cell lines, +conditionally treated with Interferon-Beta. We will examine data from two +genotypes, ADAR1 WT and KO, both treated with Interferon-B, with triplicate +samples. -Aligned BAM files and other necessary files have been preprocessed for this vignette and are available using `GSE99249()` from the `raerdata` package. Calling `GSE99249()` will downloaded and cache the necessary files and return a list containing the data. +Aligned BAM files and other necessary files have been preprocessed for this +vignette and are available using `GSE99249()` from the `raerdata` package. +Calling `GSE99249()` will download and cache the necessary files and return +a list containing the data. ```{r} ifnb <- GSE99249() names(ifnb) ``` -`bams` contains a vector of `BamFile` objects with the paths to each BAM file. These BAM files are a subset of the full BAM files, containing alignments from chromosome 18. +`bams` contains a vector of `BamFile` objects with the paths to each BAM file. +These BAM files are a subset of the full BAM files, containing alignments from +chromosome 18.
```{r} bam_files <- ifnb$bams names(bam_files) ``` -To quantify editing sites we will need a FASTA file to compare read alignments to the reference sequence. For space reasons we'll use a FASTA file containing only chromosome 18 for this demo. +To quantify editing sites we will need a FASTA file to compare read alignments +to the reference sequence. For space reasons we'll use a FASTA file containing +only chromosome 18 for this demo. ```{r} fafn <- ifnb$fasta ``` -We will again use the database of known human editing sites from REDIPortal, only processing those from `chr18`. +We will again use the database of known human editing sites from REDIPortal, +only processing those from `chr18`. ```{r} editing_sites <- ifnb$sites -chr_18_editing_sites <- keepSeqlevels(editing_sites, "chr18", pruning.mode="coarse") +chr_18_editing_sites <- keepSeqlevels(editing_sites, "chr18", + pruning.mode = "coarse" +) ``` ## Generate editing site read counts using *pileup_sites* -The `pileup_sites()` function will process BAM files and calculate base counts at each supplied position. The `FilterParam()` will again be used to specify parameters to exclude reads and bases based on commonly used filters for detecting RNA-editing events. Specific regions can also be queried using the `region` argument which accepts a samtools style region specification string (e.g. `chr` or `chr:start-end`). +The `pileup_sites()` function will process BAM files and calculate base counts +at each supplied position. The `FilterParam()` will again be used to specify +parameters to exclude reads and bases based on commonly used filters for +detecting RNA-editing events. Specific regions can also be queried using the +`region` argument which accepts a samtools style region specification string +(e.g. `chr` or `chr:start-end`). ```{r} fp <- FilterParam( - only_keep_variants = TRUE, # only report sites with variants - trim_5p = 5, # bases to remove from 5' or 3' end + only_keep_variants = TRUE, # only report sites with variants + trim_5p = 5, # bases to remove from 5' or 3' end trim_3p = 5, - min_base_quality = 30, # minimum base quality score - min_mapq = 255, # minimum MAPQ read score + min_base_quality = 30, # minimum base quality score + min_mapq = 255, # minimum MAPQ read score library_type = "fr-first-strand", # library type - min_splice_overhang = 10 # minimum required splice site overhang + min_splice_overhang = 10 # minimum required splice site overhang ) rse <- pileup_sites(bam_files, @@ -238,9 +368,16 @@ rse ``` -Pileup data is stored in a `RangedSummarizedExperiment` object which facilitates comparisons across samples and conveniently stores genomic coordinate information. The `rowData()` and `rowRanges()` slots are populated with the reference base (`REF`) and information related to each editing site, and similarly the `colData()` slot can be used to store sample metadata. +Pileup data is stored in a `RangedSummarizedExperiment` object which facilitates +comparisons across samples and conveniently stores genomic coordinate +information. The `rowData()` and `rowRanges()` slots are populated with the +reference base (`REF`) and information related to each editing site, and +similarly the `colData()` slot can be used to store sample metadata. -The base counts and other information are stored in different assays within the object. `REF` and `ALT` bases and base count data are all provided in a stand specific fashion depending on the supplied `library-type` parameter. The `REF` and `ALT` bases are in reference to the strand. 
+The base counts and other information are stored in different assays within the +object. `REF` and `ALT` bases and base count data are all provided in a stand +specific fashion depending on the supplied `library-type` parameter. The `REF` +and `ALT` bases are in reference to the strand. ```{r} assays(rse) @@ -248,17 +385,25 @@ assay(rse, "nA")[1:5, ] assay(rse, "nG")[1:5, ] ``` -Next we'll add sample information which will be needed for identify sites with differential editing frequencies across genotypes. +Next we'll add sample information which will be needed for identify sites with +differential editing frequencies across genotypes. ```{r} colData(rse)$treatment <- "Interferon beta" -colData(rse)$genotype <- factor(rep(c("ADAR1KO","Wildtype"), each = 3)) +colData(rse)$genotype <- factor(rep(c("ADAR1KO", "Wildtype"), each = 3)) colData(rse) ``` ## Prepare for differential editing -*raer* provides the `calc_edit_frequency` function to calculate the editing percentage and read depth at each position. With the `drop = TRUE` argument we will also exclude non-adenosine sites. The editing frequencies will not be used for differential editing analysis, which will be conducted using the raw counts, however these are useful for filtering and visualization. `calc_edit_frequency` will add two additional assays to the object, the editing frequency (`edit_freq`) and read `depth`, both computed based on the `edit_to` and `edit_from` counts. +*raer* provides the `calc_edit_frequency` function to calculate the editing +percentage and read depth at each position. With the `drop = TRUE` argument we +will also exclude non-adenosine sites. The editing frequencies will not be used +for differential editing analysis, which will be conducted using the raw counts, +however these are useful for filtering and visualization. `calc_edit_frequency` +will add two additional assays to the object, the editing frequency +(`edit_freq`) and read `depth`, both computed based on the `edit_to` and +`edit_from` counts. ```{r} @@ -269,7 +414,9 @@ rse <- calc_edit_frequency(rse, ) ``` -We'll next filter to exclude low frequency editing events. For this analysis we require that an editing site shows editing in at least 1 sample and has at least 5 counts in each sample. +We'll next filter to exclude low frequency editing events. For this analysis we +require that an editing site shows editing in at least 1 sample and has at +least 5 counts in each sample. ```{r} has_editing <- rowSums(assay(rse, "edit_freq") > 0) >= 1 @@ -279,21 +426,27 @@ rse <- rse[has_editing & has_depth, ] rse ``` -Once the object has been filtered, we will transform it into an alternative data structure for differential editing analysis that contains an assay with read counts of both the `ALT` and `REF` alleles in a single matrix. +Once the object has been filtered, we will transform it into an alternative data +structure for differential editing analysis that contains an assay with read +counts of both the `ALT` and `REF` alleles in a single matrix. ```{r} deobj <- make_de_object(rse, min_prop = 0.05, min_samples = 3) -assay(deobj, "counts")[1:3, c(1,7,2,8)] +assay(deobj, "counts")[1:3, c(1, 7, 2, 8)] ``` ## Run differential editing -At this stage, you can use the object to perform differential yourself or use `find_de_sites()` to use `edgeR` or `DESeq2` to identify condition specific editing events. 
For differential editing, we use the design `design <- ~0 + condition:sample + condition:count` and perform testing to compare the edited read counts against unedited read counts. +At this stage, you can use the object to perform differential yourself or use +`find_de_sites()` to use `edgeR` or `DESeq2` to identify condition specific +editing events. For differential editing, we use the design +`design <- ~0 + condition:sample + condition:count` and perform testing to +compare the edited read counts against unedited read counts. ```{r} deobj$sample <- factor(deobj$sample) -de_results <- find_de_sites(deobj, +de_results <- find_de_sites(deobj, test = "DESeq2", sample_col = "sample", condition_col = "genotype", @@ -302,7 +455,8 @@ de_results <- find_de_sites(deobj, ) ``` -This returns a list containing the dds object, the full results, the significant results, and the model matrix. +This returns a list containing the dds object, the full results, the significant +results, and the model matrix. ```{r} de_results$sig_results[1:5, ] @@ -312,24 +466,34 @@ de_results$sig_results[1:5, ] library(ComplexHeatmap) top_sites <- rownames(de_results$sig_results)[1:20] -Heatmap(assay(rse, "edit_freq")[top_sites, ], - name = "editing frequency", - column_labels = paste0(rse$genotype, "-", rse$treatment) +Heatmap(assay(rse, "edit_freq")[top_sites, ], + name = "editing frequency", + column_labels = paste0(rse$genotype, "-", rse$treatment) ) ``` -As anticipated the top identified sites are those with greatly reduced editing in the ADAR1KO samples. +As anticipated the top identified sites are those with greatly reduced editing +in the ADAR1KO samples. ## Examine overall editing activites using the Alu Editing Index -For some studies it is informative to assess the overall ADAR editing activity in addition -to examining individual editing sites. The **Alu Editing Index** (AEI), developed -by @Roth2019-yu, is a metric that summarizes that amount of editing occurring -at ALU elements which account for the vast majority of A-to-I editing (> 99%) in humans. +For some studies it is informative to assess the overall ADAR editing activity +in addition to examining individual editing sites. The **Alu Editing Index** +(AEI), developed by @Roth2019-yu, is a metric that summarizes that amount of +editing occurring at ALU elements which account for the vast majority of A-to-I +editing (> 99%) in humans. -*raer* provides `calc_AEI()`, based on this approach, to calculate the AEI metric. Many of the same parameters used for `pileup_sites()` are available in `calc_AEI()`. +*raer* provides `calc_AEI()`, based on this approach, to calculate the AEI +metric. Many of the same parameters used for `pileup_sites()` are available in +`calc_AEI()`. -First we will use the `AnnotationHub` package to obtain coordinates for ALU elements in the human genome. For this example we will only examine a subset of ALUs on `chr18`. We will also use a `SNPlocs` package, based on the dbSNP database, to exclude any SNPs overlapping the ALU elements from the AEI calculation. The SNP coordinates are `NCBI` based, whereas the `ALU` elements are based on `hg38`, we will therefore convert between the two as needed to obtain SNP and ALU element coordinates based on `hg38`. +First we will use the `AnnotationHub` package to obtain coordinates for ALU +elements in the human genome. For this example we will only examine a subset of +ALUs on `chr18`. 
We will also use a `SNPlocs` package, based on the dbSNP +database, to exclude any SNPs overlapping the ALU elements from the AEI +calculation. The SNP coordinates are `NCBI` based, whereas the `ALU` elements +are based on `hg38`, we will therefore convert between the two as needed to +obtain SNP and ALU element coordinates based on `hg38`. ```{r} @@ -341,7 +505,7 @@ rmsk_hg38 <- ah[["AH99003"]] alus <- rmsk_hg38[rmsk_hg38$repFamily == "Alu", ] alus <- alus[seqnames(alus) == "chr18", ] -alus <- keepStandardChromosomes(alus) +alus <- keepStandardChromosomes(alus) alus <- alus[1:1000, ] seqlevelsStyle(alus) <- "NCBI" @@ -356,21 +520,23 @@ seqlevelsStyle(alus) <- "UCSC" alus[1:3, ] ``` -`calc_AEI()` returns a matrix containing the AEI calculated for all allelic combinations and a more detailed table containing values for each chromosome. +`calc_AEI()` returns a matrix containing the AEI calculated for all allelic +combinations and a more detailed table containing values for each chromosome. ```{r} -alu_index <- calc_AEI(bam_files, - fasta = fafn, - snp_db = alu_snps, - alu_ranges = alus, - param = fp) +alu_index <- calc_AEI(bam_files, + fasta = fafn, + snp_db = alu_snps, + alu_ranges = alus, + param = fp +) names(alu_index) ``` ```{r} -Heatmap(alu_index$AEI, - name = "AEI", - row_labels = rse$genotype[match(rownames(alu_index$AEI), rse$sample)] +Heatmap(alu_index$AEI, + name = "AEI", + row_labels = rse$genotype[match(rownames(alu_index$AEI), rse$sample)] ) ``` @@ -380,11 +546,24 @@ reduced in the `ADAR1KO` samples as expected. # Novel RNA editing site detection {#novel} -Next we will demonstrate how to identify novel RNA editing sites using the *raer* package. It is best practice to have a matched DNA sequencing dataset to exclude sample specific genetic variation and common alignment artifacts. However, high confidence editing sites can also be identified if the dataset contains many samples and there are high coverage SNP databases for the organism queried. Additionally high confidence editing sites can also be identified if a dataset contains a sample with reduced or absent ADAR activity. A false-positive rate estimate can be obtained by examining the proportion of A->I editing sites recovered, relative to other variants, (e.g. G->C, C->A). +Next we will demonstrate how to identify novel RNA editing sites using the +*raer* package. It is best practice to have a matched DNA sequencing dataset to +exclude sample specific genetic variation and common alignment artifacts. +However, high confidence editing sites can also be identified if the dataset +contains many samples and there are high coverage SNP databases for the organism +queried. Additionally high confidence editing sites can also be identified if a +dataset contains a sample with reduced or absent ADAR activity. A false-positive +rate estimate can be obtained by examining the proportion of A->I editing sites +recovered, relative to other variants, (e.g. G->C, C->A). -In this analysis a published RNA-seq and whole genome sequencing dataset will be analyzed. High coverage whole-genome sequencing was conducted [ERR262997](https://www.ebi.ac.uk/ena/browser/view/ERR262997?show=reads) along with paired-end RNA-seq [SRR1258218](https://www.ebi.ac.uk/ena/browser/view/SRR1258218?show=reads) in a human cell line (`NA12878`). +In this analysis a published RNA-seq and whole genome sequencing dataset will be +analyzed. 

 # Novel RNA editing site detection {#novel}

-Next we will demonstrate how to identify novel RNA editing sites using the *raer* package. It is best practice to have a matched DNA sequencing dataset to exclude sample specific genetic variation and common alignment artifacts. However, high confidence editing sites can also be identified if the dataset contains many samples and there are high coverage SNP databases for the organism queried. Additionally high confidence editing sites can also be identified if a dataset contains a sample with reduced or absent ADAR activity. A false-positive rate estimate can be obtained by examining the proportion of A->I editing sites recovered, relative to other variants, (e.g. G->C, C->A).
+Next we will demonstrate how to identify novel RNA editing sites using the
+*raer* package. It is best practice to have a matched DNA sequencing dataset
+to exclude sample-specific genetic variation and common alignment artifacts.
+However, high-confidence editing sites can also be identified if the dataset
+contains many samples and there are high-coverage SNP databases for the
+organism queried. Additionally, high-confidence sites can be identified if a
+dataset contains a sample with reduced or absent ADAR activity. A
+false-positive rate estimate can be obtained by examining the proportion of
+A->I editing sites recovered relative to other variants (e.g. G->C, C->A).

-In this analysis a published RNA-seq and whole genome sequencing dataset will be analyzed. High coverage whole-genome sequencing was conducted [ERR262997](https://www.ebi.ac.uk/ena/browser/view/ERR262997?show=reads) along with paired-end RNA-seq [SRR1258218](https://www.ebi.ac.uk/ena/browser/view/SRR1258218?show=reads) in a human cell line (`NA12878`).
+In this analysis, a published RNA-seq and whole-genome sequencing dataset will
+be analyzed. High-coverage whole-genome sequencing was performed
+([ERR262997](https://www.ebi.ac.uk/ena/browser/view/ERR262997?show=reads))
+along with paired-end RNA-seq
+([SRR1258218](https://www.ebi.ac.uk/ena/browser/view/SRR1258218?show=reads))
+in a human cell line (`NA12878`).

-Aligned BAM files, a genome FASTA file, and a GRanges object containing SNPs corresponding to the first 1Mb region of chr4 have been prepared for this vignette and can be downloaded and cached using `NA12878()`.
+Aligned BAM files, a genome FASTA file, and a GRanges object containing SNPs
+corresponding to the first 1Mb region of chr4 have been prepared for this
+vignette and can be downloaded and cached using `NA12878()`.

 ```{r}
 rna_wgs <- NA12878()
@@ -393,11 +572,17 @@ names(rna_wgs)
 ```

 Additionally we will use the following annotation resources:

- - A database of known SNPs, for example from the `SNPlocs.Hsapiens.dbSNP144.GRCh38` package. Due to space and memory constraints in this vignette we will only examine SNPs from the first 1Mb region of chr4.
- 
- - `TxDb.Hsapiens.UCSC.hg38.knownGene`, a database of transcript models. Alternatively these can be generated from a `.gtf` file using `makeTxDbFromGRanges()` from the `GenomicFeatures` package. 
- 
- - RepeatMasker annotations, which can be obtained from the `AnnotationHub()` for hg38, as shown in the [bulk RNA-seq](#bulk) tutorial.
+- A database of known SNPs, for example from the
+`SNPlocs.Hsapiens.dbSNP144.GRCh38` package. Due to space and memory
+constraints in this vignette we will only examine SNPs from the first 1Mb
+region of chr4.
+
+- `TxDb.Hsapiens.UCSC.hg38.knownGene`, a database of transcript models.
+Alternatively these can be generated from a `.gtf` file, imported with
+`rtracklayer::import()`, using `makeTxDbFromGRanges()` from the
+`GenomicFeatures` package.
+
+- RepeatMasker annotations, which can be obtained from the `AnnotationHub()`
+for hg38, as shown in the [bulk RNA-seq](#bulk) tutorial.

 ```{r}
 library(TxDb.Hsapiens.UCSC.hg38.knownGene)
@@ -406,29 +591,33 @@ txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

 chr4snps <- rna_wgs$snps
 ```

-The `pileup_sites()` function accept multiple BAM files, here we supply one from RNA-seq, and one from whole genome sequencing. A subset of the filtering parameters (`FilterParam()`) can accept multiple arguments matched to each of the input BAM files. This allows us to have distinct settings for the WGS and RNA-seq BAM files.
+The `pileup_sites()` function accepts multiple BAM files; here we supply one
+from RNA-seq and one from whole-genome sequencing. A subset of the filtering
+parameters (`FilterParam()`) can accept multiple arguments matched to each of
+the input BAM files. This allows us to have distinct settings for the WGS and
+RNA-seq BAM files.
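+
+The per-BAM values are matched by position: the first element of a two-element
+argument applies to the first BAM file supplied and the second element to the
+second. A minimal sketch of just this idea (not evaluated; the full call used
+in this vignette follows below):
+
+```{r, eval = FALSE}
+# not run: the first value applies to the "rna" BAM, the second to the "dna" BAM
+FilterParam(
+    min_mapq = c(255, 30),
+    only_keep_variants = c(TRUE, FALSE)
+)
+```
+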
 ```{r}
 bams <- rna_wgs$bams
 names(bams) <- c("rna", "dna")
 fp <- FilterParam(
-    min_depth = 1, # minimum read depth across all samples
-    min_base_quality = 30, # minimum base quality
-    min_mapq = c(255, 30), # minimum MAPQ for each BAM file
-    library_type = c("fr-first-strand", "unstranded"), # library-type for each BAM file
-    trim_5p = 5, # bases to trim from 5' end of alignment
-    trim_3p = 5, # bases to trim from 3' end of alignment
-    indel_dist = 4, # ignore read if contains an indel within distance from site
-    min_splice_overhang = 10, # required alignment overhang in order to count read with splice
-    read_bqual = c(0.25, 20), # minimum fraction of the read (0.25) that must have base quality of (20)
+    min_depth = 1, # minimum read depth across all samples
+    min_base_quality = 30, # minimum base quality
+    min_mapq = c(255, 30), # minimum MAPQ for each BAM file
+    library_type = c("fr-first-strand", "unstranded"), # sample library-types
+    trim_5p = 5, # bases to trim from 5' end of alignment
+    trim_3p = 5, # bases to trim from 3' end of alignment
+    indel_dist = 4, # ignore read if contains an indel within distance from site
+    min_splice_overhang = 10, # required spliced alignment overhang
+    read_bqual = c(0.25, 20), # min. fraction of read (0.25) at base quality >= 20
     only_keep_variants = c(TRUE, FALSE), # report site if rnaseq BAM has variant
-    report_multiallelic = FALSE, # do not report sites with multiple variant alleles
+    report_multiallelic = FALSE, # exclude sites with multiple variant alleles
 )

 rse <- pileup_sites(bams,
-    fasta = rna_wgs$fasta,
-    chroms = "chr4",
-    param = fp
+    fasta = rna_wgs$fasta,
+    chroms = "chr4",
+    param = fp
 )

 rse
@@ -436,21 +625,27 @@ rse
 ```

 Next we filter to keep those sites with a variant in the RNA-seq, but no variant
-in the DNA-seq, and a minimum of 5 reads covering the site in the DNA-seq. The DNA-seq
-data is unstranded, and therefore will be reported on the "+" strand whereas the
-RNA-seq data will be reported on expressing RNA strand.
+in the DNA-seq, and a minimum of 5 reads covering the site in the DNA-seq. The
+DNA-seq data is unstranded, and therefore will be reported on the "+" strand,
+whereas the RNA-seq data will be reported on the expressing RNA strand.
 We therefore use `subsetByOverlaps(..., ignore.strand = TRUE)` to retain sites
 passing these DNA-seq based filters independent of strand.

 ```{r}
-to_keep <- (assay(rse, "nRef")[, "dna"] >= 5 & assay(rse, "ALT")[, "dna"] == "-")
+to_keep <- (assay(rse, "nRef")[, "dna"] >= 5 &
+    assay(rse, "ALT")[, "dna"] == "-")
+
 rse <- subsetByOverlaps(rse, rse[to_keep, ], ignore.strand = TRUE)
 nrow(rse)
 ```
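+
+Because the retained RNA-seq sites keep their original strand, a quick tally of
+strands after this step can confirm that sites on both strands survived the
+DNA-based filter. A simple check along these lines (not evaluated here):
+
+```{r, eval = FALSE}
+# number of retained sites on each strand
+table(strand(rse))
+```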

-Next we filter to remove any multiallelic sites. These sites are stored as comma-separated
-strings in the `ALT` assay (e.g. `G,C`). Non-variant sites are stored as `-`.
-`filter_multiallelic()` will remove any sites that have multiple variants in the samples present in the `summarizedExperiment` object. It will add a new column to the `rowData()` to indicate the variant for each site, and will calculate an `edit_freq` assay with variant allele frequencies for each sample.
+Next we filter to remove any multiallelic sites. These sites are stored as
+comma-separated strings in the `ALT` assay (e.g. `G,C`). Non-variant sites are
+stored as `-`. `filter_multiallelic()` will remove any sites that have multiple
+variants in the samples present in the `SummarizedExperiment` object. It will
+add a new column to the `rowData()` to indicate the variant for each site, and
+will calculate an `edit_freq` assay with variant allele frequencies for each
+sample.

 ```{r}
 rse <- filter_multiallelic(rse)
@@ -459,14 +654,18 @@ rowData(rse)
 ```

-Next we'll remove sites in simple repeat regions. We will add repeat information to the `rowData()` using the `annot_from_gr()` function.
+Next we'll remove sites in simple repeat regions. We will add repeat information
+to the `rowData()` using the `annot_from_gr()` function.

 ```{r}
 # subset both to chromosome 4 to avoid warning about different seqlevels
 seqlevels(rse, pruning.mode = "coarse") <- "chr4"
 seqlevels(rmsk_hg38, pruning.mode = "coarse") <- "chr4"

-rse <- annot_from_gr(rse, rmsk_hg38, cols_to_map = c(c("repName", "repClass", "repFamily")))
+rse <- annot_from_gr(rse,
+    rmsk_hg38,
+    cols_to_map = c("repName", "repClass", "repFamily")
+)
 rowData(rse)[c("repName", "repFamily")]
 ```

@@ -476,7 +675,9 @@ rowData(rse)[c("repName", "repFamily")]
 rse <- rse[!rowData(rse)$repFamily %in% c("Simple_repeat", "Low_complexity")]
 ```

-Next we'll remove sites adjacent to other sites with different variant types. For example if an A->G variant is located proximal to a C->T variant then the variants will be removed.
+Next we'll remove sites adjacent to other sites with different variant types.
+For example, if an A->G variant is located proximal to a C->T variant, then
+both variants will be removed.

 ```{r}
 seqlevels(txdb, pruning.mode = "coarse") <- "chr4"
@@ -484,7 +685,11 @@ rse <- filter_clustered_variants(rse, txdb, variant_dist = 100)
 rse
 ```

-Next, we'll annotate if the site is a known SNP and remove any known SNPs. If using a SNPlocs package you can use the `annot_snps()` function, which also allows one to compare the variant base to the SNP variant base. Here we will use the `annot_from_gr()` function to annotate using the `chr4snps` object and coarsely remove any editing sites overlapping the same position as a SNP.
+Next, we'll annotate whether each site is a known SNP and remove any known
+SNPs. If using a SNPlocs package you can use the `annot_snps()` function,
+which also allows one to compare the variant base to the SNP variant base.
+Here we will use the `annot_from_gr()` function to annotate using the
+`chr4snps` object and coarsely remove any editing sites overlapping the same
+position as a SNP.

 ```{r}
 rse <- annot_from_gr(rse, chr4snps, "name")
 rowData(rse)[c("name")]
@@ -494,7 +699,8 @@ rse <- annot_from_gr(rse, chr4snps, "name")
 rse <- rse[is.na(rowData(rse)$name), ]
 rse
 ```

-Lastly, we'll further filter the edit sites to require that the editing frequency is > 0.05 and that at least 2 reads support the editing site.
+Lastly, we'll further filter the edit sites to require that the editing
+frequency is > 0.05 and that at least 2 reads support the editing site.

 ```{r}
 to_keep <- assay(rse, "edit_freq")[, 1] > 0.05
@@ -503,24 +709,25 @@ rse <- rse[to_keep, ]

 rse <- rse[assay(rse, "nAlt")[, 1] >= 2]
 ```
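+
+As a quick check on the remaining sites, the reference and variant alleles can
+be tabulated. A sketch of the idea, not evaluated here; it assumes the
+reference and variant bases are stored in `REF` and `ALT` columns of
+`rowData()`, so check `rowData(rse)` for the names used by your version of
+*raer*:
+
+```{r, eval = FALSE}
+# tally of REF -> ALT changes among the remaining sites
+# (column names assumed, adjust if they differ)
+table(paste0(rowData(rse)$REF, "->", rowData(rse)$ALT))
+```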

-With the above filtering approach we obtain a set of putative editing sites. The specificity of the filtering can be estimated by examining the number of A-to-G changes relative to other variants. A-to-I RNA editing is more common than other types of editing (e.g. C->U editing by APOBEC enzymes) in human datasets so the majority of the variants should by A-to-G. In this vignette data all of the identified sites are A-to-G.
+With the above filtering approach we obtain a set of putative editing sites.
+The specificity of the filtering can be estimated by examining the number of
+A-to-G changes relative to other variants. A-to-I RNA editing is more common
+than other types of editing (e.g. C->U editing by APOBEC enzymes) in human
+datasets, so the majority of the variants should be A-to-G. In this vignette
+dataset, all of the identified sites are A-to-G.

 ```{r}
 rowRanges(rse)
 ```

-Finally once a set of sites has been identified, additional packages in the bioconductor ecosystem, such the [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) package, can be used to determine the genomic context and potential molecular consequences of the editing event.
+Finally, once a set of sites has been identified, additional packages in the
+Bioconductor ecosystem, such as the [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html)
+package, can be used to determine the genomic context and potential molecular
+consequences of the editing event.

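+For example, `locateVariants()` from *VariantAnnotation* can classify each site
+by the type of transcript feature it overlaps. A possible starting point, not
+evaluated here (it assumes *VariantAnnotation* is installed):
+
+```{r, eval = FALSE}
+library(VariantAnnotation)
+# classify sites as coding, intron, UTR, promoter, intergenic, etc.
+loc <- locateVariants(rowRanges(rse), txdb, AllVariants())
+table(loc$LOCATION)
+```
+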
-Session info
+# R session information

 ```{r}
 sessionInfo()
 ```