diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index 9e050e86..bd8acbd4 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -39,6 +39,7 @@ env: run_pkgdown: 'true' has_RUnit: 'false' cache-version: 'cache-v1' + run_docker: 'false' jobs: build-check: @@ -51,9 +52,9 @@ jobs: fail-fast: false matrix: config: - - { os: ubuntu-latest, r: '4.0', bioc: '3.12', cont: "bioconductor/bioconductor_docker:RELEASE_3_12", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } - - { os: macOS-latest, r: '4.0', bioc: '3.12'} - - { os: windows-latest, r: '4.0', bioc: '3.12'} + - { os: ubuntu-latest, r: 'devel', bioc: '3.13', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } + - { os: macOS-latest, r: 'devel', bioc: '3.13'} + - { os: windows-latest, r: 'devel', bioc: '3.13'} env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true RSPM: ${{ matrix.config.rspm }} @@ -101,16 +102,16 @@ jobs: uses: actions/cache@v2 with: path: ${{ env.R_LIBS_USER }} - key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_12-r-4.0-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_12-r-4.0- + key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-devel-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-devel- - name: Cache R packages on Linux if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' " uses: actions/cache@v2 with: path: /home/runner/work/_temp/Library - key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_12-r-4.0-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_12-r-4.0- + key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-devel-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-devel- - name: Install Linux system dependencies if: runner.os == 'Linux' @@ -156,6 +157,13 @@ jobs: run: | BiocManager::install(version = "${{ matrix.config.bioc }}", ask = FALSE) shell: Rscript {0} + + ## 12th April 2021 No Seurat binary available for Mac currently so install from source + - name: Install Seurat for macOS-latest + if: matrix.config.os == 'macOS-latest' + run: | + BiocManager::install("Seurat", type="source") + shell: Rscript {0} - name: Install dependencies pass 1 run: | @@ -180,7 +188,7 @@ jobs: ## Pass #2 at installing dependencies message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****')) remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE) - + ## For running the checks message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****')) remotes::install_cran("rcmdcheck") @@ -203,7 +211,7 @@ jobs: - name: Install pkgdown if: github.ref == 'refs/heads/master' && env.run_pkgdown == 'true' && runner.os == 'Linux' run: | - remotes::install_cran("pkgdown") + remotes::install_github("r-lib/pkgdown") shell: Rscript {0} - name: Session info @@ -272,6 +280,15 @@ jobs: if: failure() uses: actions/upload-artifact@master with: - name: ${{ runner.os }}-biocversion-RELEASE_3_12-r-4.0-results + name: ${{ runner.os }}-biocversion-devel-r-devel-results path: check - \ No newline at end of file + + - uses: docker/build-push-action@v1 + if: "!contains(github.event.head_commit.message, '/nodocker') && env.run_docker == 'true' && runner.os == 'Linux' " + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + repository: stemangiola/tidybulk + tag_with_ref: true + tag_with_sha: true + tags: latest diff --git a/DESCRIPTION b/DESCRIPTION index d32e9bc9..e9013282 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -66,7 +66,8 @@ Suggests: devtools, functional, survminer, - tidySummarizedExperiment + tidySummarizedExperiment, + markdown VignetteBuilder: knitr RdMacros: @@ -76,3 +77,4 @@ biocViews: AssayDomain, Infrastructure, RNASeq, DifferentialExpression, GeneExpr Encoding: UTF-8 LazyData: true RoxygenNote: 7.1.1 +LazyDataCompression: xz diff --git a/NAMESPACE b/NAMESPACE index eee79631..ca5b0325 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -81,6 +81,7 @@ import(tibble) import(tidyr) importFrom(SummarizedExperiment,assays) importFrom(SummarizedExperiment,colData) +importFrom(SummarizedExperiment,rowData) importFrom(dplyr,arrange) importFrom(dplyr,bind_rows) importFrom(dplyr,distinct) @@ -151,6 +152,7 @@ importFrom(stringr,str_replace) importFrom(stringr,str_replace_all) importFrom(stringr,str_split) importFrom(tibble,as_tibble) +importFrom(tibble,enframe) importFrom(tibble,tibble) importFrom(tidyr,complete) importFrom(tidyr,nest) diff --git a/R/data.R b/R/data.R index 11869ca6..aeb79e8d 100755 --- a/R/data.R +++ b/R/data.R @@ -1,13 +1,3 @@ -#' Example data set -#' -#' -"counts" - -#' Example data set reduced -#' -#' -"counts_mini" - #' Cibersort reference #' #' @@ -23,11 +13,6 @@ #' "ensembl_symbol_mapping" -#' Data set -#' -#' -"breast_tcga_mini" - #' flybaseIDs #' #' diff --git a/R/dplyr_methods.R b/R/dplyr_methods.R index 07725137..7190f71f 100755 --- a/R/dplyr_methods.R +++ b/R/dplyr_methods.R @@ -201,7 +201,7 @@ bind_cols.tidybulk <- function(..., .id = NULL) #' #' @examples #' -#' distinct(tidybulk::counts_mini) +#' tidybulk::se_mini %>% tidybulk() %>% distinct() #' #' #' @export @@ -723,8 +723,8 @@ rowwise.tidybulk <- function(data, ...) #' #' @examples #'`%>%` = magrittr::`%>%` -#' annotation = tidybulk::counts %>% distinct(sample) %>% mutate(source = "AU") -#' tidybulk::counts %>% left_join(annotation) +#' annotation = tidybulk::counts_SE %>% tidybulk() %>% as_tibble() %>% distinct(sample) %>% mutate(source = "AU") +#' tidybulk::counts_SE %>% tidybulk() %>% as_tibble() %>% left_join(annotation) #' #' @rdname dplyr-methods #' @name left_join @@ -763,8 +763,8 @@ left_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", #' #' @examples #'`%>%` = magrittr::`%>%` -#' annotation = tidybulk::counts %>% distinct(sample) %>% mutate(source = "AU") -#' tidybulk::counts %>% inner_join(annotation) +#' annotation = tidybulk::counts_SE %>% tidybulk() %>% as_tibble() %>% distinct(sample) %>% mutate(source = "AU") +#' tidybulk::counts_SE %>% tidybulk() %>% as_tibble() %>% inner_join(annotation) #' #' @rdname join-methods #' @name inner_join @@ -802,8 +802,8 @@ inner_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", #' #' @examples #'`%>%` = magrittr::`%>%` -#' annotation = tidybulk::counts %>% distinct(sample) %>% mutate(source = "AU") -#' tidybulk::counts %>% right_join(annotation) +#' annotation = tidybulk::counts_SE %>% tidybulk() %>% as_tibble() %>% distinct(sample) %>% mutate(source = "AU") +#' tidybulk::counts_SE %>% tidybulk() %>% as_tibble() %>% right_join(annotation) #' #' @rdname join-methods #' @name right_join @@ -843,8 +843,8 @@ right_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", #' #' @examples #'`%>%` = magrittr::`%>%` -#' annotation = tidybulk::counts %>% distinct(sample) %>% mutate(source = "AU") -#' tidybulk::counts %>% full_join(annotation) +#' annotation = tidybulk::counts_SE %>% tidybulk() %>% as_tibble() %>% distinct(sample) %>% mutate(source = "AU") +#' tidybulk::counts_SE %>% tidybulk() %>% as_tibble() %>% full_join(annotation) #' #' @rdname join-methods #' @name full_join diff --git a/R/functions.R b/R/functions.R index 3467183f..4696a875 100755 --- a/R/functions.R +++ b/R/functions.R @@ -289,6 +289,7 @@ get_scaled_counts_bulk <- function(.data, #' @param test_above_log2_fold_change A positive real value. At the moment this works just for edgeR methods, and use the `treat` function, which test the that the difference in abundance is bigger than this parameter rather than zero \url{https://www.rdocumentation.org/packages/edgeR/versions/3.14.0/topics/glmTreat}. #' @param scaling_method A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") #' @param omit_contrast_in_colnames If just one contrast is specified you can choose to omit the contrast label in the colnames. +#' @param .sample_total_read_count #' #' @return A tibble with edgeR results #' @@ -302,12 +303,14 @@ get_differential_transcript_abundance_bulk <- function(.data, test_above_log2_fold_change = NULL, scaling_method = "TMM", omit_contrast_in_colnames = FALSE, - prefix = "") { + prefix = "", + .sample_total_read_count = NULL) { # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) .abundance = enquo(.abundance) - + .sample_total_read_count = enquo(.sample_total_read_count) + # Check if omit_contrast_in_colnames is correctly setup if(omit_contrast_in_colnames & length(.contrasts) > 1){ warning("tidybulk says: you can omit contrasts in column names only when maximum one contrast is present") @@ -400,6 +403,26 @@ get_differential_transcript_abundance_bulk <- function(.data, edgeR::DGEList(counts = .) %>% + # Override lib.size if imposed + # This is useful in case you are analysing a small amount of genes, + # for which the overall lib.size cannot be calculated + when( + !quo_is_null(.sample_total_read_count) ~ { + + # New library size dataset + new_lib_size = .data %>% pivot_sample(!!.sample) %>% select(!!.sample, !!.sample_total_read_count) + + x = (.) + x$samples$lib.size = + new_lib_size %>% + slice(match(rownames(x$samples), !!.sample)) %>% + pull(!!.sample_total_read_count) + + x + }, + ~ (.) + ) %>% + # Scale data if method is not "none" when( scaling_method != "none" ~ (.) %>% edgeR::calcNormFactors(method = scaling_method), @@ -413,7 +436,7 @@ get_differential_transcript_abundance_bulk <- function(.data, tolower(method) == "edger_robust_likelihood_ratio" ~ (.) %>% edgeR::estimateGLMRobustDisp(design) %>% edgeR::glmFit(design) ) - + edgeR_object %>% # If I have multiple .contrasts merge the results @@ -1485,44 +1508,74 @@ get_reduced_dimensions_MDS_bulk <- log_transform = TRUE) { # Comply with CRAN NOTES . = NULL - + # Get column names .element = enquo(.element) .feature = enquo(.feature) .abundance = enquo(.abundance) - + # Get components from dims components = 1:.dims - + + + # Convert components to components list + if((length(components) %% 2) != 0 ) components = components %>% append(components[1]) + components_list = split(components, ceiling(seq_along(components)/2)) + + # Loop over components list and calculate MDS. (I have to make this process more elegant) mds_object = - .data %>% - - distinct(!!.feature,!!.element,!!.abundance) %>% - - # Check if log transform is needed - ifelse_pipe(log_transform, - ~ .x %>% dplyr::mutate(!!.abundance := !!.abundance %>% log1p())) %>% - - # Stop any column is not if not numeric or integer - ifelse_pipe( - (.) %>% select(!!.abundance) %>% summarise_all(class) %>% `%in%`(c("numeric", "integer")) %>% not() %>% any(), - ~ stop("tidybulk says: .abundance must be numerical or integer") - ) %>% - spread(!!.element,!!.abundance) %>% - as_matrix(rownames = !!.feature, do_check = FALSE) %>% - limma::plotMDS(ndim = .dims, plot = FALSE, top = top) - - # Parse results - mds_object %$% cmdscale.out %>% - as.data.frame %>% - as_tibble(rownames = quo_name(.element)) %>% - setNames(c(quo_name(.element), sprintf("Dim%s", 1:.dims))) %>% - - + components_list %>% + map( + ~ .data %>% + + distinct(!!.feature,!!.element,!!.abundance) %>% + + # Check if log transform is needed + ifelse_pipe(log_transform, + ~ .x %>% dplyr::mutate(!!.abundance := !!.abundance %>% log1p())) %>% + + # Stop any column is not if not numeric or integer + ifelse_pipe( + (.) %>% select(!!.abundance) %>% summarise_all(class) %>% `%in%`(c("numeric", "integer")) %>% `!`() %>% any(), + ~ stop(".abundance must be numerical or integer") + ) %>% + spread(!!.element,!!.abundance) %>% + as_matrix(rownames = !!.feature, do_check = FALSE) %>% + limma::plotMDS(dim.plot = .x, plot = FALSE, top = top) + ) + + map2_dfr( + mds_object, components_list, + ~ { + # Change of function from Bioconductor 3_13 of plotMDS + my_rownames = .x %>% when( + "distance.matrix.squared" %in% names(.x) ~ .x$distance.matrix.squared, + ~ .x$distance.matrix + ) %>% + rownames() + + tibble(my_rownames, .x$x, .x$y) %>% + rename( + !!.element := my_rownames, + !!as.symbol(.y[1]) := `.x$x`, + !!as.symbol(.y[2]) := `.x$y` + ) %>% + gather(Component, `Component value`,-!!.element) + + } + + ) %>% + distinct() %>% + spread(Component, `Component value`) %>% + setNames(c((.) %>% select(1) %>% colnames(), + paste0("Dim", (.) %>% select(-1) %>% colnames()) + )) %>% + + # Attach attributes reattach_internals(.data) %>% memorise_methods_used("limma") %>% - + # Add raw object attach_to_internals(mds_object, "MDS") %>% # Communicate the attribute added @@ -1530,7 +1583,6 @@ get_reduced_dimensions_MDS_bulk <- message("tidybulk says: to access the raw results do `attr(..., \"internals\")$MDS`") (.) } - } #' Get principal component information to a tibble using PCA @@ -2379,7 +2431,7 @@ run_epic = function(mix, reference = NULL) { devtools::install_github("GfellerLab/EPIC") } - if("EPIC" %in% .packages() %>% not) stop("tidybulk says: Please attach the apckage EPIC manually (i.e. library(EPIC)). This is because EPIC is only available on GitHub and it is not possible to seemlessy make EPIC part of the dependencies.") + if("EPIC" %in% .packages() %>% not) stop("tidybulk says: Please install and then load the package EPIC manually (i.e. library(EPIC)). This is because EPIC is not in Bioconductor or CRAN so it is not possible to seamlessly make EPIC part of the dependencies.") # Get common markers if( reference %>% class %>% equals("data.frame")){ @@ -2523,7 +2575,7 @@ get_cell_type_proportions = function(.data, # Other (hidden for the moment) methods using third party wrapper https://icbi-lab.github.io/immunedeconv method %>% tolower %in% c("mcp_counter", "quantiseq", "xcell") ~ { - # Check if package is installed, otherwise install + # # Check if package is installed, otherwise install if (find.package("immunedeconv", quiet = TRUE) %>% length %>% equals(0)) { message("Installing immunedeconv") devtools::install_github("icbi-lab/immunedeconv", upgrade = FALSE) @@ -2682,14 +2734,6 @@ get_adjusted_counts_for_unwanted_variation_bulk <- function(.data, # Reset column names dplyr::rename(!!value_adjusted := !!.abundance) %>% - # # Add filtering info - # right_join( - # df_for_combat %>% - # distinct(!!.transcript,!!.sample, - # lowly_abundant), - # by = c(quo_name(.transcript), quo_name(.sample)) - # )%>% - # Attach attributes reattach_internals(.data) %>% memorise_methods_used("sva") @@ -2865,7 +2909,9 @@ tidybulk_to_SummarizedExperiment = function(.data, #' #' @examples #' -#' as_matrix(head(dplyr::select(tidybulk::counts_mini, transcript, count)), rownames=transcript) +#' library(dplyr) +#' +#' tidybulk::se_mini %>% tidybulk() %>% select(feature, count) %>% head %>% as_matrix(rownames=feature) #' #' @export as_matrix <- function(tbl, diff --git a/R/functions_SE.R b/R/functions_SE.R index e94c940b..fe5c45d8 100755 --- a/R/functions_SE.R +++ b/R/functions_SE.R @@ -125,6 +125,7 @@ get_clusters_SNN_bulk_SE <- #' @param top An integer. How many top genes to select #' @param of_samples A boolean #' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param scale A boolean #' #' @return A tibble with additional columns #' @@ -134,28 +135,70 @@ get_reduced_dimensions_MDS_bulk_SE <- .dims = 2, top = 500, of_samples = TRUE, - log_transform = TRUE) { + log_transform = TRUE, + scale = NULL # This is only a dummy argument for making it compatibble with PCA + ) { # Comply with CRAN NOTES . = NULL # Get components from dims components = 1:.dims - mds_object = limma::plotMDS(.data, ndim = .dims, plot = FALSE, top = top) + + # Convert components to components list + if((length(components) %% 2) != 0 ) components = components %>% append(components[1]) + components_list = split(components, ceiling(seq_along(components)/2)) + + # Loop over components list and calculate MDS. (I have to make this process more elegant) + mds_object = + components_list %>% + map( + ~ .data %>% + limma::plotMDS(dim.plot = .x, plot = FALSE, top = top) + ) # Return list( raw_result = mds_object, result = - # Parse results - mds_object %$% cmdscale.out %>% - as.data.frame %>% - setNames(c(sprintf("Dim%s", 1:.dims))) + map2_dfr( + mds_object, components_list, + ~ { + + # Change of function from Bioconductor 3_13 of plotMDS + my_rownames = .x %>% when( + "distance.matrix.squared" %in% names(.x) ~ .x$distance.matrix.squared, + ~ .x$distance.matrix + ) %>% + rownames() + + tibble(my_rownames, .x$x, .x$y) %>% + rename( + sample := my_rownames, + !!as.symbol(.y[1]) := `.x$x`, + !!as.symbol(.y[2]) := `.x$y` + ) %>% + gather(Component, `Component value`,-sample) + + } + + + ) %>% + distinct() %>% + spread(Component, `Component value`) %>% + setNames(c((.) %>% select(1) %>% colnames(), + paste0("Dim", (.) %>% select(-1) %>% colnames()) + )) %>% + select(-sample) ) } + + + + #' Get principal component information to a tibble using PCA #' #' @keywords internal @@ -251,7 +294,7 @@ we suggest to partition the dataset for sample clusters. # Parse the PCA results to a tibble x %>% as_tibble(rownames = "sample") %>% - select(sample, sprintf("PC%s", components)) + select(sprintf("PC%s", components)) ) @@ -276,6 +319,7 @@ we suggest to partition the dataset for sample clusters. #' @param top An integer. How many top genes to select #' @param of_samples A boolean #' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param scale A boolean #' @param ... Further parameters passed to the function Rtsne #' #' @return A tibble with additional columns @@ -286,6 +330,7 @@ get_reduced_dimensions_TSNE_bulk_SE <- top = 500, of_samples = TRUE, log_transform = TRUE, + scale = NULL, # This is only a dummy argument for making it compatibble with PCA ...) { # Comply with CRAN NOTES . = NULL @@ -334,7 +379,7 @@ get_reduced_dimensions_TSNE_bulk_SE <- # add element name dplyr::mutate(sample = !!.data %>% colnames) %>% - select(sample, everything()) + select(-sample) ) } @@ -580,7 +625,8 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, test_above_log2_fold_change = NULL, scaling_method = "TMM", omit_contrast_in_colnames = FALSE, - prefix = "") { + prefix = "", + ...) { # Check if omit_contrast_in_colnames is correctly setup if(omit_contrast_in_colnames & length(.contrasts) > 1){ @@ -961,8 +1007,8 @@ get_differential_transcript_abundance_deseq2_SE <- function(.data, (.) %>% DESeq2::results(contrast = c( parse_formula(.formula)[1], - deseq2_object@colData[,parse_formula(.formula)[1]] %>% levels %>% .[2], - deseq2_object@colData[,parse_formula(.formula)[1]] %>% levels %>% .[1] + deseq2_object@colData[,parse_formula(.formula)[1]] %>% as.factor() %>% levels %>% .[2], + deseq2_object@colData[,parse_formula(.formula)[1]] %>% as.factor() %>% levels %>% .[1] )) %>% as_tibble(rownames = "transcript"), diff --git a/R/methods.R b/R/methods.R index 2df66606..22fa481c 100755 --- a/R/methods.R +++ b/R/methods.R @@ -3,7 +3,7 @@ setOldClass("tidybulk") #' Creates a `tt` object from a `tbl` or `SummarizedExperiment` object #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description tidybulk() creates a `tt` object from a `tbl` formatted as | | | | <...> | #' @@ -34,7 +34,7 @@ setOldClass("tidybulk") #' #' #' -#' my_tt = tidybulk(tidybulk::counts_mini, sample, transcript, count) +#' my_tt = tidybulk(tidybulk::se_mini) #' #' #' @docType methods @@ -239,7 +239,7 @@ setMethod("as_SummarizedExperiment", "tidybulk", .as_SummarizedExperiment) #' Creates a `tt` object from a list of file names of BAM/SAM #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description tidybulk_SAM_BAM() creates a `tt` object from a `tbl` formatted as | | | | <...> | #' @@ -286,7 +286,7 @@ setMethod("tidybulk_SAM_BAM", c(file_names = "character", genome = "character"), #' Scale the counts of transcripts/genes #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description scale_abundance() takes as input a `tbl` formatted as | | | | <...> | and Scales transcript abundance compansating for sequencing depth (e.g., with TMM algorithm, Robinson and Oshlack doi.org/10.1186/gb-2010-11-3-r25). #' @@ -323,8 +323,7 @@ setMethod("tidybulk_SAM_BAM", c(file_names = "character", genome = "character"), #' @examples #' #' -#' tidybulk::counts_mini %>% -#' tidybulk(sample, transcript, count) %>% +#' tidybulk::se_mini %>% #' identify_abundant() %>% #' scale_abundance() #' @@ -482,7 +481,7 @@ setMethod("scale_abundance", "tidybulk", .scale_abundance) #' Get clusters of elements (e.g., samples or transcripts) #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description cluster_elements() takes as input a `tbl` formatted as | | | | <...> | and identify clusters in the data. #' @@ -525,7 +524,7 @@ setMethod("scale_abundance", "tidybulk", .scale_abundance) #' @examples #' #' -#' cluster_elements(tidybulk::counts_mini, sample, transcript, count, centers = 2, method="kmeans") +#' cluster_elements(tidybulk::se_mini, centers = 2, method="kmeans") #' #' @docType methods #' @rdname cluster_elements-methods @@ -672,7 +671,7 @@ setMethod("cluster_elements", "tidybulk", .cluster_elements) #' Dimension reduction of the transcript abundance data #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description reduce_dimensions() takes as input a `tbl` formatted as | | | | <...> | and calculates the reduced dimensional space of the transcript abundance. #' @@ -717,15 +716,13 @@ setMethod("cluster_elements", "tidybulk", .cluster_elements) #' #' #' counts.MDS = -#' tidybulk::counts_mini %>% -#' tidybulk(sample, transcript, count) %>% +#' tidybulk::se_mini %>% #' identify_abundant() %>% #' reduce_dimensions( method="MDS", .dims = 3) #' #' #' counts.PCA = -#' tidybulk::counts_mini %>% -#' tidybulk(sample, transcript, count) %>% +#' tidybulk::se_mini %>% #' identify_abundant() %>% #' reduce_dimensions(method="PCA", .dims = 3) #' @@ -896,7 +893,7 @@ setMethod("reduce_dimensions", "tidybulk", .reduce_dimensions) #' Rotate two dimensions (e.g., principal components) of an arbitrary angle #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description rotate_dimensions() takes as input a `tbl` formatted as | | | <...> | and calculates the rotated dimensional space of the transcript abundance. #' @@ -936,8 +933,7 @@ setMethod("reduce_dimensions", "tidybulk", .reduce_dimensions) #' @examples #' #' counts.MDS = -#' tidybulk::counts_mini %>% -#' tidybulk(sample, transcript, count) %>% +#' tidybulk::se_mini %>% #' identify_abundant() %>% #' reduce_dimensions( method="MDS", .dims = 3) #' @@ -1072,7 +1068,7 @@ setMethod("rotate_dimensions", "tidybulk", .rotate_dimensions) #' Drop redundant elements (e.g., samples) for which feature (e.g., transcript/gene) abundances are correlated #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description remove_redundancy() takes as input a `tbl` formatted as | | | | <...> | for correlation method or | | | <...> | for reduced_dimensions method, and returns a `tbl` with dropped elements (e.g., samples). #' @@ -1132,8 +1128,7 @@ setMethod("rotate_dimensions", "tidybulk", .rotate_dimensions) #' @examples #' #' -#' tidybulk::counts_mini %>% -#' tidybulk(sample, transcript, count) %>% +#' tidybulk::se_mini %>% #' identify_abundant() %>% #' remove_redundancy( #' .element = sample, @@ -1143,8 +1138,7 @@ setMethod("rotate_dimensions", "tidybulk", .rotate_dimensions) #' ) #' #' counts.MDS = -#' tidybulk::counts_mini %>% -#' tidybulk(sample, transcript, count) %>% +#' tidybulk::se_mini %>% #' identify_abundant() %>% #' reduce_dimensions( method="MDS", .dims = 3) #' @@ -1274,7 +1268,7 @@ setMethod("remove_redundancy", "tidybulk", .remove_redundancy) #' Adjust transcript abundance for unwanted variation #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description adjust_abundance() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with an edditional adjusted abundance column. This method uses scaled counts if present. #' @@ -1308,9 +1302,9 @@ setMethod("remove_redundancy", "tidybulk", .remove_redundancy) #' #' #' -#' cm = tidybulk::counts_mini +#' cm = tidybulk::se_mini #' cm$batch = 0 -#' cm$batch[cm$sample %in% c("SRR1740035", "SRR1740043")] = 1 +#' cm$batch[colnames(cm) %in% c("SRR1740035", "SRR1740043")] = 1 #' #' res = #' cm %>% @@ -1453,7 +1447,7 @@ setMethod("adjust_abundance", "tidybulk", .adjust_abundance) #' Aggregates multiple counts from the same samples (e.g., from isoforms), concatenates other character columns, and averages other numeric columns #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description aggregate_duplicates() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with aggregated transcripts that were duplicated. #' @@ -1491,10 +1485,7 @@ setMethod("adjust_abundance", "tidybulk", .adjust_abundance) #' @examples #' #' aggregate_duplicates( -#' tidybulk::counts_mini, -#' sample, -#' transcript, -#' `count`, +#' tidybulk::se_mini, #' aggregation_function = sum #' ) #' @@ -1575,7 +1566,7 @@ setMethod("aggregate_duplicates", "tidybulk", .aggregate_duplicates) #' Get cell type proportions from samples #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description deconvolve_cellularity() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with the estimated cell type abundance for each sample #' @@ -1607,8 +1598,10 @@ setMethod("aggregate_duplicates", "tidybulk", .aggregate_duplicates) #' #' @examples #' +#' library(dplyr) +#' #' # Subsetting for time efficiency -#' deconvolve_cellularity(filter(tidybulk::counts, sample=="SRR1740034"), sample, transcript, `count`, cores = 1) +#' tidybulk::se_mini %>% tidybulk() %>% filter(sample=="SRR1740034") %>% deconvolve_cellularity(sample, feature, count, cores = 1) #' #' #' @docType methods @@ -1748,7 +1741,7 @@ setMethod("deconvolve_cellularity", #' #' @examples #' -#' symbol_to_entrez(tidybulk::counts_mini, .transcript = transcript, .sample = sample) +#' tidybulk::se_mini %>% tidybulk() %>% as_tibble() %>% symbol_to_entrez(.transcript = feature, .sample = sample) #' #' @export #' @@ -1798,7 +1791,7 @@ symbol_to_entrez = function(.data, #' #' @examples #' -#' describe_transcript(tidybulk::counts_mini, .transcript = transcript) +#' describe_transcript(tidybulk::se_mini) #' #' @docType methods #' @rdname describe_transcript-methods @@ -1937,8 +1930,10 @@ setMethod("describe_transcript", "tidybulk", .describe_transcript) #' #' @examples #' -#' -#' ensembl_to_symbol(tidybulk::counts_ensembl, ens) +#' library(dplyr) +#' +#' tidybulk::counts_SE %>% tidybulk() %>% as_tibble() %>% ensembl_to_symbol(feature) +#' #' #' #' @docType methods @@ -2023,7 +2018,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' Perform differential transcription testing using edgeR quasi-likelihood (QLT), edgeR likelihood-ratio (LR), limma-voom, limma-voom-with-quality-weights or DESeq2 #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description test_differential_abundance() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with additional columns for the statistics from the hypothesis test. #' @@ -2046,7 +2041,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' @param action A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get). #' @param significance_threshold DEPRECATED - A real between 0 and 1 (usually 0.05). #' @param fill_missing_values DEPRECATED - A boolean. Whether to fill missing sample/transcript values with the median of the transcript. This is rarely needed. -#' +#' @param ... Further arguments passed to some of the internal functions. Currently, it is needed just for internal debug. #' #' #' @details This function provides the option to use edgeR \url{https://doi.org/10.1093/bioinformatics/btp616}, limma-voom \url{https://doi.org/10.1186/gb-2014-15-2-r29}, limma_voom_sample_weights \url{https://doi.org/10.1093/nar/gkv412} or DESeq2 \url{https://doi.org/10.1186/s13059-014-0550-8} to perform the testing. @@ -2098,15 +2093,13 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' #' # edgeR #' -#' tidybulk::counts_mini %>% -#' tidybulk(sample, transcript, count) %>% +#' tidybulk::se_mini %>% #' identify_abundant() %>% #' test_differential_abundance( ~ condition ) #' #' # The function `test_differential_abundance` operates with contrasts too #' -#' tidybulk::counts_mini %>% -#' tidybulk(sample, transcript, count) %>% +#' tidybulk::se_mini %>% #' identify_abundant() %>% #' test_differential_abundance( #' ~ 0 + condition, @@ -2115,15 +2108,16 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' #' # DESeq2 - equivalent for limma-voom #' -#' tidybulk::counts_mini %>% -#' tidybulk(sample, transcript, count) %>% +#' my_se_mini = tidybulk::se_mini +#' my_se_mini$condition = factor(my_se_mini$condition) +#' +#' my_se_mini %>% #' identify_abundant() %>% #' test_differential_abundance( ~ condition, method="deseq2" ) #' #' # The function `test_differential_abundance` operates with contrasts too #' -#' tidybulk::counts_mini %>% -#' tidybulk(sample, transcript, count) %>% +#' my_se_mini %>% #' identify_abundant() %>% #' test_differential_abundance( #' ~ 0 + condition, @@ -2147,6 +2141,7 @@ setGeneric("test_differential_abundance", function(.data, omit_contrast_in_colnames = FALSE, prefix = "", action = "add", + ..., # DEPRECATED significance_threshold = NULL, @@ -2168,6 +2163,7 @@ setGeneric("test_differential_abundance", function(.data, prefix = "", action = "add", + ..., # DEPRECATED significance_threshold = NULL, @@ -2244,7 +2240,8 @@ such as batch effects (if applicable) in the formula. test_above_log2_fold_change = test_above_log2_fold_change, scaling_method = scaling_method, omit_contrast_in_colnames = omit_contrast_in_colnames, - prefix = prefix + prefix = prefix, + ... ), # Voom @@ -2361,7 +2358,7 @@ setMethod("test_differential_abundance", #' Keep variable transcripts #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description keep_variable() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with additional columns for the statistics from the hypothesis test. #' @@ -2394,10 +2391,7 @@ setMethod("test_differential_abundance", #' #' #' keep_variable( -#' tidybulk::counts_mini, -#' sample, -#' transcript, -#' `count`, +#' tidybulk::se_mini, #' top = 500 #' ) #' @@ -2472,7 +2466,7 @@ setMethod("keep_variable", "tidybulk", .keep_variable) #' find abundant transcripts #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description identify_abundant() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with additional columns for the statistics from the hypothesis test. #' @@ -2510,10 +2504,7 @@ setMethod("keep_variable", "tidybulk", .keep_variable) #' #' #' identify_abundant( -#' tidybulk::counts_mini, -#' sample, -#' transcript, -#' `count` +#' tidybulk::se_mini #' ) #' #' @@ -2651,10 +2642,7 @@ setMethod("identify_abundant", "tidybulk", .identify_abundant) #' #' #' keep_abundant( -#' tidybulk::counts_mini, -#' sample, -#' transcript, -#' `count` +#' tidybulk::se_mini #' ) #' #' @@ -2745,7 +2733,7 @@ setMethod("keep_abundant", "tidybulk", .keep_abundant) #' analyse gene enrichment with EGSEA #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description test_gene_enrichment() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with the additional transcript symbol column #' @@ -2804,7 +2792,7 @@ setMethod("keep_abundant", "tidybulk", .keep_abundant) #' @examples #' \dontrun{ #' -#' df_entrez = symbol_to_entrez(tidybulk::counts_mini, .transcript = transcript, .sample = sample) +#' df_entrez = tidybulk::se_mini %>% tidybulk() %>% as_tibble() %>% symbol_to_entrez( .transcript = feature, .sample = sample) #' df_entrez = aggregate_duplicates(df_entrez, aggregation_function = sum, .sample = sample, .transcript = entrez, .abundance = count) #' #' library("EGSEA") @@ -2860,7 +2848,7 @@ setGeneric("test_gene_enrichment", function(.data, # Validate data frame if(do_validate()) { validation(.data, !!.sample, !!.entrez, !!.abundance) - warning_if_data_is_not_rectangular(.data, !!.sample, !!.transcript, !!.abundance) + warning_if_data_is_not_rectangular(.data, !!.sample, !!.entrez, !!.abundance) } .data %>% @@ -2924,7 +2912,7 @@ setMethod("test_gene_enrichment", #' analyse gene over-representation with GSEA #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description test_gene_overrepresentation() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with the GSEA statistics #' @@ -2964,9 +2952,9 @@ setMethod("test_gene_enrichment", #' #' @examples #' -#' df_entrez = symbol_to_entrez(tidybulk::counts_mini, .transcript = transcript, .sample = sample) +#' df_entrez = tidybulk::se_mini %>% tidybulk() %>% as_tibble() %>% symbol_to_entrez( .transcript = feature, .sample = sample) #' df_entrez = aggregate_duplicates(df_entrez, aggregation_function = sum, .sample = sample, .transcript = entrez, .abundance = count) -#' df_entrez = mutate(df_entrez, do_test = transcript %in% c("TNFRSF4", "PLCH2", "PADI4", "PAX7")) +#' df_entrez = mutate(df_entrez, do_test = feature %in% c("TNFRSF4", "PLCH2", "PADI4", "PAX7")) #' #' test_gene_overrepresentation( #' df_entrez, @@ -3076,7 +3064,7 @@ setMethod("test_gene_overrepresentation", #' Extract sample-wise information #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description pivot_sample() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with only sample-related columns #' @@ -3098,10 +3086,7 @@ setMethod("test_gene_overrepresentation", #' @examples #' #' -#' pivot_sample( -#' tidybulk::counts_mini, -#' .sample = sample -#' ) +#' pivot_sample(tidybulk::se_mini ) #' #' #' @docType methods @@ -3171,7 +3156,7 @@ setMethod("pivot_sample", #' Extract transcript-wise information #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description pivot_transcript() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with only sample-related columns #' @@ -3193,10 +3178,7 @@ setMethod("pivot_sample", #' @examples #' #' -#' pivot_transcript( -#' tidybulk::counts_mini, -#' .transcript = transcript -#' ) +#' pivot_transcript(tidybulk::se_mini ) #' #' #' @docType methods @@ -3291,7 +3273,7 @@ setMethod("pivot_transcript", #' #' @examples #' -#' fill_missing_abundance(tidybulk::counts_mini, sample, transcript, count, fill_with = 0) +#' tidybulk::se_mini %>% tidybulk() %>% fill_missing_abundance( fill_with = 0) #' #' #' @docType methods @@ -3372,7 +3354,7 @@ setMethod("fill_missing_abundance", "tidybulk", .fill_missing_abundance) #' impute transcript abundance if missing from sample-transcript pairs #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description impute_missing_abundance() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with an edditional adjusted abundance column. This method uses scaled counts if present. #' @@ -3399,11 +3381,8 @@ setMethod("fill_missing_abundance", "tidybulk", .fill_missing_abundance) #' #' res = #' impute_missing_abundance( -#' tidybulk::counts_mini, -#' ~ condition, -#' .sample = sample, -#' .transcript = transcript, -#' .abundance = count +#' tidybulk::se_mini, +#' ~ condition #' ) #' #' @@ -3495,7 +3474,7 @@ setMethod("impute_missing_abundance", "tidybulk", .impute_missing_abundance) #' Add differential tissue composition information to a tbl #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description test_differential_cellularity() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with additional columns for the statistics from the hypothesis test. #' @@ -3552,11 +3531,8 @@ setMethod("impute_missing_abundance", "tidybulk", .impute_missing_abundance) #' #' # Regular regression #' test_differential_cellularity( -#' tidybulk::counts_mini, +#' tidybulk::se_mini , #' . ~ condition, -#' sample, -#' transcript, -#' count, #' cores = 1 #' ) #' @@ -3564,7 +3540,8 @@ setMethod("impute_missing_abundance", "tidybulk", .impute_missing_abundance) #' library(dplyr) #' library(tidyr) #' -#' tidybulk::counts_mini %>% +#' tidybulk::se_mini %>% +#' tidybulk() %>% #' #' # Add survival data #' nest(data = -sample) %>% @@ -3577,9 +3554,6 @@ setMethod("impute_missing_abundance", "tidybulk", .impute_missing_abundance) #' # Test #' test_differential_cellularity( #' survival::Surv(days, dead) ~ ., -#' sample, -#' transcript, -#' count, #' cores = 1 #' ) #' @@ -3676,7 +3650,7 @@ setMethod("test_differential_cellularity", #' Test of stratification of biological replicates based on tissue composition, one cell-type at the time, using Kaplan-meier curves. #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description test_stratification_cellularity() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with additional columns for the statistics from the hypothesis test. #' @@ -3722,7 +3696,8 @@ setMethod("test_differential_cellularity", #' library(dplyr) #' library(tidyr) #' -#' tidybulk::counts_mini %>% +#' tidybulk::se_mini %>% +#' tidybulk() %>% #' #' # Add survival data #' nest(data = -sample) %>% @@ -3733,9 +3708,6 @@ setMethod("test_differential_cellularity", #' unnest(data) %>% #' test_stratification_cellularity( #' survival::Surv(days, dead) ~ ., -#' sample, -#' transcript, -#' count, #' cores = 1 #' ) #' @@ -3831,7 +3803,7 @@ setMethod("test_stratification_cellularity", #' Produces the bibliography list of your workflow #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description get_bibliography() takes as input a `tidybulk` #' @@ -3848,7 +3820,7 @@ setMethod("test_stratification_cellularity", #' @examples #' #' # Define tidybulk tibble -#' df = tidybulk(tidybulk::counts_mini, sample, transcript, count) +#' df = tidybulk(tidybulk::se_mini) #' #' get_bibliography(df) #' diff --git a/R/methods_SE.R b/R/methods_SE.R index 4527422e..bc0da353 100755 --- a/R/methods_SE.R +++ b/R/methods_SE.R @@ -323,6 +323,7 @@ setMethod("cluster_elements", top = top, of_samples = of_samples, log_transform = log_transform, + scale=scale, ... ) @@ -923,7 +924,8 @@ setMethod( test_above_log2_fold_change = NULL, scaling_method = "TMM", omit_contrast_in_colnames = FALSE, - prefix = "") + prefix = "", + ...) { # Clearly state what counts are used @@ -958,7 +960,8 @@ such as batch effects (if applicable) in the formula. test_above_log2_fold_change = test_above_log2_fold_change, scaling_method = scaling_method, omit_contrast_in_colnames = omit_contrast_in_colnames, - prefix = prefix + prefix = prefix, + ... ), # Voom @@ -1109,15 +1112,15 @@ setMethod("keep_variable", .sample = NULL, .transcript = NULL, .abundance = NULL, - factor_of_interest = NULL, - minimum_counts = 10, - minimum_proportion = 0.7) + factor_of_interest = NULL, + minimum_counts = 10, + minimum_proportion = 0.7) { factor_of_interest = enquo(factor_of_interest) - + # Check factor_of_interest if( !is.null(factor_of_interest) && @@ -1125,48 +1128,56 @@ setMethod("keep_variable", (quo_name(factor_of_interest) %in% colnames(colData(.data)) %>% not()) ) stop(sprintf("tidybulk says: the column %s is not present in colData", quo_name(factor_of_interest))) - + if (minimum_counts < 0) stop("The parameter minimum_counts must be > 0") if (minimum_proportion < 0 | minimum_proportion > 1) stop("The parameter minimum_proportion must be between 0 and 1") - + # If column is present use this instead of doing more work if(".abundant" %in% colnames(colData(.data))){ message("tidybulk says: the column .abundant already exists in colData. Nothing was done") - + # Return return(.data) } - - + + string_factor_of_interest = - + factor_of_interest %>% when( - quo_is_symbol(factor_of_interest) && + quo_is_symbol(factor_of_interest) && ( colData(.data)[, quo_name(factor_of_interest)] %>% - class %in% c("numeric", "integer", "double")) ~ + class %in% c("numeric", "integer", "double")) ~ { message("tidybulk says: The factor of interest is continuous (e.g., integer,numeric, double). The data will be filtered without grouping.") NULL }, - quo_is_symbol(factor_of_interest) ~ + quo_is_symbol(factor_of_interest) ~ colData(.data)[, quo_name(factor_of_interest)], ~ NULL ) - + + # Check if package is installed, otherwise install + if (find.package("edgeR", quiet = TRUE) %>% length %>% equals(0)) { + message("Installing edgeR needed for analyses") + if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager", repos = "https://cloud.r-project.org") + BiocManager::install("edgeR", ask = FALSE) + } + # Get gene to exclude gene_to_exclude = .data %>% - + # Extract assay assays() %>% as.list() %>% .[[1]] %>% - + # Call edgeR edgeR::filterByExpr( min.count = minimum_counts, @@ -1176,14 +1187,15 @@ setMethod("keep_variable", not() %>% which %>% names - + rowData(.data)$.abundant = (rownames(rowData(.data)) %in% gene_to_exclude) %>% not() - + # Return .data } + #' identify_abundant #' @inheritParams identify_abundant #' @@ -1486,8 +1498,7 @@ setMethod("test_gene_enrichment", stop(sprintf("tidybulk says: wrong species name. MSigDB uses the latin species names (e.g., %s)", paste(msigdbr::msigdbr_species()$species_name, collapse=", "))) .data %>% - rowData() %>% - as_tibble() %>% + pivot_transcript() %>% filter(!!.do_test) %>% distinct(!!.entrez) %>% pull(!!.entrez) %>% @@ -1654,6 +1665,9 @@ setMethod("pivot_transcript", select( transcript, col_formula) %>% distinct() + # If no missing just return the same matrix + if(nrow(NA_data) == 0) return(.x) + .data_OK = .my_data %>% anti_join(NA_data, by = c("transcript", col_formula)) @@ -1970,6 +1984,10 @@ setMethod("get_bibliography", .get_bibliography) #' describe_transcript +#' +#' @importFrom SummarizedExperiment rowData +#' @importFrom tibble enframe +#' #' @inheritParams describe_transcript #' #' @docType methods @@ -2005,13 +2023,21 @@ setMethod("get_bibliography", BiocManager::install("AnnotationDbi", ask = FALSE) } - description_df = - + .transcript = enquo(.transcript) + + # Transcript rownames by default + my_transcripts = + .transcript %>% + when( + quo_is_null(.) ~ rownames(.data), + ~ rowData(.data)[,quo_name(.transcript)] + ) + description_df = # Human tryCatch(suppressMessages(AnnotationDbi::mapIds( org.Hs.eg.db::org.Hs.eg.db, - keys = pull(.data, transcript) %>% unique %>% as.character, #ensembl_symbol_mapping$transcript %>% unique, + keys = my_transcripts, #ensembl_symbol_mapping$transcript %>% unique, column = "GENENAME", keytype = "SYMBOL", multiVals = "first" @@ -2022,7 +2048,7 @@ setMethod("get_bibliography", c( tryCatch(suppressMessages(AnnotationDbi::mapIds( org.Mm.eg.db::org.Mm.eg.db, - keys = pull(.data, transcript) %>% unique %>% as.character, #ensembl_symbol_mapping$transcript %>% unique, + keys = my_transcripts, #ensembl_symbol_mapping$transcript %>% unique, column = "GENENAME", keytype = "SYMBOL", multiVals = "first" @@ -2041,8 +2067,13 @@ setMethod("get_bibliography", slice(1) %>% ungroup() - .data %>% - left_join(description_df, by = "transcript") + rowData(.data) = rowData(.data) %>% cbind( + tibble(transcript = rownames(!!.data)) %>% + left_join(description_df, by = "transcript") %>% + select(description) + ) + + .data } #' describe_transcript diff --git a/R/tidyr_methods.R b/R/tidyr_methods.R index bb2cb5be..c9693c4e 100755 --- a/R/tidyr_methods.R +++ b/R/tidyr_methods.R @@ -5,7 +5,7 @@ #' @param data A tbl. (See tidyr) #' @param cols <[`tidy-select`][tidyr_tidy_select]> Columns to unnest. #' If you `unnest()` multiple columns, parallel entries must be of -#' compatible sizes, i.e. they're either equal or length 1 (following the +#' compatibble sizes, i.e. they're either equal or length 1 (following the #' standard tidyverse recycling rules). #' @param ... <[`tidy-select`][tidyr_tidy_select]> Columns to nest, specified #' using name-variable pairs of the form `new_col=c(col1, col2, col3)`. @@ -43,7 +43,7 @@ #' #' library(dplyr) #' -#' nest(tidybulk(tidybulk::counts_mini, sample, transcript, count), data = -transcript) %>% +#' tidybulk::se_mini %>% tidybulk() %>% nest( data = -feature) %>% #' unnest(data) #' #' @rdname nest-methods @@ -84,7 +84,7 @@ unnest.nested_tidybulk <- function (data, cols, ..., keep_empty=FALSE, ptype=NUL #' #' @examples #' -#' nest(tidybulk(tidybulk::counts_mini, sample, transcript, count), data = -transcript) +#' tidybulk::se_mini %>% tidybulk() %>% nest( data = -feature) #' #' @rdname nest-methods #' @name nest diff --git a/R/utilities.R b/R/utilities.R index d1eb7ce1..a76775a1 100755 --- a/R/utilities.R +++ b/R/utilities.R @@ -990,7 +990,7 @@ get_specific_annotation_columns = function(.data, .col){ #' log10_reverse_trans #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description it perform log scaling and reverse the axis. Useful to plot negative log probabilities. To not be used directly but with ggplot (e.g. scale_y_continuous(trans = "log10_reverse") ) #' @@ -1019,7 +1019,7 @@ log10_reverse_trans <- function() { #' logit scale #' -#' \lifecycle{maturing} +#' `r lifecycle::badge("maturing")` #' #' @description it perform logit scaling with right axis formatting. To not be used directly but with ggplot (e.g. scale_y_continuous(trans = "log10_reverse") ) #' @@ -1115,6 +1115,14 @@ add_scaled_counts_bulk.get_low_expressed <- function(.data, factor_of_interest = enquo(factor_of_interest) + # Check if package is installed, otherwise install + if (find.package("edgeR", quiet = TRUE) %>% length %>% equals(0)) { + message("Installing edgeR needed for differential transcript abundance analyses") + if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager", repos = "https://cloud.r-project.org") + BiocManager::install("edgeR", ask = FALSE) + } + # Check if factor_of_interest is continuous and exists string_factor_of_interest = diff --git a/R/zzz.R b/R/zzz.R index caa0f4d7..cfa9fcb7 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -5,8 +5,10 @@ msg = paste0("======================================== ", pkgname, " version ", version, " If you use TIDYBULK in published research, please cite: + Mangiola et al. tidybulk: an R tidy framework for modular transcriptomic data analysis. Genome Biology 2021. + This message can be suppressed by: suppressPackageStartupMessages(library(tidybulk)) ======================================== diff --git a/README.Rmd b/README.Rmd index 280983fe..47e9a0ea 100755 --- a/README.Rmd +++ b/README.Rmd @@ -76,7 +76,7 @@ Utilities | Description `impute_missing_abundance` | Impute abundance for missing data points using sample groupings `fill_missing_abundance` | Fill abundance for missing data points using an arbitrary value -All functions are directly compatible with `SummarizedExperiment` object. +All functions are directly compatibble with `SummarizedExperiment` object. ```{r, echo=FALSE, include=FALSE, } @@ -108,6 +108,8 @@ my_theme = axis.title.y = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)) ) +tibble_counts = tidybulk::counts_SE %>% tidybulk() %>% as_tibble() + ``` ## Installation @@ -235,9 +237,9 @@ x <- x[o[1L:top],,drop=FALSE] norm_counts.table = norm_counts.table[rownames(x)] -norm_counts.table$cell_type = tidybulk::counts[ +norm_counts.table$cell_type = tibble_counts[ match( - tidybulk::counts$sample, + tibble_counts$sample, rownames(norm_counts.table) ), "Cell.type" @@ -275,8 +277,8 @@ cmds = cmds %$% cmdscale.out %>% setNames(sprintf("Dim%s", 1:6)) -cmds$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(cmds)), +cmds$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(cmds)), "Cell.type" ] ``` @@ -357,8 +359,8 @@ tsne = Rtsne::Rtsne( perplexity=10, pca_scale =TRUE )$Y -tsne$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(tsne)), +tsne$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(tsne)), "Cell.type" ] ``` @@ -537,8 +539,8 @@ results <- CIBERSORT( "mixture_file.txt", perm=100, QN=TRUE ) -results$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(results)), +results$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(results)), "Cell.type" ] @@ -638,8 +640,8 @@ count_m_log = log(count_m + 1) k = kmeans(count_m_log, iter.max = 1000, ...) cluster = k$cluster -cluster$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(cluster)), +cluster$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(cluster)), c("Cell.type", "Dim1", "Dim2") ] @@ -683,8 +685,8 @@ snn = FindNeighbors(snn) snn = FindClusters(snn, method = "igraph", ...) snn = snn[["seurat_clusters"]] -snn$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(snn)), +snn$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(snn)), c("Cell.type", "Dim1", "Dim2") ] diff --git a/README.md b/README.md index 84f0081e..bc659376 100755 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ Please have a look also to | `impute_missing_abundance` | Impute abundance for missing data points using sample groupings | | `fill_missing_abundance` | Fill abundance for missing data points using an arbitrary value | -All functions are directly compatible with `SummarizedExperiment` +All functions are directly compatibble with `SummarizedExperiment` object. ## Installation diff --git a/data/breast_tcga_mini.rda b/data/breast_tcga_mini.rda deleted file mode 100755 index 6fe6473d..00000000 Binary files a/data/breast_tcga_mini.rda and /dev/null differ diff --git a/data/counts.rda b/data/counts.rda deleted file mode 100755 index 7e6b5e25..00000000 Binary files a/data/counts.rda and /dev/null differ diff --git a/data/counts_mini.rda b/data/counts_mini.rda deleted file mode 100755 index df000e4e..00000000 Binary files a/data/counts_mini.rda and /dev/null differ diff --git a/dev/dplyr-master-methods.R b/dev/dplyr-master-methods.R index a9d5656e..22fbb5ba 100755 --- a/dev/dplyr-master-methods.R +++ b/dev/dplyr-master-methods.R @@ -268,7 +268,7 @@ dplyr::arrange_if #' #' @examples #' -#' distinct(tidybulk::counts_mini) +#' tidybulk::se_mini %>% tidybulk() %>% distinct() #' #' #' @export diff --git a/doc/introduction.R b/doc/introduction.R index 1a6d47d0..d5e7462c 100644 --- a/doc/introduction.R +++ b/doc/introduction.R @@ -27,6 +27,8 @@ my_theme = axis.title.y = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)) ) +tibble_counts = tidybulk::counts_SE %>% tidybulk() %>% as_tibble() + ## ----eval=FALSE--------------------------------------------------------------- # BiocManager::install("tidybulk") @@ -73,9 +75,9 @@ counts_SE.norm.variable = counts_SE.norm %>% keep_variable() # # norm_counts.table = norm_counts.table[rownames(x)] # -# norm_counts.table$cell_type = tidybulk::counts[ +# norm_counts.table$cell_type = tibble_counts[ # match( -# tidybulk::counts$sample, +# tibble_counts$sample, # rownames(norm_counts.table) # ), # "Cell.type" @@ -97,8 +99,8 @@ counts_SE.norm.MDS = # cmdscale.out %>% # setNames(sprintf("Dim%s", 1:6)) # -# cmds$cell_type = tidybulk::counts[ -# match(tidybulk::counts$sample, rownames(cmds)), +# cmds$cell_type = tibble_counts[ +# match(tibble_counts$sample, rownames(cmds)), # "Cell.type" # ] @@ -152,8 +154,8 @@ counts_SE.norm.tSNE = # perplexity=10, # pca_scale =TRUE # )$Y -# tsne$cell_type = tidybulk::counts[ -# match(tidybulk::counts$sample, rownames(tsne)), +# tsne$cell_type = tibble_counts[ +# match(tibble_counts$sample, rownames(tsne)), # "Cell.type" # ] @@ -272,8 +274,8 @@ counts_SE.cibersort = # "mixture_file.txt", # perm=100, QN=TRUE # ) -# results$cell_type = tidybulk::counts[ -# match(tidybulk::counts$sample, rownames(results)), +# results$cell_type = tibble_counts[ +# match(tibble_counts$sample, rownames(results)), # "Cell.type" # ] # @@ -292,11 +294,11 @@ counts_SE.cibersort %>% my_theme + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), aspect.ratio=1/5) -## ----DC----------------------------------------------------------------------- - - counts_SE %>% - test_differential_cellularity(. ~ condition ) - +## ----DC, eval=FALSE----------------------------------------------------------- +# +# counts_SE %>% +# test_differential_cellularity(. ~ condition ) +# ## ----DC_censored, eval=FALSE-------------------------------------------------- # @@ -314,8 +316,8 @@ counts_SE.norm.cluster = counts_SE.norm.MDS %>% # k = kmeans(count_m_log, iter.max = 1000, ...) # cluster = k$cluster # -# cluster$cell_type = tidybulk::counts[ -# match(tidybulk::counts$sample, rownames(cluster)), +# cluster$cell_type = tibble_counts[ +# match(tibble_counts$sample, rownames(cluster)), # c("Cell.type", "Dim1", "Dim2") # ] # @@ -346,8 +348,8 @@ counts_SE.norm.SNN = # snn = FindClusters(snn, method = "igraph", ...) # snn = snn[["seurat_clusters"]] # -# snn$cell_type = tidybulk::counts[ -# match(tidybulk::counts$sample, rownames(snn)), +# snn$cell_type = tibble_counts[ +# match(tibble_counts$sample, rownames(snn)), # c("Cell.type", "Dim1", "Dim2") # ] # @@ -441,7 +443,9 @@ counts_SE.norm.non_redundant %>% counts_ensembl %>% ensembl_to_symbol(ens) ## ----description-------------------------------------------------------------- -counts_SE %>% describe_transcript() %>% select(transcript, description, everything()) +counts_SE %>% + describe_transcript() %>% + select(feature, description, everything()) ## ----------------------------------------------------------------------------- sessionInfo() diff --git a/doc/introduction.Rmd b/doc/introduction.Rmd index 223528a8..758f53ef 100755 --- a/doc/introduction.Rmd +++ b/doc/introduction.Rmd @@ -73,7 +73,7 @@ Utilities | Description `impute_missing_abundance` | Impute abundance for missing data points using sample groupings `fill_missing_abundance` | Fill abundance for missing data points using an arbitrary value -All functions are directly compatible with `SummarizedExperiment` object. +All functions are directly compatibble with `SummarizedExperiment` object. ```{r, echo=FALSE, include=FALSE, } @@ -105,6 +105,8 @@ my_theme = axis.title.y = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)) ) +tibble_counts = tidybulk::counts_SE %>% tidybulk() %>% as_tibble() + ``` ## Installation @@ -232,9 +234,9 @@ x <- x[o[1L:top],,drop=FALSE] norm_counts.table = norm_counts.table[rownames(x)] -norm_counts.table$cell_type = tidybulk::counts[ +norm_counts.table$cell_type = tibble_counts[ match( - tidybulk::counts$sample, + tibble_counts$sample, rownames(norm_counts.table) ), "Cell.type" @@ -272,8 +274,8 @@ cmds = cmds %$% cmdscale.out %>% setNames(sprintf("Dim%s", 1:6)) -cmds$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(cmds)), +cmds$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(cmds)), "Cell.type" ] ``` @@ -354,8 +356,8 @@ tsne = Rtsne::Rtsne( perplexity=10, pca_scale =TRUE )$Y -tsne$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(tsne)), +tsne$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(tsne)), "Cell.type" ] ``` @@ -534,8 +536,8 @@ results <- CIBERSORT( "mixture_file.txt", perm=100, QN=TRUE ) -results$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(results)), +results$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(results)), "Cell.type" ] @@ -564,7 +566,7 @@ counts_SE.cibersort %>% We can also perform a statistical test on the differential cell-type abundance across conditions -```{r DC} +```{r DC, eval=FALSE} counts_SE %>% test_differential_cellularity(. ~ condition ) @@ -601,8 +603,8 @@ count_m_log = log(count_m + 1) k = kmeans(count_m_log, iter.max = 1000, ...) cluster = k$cluster -cluster$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(cluster)), +cluster$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(cluster)), c("Cell.type", "Dim1", "Dim2") ] @@ -646,8 +648,8 @@ snn = FindNeighbors(snn) snn = FindClusters(snn, method = "igraph", ...) snn = snn[["seurat_clusters"]] -snn$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(snn)), +snn$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(snn)), c("Cell.type", "Dim1", "Dim2") ] @@ -789,7 +791,9 @@ counts_ensembl %>% ensembl_to_symbol(ens) We can add gene full name (and in future description) from symbol identifiers. This currently works for human and mouse. ```{r description} -counts_SE %>% describe_transcript() %>% select(transcript, description, everything()) +counts_SE %>% + describe_transcript() %>% + select(feature, description, everything()) ``` ## Appendix diff --git a/doc/introduction.html b/doc/introduction.html index 7a5ee08c..c38e93a3 100644 --- a/doc/introduction.html +++ b/doc/introduction.html @@ -350,7 +350,7 @@

Functions/utilities available

-

All functions are directly compatible with SummarizedExperiment object.

+

All functions are directly compatibble with SummarizedExperiment object.

Installation

@@ -371,19 +371,20 @@

Data

counts_SE
 
-
## # A tibble abstraction: 408,624 x 8
-##    transcript sample     count Cell.type time  condition batch factor_of_intere…
-##    <chr>      <chr>      <dbl> <fct>     <fct> <lgl>     <fct> <lgl>            
-##  1 A1BG       SRR1740034   153 b_cell    0 d   TRUE      0     TRUE             
-##  2 A1BG-AS1   SRR1740034    83 b_cell    0 d   TRUE      0     TRUE             
-##  3 AAAS       SRR1740034   868 b_cell    0 d   TRUE      0     TRUE             
-##  4 AACS       SRR1740034   222 b_cell    0 d   TRUE      0     TRUE             
-##  5 AAGAB      SRR1740034   590 b_cell    0 d   TRUE      0     TRUE             
-##  6 AAMDC      SRR1740034    48 b_cell    0 d   TRUE      0     TRUE             
-##  7 AAMP       SRR1740034  1257 b_cell    0 d   TRUE      0     TRUE             
-##  8 AANAT      SRR1740034   284 b_cell    0 d   TRUE      0     TRUE             
-##  9 AAR2       SRR1740034   379 b_cell    0 d   TRUE      0     TRUE             
-## 10 AARS2      SRR1740034   685 b_cell    0 d   TRUE      0     TRUE             
+
## # A SummarizedExperiment-tibble abstraction: 408,624 x 8
+## # Transcripts=8513 | Samples=48 | Assays=count
+##    feature  sample     count Cell.type time  condition batch factor_of_interest
+##    <chr>    <chr>      <dbl> <fct>     <fct> <lgl>     <fct> <lgl>             
+##  1 A1BG     SRR1740034   153 b_cell    0 d   TRUE      0     TRUE              
+##  2 A1BG-AS1 SRR1740034    83 b_cell    0 d   TRUE      0     TRUE              
+##  3 AAAS     SRR1740034   868 b_cell    0 d   TRUE      0     TRUE              
+##  4 AACS     SRR1740034   222 b_cell    0 d   TRUE      0     TRUE              
+##  5 AAGAB    SRR1740034   590 b_cell    0 d   TRUE      0     TRUE              
+##  6 AAMDC    SRR1740034    48 b_cell    0 d   TRUE      0     TRUE              
+##  7 AAMP     SRR1740034  1257 b_cell    0 d   TRUE      0     TRUE              
+##  8 AANAT    SRR1740034   284 b_cell    0 d   TRUE      0     TRUE              
+##  9 AAR2     SRR1740034   379 b_cell    0 d   TRUE      0     TRUE              
+## 10 AARS2    SRR1740034   685 b_cell    0 d   TRUE      0     TRUE              
 ## # … with 40 more rows
 
@@ -503,9 +504,9 @@

Filter variable transcripts

norm_counts.table = norm_counts.table[rownames(x)] -norm_counts.table$cell_type = tidybulk::counts[ +norm_counts.table$cell_type = tibble_counts[ match( - tidybulk::counts$sample, + tibble_counts$sample, rownames(norm_counts.table) ), "Cell.type" @@ -545,8 +546,8 @@

Reduce dimensions

cmdscale.out %>% setNames(sprintf(“Dim%s”, 1:6)) -cmds$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(cmds)), +cmds$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(cmds)), “Cell.type” ] “ @@ -560,18 +561,18 @@

Reduce dimensions

## # A tibble: 48 x 15
-##      Dim1   Dim2   Dim3     Dim4    Dim5    Dim6 sample Cell.type time 
-##     <dbl>  <dbl>  <dbl>    <dbl>   <dbl>   <dbl> <chr>  <chr>     <chr>
-##  1 -1.46   0.220 -1.68  -0.0553   0.0658 -0.126  SRR17… b_cell    0 d  
-##  2 -1.46   0.226 -1.71  -0.0300   0.0454 -0.137  SRR17… b_cell    1 d  
-##  3 -1.44   0.193 -1.60  -0.0890   0.0503 -0.121  SRR17… b_cell    3 d  
-##  4 -1.44   0.198 -1.67  -0.0891   0.0543 -0.110  SRR17… b_cell    7 d  
-##  5  0.243 -1.42   0.182 -0.00642 -0.503  -0.131  SRR17… dendriti… 0 d  
-##  6  0.191 -1.42   0.195 -0.0180  -0.457  -0.130  SRR17… dendriti… 1 d  
-##  7  0.257 -1.42   0.152 -0.0130  -0.582  -0.0927 SRR17… dendriti… 3 d  
-##  8  0.162 -1.43   0.189 -0.0232  -0.452  -0.109  SRR17… dendriti… 7 d  
-##  9  0.516 -1.47   0.240  0.251    0.457  -0.119  SRR17… monocyte  0 d  
-## 10  0.514 -1.41   0.231  0.219    0.458  -0.131  SRR17… monocyte  1 d  
+##      Dim1   Dim2   Dim3     Dim4    Dim5    Dim6 sample    Cell.type       time 
+##     <dbl>  <dbl>  <dbl>    <dbl>   <dbl>   <dbl> <chr>     <chr>           <chr>
+##  1 -1.46   0.220 -1.68  -0.0553   0.0658 -0.126  SRR17400… b_cell          0 d  
+##  2 -1.46   0.226 -1.71  -0.0300   0.0454 -0.137  SRR17400… b_cell          1 d  
+##  3 -1.44   0.193 -1.60  -0.0890   0.0503 -0.121  SRR17400… b_cell          3 d  
+##  4 -1.44   0.198 -1.67  -0.0891   0.0543 -0.110  SRR17400… b_cell          7 d  
+##  5  0.243 -1.42   0.182 -0.00642 -0.503  -0.131  SRR17400… dendritic_myel… 0 d  
+##  6  0.191 -1.42   0.195 -0.0180  -0.457  -0.130  SRR17400… dendritic_myel… 1 d  
+##  7  0.257 -1.42   0.152 -0.0130  -0.582  -0.0927 SRR17400… dendritic_myel… 3 d  
+##  8  0.162 -1.43   0.189 -0.0232  -0.452  -0.109  SRR17400… dendritic_myel… 7 d  
+##  9  0.516 -1.47   0.240  0.251    0.457  -0.119  SRR17400… monocyte        0 d  
+## 10  0.514 -1.41   0.231  0.219    0.458  -0.131  SRR17400… monocyte        1 d  
 ## # … with 38 more rows, and 6 more variables: condition <chr>, batch <chr>,
 ## #   factor_of_interest <chr>, merged.transcripts <dbl>, TMM <dbl>,
 ## #   multiplier <dbl>
@@ -618,22 +619,22 @@ 

Reduce dimensions

counts_SE.norm.PCA %>% pivot_sample() %>% select(contains("PC"), everything())
 
-
## # A tibble: 48 x 16
-##       PC1   PC2    PC3    PC4    PC5   PC6 sample Cell.type time  condition
-##     <dbl> <dbl>  <dbl>  <dbl>  <dbl> <dbl> <chr>  <chr>     <chr> <chr>    
-##  1 -32.7  -4.93 -37.5  -1.24   -1.47 -2.81 SRR17… b_cell    0 d   TRUE     
-##  2 -32.7  -5.05 -38.1  -0.672  -1.02 -3.06 SRR17… b_cell    1 d   TRUE     
-##  3 -32.2  -4.32 -35.8  -1.99   -1.12 -2.70 SRR17… b_cell    3 d   TRUE     
-##  4 -32.3  -4.43 -37.3  -1.99   -1.21 -2.45 SRR17… b_cell    7 d   TRUE     
-##  5   5.44 31.8    4.08 -0.144  11.3  -2.94 SRR17… dendriti… 0 d   FALSE    
-##  6   4.28 31.7    4.35 -0.403  10.2  -2.91 SRR17… dendriti… 1 d   FALSE    
-##  7   5.74 31.7    3.40 -0.290  13.0  -2.07 SRR17… dendriti… 3 d   FALSE    
-##  8   3.62 32.1    4.23 -0.519  10.1  -2.43 SRR17… dendriti… 7 d   FALSE    
-##  9  11.5  32.8    5.37  5.60  -10.2  -2.66 SRR17… monocyte  0 d   FALSE    
-## 10  11.5  31.6    5.16  4.90  -10.2  -2.92 SRR17… monocyte  1 d   FALSE    
-## # … with 38 more rows, and 6 more variables: batch <chr>,
+
## # A tibble: 48 x 15
+##        PC1   PC2    PC3     PC4    PC5   PC6 sample  Cell.type   time  condition
+##      <dbl> <dbl>  <dbl>   <dbl>  <dbl> <dbl> <chr>   <chr>       <chr> <chr>    
+##  1 -12.6   -2.52 -14.9  -0.424  -0.592 -1.22 SRR174… b_cell      0 d   TRUE     
+##  2 -12.6   -2.57 -15.2  -0.140  -0.388 -1.30 SRR174… b_cell      1 d   TRUE     
+##  3 -12.6   -2.41 -14.5  -0.714  -0.344 -1.10 SRR174… b_cell      3 d   TRUE     
+##  4 -12.5   -2.34 -14.9  -0.816  -0.427 -1.00 SRR174… b_cell      7 d   TRUE     
+##  5   0.189 13.0    1.66 -0.0269  4.64  -1.35 SRR174… dendritic_… 0 d   FALSE    
+##  6  -0.293 12.9    1.76 -0.0727  4.21  -1.28 SRR174… dendritic_… 1 d   FALSE    
+##  7   0.407 13.0    1.42 -0.0529  5.37  -1.01 SRR174… dendritic_… 3 d   FALSE    
+##  8  -0.620 13.0    1.73 -0.201   4.17  -1.07 SRR174… dendritic_… 7 d   FALSE    
+##  9   2.56  13.5    2.32  2.03   -4.32  -1.22 SRR174… monocyte    0 d   FALSE    
+## 10   2.65  13.1    2.21  1.80   -4.29  -1.30 SRR174… monocyte    1 d   FALSE    
+## # … with 38 more rows, and 5 more variables: batch <chr>,
 ## #   factor_of_interest <chr>, merged.transcripts <dbl>, TMM <dbl>,
-## #   multiplier <dbl>, sample.x <chr>
+## #   multiplier <dbl>
 
counts_SE.norm.PCA %>%
@@ -641,7 +642,7 @@ 

Reduce dimensions

GGally::ggpairs(columns = 11:13, ggplot2::aes(colour=`Cell.type`))
-

plot of chunk plot_pca

+

plot of chunk plot_pca

tSNE

@@ -673,8 +674,8 @@

Reduce dimensions

perplexity=10, pca_scale =TRUE )$Y -tsne$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(tsne)), +tsne$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(tsne)), “Cell.type” ] “ @@ -689,19 +690,19 @@

Reduce dimensions

select(contains("tSNE"), everything())
-
## # A tibble: 251 x 5
-##      tSNE1  tSNE2 sample                       Call  sample.x                   
-##      <dbl>  <dbl> <chr>                        <fct> <chr>                      
-##  1   8.68   -3.70 TCGA-A1-A0SD-01A-11R-A115-07 LumA  TCGA-A1-A0SD-01A-11R-A115-…
-##  2  -9.59   -4.43 TCGA-A1-A0SF-01A-11R-A144-07 LumA  TCGA-A1-A0SF-01A-11R-A144-…
-##  3   2.00   -1.98 TCGA-A1-A0SG-01A-11R-A144-07 LumA  TCGA-A1-A0SG-01A-11R-A144-…
-##  4  -3.39  -13.0  TCGA-A1-A0SH-01A-11R-A084-07 LumA  TCGA-A1-A0SH-01A-11R-A084-…
-##  5   0.519  12.0  TCGA-A1-A0SI-01A-11R-A144-07 LumB  TCGA-A1-A0SI-01A-11R-A144-…
-##  6  -1.82    1.00 TCGA-A1-A0SJ-01A-11R-A084-07 LumA  TCGA-A1-A0SJ-01A-11R-A084-…
-##  7 -12.8    31.1  TCGA-A1-A0SK-01A-12R-A084-07 Basal TCGA-A1-A0SK-01A-12R-A084-…
-##  8   8.74    6.88 TCGA-A1-A0SM-01A-11R-A084-07 LumA  TCGA-A1-A0SM-01A-11R-A084-…
-##  9   7.96    8.30 TCGA-A1-A0SN-01A-11R-A144-07 LumB  TCGA-A1-A0SN-01A-11R-A144-…
-## 10   1.56  -23.1  TCGA-A1-A0SQ-01A-21R-A144-07 LumA  TCGA-A1-A0SQ-01A-21R-A144-…
+
## # A tibble: 251 x 4
+##     tSNE1   tSNE2 sample                       Call 
+##     <dbl>   <dbl> <chr>                        <fct>
+##  1 -15.0   -7.03  TCGA-A1-A0SD-01A-11R-A115-07 LumA 
+##  2   4.39 -10.2   TCGA-A1-A0SF-01A-11R-A144-07 LumA 
+##  3 -16.1   10.4   TCGA-A1-A0SG-01A-11R-A144-07 LumA 
+##  4  -1.42  -1.15  TCGA-A1-A0SH-01A-11R-A084-07 LumA 
+##  5  -4.45  -0.940 TCGA-A1-A0SI-01A-11R-A144-07 LumB 
+##  6   7.01   9.68  TCGA-A1-A0SJ-01A-11R-A084-07 LumA 
+##  7  39.1   -4.71  TCGA-A1-A0SK-01A-12R-A084-07 Basal
+##  8   2.60   0.187 TCGA-A1-A0SM-01A-11R-A084-07 LumA 
+##  9   9.85  10.7   TCGA-A1-A0SN-01A-11R-A144-07 LumB 
+## 10 -24.2   15.8   TCGA-A1-A0SQ-01A-21R-A144-07 LumA 
 ## # … with 241 more rows
 
@@ -710,7 +711,7 @@

Reduce dimensions

ggplot(aes(x = `tSNE1`, y = `tSNE2`, color=Call)) + geom_point() + my_theme
-

plot of chunk unnamed-chunk-11

+

plot of chunk unnamed-chunk-11

Rotate dimensions

@@ -886,8 +887,8 @@

Deconvolve Cell type composition

"mixture_file.txt", perm=100, QN=TRUE ) -results$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(results)), +results$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(results)), "Cell.type" ] ``` @@ -921,23 +922,6 @@

Test differential cell-type abundance

test_differential_cellularity(. ~ condition )
-
## # A tibble: 22 x 7
-##    .cell_type cell_type_propo… `estimate_(Inte… estimate_condit…
-##    <chr>      <list>                      <dbl>            <dbl>
-##  1 cibersort… <tibble [48 × 8…            -2.97            2.33 
-##  2 cibersort… <tibble [48 × 8…            -4.79            1.86 
-##  3 cibersort… <tibble [48 × 8…            -5.44           -0.503
-##  4 cibersort… <tibble [48 × 8…            -2.28            0.883
-##  5 cibersort… <tibble [48 × 8…            -2.79           -0.637
-##  6 cibersort… <tibble [48 × 8…            -2.60            0.320
-##  7 cibersort… <tibble [48 × 8…            -3.72            2.14 
-##  8 cibersort… <tibble [48 × 8…            -5.20           -0.251
-##  9 cibersort… <tibble [48 × 8…            -4.80            1.75 
-## 10 cibersort… <tibble [48 × 8…            -5.34           -0.219
-## # … with 12 more rows, and 3 more variables: std.error_conditionTRUE <dbl>,
-## #   statistic_conditionTRUE <dbl>, p.value_conditionTRUE <dbl>
-
-

We can also perform regression analysis with censored data (coxph).

    counts_SE %>%
@@ -968,8 +952,8 @@ 

Cluster samples

k = kmeans(count_m_log, iter.max = 1000, …) cluster = k$cluster -cluster$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(cluster)), +cluster$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(cluster)), c(“Cell.type”, “Dim1”, “Dim2”) ] “ @@ -1017,8 +1001,8 @@

Cluster samples

snn = FindClusters(snn, method = “igraph”, …) snn = snn[[“seurat_clusters”]] -snn$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(snn)), +snn$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(snn)), c(“Cell.type”, “Dim1”, “Dim2”) ] “ @@ -1031,19 +1015,19 @@

Cluster samples

select(contains("tSNE"), everything())
-
## # A tibble: 251 x 6
-##      tSNE1  tSNE2 sample                 Call  sample.x              cluster_SNN
-##      <dbl>  <dbl> <chr>                  <fct> <chr>                 <fct>      
-##  1   8.68   -3.70 TCGA-A1-A0SD-01A-11R-… LumA  TCGA-A1-A0SD-01A-11R… 0          
-##  2  -9.59   -4.43 TCGA-A1-A0SF-01A-11R-… LumA  TCGA-A1-A0SF-01A-11R… 1          
-##  3   2.00   -1.98 TCGA-A1-A0SG-01A-11R-… LumA  TCGA-A1-A0SG-01A-11R… 0          
-##  4  -3.39  -13.0  TCGA-A1-A0SH-01A-11R-… LumA  TCGA-A1-A0SH-01A-11R… 3          
-##  5   0.519  12.0  TCGA-A1-A0SI-01A-11R-… LumB  TCGA-A1-A0SI-01A-11R… 3          
-##  6  -1.82    1.00 TCGA-A1-A0SJ-01A-11R-… LumA  TCGA-A1-A0SJ-01A-11R… 0          
-##  7 -12.8    31.1  TCGA-A1-A0SK-01A-12R-… Basal TCGA-A1-A0SK-01A-12R… 1          
-##  8   8.74    6.88 TCGA-A1-A0SM-01A-11R-… LumA  TCGA-A1-A0SM-01A-11R… 3          
-##  9   7.96    8.30 TCGA-A1-A0SN-01A-11R-… LumB  TCGA-A1-A0SN-01A-11R… 3          
-## 10   1.56  -23.1  TCGA-A1-A0SQ-01A-21R-… LumA  TCGA-A1-A0SQ-01A-21R… 0          
+
## # A tibble: 251 x 5
+##     tSNE1   tSNE2 sample                       Call  cluster_SNN
+##     <dbl>   <dbl> <chr>                        <fct> <fct>      
+##  1 -15.0   -7.03  TCGA-A1-A0SD-01A-11R-A115-07 LumA  0          
+##  2   4.39 -10.2   TCGA-A1-A0SF-01A-11R-A144-07 LumA  1          
+##  3 -16.1   10.4   TCGA-A1-A0SG-01A-11R-A144-07 LumA  0          
+##  4  -1.42  -1.15  TCGA-A1-A0SH-01A-11R-A084-07 LumA  3          
+##  5  -4.45  -0.940 TCGA-A1-A0SI-01A-11R-A144-07 LumB  3          
+##  6   7.01   9.68  TCGA-A1-A0SJ-01A-11R-A084-07 LumA  0          
+##  7  39.1   -4.71  TCGA-A1-A0SK-01A-12R-A084-07 Basal 1          
+##  8   2.60   0.187 TCGA-A1-A0SM-01A-11R-A084-07 LumA  3          
+##  9   9.85  10.7   TCGA-A1-A0SN-01A-11R-A144-07 LumB  3          
+## 10 -24.2   15.8   TCGA-A1-A0SQ-01A-21R-A144-07 LumA  0          
 ## # … with 241 more rows
 
@@ -1054,7 +1038,7 @@

Cluster samples

ggplot(aes(x = `tSNE1`, y = `tSNE2`, color=Call)) + geom_point() + facet_grid(~source) + my_theme
-

plot of chunk SNN_plot

+

plot of chunk SNN_plot

# Do differential transcription between clusters
 counts_SE.norm.SNN %>%
@@ -1065,19 +1049,20 @@ 

Cluster samples

)
-
## # A tibble abstraction: 125,500 x 16
-##    transcript sample  count count_scaled Call  sample.x tSNE1 tSNE2 cluster_SNN
-##    <chr>      <chr>   <int>        <int> <fct> <chr>    <dbl> <dbl> <fct>      
-##  1 ENSG00000… TCGA-…  22114        22114 LumA  TCGA-A1…  8.68 -3.70 0          
-##  2 ENSG00000… TCGA-… 128257       128257 LumA  TCGA-A1…  8.68 -3.70 0          
-##  3 ENSG00000… TCGA-…  23971        23971 LumA  TCGA-A1…  8.68 -3.70 0          
-##  4 ENSG00000… TCGA-…  22518        22518 LumA  TCGA-A1…  8.68 -3.70 0          
-##  5 ENSG00000… TCGA-…  23250        23250 LumA  TCGA-A1…  8.68 -3.70 0          
-##  6 ENSG00000… TCGA-…  30039        30039 LumA  TCGA-A1…  8.68 -3.70 0          
-##  7 ENSG00000… TCGA-…  32987        32987 LumA  TCGA-A1…  8.68 -3.70 0          
-##  8 ENSG00000… TCGA-…  42292        42292 LumA  TCGA-A1…  8.68 -3.70 0          
-##  9 ENSG00000… TCGA-…  12417        12417 LumA  TCGA-A1…  8.68 -3.70 0          
-## 10 ENSG00000… TCGA-…  40820        40820 LumA  TCGA-A1…  8.68 -3.70 0          
+
## # A SummarizedExperiment-tibble abstraction: 125,500 x 15
+## # Transcripts=500 | Samples=251 | Assays=count, count_scaled
+##    feature sample  count count_scaled Call  tSNE1 tSNE2 cluster_SNN
+##    <chr>   <chr>   <int>        <int> <fct> <dbl> <dbl> <fct>      
+##  1 ENSG00… TCGA-…  22114        22114 LumA  -15.0 -7.03 0          
+##  2 ENSG00… TCGA-… 128257       128257 LumA  -15.0 -7.03 0          
+##  3 ENSG00… TCGA-…  23971        23971 LumA  -15.0 -7.03 0          
+##  4 ENSG00… TCGA-…  22518        22518 LumA  -15.0 -7.03 0          
+##  5 ENSG00… TCGA-…  23250        23250 LumA  -15.0 -7.03 0          
+##  6 ENSG00… TCGA-…  30039        30039 LumA  -15.0 -7.03 0          
+##  7 ENSG00… TCGA-…  32987        32987 LumA  -15.0 -7.03 0          
+##  8 ENSG00… TCGA-…  42292        42292 LumA  -15.0 -7.03 0          
+##  9 ENSG00… TCGA-…  12417        12417 LumA  -15.0 -7.03 0          
+## 10 ENSG00… TCGA-…  40820        40820 LumA  -15.0 -7.03 0          
 ## # … with 40 more rows, and 7 more variables: factor_of_interest <lgl>,
 ## #   .abundant <lgl>, logFC <dbl>, logCPM <dbl>, F <dbl>, PValue <dbl>,
 ## #   FDR <dbl>
@@ -1192,18 +1177,18 @@ 

From ensembl IDs to gene symbol IDs

## # A tibble: 119 x 8
-##    ens   iso   `read count` sample cases_0_project… cases_0_samples… transcript
-##    <chr> <chr>        <dbl> <chr>  <chr>            <chr>            <chr>     
-##  1 ENSG… 13             144 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
-##  2 ENSG… 13              72 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
-##  3 ENSG… 13               0 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
-##  4 ENSG… 13            1099 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
-##  5 ENSG… 13              11 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
-##  6 ENSG… 13               2 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
-##  7 ENSG… 13               3 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
-##  8 ENSG… 13            2678 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
-##  9 ENSG… 13             751 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
-## 10 ENSG… 13               1 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
+##    ens    iso   `read count` sample cases_0_project… cases_0_samples… transcript
+##    <chr>  <chr>        <dbl> <chr>  <chr>            <chr>            <chr>     
+##  1 ENSG0… 13             144 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
+##  2 ENSG0… 13              72 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
+##  3 ENSG0… 13               0 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
+##  4 ENSG0… 13            1099 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
+##  5 ENSG0… 13              11 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
+##  6 ENSG0… 13               2 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
+##  7 ENSG0… 13               3 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
+##  8 ENSG0… 13            2678 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
+##  9 ENSG0… 13             751 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
+## 10 ENSG0… 13               1 TARGE… Acute Myeloid L… Primary Blood D… TSPAN6    
 ## # … with 109 more rows, and 1 more variable: ref_genome <chr>
 
@@ -1211,22 +1196,25 @@

From gene symbol to gene description (gene name in full)

We can add gene full name (and in future description) from symbol identifiers. This currently works for human and mouse.

-
counts_SE %>% describe_transcript() %>% select(transcript, description, everything())
+
counts_SE %>% 
+    describe_transcript() %>% 
+    select(feature, description, everything())
 
-
## # A tibble abstraction: 408,624 x 9
-##    transcript sample count Cell.type time  condition batch factor_of_inter…
-##    <chr>      <chr>  <dbl> <fct>     <fct> <lgl>     <fct> <lgl>           
-##  1 A1BG       SRR17…   153 b_cell    0 d   TRUE      0     TRUE            
-##  2 A1BG-AS1   SRR17…    83 b_cell    0 d   TRUE      0     TRUE            
-##  3 AAAS       SRR17…   868 b_cell    0 d   TRUE      0     TRUE            
-##  4 AACS       SRR17…   222 b_cell    0 d   TRUE      0     TRUE            
-##  5 AAGAB      SRR17…   590 b_cell    0 d   TRUE      0     TRUE            
-##  6 AAMDC      SRR17…    48 b_cell    0 d   TRUE      0     TRUE            
-##  7 AAMP       SRR17…  1257 b_cell    0 d   TRUE      0     TRUE            
-##  8 AANAT      SRR17…   284 b_cell    0 d   TRUE      0     TRUE            
-##  9 AAR2       SRR17…   379 b_cell    0 d   TRUE      0     TRUE            
-## 10 AARS2      SRR17…   685 b_cell    0 d   TRUE      0     TRUE            
+
## # A SummarizedExperiment-tibble abstraction: 408,624 x 9
+## # Transcripts=8513 | Samples=48 | Assays=count
+##    feature sample count Cell.type time  condition batch factor_of_inter…
+##    <chr>   <chr>  <dbl> <fct>     <fct> <lgl>     <fct> <lgl>           
+##  1 A1BG    SRR17…   153 b_cell    0 d   TRUE      0     TRUE            
+##  2 A1BG-A… SRR17…    83 b_cell    0 d   TRUE      0     TRUE            
+##  3 AAAS    SRR17…   868 b_cell    0 d   TRUE      0     TRUE            
+##  4 AACS    SRR17…   222 b_cell    0 d   TRUE      0     TRUE            
+##  5 AAGAB   SRR17…   590 b_cell    0 d   TRUE      0     TRUE            
+##  6 AAMDC   SRR17…    48 b_cell    0 d   TRUE      0     TRUE            
+##  7 AAMP    SRR17…  1257 b_cell    0 d   TRUE      0     TRUE            
+##  8 AANAT   SRR17…   284 b_cell    0 d   TRUE      0     TRUE            
+##  9 AAR2    SRR17…   379 b_cell    0 d   TRUE      0     TRUE            
+## 10 AARS2   SRR17…   685 b_cell    0 d   TRUE      0     TRUE            
 ## # … with 40 more rows, and 1 more variable: description <chr>
 
@@ -1235,13 +1223,13 @@

Appendix

sessionInfo()
 
-
## R version 4.0.4 (2021-02-15)
+
## R version 4.0.5 (2021-03-31)
 ## Platform: x86_64-pc-linux-gnu (64-bit)
 ## Running under: CentOS Linux 7 (Core)
 ## 
 ## Matrix products: default
-## BLAS:   /stornext/System/data/apps/R/R-4.0.4/lib64/R/lib/libRblas.so
-## LAPACK: /stornext/System/data/apps/R/R-4.0.4/lib64/R/lib/libRlapack.so
+## BLAS:   /stornext/System/data/apps/R/R-4.0.5/lib64/R/lib/libRblas.so
+## LAPACK: /stornext/System/data/apps/R/R-4.0.5/lib64/R/lib/libRlapack.so
 ## 
 ## locale:
 ##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
@@ -1256,70 +1244,69 @@ 

Appendix

## [8] methods base ## ## other attached packages: -## [1] tidySummarizedExperiment_0.99.3 SummarizedExperiment_1.20.0 -## [3] Biobase_2.50.0 GenomicRanges_1.42.0 -## [5] GenomeInfoDb_1.26.2 IRanges_2.24.1 -## [7] S4Vectors_0.28.1 BiocGenerics_0.36.0 -## [9] MatrixGenerics_1.2.1 matrixStats_0.58.0 -## [11] tidybulk_1.2.1 ggrepel_0.9.1 -## [13] ggplot2_3.3.3 magrittr_2.0.1 -## [15] tibble_3.0.6 tidyr_1.1.2 -## [17] dplyr_1.0.4 knitr_1.31 +## [1] tidySummarizedExperiment_1.2.1 SummarizedExperiment_1.20.0 +## [3] Biobase_2.50.0 GenomicRanges_1.42.0 +## [5] GenomeInfoDb_1.26.4 IRanges_2.24.1 +## [7] S4Vectors_0.28.1 BiocGenerics_0.36.0 +## [9] MatrixGenerics_1.2.1 matrixStats_0.58.0 +## [11] tidybulk_1.3.1 ggrepel_0.9.1 +## [13] ggplot2_3.3.3 magrittr_2.0.1 +## [15] tibble_3.1.0 tidyr_1.1.3 +## [17] dplyr_1.0.5 knitr_1.31 ## ## loaded via a namespace (and not attached): -## [1] utf8_1.1.4 reticulate_1.18 tidyselect_1.1.0 -## [4] RSQLite_2.2.3 AnnotationDbi_1.52.0 htmlwidgets_1.5.3 -## [7] grid_4.0.4 BiocParallel_1.24.1 Rtsne_0.15 +## [1] utf8_1.2.1 reticulate_1.18 tidyselect_1.1.0 +## [4] RSQLite_2.2.5 AnnotationDbi_1.52.0 htmlwidgets_1.5.3 +## [7] grid_4.0.5 BiocParallel_1.24.1 Rtsne_0.15 ## [10] widyr_0.1.3 devtools_2.3.2 munsell_0.5.0 ## [13] codetools_0.2-18 ica_1.0-2 preprocessCore_1.52.1 ## [16] future_1.21.0 miniUI_0.1.1.1 withr_2.4.1 ## [19] colorspace_2.0-0 highr_0.8 rstudioapi_0.13 -## [22] Seurat_4.0.0 ROCR_1.0-11 tensor_1.5 +## [22] Seurat_4.0.1 ROCR_1.0-11 tensor_1.5 ## [25] listenv_0.8.0 labeling_0.4.2 GenomeInfoDbData_1.2.4 -## [28] polyclip_1.10-0 bit64_4.0.5 farver_2.0.3 -## [31] betareg_3.1-4 rprojroot_2.0.2 parallelly_1.23.0 -## [34] vctrs_0.3.6 generics_0.1.0 xfun_0.21 -## [37] R6_2.5.0 markdown_1.1 locfit_1.5-9.4 -## [40] flexmix_2.3-17 bitops_1.0-6 spatstat.utils_2.0-0 -## [43] cachem_1.0.4 reshape_0.8.8 DelayedArray_0.16.1 -## [46] assertthat_0.2.1 promises_1.2.0.1 scales_1.1.1 -## [49] nnet_7.3-15 gtable_0.3.0 org.Mm.eg.db_3.12.0 -## [52] globals_0.14.0 sva_3.38.0 processx_3.4.5 -## [55] goftest_1.2-2 sandwich_3.0-0 rlang_0.4.10 -## [58] genefilter_1.72.1 splines_4.0.4 lazyeval_0.2.2 -## [61] broom_0.7.4 reshape2_1.4.4 abind_1.4-5 -## [64] tidytext_0.3.0 backports_1.2.1 httpuv_1.5.5 -## [67] tokenizers_0.2.1 tools_4.0.4 usethis_2.0.1 -## [70] ellipsis_0.3.1 RColorBrewer_1.1-2 sessioninfo_1.1.1 -## [73] ggridges_0.5.3 Rcpp_1.0.6 plyr_1.8.6 -## [76] zlibbioc_1.36.0 purrr_0.3.4 RCurl_1.98-1.2 -## [79] ps_1.5.0 prettyunits_1.1.1 rpart_4.1-15 -## [82] deldir_0.2-10 pbapply_1.4-3 cowplot_1.1.1 -## [85] zoo_1.8-8 SeuratObject_4.0.0 cluster_2.1.0 -## [88] fs_1.5.0 data.table_1.14.0 scattermore_0.7 -## [91] lmtest_0.9-38 RANN_2.6.1 SnowballC_0.7.0 -## [94] fitdistrplus_1.1-3 pkgload_1.1.0 hms_1.0.0 -## [97] patchwork_1.1.1 mime_0.10 evaluate_0.14 -## [100] xtable_1.8-4 XML_3.99-0.5 gridExtra_2.3 -## [103] testthat_3.0.2 compiler_4.0.4 KernSmooth_2.23-18 -## [106] crayon_1.4.1 htmltools_0.5.1.1 mgcv_1.8-33 -## [109] later_1.1.0.1 Formula_1.2-4 DBI_1.1.1 -## [112] MASS_7.3-53 Matrix_1.3-2 readr_1.4.0 -## [115] cli_2.3.0 igraph_1.2.6 pkgconfig_2.0.3 -## [118] plotly_4.9.3 annotate_1.68.0 XVector_0.30.0 -## [121] janeaustenr_0.1.5 stringr_1.4.0 callr_3.5.1 -## [124] digest_0.6.27 sctransform_0.3.2.9003 RcppAnnoy_0.0.18 -## [127] spatstat.data_2.0-0 leiden_0.3.7 uwot_0.1.10 -## [130] edgeR_3.32.1 modeltools_0.2-23 shiny_1.6.0 -## [133] lifecycle_1.0.0 nlme_3.1-152 jsonlite_1.7.2 -## [136] desc_1.2.0 viridisLite_0.3.0 limma_3.46.0 -## [139] fansi_0.4.2 pillar_1.4.7 lattice_0.20-41 -## [142] GGally_2.1.0 fastmap_1.1.0 httr_1.4.2 -## [145] pkgbuild_1.2.0 survival_3.2-7 glue_1.4.2 -## [148] remotes_2.2.0 spatstat_1.64-1 png_0.1-7 -## [151] bit_4.0.4 class_7.3-18 stringi_1.5.3 -## [154] blob_1.2.1 org.Hs.eg.db_3.12.0 memoise_2.0.0 -## [157] irlba_2.3.3 e1071_1.7-4 future.apply_1.7.0 +## [28] polyclip_1.10-0 bit64_4.0.5 farver_2.1.0 +## [31] rprojroot_2.0.2 parallelly_1.24.0 vctrs_0.3.7 +## [34] generics_0.1.0 xfun_0.22 R6_2.5.0 +## [37] markdown_1.1 locfit_1.5-9.4 bitops_1.0-6 +## [40] spatstat.utils_2.1-0 cachem_1.0.4 reshape_0.8.8 +## [43] DelayedArray_0.16.3 assertthat_0.2.1 promises_1.2.0.1 +## [46] scales_1.1.1 gtable_0.3.0 org.Mm.eg.db_3.12.0 +## [49] globals_0.14.0 sva_3.38.0 processx_3.5.1 +## [52] goftest_1.2-2 rlang_0.4.10 genefilter_1.72.1 +## [55] splines_4.0.5 lazyeval_0.2.2 spatstat.geom_2.0-1 +## [58] broom_0.7.6 reshape2_1.4.4 abind_1.4-5 +## [61] tidytext_0.3.0 backports_1.2.1 httpuv_1.5.5 +## [64] tokenizers_0.2.1 tools_4.0.5 usethis_2.0.1 +## [67] ellipsis_0.3.1 spatstat.core_2.0-0 RColorBrewer_1.1-2 +## [70] proxy_0.4-25 sessioninfo_1.1.1 ggridges_0.5.3 +## [73] Rcpp_1.0.6 plyr_1.8.6 zlibbioc_1.36.0 +## [76] purrr_0.3.4 RCurl_1.98-1.3 ps_1.6.0 +## [79] prettyunits_1.1.1 rpart_4.1-15 deldir_0.2-10 +## [82] pbapply_1.4-3 cowplot_1.1.1 zoo_1.8-9 +## [85] SeuratObject_4.0.0 cluster_2.1.1 fs_1.5.0 +## [88] data.table_1.14.0 scattermore_0.7 lmtest_0.9-38 +## [91] RANN_2.6.1 SnowballC_0.7.0 fitdistrplus_1.1-3 +## [94] pkgload_1.2.0 hms_1.0.0 patchwork_1.1.1 +## [97] mime_0.10 evaluate_0.14 xtable_1.8-4 +## [100] XML_3.99-0.6 gridExtra_2.3 testthat_3.0.2 +## [103] compiler_4.0.5 KernSmooth_2.23-18 crayon_1.4.1 +## [106] htmltools_0.5.1.1 mgcv_1.8-34 later_1.1.0.1 +## [109] DBI_1.1.1 MASS_7.3-53.1 Matrix_1.3-2 +## [112] readr_1.4.0 cli_2.4.0 igraph_1.2.6 +## [115] pkgconfig_2.0.3 plotly_4.9.3 spatstat.sparse_2.0-0 +## [118] annotate_1.68.0 XVector_0.30.0 janeaustenr_0.1.5 +## [121] stringr_1.4.0 callr_3.6.0 digest_0.6.27 +## [124] sctransform_0.3.2.9003 RcppAnnoy_0.0.18 spatstat.data_2.1-0 +## [127] leiden_0.3.7 uwot_0.1.10 edgeR_3.32.1 +## [130] shiny_1.6.0 lifecycle_1.0.0 nlme_3.1-152 +## [133] jsonlite_1.7.2 desc_1.3.0 viridisLite_0.3.0 +## [136] limma_3.46.0 fansi_0.4.2 pillar_1.5.1 +## [139] lattice_0.20-41 GGally_2.1.1 fastmap_1.1.0 +## [142] httr_1.4.2 pkgbuild_1.2.0 survival_3.2-10 +## [145] glue_1.4.2 remotes_2.3.0 png_0.1-7 +## [148] bit_4.0.4 class_7.3-18 stringi_1.5.3 +## [151] blob_1.2.1 org.Hs.eg.db_3.12.0 memoise_2.0.0 +## [154] irlba_2.3.3 e1071_1.7-6 future.apply_1.7.0
diff --git a/man/adjust_abundance-methods.Rd b/man/adjust_abundance-methods.Rd index fc9d57d5..73a7e6c1 100644 --- a/man/adjust_abundance-methods.Rd +++ b/man/adjust_abundance-methods.Rd @@ -110,7 +110,7 @@ A `SummarizedExperiment` object adjust_abundance() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with an edditional adjusted abundance column. This method uses scaled counts if present. } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This function adjusts the abundance for (known) unwanted variation. At the moment just an unwanted covariate is allowed at a time using Combat (DOI: 10.1093/bioinformatics/bts034) @@ -122,9 +122,9 @@ Underlying method: -cm = tidybulk::counts_mini +cm = tidybulk::se_mini cm$batch = 0 -cm$batch[cm$sample \%in\% c("SRR1740035", "SRR1740043")] = 1 +cm$batch[colnames(cm) \%in\% c("SRR1740035", "SRR1740043")] = 1 res = cm \%>\% diff --git a/man/aggregate_duplicates-methods.Rd b/man/aggregate_duplicates-methods.Rd index f45ce762..a219492d 100644 --- a/man/aggregate_duplicates-methods.Rd +++ b/man/aggregate_duplicates-methods.Rd @@ -94,7 +94,7 @@ A `SummarizedExperiment` object aggregate_duplicates() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with aggregated transcripts that were duplicated. } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This function aggregates duplicated transcripts (e.g., isoforms, ensembl). For example, we often have to convert ensembl symbols to gene/transcript symbol, @@ -112,10 +112,7 @@ For example, we often have to convert ensembl symbols to gene/transcript symbol, \examples{ aggregate_duplicates( - tidybulk::counts_mini, - sample, - transcript, - `count`, + tidybulk::se_mini, aggregation_function = sum ) diff --git a/man/as_matrix.Rd b/man/as_matrix.Rd index 6f260722..063eb51c 100644 --- a/man/as_matrix.Rd +++ b/man/as_matrix.Rd @@ -21,6 +21,8 @@ Get matrix from tibble } \examples{ -as_matrix(head(dplyr::select(tidybulk::counts_mini, transcript, count)), rownames=transcript) +library(dplyr) + +tidybulk::se_mini \%>\% tidybulk() \%>\% select(feature, count) \%>\% head \%>\% as_matrix(rownames=feature) } diff --git a/man/breast_tcga_mini.Rd b/man/breast_tcga_mini.Rd deleted file mode 100644 index 74fcd18d..00000000 --- a/man/breast_tcga_mini.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{breast_tcga_mini} -\alias{breast_tcga_mini} -\title{Data set} -\format{ -An object of class \code{tbl_df} (inherits from \code{tbl}, \code{data.frame}) with 125500 rows and 5 columns. -} -\usage{ -breast_tcga_mini -} -\description{ -Data set -} -\keyword{datasets} diff --git a/man/cluster_elements-methods.Rd b/man/cluster_elements-methods.Rd index a8b1dc55..9ea9e6ea 100644 --- a/man/cluster_elements-methods.Rd +++ b/man/cluster_elements-methods.Rd @@ -118,7 +118,7 @@ A `SummarizedExperiment` object cluster_elements() takes as input a `tbl` formatted as | | | | <...> | and identify clusters in the data. } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` identifies clusters in the data, normally of samples. This function returns a tibble with additional columns for the cluster annotation. @@ -139,6 +139,6 @@ Seurat::FindClusters(method = "igraph", ...) \examples{ - cluster_elements(tidybulk::counts_mini, sample, transcript, count, centers = 2, method="kmeans") + cluster_elements(tidybulk::se_mini, centers = 2, method="kmeans") } diff --git a/man/counts.Rd b/man/counts.Rd deleted file mode 100644 index 230b5a4e..00000000 --- a/man/counts.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{counts} -\alias{counts} -\title{Example data set} -\format{ -An object of class \code{tbl_df} (inherits from \code{tbl}, \code{data.frame}) with 408624 rows and 8 columns. -} -\usage{ -counts -} -\description{ -Example data set -} -\keyword{datasets} diff --git a/man/counts_mini.Rd b/man/counts_mini.Rd deleted file mode 100644 index 95855cc4..00000000 --- a/man/counts_mini.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{counts_mini} -\alias{counts_mini} -\title{Example data set reduced} -\format{ -An object of class \code{spec_tbl_df} (inherits from \code{tbl_df}, \code{tbl}, \code{data.frame}) with 2635 rows and 6 columns. -} -\usage{ -counts_mini -} -\description{ -Example data set reduced -} -\keyword{datasets} diff --git a/man/deconvolve_cellularity-methods.Rd b/man/deconvolve_cellularity-methods.Rd index 4ff30bd2..6b8bef3f 100644 --- a/man/deconvolve_cellularity-methods.Rd +++ b/man/deconvolve_cellularity-methods.Rd @@ -118,7 +118,7 @@ A `SummarizedExperiment` object deconvolve_cellularity() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with the estimated cell type abundance for each sample } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This function infers the cell type composition of our samples (with the algorithm Cibersort; Newman et al., 10.1038/nmeth.3337). @@ -128,8 +128,10 @@ CIBERSORT(Y = data, X = reference, ...) } \examples{ +library(dplyr) + # Subsetting for time efficiency -deconvolve_cellularity(filter(tidybulk::counts, sample=="SRR1740034"), sample, transcript, `count`, cores = 1) +tidybulk::se_mini \%>\% tidybulk() \%>\% filter(sample=="SRR1740034") \%>\% deconvolve_cellularity(sample, feature, count, cores = 1) } diff --git a/man/describe_transcript-methods.Rd b/man/describe_transcript-methods.Rd index c06d974e..1d784ebe 100644 --- a/man/describe_transcript-methods.Rd +++ b/man/describe_transcript-methods.Rd @@ -62,6 +62,6 @@ describe_transcript } \examples{ -describe_transcript(tidybulk::counts_mini, .transcript = transcript) +describe_transcript(tidybulk::se_mini) } diff --git a/man/distinct-methods.Rd b/man/distinct-methods.Rd index 54d6d6c1..5f3d0486 100644 --- a/man/distinct-methods.Rd +++ b/man/distinct-methods.Rd @@ -18,7 +18,7 @@ distinct } \examples{ -distinct(tidybulk::counts_mini) +tidybulk::se_mini \%>\% tidybulk() \%>\% distinct() } diff --git a/man/dplyr-methods.Rd b/man/dplyr-methods.Rd index 72ffa26d..20999c40 100644 --- a/man/dplyr-methods.Rd +++ b/man/dplyr-methods.Rd @@ -24,7 +24,7 @@ Left join datasets } \examples{ `\%>\%` = magrittr::`\%>\%` -annotation = tidybulk::counts \%>\% distinct(sample) \%>\% mutate(source = "AU") -tidybulk::counts \%>\% left_join(annotation) +annotation = tidybulk::counts_SE \%>\% tidybulk() \%>\% as_tibble() \%>\% distinct(sample) \%>\% mutate(source = "AU") +tidybulk::counts_SE \%>\% tidybulk() \%>\% as_tibble() \%>\% left_join(annotation) } diff --git a/man/ensembl_to_symbol-methods.Rd b/man/ensembl_to_symbol-methods.Rd index 5a9609bd..ac06772c 100644 --- a/man/ensembl_to_symbol-methods.Rd +++ b/man/ensembl_to_symbol-methods.Rd @@ -42,8 +42,10 @@ This is useful since different resources use ensembl IDs while others use gene s } \examples{ +library(dplyr) - ensembl_to_symbol(tidybulk::counts_ensembl, ens) +tidybulk::counts_SE \%>\% tidybulk() \%>\% as_tibble() \%>\% ensembl_to_symbol(feature) + } diff --git a/man/fill_missing_abundance-methods.Rd b/man/fill_missing_abundance-methods.Rd index 3306274f..941487b6 100644 --- a/man/fill_missing_abundance-methods.Rd +++ b/man/fill_missing_abundance-methods.Rd @@ -70,7 +70,7 @@ This function fills the abundance of missing sample-transcript pair using the me } \examples{ -fill_missing_abundance(tidybulk::counts_mini, sample, transcript, count, fill_with = 0) +tidybulk::se_mini \%>\% tidybulk() \%>\% fill_missing_abundance( fill_with = 0) } diff --git a/man/get_bibliography-methods.Rd b/man/get_bibliography-methods.Rd index f67f983e..8fe3a54f 100644 --- a/man/get_bibliography-methods.Rd +++ b/man/get_bibliography-methods.Rd @@ -45,14 +45,14 @@ A `tbl` with additional columns for the statistics from the hypothesis test (e.g get_bibliography() takes as input a `tidybulk` } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This methods returns the bibliography list of your workflow from the internals of a tidybulk tibble (attr(., "internals")) } \examples{ # Define tidybulk tibble -df = tidybulk(tidybulk::counts_mini, sample, transcript, count) +df = tidybulk(tidybulk::se_mini) get_bibliography(df) diff --git a/man/get_differential_transcript_abundance_bulk.Rd b/man/get_differential_transcript_abundance_bulk.Rd index ebe48671..170f7ab0 100755 --- a/man/get_differential_transcript_abundance_bulk.Rd +++ b/man/get_differential_transcript_abundance_bulk.Rd @@ -15,7 +15,8 @@ get_differential_transcript_abundance_bulk( test_above_log2_fold_change = NULL, scaling_method = "TMM", omit_contrast_in_colnames = FALSE, - prefix = "" + prefix = "", + .sample_total_read_count = NULL ) } \arguments{ @@ -38,6 +39,8 @@ get_differential_transcript_abundance_bulk( \item{scaling_method}{A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile")} \item{omit_contrast_in_colnames}{If just one contrast is specified you can choose to omit the contrast label in the colnames.} + +\item{.sample_total_read_count}{} } \value{ A tibble with edgeR results diff --git a/man/get_differential_transcript_abundance_bulk_SE.Rd b/man/get_differential_transcript_abundance_bulk_SE.Rd index 5e62a708..58f9ed46 100755 --- a/man/get_differential_transcript_abundance_bulk_SE.Rd +++ b/man/get_differential_transcript_abundance_bulk_SE.Rd @@ -13,7 +13,8 @@ get_differential_transcript_abundance_bulk_SE( test_above_log2_fold_change = NULL, scaling_method = "TMM", omit_contrast_in_colnames = FALSE, - prefix = "" + prefix = "", + ... ) } \arguments{ diff --git a/man/get_reduced_dimensions_MDS_bulk_SE.Rd b/man/get_reduced_dimensions_MDS_bulk_SE.Rd index 027e4ce8..7b6ce8fd 100644 --- a/man/get_reduced_dimensions_MDS_bulk_SE.Rd +++ b/man/get_reduced_dimensions_MDS_bulk_SE.Rd @@ -9,7 +9,8 @@ get_reduced_dimensions_MDS_bulk_SE( .dims = 2, top = 500, of_samples = TRUE, - log_transform = TRUE + log_transform = TRUE, + scale = NULL ) } \arguments{ @@ -23,6 +24,8 @@ get_reduced_dimensions_MDS_bulk_SE( \item{log_transform}{A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)} +\item{scale}{A boolean} + \item{.abundance}{A column symbol with the value the clustering is based on (e.g., `count`)} \item{.feature}{A column symbol. The column that is represents entities to cluster (i.e., normally genes)} diff --git a/man/get_reduced_dimensions_TSNE_bulk_SE.Rd b/man/get_reduced_dimensions_TSNE_bulk_SE.Rd index 77cc88d7..4fe2727d 100644 --- a/man/get_reduced_dimensions_TSNE_bulk_SE.Rd +++ b/man/get_reduced_dimensions_TSNE_bulk_SE.Rd @@ -10,6 +10,7 @@ get_reduced_dimensions_TSNE_bulk_SE( top = 500, of_samples = TRUE, log_transform = TRUE, + scale = NULL, ... ) } @@ -24,6 +25,8 @@ get_reduced_dimensions_TSNE_bulk_SE( \item{log_transform}{A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)} +\item{scale}{A boolean} + \item{...}{Further parameters passed to the function Rtsne} \item{.abundance}{A column symbol with the value the clustering is based on (e.g., `count`)} diff --git a/man/identify_abundant-methods.Rd b/man/identify_abundant-methods.Rd index 37986367..d51e8d76 100644 --- a/man/identify_abundant-methods.Rd +++ b/man/identify_abundant-methods.Rd @@ -102,7 +102,7 @@ A `SummarizedExperiment` object identify_abundant() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with additional columns for the statistics from the hypothesis test. } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` At the moment this function uses edgeR (DOI: 10.1093/bioinformatics/btp616) @@ -119,10 +119,7 @@ At the moment this function uses edgeR (DOI: 10.1093/bioinformatics/btp616) identify_abundant( - tidybulk::counts_mini, - sample, - transcript, - `count` + tidybulk::se_mini ) diff --git a/man/impute_missing_abundance-methods.Rd b/man/impute_missing_abundance-methods.Rd index 520aff75..4e6a0aa7 100644 --- a/man/impute_missing_abundance-methods.Rd +++ b/man/impute_missing_abundance-methods.Rd @@ -74,7 +74,7 @@ A `SummarizedExperiment` object impute_missing_abundance() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with an edditional adjusted abundance column. This method uses scaled counts if present. } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This function imputes the abundance of missing sample-transcript pair using the median of the sample group defined by the formula } @@ -83,11 +83,8 @@ This function imputes the abundance of missing sample-transcript pair using the res = impute_missing_abundance( - tidybulk::counts_mini, - ~ condition, - .sample = sample, - .transcript = transcript, - .abundance = count + tidybulk::se_mini, + ~ condition ) diff --git a/man/join-methods.Rd b/man/join-methods.Rd index 3f030446..6febe09c 100644 --- a/man/join-methods.Rd +++ b/man/join-methods.Rd @@ -34,15 +34,15 @@ Full join datasets } \examples{ `\%>\%` = magrittr::`\%>\%` -annotation = tidybulk::counts \%>\% distinct(sample) \%>\% mutate(source = "AU") -tidybulk::counts \%>\% inner_join(annotation) +annotation = tidybulk::counts_SE \%>\% tidybulk() \%>\% as_tibble() \%>\% distinct(sample) \%>\% mutate(source = "AU") +tidybulk::counts_SE \%>\% tidybulk() \%>\% as_tibble() \%>\% inner_join(annotation) `\%>\%` = magrittr::`\%>\%` -annotation = tidybulk::counts \%>\% distinct(sample) \%>\% mutate(source = "AU") -tidybulk::counts \%>\% right_join(annotation) +annotation = tidybulk::counts_SE \%>\% tidybulk() \%>\% as_tibble() \%>\% distinct(sample) \%>\% mutate(source = "AU") +tidybulk::counts_SE \%>\% tidybulk() \%>\% as_tibble() \%>\% right_join(annotation) `\%>\%` = magrittr::`\%>\%` -annotation = tidybulk::counts \%>\% distinct(sample) \%>\% mutate(source = "AU") -tidybulk::counts \%>\% full_join(annotation) +annotation = tidybulk::counts_SE \%>\% tidybulk() \%>\% as_tibble() \%>\% distinct(sample) \%>\% mutate(source = "AU") +tidybulk::counts_SE \%>\% tidybulk() \%>\% as_tibble() \%>\% full_join(annotation) } diff --git a/man/keep_abundant-methods.Rd b/man/keep_abundant-methods.Rd index bc488a30..58e6d7ec 100644 --- a/man/keep_abundant-methods.Rd +++ b/man/keep_abundant-methods.Rd @@ -119,10 +119,7 @@ At the moment this function uses edgeR (DOI: 10.1093/bioinformatics/btp616) keep_abundant( - tidybulk::counts_mini, - sample, - transcript, - `count` + tidybulk::se_mini ) diff --git a/man/keep_variable-methods.Rd b/man/keep_variable-methods.Rd index 4f1fe7d4..1fa77264 100644 --- a/man/keep_variable-methods.Rd +++ b/man/keep_variable-methods.Rd @@ -86,7 +86,7 @@ A `SummarizedExperiment` object keep_variable() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with additional columns for the statistics from the hypothesis test. } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` At the moment this function uses edgeR \url{https://doi.org/10.1093/bioinformatics/btp616} } @@ -95,10 +95,7 @@ At the moment this function uses edgeR \url{https://doi.org/10.1093/bioinformati keep_variable( - tidybulk::counts_mini, - sample, - transcript, - `count`, + tidybulk::se_mini, top = 500 ) diff --git a/man/log10_reverse_trans.Rd b/man/log10_reverse_trans.Rd index 7474bc34..f5df8d5d 100644 --- a/man/log10_reverse_trans.Rd +++ b/man/log10_reverse_trans.Rd @@ -13,7 +13,7 @@ A scales object it perform log scaling and reverse the axis. Useful to plot negative log probabilities. To not be used directly but with ggplot (e.g. scale_y_continuous(trans = "log10_reverse") ) } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` } \examples{ diff --git a/man/logit_trans.Rd b/man/logit_trans.Rd index f9164922..205d19e8 100644 --- a/man/logit_trans.Rd +++ b/man/logit_trans.Rd @@ -13,7 +13,7 @@ A scales object it perform logit scaling with right axis formatting. To not be used directly but with ggplot (e.g. scale_y_continuous(trans = "log10_reverse") ) } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` } \examples{ diff --git a/man/nest-methods.Rd b/man/nest-methods.Rd index 59cc5b2b..b81e5897 100644 --- a/man/nest-methods.Rd +++ b/man/nest-methods.Rd @@ -9,7 +9,7 @@ \item{cols}{<[`tidy-select`][tidyr_tidy_select]> Columns to unnest. If you `unnest()` multiple columns, parallel entries must be of -compatible sizes, i.e. they're either equal or length 1 (following the +compatibble sizes, i.e. they're either equal or length 1 (following the standard tidyverse recycling rules).} \item{names_sep}{If `NULL`, the default, the names will be left @@ -54,10 +54,10 @@ nest library(dplyr) -nest(tidybulk(tidybulk::counts_mini, sample, transcript, count), data = -transcript) \%>\% +tidybulk::se_mini \%>\% tidybulk() \%>\% nest( data = -feature) \%>\% unnest(data) -nest(tidybulk(tidybulk::counts_mini, sample, transcript, count), data = -transcript) +tidybulk::se_mini \%>\% tidybulk() \%>\% nest( data = -feature) } diff --git a/man/pivot_sample-methods.Rd b/man/pivot_sample-methods.Rd index c8c7c0ba..28c4a03f 100644 --- a/man/pivot_sample-methods.Rd +++ b/man/pivot_sample-methods.Rd @@ -44,17 +44,14 @@ A `tbl` object pivot_sample() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with only sample-related columns } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This functon extracts only sample-related information for downstream analysis (e.g., visualisation). It is disruptive in the sense that it cannot be passed anymore to tidybulk function. } \examples{ - pivot_sample( - tidybulk::counts_mini, - .sample = sample - ) + pivot_sample(tidybulk::se_mini ) } diff --git a/man/pivot_transcript-methods.Rd b/man/pivot_transcript-methods.Rd index e3ce4caa..86c3225a 100644 --- a/man/pivot_transcript-methods.Rd +++ b/man/pivot_transcript-methods.Rd @@ -44,17 +44,14 @@ A `tbl` object pivot_transcript() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with only sample-related columns } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This functon extracts only transcript-related information for downstream analysis (e.g., visualisation). It is disruptive in the sense that it cannot be passed anymore to tidybulk function. } \examples{ - pivot_transcript( - tidybulk::counts_mini, - .transcript = transcript - ) + pivot_transcript(tidybulk::se_mini ) } diff --git a/man/reduce_dimensions-methods.Rd b/man/reduce_dimensions-methods.Rd index c576992d..1c65df78 100644 --- a/man/reduce_dimensions-methods.Rd +++ b/man/reduce_dimensions-methods.Rd @@ -142,7 +142,7 @@ A `SummarizedExperiment` object reduce_dimensions() takes as input a `tbl` formatted as | | | | <...> | and calculates the reduced dimensional space of the transcript abundance. } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This function reduces the dimensions of the transcript abundances. It can use multi-dimensional scaling (MDS; DOI.org/10.1186/gb-2010-11-3-r25), @@ -162,15 +162,13 @@ Rtsne::Rtsne(data, ...) counts.MDS = - tidybulk::counts_mini \%>\% - tidybulk(sample, transcript, count) \%>\% + tidybulk::se_mini \%>\% identify_abundant() \%>\% reduce_dimensions( method="MDS", .dims = 3) counts.PCA = - tidybulk::counts_mini \%>\% - tidybulk(sample, transcript, count) \%>\% + tidybulk::se_mini \%>\% identify_abundant() \%>\% reduce_dimensions(method="PCA", .dims = 3) diff --git a/man/remove_redundancy-methods.Rd b/man/remove_redundancy-methods.Rd index 64e7c1b0..4ad632cd 100644 --- a/man/remove_redundancy-methods.Rd +++ b/man/remove_redundancy-methods.Rd @@ -139,8 +139,7 @@ remove_redundancy() takes as input a `tbl` formatted as | | \% - tidybulk(sample, transcript, count) \%>\% + tidybulk::se_mini \%>\% identify_abundant() \%>\% remove_redundancy( .element = sample, @@ -150,8 +149,7 @@ remove_redundancy() takes as input a `tbl` formatted as | | \% - tidybulk(sample, transcript, count) \%>\% + tidybulk::se_mini \%>\% identify_abundant() \%>\% reduce_dimensions( method="MDS", .dims = 3) diff --git a/man/rotate_dimensions-methods.Rd b/man/rotate_dimensions-methods.Rd index 0e93ab08..e7949b1c 100644 --- a/man/rotate_dimensions-methods.Rd +++ b/man/rotate_dimensions-methods.Rd @@ -118,7 +118,7 @@ A `SummarizedExperiment` object rotate_dimensions() takes as input a `tbl` formatted as | | | <...> | and calculates the rotated dimensional space of the transcript abundance. } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This function to rotate two dimensions such as the reduced dimensions. @@ -136,8 +136,7 @@ Underlying custom method: \examples{ counts.MDS = - tidybulk::counts_mini \%>\% - tidybulk(sample, transcript, count) \%>\% + tidybulk::se_mini \%>\% identify_abundant() \%>\% reduce_dimensions( method="MDS", .dims = 3) diff --git a/man/scale_abundance-methods.Rd b/man/scale_abundance-methods.Rd index ed194836..272fb288 100644 --- a/man/scale_abundance-methods.Rd +++ b/man/scale_abundance-methods.Rd @@ -92,7 +92,7 @@ A `SummarizedExperiment` object scale_abundance() takes as input a `tbl` formatted as | | | | <...> | and Scales transcript abundance compansating for sequencing depth (e.g., with TMM algorithm, Robinson and Oshlack doi.org/10.1186/gb-2010-11-3-r25). } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` Scales transcript abundance compensating for sequencing depth (e.g., with TMM algorithm, Robinson and Oshlack doi.org/10.1186/gb-2010-11-3-r25). @@ -106,8 +106,7 @@ edgeR::calcNormFactors(.data, method = c("TMM","TMMwsp","RLE","upperquartile")) \examples{ - tidybulk::counts_mini \%>\% - tidybulk(sample, transcript, count) \%>\% + tidybulk::se_mini \%>\% identify_abundant() \%>\% scale_abundance() diff --git a/man/symbol_to_entrez.Rd b/man/symbol_to_entrez.Rd index 2a646118..1eeb1ca6 100644 --- a/man/symbol_to_entrez.Rd +++ b/man/symbol_to_entrez.Rd @@ -21,6 +21,6 @@ Get ENTREZ id from gene SYMBOL } \examples{ -symbol_to_entrez(tidybulk::counts_mini, .transcript = transcript, .sample = sample) +tidybulk::se_mini \%>\% tidybulk() \%>\% as_tibble() \%>\% symbol_to_entrez(.transcript = feature, .sample = sample) } diff --git a/man/test_differential_abundance-methods.Rd b/man/test_differential_abundance-methods.Rd index 07076bb7..6656016a 100755 --- a/man/test_differential_abundance-methods.Rd +++ b/man/test_differential_abundance-methods.Rd @@ -23,6 +23,7 @@ test_differential_abundance( omit_contrast_in_colnames = FALSE, prefix = "", action = "add", + ..., significance_threshold = NULL, fill_missing_values = NULL ) @@ -40,6 +41,7 @@ test_differential_abundance( omit_contrast_in_colnames = FALSE, prefix = "", action = "add", + ..., significance_threshold = NULL, fill_missing_values = NULL ) @@ -57,6 +59,7 @@ test_differential_abundance( omit_contrast_in_colnames = FALSE, prefix = "", action = "add", + ..., significance_threshold = NULL, fill_missing_values = NULL ) @@ -74,6 +77,7 @@ test_differential_abundance( omit_contrast_in_colnames = FALSE, prefix = "", action = "add", + ..., significance_threshold = NULL, fill_missing_values = NULL ) @@ -81,23 +85,37 @@ test_differential_abundance( \S4method{test_differential_abundance}{SummarizedExperiment}( .data, .formula, + .sample = NULL, + .transcript = NULL, + .abundance = NULL, .contrasts = NULL, method = "edgeR_quasi_likelihood", test_above_log2_fold_change = NULL, scaling_method = "TMM", omit_contrast_in_colnames = FALSE, - prefix = "" + prefix = "", + action = "add", + ..., + significance_threshold = NULL, + fill_missing_values = NULL ) \S4method{test_differential_abundance}{RangedSummarizedExperiment}( .data, .formula, + .sample = NULL, + .transcript = NULL, + .abundance = NULL, .contrasts = NULL, method = "edgeR_quasi_likelihood", test_above_log2_fold_change = NULL, scaling_method = "TMM", omit_contrast_in_colnames = FALSE, - prefix = "" + prefix = "", + action = "add", + ..., + significance_threshold = NULL, + fill_missing_values = NULL ) } \arguments{ @@ -125,6 +143,8 @@ test_differential_abundance( \item{action}{A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get).} +\item{...}{Further arguments passed to some of the internal functions. Currently, it is needed just for internal debug.} + \item{significance_threshold}{DEPRECATED - A real between 0 and 1 (usually 0.05).} \item{fill_missing_values}{DEPRECATED - A boolean. Whether to fill missing sample/transcript values with the median of the transcript. This is rarely needed.} @@ -146,7 +166,7 @@ A `SummarizedExperiment` object test_differential_abundance() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with additional columns for the statistics from the hypothesis test. } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This function provides the option to use edgeR \url{https://doi.org/10.1093/bioinformatics/btp616}, limma-voom \url{https://doi.org/10.1186/gb-2014-15-2-r29}, limma_voom_sample_weights \url{https://doi.org/10.1093/nar/gkv412} or DESeq2 \url{https://doi.org/10.1186/s13059-014-0550-8} to perform the testing. All methods use raw counts, irrespective of if scale_abundance or adjust_abundance have been calculated, therefore it is essential to add covariates such as batch effects (if applicable) in the formula. @@ -191,15 +211,13 @@ DESeq2::results() # edgeR - tidybulk::counts_mini \%>\% - tidybulk(sample, transcript, count) \%>\% + tidybulk::se_mini \%>\% identify_abundant() \%>\% test_differential_abundance( ~ condition ) # The function `test_differential_abundance` operates with contrasts too - tidybulk::counts_mini \%>\% - tidybulk(sample, transcript, count) \%>\% + tidybulk::se_mini \%>\% identify_abundant() \%>\% test_differential_abundance( ~ 0 + condition, @@ -208,15 +226,16 @@ DESeq2::results() # DESeq2 - equivalent for limma-voom - tidybulk::counts_mini \%>\% - tidybulk(sample, transcript, count) \%>\% +my_se_mini = tidybulk::se_mini +my_se_mini$condition = factor(my_se_mini$condition) + +my_se_mini \%>\% identify_abundant() \%>\% test_differential_abundance( ~ condition, method="deseq2" ) # The function `test_differential_abundance` operates with contrasts too - tidybulk::counts_mini \%>\% - tidybulk(sample, transcript, count) \%>\% + my_se_mini \%>\% identify_abundant() \%>\% test_differential_abundance( ~ 0 + condition, diff --git a/man/test_differential_cellularity-methods.Rd b/man/test_differential_cellularity-methods.Rd index 396e436e..bbd0aefd 100644 --- a/man/test_differential_cellularity-methods.Rd +++ b/man/test_differential_cellularity-methods.Rd @@ -118,7 +118,7 @@ A `SummarizedExperiment` object test_differential_cellularity() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with additional columns for the statistics from the hypothesis test. } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This routine applies a deconvolution method (e.g., Cibersort; DOI: 10.1038/nmeth.3337) and passes the proportions inferred into a generalised linear model (DOI:dx.doi.org/10.1007/s11749-010-0189-z) @@ -153,11 +153,8 @@ deconvolve_cellularity( # Regular regression test_differential_cellularity( - tidybulk::counts_mini, + tidybulk::se_mini , . ~ condition, - sample, - transcript, - count, cores = 1 ) @@ -165,7 +162,8 @@ deconvolve_cellularity( library(dplyr) library(tidyr) -tidybulk::counts_mini \%>\% +tidybulk::se_mini \%>\% + tidybulk() \%>\% # Add survival data nest(data = -sample) \%>\% @@ -178,9 +176,6 @@ tidybulk::counts_mini \%>\% # Test test_differential_cellularity( survival::Surv(days, dead) ~ ., - sample, - transcript, - count, cores = 1 ) diff --git a/man/test_gene_enrichment-methods.Rd b/man/test_gene_enrichment-methods.Rd index 13082cc8..b5ca69b0 100644 --- a/man/test_gene_enrichment-methods.Rd +++ b/man/test_gene_enrichment-methods.Rd @@ -118,7 +118,7 @@ A `tbl` object test_gene_enrichment() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with the additional transcript symbol column } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This wrapper execute ensemble gene enrichment analyses of the dataset using EGSEA (DOI:0.12688/f1000research.12544.1) @@ -154,7 +154,7 @@ dge %>% \examples{ \dontrun{ -df_entrez = symbol_to_entrez(tidybulk::counts_mini, .transcript = transcript, .sample = sample) +df_entrez = tidybulk::se_mini \%>\% tidybulk() \%>\% as_tibble() \%>\% symbol_to_entrez( .transcript = feature, .sample = sample) df_entrez = aggregate_duplicates(df_entrez, aggregation_function = sum, .sample = sample, .transcript = entrez, .abundance = count) library("EGSEA") diff --git a/man/test_gene_overrepresentation-methods.Rd b/man/test_gene_overrepresentation-methods.Rd index 30887203..e678fced 100644 --- a/man/test_gene_overrepresentation-methods.Rd +++ b/man/test_gene_overrepresentation-methods.Rd @@ -94,7 +94,7 @@ A `RangedSummarizedExperiment` object test_gene_overrepresentation() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with the GSEA statistics } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This wrapper execute gene enrichment analyses of the dataset using a list of transcripts and GSEA. This wrapper uses clusterProfiler (DOI: doi.org/10.1089/omi.2011.0118) on the back-end. @@ -114,9 +114,9 @@ Undelying method: } \examples{ -df_entrez = symbol_to_entrez(tidybulk::counts_mini, .transcript = transcript, .sample = sample) +df_entrez = tidybulk::se_mini \%>\% tidybulk() \%>\% as_tibble() \%>\% symbol_to_entrez( .transcript = feature, .sample = sample) df_entrez = aggregate_duplicates(df_entrez, aggregation_function = sum, .sample = sample, .transcript = entrez, .abundance = count) -df_entrez = mutate(df_entrez, do_test = transcript \%in\% c("TNFRSF4", "PLCH2", "PADI4", "PAX7")) +df_entrez = mutate(df_entrez, do_test = feature \%in\% c("TNFRSF4", "PLCH2", "PADI4", "PAX7")) test_gene_overrepresentation( df_entrez, diff --git a/man/test_stratification_cellularity-methods.Rd b/man/test_stratification_cellularity-methods.Rd index 9945c179..429ea3b9 100644 --- a/man/test_stratification_cellularity-methods.Rd +++ b/man/test_stratification_cellularity-methods.Rd @@ -110,7 +110,7 @@ A `tbl` with additional columns for the statistics from the hypothesis test (e.g test_stratification_cellularity() takes as input a `tbl` formatted as | | | | <...> | and returns a `tbl` with additional columns for the statistics from the hypothesis test. } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This routine applies a deconvolution method (e.g., Cibersort; DOI: 10.1038/nmeth.3337) and passes the proportions inferred into a generalised linear model (DOI:dx.doi.org/10.1007/s11749-010-0189-z) @@ -135,7 +135,8 @@ deconvolve_cellularity( library(dplyr) library(tidyr) - tidybulk::counts_mini \%>\% +tidybulk::se_mini \%>\% + tidybulk() \%>\% # Add survival data nest(data = -sample) \%>\% @@ -146,9 +147,6 @@ mutate( unnest(data) \%>\% test_stratification_cellularity( survival::Surv(days, dead) ~ ., - sample, - transcript, - count, cores = 1 ) diff --git a/man/tidybulk-methods.Rd b/man/tidybulk-methods.Rd index 81587460..2953feed 100644 --- a/man/tidybulk-methods.Rd +++ b/man/tidybulk-methods.Rd @@ -45,7 +45,7 @@ A `tidybulk` object tidybulk() creates a `tt` object from a `tbl` formatted as | | | | <...> | } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This function creates a tidybulk object and is useful if you want to avoid to specify .sample, .transcript and .abundance arguments all the times. @@ -57,7 +57,7 @@ arguments are stored as metadata. They can be extracted as attr(, "inter -my_tt = tidybulk(tidybulk::counts_mini, sample, transcript, count) +my_tt = tidybulk(tidybulk::se_mini) } diff --git a/man/tidybulk_SAM_BAM-methods.Rd b/man/tidybulk_SAM_BAM-methods.Rd index d2570c5f..dba0c05c 100644 --- a/man/tidybulk_SAM_BAM-methods.Rd +++ b/man/tidybulk_SAM_BAM-methods.Rd @@ -26,7 +26,7 @@ A `tidybulk` object tidybulk_SAM_BAM() creates a `tt` object from a `tbl` formatted as | | | | <...> | } \details{ -\lifecycle{maturing} +`r lifecycle::badge("maturing")` This function is based on FeatureCounts package (DOI: 10.1093/bioinformatics/btt656). This function creates a tidybulk object and is useful if you want to avoid to specify .sample, .transcript and .abundance arguments all the times. diff --git a/tests/testthat/test-bulk_methods.R b/tests/testthat/test-bulk_methods.R index 087c06c9..2ae7eb96 100755 --- a/tests/testthat/test-bulk_methods.R +++ b/tests/testthat/test-bulk_methods.R @@ -1,8 +1,11 @@ context('Bulk methods') -input_df = setNames(tidybulk::counts_mini, c("a", "b", "Cell type", "c", "time" , "condition")) +data("se_mini") +data("breast_tcga_mini_SE") -input_df_breast = setNames(tidybulk::breast_tcga_mini, c("a", "b", "c norm", "call", "c")) +input_df = se_mini %>% tidybulk() %>% as_tibble() %>% setNames(c("b","a", "c", "Cell type", "time" , "condition")) + +input_df_breast = breast_tcga_mini_SE %>% tidybulk() %>% as_tibble() %>% setNames(c( "b","a", "c", "c norm", "call")) test_that("Creating tt object from tibble, number of parameters, methods",{ @@ -172,7 +175,7 @@ test_that("filter variable - no object",{ ) expect_equal( - sort(unique(res$b)), + as.character(sort(unique(res$b))), c("FCN1", "IGHD", "IGHM", "IGKC", "TCL1A") ) @@ -373,7 +376,7 @@ test_that("Add differential trancript abundance - no object",{ expect_equal( dplyr::pull(dplyr::slice(distinct(res, b, logFC), 1:4) , "logFC"), - c(NA , 5.477110, -6.079712 , NA), + c(3.597633, 2.473975, 2.470380, NA), tolerance=1e-6 ) @@ -783,7 +786,7 @@ test_that("Get entrez from symbol - no object",{ expect_equal( res$entrez[1:4], - c( "7293", "9651", "23569" ,"5081" ) + c( "5244", "23457", "9744", "43" ) ) }) @@ -906,7 +909,7 @@ test_that("Add adjusted counts - no object",{ expect_equal( unique(res$`c_adjusted`)[c(1, 2, 3, 5)], - c( NA, 1017, 25, 4904), + c(NA, 7948, 2193, 2407), tolerance=1e-6 ) @@ -1099,7 +1102,7 @@ test_that("Only reduced dimensions MDS - no object",{ 3 ) - expect_equal( class(attr(res, "internals")$MDS)[1], "MDS" ) + expect_equal( class(attr(res, "internals")$MDS[[1]])[1], "MDS" ) # Duplicate genes/samples expect_error( @@ -1140,7 +1143,7 @@ test_that("Get reduced dimensions MDS - no object",{ nrow(res), 5 ) - expect_equal( class(attr(res, "internals")$MDS)[1], "MDS" ) + expect_equal( class(attr(res, "internals")$MDS[[1]])[1], "MDS" ) }) test_that("Add reduced dimensions MDS - no object",{ @@ -1166,7 +1169,7 @@ test_that("Add reduced dimensions MDS - no object",{ 9 ) - expect_equal( class(attr(res, "internals")$MDS)[1], "MDS" ) + expect_equal( class(attr(res, "internals")$MDS[[1]])[1], "MDS" ) }) test_that("Only reduced dimensions PCA - no object",{ @@ -1265,8 +1268,10 @@ test_that("Get reduced dimensions tSNE - no object",{ set.seed(132) res = + input_df_breast %>% + identify_abundant(a, b, c) %>% + reduce_dimensions( - setNames(tidybulk::counts, c("a", "b", "Cell type", "c", "time" , "condition", "batch", "factor_of_interest")) %>% identify_abundant(a, b, c), method="tSNE", .abundance = c, .element = a, @@ -1282,11 +1287,11 @@ test_that("Get reduced dimensions tSNE - no object",{ expect_equal( ncol(res), - 8 + 4 ) expect_equal( nrow(res), - 48 + 251 ) # Duplicate genes/samples @@ -1309,8 +1314,10 @@ test_that("Add reduced dimensions tSNE - no object",{ set.seed(132) res = + input_df_breast %>% + identify_abundant(a, b, c) %>% + reduce_dimensions( - setNames(tidybulk::counts, c("a", "b", "Cell type", "c", "time" , "condition", "batch", "factor_of_interest")) %>% identify_abundant(a, b, c), method="tSNE", .abundance = c, .element = a, @@ -1326,7 +1333,7 @@ test_that("Add reduced dimensions tSNE - no object",{ expect_equal( ncol(res), - 11 + 8 ) }) @@ -1452,7 +1459,7 @@ test_that("Aggregate duplicated transcript - no object",{ expect_equal( res$b[1:4], - c( "TNFRSF4", "PLCH2" , "PADI4" , "PAX7" ) + c("ABCB4", "ABCB9", "ACAP1", "ACHE" ) ) expect_equal( @@ -1537,25 +1544,25 @@ test_that("Add description to symbol",{ # Human res = describe_transcript( - tidybulk::counts, - .transcript = transcript + input_df, + .transcript = b ) expect_equal( ncol(res), - 9 + 7 ) res = describe_transcript( - tidybulk::counts %>% tidybulk(sample, transcript, count) + input_df %>% tidybulk(a, b, c) ) expect_equal( ncol(res), - 9 + 7 ) }) @@ -1906,7 +1913,7 @@ test_that("impute missing - no object",{ .abundance = c ) - expect_equal( dplyr::pull(filter(res, b=="TNFRSF4" & a == "SRR1740034"), c), 203.5 ) + expect_equal( dplyr::pull(filter(res, b=="TNFRSF4" & a == "SRR1740034"), c), 6 ) expect_equal( ncol(res), ncol(input_df) ) @@ -1916,9 +1923,9 @@ test_that("impute missing - no object",{ test_that("gene over representation",{ - df_entrez = symbol_to_entrez(tidybulk::counts_mini, .transcript = transcript, .sample = sample) + df_entrez = se_mini %>% tidybulk() %>% as_tibble() %>% symbol_to_entrez(.transcript = feature, .sample = sample) df_entrez = aggregate_duplicates(df_entrez, aggregation_function = sum, .sample = sample, .transcript = entrez, .abundance = count) - df_entrez = mutate(df_entrez, do_test = transcript %in% c("TNFRSF4", "PLCH2", "PADI4", "PAX7")) + df_entrez = mutate(df_entrez, do_test = feature %in% c("TNFRSF4", "PLCH2", "PADI4", "PAX7")) res = test_gene_overrepresentation( diff --git a/tests/testthat/test-bulk_methods_SummarizedExperiment.R b/tests/testthat/test-bulk_methods_SummarizedExperiment.R index fedbfd00..265a7b9f 100755 --- a/tests/testthat/test-bulk_methods_SummarizedExperiment.R +++ b/tests/testthat/test-bulk_methods_SummarizedExperiment.R @@ -1,8 +1,11 @@ context('Bulk methods SummarizedExperiment') -input_df = setNames(tidybulk::counts_mini, c("a", "b", "Cell type", "c", "time" , "condition")) +data("se_mini") +data("breast_tcga_mini_SE") -input_df_breast = setNames(tidybulk::breast_tcga_mini, c("a", "b", "c norm", "call", "c")) +input_df = setNames(se_mini %>% tidybulk() %>% as_tibble(), c( "b","a", "c", "Cell type", "time" , "condition")) + +input_df_breast = setNames( breast_tcga_mini_SE %>% tidybulk() %>% as_tibble(), c( "b", "a","c", "c norm", "call" )) test_that("tidybulk SummarizedExperiment conversion",{ @@ -125,19 +128,19 @@ test_that("Drop redundant correlated - SummarizedExperiment",{ test_that("Get adjusted counts - SummarizedExperiment",{ - cm = input_df + cm = se_mini cm$batch = 0 - cm$batch[cm$a %in% c("SRR1740035", "SRR1740043")] = 1 + cm$batch[colnames(cm) %in% c("SRR1740035", "SRR1740043")] = 1 res = adjust_abundance( - tidybulk:::tidybulk_to_SummarizedExperiment(cm, a, b, c) %>% identify_abundant(), + cm %>% identify_abundant(), ~ condition + batch ) expect_equal(nrow(res), 527 ) - expect_equal( names(SummarizedExperiment::assays(res)), c("c" ,"c_adjusted") ) + expect_equal( names(SummarizedExperiment::assays(res)), c("count" ,"count_adjusted") ) }) @@ -153,7 +156,7 @@ test_that("Aggregate duplicated transcript - SummarizedExperiment",{ test_that("Add cell type proportions - SummarizedExperiment",{ - res = deconvolve_cellularity(tidybulk:::tidybulk_to_SummarizedExperiment(input_df, a, b, c), cores=1 ) + res = deconvolve_cellularity(se_mini, cores=1 ) expect_equal( as.numeric(as.data.frame(res@colData[1, 4:7])), @@ -166,7 +169,7 @@ test_that("Add cell type proportions - SummarizedExperiment",{ test_that("differential trancript abundance - SummarizedExperiment",{ res = test_differential_abundance( - tidybulk:::tidybulk_to_SummarizedExperiment(input_df, a, b, c) %>% + se_mini %>% identify_abundant(factor_of_interest = condition), ~ condition ) @@ -198,7 +201,7 @@ test_that("differential trancript abundance - SummarizedExperiment",{ # Likelihood ratio res2 = test_differential_abundance( - tidybulk:::tidybulk_to_SummarizedExperiment(input_df, a, b, c) %>% + se_mini %>% identify_abundant(factor_of_interest = condition), ~ condition, method = "edgeR_likelihood_ratio" ) @@ -225,8 +228,7 @@ test_that("differential trancript abundance - SummarizedExperiment",{ ) # Treat - input_df %>% - tidybulk:::tidybulk_to_SummarizedExperiment(a, b, c) %>% + se_mini %>% identify_abundant(a, b, c, factor_of_interest = condition) %>% test_differential_abundance( ~ condition, @@ -264,7 +266,7 @@ test_that("filter variable - no object",{ res = keep_variable( - tidybulk:::tidybulk_to_SummarizedExperiment(input_df, a, b, c), + se_mini, top = 5 ) @@ -285,7 +287,7 @@ test_that("impute missing",{ tidybulk:::tidybulk_to_SummarizedExperiment(a, b, c) %>% impute_missing_abundance( ~ condition ) - expect_equal( assays(res) %>% as.list() %>% .[[1]] %>% .["TNFRSF4", "SRR1740034"], 203.5 ) + expect_equal( SummarizedExperiment::assays(res) %>% as.list() %>% .[[1]] %>% .["TNFRSF4", "SRR1740034"], 6 ) expect_equal( nrow(res)*ncol(res), nrow(input_df) ) @@ -295,8 +297,7 @@ test_that("impute missing",{ test_that("differential composition",{ # Cibersort - input_df %>% - tidybulk:::tidybulk_to_SummarizedExperiment(a, b, c) %>% + se_mini %>% test_differential_cellularity(. ~ condition , cores = 1 ) %>% pull(`estimate_(Intercept)`) %>% .[[1]] %>% @@ -304,8 +305,7 @@ test_that("differential composition",{ expect_equal( -2, tollerance =1e-3) # llsr - input_df %>% - tidybulk:::tidybulk_to_SummarizedExperiment(a, b, c) %>% + se_mini %>% test_differential_cellularity( . ~ condition, method="llsr", @@ -426,21 +426,21 @@ test_that("test_stratification_cellularity",{ test_that("pivot",{ - expect_equal( ncol(pivot_sample(tidybulk:::tidybulk_to_SummarizedExperiment(tidybulk(input_df, a, b, c))) ), 4) + expect_equal( ncol(pivot_sample(se_mini) ), 4) - expect_equal( ncol(pivot_transcript(tidybulk:::tidybulk_to_SummarizedExperiment(tidybulk(input_df, a, b, c))) ), 1) + expect_equal( ncol(pivot_transcript(se_mini) ), 1) }) test_that("gene over representation",{ - df_entrez = symbol_to_entrez(tidybulk::counts_mini, .transcript = transcript, .sample = sample) - df_entrez = aggregate_duplicates(df_entrez, aggregation_function = sum, .sample = sample, .transcript = entrez, .abundance = count) - df_entrez = mutate(df_entrez, do_test = transcript %in% c("TNFRSF4", "PLCH2", "PADI4", "PAX7")) + df_entrez = symbol_to_entrez(input_df, .transcript = b, .sample = a) + df_entrez = aggregate_duplicates(df_entrez, aggregation_function = sum, .sample = a, .transcript = entrez, .abundance = c) + df_entrez = mutate(df_entrez, do_test = b %in% c("TNFRSF4", "PLCH2", "PADI4", "PAX7")) res = df_entrez %>% - tidybulk:::tidybulk_to_SummarizedExperiment(sample, transcript, count) %>% + tidybulk:::tidybulk_to_SummarizedExperiment(a, b, c) %>% test_gene_overrepresentation( .entrez = entrez, .do_test = do_test, @@ -452,3 +452,33 @@ test_that("gene over representation",{ }) + + +test_that("Only reduced dimensions MDS - no object",{ + + + + + res = + se_mini %>% + reduce_dimensions( + method = "MDS", + .abundance = c, + .element = a, + .feature = b + ) + + expect_equal( + res$`Dim1`, + c(1.4048441, 1.3933490, -2.0138120 , 0.8832354, -1.6676164), + tolerance=10 + ) + + expect_equal( + ncol(colData(res)), + 5 + ) + + expect_equal( class(attr(res, "internals")$MDS[[1]])[1], "MDS" ) + +}) diff --git a/vignettes/comparison_with_base_R.Rmd b/vignettes/comparison_with_base_R.Rmd old mode 100644 new mode 100755 index a11d2407..e15eca64 --- a/vignettes/comparison_with_base_R.Rmd +++ b/vignettes/comparison_with_base_R.Rmd @@ -62,6 +62,7 @@ library(magrittr) library(ggplot2) library(ggrepel) library(tidybulk) +library(tidySummarizedExperiment) my_theme = @@ -79,6 +80,8 @@ my_theme = axis.title.y = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)) ) +tibble_counts = tidybulk::counts_SE %>% tidybulk() %>% as_tibble() + ``` In this article we show some examples of the differences in coding between tidybulk/tidyverse and base R. We noted a decrease > 10x of assignments and a decrease of > 2x of line numbers. @@ -86,7 +89,7 @@ In this article we show some examples of the differences in coding between tidyb ## Create `tidybulk` tibble. ```{r} -tt = counts_mini %>% tidybulk(sample, transcript, count) +tt = se_mini ``` ## Aggregate duplicated `transcripts` @@ -163,9 +166,9 @@ x <- x[o[1L:top],,drop=FALSE] norm_counts.table = norm_counts.table[rownames(x)] -norm_counts.table$cell_type = tidybulk::counts[ +norm_counts.table$cell_type = tibble_counts[ match( - tidybulk::counts$sample, + tibble_counts$sample, rownames(norm_counts.table) ), "Cell type" @@ -199,8 +202,8 @@ cmds = cmds %$% cmdscale.out %>% setNames(sprintf("Dim%s", 1:6)) -cmds$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(cmds)), +cmds$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(cmds)), "Cell type" ] ``` @@ -237,7 +240,7 @@ pc$cell_type = counts[ Tidy transcriptomics ```{r tsne, cache=TRUE, message=FALSE, warning=FALSE, results='hide'} tt.norm.tSNE = - breast_tcga_mini %>% + breast_tcga_mini_SE %>% tidybulk( sample, ens, count_scaled) %>% identify_abundant() %>% reduce_dimensions( @@ -259,8 +262,8 @@ tsne = Rtsne::Rtsne( perplexity=10, pca_scale =TRUE )$Y -tsne$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(tsne)), +tsne$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(tsne)), "Cell type" ] ``` @@ -389,8 +392,8 @@ results <- CIBERSORT( "mixture_file.txt", perm=100, QN=TRUE ) -results$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(results)), +results$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(results)), "Cell type" ] @@ -417,8 +420,8 @@ count_m_log = log(count_m + 1) k = kmeans(count_m_log, iter.max = 1000, ...) cluster = k$cluster -cluster$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(cluster)), +cluster$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(cluster)), c("Cell type", "Dim1", "Dim2") ] @@ -453,8 +456,8 @@ snn = FindNeighbors(snn) snn = FindClusters(snn, method = "igraph", ...) snn = snn[["seurat_clusters"]] -snn$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(snn)), +snn$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(snn)), c("Cell type", "Dim1", "Dim2") ] diff --git a/vignettes/introduction.Rmd b/vignettes/introduction.Rmd index 4ddae27c..758f53ef 100755 --- a/vignettes/introduction.Rmd +++ b/vignettes/introduction.Rmd @@ -73,7 +73,7 @@ Utilities | Description `impute_missing_abundance` | Impute abundance for missing data points using sample groupings `fill_missing_abundance` | Fill abundance for missing data points using an arbitrary value -All functions are directly compatible with `SummarizedExperiment` object. +All functions are directly compatibble with `SummarizedExperiment` object. ```{r, echo=FALSE, include=FALSE, } @@ -105,6 +105,8 @@ my_theme = axis.title.y = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)) ) +tibble_counts = tidybulk::counts_SE %>% tidybulk() %>% as_tibble() + ``` ## Installation @@ -232,9 +234,9 @@ x <- x[o[1L:top],,drop=FALSE] norm_counts.table = norm_counts.table[rownames(x)] -norm_counts.table$cell_type = tidybulk::counts[ +norm_counts.table$cell_type = tibble_counts[ match( - tidybulk::counts$sample, + tibble_counts$sample, rownames(norm_counts.table) ), "Cell.type" @@ -272,8 +274,8 @@ cmds = cmds %$% cmdscale.out %>% setNames(sprintf("Dim%s", 1:6)) -cmds$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(cmds)), +cmds$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(cmds)), "Cell.type" ] ``` @@ -354,8 +356,8 @@ tsne = Rtsne::Rtsne( perplexity=10, pca_scale =TRUE )$Y -tsne$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(tsne)), +tsne$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(tsne)), "Cell.type" ] ``` @@ -534,8 +536,8 @@ results <- CIBERSORT( "mixture_file.txt", perm=100, QN=TRUE ) -results$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(results)), +results$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(results)), "Cell.type" ] @@ -601,8 +603,8 @@ count_m_log = log(count_m + 1) k = kmeans(count_m_log, iter.max = 1000, ...) cluster = k$cluster -cluster$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(cluster)), +cluster$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(cluster)), c("Cell.type", "Dim1", "Dim2") ] @@ -646,8 +648,8 @@ snn = FindNeighbors(snn) snn = FindClusters(snn, method = "igraph", ...) snn = snn[["seurat_clusters"]] -snn$cell_type = tidybulk::counts[ - match(tidybulk::counts$sample, rownames(snn)), +snn$cell_type = tibble_counts[ + match(tibble_counts$sample, rownames(snn)), c("Cell.type", "Dim1", "Dim2") ] @@ -789,7 +791,9 @@ counts_ensembl %>% ensembl_to_symbol(ens) We can add gene full name (and in future description) from symbol identifiers. This currently works for human and mouse. ```{r description} -counts_SE %>% describe_transcript() %>% select(transcript, description, everything()) +counts_SE %>% + describe_transcript() %>% + select(feature, description, everything()) ``` ## Appendix diff --git a/vignettes/manuscript_differential_transcript_abundance.Rmd b/vignettes/manuscript_differential_transcript_abundance.Rmd index 0e2cb816..c87c0696 100644 --- a/vignettes/manuscript_differential_transcript_abundance.Rmd +++ b/vignettes/manuscript_differential_transcript_abundance.Rmd @@ -1,5 +1,5 @@ --- -title: "Manuscript code - differential transcript abundance" +title: "Manuscript code - differential feature abundance" author: "Stefano Mangiola" date: "`r Sys.Date()`" package: tidybulk @@ -8,11 +8,11 @@ output: toc_float: true vignette: > %\VignetteEngine{knitr::knitr} - %\VignetteIndexEntry{Manuscript code - differential transcript abundance} + %\VignetteIndexEntry{Manuscript code - differential feature abundance} %\usepackage[UTF-8]{inputenc} --- -This decument includes the code used for the manuscript, for the differential transcript abundance. +This decument includes the code used for the manuscript, for the differential feature abundance. ```{r, echo=FALSE, include=FALSE} @@ -71,10 +71,10 @@ coldata = coldata[, c("condition", "type")] # Create tidybulk object counts = cts %>% - as_tibble(rownames = "transcript") %>% + as_tibble(rownames = "feature") %>% pivot_longer(names_to = "sample", values_to = "count", - cols = -transcript) %>% + cols = -feature) %>% left_join( coldata %>% as_tibble(rownames = "sample") %>% @@ -86,7 +86,7 @@ counts = ```{r} # Create a tt object with unique raw and normalised counts tt_scaled <- - tidybulk(counts, sample, transcript, count) %>% + tidybulk(counts, sample, feature, count) %>% aggregate_duplicates() %>% identify_abundant() %>% scale_abundance() @@ -165,10 +165,10 @@ tt_test %>% # Subset data mutate(significant = FDR<0.05 & abs(logFC) >=2) %>% - mutate(transcript = ifelse(significant, as.character(transcript), NA)) %>% + mutate(feature = ifelse(significant, as.character(feature), NA)) %>% # Plot - ggplot(aes(x = logCPM, y = logFC, label=transcript)) + + ggplot(aes(x = logCPM, y = logFC, label=feature)) + geom_point(aes(color = significant, size = significant, alpha=significant)) + geom_text_repel() + scale_color_manual(values=c("black", "#e11f28")) + @@ -180,7 +180,7 @@ tt_test %>% tt_test %>% # Select top genes and reshape data - inner_join( arrange((.), PValue) %>% distinct(transcript) %>% head(6)) %>% + inner_join( arrange((.), PValue) %>% distinct(feature) %>% head(6)) %>% # High level reshaping of the data. # All three count columns are shaped as two columns: @@ -193,7 +193,7 @@ tt_test %>% # This allows the faceted plot ggplot(aes(x = Stage, y = count + 1, fill = condition)) + geom_boxplot() + - facet_wrap(~transcript) + + facet_wrap(~feature) + scale_y_log10() ``` @@ -207,7 +207,7 @@ tt_test %>% filter(FDR < 0.05 & abs(logFC) > 2) %>% # Plot - heatmap( transcript, sample, count_scaled_adjusted) %>% + heatmap( feature, sample, count_scaled_adjusted) %>% add_tile(condition) %>% add_tile(type)