diff --git a/CHANGELOG.md b/CHANGELOG.md
index d66c2116..cd2ca8b0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### `Added`
+- [#558](https://github.com/nf-core/ampliseq/pull/558) - Pipeline summary report
+
### `Changed`
### `Fixed`
diff --git a/README.md b/README.md
index 216d2693..e6b84050 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,8 @@ By default, the pipeline currently performs the following:
- Excludes unwanted taxa, produces absolute and relative feature/taxa count tables and plots, plots alpha rarefaction curves, computes alpha and beta diversity indices and plots thereof ([QIIME2](https://www.nature.com/articles/s41587-019-0209-9))
- Calls differentially abundant taxa ([ANCOM](https://www.ncbi.nlm.nih.gov/pubmed/26028277))
- Creates phyloseq R objects ([Phyloseq](https://www.bioconductor.org/packages/release/bioc/html/phyloseq.html))
-- Overall pipeline run summaries ([MultiQC](https://multiqc.info/))
+- Pipeline QC summaries ([MultiQC](https://multiqc.info/))
+- Pipeline summary report ([R Markdown](https://github.com/rstudio/rmarkdown))
## Usage
diff --git a/assets/nf-core-ampliseq_logo_light_long.png b/assets/nf-core-ampliseq_logo_light_long.png
new file mode 100644
index 00000000..8aac12e2
Binary files /dev/null and b/assets/nf-core-ampliseq_logo_light_long.png differ
diff --git a/assets/nf-core_style.css b/assets/nf-core_style.css
new file mode 100644
index 00000000..0195a723
--- /dev/null
+++ b/assets/nf-core_style.css
@@ -0,0 +1,70 @@
+body {
+ font-family: Calibri, helvetica, sans-serif;
+}
+
+h1 {
+ color: rgb(36, 176, 100);
+ font-size: 200%;
+}
+
+h2 {
+ color: rgb(36, 176, 100);
+ font-size: 150%;
+}
+
+h3 {
+ font-size: 100%;
+ font-weight: bold;
+}
+
+h3.subtitle {
+ font-size: 120%;
+ color: rgb(0, 0, 0);
+ font-weight: bold;
+}
+
+h4 {
+ font-size: 100%;
+ font-weight: bold;
+ font-style: italic;
+}
+
+.watermark {
+ opacity: 0.1;
+ position: fixed;
+ top: 50%;
+ left: 50%;
+ font-size: 500%;
+ color: #24b064;
+}
+
+.list-group-item.active {
+ z-index: 2;
+ color: #fff;
+ background-color: #24b064;
+ border-color: #24b064;
+}
+.list-group-item.active:hover {
+ z-index: 2;
+ color: #fff;
+ background-color: #24b064;
+ border-color: #24b064;
+}
+
+#TOC {
+ background-size: contain;
+ padding-top: 60px !important;
+ background-repeat: no-repeat;
+}
+
+.nav-pills > li.active > a,
+.nav-pills > li.active > a:hover,
+.nav-pills > li.active > a:focus {
+ color: #fff;
+ background-color: #24b064;
+}
+
+a {
+ color: #24b064;
+ text-decoration: none;
+}
diff --git a/assets/report_template.Rmd b/assets/report_template.Rmd
new file mode 100644
index 00000000..a14cdc6c
--- /dev/null
+++ b/assets/report_template.Rmd
@@ -0,0 +1,1451 @@
+---
+output:
+ html_document:
+ toc: true # table of contents
+ toc_float: true # float the table of contents to the left of the main document content
+ toc_depth: 3 # header levels 1,2,3
+ theme: default
+ number_sections: true # add section numbering to headers
+ df_print: paged # tables are printed as an html table with support for pagination over rows and columns
+ highlight: pygments
+ pdf_document: true
+#bibliography: ./references.bibtex
+params:
+ # any parameter that is by default "FALSE" is used to evaluate the inclusion of a codeblock with e.g. "eval=!isFALSE(params$mqc_plot)"
+
+ # report style
+ css: NULL
+ report_logo: NULL
+ report_title: "Summary of analysis results"
+ report_abstract: FALSE
+
+ # pipeline versions
+ workflow_manifest_version: NULL
+ workflow_scriptid: NULL
+
+ # flags and arguments
+ flag_retain_untrimmed: FALSE
+ flag_ref_tax_user: FALSE
+ flag_single_end: FALSE
+ barplot: FALSE
+ abundance_tables: FALSE
+ alpha_rarefaction: FALSE
+ ancom: FALSE
+ trunclenf: ""
+ trunclenr: ""
+ max_ee: ""
+ trunc_qmin: FALSE
+ trunc_rmin: ""
+ dada_sample_inference: ""
+ filter_ssu: FALSE
+ min_len_asv: ""
+ max_len_asv: ""
+ cut_its: FALSE
+ dada2_ref_tax_title: FALSE
+ qiime2_ref_tax_title: FALSE
+ sintax_ref_tax_title: FALSE
+ dada2_ref_tax_file: ""
+ qiime2_ref_tax_file: ""
+ sintax_ref_tax_file: ""
+ dada2_ref_tax_citation: ""
+ qiime2_ref_tax_citation: ""
+ sintax_ref_tax_citation: ""
+ exclude_taxa: ""
+ min_frequency: ""
+ min_samples: ""
+ qiime2_filtertaxa: ""
+ val_used_taxonomy: FALSE
+ metadata_category_barplot: FALSE
+ qiime_adonis_formula: FALSE
+
+ # file paths
+ metadata: FALSE
+ samplesheet: FALSE
+ fasta: FALSE
+ input: FALSE
+ mqc_plot: FALSE
+ cutadapt_summary: FALSE
+ dada_filtntrim_args: FALSE
+ dada_qc_f_path: FALSE
+ dada_qc_r_path: ""
+ dada_pp_qc_f_path: ""
+ dada_pp_qc_r_path: ""
+ dada_err_path: FALSE
+ dada_err_run: ""
+ asv_table_path: FALSE
+ path_asv_fa: FALSE
+ path_dada2_tab: FALSE
+ dada_stats_path: FALSE
+ path_barrnap_sum: FALSE
+ filter_ssu_stats: ""
+ filter_ssu_asv: ""
+ filter_len_asv: FALSE
+ filter_len_asv_len_orig: FALSE
+ filter_codons: FALSE
+ stop_codons: ""
+ itsx_cutasv_summary: ""
+ cut_dada_ref_taxonomy: FALSE
+ dada2_taxonomy: FALSE
+ sintax_taxonomy: FALSE
+ pplace_taxonomy: FALSE
+ pplace_heattree: ""
+ qiime2_taxonomy: FALSE
+ filter_stats_tsv: FALSE
+ diversity_indices_depth: ""
+ diversity_indices_beta: FALSE
+ diversity_indices_adonis: ""
+ picrust_pathways: FALSE
+---
+
+
+
+```{r libraries, include=FALSE}
+library("dplyr")
+library("ggplot2")
+library("knitr")
+library("DT")
+library("formattable")
+library("purrr")
+```
+
+
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE) # echo is set in differentialabundance v1.2.0 to TRUE
+```
+
+
+
+```{r, echo=FALSE}
+htmltools::includeCSS(params$css)
+```
+
+```{r results="asis", echo=FALSE}
+cat(paste0("
+
+"))
+```
+
+
+
+```{r}
+if ( endsWith( params$workflow_manifest_version, "dev") ) {
+ ampliseq_version = paste0("version ", params$workflow_manifest_version, ", revision ", params$workflow_scriptid)
+} else {
+ ampliseq_version = paste0("version ",params$workflow_manifest_version)
+}
+report_title <- params$report_title
+report_subtitle <- paste0('nf-core/ampliseq workflow ', ampliseq_version)
+```
+
+---
+title: "`r report_title`"
+subtitle: `r report_subtitle`
+date: '`r format(Sys.Date(), "%B %d, %Y")`'
+---
+
+---
+
+
+
+```{r, results='asis'}
+if ( !isFALSE(params$report_abstract) ) {
+ report_abstract <- paste(readLines(params$report_abstract), collapse="\n")
+ cat(report_abstract)
+} else {
+ # with tab indentation, the following will be a code block!
+ cat(paste0("
+# Abstract
+
+The bioinformatics analysis pipeline [nfcore/ampliseq](https://nf-co.re/ampliseq) is used for amplicon sequencing,
+supporting denoising of any amplicon and supports a variety of taxonomic databases for taxonomic assignment of 16S, ITS, CO1 and 18S amplicons.
+ "))
+}
+```
+
+
+
+```{r, results='asis'}
+if ( !isFALSE(params$metadata) ) {
+ cat(paste0("
+# Data input and Metadata
+
+Pipeline input was saved to the [input](../input) directory.
+ "))
+} else {
+ cat(paste0("
+# Data input
+
+Pipeline input was saved in folder [input](../input).
+ "))
+}
+
+if ( !isFALSE(params$samplesheet) ) {
+ # samplesheet input
+ cat("\nSequencing data was provided in the samplesheet file `", params$samplesheet, "` that is displayed below:", sep="")
+
+ samplesheet <- read.table(file = params$samplesheet, header = TRUE, sep = "\t")
+ # Display table
+ datatable(samplesheet, options = list(
+ scrollX = TRUE,
+ scrollY = "300px",
+ paging = FALSE))
+} else if ( !isFALSE(params$fasta) ) {
+ # fasta input
+ cat("\nASV/OTU sequences were provided in the fasta file `", params$fasta, "`. ", sep="")
+} else if ( !isFALSE(params$input) ) {
+ # folder input
+ cat("\nSequencing data was retrieved from folder `", params$fasta, "`. ", sep="")
+}
+if ( !isFALSE(params$metadata) ) {
+ cat("\nMetadata associated with the sequencing data was provided in `", params$metadata, "` and is displayed below:", sep="")
+
+ metadata <- read.table(file = params$metadata, header = TRUE, sep = "\t")
+ # Display table
+ datatable(metadata, options = list(
+ scrollX = TRUE,
+ scrollY = "300px",
+ paging = FALSE))
+}
+```
+
+
+
+```{r, eval = !isFALSE(params$mqc_plot) || !isFALSE(params$dada_filtntrim_args), results='asis'}
+cat("# Preprocessing\n")
+```
+
+
+
+```{r, eval = !isFALSE(params$mqc_plot), results='asis'}
+cat(paste0("
+## FastQC
+
+[FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads.
+It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C),
+adapter contamination and overrepresented sequences. The sequence quality was checked using FastQC and resulting data was
+aggregated using the FastQC module of [MultiQC](https://multiqc.info/). For more quality controls and per sample quality checks you can check the full
+MultiQC report, which can be found in [multiqc/multiqc_report.html](../multiqc/multiqc_report.html).
+"))
+```
+
+```{r, eval = !isFALSE(params$mqc_plot), out.width='100%', dpi=1200, fig.align='center'}
+knitr::include_graphics(params$mqc_plot)
+```
+
+
+
+```{r, eval = !isFALSE(params$cutadapt_summary), results='asis'}
+cat(paste0("
+## Primer removal with Cutadapt
+
+[Cutadapt](https://journal.embnet.org/index.php/embnetjournal/article/view/200) is trimming primer sequences from sequencing reads.
+Primer sequences are non-biological sequences that often introduce point mutations that do not reflect sample sequences. This is especially
+true for degenerated PCR primer. If primer trimming were to be omitted, artifactual amplicon sequence variants might be computed by
+the denoising tool or sequences might be lost due to being labelled as PCR chimera.
+"))
+
+# import tsv
+cutadapt_summary <- read.table(file = params$cutadapt_summary, header = TRUE, sep = "\t")
+
+cutadapt_passed_col <- as.numeric(substr(
+ cutadapt_summary$cutadapt_passing_filters_percent, 1, 4))
+
+cutadapt_max_discarded <- round( 100 - min(cutadapt_passed_col), 1 )
+cutadapt_avg_passed <- round(mean(cutadapt_passed_col),1)
+
+cutadapt_text_unch <- "Primers were trimmed using cutadapt"
+cutadapt_text_ch <- paste0(" and all untrimmed sequences were discarded. ",
+ "Sequences that did not contain primer sequences were considered artifacts. Less than ",
+ cutadapt_max_discarded, "% of the sequences were discarded per sample and a mean of ",
+ cutadapt_avg_passed, "% of the sequences per sample passed the filtering. ")
+
+if ( isFALSE(params$flag_retain_untrimmed) ) cutadapt_text <- paste0(
+ cutadapt_text_unch, cutadapt_text_ch
+ ) else cutadapt_text <- paste0(cutadapt_text_unch, ". ")
+
+cat(cutadapt_text)
+cat("Cutadapt results can be found in folder [cutadapt](../cutadapt).")
+
+# shorten header by "cutadapt_" to optimize visualisation
+colnames(cutadapt_summary) <- gsub("cutadapt_","",colnames(cutadapt_summary))
+
+datatable(cutadapt_summary, options = list(
+ scrollX = TRUE,
+ scrollY = "300px",
+ paging = FALSE))
+```
+
+
+
+```{r, eval = !isFALSE(params$dada_filtntrim_args), results='asis'}
+cat(paste0("
+## Quality filtering using DADA2
+
+Additional quality filtering can improve sequence recovery.
+Often it is advised trimming the last few nucleotides to avoid less well-controlled errors that can arise there.
+"))
+
+if (params$trunc_qmin) {
+ f_and_tr_args <- readLines(params$dada_filtntrim_args)
+ trunc_len <- strsplit(gsub(".*truncLen = c\\((.+)\\),maxN.*", "\\1",
+ f_and_tr_args), ", ")
+ tr_len_f <- trunc_len[[1]][1]
+ tr_len_r <- trunc_len[[1]][2]
+ cat("Reads were trimmed to a specific length and the length cutoff was ",
+ "automatically determined by the median quality of all input reads. ",
+ "Reads were trimmed before median quality drops ",
+ "below ", params$trunc_qmin, " and at least ",params$trunc_rmin*100,
+ "% of reads are retained, resulting in a trim of ",
+ "forward reads at ", tr_len_f, " bp and reverse ",
+ "reads at ", tr_len_r, " bp, reads shorter than this were discarded. ", sep = "")
+} else if (params$trunclenf == "null" && params$trunclenr == "null") {
+ cat("Reads were not trimmed. ")
+} else if (params$trunclenf != 0 && params$trunclenr != 0) {
+ cat("Forward reads were trimmed at ", params$trunclenf,
+ " bp and reverse reads were trimmed at ", params$trunclenr,
+ " bp, reads shorter than this were discarded. ", sep = "")
+} else if (params$trunclenf != 0) {
+ cat("Forward reads were trimmed at ", params$trunclenf," bp, reads shorter than this were discarded. ", sep = "")
+} else if (params$trunclenr != 0) {
+ cat("Reverse reads were trimmed at ", params$trunclenr," bp, reads shorter than this were discarded. ", sep = "")
+}
+cat("Reads with more than", params$max_ee,"expected errors were discarded.",
+ "Read counts passing the filter are shown in section ['Read counts per sample'](#read-counts-per-sample)",
+ "column 'filtered'.", sep = " ")
+```
+
+
+
+```{r, eval = !isFALSE(params$dada_qc_f_path), results='asis'}
+cat ("**Quality profiles:**\n\n")
+
+if (params$flag_single_end) {
+ cat("Read quality stats for incoming data:")
+} else {
+ cat("Forward (left) and reverse (right) read quality stats for incoming data:")
+}
+```
+
+```{r, eval = !isFALSE(params$dada_qc_f_path), out.width="49%", fig.show='hold', fig.align='default'}
+if (params$flag_single_end) {
+ knitr::include_graphics(params$dada_qc_f_path)
+} else {
+ knitr::include_graphics(c(params$dada_qc_f_path, params$dada_qc_r_path))
+}
+```
+
+```{r, eval = !isFALSE(params$dada_qc_f_path), results='asis'}
+if (params$flag_single_end) {
+ cat("Read quality stats for preprocessed data:")
+} else {
+ cat("Forward (left) and reverse (right) read quality stats for preprocessed data:")
+}
+```
+
+```{r, eval = !isFALSE(params$dada_qc_f_path), out.width="49%", fig.show='hold', fig.align='default'}
+if (params$flag_single_end) {
+ knitr::include_graphics(params$dada_pp_qc_f_path)
+} else {
+ knitr::include_graphics(c(params$dada_pp_qc_f_path, params$dada_pp_qc_r_path))
+}
+```
+
+```{r, eval = !isFALSE(params$dada_qc_f_path), results='asis'}
+cat(paste0("
+Overall read quality profiles are displayed as heat map of the frequency of each quality score at each base position.
+The mean quality score at each position is shown by the green line, and the quartiles of the quality score
+distribution by the orange lines. The red line shows the scaled proportion of reads that extend to at least
+that position. Original plots can be found [folder dada2/QC/](../dada2/QC/) with names that end in `_qual_stats.pdf`.
+"))
+```
+
+
+
+```{r, eval = !isFALSE(params$dada_err_path) || !isFALSE(params$dada_stats_path) || !isFALSE(params$asv_table_path), results='asis'}
+cat(paste0("
+# ASV inference using DADA2
+
+[DADA2](https://doi.org/10.1038/nmeth.3869) performs fast and accurate sample inference from amplicon data with single-nucleotide
+resolution. It infers exact amplicon sequence variants (ASVs) from amplicon data with fewer false positives than many other
+methods while maintaining high sensitivity.
+
+DADA2 reduces sequence errors and dereplicates sequences by quality filtering, denoising,
+read pair merging (for paired end Illumina reads only) and PCR chimera removal.
+"))
+```
+
+
+
+```{r, eval = !isFALSE(params$dada_err_path), results='asis'}
+cat(paste0("
+## Error correction
+
+Read error correction was performed using estimated error rates, visualized below.
+"))
+
+# check if single run or multirun
+flag_multirun = length ( unlist( strsplit( params$dada_err_run,"," ) ) ) != 1
+
+if ( flag_multirun && params$flag_single_end ) {
+ # single end multi run
+ cat("Error rates were estimated for each sequencing run separately. ",
+ "Each 4x4 figure represents one run, in the sequence ", params$dada_err_run,".")
+} else if ( flag_multirun && !params$flag_single_end ) {
+ # paired end multi run
+ cat("Error rates were estimated for each sequencing run separately. ",
+ "Each row represents one run, in the sequence ", params$dada_err_run,".",
+ "For each row, the error rates for forward reads are at the left side and reverse reads are at the right side.")
+} else if ( !flag_multirun && !params$flag_single_end ) {
+ # paired end single run
+ cat("Error rates for forward reads are at the left side and reverse reads are at the right side.")
+}
+```
+
+```{r, eval = !isFALSE(params$dada_err_path), out.width="49%", fig.show='hold', fig.align='default'}
+dada_err_path <- unlist( strsplit( params$dada_err_path,"," ) )
+knitr::include_graphics(dada_err_path)
+```
+
+```{r, eval = !isFALSE(params$dada_err_path), results='asis'}
+cat(paste0("
+Estimated error rates are displayed for each possible transition. The black line shows the estimated error rates after
+convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal
+definition of the Q-score. The estimated error rates (black line) should be a good fit to the observed rates
+(points), and the error rates should drop with increased quality. Original plots can be found in
+[folder dada2/QC/](../dada2/QC/) with names that end in `.err.pdf`.
+"))
+```
+
+
+
+```{r, eval = !isFALSE(params$dada_stats_path), results='asis'}
+cat(paste0("
+## Read counts per sample
+
+Tracking read numbers through DADA2 processing steps for each sample. The following table shows the read numbers after each processing stage.
+"))
+
+if ( params$flag_single_end ) {
+ cat("Processing stages are: input - reads into DADA2, filtered - reads passed quality filtering, ",
+ "denoised - reads after denoising, nonchim - reads in non-chimeric sequences (final ASVs).")
+} else {
+ cat("Processing stages are: input - read pairs into DADA2, filtered - read pairs passed quality filtering, ",
+ "denoisedF - forward reads after denoising, denoisedR - reverse reads after denoising, ",
+ "merged - successfully merged read pairs, nonchim - read pairs in non-chimeric sequences (final ASVs).")
+}
+
+# import stats tsv
+dada_stats <- read.table(file = params$dada_stats_path, header = TRUE, sep = "\t")
+
+# Display table
+datatable(dada_stats, options = list(
+ scrollX = TRUE,
+ scrollY = "300px",
+ paging = FALSE))
+
+cat(paste0("
+Samples with unusual low reads numbers relative to the number of expected ASVs
+should be treated cautiously, because the abundance estimate will be very granular
+and might vary strongly between (theoretical) replicates due to high impact of stochasticity.
+
+Following, the numbers of the table above are shown in stacked barcharts as percentage of DADA2 input reads.
+"))
+
+# Stacked barchart to num of reads
+
+# Calc exluded asvs and transform all cols to percent
+
+if ( params$flag_single_end ) {
+ # single end
+ cat("Stacked barcharts of read numbers per sample and processing stage")
+
+ dada_stats_ex <- data.frame(sample = dada_stats$sample,
+ input = dada_stats$DADA2_input,
+ filtered = dada_stats$DADA2_input-dada_stats$filtered,
+ denoised = dada_stats$filtered-dada_stats$denoised,
+ nonchim = dada_stats$denoised-dada_stats$nonchim,
+ analysis = dada_stats$nonchim)
+ dada_stats_p <- data.frame(sample = dada_stats_ex$sample, round(dada_stats_ex[2:6]/dada_stats_ex$input*100, 2))
+ dada_stats_p_analysis_average <- round(sum(dada_stats_p$analysis)/length(dada_stats_p$analysis), 1)
+ # If more than 20 sample only display subset!
+ if ( nrow(dada_stats_p)>=20 ) {
+ cat(" (display 10 samples of each lowest and highest percentage of reads analysed, of",nrow(dada_stats_p),"samples)")
+ dada_stats_p <- dada_stats_p[order(-dada_stats_p$analysis),]
+ dada_stats_p <- rbind(head(dada_stats_p,10),tail(dada_stats_p,10))
+ }
+ # Stack columns for both stacked barcharts
+ n_samples <- length(dada_stats_p$sample)
+ samples_t <- c(rep(dada_stats_p$sample, 4))
+ steps_t <- c(rep("excluded by filtering", n_samples), rep("excluded by denoised", n_samples),
+ rep("excluded by nonchim", n_samples), rep("reads in final ASVs", n_samples))
+ # stack the column for percentage of asvs
+ asvs_p_t <- as.array(flatten_dbl(dada_stats_p[3:6]))
+ dada_stats_p_t <- data.frame(samples_t, steps_t, asvs_p_t)
+} else {
+ # paired end
+ cat("Stacked barchart of read pair numbers (denoisedF & denoisedR halfed, because each pair is split) per sample and processing stage")
+
+ dada_stats_ex <- data.frame(sample = dada_stats$sample,
+ DADA2_input = dada_stats$DADA2_input,
+ filtered = dada_stats$DADA2_input-dada_stats$filtered,
+ denoisedF = (dada_stats$filtered-dada_stats$denoisedF)/2,
+ denoisedR = (dada_stats$filtered-dada_stats$denoisedR)/2,
+ merged = (dada_stats$denoisedF+dada_stats$denoisedR)/2-dada_stats$merged,
+ nonchim = dada_stats$merged-dada_stats$nonchim,
+ analysis = dada_stats$nonchim)
+ dada_stats_p <- data.frame(sample = dada_stats_ex$sample, round(dada_stats_ex[2:8]/dada_stats_ex$DADA2_input*100, 2))
+ dada_stats_p_analysis_average <- round(sum(dada_stats_p$analysis)/length(dada_stats_p$analysis), 1)
+ # If more than 20 sample only display subset!
+ if ( nrow(dada_stats_p)>=20 ) {
+ cat(" (display 10 samples of each lowest and highest percentage of reads analysed, of",nrow(dada_stats_p),"samples)")
+ dada_stats_p <- dada_stats_p[order(-dada_stats_p$analysis),]
+ dada_stats_p <- rbind(head(dada_stats_p,10),tail(dada_stats_p,10))
+ }
+ # Stack columns for both stacked barcharts
+ n_samples <- length(dada_stats_p$sample)
+ samples_t <- c(rep(dada_stats_p$sample, 6))
+ steps_t <- c(rep("excluded by filtering", n_samples), rep("excluded by denoisedF", n_samples),
+ rep("excluded by denoisedR", n_samples), rep("excluded by merged", n_samples),
+ rep("excluded by nonchim", n_samples), rep("reads in final ASVs", n_samples))
+ # stack the column for percentage of asvs
+ asvs_p_t <- as.array(flatten_dbl(dada_stats_p[3:8]))
+ dada_stats_p_t <- data.frame(samples_t, steps_t, asvs_p_t)
+}
+cat(":\n\n")
+
+# Plot
+dada_stats_p_t$steps_t <- factor(dada_stats_p_t$steps_t, levels=unique(dada_stats_p_t$steps_t))
+dada_stats_p_t$samples_t <- factor(dada_stats_p_t$samples_t, levels=dada_stats_p_t[order(dada_stats_p$analysis),"samples_t"])
+
+plot_dada_stats_p_t <- ggplot(dada_stats_p_t, aes(fill = steps_t, y = asvs_p_t, x = samples_t)) +
+ geom_bar(position = "fill", stat = "identity") +
+ xlab("Samples") +
+ ylab("Fraction of total reads") +
+ coord_flip() +
+ scale_fill_brewer("Filtering Steps", palette = "Spectral")
+plot_dada_stats_p_t
+
+svg("stacked_barchart_of_reads.svg")
+plot_dada_stats_p_t
+invisible(dev.off())
+
+cat(paste0("
+
+Between ",min(dada_stats_p$analysis),"% and ",max(dada_stats_p$analysis),"% reads per sample (average ",dada_stats_p_analysis_average,"%)
+were retained for analysis within DADA2 steps.
+
+The proportion of lost reads per processing stage and sample should not be too high, totalling typically <50%.
+Samples that are very different in lost reads (per stage) to the majority of samples must be compared with caution, because an unusual problem
+(e.g. during nucleotide extraction, library preparation, or sequencing) could have occurred that might add bias to the analysis.
+"))
+```
+
+
+
+```{r, eval = !isFALSE(params$asv_table_path), results='asis'}
+cat("## Inferred ASVs\n\n")
+
+#import asv table
+asv_table <- read.table(file = params$asv_table_path, header = TRUE, sep = "\t")
+n_asv <- length(asv_table$ASV_ID)
+
+# Output text
+cat("Finally,", n_asv,
+ "amplicon sequence variants (ASVs) were obtained across all samples. ")
+cat("The ASVs can be found in [`dada2/ASV_seqs.fasta`](../dada2/). And the corresponding",
+ " quantification of the ASVs across samples is in",
+ "[`dada2/ASV_table.tsv`](../dada2/). An extensive table containing both was ",
+ "saved as [`dada2/DADA2_table.tsv`](../dada2/). ")
+if ( params$dada_sample_inference == "independent" ) {
+ cat("ASVs were inferred for each sample independently.")
+} else if ( params$dada_sample_inference == "pooled" ) {
+ cat("ASVs were inferred from pooled sample information.")
+} else {
+ cat("ASVs were initally inferred for each sample independently, but re-examined with all samples (pseudo-pooled).")
+}
+```
+
+```{r, results='asis'}
+flag_any_filtering <- !isFALSE(params$path_barrnap_sum) || !isFALSE(params$filter_len_asv) || !isFALSE(params$filter_codons)
+```
+
+
+
+```{r, eval = flag_any_filtering, results='asis'}
+cat("# Filtering of ASVs\n")
+```
+
+
+
+```{r, eval = !isFALSE(params$path_barrnap_sum), results='asis'}
+cat("## rRNA detection\n")
+cat("[Barrnap](https://github.com/tseemann/barrnap) classifies the ASVs into the origin domain (including mitochondrial origin).\n\n", sep = "")
+
+# Read the barrnap files and count the lines
+barrnap_sum = read.table( params$path_barrnap_sum, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
+# keep only ASV_ID & eval columns & sort
+barrnap_sum <- subset(barrnap_sum, select = c(ASV_ID,mito_eval,euk_eval,arc_eval,bac_eval))
+# choose kingdom (column) with lowest evalue
+barrnap_sum[is.na(barrnap_sum)] <- 1
+barrnap_sum$result = colnames(barrnap_sum[,2:5])[apply(barrnap_sum[,2:5],1,which.min)]
+barrnap_sum$result = gsub("_eval", "", barrnap_sum$result)
+
+#import asv table
+asv_table <- readLines(params$path_asv_fa)
+n_asv <- sum(grepl("^>", asv_table))
+
+# calculate numbers
+n_classified <- length(barrnap_sum$result)
+n_bac <- sum(grepl("bac", barrnap_sum$result))
+n_arc <- sum(grepl("arc", barrnap_sum$result))
+n_mito <- sum(grepl("mito", barrnap_sum$result))
+n_euk <- sum(grepl("euk", barrnap_sum$result))
+
+barrnap_df_sum <- data.frame(label=c('Bacteria','Archaea','Mitochondria','Eukaryotes','Unclassified'),
+ count=c(n_bac,n_arc,n_mito,n_euk,n_asv - n_classified),
+ percent=c(round( (n_bac/n_asv)*100, 2), round( (n_arc/n_asv)*100, 2), round( (n_mito/n_asv)*100, 2), round( (n_euk/n_asv)*100, 2), round( ( (n_asv - n_classified) /n_asv)*100, 2) ) )
+
+# Build outputtext
+cat( "Barrnap classified ")
+cat( barrnap_df_sum$count[1], "(", barrnap_df_sum$percent[1],"%) ASVs as most similar to Bacteria, " )
+cat( barrnap_df_sum$count[2], "(", barrnap_df_sum$percent[2],"%) ASVs to Archea, " )
+cat( barrnap_df_sum$count[3], "(", barrnap_df_sum$percent[3],"%) ASVs to Mitochondria, " )
+cat( barrnap_df_sum$count[4], "(", barrnap_df_sum$percent[4],"%) ASVs to Eukaryotes, and " )
+cat( barrnap_df_sum$count[5], "(", barrnap_df_sum$percent[5],"%) were below similarity threshold to any kingdom." )
+
+# Barplot
+plot_barrnap_df_sum <- ggplot(barrnap_df_sum,
+ aes(x = reorder(label, desc(label)), y = percent)) +
+ geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) +
+ ylab("% Classification") +
+ xlab("rRNA origins") +
+ coord_flip() +
+ theme_bw() +
+ ylim(0, 100)
+plot_barrnap_df_sum
+
+svg("rrna_detection_with_barrnap.svg")
+plot_barrnap_df_sum
+invisible(dev.off())
+
+cat("\n\nrRNA classification results can be found in folder [barrnap](../barrnap).")
+```
+
+
+
+```{r, eval = !isFALSE(params$path_barrnap_sum) && !isFALSE(params$filter_ssu), results='asis'}
+cat("\n\nASVs were filtered for `",params$filter_ssu,"` (`bac`: Bacteria, `arc`: Archaea, `mito`: Mitochondria, `euk`: Eukaryotes)
+ using the above classification. The following table shows read counts for each sample before and after filtering:\n\n", sep = "")
+
+# Read the barrnap stats file
+filter_ssu_stats = read.table( params$filter_ssu_stats, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
+# shorten header by "ssufilter_" to optimize visualisation
+colnames(filter_ssu_stats) <- gsub("ssufilter_","",colnames(filter_ssu_stats))
+filter_ssu_stats <- subset(filter_ssu_stats, select = c(sample,input,output))
+filter_ssu_stats$'retained%' <- round( filter_ssu_stats$output / filter_ssu_stats$input *100, 2)
+filter_ssu_stats_avg_removed <- 100-sum(filter_ssu_stats$'retained%')/length(filter_ssu_stats$'retained%')
+filter_ssu_stats_max_removed <- 100-min(filter_ssu_stats$'retained%')
+
+# Display table
+datatable(filter_ssu_stats, options = list(
+ scrollX = TRUE,
+ scrollY = "300px",
+ paging = FALSE))
+
+# Read the barrnap asv file
+filter_ssu_asv <- read.table( params$filter_ssu_asv, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
+filter_ssu_asv_filtered <- nrow(filter_ssu_asv)
+
+cat("In average", round(filter_ssu_stats_avg_removed,2), "% reads were removed, but at most",filter_ssu_stats_max_removed,"% reads per sample. ")
+# "n_asv" is taken from the barrnap block above
+cat("The number of ASVs was reduced by",n_asv-filter_ssu_asv_filtered,"(",100-round( filter_ssu_asv_filtered/n_asv*100 ,2),"%), from",n_asv,"to",filter_ssu_asv_filtered," ASVs.")
+```
+
+
+
+```{r, eval = !isFALSE(params$filter_len_asv_len_orig), results='asis'}
+cat(paste0("
+## Sequence length
+
+A length filter was used to reduce potential contamination.
+Before filtering, ASVs had the following length profile (count of 1 was transformed to 1.5 to allow plotting on log10 scale):
+
+"))
+
+# ASV length profile
+
+# import length profile tsv
+filter_len_profile <- read.table(file = params$filter_len_asv_len_orig, header = TRUE, sep = "\t")
+
+# find number of ASVs filtered
+filter_len_asv_filtered <- filter_len_profile
+if ( params$min_len_asv != 0 ) {
+ filter_len_asv_filtered <- subset(filter_len_asv_filtered, Length >= params$min_len_asv)
+}
+if ( params$max_len_asv != 0 ) {
+ filter_len_asv_filtered <- subset(filter_len_asv_filtered, Length <= params$max_len_asv)
+}
+
+# replace 1 with 1.5 to display on log scale
+filter_len_profile$Counts[filter_len_profile$Counts == 1] <- 1.5
+
+plot_filter_len_profile <- ggplot(filter_len_profile,
+ aes(x = Length, y = Counts)) +
+ geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) +
+ ylab("Number of ASVs") +
+ xlab("Length") +
+ scale_y_continuous(trans = "log10") +
+ theme_bw()
+plot_filter_len_profile
+
+svg("asv_length_profile_before_length_filter.svg")
+plot_filter_len_profile
+invisible(dev.off())
+
+cat("\n\n")
+if ( params$min_len_asv != 0 && params$max_len_asv != 0 ) {
+ cat("Filtering omitted all ASVs with length lower than",params$min_len_asv,"or above",params$max_len_asv,"bp. ")
+} else if ( params$min_len_asv != 0 ) {
+ cat("Filtering omitted all ASVs with length lower than",params$min_len_asv,"bp. ")
+} else if ( params$max_len_asv != 0 ) {
+ cat("Filtering omitted all ASVs with length above",params$max_len_asv,"bp. ")
+}
+```
+
+```{r, eval = !isFALSE(params$filter_len_asv), results='asis'}
+# import stats tsv
+filter_len_stats <- read.table(file = params$filter_len_asv, header = TRUE, sep = "\t")
+# only if file not empty continue with reporting below
+flag_filter_len_stats <- nrow(filter_len_stats) > 0
+```
+
+```{r, eval = !isFALSE(params$filter_len_asv) && flag_filter_len_stats, results='asis'}
+# Reads removed
+
+# re-name & re-order columns
+colnames(filter_len_stats) <- gsub("lenfilter_","",colnames(filter_len_stats))
+filter_len_stats <- filter_len_stats[, c("sample", "input", "output")]
+filter_len_stats$'retained%' <- round( filter_len_stats$output / filter_len_stats$input * 100 , 2)
+filter_len_stats_avg_removed <- 100-sum(filter_len_stats$'retained%')/length(filter_len_stats$'retained%')
+filter_len_stats_max_removed <- 100-min(filter_len_stats$'retained%')
+
+cat("The following table shows read counts for each sample before and after filtering:")
+
+# Display table
+datatable(filter_len_stats, options = list(
+ scrollX = TRUE,
+ scrollY = "300px",
+ paging = FALSE))
+
+cat("In average", filter_len_stats_avg_removed, "% reads were removed, but at most",filter_len_stats_max_removed,"% reads per sample.")
+```
+
+```{r, eval = !isFALSE(params$filter_len_asv_len_orig), results='asis'}
+cat("The number of ASVs was reduced by",sum(filter_len_profile$Counts)-sum(filter_len_asv_filtered$Counts),"(",100-round( sum(filter_len_asv_filtered$Counts)/sum(filter_len_profile$Counts)*100 ,2),"%), from",sum(filter_len_profile$Counts),"to",sum(filter_len_asv_filtered$Counts)," ASVs.")
+cat("\n\nLength filter results can be found in folder [asv_length_filter](../asv_length_filter).")
+```
+
+
+
+```{r, eval = !isFALSE(params$filter_codons), results='asis'}
+cat(paste0("
+## Codon usage
+
+Amplicons of coding regions are expected to be free of stop codons and consist of condon tripletts.
+ASVs were filtered against the presence of stop codons (",params$stop_codons,") in the specified open reading frame of the ASV.
+Additionally, ASVs that are not multiple of 3 in length were omitted.
+
+"))
+
+# import stats tsv
+filter_codons_stats <- read.table(file = params$filter_codons, header = TRUE, sep = "\t")
+
+cat("The following table shows read counts for each sample after filtering:")
+
+# Display table
+datatable(filter_codons_stats, options = list(
+ scrollX = TRUE,
+ scrollY = "300px",
+ paging = FALSE))
+
+#TODO: add ASV count after filtering
+
+cat("\n\nCodon usage filter results can be found in folder [codon_filter](../codon_filter).")
+```
+
+
+
+```{r, results='asis'}
+# Check if any taxonomic classification is available
+any_taxonomy <- !isFALSE(params$dada2_taxonomy) || !isFALSE(params$qiime2_taxonomy) || !isFALSE(params$sintax_taxonomy) || !isFALSE(params$pplace_taxonomy)
+```
+
+```{r, eval = any_taxonomy, results='asis'}
+# Header if any taxonomic classification is available
+cat("# Taxonomic Classification\n")
+```
+
+
+
+```{r, eval = !isFALSE(params$cut_its), results='asis'}
+cat(paste0("
+## ITS regions
+
+The ",params$cut_its," region was extracted from each ASV sequence using [ITSx](https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.12073).
+Taxonomic classification should have improved performance based on extracted ITS sequence. ITSx results can be found in folder [itsx](../itsx).
+
+Taxonomies per extracted region was then transferred back to the full ASV sequence. No filtering was done based on whether the region was found or not.
+Those taxonomic classifications per ASV can be found in files `ASV_tax.tsv` and `ASV_tax_species.tsv` in folder [dada2/](../dada2/).
+
+However, the files `ASV_ITS_tax.tsv` and `ASV_ITS_tax_species.tsv` in folder [dada2/](../dada2/) contain only the chosen ITS part of just the ASVs where the region was found.
+Of course, different ASVs may contain identical ",params$cut_its," regions, leading to identical taxonomy assignments,
+but the full ASVs were recorded as separate entries anyway to retain maximum resolution at this stage.
+"))
+
+# Read ITSX summary
+itsx_summary <- readLines(params$itsx_cutasv_summary)
+
+origins = FALSE
+itsx_origins <- data.frame(origin=character(), count=numeric(), stringsAsFactors=FALSE)
+for (line in itsx_summary){
+ # get basic statistic
+ if (grepl("Number of sequences in input file:", line)) {
+ itsx_summary_nasv <- as.numeric( sub("Number of sequences in input file: *\t*", "", line) )
+ }
+ if (grepl("Sequences detected as ITS by ITSx:", line)) {
+ itsx_summary_its <- as.numeric( sub("Sequences detected as ITS by ITSx: *\t*", "", line) )
+ }
+ # get preliminar origins
+ if (grepl("----------------------------", line)) {
+ origins = FALSE
+ }
+ if (isTRUE(origins)) {
+ add <- data.frame(origin=sub(":.*", "", line), count=as.numeric( sub(".*: *\t*", "", line) ) )
+ itsx_origins <- rbind(itsx_origins, add)
+ }
+ if (grepl("ITS sequences by preliminary origin:", line)) {
+ origins = TRUE
+ }
+}
+itsx_origins$percent <- round( itsx_origins$count / itsx_summary_nasv * 100, 2)
+
+cat(itsx_summary_its, "of",itsx_summary_nasv,"(",round( itsx_summary_its/itsx_summary_nasv*100 ,2),"%) ASVs were identified as ITS.",
+ "The following plot shows ITS sequences by preliminary origin:")
+
+plot_itsx_origins <- ggplot(itsx_origins,
+ aes(x = origin, y = percent)) +
+ geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) +
+ ylab("%") +
+ xlab("ITS sequences by preliminary origin") +
+ coord_flip() +
+ theme_bw()
+plot_itsx_origins
+
+svg("itsx_preliminary_origin.svg")
+plot_itsx_origins
+invisible(dev.off())
+```
+
+
+
+```{r, eval = !isFALSE(params$dada2_taxonomy), results='asis'}
+cat("## DADA2\n")
+
+# indicate reference taxonomy
+if (!params$flag_ref_tax_user) {
+ cat("The taxonomic classification was performed by [DADA2](https://pubmed.ncbi.nlm.nih.gov/27214047/)
+ using the database: `", params$dada2_ref_tax_title, "`.
+ More details about the reference taxonomy database can be found in the ['Methods section'](#methods).\n\n", sep = "")
+} else {
+ cat("The taxonomic classification was performed by DADA2 using a custom database ",
+ "provided by the user.\n\n", sep = "")
+}
+
+# mention if taxonomy was cut by cutadapt
+if ( !isFALSE(params$cut_dada_ref_taxonomy) ) {
+ cut_dada_ref_taxonomy <- readLines(params$cut_dada_ref_taxonomy)
+ for (line in cut_dada_ref_taxonomy){
+ if (grepl("Total reads processed:", line)) {
+ cut_dada_ref_taxonomy_orig <- sub("Total reads processed: *\t*", "", line)
+ }
+ if (grepl("Reads written \\(passing filters\\):", line)) {
+ cut_dada_ref_taxonomy_filt <- sub("Reads written .passing filters.: *\t*", "", line)
+ }
+ if (grepl("Total basepairs processed:", line)) {
+ cut_dada_ref_taxonomy_orig_bp <- sub("Total basepairs processed: *\t*", "", line)
+ }
+ if (grepl("Total written \\(filtered\\):", line)) {
+ cut_dada_ref_taxonomy_filt_bp <- sub("Total written \\(filtered\\): *\t*", "", line)
+ }
+ }
+
+ cat("The taxonomic reference database was cut by primer sequences to improve matching.
+ The original database had ",cut_dada_ref_taxonomy_orig," sequences with ",cut_dada_ref_taxonomy_orig_bp,
+ ", retained were ",cut_dada_ref_taxonomy_filt," sequences that represented ",cut_dada_ref_taxonomy_filt_bp,".\n\n",
+ sep = "")
+}
+
+# make statistics of taxonomic classification
+asv_tax <- read.table(params$dada2_taxonomy, header = TRUE, sep = "\t")
+
+# Calculate the classified numbers/percent of asv
+level <- subset(asv_tax, select = -c(ASV_ID,confidence,sequence))
+level <- colnames(level)
+
+# Catch 100% highest taxa (e.g. Kingdom) assignment
+if (count(asv_tax, level[1])$n[1] == nrow(asv_tax)){
+ n_1 = 0
+} else {
+ n_1 = count(asv_tax, level[1])$n[1]
+}
+n_asv_tax = nrow(asv_tax)
+n_asv_unclassified <- c(n_1)
+for (x in level[2:length(level)]) {
+ asv_tax_subset <- subset(asv_tax, select = x)
+ colnames(asv_tax_subset)[1] <- "count_this"
+ n_asv_unclassified <- c(n_asv_unclassified, count(asv_tax_subset, count_this)$n[1])
+}
+
+n_asv_classified <- n_asv_tax - n_asv_unclassified
+p_asv_classified <- round(n_asv_classified / n_asv_tax * 100, 2)
+
+asv_classi_df <- data.frame(level, n_asv_classified, p_asv_classified)
+
+# Build output string
+outputstr <- "DADA2 classified "
+for (row in seq_len(nrow(asv_classi_df))) {
+ outputstr <- paste0(outputstr, asv_classi_df[row, ]$p_asv_classified,
+ " % ASVs at ", asv_classi_df[row, ]$level, " level, ")
+}
+outputstr <- substr(outputstr, 1, nchar(outputstr)-2)
+outputstr <- paste0(outputstr, ".\n\n")
+
+# Output Text Classifications
+cat(outputstr)
+
+# Barplot
+# Plot
+asv_classi_df$level <- factor(asv_classi_df$level, levels = asv_classi_df$level)
+plot_asv_classi_df <- ggplot(asv_classi_df,
+ aes(x = reorder(level, desc(level)), y = p_asv_classified)) +
+ geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) +
+ ylab("% Classification") +
+ xlab("Levels") +
+ coord_flip() +
+ theme_bw()
+plot_asv_classi_df
+
+svg("dada2_taxonomic_classification_per_taxonomy_level.svg")
+plot_asv_classi_df
+invisible(dev.off())
+
+cat("\n\nDADA2 taxonomy assignments can be found in folder [dada2](../dada2) in files `ASV_tax_*.tsv`.")
+```
+
+
+
+```{r, eval = !isFALSE(params$qiime2_taxonomy), results='asis'}
+# Header
+cat("## QIIME2\n")
+
+cat("The taxonomic classification was performed by [QIIME2](https://www.nature.com/articles/s41587-019-0209-9)
+ using the database: `", params$qiime2_ref_tax_title, "`.
+ More details about the reference taxonomy database can be found in the ['Methods section'](#methods).\n\n", sep = "")
+
+# Read file and prepare table
+asv_tax <- read.table(params$qiime2_taxonomy, header = TRUE, sep = "\t")
+#asv_tax <- data.frame(do.call('rbind', strsplit(as.character(asv_tax$Taxon),'; ',fixed=TRUE)))
+asv_tax <- subset(asv_tax, select = Taxon)
+
+# Remove greengenes85 ".__" placeholders
+df = as.data.frame(lapply(asv_tax, function(x) gsub(".__", "", x)))
+# remove all last, empty ;
+df = as.data.frame(lapply(df, function(x) gsub(" ;","",x)))
+# remove last remaining, empty ;
+df = as.data.frame(lapply(df, function(x) gsub("; $","",x)))
+
+# get maximum amount of taxa levels per ASV
+max_taxa <- lengths(regmatches(df$Taxon, gregexpr("; ", df$Taxon)))+1
+
+# Currently, all QIIME2 databases seem to have the same levels!
+level <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species")
+
+# Calculate the classified numbers/percent of asv
+n_asv_tax = nrow(asv_tax)
+
+n_asv_classified <- length(which(max_taxa>=1))
+for (x in 2:length(level)) {
+ n_asv_classified <- c(n_asv_classified, length(which(max_taxa>=x)) )
+}
+p_asv_classified <- round(n_asv_classified / n_asv_tax * 100, 2)
+
+asv_classi_df <- data.frame(level, n_asv_classified, p_asv_classified)
+
+# Build output string
+outputstr <- "QIIME2 classified "
+for (row in seq_len(nrow(asv_classi_df))) {
+ outputstr <- paste0(outputstr, asv_classi_df[row, ]$p_asv_classified,
+ " % ASVs at ", asv_classi_df[row, ]$level, " level, ")
+}
+outputstr <- substr(outputstr, 1, nchar(outputstr)-2)
+outputstr <- paste0(outputstr, ".\n\n")
+
+# Output Text Classifications
+cat(outputstr)
+
+# Barplot
+# Plot
+asv_classi_df$level <- factor(asv_classi_df$level, levels = asv_classi_df$level)
+plot_asv_classi_df <- ggplot(asv_classi_df,
+ aes(x = reorder(level, desc(level)), y = p_asv_classified)) +
+ geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) +
+ ylab("% Classification") +
+ xlab("Levels") +
+ coord_flip() +
+ theme_bw()
+plot_asv_classi_df
+
+svg("qiime2_taxonomic_classification_per_taxonomy_level.svg")
+plot_asv_classi_df
+invisible(dev.off())
+
+cat("\n\nQIIME2 taxonomy assignments can be found in folder [qiime2/taxonomy](../qiime2/taxonomy).")
+```
+
+
+
+```{r, eval = !isFALSE(params$sintax_taxonomy), results='asis'}
+# Header
+cat("## SINTAX\n")
+
+cat("The taxonomic classification was performed by [SINTAX](https://doi.org/10.1101/074161)
+ using the database: `", params$sintax_ref_tax_title, "`.
+ More details about the reference taxonomy database can be found in the ['Methods section'](#methods).\n\n", sep = "")
+
+asv_tax <- read.table(params$sintax_taxonomy, header = TRUE, sep = "\t")
+
+# Calculate the classified numbers/percent of asv
+level <- subset(asv_tax, select = -c(ASV_ID,confidence,sequence))
+level <- colnames(level)
+
+# Catch 100% highest taxa (e.g. Kingdom) assignment
+if (count(asv_tax, level[1])$n[1] == nrow(asv_tax)){
+ n_1 = nrow(asv_tax)
+} else {
+ n_1 = count(asv_tax, level[1])$n[1]
+}
+n_asv_tax = nrow(asv_tax)
+n_asv_unclassified <- c(n_1)
+for (x in level[2:length(level)]) {
+ asv_tax_subset <- subset(asv_tax, select = x)
+ colnames(asv_tax_subset)[1] <- "count_this"
+ n_asv_unclassified <- c(n_asv_unclassified, count(asv_tax_subset, count_this)$n[1])
+}
+
+n_asv_classified <- n_asv_tax - n_asv_unclassified
+p_asv_classified <- round(n_asv_classified / n_asv_tax * 100, 2)
+
+asv_classi_df <- data.frame(level, n_asv_classified, p_asv_classified)
+
+# Build output string
+outputstr <- "SINTAX classified "
+for (row in seq_len(nrow(asv_classi_df))) {
+ outputstr <- paste0(outputstr, asv_classi_df[row, ]$p_asv_classified,
+ " % ASVs at ", asv_classi_df[row, ]$level, " level, ")
+}
+outputstr <- substr(outputstr, 1, nchar(outputstr)-2)
+outputstr <- paste0(outputstr, ".\n\n")
+
+# Output Text Classifications
+cat(outputstr)
+
+# Barplot
+# Plot
+asv_classi_df$level <- factor(asv_classi_df$level, levels = asv_classi_df$level)
+plot_asv_classi_df <- ggplot(asv_classi_df,
+ aes(x = reorder(level, desc(level)), y = p_asv_classified)) +
+ geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) +
+ ylab("% Classification") +
+ xlab("Levels") +
+ coord_flip() +
+ theme_bw()
+plot_asv_classi_df
+
+svg("sintax_taxonomic_classification_per_taxonomy_level.svg")
+plot_asv_classi_df
+invisible(dev.off())
+
+cat("\n\nSINTAX taxonomy assignments can be found in folder [sintax](../sintax).")
+```
+
+
+
+```{r, eval = !isFALSE(params$pplace_taxonomy), results='asis'}
+cat(paste0("
+## Phylogenetic Placement
+
+Phylogenetic placement grafts sequences onto a phylogenetic reference tree and optionally outputs taxonomic annotations.
+The reference tree is ideally made from full-length high-quality sequences containing better evolutionary signal than short amplicons.
+It is hence superior to estimating de-novo phylogenetic trees from short amplicon sequences.
+Extraction of taxonomic classification was performed with [EPA-NG](https://github.com/Pbdas/epa-ng) and [Gappa](https://pubmed.ncbi.nlm.nih.gov/32016344/).
+"))
+
+# Read file and prepare table
+asv_tax <- read.table(params$pplace_taxonomy, header = TRUE, sep = "\t")
+
+# get maximum amount of taxa levels per ASV
+max_taxa <- lengths(regmatches(asv_tax$taxonomy, gregexpr(";", asv_tax$taxonomy)))+1
+
+# labels for levels
+level <- rep(1:max(max_taxa))
+
+# Calculate the classified numbers/percent of asv
+n_asv_tax = nrow(asv_tax)
+
+n_asv_classified <- length(which(max_taxa>=1))
+for (x in 2:length(level)) {
+ n_asv_classified <- c(n_asv_classified, length(which(max_taxa>=x)) )
+}
+p_asv_classified <- round(n_asv_classified / n_asv_tax * 100, 2)
+
+asv_classi_df <- data.frame(level, n_asv_classified, p_asv_classified)
+
+# Build output string
+outputstr <- "Phylogenetic Placement classified "
+for (row in seq_len(nrow(asv_classi_df))) {
+ outputstr <- paste0(outputstr, asv_classi_df[row, ]$p_asv_classified,
+ " % ASVs at taxonomic level ", asv_classi_df[row, ]$level, ", ")
+}
+outputstr <- substr(outputstr, 1, nchar(outputstr)-2)
+outputstr <- paste0(outputstr, ".\n\n")
+
+# Output Text Classifications
+cat(outputstr)
+
+# Barplot
+# Plot
+asv_classi_df$level <- factor(asv_classi_df$level, levels = asv_classi_df$level)
+plot_asv_classi_df <- ggplot(asv_classi_df,
+ aes(x = reorder(level, desc(level)), y = p_asv_classified)) +
+ geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) +
+ ylab("% Classification") +
+ xlab("Taxonomic levels") +
+ coord_flip() +
+ theme_bw()
+plot_asv_classi_df
+
+svg("phylogenetic_placement_taxonomic_classification_per_taxonomy_level.svg")
+plot_asv_classi_df
+invisible(dev.off())
+
+cat("\n\nHeattree of the phylogenetic placement:")
+```
+
+```{r, eval = !isFALSE(params$pplace_taxonomy), out.width="100%", fig.show='hold', fig.align='default'}
+knitr::include_graphics(c(params$pplace_heattree))
+```
+
+```{r, eval = !isFALSE(params$pplace_taxonomy), results='asis'}
+cat("\n\nPhylogenetic placement taxonomy assignments can be found in folder [pplace](../pplace) in file `*.taxonomy.per_query_unique.tsv`.")
+```
+
+
+
+```{r, eval = !isFALSE(params$val_used_taxonomy), results='asis'}
+# Header
+cat("# Downstream analysis with QIIME2\n",
+ "Files that were input to [QIIME2](https://www.nature.com/articles/s41587-019-0209-9) can be found in folder [qiime2/input/](../qiime2/input/).",
+ "Results of taxonomic classification of",params$val_used_taxonomy,"was used in all following analysis, see in the above sections.")
+```
+
+
+
+```{r, eval = !isFALSE(params$filter_stats_tsv), results='asis'}
+cat(paste0("
+## ASV filtering
+
+Unwanted taxa are often off-targets generated in PCR with primers that are not perfectly specific for the target DNA.
+For 16S rRNA sequencing mitrochondria and chloroplast sequences are typically removed because these are frequent unwanted non-bacteria PCR products.
+"))
+
+if ( params$exclude_taxa != "none" ) {
+ cat("ASVs were removed when the taxonomic string contained any of `", params$exclude_taxa, "` (comma separated)", sep="")
+}
+if ( params$min_frequency != 1 ) {
+ cat(", had fewer than", params$min_frequency ,"total read counts over all sample")
+}
+if ( params$min_samples != 1 ) {
+ cat(", and that were present in fewer than", params$min_samples ,"samples")
+}
+cat(". ")
+
+qiime2_filtertaxa <- unlist( strsplit( params$qiime2_filtertaxa, "," ) )
+qiime2_filtertaxa_orig <- as.numeric( qiime2_filtertaxa[1] ) -1
+qiime2_filtertaxa_filt <- as.numeric( qiime2_filtertaxa[2] ) -2
+qiime2_filtertaxa_rm <- qiime2_filtertaxa_orig-qiime2_filtertaxa_filt
+qiime2_filtertaxa_rm_percent <- round( qiime2_filtertaxa_rm/qiime2_filtertaxa_orig*100 ,2)
+
+cat("Consequently,",qiime2_filtertaxa_orig,"ASVs were reduced by",qiime2_filtertaxa_rm,"(",qiime2_filtertaxa_rm_percent,"%) to",qiime2_filtertaxa_filt,".",
+ "The following table shows read counts for each sample before and after filtering:")
+
+# import stats tsv
+filter_stats_tsv <- read.table(file = params$filter_stats_tsv, header = TRUE, sep = "\t")
+colnames(filter_stats_tsv) <- gsub("_tax_filter","",colnames(filter_stats_tsv))
+filter_stats_tsv$retained_percent <- round( filter_stats_tsv$retained_percent, 2)
+filter_stats_tsv$lost_percent <- round( filter_stats_tsv$lost_percent, 2)
+colnames(filter_stats_tsv) <- gsub("_percent","%",colnames(filter_stats_tsv))
+
+# Display table
+datatable(filter_stats_tsv, options = list(
+ scrollX = TRUE,
+ scrollY = "300px",
+ paging = FALSE))
+
+cat("\n\nTables with read count numbers and filtered abundance tables are in folder [qiime2/abundance_tables](../qiime2/abundance_tables).")
+```
+
+
+
+```{r, eval = !isFALSE(params$abundance_tables), results='asis'}
+cat(paste0("
+## Abundance tables
+
+The abundance tables are the final data for further downstream analysis and visualisations.
+The tables are based on the computed ASVs and taxonomic classification, but after removal of unwanted taxa.
+Folder [qiime2/abundance_tables](../qiime2/abundance_tables) contains tap-separated files (.tsv)
+that can be opened by any spreadsheet software.
+
+## Relative abundance tables
+
+Absolute abundance tables produced by the previous steps contain count data, but the compositional
+nature of 16S rRNA amplicon sequencing requires sequencing depth normalisation. This step computes
+relative abundance tables using TSS (Total Sum Scaling normalisation) for various taxonomic levels
+and detailed tables for all ASVs with taxonomic classification, sequence and relative abundance for
+each sample. Typically used for in depth investigation of taxa abundances.
+Folder [qiime2/rel_abundance_tables](../qiime2/rel_abundance_tables) contains tap-separated files (.tsv)
+that can be opened by any spreadsheet software.
+"))
+```
+
+
+
+```{r, eval = !isFALSE(params$barplot), results='asis'}
+cat(paste0("
+## Barplot
+
+Interactive abundance plot that aids exploratory browsing the discovered taxa and their abundance
+in samples and allows sorting for associated meta data. Folder [qiime2/barplot](../qiime2/barplot)
+contains barplots, click [qiime2/barplot/index.html](../qiime2/barplot/index.html) to open it in
+your web browser.
+"))
+```
+
+```{r, eval = !isFALSE(params$metadata_category_barplot), results='asis'}
+cat(paste0("
+Additionally, barplots with average relative abundance values were produced
+for `",params$metadata_category_barplot,"` (comma separated if several) in [qiime2/barplot_average](../qiime2/barplot_average)
+in separate folders following the scheme `barplot_{treatment}`:
+"))
+
+metadata_category_barplot <- sort( unlist( strsplit( params$metadata_category_barplot,"," ) ) )
+for (category in metadata_category_barplot) {
+ barplot_folder_path <- paste0("qiime2/barplot_average/barplot_",category)
+ cat("\n- [",barplot_folder_path,"/index.html](../",barplot_folder_path,"/index.html)\n", sep="")
+}
+```
+
+
+
+```{r, eval = !isFALSE(params$alpha_rarefaction), results='asis'}
+cat(paste0("
+## Alpha diversity rarefaction curves
+
+Produces rarefaction plots for several alpha diversity indices, and is primarily used to determine if the
+richness of the samples has been fully observed or sequenced. If the slope of the curves does not level
+out and the lines do not become horizontal, this might be because the sequencing depth was too low to observe
+all diversity or that sequencing error artificially increases sequence diversity and causes false discoveries.
+
+Folder [qiime2/alpha-rarefaction](../qiime2/alpha-rarefaction) contains the data, click
+[qiime2/alpha-rarefaction/index.html](../qiime2/alpha-rarefaction/index.html) to open it in your web browser.
+"))
+```
+
+
+
+```{r, eval = !isFALSE(params$diversity_indices_beta), results='asis'}
+diversity_indices_depth <- readLines(params$diversity_indices_depth)
+
+cat(paste0("
+## Diversity analysis
+
+Diversity measures summarize important sample features (alpha diversity) or differences between samples (beta diversity).
+Diversity calculations are based on sub-sampled data rarefied to ",diversity_indices_depth, " counts.
+
+### Alpha diversity indices
+
+Alpha diversity measures the species diversity within samples.
+"))
+
+if ( params$dada_sample_inference == "independent") {
+ cat("Please note that ASVs were inferred for each sample independently, that can make alpha diversity indices a poor estimate of true diversity. ")
+}
+
+cat(paste0("
+This step calculates alpha diversity using various methods and performs pairwise comparisons of groups of samples. It is based on a phylogenetic tree of all ASV sequences.
+Folder [qiime2/diversity/alpha_diversity](../qiime2/diversity/alpha_diversity) contains the alpha-diversity data:
+
+- Shannon’s diversity index (quantitative): [qiime2/diversity/alpha_diversity/shannon_vector/index.html](../qiime2/diversity/alpha_diversity/shannon_vector/index.html)
+- Pielou’s Evenness: [qiime2/diversity/alpha_diversity/evenness_vector/index.html](../qiime2/diversity/alpha_diversity/evenness_vector/index.html)
+- Faith’s Phylogenetic Diversity (qualitiative, phylogenetic) [qiime2/diversity/alpha_diversity/faith_pd_vector/index.html](../qiime2/diversity/alpha_diversity/faith_pd_vector/index.html)
+- Observed OTUs (qualitative): [qiime2/diversity/alpha_diversity/observed_otus_vector/index.html](../qiime2/diversity/alpha_diversity/observed_otus_vector/index.html)
+
+### Beta diversity indices
+
+Beta diversity measures the species community differences between samples. This step calculates beta diversity distances using
+various methods and performs pairwise comparisons of groups of samples. Additionally, principle coordinates analysis (PCoA)
+plots are produced that can be visualized with Emperor in your default browser without the need for installation.
+These calculations are based on a phylogenetic tree of all ASV sequences.
+Folder [qiime2/diversity/beta_diversity](../qiime2/diversity/beta_diversity) contains the beta-diverity data:
+
+#### PCoA for four different beta diversity distances are accessible via:
+
+- Bray-Curtis distance (quantitative): [qiime2/diversity/beta_diversity/bray_curtis_pcoa_results-PCoA/index.html](../qiime2/diversity/beta_diversity/bray_curtis_pcoa_results-PCoA/index.html)
+- Jaccard distance (qualitative): [qiime2/diversity/beta_diversity/jaccard_pcoa_results-PCoA/index.html](../qiime2/diversity/beta_diversity/jaccard_pcoa_results-PCoA/index.html)
+- unweighted UniFrac distance (qualitative, phylogenetic) [qiime2/diversity/beta_diversity/unweighted_unifrac_pcoa_results-PCoA/index.html](../qiime2/diversity/beta_diversity/unweighted_unifrac_pcoa_results-PCoA/index.html)
+- weighted UniFrac distance (quantitative, phylogenetic): [qiime2/diversity/beta_diversity/weighted_unifrac_pcoa_results-PCoA/index.html](../qiime2/diversity/beta_diversity/weighted_unifrac_pcoa_results-PCoA/index.html)
+
+#### Pairwise comparisons between groups of samples
+
+Statistics on differences between specific metadata groups that can be found in folder
+[qiime2/diversity/beta_diversity/](../qiime2/diversity/beta_diversity/). Each significance test
+result is in its separate folder following the scheme `{method}_distance_matrix-{treatment}`:
+"))
+
+diversity_indices_beta <- sort( unlist( strsplit( params$diversity_indices_beta,"," ) ) )
+for (folder in diversity_indices_beta) {
+ beta_folder_path <- paste0("qiime2/diversity/",folder) #"beta_diversity/" is defined in input section with "stageAs: 'beta_diversity/*'"
+ cat("\n- [",beta_folder_path,"/index.html](../",beta_folder_path,"/index.html)\n", sep="")
+}
+```
+
+```{r, eval = !isFALSE(params$qiime_adonis_formula), results='asis'}
+cat(paste0("
+#### ADONIS test
+
+Permutational multivariate analysis of variance using distance matrices
+[adonis](https://doi.org/10.1111/j.1442-9993.2001.01070.pp.x) (in [VEGAN](https://CRAN.R-project.org/package=vegan))
+determines whether groups of samples are significantly different from one another.
+The formula was `",params$qiime_adonis_formula,"` (multiple formulas are comma separated).
+adonis computes an R2 value (effect size) which shows the percentage of variation explained
+by a condition, as well as a p-value to determine the statistical significance.
+The sequence of conditions in the formula matters, the variance of factors is removed
+(statistically controlled for) from beginning to end of the formula.
+
+Test results are in separate folders following the scheme `{method}_distance_matrix-{adonis formula}`:
+"))
+
+diversity_indices_adonis <- sort( unlist( strsplit( params$diversity_indices_adonis,"," ) ) )
+for (folder in diversity_indices_adonis) {
+ adonis_index_path <- paste0("qiime2/diversity/",folder) #"beta_diversity/" is defined in input section with "stageAs: 'beta_diversity/adonis/*'"
+ cat("\n- [",adonis_index_path,"/index.html](../",adonis_index_path,"/index.html)\n", sep="")
+}
+```
+
+
+
+```{r, eval = !isFALSE(params$ancom), results='asis'}
+cat(paste0("
+## ANCOM
+
+[Analysis of Composition of Microbiomes (ANCOM)](https://www.ncbi.nlm.nih.gov/pubmed/26028277)
+is applied to identify features that are differentially
+abundant across sample groups. A key assumption made by ANCOM is that few taxa (less than about 25%)
+will be differentially abundant between groups otherwise the method will be inaccurate.
+Comparisons between groups of samples is performed for specific metadata that can be found in folder
+[qiime2/ancom/](../qiime2/ancom/).
+
+Test results are in separate folders following the scheme `Category-{treatment}-{taxonomic level}`:
+"))
+
+ancom <- sort( unlist( strsplit( params$ancom,"," ) ) )
+for (folder in ancom) {
+ ancom_path <- paste0("qiime2/ancom/",folder)
+ cat("\n- [",ancom_path,"/index.html](../",ancom_path,"/index.html)\n", sep="")
+}
+```
+
+
+
+```{r, eval = !isFALSE(params$picrust_pathways), results='asis'}
+cat(paste0("
+## PICRUSt2
+
+[PICRUSt2](https://pubmed.ncbi.nlm.nih.gov/32483366/) (Phylogenetic Investigation of Communities by Reconstruction of Unobserved States)
+is a software for predicting functional abundances based only on marker gene sequences.
+Enzyme Classification numbers (EC), KEGG orthologs (KO) and MetaCyc ontology predictions were made for each sample.
+In folder [PICRUSt2/](../PICRUSt2/) are predicted quantifications for Enzyme Classification numbers (EC), see
+`EC_pred_metagenome_unstrat_descrip.tsv`, KEGG orthologs (KO), see `KO_pred_metagenome_unstrat_descrip.tsv`, MetaCyc ontology,
+see `METACYC_path_abun_unstrat_descrip.tsv`. Quantifications are not normalized yet, they can be normalized e.g. by the total sum per sample.
+"))
+```
+
+
+
+# Methods
+
+```{r, results='asis'}
+if ( !isFALSE(params$dada2_ref_tax_title) ) {
+ cat("Taxonomic classification by DADA2:\n\n",
+ "- database: `", params$dada2_ref_tax_title, "`\n\n",
+ "- files: `", params$dada2_ref_tax_file, "`\n\n",
+ "- citation: `", params$dada2_ref_tax_citation, "`\n\n", sep = "")
+}
+
+if ( !isFALSE(params$qiime2_ref_tax_title) ) {
+ cat("Taxonomic classification by QIIME2:\n\n",
+ "- database: `", params$qiime2_ref_tax_title, "`\n\n",
+ "- files: `", params$qiime2_ref_tax_file, "`\n\n",
+ "- citation: `", params$qiime2_ref_tax_citation, "`\n\n", sep = "")
+}
+
+if ( !isFALSE(params$sintax_ref_tax_title) ) {
+ cat("Taxonomic classification by SINTAX:\n\n",
+ "- database: `", params$sintax_ref_tax_title, "`\n\n",
+ "- files: `", params$sintax_ref_tax_file, "`\n\n",
+ "- citation: `", params$sintax_ref_tax_citation, "`\n\n", sep = "")
+}
+
+if ( !isFALSE(params$mqc_plot) ) {
+ # with MultiQC
+ cat("[MultiQC](https://multiqc.info/) summarized computational methods in [multiqc/multiqc_report.html](../multiqc/multiqc_report.html).
+ The proposed short methods description can be found in [MultiQC's Methods Description](../multiqc/multiqc_report.html#nf-core-ampliseq-methods-description),
+ versions of software collected at runtime in [MultiQC's Software Versions](../multiqc/multiqc_report.html#software_versions),
+ and a summary of non-default parameter in [MultiQC's Workflow Summary](../multiqc/multiqc_report.html#nf-core-ampliseq-summary).\n\n")
+}
+# with & without MultiQC
+cat(paste0("
+Technical information to the pipeline run are collected in folder [pipeline_info](../pipeline_info),
+including software versions collected at runtime in file `software_versions.yml` (can be viewed with a text editor),
+execution report in file `execution_report_{date}_{time}.html`,
+execution trace in file `execution_trace_{date}_{time}.txt`,
+execution timeline in file `execution_timelime_{date}_{time}.html`, and
+pipeline direct acyclic graph (DAG) in file `pipeline_dag_{date}_{time}.html`.
+"))
+```
+
+
+
+# Final notes
+
+This report (file `summary_report.html`) is located in folder [summary_report](.) of the original pipeline results folder.
+In this file, all links to files and folders are relative, therefore hyperlinks will only work when the report is at its original place in the pipeline results folder.
+Plots specifically produced for this report (if any) can be also found in folder [summary_report](.).
+
+A comprehensive read count report throughout the pipeline can be found in the [base results folder](../) in file `overall_summary.tsv`.
+
+Please cite the [pipeline publication](https://doi.org/10.3389/fmicb.2020.550420) and any software tools used by the pipeline (see [citations](https://nf-co.re/ampliseq#citations)) when you use any of the pipeline results in your study.
diff --git a/conf/modules.config b/conf/modules.config
index c431e4e0..bc91b125 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -809,4 +809,11 @@ process {
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}
+
+ withName: SUMMARY_REPORT {
+ publishDir = [
+ path: { "${params.outdir}/summary_report" },
+ mode: params.publish_dir_mode
+ ]
+ }
}
diff --git a/conf/test_doubleprimers.config b/conf/test_doubleprimers.config
index 6b275dc8..75c4afab 100644
--- a/conf/test_doubleprimers.config
+++ b/conf/test_doubleprimers.config
@@ -23,7 +23,7 @@ params {
FW_primer = "NNNNCCTAHGGGRBGCAGCAG"
RV_primer = "GACTACHVGGGTATCTAATCC"
double_primer = true
- dada_ref_taxonomy = false
+ skip_dada_taxonomy = true
input = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/Samplesheet_double_primer.tsv"
trunc_qmin = 30
skip_fastqc = true
diff --git a/conf/test_pplace.config b/conf/test_pplace.config
index b6eaff1d..ecd5424d 100644
--- a/conf/test_pplace.config
+++ b/conf/test_pplace.config
@@ -24,7 +24,7 @@ params {
RV_primer = "GGACTACNVGGGTWTCTAAT"
input = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/Samplesheet.tsv"
metadata = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/Metadata.tsv"
- dada_ref_taxonomy = false
+ skip_dada_taxonomy = true
qiime_ref_taxonomy = "greengenes85"
filter_ssu = "bac"
diff --git a/docs/output.md b/docs/output.md
index 305e578a..9e9eb75a 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -17,6 +17,7 @@ The directories listed below will be created in the results directory after the
The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
- [Input](#input) - Input files
+- [Pipeline summary report](#pipeline-summary-report) - Overview of pipeline output
- [Preprocessing](#preprocessing)
- [FastQC](#fastqc) - Read quality control
- [Cutadapt](#cutadapt) - Primer trimming
@@ -41,6 +42,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [Diversity analysis](#diversity-analysis) - High level overview with different diversity indices
- [ANCOM](#ancom) - Differential abundance analysis
- [PICRUSt2](#picrust2) - Predict the functional potential of a bacterial community
+- [SBDI export](#sbdi-export) - Swedish Biodiversity Infrastructure (SBDI) submission file
- [Phyloseq](#phyloseq) - Phyloseq R objects
- [Read count report](#read-count-report) - Report of read counts during various steps of the pipeline
- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
@@ -59,6 +61,20 @@ Samplesheet, ASV fasta, and metadata file are copied into the results folder.
+### Pipeline summary report
+
+A summary report for most pipeline results in html format produced by [R Markdown](https://rmarkdown.rstudio.com/). The report gives a general overview of the analysis, includes many tables and visualizations, and links to interactive downstream analysis results, if available.
+
+
+Output files
+
+- `summary_report/`
+ - `summary_report.html`: pipeline summary report as standalone HTML file that can be viewed in your web browser.
+ - `*.svg*`: plots that were produced for (and are included in) the report.
+ - `versions.yml`: software versions used to produce this report.
+
+
+
### Preprocessing
#### FastQC
diff --git a/modules/local/summary_report.nf b/modules/local/summary_report.nf
new file mode 100644
index 00000000..8af605c6
--- /dev/null
+++ b/modules/local/summary_report.nf
@@ -0,0 +1,142 @@
+process SUMMARY_REPORT {
+ label 'process_low'
+
+ conda "conda-forge::r-base=4.2.3 conda-forge::r-rmarkdown=2.22 conda-forge::r-tidyverse=2.0.0 conda-forge::r-knitr=1.43 conda-forge::r-dt=0.28 conda-forge::r-dtplyr=1.3.1 conda-forge::r-formattable=0.2.1 conda-forge::r-purrr=1.0.1 conda-forge::r-vegan=2.6_4 conda-forge::r-optparse=1.7.3 conda-forge::r-ggplot2=3.4.2 conda-forge::r-dplyr=1.1.2 conda-forge::r-data.table=1.14.8 conda-forge::r-patchwork=1.1.2"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' :
+ 'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }"
+
+ input:
+ path(report_template)
+ path(report_styles)
+ path(report_logo)
+ path(report_abstract)
+ path(metadata)
+ path(samplesheet)
+ path(fasta)
+ path(mqc_plots)
+ path(cutadapt_summary)
+ val(find_truncation_values)
+ path(dada_filtntrim_args)
+ path(dada_qual_stats)
+ path(dada_pp_qual_stats)
+ tuple val(meta), path(dada_err_svgs)
+ path(dada_asv_table)
+ path(dada_asv_fa)
+ path(dada_tab)
+ path(dada_stats)
+ path(barrnap_summary)
+ path(filter_ssu_stats)
+ path(filter_ssu_asv)
+ path(filter_len_asv_stats)
+ path(filter_len_asv_len_orig)
+ path(filter_codons_stats)
+ path(itsx_cutasv_summary)
+ path(dada2_tax)
+ tuple val(meta_ref), path(cut_dada_ref_taxonomy) // cutadapt log when params.cut_dada_ref_taxonomy
+ path(sintax_tax)
+ path(pplace_tax)
+ tuple val(meta_pplace), path(pplace_heattree)
+ path(qiime2_tax)
+ val(run_qiime2)
+ val(val_used_taxonomy)
+ val(qiime2_filtertaxa) // ,
+ path(filter_stats_tsv)
+ path(barplot)
+ val(abundance_tables)
+ val(alpha_rarefaction)
+ path(diversity_indices)
+ path(diversity_indices_beta, stageAs: 'beta_diversity/*') // prevent folder name collisons
+ path(diversity_indices_adonis, stageAs: 'beta_diversity/adonis/*') // prevent folder name collisons
+ path(ancom)
+ path(picrust_pathways)
+
+
+ output:
+ path "*.svg" , emit: svg, optional: true
+ path "summary_report.html" , emit: report
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ // make named R list (comma separated)
+ // all non-boolean or non-numeric values must be encumbered by single quotes (')!
+ // all elements must have a value, i.e. booleans also need to be set to TRUE
+ def params_list_named = [
+ "css='$report_styles'",
+ "report_logo='$report_logo'",
+ "workflow_manifest_version='${workflow.manifest.version}'",
+ "workflow_scriptid='${workflow.scriptId.substring(0,10)}'",
+ params.report_title ? "report_title='$params.report_title'" : "",
+ report_abstract ? "report_abstract='$params.report_abstract'" : "",
+ meta.single_end ? "flag_single_end=TRUE" : "",
+ metadata ? "metadata='$metadata'" : "",
+ samplesheet ? "samplesheet='$samplesheet'" : "",
+ fasta ? "fasta='$fasta'" : "",
+ !fasta && !samplesheet ? "input='$params.input'" : "",
+ mqc_plots ? "mqc_plot='${mqc_plots}/svg/mqc_fastqc_per_sequence_quality_scores_plot_1.svg'" : "",
+ cutadapt_summary ?
+ params.retain_untrimmed ? "flag_retain_untrimmed=TRUE,cutadapt_summary='$cutadapt_summary'" :
+ "cutadapt_summary='$cutadapt_summary'" : "",
+ find_truncation_values ? "trunc_qmin=$params.trunc_qmin,trunc_rmin=$params.trunc_rmin" : "",
+ "trunclenf='$params.trunclenf'",
+ "trunclenr='$params.trunclenr'",
+ "max_ee=$params.max_ee",
+ dada_qual_stats && meta.single_end ? "dada_qc_f_path='$dada_qual_stats',dada_pp_qc_f_path='$dada_pp_qual_stats'" :
+ dada_qual_stats ? "dada_qc_f_path='FW_qual_stats.svg',dada_qc_r_path='RV_qual_stats.svg',dada_pp_qc_f_path='FW_preprocessed_qual_stats.svg',dada_pp_qc_r_path='RV_preprocessed_qual_stats.svg'" : "",
+ dada_filtntrim_args ? "dada_filtntrim_args='$dada_filtntrim_args'" : "",
+ "dada_sample_inference='$params.sample_inference'",
+ dada_err_svgs && meta.run.size() == 1 && meta.single_end ?
+ "dada_err_path='$dada_err_svgs',dada_err_run='"+meta.run+"'" :
+ dada_err_svgs ? "dada_err_path='"+dada_err_svgs.join(',')+"',dada_err_run='"+meta.run.join(',')+"'" : "",
+ dada_asv_table ? "asv_table_path='$dada_asv_table'" : "",
+ dada_asv_fa ? "path_asv_fa='$dada_asv_fa'": "",
+ dada_tab ? "path_dada2_tab='$dada_tab'" : "",
+ dada_stats ? "dada_stats_path='$dada_stats'" : "",
+ params.skip_barrnap ? "" : "path_barrnap_sum='$barrnap_summary'",
+ filter_ssu_stats ? "filter_ssu_stats='$filter_ssu_stats',filter_ssu_asv='$filter_ssu_asv',filter_ssu='$params.filter_ssu'" : "",
+ filter_len_asv_stats ? "filter_len_asv='$filter_len_asv_stats'" : "",
+ filter_len_asv_len_orig ? "filter_len_asv_len_orig='$filter_len_asv_len_orig'" : "",
+ params.min_len_asv ? "min_len_asv=$params.min_len_asv" : "min_len_asv=0",
+ params.max_len_asv ? "max_len_asv=$params.max_len_asv" : "max_len_asv=0",
+ filter_codons_stats ? "filter_codons='$filter_codons_stats',stop_codons='$params.stop_codons'" : "",
+ itsx_cutasv_summary ? "itsx_cutasv_summary='$itsx_cutasv_summary',cut_its='$params.cut_its'" : "",
+ !dada2_tax ? "" :
+ params.dada_ref_tax_custom ? "dada2_taxonomy='$dada2_tax',flag_ref_tax_user=TRUE" :
+ "dada2_taxonomy='$dada2_tax',dada2_ref_tax_title='${params.dada_ref_databases[params.dada_ref_taxonomy]["title"]}',dada2_ref_tax_file='${params.dada_ref_databases[params.dada_ref_taxonomy]["file"]}',dada2_ref_tax_citation='${params.dada_ref_databases[params.dada_ref_taxonomy]["citation"]}'",
+ cut_dada_ref_taxonomy ? "cut_dada_ref_taxonomy='$cut_dada_ref_taxonomy'" : "",
+ sintax_tax ? "sintax_taxonomy='$sintax_tax',sintax_ref_tax_title='${params.sintax_ref_databases[params.sintax_ref_taxonomy]["title"]}',sintax_ref_tax_file='${params.sintax_ref_databases[params.sintax_ref_taxonomy]["file"]}',sintax_ref_tax_citation='${params.sintax_ref_databases[params.sintax_ref_taxonomy]["citation"]}'" : "",
+ pplace_tax ? "pplace_taxonomy='$pplace_tax',pplace_heattree='$pplace_heattree'" : "",
+ qiime2_tax ? "qiime2_taxonomy='$qiime2_tax',qiime2_ref_tax_title='${params.qiime_ref_databases[params.qiime_ref_taxonomy]["title"]}',qiime2_ref_tax_file='${params.qiime_ref_databases[params.qiime_ref_taxonomy]["file"]}',qiime2_ref_tax_citation='${params.qiime_ref_databases[params.qiime_ref_taxonomy]["citation"]}'" : "",
+ run_qiime2 ? "val_used_taxonomy='$val_used_taxonomy'" : "",
+ filter_stats_tsv ? "filter_stats_tsv='$filter_stats_tsv',qiime2_filtertaxa='$qiime2_filtertaxa',exclude_taxa='$params.exclude_taxa',min_frequency='$params.min_frequency',min_samples='$params.min_samples'" : "",
+ barplot ? "barplot=TRUE" : "",
+ barplot && params.metadata_category_barplot ? "metadata_category_barplot='$params.metadata_category_barplot'" : "",
+ abundance_tables ? "abundance_tables=TRUE" : "",
+ alpha_rarefaction ? "alpha_rarefaction=TRUE" : "",
+ diversity_indices ? "diversity_indices_depth='$diversity_indices',diversity_indices_beta='"+ diversity_indices_beta.join(",") +"'" : "",
+ diversity_indices_adonis ? "diversity_indices_adonis='"+ diversity_indices_adonis.join(",") +"',qiime_adonis_formula='$params.qiime_adonis_formula'" : "",
+ ancom ? "ancom='"+ ancom.join(",") +"'" : "",
+ ]
+ // groovy list to R named list string; findAll removes empty entries
+ params_list_named_string = params_list_named.findAll().join(',').trim()
+ """
+ #!/usr/bin/env Rscript
+ library(rmarkdown)
+
+ # Work around https://github.com/rstudio/rmarkdown/issues/1508
+ # If the symbolic link is not replaced by a physical file
+ # output- and temporary files will be written to the original directory.
+ file.copy("./${report_template}", "./template.Rmd", overwrite = TRUE)
+
+ rmarkdown::render("template.Rmd", output_file = "summary_report.html", params = list($params_list_named_string), envir = new.env())
+
+ writeLines(c("\\"${task.process}\\":",
+ paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")),
+ paste0(" rmarkdown: ", packageVersion("rmarkdown")),
+ paste0(" knitr: ", packageVersion("knitr")) ),
+ "versions.yml")
+ """
+}
diff --git a/nextflow.config b/nextflow.config
index 5fe65106..ed052347 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -70,6 +70,13 @@ params {
diversity_rarefaction_depth = 500
ancom_sample_min_count = 1
+ // Report options
+ report_template = "${projectDir}/assets/report_template.Rmd"
+ report_css = "${projectDir}/assets/nf-core_style.css"
+ report_logo = "${projectDir}/assets/nf-core-ampliseq_logo_light_long.png"
+ report_title = "Summary of analysis results"
+ report_abstract = null
+
// Skipping options
skip_cutadapt = false
skip_dada_quality = false
@@ -86,6 +93,7 @@ params {
skip_diversity_indices = false
skip_ancom = false
skip_multiqc = false
+ skip_report = false
// Database options
dada_ref_taxonomy = "silva=138"
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 5fc118fc..e0055c05 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -503,6 +503,39 @@
}
}
},
+ "pipeline_report": {
+ "title": "Pipeline summary report",
+ "type": "object",
+ "description": "",
+ "default": "",
+ "properties": {
+ "report_template": {
+ "type": "string",
+ "default": "${projectDir}/assets/report_template.Rmd",
+ "description": "Path to Markdown file (Rmd)"
+ },
+ "report_css": {
+ "type": "string",
+ "default": "${projectDir}/assets/nf-core_style.css",
+ "description": "Path to style file (css)"
+ },
+ "report_logo": {
+ "type": "string",
+ "default": "${projectDir}/assets/nf-core-ampliseq_logo_light_long.png",
+ "description": "Path to logo file (png)"
+ },
+ "report_title": {
+ "type": "string",
+ "default": "Summary of analysis results",
+ "description": "String used as report title"
+ },
+ "report_abstract": {
+ "type": "string",
+ "default": null,
+ "description": "Path to Markdown file (md) that replaces the 'Abstract' section"
+ }
+ }
+ },
"skipping_specific_steps": {
"title": "Skipping specific steps",
"type": "object",
@@ -564,6 +597,10 @@
"skip_multiqc": {
"type": "boolean",
"description": "Skip MultiQC reporting"
+ },
+ "skip_report": {
+ "type": "boolean",
+ "description": "Skip Markdown summary report"
}
}
},
@@ -802,6 +839,9 @@
{
"$ref": "#/definitions/downstream_analysis"
},
+ {
+ "$ref": "#/definitions/pipeline_report"
+ },
{
"$ref": "#/definitions/skipping_specific_steps"
},
diff --git a/subworkflows/local/dada2_preprocessing.nf b/subworkflows/local/dada2_preprocessing.nf
index fb2b44f3..12412c4a 100644
--- a/subworkflows/local/dada2_preprocessing.nf
+++ b/subworkflows/local/dada2_preprocessing.nf
@@ -41,10 +41,12 @@ workflow DADA2_PREPROCESSING {
.set { ch_all_trimmed_reads }
}
+ ch_DADA2_QUALITY1_SVG = Channel.empty()
if ( !params.skip_dada_quality ) {
DADA2_QUALITY1 ( ch_all_trimmed_reads.dump(tag: 'into_dada2_quality') )
ch_versions_dada2_preprocessing = ch_versions_dada2_preprocessing.mix(DADA2_QUALITY1.out.versions)
DADA2_QUALITY1.out.warning.subscribe { if ( it.baseName.toString().startsWith("WARNING") ) log.warn it.baseName.toString().replace("WARNING ","DADA2_QUALITY1: ") }
+ ch_DADA2_QUALITY1_SVG = DADA2_QUALITY1.out.svg
}
//find truncation values in case they are not supplied
@@ -94,9 +96,12 @@ workflow DADA2_PREPROCESSING {
.mix ( ch_all_preprocessed_rv )
.set { ch_all_preprocessed_reads }
}
+
+ ch_DADA2_QUALITY2_SVG = Channel.empty()
if ( !params.skip_dada_quality ) {
DADA2_QUALITY2 ( ch_all_preprocessed_reads.dump(tag: 'into_dada2_quality2') )
DADA2_QUALITY2.out.warning.subscribe { if ( it.baseName.toString().startsWith("WARNING") ) log.warn it.baseName.toString().replace("WARNING ","DADA2_QUALITY2: ") }
+ ch_DADA2_QUALITY2_SVG = DADA2_QUALITY2.out.svg
}
//group by sequencing run
@@ -118,7 +123,10 @@ workflow DADA2_PREPROCESSING {
.set { ch_filt_reads }
emit:
- reads = ch_filt_reads
- logs = DADA2_FILTNTRIM.out.log
- versions = ch_versions_dada2_preprocessing
+ reads = ch_filt_reads
+ logs = DADA2_FILTNTRIM.out.log
+ args = DADA2_FILTNTRIM.out.args
+ qc_svg = ch_DADA2_QUALITY1_SVG.collect()
+ qc_svg_preprocessed = ch_DADA2_QUALITY2_SVG.collect()
+ versions = ch_versions_dada2_preprocessing
}
diff --git a/subworkflows/local/dada2_taxonomy_wf.nf b/subworkflows/local/dada2_taxonomy_wf.nf
index c5259e6c..9673b45e 100644
--- a/subworkflows/local/dada2_taxonomy_wf.nf
+++ b/subworkflows/local/dada2_taxonomy_wf.nf
@@ -104,6 +104,7 @@ workflow DADA2_TAXONOMY_WF {
}
emit:
+ cut_tax = params.cut_dada_ref_taxonomy ? CUTADAPT_TAXONOMY.out.log : [[],[]]
tax = ch_dada2_tax
versions = ch_versions_dada_taxonomy
}
diff --git a/subworkflows/local/qiime2_ancom.nf b/subworkflows/local/qiime2_ancom.nf
index af83733d..ce308d78 100644
--- a/subworkflows/local/qiime2_ancom.nf
+++ b/subworkflows/local/qiime2_ancom.nf
@@ -34,4 +34,7 @@ workflow QIIME2_ANCOM {
QIIME2_ANCOM_TAX.out.ancom.subscribe { if ( it.baseName[0].toString().startsWith("WARNING") ) log.warn it.baseName[0].toString().replace("WARNING ","QIIME2_ANCOM_TAX: ") }
QIIME2_ANCOM_ASV ( ch_metadata.combine( QIIME2_FILTERSAMPLES_ANCOM.out.qza.flatten() ) )
+
+ emit:
+ ancom = QIIME2_ANCOM_ASV.out.ancom.mix(QIIME2_ANCOM_TAX.out.ancom)
}
diff --git a/subworkflows/local/qiime2_diversity.nf b/subworkflows/local/qiime2_diversity.nf
index b3d7f64b..02f0d91e 100644
--- a/subworkflows/local/qiime2_diversity.nf
+++ b/subworkflows/local/qiime2_diversity.nf
@@ -71,4 +71,11 @@ workflow QIIME2_DIVERSITY {
.set{ ch_to_diversity_betaord }
QIIME2_DIVERSITY_BETAORD ( ch_to_diversity_betaord )
}
+
+ emit:
+ depth = !skip_diversity_indices ? QIIME2_DIVERSITY_CORE.out.depth : []
+ alpha = !skip_diversity_indices ? QIIME2_DIVERSITY_ALPHA.out.alpha : []
+ beta = !skip_diversity_indices ? QIIME2_DIVERSITY_BETA.out.beta : []
+ betaord = !skip_diversity_indices ? QIIME2_DIVERSITY_BETAORD.out.beta : []
+ adonis = !skip_diversity_indices && params.qiime_adonis_formula ? QIIME2_DIVERSITY_ADONIS.out.html : []
}
diff --git a/tests/pipeline/doubleprimers.nf.test b/tests/pipeline/doubleprimers.nf.test
index cd810025..5d641077 100644
--- a/tests/pipeline/doubleprimers.nf.test
+++ b/tests/pipeline/doubleprimers.nf.test
@@ -29,11 +29,10 @@ nextflow_pipeline {
path("$outputDir/dada2/DADA2_stats.tsv"),
path("$outputDir/dada2/DADA2_table.rds"),
path("$outputDir/dada2/DADA2_table.tsv")).match("dada2") },
- { assert new File("$outputDir/qiime2/input/rep-seqs.qza").exists() },
- { assert new File("$outputDir/qiime2/input/table.qza").exists() },
{ assert snapshot(path("$outputDir/input/Samplesheet_double_primer.tsv")).match("input") },
{ assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"),
- path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }
+ path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") },
+ { assert new File("$outputDir/summary_report/summary_report.html").exists() }
)
}
}
diff --git a/tests/pipeline/doubleprimers.nf.test.snap b/tests/pipeline/doubleprimers.nf.test.snap
index 64ddaa21..cefcf1b9 100644
--- a/tests/pipeline/doubleprimers.nf.test.snap
+++ b/tests/pipeline/doubleprimers.nf.test.snap
@@ -13,9 +13,9 @@
},
"software_versions": {
"content": [
- "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, QIIME2_INSEQ={qiime2=2022.11.1}, RENAME_RAW_DATA_FILES={sed=4.7}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.7.0dev}}"
+ "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, RENAME_RAW_DATA_FILES={sed=4.7}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.7.0dev}}"
],
- "timestamp": "2023-05-28T21:08:54+0000"
+ "timestamp": "2023-07-27T13:49:03+0000"
},
"overall_summary_tsv": {
"content": [
diff --git a/tests/pipeline/fasta.nf.test b/tests/pipeline/fasta.nf.test
index 9daca857..8db0826b 100644
--- a/tests/pipeline/fasta.nf.test
+++ b/tests/pipeline/fasta.nf.test
@@ -25,7 +25,8 @@ nextflow_pipeline {
{ assert snapshot(path("$outputDir/dada2/ref_taxonomy.rdp_18.txt")).match("dada2") },
{ assert new File("$outputDir/dada2/ASV_tax_species.rdp_18.tsv").exists() },
{ assert new File("$outputDir/dada2/ASV_tax.rdp_18.tsv").exists() },
- { assert snapshot(path("$outputDir/input/ASV_seqs.fasta")).match("input") }
+ { assert snapshot(path("$outputDir/input/ASV_seqs.fasta")).match("input") },
+ { assert new File("$outputDir/summary_report/summary_report.html").exists() }
)
}
}
diff --git a/tests/pipeline/iontorrent.nf.test b/tests/pipeline/iontorrent.nf.test
index a4a16631..200a9825 100644
--- a/tests/pipeline/iontorrent.nf.test
+++ b/tests/pipeline/iontorrent.nf.test
@@ -39,6 +39,7 @@ nextflow_pipeline {
{ assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"),
path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"),
path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") },
+ { assert new File("$outputDir/summary_report/summary_report.html").exists() },
{ assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() }
)
}
diff --git a/tests/pipeline/multi.nf.test b/tests/pipeline/multi.nf.test
index 75e2e374..3e01ff20 100644
--- a/tests/pipeline/multi.nf.test
+++ b/tests/pipeline/multi.nf.test
@@ -64,6 +64,7 @@ nextflow_pipeline {
{ assert new File("$outputDir/qiime2/representative_sequences/rep-seq.fasta").exists() },
{ assert snapshot(path("$outputDir/qiime2/representative_sequences/descriptive_stats.tsv"),
path("$outputDir/qiime2/representative_sequences/seven_number_summary.tsv")).match("qiime2") },
+ { assert new File("$outputDir/summary_report/summary_report.html").exists() },
{ assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() }
)
}
diff --git a/tests/pipeline/novaseq.nf.test b/tests/pipeline/novaseq.nf.test
index a2101d3d..a346898d 100644
--- a/tests/pipeline/novaseq.nf.test
+++ b/tests/pipeline/novaseq.nf.test
@@ -28,7 +28,8 @@ nextflow_pipeline {
{ assert new File("$outputDir/fastqc/S2_2_fastqc.html").exists() },
{ assert snapshot(path("$outputDir/input/Samplesheet_novaseq.tsv")).match("input") },
{ assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"),
- path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt")).match("multiqc") }
+ path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt")).match("multiqc") },
+ { assert new File("$outputDir/summary_report/summary_report.html").exists() }
)
}
}
diff --git a/tests/pipeline/pacbio_its.nf.test b/tests/pipeline/pacbio_its.nf.test
index 144db928..ffe4b31c 100644
--- a/tests/pipeline/pacbio_its.nf.test
+++ b/tests/pipeline/pacbio_its.nf.test
@@ -53,6 +53,7 @@ nextflow_pipeline {
path("$outputDir/SBDI/event.tsv")).match("SBDI") },
{ assert new File("$outputDir/SBDI/annotation.tsv").exists() },
{ assert new File("$outputDir/SBDI/asv-table.tsv").exists() },
+ { assert new File("$outputDir/summary_report/summary_report.html").exists() },
{ assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() }
)
}
diff --git a/tests/pipeline/pplace.nf.test b/tests/pipeline/pplace.nf.test
index d348bee8..564cf2b9 100644
--- a/tests/pipeline/pplace.nf.test
+++ b/tests/pipeline/pplace.nf.test
@@ -56,6 +56,7 @@ nextflow_pipeline {
{ assert new File("$outputDir/pplace/test_pplace.graft.test_pplace.epa_result.newick").exists() },
{ assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"),
path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") },
+ { assert new File("$outputDir/summary_report/summary_report.html").exists() },
{ assert new File("$outputDir/phyloseq/qiime2_phyloseq.rds").exists() }
)
}
diff --git a/tests/pipeline/reftaxcustom.nf.test b/tests/pipeline/reftaxcustom.nf.test
index 19035ccb..9183b126 100644
--- a/tests/pipeline/reftaxcustom.nf.test
+++ b/tests/pipeline/reftaxcustom.nf.test
@@ -44,6 +44,7 @@ nextflow_pipeline {
{ assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"),
path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"),
path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") },
+ { assert new File("$outputDir/summary_report/summary_report.html").exists() },
{ assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() }
)
}
diff --git a/tests/pipeline/single.nf.test b/tests/pipeline/single.nf.test
index 1aa634a0..02d54e9e 100644
--- a/tests/pipeline/single.nf.test
+++ b/tests/pipeline/single.nf.test
@@ -45,6 +45,7 @@ nextflow_pipeline {
{ assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"),
path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"),
path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") },
+ { assert new File("$outputDir/summary_report/summary_report.html").exists() },
{ assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() }
)
}
diff --git a/tests/pipeline/sintax.nf.test b/tests/pipeline/sintax.nf.test
index fb0c8c15..f4ff3a4f 100644
--- a/tests/pipeline/sintax.nf.test
+++ b/tests/pipeline/sintax.nf.test
@@ -66,6 +66,7 @@ nextflow_pipeline {
{ assert new File("$outputDir/sintax/ref_taxonomy_sintax.txt").exists() },
{ assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"),
path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") },
+ { assert new File("$outputDir/summary_report/summary_report.html").exists() },
{ assert new File("$outputDir/phyloseq/sintax_phyloseq.rds").exists() }
)
}
diff --git a/tests/pipeline/test.nf.test b/tests/pipeline/test.nf.test
index e8ba0ce0..0e0e571a 100644
--- a/tests/pipeline/test.nf.test
+++ b/tests/pipeline/test.nf.test
@@ -94,6 +94,7 @@ nextflow_pipeline {
path("$outputDir/SBDI/event.tsv")).match("SBDI") },
{ assert new File("$outputDir/SBDI/annotation.tsv").exists() },
{ assert new File("$outputDir/SBDI/asv-table.tsv").exists() },
+ { assert new File("$outputDir/summary_report/summary_report.html").exists() },
{ assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() },
{ assert new File("$outputDir/phyloseq/qiime2_phyloseq.rds").exists() }
)
diff --git a/workflows/ampliseq.nf b/workflows/ampliseq.nf
index 04255de8..5d7cdb3d 100644
--- a/workflows/ampliseq.nf
+++ b/workflows/ampliseq.nf
@@ -71,6 +71,11 @@ if (params.sintax_ref_taxonomy && !params.skip_taxonomy) {
val_sintax_ref_taxonomy = "none"
}
+// report sources
+ch_report_template = Channel.fromPath("${params.report_template}", checkIfExists: true)
+ch_report_css = Channel.fromPath("${params.report_css}", checkIfExists: true)
+ch_report_logo = Channel.fromPath("${params.report_logo}", checkIfExists: true)
+ch_report_abstract = params.report_abstract ? Channel.fromPath(params.report_abstract, checkIfExists: true) : []
// Set non-params Variables
@@ -166,6 +171,7 @@ include { QIIME2_INTAX } from '../modules/local/qiime2_intax'
include { PICRUST } from '../modules/local/picrust'
include { SBDIEXPORT } from '../modules/local/sbdiexport'
include { SBDIEXPORTREANNOTATE } from '../modules/local/sbdiexportreannotate'
+include { SUMMARY_REPORT } from '../modules/local/summary_report'
include { PHYLOSEQ_INTAX as PHYLOSEQ_INTAX_PPLACE } from '../modules/local/phyloseq_intax'
include { PHYLOSEQ_INTAX as PHYLOSEQ_INTAX_QIIME2 } from '../modules/local/phyloseq_intax'
@@ -508,23 +514,29 @@ workflow AMPLISEQ {
// Import taxonomic classification into QIIME2, if available
if ( params.skip_taxonomy ) {
log.info "Skip taxonomy classification"
+ val_used_taxonomy = "skipped"
ch_tax = Channel.empty()
tax_agglom_min = 1
tax_agglom_max = 2
} else if ( params.sintax_ref_taxonomy ) {
log.info "Use SINTAX taxonomy classification"
+ val_used_taxonomy = "SINTAX"
ch_tax = QIIME2_INTAX ( ch_sintax_tax ).qza
} else if ( params.pplace_tree && params.pplace_taxonomy) {
log.info "Use EPA-NG / GAPPA taxonomy classification"
+ val_used_taxonomy = "phylogenetic placement"
ch_tax = QIIME2_INTAX ( ch_pplace_tax ).qza
} else if ( params.dada_ref_taxonomy && !params.skip_dada_taxonomy ) {
log.info "Use DADA2 taxonomy classification"
+ val_used_taxonomy = "DADA2"
ch_tax = QIIME2_INTAX ( ch_dada2_tax ).qza
} else if ( params.qiime_ref_taxonomy || params.classifier ) {
log.info "Use QIIME2 taxonomy classification"
+ val_used_taxonomy = "QIIME2"
ch_tax = QIIME2_TAXONOMY.out.qza
} else {
log.info "Use no taxonomy classification"
+ val_used_taxonomy = "none"
ch_tax = Channel.empty()
tax_agglom_min = 1
tax_agglom_max = 2
@@ -696,6 +708,71 @@ workflow AMPLISEQ {
multiqc_report = MULTIQC.out.report.toList()
}
+ //
+ // MODULE: Summary Report
+ //
+ if (!params.skip_report) {
+ SUMMARY_REPORT (
+ ch_report_template,
+ ch_report_css,
+ ch_report_logo,
+ ch_report_abstract,
+ ch_metadata.ifEmpty( [] ),
+ params.input.toString().toLowerCase().endsWith("tsv") ? file(params.input) : [], // samplesheet input
+ is_fasta_input ? PARSE_INPUT.out.fasta.ifEmpty( [] ) : [], // fasta input
+ !is_fasta_input && !params.skip_fastqc && !params.skip_multiqc ? MULTIQC.out.plots : [], //.collect().flatten().collectFile(name: "mqc_fastqc_per_sequence_quality_scores_plot_1.svg")
+ !params.skip_cutadapt ? CUTADAPT_WORKFLOW.out.summary.collect().ifEmpty( [] ) : [],
+ find_truncation_values,
+ DADA2_PREPROCESSING.out.args.first().ifEmpty( [] ),
+ !params.skip_dada_quality ? DADA2_PREPROCESSING.out.qc_svg.ifEmpty( [] ) : [],
+ !params.skip_dada_quality ? DADA2_PREPROCESSING.out.qc_svg_preprocessed.ifEmpty( [] ) : [],
+ DADA2_ERR.out.svg
+ .map {
+ meta_old, svgs ->
+ def meta = [:]
+ meta.single_end = meta_old.single_end
+ [ meta, svgs, meta_old.run ] }
+ .groupTuple(by: 0 )
+ .map {
+ meta_old, svgs, runs ->
+ def meta = [:]
+ meta.single_end = meta_old.single_end
+ meta.run = runs.flatten()
+ [ meta, svgs.flatten() ]
+ }.ifEmpty( [[],[]] ),
+ DADA2_MERGE.out.asv.ifEmpty( [] ),
+ ch_unfiltered_fasta.ifEmpty( [] ), // this is identical to DADA2_MERGE.out.fasta if !is_fasta_input
+ DADA2_MERGE.out.dada2asv.ifEmpty( [] ),
+ DADA2_MERGE.out.dada2stats.ifEmpty( [] ),
+ !params.skip_barrnap ? BARRNAPSUMMARY.out.summary.ifEmpty( [] ) : [],
+ params.filter_ssu ? FILTER_SSU.out.stats.ifEmpty( [] ) : [],
+ params.filter_ssu ? FILTER_SSU.out.asv.ifEmpty( [] ) : [],
+ params.min_len_asv || params.max_len_asv ? FILTER_LEN_ASV.out.stats.ifEmpty( [] ) : [],
+ params.min_len_asv || params.max_len_asv ? FILTER_LEN_ASV.out.len_orig.ifEmpty( [] ) : [],
+ params.filter_codons ? FILTER_CODONS.out.stats.ifEmpty( [] ) : [],
+ params.cut_its != "none" ? ITSX_CUTASV.out.summary.ifEmpty( [] ) : [],
+ !params.skip_taxonomy && params.dada_ref_taxonomy && !params.skip_dada_taxonomy ? ch_dada2_tax.ifEmpty( [] ) : [],
+ !params.skip_taxonomy && params.dada_ref_taxonomy && !params.skip_dada_taxonomy ? DADA2_TAXONOMY_WF.out.cut_tax.ifEmpty( [[],[]] ) : [[],[]],
+ !params.skip_taxonomy && params.sintax_ref_taxonomy ? ch_sintax_tax.ifEmpty( [] ) : [],
+ !params.skip_taxonomy && params.pplace_tree ? ch_pplace_tax.ifEmpty( [] ) : [],
+ !params.skip_taxonomy && params.pplace_tree ? FASTA_NEWICK_EPANG_GAPPA.out.heattree.ifEmpty( [[],[]] ) : [[],[]],
+ !params.skip_taxonomy && ( params.qiime_ref_taxonomy || params.classifier ) && run_qiime2 ? QIIME2_TAXONOMY.out.tsv.ifEmpty( [] ) : [],
+ run_qiime2,
+ run_qiime2 ? val_used_taxonomy : "",
+ run_qiime2 && ( params.exclude_taxa != "none" || params.min_frequency != 1 || params.min_samples != 1 ) ? ch_dada2_asv.countLines()+","+QIIME2_FILTERTAXA.out.tsv.countLines() : "",
+ run_qiime2 && ( params.exclude_taxa != "none" || params.min_frequency != 1 || params.min_samples != 1 ) ? FILTER_STATS.out.tsv.ifEmpty( [] ) : [],
+ run_qiime2 && !params.skip_barplot ? QIIME2_BARPLOT.out.folder.ifEmpty( [] ) : [],
+ run_qiime2 && !params.skip_abundance_tables ? "done" : "",
+ run_qiime2 && !params.skip_alpha_rarefaction && params.metadata ? "done" : "",
+ run_qiime2 && !params.skip_diversity_indices && params.metadata ? QIIME2_DIVERSITY.out.depth.ifEmpty( [] ) : [],
+ run_qiime2 && !params.skip_diversity_indices && params.metadata ? QIIME2_DIVERSITY.out.beta.collect().ifEmpty( [] ) : [],
+ run_qiime2 && !params.skip_diversity_indices && params.metadata ? QIIME2_DIVERSITY.out.adonis.collect().ifEmpty( [] ) : [],
+ run_qiime2 && !params.skip_ancom && params.metadata ? QIIME2_ANCOM.out.ancom.collect().ifEmpty( [] ) : [],
+ params.picrust ? PICRUST.out.pathways.ifEmpty( [] ) : []
+ )
+ ch_versions = ch_versions.mix(SUMMARY_REPORT.out.versions)
+ }
+
//Save input in results folder
input = file(params.input)
if ( is_fasta_input || input.toString().toLowerCase().endsWith("tsv") ) {