diff --git a/CITATIONS.md b/CITATIONS.md index dac2da54..873bc5d8 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -111,6 +111,10 @@ > Jari Oksanen, F. Guillaume Blanchet, Michael Friendly, Roeland Kindt, Pierre Legendre, Dan McGlinn, Peter R. Minchin, R. B. O’Hara, Gavin L. Simpson, Peter Solymos, M. Henry H. Stevens, Eduard Szoecs, and Helene Wagner. vegan: Community Ecology Package. 2018. R package version 2.5-3. +- [Phyloseq](https://doi.org/10.1371/journal.pone.0061217) + + > McMurdie PJ, Holmes S (2013). “phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data.” PLoS ONE, 8(4), e61217. + ### Non-default tools - [ITSx](https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.12073) diff --git a/README.md b/README.md index 34524010..e6b84050 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ By default, the pipeline currently performs the following: - Taxonomical classification using DADA2, [SINTAX](https://doi.org/10.1101/074161) or [QIIME2](https://www.nature.com/articles/s41587-019-0209-9) - Excludes unwanted taxa, produces absolute and relative feature/taxa count tables and plots, plots alpha rarefaction curves, computes alpha and beta diversity indices and plots thereof ([QIIME2](https://www.nature.com/articles/s41587-019-0209-9)) - Calls differentially abundant taxa ([ANCOM](https://www.ncbi.nlm.nih.gov/pubmed/26028277)) +- Creates phyloseq R objects ([Phyloseq](https://www.bioconductor.org/packages/release/bioc/html/phyloseq.html)) - Pipeline QC summaries ([MultiQC](https://multiqc.info/)) - Pipeline summary report ([R Markdown](https://github.com/rstudio/rmarkdown)) diff --git a/bin/reformat_tax_for_phyloseq.py b/bin/reformat_tax_for_phyloseq.py new file mode 100755 index 00000000..f35aaf03 --- /dev/null +++ b/bin/reformat_tax_for_phyloseq.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 + +import pandas as pd +import sys + +tax_file = sys.argv[1] +out_file = sys.argv[2] + +# Import tsv file +tax_df = pd.read_csv(tax_file, sep="\t") + +# The second column should hold the taxonomy information +tax_col = tax_df.columns[1] + +# Split the values in the tax column +split_tax = tax_df[tax_col].str.split(";", expand=True) + +# Assign names to the new columns with an auto incrementing integer +new_col_names = [f"{tax_col}_{i+1}" for i in range(split_tax.shape[1])] +split_tax.columns = new_col_names + +# Strip whitespace from the tax names +split_tax = split_tax.applymap(lambda x: x.strip() if isinstance(x, str) else x) + +# Drop the original tax column +tax_df = tax_df.drop(columns=[tax_col]) + +# Add the new tax columns to the df +result = pd.concat([tax_df, split_tax], axis=1) + +# Create new tsv file +result.to_csv(out_file, sep="\t", index=False) diff --git a/conf/modules.config b/conf/modules.config index 74e7afe7..bc91b125 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -785,6 +785,14 @@ process { ] } + withName: 'PHYLOSEQ' { + publishDir = [ + path: { "${params.outdir}/phyloseq" }, + mode: params.publish_dir_mode, + pattern: "*.rds" + ] + } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/docs/output.md b/docs/output.md index 0a407aa3..9e9eb75a 100644 --- a/docs/output.md +++ b/docs/output.md @@ -42,6 +42,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Diversity analysis](#diversity-analysis) - High level overview with different diversity indices - [ANCOM](#ancom) - Differential abundance analysis - [PICRUSt2](#picrust2) - Predict the functional potential of a bacterial community +- [SBDI export](#sbdi-export) - Swedish Biodiversity Infrastructure (SBDI) submission file +- [Phyloseq](#phyloseq) - Phyloseq R objects - [Read count report](#read-count-report) - Report of read counts during various steps of the pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution @@ -533,6 +535,18 @@ Most of the fields in the template will not be populated by the export process, +### Phyloseq + +This directory will hold phyloseq objects for each taxonomy table produced by this pipeline. The objects will contain an ASV abundance table and a taxonomy table. If the pipeline is provided with metadata, that metadata will also be included in the phyloseq object. A phylogenetic tree will also be included if the pipeline produces a tree. + +
+Output files + +- `phyloseq/` + - `_phyloseq.rds`: Phyloseq R object. + +
+ ## Read count report This report includes information on how many reads per sample passed each pipeline step in which a loss can occur. Specifically, how many read pairs entered cutadapt, were reverse complemented, passed trimming; how many read pairs entered DADA2, were denoised, merged and non-chimeric; and how many counts were lost during excluding unwanted taxa and removing low abundance/prevalence sequences in QIIME2. diff --git a/modules/local/phyloseq.nf b/modules/local/phyloseq.nf new file mode 100644 index 00000000..54537213 --- /dev/null +++ b/modules/local/phyloseq.nf @@ -0,0 +1,63 @@ +process PHYLOSEQ { + tag "$prefix" + label 'process_low' + + conda "bioconda::bioconductor-phyloseq=1.44.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-phyloseq:1.44.0--r43hdfd78af_0' : + 'quay.io/biocontainers/bioconductor-phyloseq:1.44.0--r43hdfd78af_0' }" + + input: + tuple val(prefix), path(tax_tsv) + path otu_tsv + path sam_tsv + path tree + + output: + tuple val(prefix), path("*phyloseq.rds"), emit: rds + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def sam_tsv = "\"${sam_tsv}\"" + def otu_tsv = "\"${otu_tsv}\"" + def tax_tsv = "\"${tax_tsv}\"" + def tree = "\"${tree}\"" + def prefix = "\"${prefix}\"" + """ + #!/usr/bin/env Rscript + + suppressPackageStartupMessages(library(phyloseq)) + + otu_df <- read.table($otu_tsv, sep="\\t", header=TRUE, row.names=1) + tax_df <- read.table($tax_tsv, sep="\\t", header=TRUE, row.names=1) + otu_mat <- as.matrix(otu_df) + tax_mat <- as.matrix(tax_df) + + OTU <- otu_table(otu_mat, taxa_are_rows=TRUE) + TAX <- tax_table(tax_mat) + phy_obj <- phyloseq(OTU, TAX) + + if (file.exists($sam_tsv)) { + sam_df <- read.table($sam_tsv, sep="\\t", header=TRUE, row.names=1) + SAM <- sample_data(sam_df) + phy_obj <- merge_phyloseq(phy_obj, SAM) + } + + if (file.exists($tree)) { + TREE <- read_tree($tree) + phy_obj <- merge_phyloseq(phy_obj, TREE) + } + + saveRDS(phy_obj, file = paste0($prefix, "_phyloseq.rds")) + + # Version information + writeLines(c("\\"${task.process}\\":", + paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")), + paste0(" phyloseq: ", packageVersion("phyloseq"))), + "versions.yml" + ) + """ +} diff --git a/modules/local/phyloseq_inasv.nf b/modules/local/phyloseq_inasv.nf new file mode 100644 index 00000000..f66d1669 --- /dev/null +++ b/modules/local/phyloseq_inasv.nf @@ -0,0 +1,28 @@ +process PHYLOSEQ_INASV { + label 'process_low' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + path(biom_file) + + output: + path( "*.tsv" ) , emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + tail $biom_file -n +2 | sed '1s/#OTU ID/ASV_ID/' > reformat_$biom_file + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bash: \$(bash --version | sed -n 1p | sed 's/GNU bash, version //g') + END_VERSIONS + """ +} diff --git a/modules/local/phyloseq_intax.nf b/modules/local/phyloseq_intax.nf new file mode 100644 index 00000000..6dbd8487 --- /dev/null +++ b/modules/local/phyloseq_intax.nf @@ -0,0 +1,29 @@ +process PHYLOSEQ_INTAX { + label 'process_low' + + conda "conda-forge::pandas=1.1.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pandas:1.1.5': + 'biocontainers/pandas:1.1.5' }" + + input: + path(tax_tsv) + + output: + path( "*.tsv" ) , emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + reformat_tax_for_phyloseq.py $tax_tsv reformat_$tax_tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)") + END_VERSIONS + """ +} diff --git a/subworkflows/local/phyloseq_workflow.nf b/subworkflows/local/phyloseq_workflow.nf new file mode 100644 index 00000000..adf208b7 --- /dev/null +++ b/subworkflows/local/phyloseq_workflow.nf @@ -0,0 +1,44 @@ +/* + * Create phyloseq objects + */ + +include { PHYLOSEQ } from '../../modules/local/phyloseq' +include { PHYLOSEQ_INASV } from '../../modules/local/phyloseq_inasv' + +workflow PHYLOSEQ_WORKFLOW { + take: + ch_tax + ch_tsv + ch_meta + ch_tree + run_qiime2 + + main: + if ( params.metadata ) { + ch_phyloseq_inmeta = ch_meta.first() // The .first() is to make sure it's a value channel + } else { + ch_phyloseq_inmeta = [] + } + + if ( params.pplace_tree ) { + ch_phyloseq_intree = ch_tree.map { it = it[1] }.first() + } else { + ch_phyloseq_intree = [] + } + + if ( run_qiime2 ) { + if ( params.exclude_taxa != "none" || params.min_frequency != 1 || params.min_samples != 1 ) { + ch_phyloseq_inasv = PHYLOSEQ_INASV ( ch_tsv ).tsv + } else { + ch_phyloseq_inasv = ch_tsv + } + } else { + ch_phyloseq_inasv = ch_tsv + } + + PHYLOSEQ ( ch_tax, ch_phyloseq_inasv, ch_phyloseq_inmeta, ch_phyloseq_intree ) + + emit: + rds = PHYLOSEQ.out.rds + versions= PHYLOSEQ.out.versions +} diff --git a/tests/pipeline/iontorrent.nf.test b/tests/pipeline/iontorrent.nf.test index 6a7c3a9f..200a9825 100644 --- a/tests/pipeline/iontorrent.nf.test +++ b/tests/pipeline/iontorrent.nf.test @@ -39,7 +39,8 @@ nextflow_pipeline { { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"), path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"), path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }, - { assert new File("$outputDir/summary_report/summary_report.html").exists() } + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() } ) } } diff --git a/tests/pipeline/iontorrent.nf.test.snap b/tests/pipeline/iontorrent.nf.test.snap index c9c8f4bb..c7fbfb89 100644 --- a/tests/pipeline/iontorrent.nf.test.snap +++ b/tests/pipeline/iontorrent.nf.test.snap @@ -13,7 +13,7 @@ }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, RENAME_RAW_DATA_FILES={sed=4.7}, Workflow={nf-core/ampliseq=2.7.0dev}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, RENAME_RAW_DATA_FILES={sed=4.7}, Workflow={nf-core/ampliseq=2.7.0dev}}" ], "timestamp": "2023-06-20T01:42:35+0000" }, diff --git a/tests/pipeline/multi.nf.test b/tests/pipeline/multi.nf.test index c0b099bd..3e01ff20 100644 --- a/tests/pipeline/multi.nf.test +++ b/tests/pipeline/multi.nf.test @@ -64,7 +64,8 @@ nextflow_pipeline { { assert new File("$outputDir/qiime2/representative_sequences/rep-seq.fasta").exists() }, { assert snapshot(path("$outputDir/qiime2/representative_sequences/descriptive_stats.tsv"), path("$outputDir/qiime2/representative_sequences/seven_number_summary.tsv")).match("qiime2") }, - { assert new File("$outputDir/summary_report/summary_report.html").exists() } + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() } ) } } diff --git a/tests/pipeline/multi.nf.test.snap b/tests/pipeline/multi.nf.test.snap index 2f0095ac..25b1437c 100644 --- a/tests/pipeline/multi.nf.test.snap +++ b/tests/pipeline/multi.nf.test.snap @@ -14,7 +14,7 @@ }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, QIIME2_INSEQ={qiime2=2022.11.1}, RENAME_RAW_DATA_FILES={sed=4.7}, Workflow={nf-core/ampliseq=2.7.0dev}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, QIIME2_INSEQ={qiime2=2022.11.1}, RENAME_RAW_DATA_FILES={sed=4.7}, Workflow={nf-core/ampliseq=2.7.0dev}}" ], "timestamp": "2023-05-28T21:15:03+0000" }, diff --git a/tests/pipeline/pacbio_its.nf.test b/tests/pipeline/pacbio_its.nf.test index c5314798..ffe4b31c 100644 --- a/tests/pipeline/pacbio_its.nf.test +++ b/tests/pipeline/pacbio_its.nf.test @@ -53,7 +53,8 @@ nextflow_pipeline { path("$outputDir/SBDI/event.tsv")).match("SBDI") }, { assert new File("$outputDir/SBDI/annotation.tsv").exists() }, { assert new File("$outputDir/SBDI/asv-table.tsv").exists() }, - { assert new File("$outputDir/summary_report/summary_report.html").exists() } + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() } ) } } diff --git a/tests/pipeline/pacbio_its.nf.test.snap b/tests/pipeline/pacbio_its.nf.test.snap index 3c860a89..775e5195 100644 --- a/tests/pipeline/pacbio_its.nf.test.snap +++ b/tests/pipeline/pacbio_its.nf.test.snap @@ -35,7 +35,7 @@ }, "software_versions": { "content": [ - "{ASSIGNSH={pandas=1.1.5, python=3.9.1}, BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, FORMAT_TAXRESULTS_STD={pandas=1.1.5, python=3.9.1}, ITSX_CUTASV={ITSx=1.1.3}, RENAME_RAW_DATA_FILES={sed=4.7}, SBDIEXPORT={R=3.6.3}, VSEARCH_USEARCHGLOBAL={vsearch=2.21.1}, Workflow={nf-core/ampliseq=2.7.0dev}}" + "{ASSIGNSH={pandas=1.1.5, python=3.9.1}, BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, FORMAT_TAXRESULTS_STD={pandas=1.1.5, python=3.9.1}, ITSX_CUTASV={ITSx=1.1.3}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, RENAME_RAW_DATA_FILES={sed=4.7}, SBDIEXPORT={R=3.6.3}, VSEARCH_USEARCHGLOBAL={vsearch=2.21.1}, Workflow={nf-core/ampliseq=2.7.0dev}}" ], "timestamp": "2023-06-20T02:07:02+0000" }, diff --git a/tests/pipeline/pplace.nf.test b/tests/pipeline/pplace.nf.test index b0507df7..564cf2b9 100644 --- a/tests/pipeline/pplace.nf.test +++ b/tests/pipeline/pplace.nf.test @@ -56,7 +56,8 @@ nextflow_pipeline { { assert new File("$outputDir/pplace/test_pplace.graft.test_pplace.epa_result.newick").exists() }, { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"), path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }, - { assert new File("$outputDir/summary_report/summary_report.html").exists() } + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/qiime2_phyloseq.rds").exists() } ) } } diff --git a/tests/pipeline/pplace.nf.test.snap b/tests/pipeline/pplace.nf.test.snap index d0aa5f26..9ee79d29 100644 --- a/tests/pipeline/pplace.nf.test.snap +++ b/tests/pipeline/pplace.nf.test.snap @@ -8,7 +8,7 @@ }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, EPANG_PLACE={epang=0.3.8}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, GAPPA_ASSIGN={gappa=0.8.0}, GAPPA_GRAFT={gappa=0.8.0}, GAPPA_HEATTREE={gappa=0.8.0}, HMMER_AFAFORMATQUERY={hmmer/easel=0.48}, HMMER_AFAFORMATREF={hmmer/easel=0.48}, HMMER_HMMALIGNQUERY={hmmer=3.3.2}, HMMER_HMMALIGNREF={hmmer=3.3.2}, HMMER_HMMBUILD={hmmer=3.3.2}, HMMER_MASKQUERY={hmmer/easel=0.48}, HMMER_MASKREF={hmmer/easel=0.48}, HMMER_UNALIGNREF={hmmer/easel=0.48}, QIIME2_INSEQ={qiime2=2022.11.1}, RENAME_RAW_DATA_FILES={sed=4.7}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.7.0dev}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, EPANG_PLACE={epang=0.3.8}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, GAPPA_ASSIGN={gappa=0.8.0}, GAPPA_GRAFT={gappa=0.8.0}, GAPPA_HEATTREE={gappa=0.8.0}, HMMER_AFAFORMATQUERY={hmmer/easel=0.48}, HMMER_AFAFORMATREF={hmmer/easel=0.48}, HMMER_HMMALIGNQUERY={hmmer=3.3.2}, HMMER_HMMALIGNREF={hmmer=3.3.2}, HMMER_HMMBUILD={hmmer=3.3.2}, HMMER_MASKQUERY={hmmer/easel=0.48}, HMMER_MASKREF={hmmer/easel=0.48}, HMMER_UNALIGNREF={hmmer/easel=0.48}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, QIIME2_INSEQ={qiime2=2022.11.1}, RENAME_RAW_DATA_FILES={sed=4.7}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.7.0dev}}" ], "timestamp": "2023-06-20T17:24:03+0000" }, diff --git a/tests/pipeline/reftaxcustom.nf.test b/tests/pipeline/reftaxcustom.nf.test index 48e98fdf..9183b126 100644 --- a/tests/pipeline/reftaxcustom.nf.test +++ b/tests/pipeline/reftaxcustom.nf.test @@ -44,7 +44,8 @@ nextflow_pipeline { { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"), path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"), path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }, - { assert new File("$outputDir/summary_report/summary_report.html").exists() } + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() } ) } } diff --git a/tests/pipeline/reftaxcustom.nf.test.snap b/tests/pipeline/reftaxcustom.nf.test.snap index 6407a3bf..7b33f261 100644 --- a/tests/pipeline/reftaxcustom.nf.test.snap +++ b/tests/pipeline/reftaxcustom.nf.test.snap @@ -13,7 +13,7 @@ }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, RENAME_RAW_DATA_FILES={sed=4.7}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.7.0dev}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, RENAME_RAW_DATA_FILES={sed=4.7}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.7.0dev}}" ], "timestamp": "2023-05-28T21:18:54+0000" }, diff --git a/tests/pipeline/single.nf.test b/tests/pipeline/single.nf.test index 44d71baf..02d54e9e 100644 --- a/tests/pipeline/single.nf.test +++ b/tests/pipeline/single.nf.test @@ -45,7 +45,8 @@ nextflow_pipeline { { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"), path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"), path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }, - { assert new File("$outputDir/summary_report/summary_report.html").exists() } + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() } ) } } diff --git a/tests/pipeline/single.nf.test.snap b/tests/pipeline/single.nf.test.snap index 49d65106..bd9096d0 100644 --- a/tests/pipeline/single.nf.test.snap +++ b/tests/pipeline/single.nf.test.snap @@ -13,7 +13,7 @@ }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, RENAME_RAW_DATA_FILES={sed=4.7}, Workflow={nf-core/ampliseq=2.7.0dev}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, RENAME_RAW_DATA_FILES={sed=4.7}, Workflow={nf-core/ampliseq=2.7.0dev}}" ], "timestamp": "2023-05-28T20:35:33+0000" }, diff --git a/tests/pipeline/sintax.nf.test b/tests/pipeline/sintax.nf.test index dd3d3892..f4ff3a4f 100644 --- a/tests/pipeline/sintax.nf.test +++ b/tests/pipeline/sintax.nf.test @@ -66,7 +66,8 @@ nextflow_pipeline { { assert new File("$outputDir/sintax/ref_taxonomy_sintax.txt").exists() }, { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"), path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }, - { assert new File("$outputDir/summary_report/summary_report.html").exists() } + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/sintax_phyloseq.rds").exists() } ) } } diff --git a/tests/pipeline/sintax.nf.test.snap b/tests/pipeline/sintax.nf.test.snap index c9745541..5f360a4b 100644 --- a/tests/pipeline/sintax.nf.test.snap +++ b/tests/pipeline/sintax.nf.test.snap @@ -16,7 +16,7 @@ }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, ITSX_CUTASV={ITSx=1.1.3}, QIIME2_INSEQ={qiime2=2022.11.1}, RENAME_RAW_DATA_FILES={sed=4.7}, SBDIEXPORT={R=3.6.3}, VSEARCH_SINTAX={vsearch=2.21.1}, Workflow={nf-core/ampliseq=2.7.0dev}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, ITSX_CUTASV={ITSx=1.1.3}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, QIIME2_INSEQ={qiime2=2022.11.1}, RENAME_RAW_DATA_FILES={sed=4.7}, SBDIEXPORT={R=3.6.3}, VSEARCH_SINTAX={vsearch=2.21.1}, Workflow={nf-core/ampliseq=2.7.0dev}}" ], "timestamp": "2023-06-20T16:40:18+0000" }, diff --git a/tests/pipeline/test.nf.test b/tests/pipeline/test.nf.test index b9224114..0e0e571a 100644 --- a/tests/pipeline/test.nf.test +++ b/tests/pipeline/test.nf.test @@ -94,7 +94,9 @@ nextflow_pipeline { path("$outputDir/SBDI/event.tsv")).match("SBDI") }, { assert new File("$outputDir/SBDI/annotation.tsv").exists() }, { assert new File("$outputDir/SBDI/asv-table.tsv").exists() }, - { assert new File("$outputDir/summary_report/summary_report.html").exists() } + { assert new File("$outputDir/summary_report/summary_report.html").exists() }, + { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() }, + { assert new File("$outputDir/phyloseq/qiime2_phyloseq.rds").exists() } ) } } diff --git a/tests/pipeline/test.nf.test.snap b/tests/pipeline/test.nf.test.snap index fdf84093..b345de55 100644 --- a/tests/pipeline/test.nf.test.snap +++ b/tests/pipeline/test.nf.test.snap @@ -22,7 +22,7 @@ }, "software_versions": { "content": [ - "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, FILTER_LEN_ASV={Biostrings=2.58.0, R=4.0.3}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, QIIME2_INSEQ={qiime2=2022.11.1}, RENAME_RAW_DATA_FILES={sed=4.7}, SBDIEXPORT={R=3.6.3}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.7.0dev}}" + "{BARRNAP={barrnap=0.9}, CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.0, yaml=6.0}, CUTADAPT_BASIC={cutadapt=3.4}, DADA2_DENOISING={R=4.1.1, dada2=1.22.0}, DADA2_FILTNTRIM={R=4.1.1, dada2=1.22.0}, DADA2_QUALITY1={R=4.1.1, ShortRead=1.52.0, dada2=1.22.0}, DADA2_TAXONOMY={R=4.1.1, dada2=1.22.0}, FASTQC={fastqc=0.11.9}, FILTER_LEN_ASV={Biostrings=2.58.0, R=4.0.3}, FILTER_STATS={pandas=1.1.5, python=3.9.1}, PHYLOSEQ={R=4.3.0, phyloseq=1.44.0}, QIIME2_INSEQ={qiime2=2022.11.1}, RENAME_RAW_DATA_FILES={sed=4.7}, SBDIEXPORT={R=3.6.3}, TRUNCLEN={pandas=1.1.5, python=3.9.1}, Workflow={nf-core/ampliseq=2.7.0dev}}" ], "timestamp": "2023-05-28T20:55:32+0000" }, diff --git a/workflows/ampliseq.nf b/workflows/ampliseq.nf index 4a971395..5d7cdb3d 100644 --- a/workflows/ampliseq.nf +++ b/workflows/ampliseq.nf @@ -128,6 +128,9 @@ if ( !(workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) if ( workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1 ) { log.warn "Conda or mamba is enabled, any steps involving QIIME2 are not available. Use a container engine instead of conda to enable all software." } } +// This tracks tax tables produced during pipeline and each table will be used during phyloseq +ch_tax_for_phyloseq = Channel.empty() + /* ======================================================================================== @@ -169,6 +172,8 @@ include { PICRUST } from '../modules/local/picrust' include { SBDIEXPORT } from '../modules/local/sbdiexport' include { SBDIEXPORTREANNOTATE } from '../modules/local/sbdiexportreannotate' include { SUMMARY_REPORT } from '../modules/local/summary_report' +include { PHYLOSEQ_INTAX as PHYLOSEQ_INTAX_PPLACE } from '../modules/local/phyloseq_intax' +include { PHYLOSEQ_INTAX as PHYLOSEQ_INTAX_QIIME2 } from '../modules/local/phyloseq_intax' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules @@ -185,6 +190,7 @@ include { QIIME2_EXPORT } from '../subworkflows/local/qiime2_exp include { QIIME2_BARPLOTAVG } from '../subworkflows/local/qiime2_barplotavg' include { QIIME2_DIVERSITY } from '../subworkflows/local/qiime2_diversity' include { QIIME2_ANCOM } from '../subworkflows/local/qiime2_ancom' +include { PHYLOSEQ_WORKFLOW } from '../subworkflows/local/phyloseq_workflow' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -429,6 +435,7 @@ workflow AMPLISEQ { taxlevels ).tax.set { ch_dada2_tax } ch_versions = ch_versions.mix(DADA2_TAXONOMY_WF.out.versions) + ch_tax_for_phyloseq = ch_tax_for_phyloseq.mix ( ch_dada2_tax.map { it = [ "dada2", file(it) ] } ) } else { ch_dada2_tax = Channel.empty() } @@ -443,6 +450,7 @@ workflow AMPLISEQ { sintax_taxlevels ).tax.set { ch_sintax_tax } ch_versions = ch_versions.mix(SINTAX_TAXONOMY_WF.out.versions) + ch_tax_for_phyloseq = ch_tax_for_phyloseq.mix ( ch_sintax_tax.map { it = [ "sintax", file(it) ] } ) } else { ch_sintax_tax = Channel.empty() } @@ -463,8 +471,8 @@ workflow AMPLISEQ { } FASTA_NEWICK_EPANG_GAPPA ( ch_pp_data ) ch_versions = ch_versions.mix( FASTA_NEWICK_EPANG_GAPPA.out.versions ) - ch_pplace_tax = FORMAT_PPLACETAX ( FASTA_NEWICK_EPANG_GAPPA.out.taxonomy_per_query ).tsv + ch_tax_for_phyloseq = ch_tax_for_phyloseq.mix ( PHYLOSEQ_INTAX_PPLACE ( ch_pplace_tax ).tsv.map { it = [ "pplace", file(it) ] } ) } else { ch_pplace_tax = Channel.empty() } @@ -484,6 +492,10 @@ workflow AMPLISEQ { ch_qiime_classifier ) ch_versions = ch_versions.mix( QIIME2_TAXONOMY.out.versions.ifEmpty(null) ) //usually a .first() is here, dont know why this leads here to a warning + ch_qiime2_tax = QIIME2_TAXONOMY.out.tsv + ch_tax_for_phyloseq = ch_tax_for_phyloseq.mix ( PHYLOSEQ_INTAX_QIIME2 ( ch_qiime2_tax ).tsv.map { it = [ "qiime2", file(it) ] } ) + } else { + ch_qiime2_tax = Channel.empty() } // @@ -553,7 +565,7 @@ workflow AMPLISEQ { } //Export various ASV tables if (!params.skip_abundance_tables) { - QIIME2_EXPORT ( ch_asv, ch_seq, ch_tax, QIIME2_TAXONOMY.out.tsv, ch_dada2_tax, ch_pplace_tax, ch_sintax_tax, tax_agglom_min, tax_agglom_max ) + QIIME2_EXPORT ( ch_asv, ch_seq, ch_tax, ch_qiime2_tax, ch_dada2_tax, ch_pplace_tax, ch_sintax_tax, tax_agglom_min, tax_agglom_max ) } if (!params.skip_barplot) { @@ -610,6 +622,8 @@ workflow AMPLISEQ { tax_agglom_max ) } + } else { + ch_tsv = ch_dada2_asv } // @@ -640,6 +654,26 @@ workflow AMPLISEQ { ch_versions = ch_versions.mix(SBDIEXPORT.out.versions.first()) } + // + // SUBWORKFLOW: Create phyloseq objects + // + if ( !params.skip_taxonomy ) { + if ( params.pplace_tree ) { + ch_tree_for_phyloseq = FASTA_NEWICK_EPANG_GAPPA.out.grafted_phylogeny + } else { + ch_tree_for_phyloseq = [] + } + + PHYLOSEQ_WORKFLOW ( + ch_tax_for_phyloseq, + ch_tsv, + ch_metadata.ifEmpty([]), + ch_tree_for_phyloseq, + run_qiime2 + ) + ch_versions = ch_versions.mix(PHYLOSEQ_WORKFLOW.out.versions.first()) + } + CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') )