diff --git a/CHANGELOG.md b/CHANGELOG.md index a1b5a81d..f7cf6c9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Added` -- Template update to nf-core tools v2.6 +- [#209](https://github.com/nf-core/airrflow/pull/209) Template update to nf-core tools v2.6. +- [#210](https://github.com/nf-core/airrflow/pull/210) Add fastp for read QC, adapter trimming and read clipping. ## [2.3.0] - 2022-09-22 "Expelliarmus" diff --git a/CITATIONS.md b/CITATIONS.md index 611485ca..10d23253 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -12,6 +12,10 @@ - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +- [Fastp](https://doi.org/10.1093/bioinformatics/bty560) + + > Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu, fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics. 2018 Sept 1; 34(17):i884–i890. doi: 10.1093/bioinformatics/bty560. + - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. diff --git a/README.md b/README.md index b68a98e8..d5976da2 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ On release, automated continuous integration tests run the pipeline on a full-si By default, the pipeline currently performs the following steps: -- Raw read quality control (`FastQC`) +- Raw read quality control, adapter trimming and read clipping (`fastp`) - Pre-processing (`pRESTO`) - Filtering sequences by sequencing quality. - Masking amplicon primers. @@ -35,6 +35,7 @@ By default, the pipeline currently performs the following steps: - Assembling R1 and R2 read mates. - Removing and annotating read duplicates with different UMI barcodes. 
- Filtering out sequences that do not have at least 2 duplicates. +- Post-assembly read quality control (`FastQC`) - Assigning gene segment alleles with `IgBlast` using the IMGT database (`Change-O`). - Finding the Hamming distance threshold for clone definition (`SHazaM`). - Clonal assignment: defining clonal lineages of the B-cell / T-cell populations (`Change-O`). diff --git a/conf/modules.config b/conf/modules.config index 9404957c..731e203a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -36,8 +36,36 @@ process { ] } - withName: FASTQC { - ext.args = '--quiet' + withName: 'FASTP' { + publishDir = [ + [ + path: { "${params.outdir}/fastp/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.{html,json,log}" + ], + [ + enabled: params.save_trimmed, + path: { "${params.outdir}/fastp/${meta.id}/" }, + mode: params.publish_dir_mode, + pattern: "*.fastp.fastq.gz" + ] + ] + ext.args = [ "--disable_quality_filtering --disable_length_filtering", + params.trim_fastq ? "" : "--disable_adapter_trimming", // Explicit ternary: the Elvis form (?:) would pass the literal "true" to fastp when trimming is enabled + params.clip_r1 > 0 ? "--trim_front1 ${params.clip_r1}" : "", // Remove bp from the 5' end of read 1 + params.clip_r2 > 0 ? "--trim_front2 ${params.clip_r2}" : "", // Remove bp from the 5' end of read 2 + params.three_prime_clip_r1 > 0 ? "--trim_tail1 ${params.three_prime_clip_r1}" : "", // Remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed + params.three_prime_clip_r2 > 0 ? "--trim_tail2 ${params.three_prime_clip_r2}" : "", // Remove bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed + params.trim_nextseq ? 
"--trim_poly_g" : "", // Force poly-G tail trimming (artefact of NextSeq/NovaSeq two-colour chemistry) + ].join(" ").trim() + } + + withName: 'GUNZIP_*' { + publishDir = [ + [ + enabled: false + ] + ] } withName: FASTQC_POSTASSEMBLY { diff --git a/docs/output.md b/docs/output.md index 838ac62e..bd8c88f7 100644 --- a/docs/output.md +++ b/docs/output.md @@ -10,7 +10,7 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - read quality control +- [FastP](#fastp) - read quality control, adapter trimming and read clipping - [pRESTO](#presto) - read pre-processing - [Filter by sequence quality](#filter-by-sequence-quality) - filter sequences by quality - [Mask primers](#mask-primers) - Masking primers @@ -21,6 +21,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Assemble mates](#assemble-mates) - Assemble sequence mates. - [Remove duplicates](#remove-duplicates) - Remove and annotate read duplicates. - [Filter sequences for at least 2 representative](#filter-sequences-for-at-least-2-representative) Filter sequences that do not have at least 2 duplicates. +- [FastQC](#fastqc) - read quality control post-assembly - [Change-O](#change-o) - Assign genes and clonotyping - [Assign genes with Igblast](#assign-genes-with-igblast) - [Make database from assigned genes](#make-database-from-assigned-genes) @@ -39,29 +40,20 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [MultiQC](#MultiQC) - MultiQC - [Pipeline information](#pipeline-information) - Pipeline information -## FastQC +## Fastp
Output files -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics for the raw unmated reads. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images for the raw unmated reads. - - `postassembly/` - - `*_ASSEMBLED_fastqc.html`: FastQC report containing quality metrics for the mated and quality filtered reads. - - `*_ASSEMBLED_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images for the mated and quality filtered reads. +- `fastp/` + - `<sample_id>/` + - `*.fastp.html`: fastp report in HTML format containing quality metrics for the raw reads before and after trimming. + - `*.fastp.json`: fastp report in JSON format containing the same quality metrics for the raw reads before and after trimming. + - `*.fastp.log`: fastp log file.
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). - -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) - -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) - -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) - -> **NB:** Two sets of FastQC plots are displayed in the MultiQC report: first for the raw _untrimmed_ and unmated reads and secondly for the assembled and QC filtered reads (but before collapsing duplicates). They may contain adapter sequence and potentially regions with low quality. +[fastp](https://doi.org/10.1093/bioinformatics/bty560) gives general quality metrics about your sequenced reads, as well as allows filtering reads by quality, trimming adapters and clipping reads at 5' or 3' ends. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [fastp documentation](https://github.com/OpenGene/fastp). ## presto @@ -193,6 +185,28 @@ Remove duplicates using [CollapseSeq](https://presto.readthedocs.io/en/version-0 Remove sequences which do not have 2 representative using [SplitSeq](https://presto.readthedocs.io/en/version-0.5.11/tools/SplitSeq.html) from the pRESTO Immcantation toolset. +## FastQC + +
+Output files + +- `fastqc/` + - `postassembly/` + - `*_ASSEMBLED_fastqc.html`: FastQC report containing quality metrics for the mated and quality filtered reads. + - `*_ASSEMBLED_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images for the mated and quality filtered reads. + +
+ +[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). + +![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) + +![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) + +![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) + +> **NB:** The FastQC plots displayed in the MultiQC report are for the assembled and QC filtered reads (but before collapsing duplicates); quality metrics for the raw reads are reported by fastp instead. They may contain adapter sequence and potentially regions with low quality. + ## Change-O ### Assign genes with Igblast diff --git a/modules.json b/modules.json index 28e1d0cf..54043170 100644 --- a/modules.json +++ b/modules.json @@ -9,6 +9,10 @@ "branch": "master", "git_sha": "8022c68e7403eecbd8ba9c49496f69f8c49d50f0" }, + "fastp": { + "branch": "master", + "git_sha": "1e49f31e93c56a3832833eef90a02d3cde5a3f7e" + }, "fastqc": { "branch": "master", "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf new file mode 100644 index 00000000..207258ad --- /dev/null +++ b/modules/nf-core/fastp/main.nf @@ -0,0 +1,103 @@ +process FASTP { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? 'bioconda::fastp=0.23.2' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/fastp:0.23.2--h79da9fb_0' : + 'quay.io/biocontainers/fastp:0.23.2--h79da9fb_0' }" + + input: + tuple val(meta), path(reads) + path adapter_fasta + val save_trimmed_fail + val save_merged + + output: + tuple val(meta), path('*.fastp.fastq.gz') , optional:true, emit: reads + tuple val(meta), path('*.json') , emit: json + tuple val(meta), path('*.html') , emit: html + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + tuple val(meta), path('*.fail.fastq.gz') , optional:true, emit: reads_fail + tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def adapter_list = adapter_fasta ? "--adapter_fasta ${adapter_fasta}" : "" + def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' + // Added soft-links to original fastqs for consistent naming in MultiQC + // Use single ended for interleaved. Add --interleaved_in in config. + if ( task.ext.args?.contains('--interleaved_in') ) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --stdout \\ + --in1 ${prefix}.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log \\ + | gzip -c > ${prefix}.fastp.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else if (meta.single_end) { + """ + [ ! 
-f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --stdout \\ + --in1 ${prefix}.fastq.gz \\ + --out1 ${prefix}.fastp.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else { + def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : '' + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -sf ${reads[0]} ${prefix}_1.fastq.gz + [ ! -f ${prefix}_2.fastq.gz ] && ln -sf ${reads[1]} ${prefix}_2.fastq.gz + fastp \\ + --in1 ${prefix}_1.fastq.gz \\ + --in2 ${prefix}_2.fastq.gz \\ + --out1 ${prefix}_1.fastp.fastq.gz \\ + --out2 ${prefix}_2.fastp.fastq.gz \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $merge_fastq \\ + --thread $task.cpus \\ + --detect_adapter_for_pe \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/fastp/meta.yml b/modules/nf-core/fastp/meta.yml new file mode 100644 index 00000000..6f6fad74 --- /dev/null +++ b/modules/nf-core/fastp/meta.yml @@ -0,0 +1,73 @@ +name: fastp +description: Perform adapter/quality trimming on sequencing reads +keywords: + - trimming + - quality control + - fastq +tools: + - fastp: + description: | + A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance. + documentation: https://github.com/OpenGene/fastp + doi: https://doi.org/10.1093/bioinformatics/bty560 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information. 
Use 'single_end: true' to specify single ended or interleaved FASTQs. Use 'single_end: false' for paired-end reads. + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. If you wish to run interleaved paired-end data, supply as single-end data + but with `--interleaved_in` in your `modules.conf`'s `ext.args` for the module. + - adapter_fasta: + type: file + description: File in FASTA format containing possible adapters to remove. + pattern: "*.{fasta,fna,fas,fa}" + - save_trimmed_fail: + type: boolean + description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz` + - save_merged: + type: boolean + description: Specify true to save all merged reads to the a file ending in `*.merged.fastq.gz` + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified/unmerged fastq reads + pattern: "*fastp.fastq.gz" + - json: + type: file + description: Results in JSON format + pattern: "*.json" + - html: + type: file + description: Results in HTML format + pattern: "*.html" + - log: + type: file + description: fastq log file + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads_fail: + type: file + description: Reads the failed the preprocessing + pattern: "*fail.fastq.gz" + - reads_merged: + type: file + description: Reads that were successfully merged + pattern: "*.{merged.fastq.gz}" +authors: + - "@drpatelh" + - "@kevinmenden" diff --git a/nextflow.config b/nextflow.config index 8af8c2a2..ba10ed7c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -40,6 +40,16 @@ params { umi_length = -1 umi_start = 0 + // trimming options + trim_fastq = true + adapter_fasta = null + clip_r1 = 0 + clip_r2 = 0 + 
three_prime_clip_r1 = 0 + three_prime_clip_r2 = 0 + trim_nextseq = false + save_trimmed = false + // pRESTO options filterseq_q = 20 primer_maxerror = 0.2 diff --git a/nextflow_schema.json b/nextflow_schema.json index 8043aad3..dbfaa6df 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -60,7 +60,8 @@ "description": "Path to fasta file containing the linker sequence, if no V-region primers were used but a linker sequence is present (e.g. 5' RACE SMARTer TAKARA protocol).", "fa_icon": "fas fa-dna" } - } + }, + "fa_icon": "fas fa-flask" }, "databases_cache": { "title": "Databases cache", @@ -167,6 +168,61 @@ "help_text": "The pipeline requires indication of UMI barcode treatment (for identifying unique transcripts). These barcodes are typically read from an index file but sometimes can be provided merged with the start of the R1 or R2 reads. If provided in an additional index file, set the `--index_file` parameter, if provided merged with the R1 or R2 reads, set the `--umi_position` parameter. Specify the UMI barcode length with the `--umi_length` parameter. 
If no UMI barcodes are present, specify `--umi_length = 0` to use the sans-UMI subworkflow.", "fa_icon": "fas fa-barcode" }, + "adapter_trimming": { + "title": "Adapter trimming", + "type": "object", + "description": "Options for adapter trimming and read clipping", + "default": "", + "fa_icon": "fas fa-cut", + "properties": { + "trim_fastq": { + "type": "boolean", + "default": true, + "description": "Whether to trim adapters in fastq reads with fastp.", + "help_text": "By default adapters will be auto-detected, but adapter sequences can also be provided in a `fasta` file with the `--adapter_fasta` option.", + "fa_icon": "fas fa-cut" + }, + "adapter_fasta": { + "type": "string", + "default": null, + "fa_icon": "fas fa-file" + }, + "clip_r1": { + "type": "integer", + "default": 0, + "description": "Number of bases to clip 5' in R1 reads.", + "fa_icon": "fas fa-cut" + }, + "clip_r2": { + "type": "integer", + "default": 0, + "description": "Number of bases to clip 5' in R2 reads.", + "fa_icon": "fas fa-cut" + }, + "three_prime_clip_r1": { + "type": "integer", + "default": 0, + "description": "Number of bases to clip 3' in R1 reads.", + "fa_icon": "fas fa-cut" + }, + "three_prime_clip_r2": { + "type": "integer", + "default": 0, + "description": "Number of bases to clip 3' in R2 reads.", + "fa_icon": "fas fa-cut" + }, + "trim_nextseq": { + "type": "boolean", + "description": "Trim adapters specific for Nextseq sequencing", + "fa_icon": "fas fa-cut" + }, + "save_trimmed": { + "type": "boolean", + "description": "Option to save trimmed reads.", + "fa_icon": "far fa-save" + } + } + }, "presto_options": { "title": "pRESTO options", "type": "object", @@ -258,15 +314,18 @@ "properties": { "skip_report": { "type": "boolean", - "description": "Skip repertoire analysis and report generation" + "description": "Skip repertoire analysis and report generation", + "fa_icon": "fas fa-angle-double-right" }, "skip_lineage": { "type": "boolean", - "description": "Skip clonal lineage 
analysis and lineage tree plotting." + "description": "Skip clonal lineage analysis and lineage tree plotting.", + "fa_icon": "fas fa-angle-double-right" }, "skip_multiqc": { "type": "boolean", - "description": "Skip multiqc report" + "description": "Skip multiqc report", + "fa_icon": "fas fa-angle-double-right" } }, "help_text": "Downstream analyses include a series of R scripts based on the Immcantation Alakazam, ChangeO and Shazam packages to calculate:\n- Clonal abundance and diversity\n- Clonal lineage tree export in graphML\n- Clonal overlap and statistics\n- Mutational load\n- Isotype and V-family distribution", @@ -585,6 +644,9 @@ { "$ref": "#/definitions/umi_barcode_handling" }, + { + "$ref": "#/definitions/adapter_trimming" + }, { "$ref": "#/definitions/presto_options" }, diff --git a/subworkflows/local/presto_sans_umi.nf b/subworkflows/local/presto_sans_umi.nf index 3d310ab3..59a2db57 100644 --- a/subworkflows/local/presto_sans_umi.nf +++ b/subworkflows/local/presto_sans_umi.nf @@ -1,16 +1,17 @@ // Include statements -include { GUNZIP as GUNZIP_SANS_UMI } from '../../modules/local/gunzip' +include { GUNZIP as GUNZIP_SANS_UMI } from '../../modules/local/gunzip' include { FASTQC_POSTASSEMBLY as FASTQC_POSTASSEMBLY_SANS_UMI } from '../../modules/local/fastqc_postassembly' +include { FASTP } from '../../modules/nf-core/fastp/main' //PRESTO -include { PRESTO_ASSEMBLEPAIRS as PRESTO_ASSEMBLEPAIRS_SANS_UMI } from '../../modules/local/presto/presto_assemblepairs' +include { PRESTO_ASSEMBLEPAIRS as PRESTO_ASSEMBLEPAIRS_SANS_UMI } from '../../modules/local/presto/presto_assemblepairs' include { PRESTO_FILTERSEQ_POSTASSEMBLY as PRESTO_FILTERSEQ_POSTASSEMBLY_SANS_UMI } from '../../modules/local/presto/presto_filterseq_postassembly' include { PRESTO_MASKPRIMERS_POSTASSEMBLY as PRESTO_MASKPRIMERS_POSTASSEMBLY_SANS_UMI } from '../../modules/local/presto/presto_maskprimers_postassembly' -include { PRESTO_PARSEHEADERS_PRIMERS as PRESTO_PARSEHEADERS_PRIMERS_SANS_UMI } 
from '../../modules/local/presto/presto_parseheaders_primers' -include { PRESTO_PARSEHEADERS_METADATA as PRESTO_PARSEHEADERS_METADATA_SANS_UMI } from '../../modules/local/presto/presto_parseheaders_metadata' -include { PRESTO_COLLAPSESEQ as PRESTO_COLLAPSESEQ_SANS_UMI } from '../../modules/local/presto/presto_collapseseq' -include { PRESTO_SPLITSEQ as PRESTO_SPLITSEQ_SANS_UMI} from '../../modules/local/presto/presto_splitseq' +include { PRESTO_PARSEHEADERS_PRIMERS as PRESTO_PARSEHEADERS_PRIMERS_SANS_UMI } from '../../modules/local/presto/presto_parseheaders_primers' +include { PRESTO_PARSEHEADERS_METADATA as PRESTO_PARSEHEADERS_METADATA_SANS_UMI } from '../../modules/local/presto/presto_parseheaders_metadata' +include { PRESTO_COLLAPSESEQ as PRESTO_COLLAPSESEQ_SANS_UMI } from '../../modules/local/presto/presto_collapseseq' +include { PRESTO_SPLITSEQ as PRESTO_SPLITSEQ_SANS_UMI} from '../../modules/local/presto/presto_splitseq' workflow PRESTO_SANS_UMI { @@ -18,11 +19,23 @@ workflow PRESTO_SANS_UMI { ch_reads // channel: [ val(meta), [ reads ] ] ch_cprimers // channel: [ cprimers.fasta ] ch_vprimers // channel: [ vprimers.fasta ] + ch_adapter_fasta // channel: [ adapters.fasta ] main: ch_versions = Channel.empty() - ch_gunzip = ch_reads + + // Fastp + save_merged = false + FASTP ( + ch_reads, + ch_adapter_fasta, + params.save_trimmed, + save_merged + ) + ch_versions = ch_versions.mix(FASTP.out.versions.ifEmpty([])) + + ch_gunzip = FASTP.out.reads.map{ meta,reads -> [meta, reads[0], reads[1]] } // gunzip fastq.gz to fastq GUNZIP_SANS_UMI ( ch_gunzip ) @@ -81,6 +94,8 @@ workflow PRESTO_SANS_UMI { emit: fasta = PRESTO_SPLITSEQ_SANS_UMI.out.fasta software = ch_versions + fastp_reads_json = FASTP.out.json.collect{ meta,json -> json } + fastp_reads_html = FASTP.out.html.collect{ meta,html -> html } fastqc_postassembly_gz = FASTQC_POSTASSEMBLY_SANS_UMI.out.zip presto_assemblepairs_logs = PRESTO_ASSEMBLEPAIRS_SANS_UMI.out.logs.collect() presto_filterseq_logs = 
PRESTO_FILTERSEQ_POSTASSEMBLY_SANS_UMI.out.logs diff --git a/subworkflows/local/presto_umi.nf b/subworkflows/local/presto_umi.nf index 55a1cc39..5bcd972d 100644 --- a/subworkflows/local/presto_umi.nf +++ b/subworkflows/local/presto_umi.nf @@ -1,8 +1,10 @@ // Include statements -include { MERGE_UMI } from '../../modules/local/merge_UMI' -include { RENAME_FASTQ as RENAME_FASTQ_UMI } from '../../modules/local/rename_fastq' -include { GUNZIP as GUNZIP_UMI } from '../../modules/local/gunzip' +include { MERGE_UMI } from '../../modules/local/merge_UMI' +include { RENAME_FASTQ as RENAME_FASTQ_UMI } from '../../modules/local/rename_fastq' +include { GUNZIP as GUNZIP_UMI } from '../../modules/local/gunzip' include { FASTQC_POSTASSEMBLY as FASTQC_POSTASSEMBLY_UMI } from '../../modules/local/fastqc_postassembly' +include { FASTP } from '../../modules/nf-core/fastp/main' + //PRESTO include { PRESTO_FILTERSEQ as PRESTO_FILTERSEQ_UMI } from '../../modules/local/presto/presto_filterseq' @@ -25,18 +27,64 @@ workflow PRESTO_UMI { ch_reads // channel: [ val(meta), [ reads ] ] ch_cprimers // channel: [ cprimers.fasta ] ch_vprimers // channel: [ vprimers.fasta ] + ch_adapter_fasta // channel: [ adapters.fasta ] main: ch_versions = Channel.empty() + + // prepare reads for fastp + ch_reads.dump(tag:'presto umi reads') + // Merge UMI from index file to R1 if provided if (params.index_file) { - MERGE_UMI ( ch_reads ) + + // ch for fastp reads R1 R2 + ch_reads.map{ meta, reads -> [meta, [reads[0], reads[1]]] } + .dump(tag: 'presto_umi_R1_R2_reads') + .set{ ch_reads_R1_R2 } + + // Fastp reads R1 R2 + save_merged = false + FASTP ( + ch_reads_R1_R2, + ch_adapter_fasta, + params.save_trimmed, + save_merged + ) + ch_versions = ch_versions.mix(FASTP.out.versions.ifEmpty([])) + + //ch for merge umi + ch_meta_R1_R2 = FASTP.out.reads + .map{ meta, reads -> [meta.id, meta, reads[0], reads[1]] } + ch_meta_index = ch_reads + .map{ meta, reads -> [meta.id, meta, reads[2]] } + ch_meta_R1_R2_index = 
ch_meta_R1_R2.join( ch_meta_index ) + .map{ id, meta1, R1, R2, meta2, index -> [ meta1, R1, R2, index ] } + .dump(tag: 'ch_merge_umi') + + MERGE_UMI ( ch_meta_R1_R2_index ) ch_gunzip = MERGE_UMI.out.reads ch_versions = ch_versions.mix(MERGE_UMI.out.versions.ifEmpty(null)) + + } else { - RENAME_FASTQ_UMI ( ch_reads ) + + // Fastp reads + save_merged = false + FASTP ( + ch_reads, + ch_adapter_fasta, + params.save_trimmed, + save_merged + ) + ch_versions = ch_versions.mix(FASTP.out.versions.ifEmpty([])) + + ch_rename_fastq_umi = FASTP.out.reads.map{ meta,reads -> [meta, reads[0], reads[1]] } + + RENAME_FASTQ_UMI ( ch_rename_fastq_umi ) ch_gunzip = RENAME_FASTQ_UMI.out.reads + } // gunzip fastq.gz to fastq @@ -139,6 +187,8 @@ workflow PRESTO_UMI { emit: fasta = PRESTO_SPLITSEQ_UMI.out.fasta software = ch_versions + fastp_reads_json = FASTP.out.json.collect{ meta,json -> json } + fastp_reads_html = FASTP.out.html.collect{ meta,html -> html } fastqc_postassembly_gz = FASTQC_POSTASSEMBLY_UMI.out.zip presto_filterseq_logs = PRESTO_FILTERSEQ_UMI.out.logs presto_maskprimers_logs = PRESTO_MASKPRIMERS_UMI.out.logs.collect() diff --git a/workflows/bcellmagic.nf b/workflows/bcellmagic.nf index 1427f81f..caefe289 100644 --- a/workflows/bcellmagic.nf +++ b/workflows/bcellmagic.nf @@ -15,12 +15,15 @@ def checkPathParamList = [ params.input, params.multiqc_config ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters -if (params.input) { ch_input = Channel.fromPath(params.input) } else { exit 1, "Please provide input file containing the sample metadata with the '--input' option." } +if (params.input) { ch_input = Channel.fromPath(params.input, checkIfExists: true) } else { exit 1, "Please provide input file containing the sample metadata with the '--input' option." } if (!params.library_generation_method) { exit 1, "Please specify a library generation method with the `--library_generation_method` option." 
} +// Check other params: collect() makes the adapter FASTA a value channel, so FASTP runs for every sample rather than only the first +if (params.adapter_fasta) { ch_adapter_fasta = Channel.fromPath(params.adapter_fasta, checkIfExists: true).collect() } else { ch_adapter_fasta = [] } + // Validate library generation method parameter if (params.library_generation_method == 'specific_pcr_umi'){ if (params.vprimers) { @@ -161,7 +164,6 @@ include { PRESTO_SANS_UMI } from '../subworkflows/local/presto_sans_u // // MODULE: Installed directly from nf-core/modules // -include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' @@ -183,35 +185,29 @@ workflow BCELLMAGIC { // INPUT_CHECK ( ch_input ) - ch_fastqc = INPUT_CHECK + INPUT_CHECK.out.reads.dump(tag: 'input reads') + + ch_reads = INPUT_CHECK .out .reads - .groupTuple(by: [0]) - .map{ it -> [ it[0], it[1].flatten() ] } - - ch_presto = ch_fastqc.map{ it -> it.flatten() } + .dump(tag: 'input reads') ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) - // - // MODULE: FastQC - // - FASTQC ( ch_fastqc ) - - // Channel for software versions - ch_versions = ch_versions.mix(FASTQC.out.versions.ifEmpty(null)) - if (params.umi_length == 0) { // // SUBWORKFLOW: pRESTO without UMIs // PRESTO_SANS_UMI ( - ch_presto, + ch_reads, ch_cprimers_fasta, - ch_vprimers_fasta + ch_vprimers_fasta, + ch_adapter_fasta ) ch_presto_fasta = PRESTO_SANS_UMI.out.fasta ch_presto_software = PRESTO_SANS_UMI.out.software + ch_fastp_reads_html = PRESTO_SANS_UMI.out.fastp_reads_html + ch_fastp_reads_json = PRESTO_SANS_UMI.out.fastp_reads_json ch_fastqc_postassembly_gz = PRESTO_SANS_UMI.out.fastqc_postassembly_gz ch_presto_assemblepairs_logs = PRESTO_SANS_UMI.out.presto_assemblepairs_logs ch_presto_filterseq_logs = PRESTO_SANS_UMI.out.presto_filterseq_logs @@ -228,12 +224,15 @@ workflow BCELLMAGIC 
{ // SUBWORKFLOW: pRESTO with UMIs // PRESTO_UMI ( - ch_presto, + ch_reads, ch_cprimers_fasta, - ch_vprimers_fasta + 
ch_vprimers_fasta, + ch_adapter_fasta ) ch_presto_fasta = PRESTO_UMI.out.fasta ch_presto_software = PRESTO_UMI.out.software + ch_fastp_reads_html = PRESTO_UMI.out.fastp_reads_html + ch_fastp_reads_json = PRESTO_UMI.out.fastp_reads_json ch_fastqc_postassembly_gz = PRESTO_UMI.out.fastqc_postassembly_gz ch_presto_filterseq_logs = PRESTO_UMI.out.presto_filterseq_logs ch_presto_maskprimers_logs = PRESTO_UMI.out.presto_maskprimers_logs @@ -412,7 +411,8 @@ workflow BCELLMAGIC { ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml') ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_fastp_reads_json.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_fastp_reads_html.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_fastqc_postassembly_gz.collect{it[1]}.ifEmpty([])) MULTIQC (