diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3f201885..4d37a278 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,6 +29,7 @@ jobs: ANALYSIS: - "test_screening" - "test_targeted" + - "test_umis" steps: - name: Check out pipeline code uses: actions/checkout@v3 @@ -40,4 +41,4 @@ jobs: - name: Run pipeline with test data (${{ matrix.ANALYSIS }}) run: | - nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.ANALYSIS }},docker --outdir ./results_targeted + nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.ANALYSIS }},docker --outdir ./results_${{ matrix.ANALYSIS }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 09385ad7..ccb1efd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Add new parameters --reference and --protospacer ([#45](https://github.com/nf-core/crisprseq/pull/45)) +- Add UMI clustering to crisprseq-targeted ([#24](https://github.com/nf-core/crisprseq/pull/24)) ### Fixed diff --git a/README.md b/README.md index 9cd521aa..86ecafe6 100644 --- a/README.md +++ b/README.md @@ -38,11 +38,19 @@ For crispr targeted: 2. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) 3. Adapter trimming ([`Cutadapt`](http://dx.doi.org/10.14806/ej.17.1.200)) 4. Quality filtering ([`Seqtk`](https://github.com/lh3/seqtk)) -5. Read mapping: +5. UMI clustering (optional): + a. Extract UMI sequences (Python script) + b. Cluster UMI sequences ([`Vsearch`](https://github.com/torognes/vsearch)) + c. Obtain the most abundant UMI sequence for each cluster ([`Vsearch`](https://github.com/torognes/vsearch)) + d. Obtain a consensus for each cluster ([`minimap2`](https://github.com/lh3/minimap2)) + e. Polish consensus sequence ([`racon`](https://github.com/lbcb-sci/racon)) + f. Repeat a second rand of consensus + polishing (`minimap2` + `racon`) + g. Obtain the final consensus of each cluster ([Medaka](https://nanoporetech.github.io/medaka/index.html)) +6. Read mapping: - ([`minimap2`](https://github.com/lh3/minimap2), _default_) - ([`bwa`](http://bio-bwa.sourceforge.net/)) - ([`bowtie2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)) -6. CIGAR parsing for edit calling ([`R`](https://www.r-project.org/)) +7. CIGAR parsing for edit calling ([`R`](https://www.r-project.org/)) For crispr screening: @@ -62,7 +70,9 @@ For crispr screening: 3. Download the pipeline and test it on a minimal dataset with a single command: ```bash - nextflow run nf-core/crisprseq -profile test,YOURPROFILE --outdir + nextflow run nf-core/crisprseq -profile test_screening,YOURPROFILE --outdir + # or + nextflow run nf-core/crisprseq -profile test_targeted,YOURPROFILE --outdir ``` Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string. diff --git a/bin/extract_umis.py b/bin/extract_umis.py index 78802c9f..e4792d0f 100755 --- a/bin/extract_umis.py +++ b/bin/extract_umis.py @@ -1,6 +1,10 @@ #!/usr/bin/env python3 -# FROM: pipeline-umi-amplicon distributed by ONT https://github.com/nanoporetech/pipeline-umi-amplicon -# https://github.com/nanoporetech/pipeline-umi-amplicon/blob/69ec3907879aea406b5fb02e3db83b579bfb8b45/lib/umi_amplicon_tools/extract_umis.py +# +# This code is obtained from Oxford Nanopore Technologies pipeline-umi-amplicon +# Distributed under the Mozilla Public License, v. 2.0 +# +# Original source code: https://github.com/nanoporetech/pipeline-umi-amplicon/blob/69ec3907879aea406b5fb02e3db83b579bfb8b45/lib/umi_amplicon_tools/extract_umis.py +# import argparse import logging diff --git a/conf/modules.config b/conf/modules.config index ad961c41..47155bcb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -127,7 +127,7 @@ process { ] } - withName: SEQTK_SEQ { + withName: SEQTK_SEQ_MASK { ext.args = '-q 20 -L 80 -n N' publishDir = [ path: { "${params.outdir}/preprocessing/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, @@ -140,6 +140,18 @@ process { ext.args = '--max-error 3 --adapter-length 250 --fwd-context ""' } + withName: VSEARCH_CLUSTER { + ext.args = "--minseqlength ${params.vsearch_minseqlength} --maxseqlength ${params.vsearch_maxseqlength} --qmask none --clusterout_sort --gapopen 0E/5I --gapext 0E/2I --mismatch -8 --match 6 --iddef 0 --minwordmatches 0 --qmask none -id ${params.vsearch_id}" + ext.args2 = '--cluster_fast' + ext.args3 = '--clusters' + } + + + withName: VSEARCH_SORT { + ext.args = '--topn 1' + ext.prefix = { "${fasta.baseName}_top" } + } + withName: MERGING_SUMMARY { publishDir = [ path: { "${params.outdir}/summary/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, @@ -148,7 +160,43 @@ process { ] } - withName: DUMMY_FINAL_UMI { + withName: MINIMAP2_ALIGN_UMI_1 { + ext.args = '-x map-ont' + ext.prefix = { "${reads.baseName}_cycle1" } + publishDir = [ + path: { "${params.outdir}/minimap2_umi" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: MINIMAP2_ALIGN_UMI_2 { + ext.args = '-x map-ont' + ext.prefix = { "${reads.baseName}_cycle2" } + publishDir = [ + path: { "${params.outdir}/minimap2_umi" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: RACON_1 { + ext.args = '-t 4 -m 8 -x -6 -g -8 -w 500 --no-trimming' + ext.prefix = { "${reads.baseName}_cycle1" } + } + + withName: RACON_2 { + ext.args = '-t 4 -m 8 -x -6 -g -8 -w 500 --no-trimming' + ext.prefix = { "${reads.baseName}_cycle2" } + } + + withName: MEDAKA { + ext.args = '-m r941_min_high_g303' + ext.prefix = { "${reads.baseName}_medakaConsensus" } + } + + withName: SEQTK_SEQ_FATOFQ { + ext.args = '-F "#"' publishDir = [ path: { "${params.outdir}/preprocessing/UMI" }, mode: params.publish_dir_mode, diff --git a/conf/test_umis.config b/conf/test_umis.config new file mode 100644 index 00000000..6efc80d8 --- /dev/null +++ b/conf/test_umis.config @@ -0,0 +1,29 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests (with UMIs option) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/crisprseq -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile UMIs' + config_profile_description = 'Minimal test dataset to check pipeline function with UMIs option' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata-edition/samplesheet_test_umis.csv' + analysis = 'targeted' + umi_clustering = true + + // Aligner + aligner = 'minimap2' +} diff --git a/docs/output/targeted.md b/docs/output/targeted.md index 16ffaf8a..b04ca3d8 100644 --- a/docs/output/targeted.md +++ b/docs/output/targeted.md @@ -15,14 +15,18 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: - [Preprocessing](#preprocessing) - - [Sequences](#sequences) - Input sequence preparation (reference, protospacer, template) + - [sequences](#sequences) - Input sequence preparation (reference, protospacer, template) - [cat](#cat) - Concatenate sample fastq files if required - - [Pear](#pear) - Join double-end reads if required - - [FastQC](#fastqc) - Read Quality Control - - [Adapters](#adapters) - Find adapters (Overrepresented sequences) in reads - - [Cutadapt](#cutadapt) - Trim adapters - - [Seqtk](#seqtk) - Mask low-quality bases - + - [pear](#pear) - Join double-end reads if required + - [fastqc](#fastqc) - Read Quality Control + - [adapters](#adapters) - Find adapters (Overrepresented sequences) in reads + - [cutadapt](#cutadapt) - Trim adapters + - [seqtk](#seqtk) - Mask low-quality bases +- [UMI clustering](#umi-clustering) + - [vsearch](#vsearch) + - [minimap2 umi](#minimap2-umi) + - [racon](#racon) + - [medaka](#medaka) - [Mapping](#mapping) - [minimap2](#minimap2) - Mapping reads to reference - [BWA](#bwa) - Mapping reads to reference @@ -143,7 +147,56 @@ The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They m [Seqtk](https://github.com/lh3/seqtk) masks (converts to Ns) bases with quality lower than 20 and removes sequences shorter than 80 bases. - +## UMI clustering + +### vsearch + +
+Output files + +- `vsearch/` + - `*_clusters*`: Contains all UMI sequences which clustered together. + - `*_clusters*_top.fasta`: Contains the most abundant UMI sequence from the cluster. + +
+ +[VSEARCH](https://github.com/torognes/vsearch) is a versatile open-source tool which includes chimera detection, clustering, dereplication and rereplication, extraction, FASTA/FASTQ/SFF file processing, masking, orienting, pair-wise alignment, restriction site cutting, searching, shuffling, sorting, subsampling, and taxonomic classification of amplicon sequences for metagenomics, genomics, and population genetics. `vsearch/clsuter` can cluster sequences using a single-pass, greedy centroid-based clustering algorithm. `vsearch/sort` can sort fasta entries by decreasing abundance (`--sortbysize`) or sequence length (`--sortbylength`). + +### minimap2_umi + +
+Output files + +- `minimap2_umi/` + - `*_sequences_clycle[1,2].paf`: Alignment of the cluster sequences against the top UMi sequence in paf format. + +
+ +[Minimap2](https://github.com/lh3/minimap2) is a sequence alignment program that aligns DNA sequences against a reference database. + +### racon + +
+Output files + +- `racon/` + - `*_sequences_clycle[1,2]_assembly_consensus.fasta.gz`: Consensus sequence obtained from the cluster multiple sequence alignment. + +
+ +[Racon](https://github.com/lbcb-sci/racon) is an ultrafast consensus module for raw de novo genome assembly of long uncorrected reads. + +### medaka + +
+Output files + +- `medaka/` + - `*_medakaConsensus.fasta`: Final consensus sequence of each UMI cluster. Obtained after two rounds of minimap2 + racon. + +
+ +[Medaka](https://nanoporetech.github.io/medaka/index.html) is a tool to create consensus sequences and variant calls from nanopore sequencing data. ## Mapping diff --git a/docs/usage/targeted.md b/docs/usage/targeted.md index 8f415d0b..51e7a60f 100644 --- a/docs/usage/targeted.md +++ b/docs/usage/targeted.md @@ -55,6 +55,26 @@ chr6,chr6-61942198-61942498_R1.fastq.gz,,CAA...GGA,TTTTATGATATTTATCTTTT,TTC...CA An [example samplesheet](https://nf-co.re/crisprseq/1.0/assets/samplesheet.csv) has been provided with the pipeline. +## Optional pipeline steps + +### Trimming of overrepresented sequences + +To trim the overrepresented sequences found with FastQC from the reads, use the parameter `--overrepresented`. +Such sequences are not trimmed by default. +When using the `--overrepresented` parameter, Cutadapt is used to trim overrepresented sequences from the input FASTQ files. + +### UMI clustering + +If the provided samples were sequenced using umi-molecular identifiers (UMIs), use the parameter `--umi_clustering` in order to run the clustering steps. + +1. Extract UMI sequences (Python script) +2. Cluster UMI sequences ([`Vsearch`](https://github.com/torognes/vsearch)) +3. Obtain the most abundant UMI sequence for each cluster ([`Vsearch`](https://github.com/torognes/vsearch)) +4. Obtain a consensus for each cluster ([`minimap2`](https://github.com/lh3/minimap2)) +5. Polish consensus sequence ([`racon`](https://github.com/lbcb-sci/racon)) +6. Repeat a second rand of consensus + polishing (`minimap2` + `racon`) +7. Obtain the final consensus of each cluster ([Medaka](https://nanoporetech.github.io/medaka/index.html)) + ## Other input parameters ### Reference diff --git a/modules.json b/modules.json index 8af5a345..94b62c4a 100644 --- a/modules.json +++ b/modules.json @@ -52,6 +52,12 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, + "medaka": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"], + "patch": "modules/nf-core/medaka/medaka.diff" + }, "mageck/count": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", @@ -76,6 +82,11 @@ "installed_by": ["modules"], "patch": "modules/nf-core/minimap2/align/minimap2-align.diff" }, + "minimap2/index": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "ee80d14721e76e2e079103b8dcd5d57129e584ba", @@ -86,6 +97,16 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, + "racon": { + "branch": "master", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "installed_by": ["modules"] + }, + "samtools/faidx": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "samtools/index": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", @@ -95,6 +116,17 @@ "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] + }, + "vsearch/cluster": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"], + "patch": "modules/nf-core/vsearch/cluster/vsearch-cluster.diff" + }, + "vsearch/sort": { + "branch": "master", + "git_sha": "e7801603532df26b4bb4ef324ca2c39f7a4d0eee", + "installed_by": ["modules"] } } }, diff --git a/modules/local/dummy_final_umi.nf b/modules/local/dummy_final_umi.nf deleted file mode 100644 index 64069410..00000000 --- a/modules/local/dummy_final_umi.nf +++ /dev/null @@ -1,55 +0,0 @@ -process DUMMY_FINAL_UMI { - tag "$meta.id" - label 'process_single' - - conda "conda-forge::p7zip==16.02" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/p7zip:16.02' : - 'quay.io/biocontainers/p7zip:16.02' }" - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*_clustered.fastq.gz"), emit: dummy - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}_clustered.fastq - - 7za \\ - a \\ - $args \\ - ${prefix}_clustered.fastq.gz \\ - ${prefix}_clustered.fastq - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - 7za: \$(echo \$(7za --help) | sed 's/.*p7zip Version //; s/(.*//') - END_VERSIONS - """ -} - - - -process summary_reads{ - - input: - set sampleID, file(summary), file(finalReads) from summaryReport.join(readsSummary) - - output: - set val(sampleID), file(summary) into summaryReportReads - - script: - """ - final_count=`expr \$(cat $finalReads | wc -l) / 4` - echo "clustered-reads," \$final_count >> ${sampleID}_summary.csv - """ - - } diff --git a/modules/local/extract_umis.nf b/modules/local/extract_umis.nf index 4f9be2a3..d6c8fb44 100644 --- a/modules/local/extract_umis.nf +++ b/modules/local/extract_umis.nf @@ -11,9 +11,9 @@ process EXTRACT_UMIS { tuple val(meta), path(reads) output: - tuple val(meta), path(fasta) , emit: fasta - tuple val(meta), path(tsv) , emit: tsv - path "versions.yml" , emit: versions + tuple val(meta), path("*.fasta") , emit: fasta + tuple val(meta), path("*.tsv") , emit: tsv + path "versions.yml" , emit: versions script: def args = task.ext.args ?: '' diff --git a/modules/nf-core/medaka/main.nf b/modules/nf-core/medaka/main.nf new file mode 100644 index 00000000..80e4c065 --- /dev/null +++ b/modules/nf-core/medaka/main.nf @@ -0,0 +1,51 @@ +process MEDAKA { + tag "$meta.id" + label 'process_high' + + conda "bioconda::medaka=1.4.4" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/medaka:1.4.4--py38h130def0_0' : + 'biocontainers/medaka:1.4.4--py38h130def0_0' }" + + input: + tuple val(meta), path(reads), path(assembly) + + output: + tuple val(meta), path("*_medakaConsensus.fasta"), emit: assembly + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + if [[ "${reads.extension}" == "gz" ]]; then + gzip -df $reads + reads=$reads.baseName + else + reads=$reads + fi + if [[ "${assembly.extension}" == "gz" ]]; then + gzip -df $assembly + assembly=$assembly.baseName + else + assembly=$assembly + fi + + medaka_consensus \\ + -t $task.cpus \\ + $args \\ + -i \$reads \\ + -d \$assembly \\ + -o ./ + + mv consensus.fasta ${prefix}.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + medaka: \$( medaka --version 2>&1 | sed 's/medaka //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/medaka/medaka.diff b/modules/nf-core/medaka/medaka.diff new file mode 100644 index 00000000..7f8752ac --- /dev/null +++ b/modules/nf-core/medaka/medaka.diff @@ -0,0 +1,49 @@ +Changes in module 'nf-core/medaka' +--- modules/nf-core/medaka/main.nf ++++ modules/nf-core/medaka/main.nf +@@ -11,8 +11,8 @@ + tuple val(meta), path(reads), path(assembly) + + output: +- tuple val(meta), path("*.fa.gz"), emit: assembly +- path "versions.yml" , emit: versions ++ tuple val(meta), path("*_medakaConsensus.fasta"), emit: assembly ++ path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when +@@ -21,16 +21,27 @@ + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ ++ if [[ "${reads.extension}" == "gz" ]]; then ++ gzip -df $reads ++ reads=$reads.baseName ++ else ++ reads=$reads ++ fi ++ if [[ "${assembly.extension}" == "gz" ]]; then ++ gzip -df $assembly ++ assembly=$assembly.baseName ++ else ++ assembly=$assembly ++ fi ++ + medaka_consensus \\ + -t $task.cpus \\ + $args \\ +- -i $reads \\ +- -d $assembly \\ ++ -i \$reads \\ ++ -d \$assembly \\ + -o ./ + +- mv consensus.fasta ${prefix}.fa +- +- gzip -n ${prefix}.fa ++ mv consensus.fasta ${prefix}.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +************************************************************ diff --git a/modules/nf-core/medaka/meta.yml b/modules/nf-core/medaka/meta.yml new file mode 100644 index 00000000..ed124d61 --- /dev/null +++ b/modules/nf-core/medaka/meta.yml @@ -0,0 +1,47 @@ +name: medaka +description: A tool to create consensus sequences and variant calls from nanopore sequencing data +keywords: + - assembly + - polishing + - nanopore +tools: + - medaka: + description: Neural network sequence error correction. + homepage: https://nanoporetech.github.io/medaka/index.html + documentation: https://nanoporetech.github.io/medaka/index.html + tool_dev_url: https://github.com/nanoporetech/medaka + + licence: ["Mozilla Public License 2.0"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: List of input nanopore fasta/FastQ files + pattern: "*.{fasta,fa,fastq,fastq.gz,fq,fq.gz}" + - assembly: + type: file + description: Genome assembly + pattern: "*.{fasta,fa}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - assembly: + type: file + description: Polished genome assembly + pattern: "*.fa.gz" + +authors: + - "@avantonder" diff --git a/modules/nf-core/minimap2/index/main.nf b/modules/nf-core/minimap2/index/main.nf new file mode 100644 index 00000000..7a1bb227 --- /dev/null +++ b/modules/nf-core/minimap2/index/main.nf @@ -0,0 +1,34 @@ +process MINIMAP2_INDEX { + label 'process_medium' + + // Note: the versions here need to match the versions used in minimap2/align + conda "bioconda::minimap2=2.24" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/minimap2:2.24--h7132678_1' : + 'biocontainers/minimap2:2.24--h7132678_1' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("*.mmi"), emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + minimap2 \\ + -t $task.cpus \\ + -d ${fasta.baseName}.mmi \\ + $args \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/minimap2/index/meta.yml b/modules/nf-core/minimap2/index/meta.yml new file mode 100644 index 00000000..b58f35c6 --- /dev/null +++ b/modules/nf-core/minimap2/index/meta.yml @@ -0,0 +1,40 @@ +name: minimap2_index +description: Provides fasta index required by minimap2 alignment. +keywords: + - index + - fasta + - reference +tools: + - minimap2: + description: | + A versatile pairwise aligner for genomic and spliced nucleotide sequences. + homepage: https://github.com/lh3/minimap2 + documentation: https://github.com/lh3/minimap2#uguide + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: | + Reference database in FASTA format. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - index: + type: file + description: Minimap2 fasta index. + pattern: "*.mmi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@yuukiiwa" + - "@drpatelh" diff --git a/modules/nf-core/racon/main.nf b/modules/nf-core/racon/main.nf new file mode 100644 index 00000000..f8ed3691 --- /dev/null +++ b/modules/nf-core/racon/main.nf @@ -0,0 +1,38 @@ +process RACON { + tag "$meta.id" + label 'process_high' + + conda "bioconda::racon=1.4.20" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/racon:1.4.20--h9a82719_1' : + 'quay.io/biocontainers/racon:1.4.20--h9a82719_1' }" + + input: + tuple val(meta), path(reads), path(assembly), path(paf) + + output: + tuple val(meta), path('*_assembly_consensus.fasta.gz') , emit: improved_assembly + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + racon -t "$task.cpus" \\ + "${reads}" \\ + "${paf}" \\ + $args \\ + "${assembly}" > \\ + ${prefix}_assembly_consensus.fasta + + gzip -n ${prefix}_assembly_consensus.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + racon: \$( racon --version 2>&1 | sed 's/^.*v//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/racon/meta.yml b/modules/nf-core/racon/meta.yml new file mode 100644 index 00000000..2e7737d9 --- /dev/null +++ b/modules/nf-core/racon/meta.yml @@ -0,0 +1,52 @@ +name: racon +description: Consensus module for raw de novo DNA assembly of long uncorrected reads +keywords: + - assembly + - pacbio + - nanopore + - polish +tools: + - racon: + description: Ultrafast consensus module for raw de novo genome assembly of long uncorrected reads. + homepage: https://github.com/lbcb-sci/racon + documentation: https://github.com/lbcb-sci/racon + tool_dev_url: https://github.com/lbcb-sci/racon + doi: 10.1101/gr.214270.116 + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: List of input FastQ files. Racon expects single end reads + pattern: "*.{fastq,fastq.gz,fq,fq.gz}" + - assembly: + type: file + description: Genome assembly to be improved + pattern: "*.{fasta,fa}" + - paf: + type: file + description: Alignment in PAF format + pattern: "*.paf" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - improved_assembly: + type: file + description: Improved genome assembly + pattern: "*_assembly_consensus.fasta.gz" + +authors: + - "@avantonder" diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf new file mode 100644 index 00000000..4dd0e5b0 --- /dev/null +++ b/modules/nf-core/samtools/faidx/main.nf @@ -0,0 +1,44 @@ +process SAMTOOLS_FAIDX { + tag "$fasta" + label 'process_single' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path ("*.fai"), emit: fai + tuple val(meta), path ("*.gzi"), emit: gzi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + faidx \\ + $args \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${fasta}.fai + cat <<-END_VERSIONS > versions.yml + + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml new file mode 100644 index 00000000..fe2fe9a1 --- /dev/null +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -0,0 +1,47 @@ +name: samtools_faidx +description: Index FASTA file +keywords: + - index + - fasta +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: FASTA file + pattern: "*.{fa,fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" + - gzi: + type: file + description: Optional gzip index file for compressed inputs + pattern: "*.gzi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@phue" diff --git a/modules/nf-core/vsearch/cluster/main.nf b/modules/nf-core/vsearch/cluster/main.nf new file mode 100644 index 00000000..413f70c9 --- /dev/null +++ b/modules/nf-core/vsearch/cluster/main.nf @@ -0,0 +1,66 @@ +process VSEARCH_CLUSTER { + tag "$meta.id" + label 'process_low' + + conda "bioconda::vsearch=2.21.1 bioconda::samtools=1.16.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-53dae514294fca7b44842b784ed85a5303ac2d80:7b3365d778c690ca79bc85aaaeb86bb39a2dec69-0': + 'quay.io/biocontainers/mulled-v2-53dae514294fca7b44842b784ed85a5303ac2d80:7b3365d778c690ca79bc85aaaeb86bb39a2dec69-0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path('*.aln.gz') , optional: true, emit: aln + tuple val(meta), path('*.biom.gz') , optional: true, emit: biom + tuple val(meta), path('*.mothur.tsv.gz') , optional: true, emit: mothur + tuple val(meta), path('*.otu.tsv.gz') , optional: true, emit: otu + tuple val(meta), path('*.bam') , optional: true, emit: bam + tuple val(meta), path('*.out.tsv.gz') , optional: true, emit: out + tuple val(meta), path('*.blast.tsv.gz') , optional: true, emit: blast + tuple val(meta), path('*.uc.tsv.gz') , optional: true, emit: uc + tuple val(meta), path('*.centroids.fasta.gz') , optional: true, emit: centroids + tuple val(meta), path('*_clusters*') , optional: true, emit: clusters + tuple val(meta), path('*.profile.txt.gz') , optional: true, emit: profile + tuple val(meta), path('*.msa.fasta.gz') , optional: true, emit: msa + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if (!args2.contains("--cluster_fast") && !args2.contains("--cluster_size") && !args2.contains("--cluster_smallmem") && !args2.contains("--cluster_unoise") ) { + error "Unknown clustering option provided (${args2})" + } + def out_ext = args3.contains("--alnout") ? ".aln" : + args3.contains("--biomout") ? ".biom" : + args3.contains("--blast6out") ? ".blast.tsv" : + args3.contains("--centroids") ? ".centroids.fasta" : + args3.contains("--clusters") ? "_clusters" : + args3.contains("--mothur_shared_out") ? ".mothur.tsv" : + args3.contains("--msaout") ? ".msa.fasta" : + args3.contains("--otutabout") ? ".otu.tsv" : + args3.contains("--profile") ? ".profile.txt" : + args3.contains("--samout") ? ".sam" : + args3.contains("--uc") ? "uc.tsv" : + args3.contains("--userout") ? "out.tsv" : + "" + if (out_ext == "") { error "Unknown output file format provided (${args3})" } + """ + vsearch \\ + $args2 $fasta \\ + $args3 ${prefix}${out_ext} \\ + --threads $task.cpus \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g' | sed 's/,.*//g' | sed 's/^v//' | sed 's/_.*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/vsearch/cluster/meta.yml b/modules/nf-core/vsearch/cluster/meta.yml new file mode 100644 index 00000000..5cc620cb --- /dev/null +++ b/modules/nf-core/vsearch/cluster/meta.yml @@ -0,0 +1,68 @@ +name: "vsearch_cluster" +description: Cluster sequences using a single-pass, greedy centroid-based clustering algorithm. +keywords: + - vsearch + - clustering +tools: + - vsearch: + description: VSEARCH is a versatile open-source tool for microbiome analysis, including chimera detection, clustering, dereplication and rereplication, extraction, FASTA/FASTQ/SFF file processing, masking, orienting, pair-wise alignment, restriction site cutting, searching, shuffling, sorting, subsampling, and taxonomic classification of amplicon sequences for metagenomics, genomics, and population genetics. (USEARCH alternative) + homepage: https://github.com/torognes/vsearch + documentation: https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch_manual.pdf + tool_dev_url: https://github.com/torognes/vsearch + doi: 10.7717/peerj.2584 + licence: ["GPL v3-or-later OR BSD-2-clause"] + +input: + - meta: + type: map + description: Groovy Map containing sample information e.g. [ id:'test' ] + - fasta: + type: file + description: Sequences to cluster in FASTA format + pattern: "*.{fasta,fa,fasta.gz,fa.gz}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - aln: + type: file + description: Results in pairwise alignment format + pattern: "*.aln.gz" + - biom: + type: file + description: Results in an OTU table in the biom version 1.0 file format + pattern: "*.biom.gz" + - mothur: + type: file + description: Results in an OTU table in the mothur ’shared’ tab-separated plain text file format + pattern: "*.mothur.tsv.gz" + - otu: + type: file + description: Results in an OTU table in the classic tab-separated plain text format + pattern: "*.otu.tsv.gz" + - bam: + type: file + description: Results written in bam format + pattern: "*.bam" + - out: + type: file + description: Results in tab-separated output, columns defined by user + pattern: "*.out.tsv.gz" + - blast: + type: file + description: Tab delimited results in blast-like tabular format + pattern: "*.blast.tsv.gz" + - uc: + type: file + description: Tab delimited results in a uclust-like format with 10 columns + pattern: "*.uc.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@mirpedrol" diff --git a/modules/nf-core/vsearch/cluster/vsearch-cluster.diff b/modules/nf-core/vsearch/cluster/vsearch-cluster.diff new file mode 100644 index 00000000..ff6808ac --- /dev/null +++ b/modules/nf-core/vsearch/cluster/vsearch-cluster.diff @@ -0,0 +1,59 @@ +Changes in module 'nf-core/vsearch/cluster' +--- modules/nf-core/vsearch/cluster/main.nf ++++ modules/nf-core/vsearch/cluster/main.nf +@@ -20,7 +20,7 @@ + tuple val(meta), path('*.blast.tsv.gz') , optional: true, emit: blast + tuple val(meta), path('*.uc.tsv.gz') , optional: true, emit: uc + tuple val(meta), path('*.centroids.fasta.gz') , optional: true, emit: centroids +- tuple val(meta), path('*.clusters.fasta.gz') , optional: true, emit: clusters ++ tuple val(meta), path('*_clusters*') , optional: true, emit: clusters + tuple val(meta), path('*.profile.txt.gz') , optional: true, emit: profile + tuple val(meta), path('*.msa.fasta.gz') , optional: true, emit: msa + path "versions.yml" , emit: versions +@@ -37,16 +37,16 @@ + if (!args2.contains("--cluster_fast") && !args2.contains("--cluster_size") && !args2.contains("--cluster_smallmem") && !args2.contains("--cluster_unoise") ) { + error "Unknown clustering option provided (${args2})" + } +- def out_ext = args3.contains("--alnout") ? "aln" : +- args3.contains("--biomout") ? "biom" : +- args3.contains("--blast6out") ? "blast.tsv" : +- args3.contains("--centroids") ? "centroids.fasta" : +- args3.contains("--clusters") ? "clusters.fasta" : +- args3.contains("--mothur_shared_out") ? "mothur.tsv" : +- args3.contains("--msaout") ? "msa.fasta" : +- args3.contains("--otutabout") ? "otu.tsv" : +- args3.contains("--profile") ? "profile.txt" : +- args3.contains("--samout") ? "sam" : ++ def out_ext = args3.contains("--alnout") ? ".aln" : ++ args3.contains("--biomout") ? ".biom" : ++ args3.contains("--blast6out") ? ".blast.tsv" : ++ args3.contains("--centroids") ? ".centroids.fasta" : ++ args3.contains("--clusters") ? "_clusters" : ++ args3.contains("--mothur_shared_out") ? ".mothur.tsv" : ++ args3.contains("--msaout") ? ".msa.fasta" : ++ args3.contains("--otutabout") ? ".otu.tsv" : ++ args3.contains("--profile") ? ".profile.txt" : ++ args3.contains("--samout") ? ".sam" : + args3.contains("--uc") ? "uc.tsv" : + args3.contains("--userout") ? "out.tsv" : + "" +@@ -54,16 +54,9 @@ + """ + vsearch \\ + $args2 $fasta \\ +- $args3 ${prefix}.${out_ext} \\ ++ $args3 ${prefix}${out_ext} \\ + --threads $task.cpus \\ + $args +- +- if [[ $args3 != "--samout" ]] +- then +- gzip -n ${prefix}.${out_ext} +- else +- samtools view -T $fasta -S -b ${prefix}.${out_ext} > ${prefix}.bam +- fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +************************************************************ diff --git a/modules/nf-core/vsearch/sort/main.nf b/modules/nf-core/vsearch/sort/main.nf new file mode 100644 index 00000000..a6f06535 --- /dev/null +++ b/modules/nf-core/vsearch/sort/main.nf @@ -0,0 +1,47 @@ +process VSEARCH_SORT { + tag "$meta.id" + label 'process_low' + + conda "bioconda::vsearch=2.21.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/vsearch:2.21.1--h95f258a_0': + 'quay.io/biocontainers/vsearch:2.21.1--h95f258a_0' }" + + input: + tuple val(meta), path(fasta) + val sort_arg + + output: + tuple val(meta), path("*.fasta"), emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ("$fasta" == "${prefix}.fasta") error "Input and output names are the same, set prefix in module configuration to disambiguate!" + """ + vsearch \\ + $sort_arg $fasta \\ + --threads $task.cpus \\ + --output ${prefix}.fasta \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g;s/,.*//g;s/^v//;s/_.*//') + END_VERSIONS + """ + + stub: + """ + touch ${prefix}.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g;s/,.*//g;s/^v//;s/_.*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/vsearch/sort/meta.yml b/modules/nf-core/vsearch/sort/meta.yml new file mode 100644 index 00000000..a527b7f1 --- /dev/null +++ b/modules/nf-core/vsearch/sort/meta.yml @@ -0,0 +1,49 @@ +name: "vsearch_sort" +description: Sort fasta entries by decreasing abundance (--sortbysize) or sequence length (--sortbylength). +keywords: + - vsearch/sort + - vsearch + - sort + - amplicon sequences + - metagenomics + - genomics + - population genetics +tools: + - vsearch: + description: VSEARCH is a versatile open-source tool for microbiome analysis, including chimera detection, clustering, dereplication and rereplication, extraction, FASTA/FASTQ/SFF file processing, masking, orienting, pair-wise alignment, restriction site cutting, searching, shuffling, sorting, subsampling, and taxonomic classification of amplicon sequences for metagenomics, genomics, and population genetics. (USEARCH alternative) + homepage: https://github.com/torognes/vsearch + documentation: https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch_manual.pdf + tool_dev_url: https://github.com/torognes/vsearch + doi: 10.7717/peerj.2584 + licence: ["GPL v3-or-later OR BSD-2-clause"] + +input: + - meta: + type: map + description: Groovy Map containing sample information e.g. [ id:'test' ] + - fasta: + type: file + description: Sequences to be sorted in FASTA format + pattern: "*.{fasta,fa,fasta.gz,fa.gz}" + - sort_arg: + type: string + description: Argument to provide to sort algorithm. Sort by abundance with --sortbysize or by sequence length with --sortbylength. + enums: ["--sortbysize", "--sortbylength"] + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Sorted FASTA file + pattern: "*.{fasta}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@mirpedrol" diff --git a/nextflow.config b/nextflow.config index a1fa8021..1db28f3e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -22,6 +22,18 @@ params { min_reads = 30 min_targeted_genes = 3 + // Pipeline steps + overrepresented = false + umi_clustering = false + + // UMI parameters + umi_bin_size = 1 + medaka_model = 'r941_min_high_g303' + + // Vsearch options + vsearch_minseqlength = 55 + vsearch_maxseqlength = 57 + vsearch_id = 0.99 // References genome = null @@ -110,6 +122,7 @@ profiles { docker { docker.enabled = true docker.userEmulation = true + docker.registry = 'quay.io' singularity.enabled = false podman.enabled = false shifter.enabled = false @@ -154,6 +167,7 @@ profiles { } test_targeted { includeConfig 'conf/test_targeted.config' } test_full { includeConfig 'conf/test_full.config' } + test_umis { includeConfig 'conf/test_umis.config' } test_screening { includeConfig 'conf/test_screening.config' } } diff --git a/nextflow_schema.json b/nextflow_schema.json index b38ebd95..f599d796 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -48,15 +48,55 @@ } } }, + "targeted_pipeline_steps": { + "title": "targeted pipeline steps", + "type": "object", + "description": "Alternative pipeline steps to include in the targeted analysis.", + "default": "", + "properties": { + "overrepresented": { + "type": "boolean", + "fa_icon": "fas fa-sort-numeric-up-alt", + "description": "Trim overrepresented sequences from reads (cutadapt)" + }, + "umi_clustering": { + "type": "boolean", + "fa_icon": "fas fa-layer-group", + "description": "If the sample contains umi-molecular identifyers (UMIs), run the UMI extraction, clustering and consensus steps." + } + }, + "fa_icon": "fas fa-shoe-prints" + }, + "umi_parameters": { + "title": "UMI parameters", + "type": "object", + "description": "Parameters regarding umi molecular identifiers (UMIs)", + "default": "", + "properties": { + "umi_bin_size": { + "type": "integer", + "default": 1, + "description": "Minimum size of a UMI cluster.", + "fa_icon": "fas fa-sort-amount-down" + }, + "medaka_model": { + "type": "string", + "default": "r941_min_high_g360", + "fa_icon": "fas fa-font", + "description": "Medaka model (-m) to use according to the basecaller used." + } + }, + "fa_icon": "fas fa-layer-group" + }, "targeted_parameters": { "title": "Targeted parameters", "type": "object", - "description": "", + "description": "Parameters used for alignment processes", "default": "", "properties": { "aligner": { "type": "string", - "description": "Aligner program to use", + "description": "Aligner program to use.", "default": "minimap2", "fa_icon": "fas fa-align-justify", "enum": ["minimap2", "bwa", "bowtie2"] @@ -66,7 +106,42 @@ "fa_icon": "fas fa-grip-lines", "description": "Provide the same protospacer sequence for all samples. Will override protospacer sequences provided by an input samplesheet." } - } + }, + "fa_icon": "fas fa-align-justify" + }, + "vsearch_parameters": { + "title": "Vsearch parameters", + "type": "object", + "description": "Parameters to use in Vsearch processes", + "default": "", + "properties": { + "vsearch_minseqlength": { + "type": "integer", + "default": 55, + "fa_icon": "fas fa-minus", + "description": "Vsearch minimum sequence length.", + "help_text": "Discard sequences shorter than vsearch_minseqlength.", + "minimum": 1 + }, + "vsearch_maxseqlength": { + "type": "integer", + "default": 57, + "fa_icon": "fas fa-plus", + "description": "Vsearch maximum sequence length.", + "help_text": "Discard sequences longer than vsearch_minseqlength.", + "minimum": 1 + }, + "vsearch_id": { + "type": "number", + "default": 0.99, + "fa_icon": "fas fa-equals", + "description": "Vsearch pairwise identity threshold.", + "help_text": "Do not add the target to the cluster if the pairwise identity with the centroid is lower than id. The pairwise identity is defined as the number of (matching columns) / (alignment length - terminal gaps).", + "minimum": 0, + "maximum": 1 + } + }, + "fa_icon": "fas fa-layer-group" }, "screening_parameters": { "title": "Screening parameters", @@ -98,12 +173,12 @@ "min_reads": { "type": "number", "description": "a filter threshold value for sgRNAs, based on their average counts in the control sample", - "default": 30.0 + "default": 30 }, "min_targeted_genes": { "type": "number", "description": "Minimal number of different genes targeted by sgRNAs in a biased segment in order for the corresponding counts to be corrected", - "default": 3.0 + "default": 3 } } }, @@ -326,6 +401,13 @@ "description": "Show all params when using `--help`", "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." + }, + "schema_ignore_params": { + "type": "string", + "default": "genomes", + "description": "Ignore JSON schema validation of the following params", + "fa_icon": "fas fa-ban", + "hidden": true } } } @@ -334,9 +416,27 @@ { "$ref": "#/definitions/input_output_options" }, + { + "$ref": "#/definitions/targeted_pipeline_steps" + }, + { + "$ref": "#/definitions/umi_parameters" + }, + { + "$ref": "#/definitions/targeted_pipeline_steps" + }, + { + "$ref": "#/definitions/umi_parameters" + }, { "$ref": "#/definitions/targeted_parameters" }, + { + "$ref": "#/definitions/vsearch_parameters" + }, + { + "$ref": "#/definitions/vsearch_parameters" + }, { "$ref": "#/definitions/screening_parameters" }, diff --git a/templates/merging_summary.py b/templates/merging_summary.py index 0eb1239a..804a5be3 100755 --- a/templates/merging_summary.py +++ b/templates/merging_summary.py @@ -6,22 +6,28 @@ with gzip.open("${raw_reads[0]}", "rt") as handle: raw_reads_count = len(list(SeqIO.parse(handle, "fastq"))) -if "$assembled_reads" == "null": + +if "$assembled_reads" == "null_a": assembled_reads_count = 0 else: with gzip.open("$assembled_reads", "rt") as handle: assembled_reads_count = len(list(SeqIO.parse(handle, "fastq"))) # Merged reads R1+R2 + with gzip.open("$trimmed_reads", "rt") as handle: trimmed_reads_count = len(list(SeqIO.parse(handle, "fastq"))) # Filtered reads -with open("$trimmed_adapters", "r") as handle: - for line in handle: - if line.startswith("Reads with adapters"): - for field in "".join(line.split(",")).split(): - if field.isdigit(): - adapters_count = field # reads with adapters - if "%" in field: - adapters_percentage = field # percentage of reads with adapters: ex. "(100.0%)" +if "$trimmed_adapters" == "null_t": + adapters_count = 0 + adapters_percentage = "(0.0%)" +else: + with open("$trimmed_adapters", "r") as handle: + for line in handle: + if line.startswith("Reads with adapters"): + for field in "".join(line.split(",")).split(): + if field.isdigit(): + adapters_count = field # reads with adapters + if "%" in field: + adapters_percentage = field # percentage of reads with adapters: ex. "(100.0%)" if "$task.ext.prefix" != "null": prefix = "$task.ext.prefix" @@ -35,7 +41,7 @@ f"merged-reads, {assembled_reads_count} ({round(assembled_reads_count * 100 / raw_reads_count,1)}%)\\n" ) output_file.write(f"reads-with-adapters, {adapters_count} {adapters_percentage}\\n") - if "$assembled_reads" == "null": + if "$assembled_reads" == "null_a": output_file.write( f"quality-filtered-reads, {trimmed_reads_count} ({round(trimmed_reads_count * 100 / raw_reads_count,1)}%)\\n" ) diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index eeb155e6..d78bda2e 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ -41,17 +41,16 @@ include { INPUT_CHECK } from '../subworkflows/local/input_c // // MODULE // -include { FIND_ADAPTERS } from '../modules/local/find_adapters' -include { EXTRACT_UMIS } from '../modules/local/extract_umis' -include { SEQ_TO_FILE as SEQ_TO_FILE_REF } from '../modules/local/seq_to_file' -include { SEQ_TO_FILE as SEQ_TO_FILE_TEMPL } from '../modules/local/seq_to_file' -include { ORIENT_REFERENCE } from '../modules/local/orient_reference' -include { CIGAR_PARSER } from '../modules/local/cigar_parser' -include { MERGING_SUMMARY } from '../modules/local/merging_summary' -include { CLUSTERING_SUMMARY } from '../modules/local/clustering_summary' -include { ALIGNMENT_SUMMARY } from '../modules/local/alignment_summary' -include { TEMPLATE_REFERENCE } from '../modules/local/template_reference' -include { DUMMY_FINAL_UMI } from '../modules/local/dummy_final_umi' +include { FIND_ADAPTERS } from '../modules/local/find_adapters' +include { EXTRACT_UMIS } from '../modules/local/extract_umis' +include { SEQ_TO_FILE as SEQ_TO_FILE_REF } from '../modules/local/seq_to_file' +include { SEQ_TO_FILE as SEQ_TO_FILE_TEMPL } from '../modules/local/seq_to_file' +include { ORIENT_REFERENCE } from '../modules/local/orient_reference' +include { CIGAR_PARSER } from '../modules/local/cigar_parser' +include { MERGING_SUMMARY } from '../modules/local/merging_summary' +include { CLUSTERING_SUMMARY } from '../modules/local/clustering_summary' +include { ALIGNMENT_SUMMARY } from '../modules/local/alignment_summary' +include { TEMPLATE_REFERENCE } from '../modules/local/template_reference' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -67,17 +66,66 @@ include { MULTIQC } from '../modules/nf-co include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { PEAR } from '../modules/nf-core/pear/main' include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' -include { SEQTK_SEQ } from '../modules/nf-core/seqtk/seq/main' +include { SEQTK_SEQ as SEQTK_SEQ_MASK } from '../modules/nf-core/seqtk/seq/main' +include { SEQTK_SEQ as SEQTK_SEQ_FATOFQ } from '../modules/nf-core/seqtk/seq/main' +include { VSEARCH_CLUSTER } from '../modules/nf-core/vsearch/cluster/main' +include { VSEARCH_SORT } from '../modules/nf-core/vsearch/sort/main' +include { RACON as RACON_1 } from '../modules/nf-core/racon/main' +include { RACON as RACON_2 } from '../modules/nf-core/racon/main' include { BOWTIE2_ALIGN } from '../modules/nf-core/bowtie2/align/main' include { BOWTIE2_BUILD } from '../modules/nf-core/bowtie2/build/main' include { BWA_MEM } from '../modules/nf-core/bwa/mem/main' include { BWA_INDEX } from '../modules/nf-core/bwa/index/main' include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_ORIGINAL } from '../modules/nf-core/minimap2/align/main' +include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_1 } from '../modules/nf-core/minimap2/align/main' +include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_2 } from '../modules/nf-core/minimap2/align/main' include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_TEMPLATE } from '../modules/nf-core/minimap2/align/main' +include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' +include { MINIMAP2_INDEX } from '../modules/nf-core/minimap2/index/main' +include { MEDAKA } from '../modules/nf-core/medaka/main' include { CUTADAPT } from '../modules/nf-core/cutadapt/main' include { SAMTOOLS_INDEX } from '../modules/nf-core/samtools/index/main' +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + DEFINE GROOVY FUNCTIONS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +def umi_to_sequence(cluster1) { + String line1 + String sequences = "" + String sequence1 + String id1 + cluster1.withReader { + while ( line1=it.readLine() ) { + if (line1.startsWith(">")) { + sequence1 = (line1 =~ /;seq=(.*$)/)[0][1] + id1 = (line1 =~ /(>.*?);/)[0][1] + sequences = sequences + id1 + "\n" + sequence1 + "\n" + } + } + } + return sequences +} + +def umi_to_sequence_centroid(cluster) { + String line + String sequence + String id + cluster.withReader { + while ( line=it.readLine() ) { + if (line.startsWith(">")) { + sequence = (line =~ /;seq=(.*$)/)[0][1] + id = (line =~ /(>.*?);/)[0][1] + return id.replace(">", ">centroid_") + "\n" + sequence + } + } + } +} + + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -226,103 +274,311 @@ workflow CRISPRSEQ_TARGETED { ) ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - // - // MODULE: Find overrepresented sequences - FIND_ADAPTERS ( - FASTQC.out.zip - ) - .adapters - .join(ch_pear_fastq) - .groupTuple(by: [0]) - // Separate samples by containing overrepresented sequences or not - .branch { - meta, adapter_seqs, reads -> - no_adapters: adapter_seqs[0].size() == 0 - return [ meta, reads[0] ] - adapters : adapter_seqs[0].size() > 0 - return [ meta, reads[0], adapter_seqs[0] ] - } - .set { ch_adapter_seqs } + ch_trimmed = Channel.empty() + if (params.overrepresented) { + // + // MODULE: Find overrepresented sequences + FIND_ADAPTERS ( + FASTQC.out.zip + ) + .adapters + .join(ch_pear_fastq) + .groupTuple(by: [0]) + // Separate samples by containing overrepresented sequences or not + .branch { + meta, adapter_seqs, reads -> + no_adapters: adapter_seqs[0].size() == 0 + return [ meta, reads[0] ] + adapters : adapter_seqs[0].size() > 0 + return [ meta, reads[0], adapter_seqs[0] ] + } + .set { ch_adapter_seqs } - // - // MODULE: Trim adapter sequences - // - CUTADAPT ( - ch_adapter_seqs.adapters - ) - ch_versions = ch_versions.mix(CUTADAPT.out.versions) - ch_adapter_seqs.no_adapters - .mix(CUTADAPT.out.reads) - .groupTuple(by: [0]) - .map { - meta, fastq -> - return [ meta, fastq.flatten() ] + // + // MODULE: Trim adapter sequences + // + CUTADAPT ( + ch_adapter_seqs.adapters + ) + ch_versions = ch_versions.mix(CUTADAPT.out.versions) + + ch_adapter_seqs.no_adapters + .mix(CUTADAPT.out.reads) + .groupTuple(by: [0]) + .map { + meta, fastq -> + return [ meta, fastq.flatten() ] + } + .set{ ch_trimmed } + } else { + ch_trimmed = ch_pear_fastq } - .set{ ch_trimmed } + // // MODULE: Mask (convert to Ns) bases with quality lower than 20 and remove sequences shorter than 80 // - SEQTK_SEQ ( + SEQTK_SEQ_MASK ( ch_trimmed ) - ch_versions = ch_versions.mix(SEQTK_SEQ.out.versions) - + ch_versions = ch_versions.mix(SEQTK_SEQ_MASK.out.versions) - // - // MODULE: Summary of merged reads - // - MERGING_SUMMARY { + if (params.overrepresented) { ch_cat_fastq.paired .mix(ch_cat_fastq.single) .join(PEAR.out.assembled, remainder: true) - .join(SEQTK_SEQ.out.fastx) + .join(SEQTK_SEQ_MASK.out.fastx) .join(CUTADAPT.out.log) + .set { ch_merging_summary_data } + } else { + ch_cat_fastq.paired + .mix(ch_cat_fastq.single) + .join(PEAR.out.assembled, remainder: true) + .join(SEQTK_SEQ_MASK.out.fastx) + .combine(Channel.value("null")) .map { meta, reads, assembled, masked, trimmed -> if (assembled == null) { - dummy = file('null') - return [ meta, reads, dummy, masked, trimmed ] - } else { - return [ meta, reads, assembled, masked, trimmed ] + assembled = file('null_a') } + if (trimmed == "null") { + trimmed = file('null_t') + } + return [ meta, reads, assembled, masked, trimmed ] } + .set { ch_merging_summary_data } } - - /* - The UMI clustering step is posponed until the next release, the steps to be implemented are listed below: - // - // MODULE: Extract UMI sequences + // MODULE: Summary of merged reads // - EXTRACT_UMIS ( - SEQTK_SEQ.out.fastx - ) + MERGING_SUMMARY { + ch_merging_summary_data + } + + + if (params.umi_clustering) { + // + // MODULE: Extract UMI sequences + // + EXTRACT_UMIS ( + SEQTK_SEQ_MASK.out.fastx + ) + + + // + // MODULE: Cluster UMIs + // + VSEARCH_CLUSTER ( + EXTRACT_UMIS.out.fasta + ) + + // Obtain a file with UBS (UBI bin size) and UMI ID + VSEARCH_CLUSTER.out.clusters + .transpose() + .collectFile( storeDir:params.outdir ) { + it -> + [ "${it[0].id}_ubs.txt", "${it[1].countFasta()}\t${it[1].baseName}\n" ] + } + + // Branch the clusters into the ones containing only one sequence and the ones containing more than one sequences + VSEARCH_CLUSTER.out.clusters + .transpose() + .branch{ + meta, cluster -> + single: cluster.countFasta() >= params.umi_bin_size && cluster.countFasta() == 1 + return [meta, cluster] + cluster: cluster.countFasta() >= params.umi_bin_size && cluster.countFasta() > 1 + return [meta, cluster] + } + .set{ ch_umi_bysize } + + // Get the correspondent fasta sequencences from single clusters + ch_umi_bysize.single + .tap{ meta_channel } + .map{ meta, cluster -> + fasta_line = umi_to_sequence(cluster) + [meta, cluster.baseName, fasta_line] + } + .collectFile() { meta, name, fasta -> + [ "${name}_consensus.fasta", fasta ] + } + .map{ new_file -> + [new_file.baseName, new_file] + } + .join(meta_channel + .map { meta, original_file -> + ["${original_file.baseName}_consensus", meta] + }) + .map{ file_name, new_file, meta -> + [meta, new_file] + } + .set{ ch_single_clusters_consensus } + + // + // MODULE: Obtain most abundant UMI in cluster + // + VSEARCH_SORT( + ch_umi_bysize.cluster, + Channel.value("--sortbysize") + ) - Modules to implement: + // Get the correspondent fasta sequencences from top cluster sequences + // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps + VSEARCH_SORT.out.fasta // [[id:sample_id, ...], sample_top.fasta] + .tap{ meta_channel_2 } // [[id:sample_id, ...], sample_top.fasta] + .map{ meta, fasta -> + fasta_line = umi_to_sequence_centroid(fasta) + [meta, fasta.baseName, fasta_line] // [[id:sample_id, ...], sample_top, >centroid_...] + } + .collectFile(cache:true,sort:true) { meta, name, fasta -> + [ "${name}.fasta", fasta ] // >centroid_... -> sample_top.fasta + } + .dump(tag:"collectfile output") + .map{ new_file -> + [new_file.baseName, new_file] // Substring is removing "_top" added by VSEARCH_SORT // [sample, sample_top.fasta] + } + .join(meta_channel_2 + .map { meta, original_file -> + ["${original_file.baseName}", meta] // Substring is removing "_top" added by VSEARCH_SORT // [sample, [id:sample_id, ...]] + }) // [sample, sample_top.fasta, [id:sample_id, ...]] + .map{ file_name, new_file, meta -> + [meta + [cluster_id: file_name[0..-5]], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta] + } + .set{ ch_top_clusters_sequence } // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta] + + + // Get the correspondent fasta sequencences from UMI clusters + ch_umi_bysize.cluster // [[id:sample_id, ...], sample] + .tap{ meta_channel_3 } // [[id:sample_id, ...], sample] + .map{ meta, cluster -> + fasta_line = umi_to_sequence(cluster) + [meta, cluster.baseName, fasta_line] // [[id:sample_id, ...], sample, >...] + } + .dump(tag:'map allreads output') + .collectFile(cache:true,sort:true) { meta, name, fasta -> + [ "${name}_sequences.fasta", fasta ] // >... -> sample_sequences.fasta + } + .dump(tag:"collectfile allreads output") + .map{ new_file -> + [new_file.baseName[0..-11], new_file] // Substring is removing "_sequences" added by collectFile // [sample, sample_sequences.fasta] + } + .join(meta_channel_3 + .map { meta, original_file -> + ["${original_file.baseName}", meta] // [sample, [id:sample_id, ...]] + }) // [sample, sample_sequences.fasta, [id:sample_id, ...]] + .map{ file_name, new_file, meta -> + [meta + [cluster_id: file_name], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_sequences.fasta] + } + .set{ ch_clusters_sequence } - vsearch - get_ubs - top_read - polishing: minimap2, racon - consensus - join_reads - fa2fq - */ + // Cluster consensus & polishing + // Two cycles of minimap2 + racon - // Dummy process simulating the last UMi clustering step to obtain clusters as fastq - DUMMY_FINAL_UMI { - SEQTK_SEQ.out.fastx + // + // MODULE: Mapping with minimap2 - cycle 1 + // + // Map each cluster against the top read (most abundant UMI) in the cluster + MINIMAP2_ALIGN_UMI_1 ( + ch_clusters_sequence + .join(ch_top_clusters_sequence), + false, //output in paf format + false, + false + ) + + MINIMAP2_ALIGN_UMI_1.out.paf.dump(tag:'minimap_unfiltered') + + // Only continue with clusters that have aligned sequences + MINIMAP2_ALIGN_UMI_1.out.paf + .filter{ it[1].countLines() > 0 } + .set{ ch_minimap_1 } + + // + // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 1 + // + RACON_1 ( + ch_clusters_sequence + .join(ch_top_clusters_sequence) + .join(ch_minimap_1) + ) + + // + // MODULE: Mapping with minimap2 - cycle 2 + // + // Map each cluster against the top read (most abundant UMI) in the cluster + MINIMAP2_ALIGN_UMI_2 ( + ch_clusters_sequence + .join(RACON_1.out.improved_assembly), + false, //output in paf format + false, + false + ) + + // Only continue with clusters that have aligned sequences + MINIMAP2_ALIGN_UMI_2.out.paf + .filter{ it[1].countLines() > 0 } + .set{ ch_minimap_2 } + + // + // MODULE: Improve top read from UMI cluster using cluster consensus - cycle 2 + // + RACON_2 ( + ch_clusters_sequence + .join(RACON_1.out.improved_assembly) + .join(ch_minimap_2) + ) + + + // + // MODULE: Obtain a consensus sequence + // + MEDAKA ( + ch_clusters_sequence + .join(RACON_2.out.improved_assembly) + ) + + // Collect all consensus UMI sequences into one single file per sample + MEDAKA.out.assembly + .tap{ meta_channel_4 } + .map{ meta, file -> + file_content = file.getText() + [meta, file_content] // [[id:sample_id, ...], consensus_content] + } + .collectFile() { meta, file -> + [ "${meta.id}_consensus.fasta", file ] + } + .map{ new_file -> + [new_file.baseName, new_file] + } + .join(meta_channel_4 + .map{ meta, consensus -> + ["${meta.id}_consensus", meta] + } + ) + .map{ name, file, meta -> + [meta - meta.subMap('cluster_id'), file] + } + .set{ ch_umi_consensus } + + + // + // MODULE: Convert fasta to fastq + // + SEQTK_SEQ_FATOFQ ( + ch_umi_consensus + ) } + ch_preprocess_reads = params.umi_clustering ? SEQTK_SEQ_FATOFQ.out.fastx : SEQTK_SEQ_MASK.out.fastx + // // MODULE: Summary of clustered reads // CLUSTERING_SUMMARY ( - SEQTK_SEQ.out.fastx + ch_preprocess_reads .join(MERGING_SUMMARY.out.summary) ) @@ -332,7 +588,7 @@ workflow CRISPRSEQ_TARGETED { // if (params.aligner == "minimap2") { MINIMAP2_ALIGN_ORIGINAL ( - SEQTK_SEQ.out.fastx + ch_preprocess_reads .join(ORIENT_REFERENCE.out.reference), true, false, @@ -351,7 +607,7 @@ workflow CRISPRSEQ_TARGETED { ) ch_versions = ch_versions.mix(BWA_INDEX.out.versions) BWA_MEM ( - SEQTK_SEQ.out.fastx, + ch_preprocess_reads, BWA_INDEX.out.index, true ) @@ -368,7 +624,7 @@ workflow CRISPRSEQ_TARGETED { ) ch_versions = ch_versions.mix(BOWTIE2_BUILD.out.versions) BOWTIE2_ALIGN ( - SEQTK_SEQ.out.fastx, + ch_preprocess_reads, BOWTIE2_BUILD.out.index, false, true