diff --git a/CHANGELOG.md b/CHANGELOG.md index bfdb9a8..8a41ccd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## unreleased + +### Added + +- `bwa mem` as short-read mapper alternative, parameter: `--bwa` + ## [v1.0.2] - 2024-05-17 ### Added diff --git a/CITATIONS.md b/CITATIONS.md index 8d0e428..641d26b 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -12,6 +12,10 @@ > Quinlan AR, Hall IM. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics. 2010 Mar 15;26(6):841-2. doi: 10.1093/bioinformatics/btq033. Epub 2010 Jan 28. PubMed PMID: 20110278; PubMed Central PMCID: PMC2832824. +- [BWA](https://arxiv.org/abs/1303.3997) + + > Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. arXiv:1303.3997v2 [q-bio.GN] + - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) - [Minimap2](https://pubmed.ncbi.nlm.nih.gov/29750242/) diff --git a/clean.nf b/clean.nf index 00f245e..a818254 100755 --- a/clean.nf +++ b/clean.nf @@ -10,7 +10,7 @@ Author: hoelzer.martin@gmail.com // Parameters sanity checking -Set valid_params = ['max_cores', 'cores', 'max_memory', 'memory', 'profile', 'help', 'input', 'input_type', 'list', 'host', 'own', 'control', 'keep', 'rm_rrna', 'bbduk', 'bbduk_kmer', 'bbduk_qin', 'reads_rna', 'min_clip', 'dcs_strict', 'output', 'multiqc_dir', 'nf_runinfo_dir', 'databases', 'cleanup_work_dir','condaCacheDir', 'singularityCacheDir', 'singularityCacheDir', 'cloudProcess', 'conda-cache-dir', 'singularity-cache-dir', 'cloud-process', 'publish_dir_mode', 'no_intermediate', 'skip_qc'] // don't ask me why there is also 'conda-cache-dir', 'singularity-cache-dir', 'cloud-process' +Set valid_params = ['max_cores', 'cores', 'max_memory', 'memory', 'profile', 'help', 'input', 'input_type', 'list', 'host', 'own', 'control', 'keep', 'rm_rrna', 'bwa', 'bbduk', 'bbduk_kmer', 'bbduk_qin', 'reads_rna', 'min_clip', 'dcs_strict', 'output', 'multiqc_dir', 'nf_runinfo_dir', 'databases', 'cleanup_work_dir','condaCacheDir', 'singularityCacheDir', 'singularityCacheDir', 'cloudProcess', 'conda-cache-dir', 'singularity-cache-dir', 'cloud-process', 'publish_dir_mode', 'no_intermediate', 'skip_qc'] // don't ask me why there is also 'conda-cache-dir', 'singularity-cache-dir', 'cloud-process' def parameter_diff = params.keySet() - valid_params if (parameter_diff.size() != 0){ exit 1, "ERROR: Parameter(s) $parameter_diff is/are not valid in the pipeline!\n" @@ -275,6 +275,7 @@ def helpMSG() { Reads are assigned to a combined index for decontamination and keeping. The use of this parameter can prevent false positive hits and the accidental removal of reads due to (poor quality) mappings. ${c_green}--rm_rrna ${c_reset} Clean your data from rRNA [default: $params.rm_rrna] + ${c_green}--bwa${c_reset} Add this flag to use BAW MEM instead of minimap2 for decontamination of short reads [default: $params.bwa] ${c_green}--bbduk${c_reset} Add this flag to use bbduk instead of minimap2 for decontamination of short reads [default: $params.bbduk] ${c_green}--bbduk_kmer${c_reset} Set kmer for bbduk [default: $params.bbduk_kmer] ${c_green}--bbduk_qin${c_reset} Set quality ASCII encoding for bbduk [default: $params.bbduk_qin; options are: 64, 33, auto] diff --git a/configs/container.config b/configs/container.config index f713429..0f746c8 100644 --- a/configs/container.config +++ b/configs/container.config @@ -1,11 +1,12 @@ process { - withLabel: smallTask { container = 'nanozoo/samtools:1.14--d8fb865' } - withLabel: minimap2 { container = 'nanozoo/minimap2:2.26--d9ef6b6' } - withLabel: bbmap { container = 'nanozoo/bbmap:38.79--8e915d7' } - withLabel: multiqc { container = 'nanozoo/multiqc:1.9--aba729b' } - withLabel: fastqc { container = 'nanozoo/fastqc:0.11.9--f61b8b4' } - withLabel: nanoplot { container = 'nanozoo/nanoplot:1.32.0--1ae6f5d' } - withLabel: quast { container = 'nanozoo/quast:5.0.2--e7f0cfe' } - withLabel: bed_samtools { container = 'nanozoo/bed_samtools:2.30.0--cc7d1b9' } - withLabel: seqkit { container = 'nanozoo/seqkit:2.6.1--022e008' } + withLabel: smallTask { container = 'nanozoo/samtools:1.14--d8fb865' } + withLabel: minimap2 { container = 'nanozoo/minimap2:2.26--d9ef6b6' } + withLabel: bwa { container = 'nanozoo/bwa:0.7.18--0eff897' } + withLabel: bbmap { container = 'nanozoo/bbmap:38.79--8e915d7' } + withLabel: multiqc { container = 'nanozoo/multiqc:1.9--aba729b' } + withLabel: fastqc { container = 'nanozoo/fastqc:0.11.9--f61b8b4' } + withLabel: nanoplot { container = 'nanozoo/nanoplot:1.32.0--1ae6f5d' } + withLabel: quast { container = 'nanozoo/quast:5.0.2--e7f0cfe' } + withLabel: bed_samtools { container = 'nanozoo/bed_samtools:2.30.0--cc7d1b9' } + withLabel: seqkit { container = 'nanozoo/seqkit:2.6.1--022e008' } } diff --git a/configs/node.config b/configs/node.config index f1bdb73..c9705b5 100644 --- a/configs/node.config +++ b/configs/node.config @@ -1,11 +1,12 @@ process { - withLabel: minimap2 { cpus = 24; memory = 24.GB } - withLabel: bbmap { cpus = 24; memory = 24.GB } - withLabel: smallTask { cpus = 1; memory = 2.GB } - withLabel: pysam { cpus = 2; memory = 4.GB } - withLabel: fastqc { cpus = {2 * task.attempt}; memory = {4.GB * task.attempt } ; maxRetries = 3 ; errorStrategy = { task.exitStatus in 130..140 ? 'retry' : 'terminate' } } - withLabel: multiqc { cpus = 4; memory = 4.GB } - withLabel: nanoplot{ cpus = 8; memory = 8.GB } - withLabel: quast{ cpus = 8; memory = 8.GB } + withLabel: minimap2 { cpus = 24; memory = {24.GB * task.attempt}; maxRetries = 4 ; errorStrategy = { task.exitStatus in 1 || 130..140 ? 'retry' : 'terminate' }; } + withLabel: bwa { cpus = 24; memory = {24.GB * task.attempt}; maxRetries = 3 ; errorStrategy = { task.exitStatus in 130..140 ? 'retry' : 'terminate' } } + withLabel: bbmap { cpus = 24; memory = {24.GB * task.attempt}; maxRetries = 6 ; errorStrategy = { task.exitStatus in 1 || 130..140 ? 'retry' : 'terminate' }; } + withLabel: smallTask { cpus = 1; memory = 2.GB } + withLabel: pysam { cpus = 2; memory = 4.GB } + withLabel: fastqc { cpus = 2; memory = {4.GB * task.attempt } ; maxRetries = 3 ; errorStrategy = { task.exitStatus in 130..140 ? 'retry' : 'terminate' }; } + withLabel: multiqc { cpus = 4; memory = 4.GB } + withLabel: nanoplot { cpus = 8; memory = 8.GB } + withLabel: quast { cpus = 8; memory = 8.GB } } diff --git a/envs/bwa.yaml b/envs/bwa.yaml new file mode 100644 index 0000000..b443d6f --- /dev/null +++ b/envs/bwa.yaml @@ -0,0 +1,8 @@ +name: bwa +channels: + - bioconda + - conda-forge +dependencies: + - bwa=0.7.18 + - samtools=1.20 + - htslib=1.20 \ No newline at end of file diff --git a/modules/bwa.nf b/modules/bwa.nf new file mode 100644 index 0000000..d6b5179 --- /dev/null +++ b/modules/bwa.nf @@ -0,0 +1,52 @@ +process bwa_index { + label 'bwa' + + input: + path(fasta) + + output: + path(bwa) , emit: index + + script: + """ + mkdir bwa + bwa \\ + index \\ + -p bwa/db \\ + $fasta + """ + + stub: + """ + mkdir bwa + + touch bwa/db.{amb,ann,bwt,pac,sa} + """ +} + +process bwa { + label 'bwa' + + input: + tuple val(name), path(input) + path(db_index) + path(db) + + + output: + tuple val(name), val('raw'), path("${name}.bam"), emit: bam // input just for naming + + script: + """ + INDEX=`find -L ./ -name "*.amb" | sed 's/\\.amb\$//'` + bwa mem \\ + -t $task.cpus \\ + \$INDEX \\ + $input \\ + | samtools view -bhS -@ $task.cpus > ${name}.bam + """ + stub: + """ + touch ${name}.bam + """ +} diff --git a/nextflow.config b/nextflow.config index 694092f..58fa1b5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -24,6 +24,7 @@ params { control = false keep = false rm_rrna = false + bwa = false bbduk = false bbduk_kmer = 27 bbduk_qin = 'auto' diff --git a/tests/fasta/main.nf.test b/tests/fasta/main.nf.test index dde2ae0..0abbfd9 100644 --- a/tests/fasta/main.nf.test +++ b/tests/fasta/main.nf.test @@ -4,6 +4,7 @@ nextflow_pipeline { script "../../clean.nf" test("Stub") { + tag "minimap2" options "-stub-run" when { diff --git a/tests/illumina/main.nf.test b/tests/illumina/main.nf.test index d53af29..c62b7c0 100644 --- a/tests/illumina/main.nf.test +++ b/tests/illumina/main.nf.test @@ -4,12 +4,13 @@ nextflow_pipeline { script "../../clean.nf" test("Stub paired-end") { + tag "minimap2" options "-stub-run" when { params { - input_type = "illumina" - input = "$projectDir/assets/EMPTY_FILE_{R1,R2}.fastq" + input_type = "illumina" + input = "$projectDir/assets/EMPTY_FILE_{R1,R2}.fastq" own = "$projectDir/assets/EMPTY_FILE" } } @@ -22,12 +23,13 @@ nextflow_pipeline { } test("Stub single-end") { + tag "minimap2" options "-stub-run" when { params { - input_type = "illumina_single_end" - input = "$projectDir/assets/EMPTY_FILE" + input_type = "illumina_single_end" + input = "$projectDir/assets/EMPTY_FILE" own = "$projectDir/assets/EMPTY_FILE" } } @@ -39,13 +41,54 @@ nextflow_pipeline { } } + test("Stub paired-end bwa") { + tag "bwa" + options "-stub-run" + + when { + params { + input_type = "illumina" + input = "$projectDir/assets/EMPTY_FILE_{R1,R2}.fastq" + own = "$projectDir/assets/EMPTY_FILE" + bwa = true + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } + + test("Stub single-end bwa") { + tag "bwa" + options "-stub-run" + + when { + params { + input_type = "illumina_single_end" + input = "$projectDir/assets/EMPTY_FILE" + own = "$projectDir/assets/EMPTY_FILE" + bwa = true + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } + test("Stub paired-end bbduk") { + tag "bbduk" options "-stub-run" when { params { - input_type = "illumina" - input = "$projectDir/assets/EMPTY_FILE_{R1,R2}.fastq" + input_type = "illumina" + input = "$projectDir/assets/EMPTY_FILE_{R1,R2}.fastq" own = "$projectDir/assets/EMPTY_FILE" bbduk = true } @@ -59,12 +102,13 @@ nextflow_pipeline { } test("Stub single-end bbduk") { + tag "bbduk" options "-stub-run" when { params { - input_type = "illumina_single_end" - input = "$projectDir/assets/EMPTY_FILE" + input_type = "illumina_single_end" + input = "$projectDir/assets/EMPTY_FILE" own = "$projectDir/assets/EMPTY_FILE" bbduk = true } diff --git a/tests/nanopore/main.nf.test b/tests/nanopore/main.nf.test index eb5ac62..0894d67 100644 --- a/tests/nanopore/main.nf.test +++ b/tests/nanopore/main.nf.test @@ -4,6 +4,7 @@ nextflow_pipeline { script "../../clean.nf" test("Stub") { + tag "minimap2" options "-stub-run" when { diff --git a/workflows/clean_wf.nf b/workflows/clean_wf.nf index 5c76a1c..904ce06 100644 --- a/workflows/clean_wf.nf +++ b/workflows/clean_wf.nf @@ -1,4 +1,5 @@ include { minimap2 } from '../modules/minimap2' +include { bwa_index; bwa } from '../modules/bwa' include { bbduk } from '../modules/bbmap' include { bbduk_stats } from '../modules/utils' include { split_bam; fastq_from_bam ; idxstats_from_bam ; flagstats_from_bam ; index_bam as index_bam; index_bam as index_bam2; sort_bam ; filter_true_dcs_alignments ; merge_bam as merge_bam1 ; merge_bam as merge_bam2 ; merge_bam as merge_bam3 ; merge_bam as merge_bam4 ; filter_soft_clipped_alignments } from '../modules/alignment_processing' @@ -22,11 +23,16 @@ workflow clean { out_reads = bbduk.out.cleaned_reads.concat(bbduk.out.contaminated_reads) bams_bai = Channel.empty() sort_bam_ch = Channel.empty() - } + } else { - minimap2(input, contamination) | sort_bam | index_bam | ( idxstats_from_bam & flagstats_from_bam ) - - split_bam(minimap2.out.bam) + if ( params.bwa ) { + bwa_index(contamination) + bwa(input, bwa_index.out, contamination) | sort_bam | index_bam | ( idxstats_from_bam & flagstats_from_bam ) + split_bam(bwa.out.bam) + } else { + minimap2(input, contamination) | sort_bam | index_bam | ( idxstats_from_bam & flagstats_from_bam ) + split_bam(minimap2.out.bam) + } contamination_bam = split_bam.out.mapped cleaned_bam = split_bam.out.unmapped if ( params.control && 'dcs' in params.control.split(',') && params.dcs_strict ) { @@ -51,7 +57,7 @@ workflow clean { idxstats = idxstats_from_bam.out flagstats = flagstats_from_bam.out out_reads = fastq_from_bam.out - sort_bam_ch = sort_bam.out + sort_bam_ch = sort_bam.out } emit: @@ -60,5 +66,5 @@ workflow clean { flagstats out_reads bams_bai - sort_bam_ch + sort_bam_ch }