From fb759f5fb375a7d09d4b7aa3c9bd036c484fcd97 Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Wed, 15 Nov 2023 16:30:21 +0100 Subject: [PATCH 01/26] Disable all publishDir directives. Add back the publishing of 'keep' output for now, to new dir with new name. --- README.md | 43 ++++++++++++++++++++++++++ modules/alignment_processing.nf | 14 ++++----- modules/bbmap.nf | 2 +- modules/prepare_contamination.nf | 5 +-- modules/qc.nf | 2 +- modules/unused/alignment_processing.nf | 2 +- modules/utils.nf | 13 ++++++-- nextflow.config | 14 +++------ 8 files changed, 70 insertions(+), 25 deletions(-) mode change 100755 => 100644 README.md mode change 100755 => 100644 nextflow.config diff --git a/README.md b/README.md old mode 100755 new mode 100644 index 23f50c7..a43e021 --- a/README.md +++ b/README.md @@ -108,6 +108,49 @@ Included in this repository are: The icons and diagram components that make up the schematic view were originally designed by James A. Fellow Yates & nf-core under a CCO license (public domain). +## Results + +Running the pipeline will create a directory called `results/` in the current directory with some or all of the following directories and files: + +```text +results/ + clean/ + .fastq.gz + removed/ + .fastq.gz + intermedate/ + to-remove/ + mapped.fastq.gz + unmapped.fastq.gz + mapped.bam + unmapped.bam + dcs-strict/ + no-dcs.bam + true-dcs.bam + false-dcs.bam + soft-clipped/ + soft-clipped.bam + passed-clipped.bam + to-keep/ + mapped.fastq.gz + unmapped.fastq.gz + mapped.bam + unmapped.bam + dcs-strict/ + no-dcs.bam + true-dcs.bam + false-dcs.bam + soft-clipped/ + soft-clipped.bam + passed-clipped.bam + logs/\*.html + qc/multiqc_report.html +``` + +The most important files you are likely interested in are `results/clean/.fastq.gz`, which are the "cleaned" reads. These are the input reads that *do not* map to the host, control, own fasta or rRNA files (or the subset of these that you provided), plus those reads that map to the "keep" sequence if you used the `--keep` option. Any files that were removed from your input fasta file are placed in `results/removed/.fastq.gz`. + +For debugging purposes we also provide various intermediate results in the `intermediate/` folder. + ## Citations If you use `CLEAN` in your work, please consider citing our preprint: diff --git a/modules/alignment_processing.nf b/modules/alignment_processing.nf index 05fba6a..5106e17 100644 --- a/modules/alignment_processing.nf +++ b/modules/alignment_processing.nf @@ -50,7 +50,7 @@ process filter_soft_clipped_alignments { label 'samclipy' label 'smallTask' - publishDir "${params.output}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.bam*" + publishDir "${params.output}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.bam*", enabled: false input: tuple val(name), path (bam) @@ -78,7 +78,7 @@ process filter_soft_clipped_alignments { process filter_true_dcs_alignments { label 'bed_samtools' - publishDir "${params.output}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.bam*" + publishDir "${params.output}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.bam*", enabled: false input: tuple val(name), path (bam) @@ -111,7 +111,7 @@ process filter_true_dcs_alignments { process fastq_from_bam { label 'minimap2' - publishDir "${params.output}/${params.tool}/${name}", mode: 'copy', pattern: "*.gz" + publishDir "${params.output}/${params.tool}/${name}", mode: 'copy', pattern: "*.gz", enabled: false input: tuple val(name), val(type), path(bam) @@ -144,7 +144,7 @@ process fastq_from_bam { process idxstats_from_bam { label 'minimap2' - publishDir "${params.output}/minimap2/${name}", mode: 'copy', pattern: "${bam.baseName}.idxstats.tsv" + publishDir "${params.output}/minimap2/${name}", mode: 'copy', pattern: "${bam.baseName}.idxstats.tsv" , enabled: false input: tuple val(name), val(type), path(bam), path(bai) @@ -165,7 +165,7 @@ process idxstats_from_bam { process flagstats_from_bam { label 'minimap2' - publishDir "${params.output}/minimap2", mode: params.publish_dir_mode, pattern: "${bam.baseName}.flagstats.txt" + publishDir "${params.output}/minimap2", mode: params.publish_dir_mode, pattern: "${bam.baseName}.flagstats.txt" , enabled: false input: tuple val(name), val(type), path(bam), path(bai) @@ -206,8 +206,8 @@ process sort_bam { process index_bam { label 'minimap2' - publishDir "${params.output}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.bam" - publishDir "${params.output}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.bam.bai" + publishDir "${params.output}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.bam", enabled: false + publishDir "${params.output}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.bam.bai", enabled: false input: tuple val(name), val(type), path(bam) diff --git a/modules/bbmap.nf b/modules/bbmap.nf index c750232..c1f81ef 100644 --- a/modules/bbmap.nf +++ b/modules/bbmap.nf @@ -1,7 +1,7 @@ process bbduk { label 'bbmap' - publishDir "${params.output}/${params.tool}/${name}", mode: 'copy', pattern: "*.gz" + publishDir "${params.output}/${params.tool}/${name}", mode: 'copy', pattern: "*.gz", enabled: false input: tuple val(name), path(reads) diff --git a/modules/prepare_contamination.nf b/modules/prepare_contamination.nf index 60b14b5..45cb44e 100644 --- a/modules/prepare_contamination.nf +++ b/modules/prepare_contamination.nf @@ -2,7 +2,7 @@ process download_host { label 'minimap2' if (params.cloudProcess) { - publishDir "${params.databases}/hosts", mode: params.publish_dir_mode, pattern: "*.fa.gz" + publishDir "${params.databases}/hosts", mode: params.publish_dir_mode, pattern: "*.fa.gz" } else { storeDir "${params.databases}/hosts" @@ -78,9 +78,6 @@ process check_own { process concat_contamination { label 'minimap2' - publishDir "${params.output}/", mode: params.publish_dir_mode, pattern: "db.fa.gz" - publishDir "${params.output}/", mode: params.publish_dir_mode, pattern: "db.fa.fai" - input: path fastas diff --git a/modules/qc.nf b/modules/qc.nf index 62a45e7..239d80a 100644 --- a/modules/qc.nf +++ b/modules/qc.nf @@ -94,7 +94,7 @@ process multiqc { label 'multiqc' // label 'smallTask' // this is tricky for the cloud, because only one container can be mounted. Thus comment this as hot fix - publishDir "${params.output}/${params.multiqc_dir}", mode: params.publish_dir_mode, pattern: 'multiqc_report.html' + publishDir "${params.output}/${params.multiqc_dir}", mode: params.publish_dir_mode, pattern: 'multiqc_report.html', enabled: false input: path(config) diff --git a/modules/unused/alignment_processing.nf b/modules/unused/alignment_processing.nf index 28f0461..4fc9bdf 100644 --- a/modules/unused/alignment_processing.nf +++ b/modules/unused/alignment_processing.nf @@ -33,7 +33,7 @@ process filter_un_mapped_alignments { process make_mapped_bam { label 'minimap2' - publishDir "${params.output}/${name}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.mapped.bam*" + publishDir "${params.output}/${name}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.mapped.bam*", enabled: false input: tuple val(name), path(sam), path(reads) diff --git a/modules/utils.nf b/modules/utils.nf index 5a69b8d..5b62ec2 100644 --- a/modules/utils.nf +++ b/modules/utils.nf @@ -67,7 +67,16 @@ process filter_fastq_by_name { label 'minimap2' // We don't need minimap2 but the container has pigz if ( params.keep ) { - publishDir "${params.output}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.gz" + publishDir ( + path: params.output, + mode: params.publish_dir_mode, + pattern: "*.gz", + saveAs: { fn -> + fn.endsWith('.unmapped.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped.fastq.gz$', '.fastq.gz') : + fn.endsWith('.mapped.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped.fastq.gz$', '.fastq.gz') : + fn + } + ) } input: @@ -140,7 +149,7 @@ process filter_fastq_by_name { process bbdukStats { label 'smallTask' - publishDir "${params.output}/bbduk", mode: params.publish_dir_mode, pattern: "${name}_stats.txt" + publishDir "${params.output}/bbduk", mode: params.publish_dir_mode, pattern: "${name}_stats.txt", enabled: false input: tuple val(name), path (bbdukStats) diff --git a/nextflow.config b/nextflow.config old mode 100755 new mode 100644 index 40e18fe..344106f --- a/nextflow.config +++ b/nextflow.config @@ -32,8 +32,8 @@ params { // folder structure output = 'results' - multiqc_dir = 'Summary' - nf_runinfo_dir = 'Logs' + multiqc_dir = 'qc' + nf_runinfo_dir = 'logs' // location for storing the conda or singularity environments condaCacheDir = 'conda' @@ -65,7 +65,7 @@ report { profiles { - //executors + // executors local { executor { name = "local" @@ -97,7 +97,7 @@ profiles { } - //engines + // engines docker { docker { enabled = true } includeConfig 'configs/container.config' @@ -130,7 +130,7 @@ profiles { } - //pre-merged + // pre-merged standard { params.cloudProcess = false includeConfig 'configs/local.config' @@ -230,9 +230,5 @@ profiles { process { withLabel: noDocker { cpus = 1; memory = '4.0 GB'; container = 'nanozoo/template:3.8--ccd0653' } } - } - - - } From df2497e042117766a0f53b7785a12622f8429a5f Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Thu, 16 Nov 2023 13:25:49 +0100 Subject: [PATCH 02/26] Place fastq files that map to 'keep' genome and to the host/control/own/rrna 'remove' genomes in appropriate intermediate folders --- modules/alignment_processing.nf | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/modules/alignment_processing.nf b/modules/alignment_processing.nf index 5106e17..0ee46bc 100644 --- a/modules/alignment_processing.nf +++ b/modules/alignment_processing.nf @@ -111,7 +111,32 @@ process filter_true_dcs_alignments { process fastq_from_bam { label 'minimap2' - publishDir "${params.output}/${params.tool}/${name}", mode: 'copy', pattern: "*.gz", enabled: false + + publishDir ( + path: "${params.output}/intermediate", + mode: params.publish_dir_mode, + pattern: "*.gz", + overwrite: false, + saveAs: { fn -> + fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" + } + ) + + if ( !params.keep ) { + publishDir ( + path: params.output, + overwrite: false, + mode: params.publish_dir_mode, + pattern: "*.gz", + saveAs: { fn -> + fn.endsWith('.unmapped.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped.fastq.gz$', '.fastq.gz') : + fn.endsWith('.mapped.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped.fastq.gz$', '.fastq.gz') : + fn.endsWith('.unmapped.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped.fastq.gz$', '.fastq.gz') : + fn.endsWith('.mapped.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped.fastq.gz$', '.fastq.gz') : + fn + } + ) + } input: tuple val(name), val(type), path(bam) From f739f8b18eb4ba91568a9bc89981018e0d312f0b Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Thu, 16 Nov 2023 13:54:11 +0100 Subject: [PATCH 03/26] Re-publish multiqc report to the qc/ directory (by default) --- modules/qc.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/qc.nf b/modules/qc.nf index 239d80a..62a45e7 100644 --- a/modules/qc.nf +++ b/modules/qc.nf @@ -94,7 +94,7 @@ process multiqc { label 'multiqc' // label 'smallTask' // this is tricky for the cloud, because only one container can be mounted. Thus comment this as hot fix - publishDir "${params.output}/${params.multiqc_dir}", mode: params.publish_dir_mode, pattern: 'multiqc_report.html', enabled: false + publishDir "${params.output}/${params.multiqc_dir}", mode: params.publish_dir_mode, pattern: 'multiqc_report.html' input: path(config) From b2ec3556e556af7eb477e2b6cbaec4655a798ca2 Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Thu, 16 Nov 2023 15:58:44 +0100 Subject: [PATCH 04/26] Only publish sorted.bam{,.bai} intermediate bam files --- modules/alignment_processing.nf | 19 +++++++++++++------ modules/utils.nf | 2 +- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/modules/alignment_processing.nf b/modules/alignment_processing.nf index 0ee46bc..c7baca1 100644 --- a/modules/alignment_processing.nf +++ b/modules/alignment_processing.nf @@ -50,7 +50,7 @@ process filter_soft_clipped_alignments { label 'samclipy' label 'smallTask' - publishDir "${params.output}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.bam*", enabled: false + publishDir "${params.output}/intermediate/soft-clipped", mode: params.publish_dir_mode, pattern: "*.bam*", overwrite: false input: tuple val(name), path (bam) @@ -78,7 +78,7 @@ process filter_soft_clipped_alignments { process filter_true_dcs_alignments { label 'bed_samtools' - publishDir "${params.output}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.bam*", enabled: false + publishDir "${params.output}/intermediate/true-dcs", mode: params.publish_dir_mode, pattern: "*.bam*", overwrite: false input: tuple val(name), path (bam) @@ -169,7 +169,7 @@ process fastq_from_bam { process idxstats_from_bam { label 'minimap2' - publishDir "${params.output}/minimap2/${name}", mode: 'copy', pattern: "${bam.baseName}.idxstats.tsv" , enabled: false + publishDir "${params.output}/qc", mode: 'copy', pattern: "${bam.baseName}.idxstats.tsv", overwrite: false, enabled: false input: tuple val(name), val(type), path(bam), path(bai) @@ -190,7 +190,7 @@ process idxstats_from_bam { process flagstats_from_bam { label 'minimap2' - publishDir "${params.output}/minimap2", mode: params.publish_dir_mode, pattern: "${bam.baseName}.flagstats.txt" , enabled: false + publishDir "${params.output}/qc", mode: params.publish_dir_mode, pattern: "${bam.baseName}.flagstats.txt", overwrite: false, enabled: false input: tuple val(name), val(type), path(bam), path(bai) @@ -231,8 +231,15 @@ process sort_bam { process index_bam { label 'minimap2' - publishDir "${params.output}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.bam", enabled: false - publishDir "${params.output}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.bam.bai", enabled: false + publishDir ( + path: "${params.output}/intermediate", + mode: params.publish_dir_mode, + pattern: "*.sorted.bam{,.bai}", + overwrite: false, + saveAs: { fn -> + fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" + } + ) input: tuple val(name), val(type), path(bam) diff --git a/modules/utils.nf b/modules/utils.nf index 5b62ec2..acbc21f 100644 --- a/modules/utils.nf +++ b/modules/utils.nf @@ -149,7 +149,7 @@ process filter_fastq_by_name { process bbdukStats { label 'smallTask' - publishDir "${params.output}/bbduk", mode: params.publish_dir_mode, pattern: "${name}_stats.txt", enabled: false + publishDir "${params.output}/qc", mode: params.publish_dir_mode, pattern: "${name}_stats.txt" input: tuple val(name), path (bbdukStats) From d91f94f0d1e02205bf6fea2bca03f78d7833eb8d Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Thu, 16 Nov 2023 16:56:32 +0100 Subject: [PATCH 05/26] Publish idxstats and flagstats so they are next to their bam files --- modules/alignment_processing.nf | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/modules/alignment_processing.nf b/modules/alignment_processing.nf index c7baca1..1671186 100644 --- a/modules/alignment_processing.nf +++ b/modules/alignment_processing.nf @@ -169,7 +169,15 @@ process fastq_from_bam { process idxstats_from_bam { label 'minimap2' - publishDir "${params.output}/qc", mode: 'copy', pattern: "${bam.baseName}.idxstats.tsv", overwrite: false, enabled: false + publishDir ( + path: "${params.output}/intermediate", + mode: params.publish_dir_mode, + pattern: "*.sorted.idxstats.tsv", + overwrite: false, + saveAs: { fn -> + fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" + } + ) input: tuple val(name), val(type), path(bam), path(bai) @@ -190,7 +198,15 @@ process idxstats_from_bam { process flagstats_from_bam { label 'minimap2' - publishDir "${params.output}/qc", mode: params.publish_dir_mode, pattern: "${bam.baseName}.flagstats.txt", overwrite: false, enabled: false + publishDir ( + path: "${params.output}/intermediate", + mode: params.publish_dir_mode, + pattern: "*.sorted.flagstats.txt", + overwrite: false, + saveAs: { fn -> + fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" + } + ) input: tuple val(name), val(type), path(bam), path(bai) From 82b60c074cd199f29f0bb3a97d69cd9bb24c43da Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Fri, 17 Nov 2023 14:47:04 +0100 Subject: [PATCH 06/26] Handle --strict_dcs and --min_clip output in results dir --- modules/alignment_processing.nf | 64 +++++++++++++++++++++------------ modules/utils.nf | 7 ++++ 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/modules/alignment_processing.nf b/modules/alignment_processing.nf index 1671186..cc708d0 100644 --- a/modules/alignment_processing.nf +++ b/modules/alignment_processing.nf @@ -50,44 +50,60 @@ process filter_soft_clipped_alignments { label 'samclipy' label 'smallTask' - publishDir "${params.output}/intermediate/soft-clipped", mode: params.publish_dir_mode, pattern: "*.bam*", overwrite: false + publishDir ( + path: "${params.output}/intermediate", + mode: params.publish_dir_mode, + pattern: "${name}*.bam{,.bai}", + overwrite: false, + saveAs: { fn -> + fn.startsWith("keep_") ? "map-to-keep/soft-clipped/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/soft-clipped/${fn}" + } + ) input: tuple val(name), path (bam) val (minClip) output: - tuple val(name), val('unmapped'), path ('*.softclipped.bam'), emit: bam_clipped - tuple val(name), val('mapped'), path ('*.passedclipped.bam'), emit: bam_ok_clipped + tuple val(name), val('unmapped'), path ('*.soft-clipped.bam'), emit: bam_clipped + tuple val(name), val('mapped'), path ('*.passed-clipped.bam'), emit: bam_ok_clipped tuple val(name), path ('*.bam.bai') script: """ git clone https://github.com/MarieLataretu/samclipy.git --branch v0.0.2 || git clone git@github.com:MarieLataretu/samclipy.git --branch v0.0.2 - samtools view -h ${bam} | python samclipy/samclipy.py --invert --minClip ${minClip} | samtools sort > ${name}.softclipped.bam - samtools view -h ${bam} | python samclipy/samclipy.py --minClip ${minClip} | samtools sort > ${name}.passedclipped.bam - samtools index ${name}.softclipped.bam - samtools index ${name}.passedclipped.bam + samtools view -h ${bam} | python samclipy/samclipy.py --invert --minClip ${minClip} | samtools sort > ${name}.soft-clipped.bam + samtools view -h ${bam} | python samclipy/samclipy.py --minClip ${minClip} | samtools sort > ${name}.passed-clipped.bam + samtools index ${name}.soft-clipped.bam + samtools index ${name}.passed-clipped.bam """ stub: """ - touch ${name}.softclipped.bam ${name}.passedclipped.bam ${name}.softclipped.bam.bai ${name}.passedclipped.bam.bai + touch ${name}.soft-clipped.bam ${name}.passed-clipped.bam ${name}.soft-clipped.bam.bai ${name}.passed-clipped.bam.bai """ } process filter_true_dcs_alignments { label 'bed_samtools' - publishDir "${params.output}/intermediate/true-dcs", mode: params.publish_dir_mode, pattern: "*.bam*", overwrite: false + publishDir ( + path: "${params.output}/intermediate", + mode: params.publish_dir_mode, + pattern: "${name}*.bam{,.bai}", + overwrite: false, + saveAs: { fn -> + fn.startsWith("keep_") ? "map-to-keep/strict-dcs/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/strict-dcs/${fn}" + } + ) input: tuple val(name), path (bam) path (dcs_ends_bed) output: - tuple val(name), val('mapped'), path ("${name}.no_dcs.bam"), emit: no_dcs - tuple val(name), val('mapped'), path ("${name}.true_dcs.bam"), emit: true_dcs - tuple val(name), val('unmapped'), path ("${name}.false_dcs.bam"), emit: false_dcs + tuple val(name), val('mapped'), path ("${name}.no-dcs.bam"), emit: no_dcs + tuple val(name), val('mapped'), path ("${name}.true-dcs.bam"), emit: true_dcs + tuple val(name), val('unmapped'), path ("${name}.false-dcs.bam"), emit: false_dcs tuple val(name), path ('*.bam.bai') path('dcs.bam') @@ -95,17 +111,17 @@ process filter_true_dcs_alignments { """ # true spike in: 1-65 || 1-92; 3513-3560 (len 48) samtools view -b -h -e 'rname=="Lambda_3.6kb"' ${bam} > dcs.bam - samtools view -b -h -e 'rname!="Lambda_3.6kb"' ${bam} > ${name}.no_dcs.bam - bedtools intersect -wa -ubam -a dcs.bam -b ${dcs_ends_bed} > ${name}.true_dcs.bam - bedtools intersect -v -ubam -a dcs.bam -b ${dcs_ends_bed} > ${name}.false_dcs.bam + samtools view -b -h -e 'rname!="Lambda_3.6kb"' ${bam} > ${name}.no-dcs.bam + bedtools intersect -wa -ubam -a dcs.bam -b ${dcs_ends_bed} > ${name}.true-dcs.bam + bedtools intersect -v -ubam -a dcs.bam -b ${dcs_ends_bed} > ${name}.false-dcs.bam samtools index dcs.bam - samtools index ${name}.no_dcs.bam - samtools index ${name}.true_dcs.bam - samtools index ${name}.false_dcs.bam + samtools index ${name}.no-dcs.bam + samtools index ${name}.true-dcs.bam + samtools index ${name}.false-dcs.bam """ stub: """ - touch ${name}.no_dcs.bam ${name}.true_dcs.bam ${name}.false_dcs.bam ${name}.no_dcs.bam.bai ${name}.true_dcs.bam.bai ${name}.false_dcs.bam.bai + touch ${name}.no-dcs.bam ${name}.true-dcs.bam ${name}.false-dcs.bam ${name}.no-dcs.bam.bai ${name}.true-dcs.bam.bai ${name}.false-dcs.bam.bai """ } @@ -118,10 +134,12 @@ process fastq_from_bam { pattern: "*.gz", overwrite: false, saveAs: { fn -> - fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" + fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '').replaceAll(~'_merged', '')}" : "map-to-remove/${fn.replaceAll(~'_merged', '')}" } ) + // When using --keep, cleaned fastq files are generated by the + // filter_fastq_by_name process and not here if ( !params.keep ) { publishDir ( path: params.output, @@ -131,8 +149,10 @@ process fastq_from_bam { saveAs: { fn -> fn.endsWith('.unmapped.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped.fastq.gz$', '.fastq.gz') : fn.endsWith('.mapped.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped.fastq.gz$', '.fastq.gz') : - fn.endsWith('.unmapped.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped.fastq.gz$', '.fastq.gz') : - fn.endsWith('.mapped.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped.fastq.gz$', '.fastq.gz') : + fn.endsWith('.unmapped_merged.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped_merged.fastq.gz$', '.fastq.gz') : + fn.endsWith('.mapped_merged.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped_merged.fastq.gz$', '.fastq.gz') : + fn.endsWith('.unmapped_merged_merged.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped_merged_merged.fastq.gz$', '.fastq.gz') : + fn.endsWith('.soft-clipped_merged.fastq.gz') ? "clean/${fn}".replaceAll(~'.soft-clipped_merged.fastq.gz$', '.fastq.gz') : fn } ) diff --git a/modules/utils.nf b/modules/utils.nf index acbc21f..5feea0f 100644 --- a/modules/utils.nf +++ b/modules/utils.nf @@ -66,6 +66,8 @@ process get_read_names { process filter_fastq_by_name { label 'minimap2' // We don't need minimap2 but the container has pigz + // When using --keep, this is where the final cleaned fastq file is + // generated. If not, it's generated by the fastq_from_bam process if ( params.keep ) { publishDir ( path: params.output, @@ -74,6 +76,11 @@ process filter_fastq_by_name { saveAs: { fn -> fn.endsWith('.unmapped.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped.fastq.gz$', '.fastq.gz') : fn.endsWith('.mapped.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped.fastq.gz$', '.fastq.gz') : + fn.endsWith('.unmapped_merged.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped_merged.fastq.gz$', '.fastq.gz') : + fn.endsWith('.mapped_merged.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped_merged.fastq.gz$', '.fastq.gz') : + fn.endsWith('.unmapped_merged_merged.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped_merged_merged.fastq.gz$', '.fastq.gz') : + fn.endsWith('.mapped_merged_merged.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped_merged_merged.fastq.gz$', '.fastq.gz') : + fn.endsWith('.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped_merged_merged.fastq.gz$', '.fastq.gz') : fn } ) From fc1318bba66f12280a4d1ed2f1be0b54c2527417 Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Fri, 17 Nov 2023 17:41:06 +0100 Subject: [PATCH 07/26] First steps towards making illumina/bbduk results dir look right --- modules/bbmap.nf | 35 +++++++++++++++++++++++++++++------ modules/utils.nf | 26 +++++++++++++++++--------- workflows/clean_wf.nf | 8 ++++---- 3 files changed, 50 insertions(+), 19 deletions(-) diff --git a/modules/bbmap.nf b/modules/bbmap.nf index c1f81ef..c020a13 100644 --- a/modules/bbmap.nf +++ b/modules/bbmap.nf @@ -1,7 +1,30 @@ process bbduk { label 'bbmap' + + publishDir ( + path: "${params.output}/intermediate", + mode: params.publish_dir_mode, + pattern: "*.gz", + overwrite: false, + saveAs: { fn -> + fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '').replaceAll(~'_merged', '')}" : "map-to-remove/${fn.replaceAll(~'_merged', '')}" + } + ) - publishDir "${params.output}/${params.tool}/${name}", mode: 'copy', pattern: "*.gz", enabled: false + //if ( !params.keep ) { + if ( true ) { + publishDir ( + path: params.output, + overwrite: false, + mode: params.publish_dir_mode, + pattern: "*.gz", + saveAs: { fn -> + fn.endsWith('.clean.fastq.gz') ? "clean/${fn}".replaceAll(~'.fastq.clean.fastq.gz$', '.fastq.gz') : + fn.endsWith('.contamination.fastq.gz') ? "removed/${fn}".replaceAll(~'.fastq.contamination.fastq.gz$', '.fastq.gz') : + fn + } + ) + } input: tuple val(name), path(reads) @@ -11,13 +34,13 @@ process bbduk { val name, emit: name tuple val(name), val('clean'), path('*clean.fastq.gz'), emit: cleaned_reads tuple val(name), val('contamination'), path('*contamination.fastq.gz'), emit: contaminated_reads - tuple val(name), path("${name}_bbduk_stats.txt"), emit: stats + tuple val(name), path("${name}.bbduk_stats.txt"), emit: stats script: if ( params.lib_pairedness == 'paired' ) { """ MEM=\$(echo ${task.memory} | sed 's/ GB//g') - bbduk.sh -Xmx\${MEM}g ref=${db} threads=${task.cpus} stats=${name}_bbduk_stats.txt ordered=t k=${params.bbduk_kmer} in=${reads[0]} in2=${reads[1]} out=${reads[0].baseName}.clean.fastq out2=${reads[1].baseName}.clean.fastq outm=${reads[0].baseName}.contamination.fastq outm2=${reads[1].baseName}.contamination.fastq + bbduk.sh -Xmx\${MEM}g ref=${db} threads=${task.cpus} stats=${name}.bbduk_stats.txt ordered=t k=${params.bbduk_kmer} in=${reads[0]} in2=${reads[1]} out=${reads[0].baseName}.clean.fastq out2=${reads[1].baseName}.clean.fastq outm=${reads[0].baseName}.contamination.fastq outm2=${reads[1].baseName}.contamination.fastq gzip --no-name *.clean.fastq gzip --no-name *.contamination.fastq @@ -25,7 +48,7 @@ process bbduk { } else if ( params.lib_pairedness == 'single' ) { """ MEM=\$(echo ${task.memory} | sed 's/ GB//g') - bbduk.sh -Xmx\${MEM}g ref=${db} threads=${task.cpus} stats=${name}_bbduk_stats.txt ordered=t k=${params.bbduk_kmer} in=${reads} out=${name}.clean.fastq outm=${name}.contamination.fastq + bbduk.sh -Xmx\${MEM}g ref=${db} threads=${task.cpus} stats=${name}.bbduk_stats.txt ordered=t k=${params.bbduk_kmer} in=${reads} out=${name}.clean.fastq outm=${name}.contamination.fastq gzip --no-name *.clean.fastq gzip --no-name *.contamination.fastq @@ -36,12 +59,12 @@ process bbduk { stub: if ( params.lib_pairedness == 'paired' ) { """ - touch ${name}_bbduk_stats.txt + touch ${name}.bbduk_stats.txt touch ${reads[0].baseName}.clean.fastq.gz ${reads[0].baseName}.contamination.fastq.gz ${reads[1].baseName}.clean.fastq.gz ${reads[1].baseName}.contamination.fastq.gz """ } else if ( params.lib_pairedness == 'single' ) { """ - touch ${name}_bbduk_stats.txt + touch ${name}.bbduk_stats.txt touch ${name}.contamination.fastq.gz ${name}.clean.fastq.gz """ } diff --git a/modules/utils.nf b/modules/utils.nf index 5feea0f..6500503 100644 --- a/modules/utils.nf +++ b/modules/utils.nf @@ -153,17 +153,25 @@ process filter_fastq_by_name { // """ // } -process bbdukStats { +process bbduk_stats { label 'smallTask' - publishDir "${params.output}/qc", mode: params.publish_dir_mode, pattern: "${name}_stats.txt" + publishDir ( + path: "${params.output}/intermediate", + mode: params.publish_dir_mode, + pattern: "*.stats.tsv", + overwrite: false, + saveAs: { fn -> + fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" + } + ) input: tuple val(name), path (bbdukStats) output: - tuple val(name), path ("${name}_stats.txt") - path("${name}_bbduk_stats.tsv"), emit: tsv + tuple val(name), path ("${name}.stats.txt") + path("${name}.bbduk_stats.tsv"), emit: tsv script: """ @@ -173,21 +181,21 @@ process bbdukStats { FA=\$(awk -F '\\t' '/^[^#]/ {print "\\t\\t"\$2" ("\$3") aligned to "\$1}' ${bbdukStats}) - touch ${name}_stats.txt - cat <> ${name}_stats.txt + touch ${name}.stats.txt + cat <> ${name}.stats.txt \$TOTAL reads in total; of these: \t\$MNUM (\$MPER) reads were properly mapped; of these: \$FA EOF - touch ${name}_bbduk_stats.tsv - cat <> ${name}_bbduk_stats.tsv + touch ${name}.bbduk_stats.tsv + cat <> ${name}.bbduk_stats.tsv Sample Name\tClean reads\tMapped reads ${name}\t\$((\$TOTAL-\$MNUM))\t\$MNUM EOF """ stub: """ - touch ${name}_stats.txt ${name}_bbduk_stats.tsv + touch ${name}.stats.txt ${name}.bbduk_stats.tsv """ } diff --git a/workflows/clean_wf.nf b/workflows/clean_wf.nf index 1a431ca..c066a4e 100644 --- a/workflows/clean_wf.nf +++ b/workflows/clean_wf.nf @@ -1,6 +1,6 @@ include { minimap2 } from '../modules/minimap2' include { bbduk } from '../modules/bbmap' -include { bbdukStats } from '../modules/utils' +include { bbduk_stats } from '../modules/utils' include { split_bam; fastq_from_bam ; idxstats_from_bam ; flagstats_from_bam ; index_bam as index_bam; index_bam as index_bam2; sort_bam ; filter_true_dcs_alignments ; merge_bam as merge_bam1 ; merge_bam as merge_bam2 ; merge_bam as merge_bam3 ; merge_bam as merge_bam4 ; filter_soft_clipped_alignments } from '../modules/alignment_processing' workflow clean { @@ -13,9 +13,9 @@ workflow clean { if ( params.bbduk ) { // map bbduk(input, contamination) - bbdukStats(bbduk.out.stats) + bbduk_stats(bbduk.out.stats) // define output - bbduk_summary = bbdukStats.out.tsv + bbduk_summary = bbduk_stats.out.tsv idxstats = Channel.empty() flagstats = Channel.empty() out_reads = bbduk.out.cleaned_reads.concat(bbduk.out.contaminated_reads) @@ -57,4 +57,4 @@ workflow clean { flagstats out_reads bams_bai -} \ No newline at end of file +} From f03c8a5711e834402251fed2224ff34ee356e25c Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Sun, 19 Nov 2023 21:22:51 +0100 Subject: [PATCH 08/26] Add bed_samtools container bump minimap2 container (from main) --- configs/container.config | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/configs/container.config b/configs/container.config index 733c0e6..598f12a 100644 --- a/configs/container.config +++ b/configs/container.config @@ -1,9 +1,10 @@ process { withLabel: smallTask { container = 'nanozoo/samtools:1.14--d8fb865' } - withLabel: minimap2 { container = 'nanozoo/minimap2:2.26--ef54a1d' } + withLabel: minimap2 { container = 'nanozoo/minimap2:2.26--d9ef6b6' } withLabel: bbmap { container = 'nanozoo/bbmap:38.79--8e915d7' } withLabel: multiqc { container = 'nanozoo/multiqc:1.9--aba729b' } withLabel: fastqc { container = 'nanozoo/fastqc:0.11.9--f61b8b4' } withLabel: nanoplot { container = 'nanozoo/nanoplot:1.32.0--1ae6f5d' } withLabel: quast { container = 'nanozoo/quast:5.0.2--e7f0cfe' } -} \ No newline at end of file + withLabel: bed_samtools { container = 'nanozoo/bed_samtools:2.30.0--cc7d1b9' } +} From 619c205e276359996896f405b98e6018c438bb21 Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Tue, 21 Nov 2023 14:03:40 +0100 Subject: [PATCH 09/26] --keep is now working with bbduk. results/ is properly populated. --- clean.nf | 2 +- configs/conda.config | 1 + configs/container.config | 5 +++-- modules/bbmap.nf | 19 ++++++++--------- modules/utils.nf | 27 +++++++++++++++++++++--- workflows/clean_wf.nf | 3 ++- workflows/keep_wf.nf | 44 ++++++++++++++++++++++++++-------------- 7 files changed, 69 insertions(+), 32 deletions(-) diff --git a/clean.nf b/clean.nf index 894b017..69c27fb 100755 --- a/clean.nf +++ b/clean.nf @@ -194,7 +194,7 @@ workflow { prepare_contamination(nanoControlFastaChannel, illuminaControlFastaChannel, rRNAChannel, hostNameChannel, ownFastaChannel) contamination = prepare_contamination.out - clean(input_ch, contamination, nanoControlBedChannel) + clean(input_ch, contamination, nanoControlBedChannel, 'map-to-remove') if (params.keep){ prepare_keep(keepFastaChannel) diff --git a/configs/conda.config b/configs/conda.config index 700a584..d13d30d 100644 --- a/configs/conda.config +++ b/configs/conda.config @@ -8,4 +8,5 @@ process { withLabel: nanoplot { conda = "$baseDir/envs/nanoplot.yaml" } withLabel: quast { conda = "$baseDir/envs/quast.yaml" } withLabel: bed_samtools { conda = "$baseDir/envs/bed_samtools.yaml" } + withLabel: seqkit { conda = "$baseDir/envs/seqkit.yaml" } } diff --git a/configs/container.config b/configs/container.config index 598f12a..6d310ac 100644 --- a/configs/container.config +++ b/configs/container.config @@ -1,10 +1,11 @@ process { - withLabel: smallTask { container = 'nanozoo/samtools:1.14--d8fb865' } + withLabel: smallTask { container = 'nanozoo/samtools:1.14--d8fb865' } withLabel: minimap2 { container = 'nanozoo/minimap2:2.26--d9ef6b6' } - withLabel: bbmap { container = 'nanozoo/bbmap:38.79--8e915d7' } + withLabel: bbmap { container = 'nanozoo/bbmap:38.79--8e915d7' } withLabel: multiqc { container = 'nanozoo/multiqc:1.9--aba729b' } withLabel: fastqc { container = 'nanozoo/fastqc:0.11.9--f61b8b4' } withLabel: nanoplot { container = 'nanozoo/nanoplot:1.32.0--1ae6f5d' } withLabel: quast { container = 'nanozoo/quast:5.0.2--e7f0cfe' } withLabel: bed_samtools { container = 'nanozoo/bed_samtools:2.30.0--cc7d1b9' } + withLabel: seqkit { container = 'quay.io/biocontainers/seqkit:2.6.1--h9ee0642_0' } } diff --git a/modules/bbmap.nf b/modules/bbmap.nf index c020a13..4326ac4 100644 --- a/modules/bbmap.nf +++ b/modules/bbmap.nf @@ -2,17 +2,15 @@ process bbduk { label 'bbmap' publishDir ( - path: "${params.output}/intermediate", + path: "${params.output}/intermediate/${map_target}", mode: params.publish_dir_mode, - pattern: "*.gz", - overwrite: false, - saveAs: { fn -> - fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '').replaceAll(~'_merged', '')}" : "map-to-remove/${fn.replaceAll(~'_merged', '')}" - } + pattern: "*.{clean,contamination}.fastq.gz", + overwrite: false ) - //if ( !params.keep ) { - if ( true ) { + // When using `--keep`, we need to do further processing before we have + // the final clean and removed data sets. + if ( !params.keep ) { publishDir ( path: params.output, overwrite: false, @@ -29,11 +27,12 @@ process bbduk { input: tuple val(name), path(reads) path db + val(map_target) output: val name, emit: name - tuple val(name), val('clean'), path('*clean.fastq.gz'), emit: cleaned_reads - tuple val(name), val('contamination'), path('*contamination.fastq.gz'), emit: contaminated_reads + tuple val(name), val('unmapped'), path('*clean.fastq.gz'), emit: cleaned_reads + tuple val(name), val('mapped'), path('*contamination.fastq.gz'), emit: contaminated_reads tuple val(name), path("${name}.bbduk_stats.txt"), emit: stats script: diff --git a/modules/utils.nf b/modules/utils.nf index 6500503..40c0c2b 100644 --- a/modules/utils.nf +++ b/modules/utils.nf @@ -63,11 +63,31 @@ process get_read_names { """ } +process get_read_names_fastx { + label 'seqkit' + + input: + tuple val(name), path(fastx) + + output: + tuple val(name), path("${name}.ids") + + script: + """ + seqkit seq -ni ${fastx} | sort | uniq > ${name}.ids + """ + stub: + """ + touch ${name}.ids + """ +} + process filter_fastq_by_name { label 'minimap2' // We don't need minimap2 but the container has pigz // When using --keep, this is where the final cleaned fastq file is - // generated. If not, it's generated by the fastq_from_bam process + // generated. If not, it's generated by the fastq_from_bam process or + // bbduk mapping if ( params.keep ) { publishDir ( path: params.output, @@ -80,7 +100,8 @@ process filter_fastq_by_name { fn.endsWith('.mapped_merged.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped_merged.fastq.gz$', '.fastq.gz') : fn.endsWith('.unmapped_merged_merged.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped_merged_merged.fastq.gz$', '.fastq.gz') : fn.endsWith('.mapped_merged_merged.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped_merged_merged.fastq.gz$', '.fastq.gz') : - fn.endsWith('.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped_merged_merged.fastq.gz$', '.fastq.gz') : + fn.endsWith('.fastq.clean.fastq.gz') ? "clean/${fn}".replaceAll(~'.fastq.clean.fastq.gz$', '.fastq.gz') : + fn.endsWith('.fastq.contamination.fastq.gz') ? "removed/${fn}".replaceAll(~'.fastq.contamination.fastq.gz$', '.fastq.gz') : fn } ) @@ -159,7 +180,7 @@ process bbduk_stats { publishDir ( path: "${params.output}/intermediate", mode: params.publish_dir_mode, - pattern: "*.stats.tsv", + pattern: "*.bbduk_stats.tsv", overwrite: false, saveAs: { fn -> fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" diff --git a/workflows/clean_wf.nf b/workflows/clean_wf.nf index c066a4e..b61ed5b 100644 --- a/workflows/clean_wf.nf +++ b/workflows/clean_wf.nf @@ -8,11 +8,12 @@ workflow clean { input contamination dcs_ends_bed + map_target main: if ( params.bbduk ) { // map - bbduk(input, contamination) + bbduk(input, contamination, map_target) bbduk_stats(bbduk.out.stats) // define output bbduk_summary = bbduk_stats.out.tsv diff --git a/workflows/keep_wf.nf b/workflows/keep_wf.nf index 11ca00d..12605a4 100644 --- a/workflows/keep_wf.nf +++ b/workflows/keep_wf.nf @@ -1,6 +1,6 @@ include { clean as keep_map } from './clean_wf' -include { get_read_names; filter_fastq_by_name } from '../modules/utils' +include { get_read_names; get_read_names_fastx; filter_fastq_by_name } from '../modules/utils' include { idxstats_from_bam ; flagstats_from_bam } from '../modules/alignment_processing' workflow keep { @@ -11,25 +11,39 @@ workflow keep { un_mapped_clean_fastq main: - keep_map(input, keep_reference, dcs_ends_bed) + keep_map(input, keep_reference, dcs_ends_bed, 'map-to-keep') - keep_reads_bam = keep_map.out.bams_bai.filter{ it[1] == 'mapped' } - get_read_names(keep_reads_bam.map{it -> [it[0], it[2]]}) - // works also for multiple samples? - - // mv keep mapped reads from cleaned mapped to clean unmapped - filter_fastq_by_name(get_read_names.out.map{ it -> [it[0].replaceAll("^keep_",""), it[1]] }.join(un_mapped_clean_fastq)) + if ( params.bbduk ) { + keep_reads_fastx = keep_map.out.out_reads.filter{ it[1] == 'mapped' } + get_read_names_fastx(keep_reads_fastx.map{it -> [it[0], it[2]]}) + + filter_fastq_by_name(get_read_names_fastx.out.map{ it -> [it[0].replaceAll("^keep_",""), it[1]] }.join(un_mapped_clean_fastq)) + + // These channels are not populated when we're working using bbduk + idxstats = Channel.empty() + flagstats = Channel.empty() + keep_reads_bam = Channel.empty() + out_reads = filter_fastq_by_name.out.mapped_no_keep.mix(filter_fastq_by_name.out.unmapped_keep) + } else { + + keep_reads_bam = keep_map.out.bams_bai.filter{ it[1] == 'mapped' } + get_read_names(keep_reads_bam.map{it -> [it[0], it[2]]}) + // works also for multiple samples? + + // mv keep mapped reads from cleaned mapped to clean unmapped + filter_fastq_by_name(get_read_names.out.map{ it -> [it[0].replaceAll("^keep_",""), it[1]] }.join(un_mapped_clean_fastq)) - // QC - idxstats = idxstats_from_bam(keep_reads_bam) - flagstats = flagstats_from_bam(keep_reads_bam) + // QC + idxstats = idxstats_from_bam(keep_reads_bam) + flagstats = flagstats_from_bam(keep_reads_bam) - idxstats = idxstats_from_bam.out - flagstats = flagstats_from_bam.out - out_reads = filter_fastq_by_name.out.mapped_no_keep.mix(filter_fastq_by_name.out.unmapped_keep) + idxstats = idxstats_from_bam.out + flagstats = flagstats_from_bam.out + out_reads = filter_fastq_by_name.out.mapped_no_keep.mix(filter_fastq_by_name.out.unmapped_keep) + } emit: idxstats flagstats out_reads keep_reads_bam -} \ No newline at end of file +} From 4611647dbc1400fb80232779fb85fc9091cb630f Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Tue, 21 Nov 2023 14:11:45 +0100 Subject: [PATCH 10/26] Add seqkit conda environment file --- envs/seqkit.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 envs/seqkit.yaml diff --git a/envs/seqkit.yaml b/envs/seqkit.yaml new file mode 100644 index 0000000..3dc07d1 --- /dev/null +++ b/envs/seqkit.yaml @@ -0,0 +1,6 @@ +name: seqkit +channels: + - bioconda + - conda-forge +dependencies: + - seqkit==2.6.1 From e5e5659d654987c8fefee8cf73209f4d2694f400 Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Tue, 21 Nov 2023 16:44:19 +0100 Subject: [PATCH 11/26] Output fasta when input is fasta Also remove dead code, skip creation of uncompressed fast[aq] file by letting samtools directly write a gzipped file. Remove whitespace at end of some lines. --- README.md | 16 +++---- clean.nf | 56 +++++++++++------------ modules/alignment_processing.nf | 28 ++++++------ modules/unused/alignment_processing.nf | 34 -------------- modules/utils.nf | 62 +++++--------------------- 5 files changed, 61 insertions(+), 135 deletions(-) diff --git a/README.md b/README.md index a43e021..d0d8674 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Technologies ([DNA CS (DCS)](https://assets.ctfassets.net/hkzaxo8a05x5/2IX56YmF5 ## What this workflow does for you -With this workflow you can screen and clean your Illumina, Nanopore or any FASTA-formated sequence date. The results are the clean sequences and the sequences identified as contaminated. +With this workflow you can screen and clean your Illumina, Nanopore or any FASTA-formated sequence data. The results are the clean sequences and the sequences identified as contaminated. Per default [minimap2](https://github.com/lh3/minimap2) is used for aligning your sequences to reference sequences but I recommend using `bbduk`, part of [BBTools](https://github.com/BioInfoTools/BBMap), to clean short-read data (_--bbduk_). You can simply specify provided hosts and controls for the cleanup or use your own FASTA files. The reads are then mapped against the specified host, control and user defined FASTA files. All reads that map are considered as contamination. In case of Illumina paired-end reads, both mates need to be aligned. @@ -35,7 +35,7 @@ We saw many soft-clipped reads after the mapping, that probably aren't contamina ### Dependencies management -- [Conda](https://docs.conda.io/en/latest/miniconda.html) +- [Conda](https://docs.conda.io/en/latest/miniconda.html) and/or @@ -57,7 +57,7 @@ Get help: nextflow run hoelzer/clean --help ``` -Clean Nanopore data by filtering against a combined reference of the _E. coli_ genome and the Nanopore DNA CS spike-in. +Clean Nanopore data by filtering against a combined reference of the _E. coli_ genome and the Nanopore DNA CS spike-in. ```bash # uses Docker per default @@ -77,7 +77,7 @@ nextflow run hoelzer/clean --input_type illumina --input '/home/martin/.nextflow --own ~/.nextflow/assets/hoelzer/clean/test/ref.fasta.gz --bbduk ``` -Clean some Illumina, Nanopore, and assembly files against the mouse and phiX genomes. +Clean some Illumina, Nanopore, and assembly files against the mouse and phiX genomes. ## Supported species and control sequences @@ -119,7 +119,7 @@ results/ removed/ .fastq.gz intermedate/ - to-remove/ + map-to-remove/ mapped.fastq.gz unmapped.fastq.gz mapped.bam @@ -131,7 +131,7 @@ results/ soft-clipped/ soft-clipped.bam passed-clipped.bam - to-keep/ + map-to-keep/ mapped.fastq.gz unmapped.fastq.gz mapped.bam @@ -154,11 +154,11 @@ For debugging purposes we also provide various intermediate results in the `inte ## Citations If you use `CLEAN` in your work, please consider citing our preprint: - + > Targeted decontamination of sequencing data with CLEAN > > Marie Lataretu, Sebastian Krautwurst, Adrian Viehweger, Christian Brandt, Martin Hölzer > -> bioRxiv 2023.08.05.552089; doi: https://doi.org/10.1101/2023.08.05.552089 +> bioRxiv 2023.08.05.552089; doi: https://doi.org/10.1101/2023.08.05.552089 Additionally, an extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. diff --git a/clean.nf b/clean.nf index 69c27fb..3e43de6 100755 --- a/clean.nf +++ b/clean.nf @@ -15,19 +15,19 @@ def parameter_diff = params.keySet() - valid_params if (parameter_diff.size() != 0){ exit 1, "ERROR: Parameter(s) $parameter_diff is/are not valid in the pipeline!\n" } -if (params.input.contains('.clean.') ) { - exit 1, "ERROR: Input files cannot contain `.clean.`\n" +if (params.input.contains('.clean.') ) { + exit 1, "ERROR: Input files cannot contain `.clean.`\n" } -/************************** -* META & HELP MESSAGES +/************************** +* META & HELP MESSAGES **************************/ -/* +/* Comment section: First part is a terminal print for additional user information, followed by some help statements (e.g. missing input) Second part is file channel input. This allows via --list to alter the input of --nano & --illumina -to add csv instead. name,path or name,pathR1,pathR2 in case of illumina +to add csv instead. name,path or name,pathR1,pathR2 in case of illumina */ // terminal prints @@ -51,7 +51,7 @@ if ( workflow.profile.contains('singularity') ) { println "Singularity cache directory:" println " $params.singularityCacheDir" } -if ( workflow.profile.contains('conda') ) { +if ( workflow.profile.contains('conda') ) { println "Conda cache directory:" println " $params.condaCacheDir" } @@ -66,11 +66,11 @@ if (workflow.profile == 'standard' || workflow.profile.contains('local')) { println "\033[2mMemory to use: $params.memory, maximal memory to use: $params.max_memory\u001B[0m" println " " } -if ( !workflow.revision ) { +if ( !workflow.revision ) { println "\033[0;33mWARNING: Not a stable execution. Please use -r for full reproducibility.\033[0m\n" } def folder = new File(params.output) -if ( folder.exists() ) { +if ( folder.exists() ) { println "\033[0;33mWARNING: Output folder already exists. Results might be overwritten! You can adjust the output folder via [--output]\033[0m\n" } if ( workflow.profile.contains('singularity') ) { @@ -85,15 +85,15 @@ Set input_types = ['nano', 'illumina', 'illumina_single_end', 'fasta'] if ( params.profile ) { exit 1, "--profile is wrong, use -profile" } if ( params.input == '' || !params.input_type == '' ) { exit 1, "Missing required input parameters [--input] and [--input_type]" } -if ( params.input_type ) { if ( ! (params.input_type in input_types ) ) { exit 1, "Choose one of the the input types with --input_type: " + input_types } } +if ( params.input_type ) { if ( ! (params.input_type in input_types ) ) { exit 1, "Choose one of the the input types with --input_type: " + input_types } } if ( params.control ) { for( String ctr : params.control.split(',') ) if ( ! (ctr in controls ) ) { exit 1, "Wrong control defined (" + ctr + "), use one of these: " + controls } } if ( params.input_type == 'nano' && params.control && 'dcs' in params.control.split(',') && 'eno' in params.control.split(',') ) { exit 1, "Please choose either eno (for ONT dRNA-Seq) or dcs (for ONT DNA-Seq)." } if ( params.host ) { for( String hst : params.host.split(',') ) if ( ! (hst in hosts ) ) { exit 1, "Wrong host defined (" + hst + "), use one of these: " + hosts } } if ( !params.host && !params.own && !params.control && !params.rm_rrna ) { exit 1, "Please provide a control (--control), a host tag (--host), a FASTA file (--own) or set --rm_rrna for rRNA removal for the clean up."} -/************************** -* INPUT CHANNELS +/************************** +* INPUT CHANNELS **************************/ if ( params.input_type == 'illumina' ) { @@ -174,7 +174,7 @@ multiqc_config = Channel.fromPath( workflow.projectDir + '/assets/multiqc_config tool = params.bbduk ? 'bbduk' : 'minimap2' lib_pairedness = params.input_type == 'illumina' ? 'paired' : 'single' -/************************** +/************************** * MODULES **************************/ @@ -186,7 +186,7 @@ include { keep } from './workflows/keep_wf' addParams( tool: tool, lib_pairednes include { qc } from './workflows/qc_wf' -/************************** +/************************** * WORKFLOW ENTRY POINT **************************/ @@ -202,9 +202,9 @@ workflow { mapped = clean.out.out_reads.filter{ it[1] == 'mapped' } unmapped = clean.out.out_reads.filter{ it[1] == 'unmapped' } - + un_mapped_clean_fastq = mapped.join(unmapped) - + keep(input_ch.map{ it -> ['keep_'+it[0], it[1]]}, keep_fasta.collect(), nanoControlBedChannel, un_mapped_clean_fastq) } @@ -212,7 +212,7 @@ workflow { qc(input_ch.map{ it -> tuple(it[0], 'input', it[1]) }.mix(clean.out.out_reads), params.input_type, clean.out.bbduk_summary, clean.out.idxstats, clean.out.flagstats, multiqc_config) } -/************************** +/************************** * --help **************************/ def helpMSG() { @@ -223,19 +223,19 @@ def helpMSG() { c_dim = "\033[2m"; log.info """ ____________________________________________________________________________________________ - + Workflow: Decontamination - Clean your Illumina, Nanopore or any FASTA-formated sequence date. The output are the clean + Clean your Illumina, Nanopore or any FASTA-formated sequence date. The output are the clean and as contaminated identified sequences. Per default minimap2 is used for aligning your sequences - to a host but we recommend using the ${c_dim}--bbduk${c_reset} flag to switch to bbduk to clean short-read data. + to a host but we recommend using the ${c_dim}--bbduk${c_reset} flag to switch to bbduk to clean short-read data. + + Use the ${c_dim}--host${c_reset} and ${c_dim}--control${c_reset} flag to download a host database or specify your ${c_dim}--own${c_reset} FASTA. - Use the ${c_dim}--host${c_reset} and ${c_dim}--control${c_reset} flag to download a host database or specify your ${c_dim}--own${c_reset} FASTA. - ${c_yellow}Usage example:${c_reset} - nextflow run clean.nf --input_type nano --input '*/*.fastq' --host eco --control dcs + nextflow run clean.nf --input_type nano --input '*/*.fastq' --host eco --control dcs or - nextflow run clean.nf --input_type illumina --input '*/*.R{1,2}.fastq' --own some_host.fasta --bbduk + nextflow run clean.nf --input_type illumina --input '*/*.R{1,2}.fastq' --own some_host.fasta --bbduk or nextflow run clean.nf --input_type illumina --input 'test/illumina*.R{1,2}.fastq.gz' --nano data/nanopore.fastq.gz --fasta data/assembly.fasta --host eco --control phix @@ -244,8 +244,8 @@ def helpMSG() { ${c_green}--input_type illumina --input${c_reset} '*.R{1,2}.fastq.gz' -> file pairs ${c_green}--input_type illumina_single_end --input${c_reset} '*.fastq.gz' -> one sample per file ${c_green}--input_type fasta --input${c_reset} '*.fasta.gz' -> one sample per file - ${c_dim} ...read above input from csv files:${c_reset} ${c_green}--list ${c_reset} - ${c_dim}required format: name,path for --input_type nano and --input_type fasta; name,pathR1,pathR2 for --illumina input_type; name,path for --input_type illumina_single_end${c_reset} + ${c_dim} ...read above input from csv files:${c_reset} ${c_green}--list ${c_reset} + ${c_dim}required format: name,path for --input_type nano and --input_type fasta; name,pathR1,pathR2 for --illumina input_type; name,path for --input_type illumina_single_end${c_reset} ${c_yellow}Decontamination options:${c_reset} ${c_green}--host${c_reset} comma separated list of reference genomes for decontamination, downloaded based on this parameter [default: $params.host] @@ -287,11 +287,11 @@ def helpMSG() { In particular for execution of the workflow on a HPC (LSF, SLURM) adjust the following parameters: --databases defines the path where databases are stored [default: $params.databases] --condaCacheDir defines the path where environments (conda) are cached [default: $params.condaCacheDir] - --singularityCacheDir defines the path where images (singularity) are cached [default: $params.singularityCacheDir] + --singularityCacheDir defines the path where images (singularity) are cached [default: $params.singularityCacheDir] ${c_yellow}Miscellaneous:${c_reset} --cleanup_work_dir deletes all files in the work directory after a successful completion of a run [default: $params.cleanup_work_dir] - ${c_dim}warning: if ture, the option will prevent the use of the resume feature!${c_reset} + ${c_dim}warning: if ture, the option will prevent the use of the resume feature!${c_reset} ${c_yellow}Profile:${c_reset} You can merge different profiles for different setups, e.g. diff --git a/modules/alignment_processing.nf b/modules/alignment_processing.nf index cc708d0..c13f0dd 100644 --- a/modules/alignment_processing.nf +++ b/modules/alignment_processing.nf @@ -35,7 +35,7 @@ process merge_bam { output: tuple val(name), val(type), path("${bam[0].baseName}_merged.bam") - + script: """ samtools merge -@ ${task.cpus} ${bam[0].baseName}_merged.bam ${bam} # first bam is output @@ -63,15 +63,15 @@ process filter_soft_clipped_alignments { input: tuple val(name), path (bam) val (minClip) - + output: tuple val(name), val('unmapped'), path ('*.soft-clipped.bam'), emit: bam_clipped tuple val(name), val('mapped'), path ('*.passed-clipped.bam'), emit: bam_ok_clipped tuple val(name), path ('*.bam.bai') - + script: """ - git clone https://github.com/MarieLataretu/samclipy.git --branch v0.0.2 || git clone git@github.com:MarieLataretu/samclipy.git --branch v0.0.2 + git clone https://github.com/MarieLataretu/samclipy.git --branch v0.0.2 || git clone git@github.com:MarieLataretu/samclipy.git --branch v0.0.2 samtools view -h ${bam} | python samclipy/samclipy.py --invert --minClip ${minClip} | samtools sort > ${name}.soft-clipped.bam samtools view -h ${bam} | python samclipy/samclipy.py --minClip ${minClip} | samtools sort > ${name}.passed-clipped.bam samtools index ${name}.soft-clipped.bam @@ -118,7 +118,7 @@ process filter_true_dcs_alignments { samtools index ${name}.no-dcs.bam samtools index ${name}.true-dcs.bam samtools index ${name}.false-dcs.bam - """ + """ stub: """ touch ${name}.no-dcs.bam ${name}.true-dcs.bam ${name}.false-dcs.bam ${name}.no-dcs.bam.bai ${name}.true-dcs.bam.bai ${name}.false-dcs.bam.bai @@ -147,12 +147,12 @@ process fastq_from_bam { mode: params.publish_dir_mode, pattern: "*.gz", saveAs: { fn -> - fn.endsWith('.unmapped.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped.fastq.gz$', '.fastq.gz') : - fn.endsWith('.mapped.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped.fastq.gz$', '.fastq.gz') : - fn.endsWith('.unmapped_merged.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped_merged.fastq.gz$', '.fastq.gz') : - fn.endsWith('.mapped_merged.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped_merged.fastq.gz$', '.fastq.gz') : - fn.endsWith('.unmapped_merged_merged.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped_merged_merged.fastq.gz$', '.fastq.gz') : - fn.endsWith('.soft-clipped_merged.fastq.gz') ? "clean/${fn}".replaceAll(~'.soft-clipped_merged.fastq.gz$', '.fastq.gz') : + fn.matches('.*.unmapped.fast[aq].gz$') ? "clean/${fn}".replaceAll(~'.unmapped(.fast[aq].gz)$', '$1') : + fn.matches('.*.mapped.fast[aq].gz$') ? "removed/${fn}".replaceAll(~'.mapped(.fast[aq].gz)$', '$1') : + fn.matches('.*.unmapped_merged.fast[aq].gz$') ? "clean/${fn}".replaceAll(~'.unmapped_merged(.fast[aq].gz)$', '$1') : + fn.matches('.*.mapped_merged.fast[aq].gz$') ? "removed/${fn}".replaceAll(~'.mapped_merged(.fast[aq].gz)$', '$1') : + fn.matches('.*.unmapped_merged_merged.fast[aq].gz$') ? "clean/${fn}".replaceAll(~'.unmapped_merged_merged(.fast[aq].gz)$', '$1') : + fn.matches('.*.soft-clipped_merged.fast[aq].gz$') ? "clean/${fn}".replaceAll(~'.soft-clipped_merged(.fast[aq].gz)$', '$1') : fn } ) @@ -167,14 +167,12 @@ process fastq_from_bam { script: if ( params.lib_pairedness == 'paired' ) { """ - samtools fastq -@ ${task.cpus} -1 ${bam.baseName}_1.fastq -2 ${bam.baseName}_2.fastq -s ${bam.baseName}_singleton.fastq ${bam} - gzip --no-name *.fastq + samtools fastq -@ ${task.cpus} -c 6 -1 ${bam.baseName}_1.fastq.gz -2 ${bam.baseName}_2.fastq.gz -s ${bam.baseName}_singleton.fastq.gz ${bam} """ } else if ( params.lib_pairedness == 'single' ) { dtype = (params.input_type == 'fasta') ? 'a' : 'q' """ - samtools fastq -@ ${task.cpus} -0 ${bam.baseName}.fast${dtype} ${bam} - gzip --no-name *.fast${dtype} + samtools fast${dtype} -@ ${task.cpus} -c 6 -0 ${bam.baseName}.fast${dtype}.gz ${bam} """ } else { error "Invalid pairedness: ${params.lib_pairedness}" diff --git a/modules/unused/alignment_processing.nf b/modules/unused/alignment_processing.nf index 4fc9bdf..061085c 100644 --- a/modules/unused/alignment_processing.nf +++ b/modules/unused/alignment_processing.nf @@ -30,37 +30,3 @@ process filter_un_mapped_alignments { """ } -process make_mapped_bam { - label 'minimap2' - - publishDir "${params.output}/${name}/${params.tool}", mode: params.publish_dir_mode, pattern: "*.mapped.bam*", enabled: false - - input: - tuple val(name), path(sam), path(reads) - - output: - tuple val(name), path ('*.mapped.bam'), emit: contamination_bam - tuple val(name), path ('*.mapped.bam.bai'), emit: contamination_bai - tuple val(name), path ('idxstats.tsv'), emit: idxstats - - script: - if ( params.mode == 'paired' ) { - """ - samtools view -@ ${task.cpus} -b -f 2 -F 2048 ${name}.sam | samtools sort -o ${name}.mapped.bam --threads ${task.cpus} - samtools index ${name}.mapped.bam - samtools idxstats ${name}.mapped.bam > idxstats.tsv - """ - } else if ( params.mode == 'single' ) { - """ - samtools view -@ ${task.cpus} -b -F 2052 ${name}.sam | samtools sort -o ${name}.mapped.bam --threads ${task.cpus} - samtools index ${name}.mapped.bam - samtools idxstats ${name}.mapped.bam > idxstats.tsv - """ - } else { - error "Invalid mode: ${params.mode}" - } - stub: - """ - touch ${name}.mapped.bam ${name}.mapped.bam.bai idxstats.tsv - """ -} diff --git a/modules/utils.nf b/modules/utils.nf index 40c0c2b..00ebaf1 100644 --- a/modules/utils.nf +++ b/modules/utils.nf @@ -49,10 +49,10 @@ process get_read_names { input: tuple val(name), path(bam) - + output: tuple val(name), path("${name}_read_names.csv") - + script: """ samtools view ${bam} | cut -f1 | sort | uniq > ${name}_read_names.csv @@ -89,19 +89,19 @@ process filter_fastq_by_name { // generated. If not, it's generated by the fastq_from_bam process or // bbduk mapping if ( params.keep ) { - publishDir ( + publishDir ( path: params.output, mode: params.publish_dir_mode, pattern: "*.gz", saveAs: { fn -> - fn.endsWith('.unmapped.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped.fastq.gz$', '.fastq.gz') : - fn.endsWith('.mapped.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped.fastq.gz$', '.fastq.gz') : - fn.endsWith('.unmapped_merged.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped_merged.fastq.gz$', '.fastq.gz') : - fn.endsWith('.mapped_merged.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped_merged.fastq.gz$', '.fastq.gz') : - fn.endsWith('.unmapped_merged_merged.fastq.gz') ? "clean/${fn}".replaceAll(~'.unmapped_merged_merged.fastq.gz$', '.fastq.gz') : - fn.endsWith('.mapped_merged_merged.fastq.gz') ? "removed/${fn}".replaceAll(~'.mapped_merged_merged.fastq.gz$', '.fastq.gz') : - fn.endsWith('.fastq.clean.fastq.gz') ? "clean/${fn}".replaceAll(~'.fastq.clean.fastq.gz$', '.fastq.gz') : - fn.endsWith('.fastq.contamination.fastq.gz') ? "removed/${fn}".replaceAll(~'.fastq.contamination.fastq.gz$', '.fastq.gz') : + fn.matches('.*.unmapped.fast[aq].gz$') ? "clean/${fn}".replaceAll(~'.unmapped(.fast[aq].gz)$', '$1') : + fn.matches('.*.mapped.fast[aq].gz$') ? "removed/${fn}".replaceAll(~'.mapped(.fast[aq].gz)$', '$1') : + fn.matches('.*.unmapped_merged.fast[aq].gz$') ? "clean/${fn}".replaceAll(~'.unmapped_merged(.fast[aq].gz)$', '$1') : + fn.matches('.*.mapped_merged.fast[aq].gz$') ? "removed/${fn}".replaceAll(~'.mapped_merged(.fast[aq].gz)$', '$1') : + fn.matches('.*.unmapped_merged_merged.fast[aq].gz$') ? "clean/${fn}".replaceAll(~'.unmapped_merged_merged(.fast[aq].gz)$', '$1') : + fn.matches('.*.mapped_merged_merged.fast[aq].gz$') ? "removed/${fn}".replaceAll(~'.mapped_merged_merged(.fast[aq].gz)$', '$1') : + fn.matches('.*.fast[aq].clean.fast[aq].gz$') ? "clean/${fn}".replaceAll(~'.fast[aq].clean(.fast[aq].gz)$', '$1') : + fn.matches('.*.fast[aq].contamination.fast[aq].gz$') ? "removed/${fn}".replaceAll(~'.fast[aq].contamination(.fast[aq].gz)$', '$1') : fn } ) @@ -113,7 +113,7 @@ process filter_fastq_by_name { output: tuple val(name), val(mapped), path(reads_mapped, includeInputs: true), emit: mapped_no_keep tuple val(name), val(unmapped), path(reads_unmapped, includeInputs: true), emit: unmapped_keep - + script: if ( params.lib_pairedness == 'paired' ) { """ @@ -136,44 +136,6 @@ process filter_fastq_by_name { """ } -// process split_bam { -// label 'pysam' -// echo true - -// input: -// path(read_name_list) -// tuple val(name), path(mapped_bam), path(mapped_bai) - -// output: -// tuple val(name), val('mapped'), path('mapped.fq'), emit: mapped -// tuple val(name), val('unmapped'), path('unmapped.fq'), emit: unmapped - -// script: -// """ -// #!/usr/bin/env python3 - -// import pysam - -// reads = set() -// with open('${read_name_list}', 'r') as infile: -// for line in infile: -// reads.add(line.strip()) -// print(reads) - -// # split bam into mapped (not in list) -// # and "unmapped" (in list) - -// bamfile = pysam.AlignmentFile('${mapped_bam}', 'rb') -// for read in bamfile.fetch(until_eof=True): -// if (read.query_name in reads): -// #write to unmapped -// pass -// else: -// # write to mapped -// pass -// """ -// } - process bbduk_stats { label 'smallTask' From 753f97ce991abe1342c620a163d57709cb368299 Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Wed, 22 Nov 2023 10:32:51 +0100 Subject: [PATCH 12/26] Add --no_intermediate argument to skip publishing intermediate files to results/ --- clean.nf | 6 ++++-- modules/alignment_processing.nf | 6 ++++++ modules/bbmap.nf | 7 ++++--- modules/utils.nf | 1 + nextflow.config | 1 + 5 files changed, 16 insertions(+), 5 deletions(-) diff --git a/clean.nf b/clean.nf index 3e43de6..aab7973 100755 --- a/clean.nf +++ b/clean.nf @@ -10,7 +10,7 @@ Author: hoelzer.martin@gmail.com // Parameters sanity checking -Set valid_params = ['max_cores', 'cores', 'max_memory', 'memory', 'profile', 'help', 'input', 'input_type', 'list', 'host', 'own', 'control', 'keep', 'rm_rrna', 'bbduk', 'bbduk_kmer', 'bbduk_qin', 'reads_rna', 'min_clip', 'dcs_strict', 'output', 'multiqc_dir', 'nf_runinfo_dir', 'databases', 'cleanup_work_dir','condaCacheDir', 'singularityCacheDir', 'singularityCacheDir', 'cloudProcess', 'conda-cache-dir', 'singularity-cache-dir', 'cloud-process', 'publish_dir_mode'] // don't ask me why there is also 'conda-cache-dir', 'singularity-cache-dir', 'cloud-process' +Set valid_params = ['max_cores', 'cores', 'max_memory', 'memory', 'profile', 'help', 'input', 'input_type', 'list', 'host', 'own', 'control', 'keep', 'rm_rrna', 'bbduk', 'bbduk_kmer', 'bbduk_qin', 'reads_rna', 'min_clip', 'dcs_strict', 'output', 'multiqc_dir', 'nf_runinfo_dir', 'databases', 'cleanup_work_dir','condaCacheDir', 'singularityCacheDir', 'singularityCacheDir', 'cloudProcess', 'conda-cache-dir', 'singularity-cache-dir', 'cloud-process', 'publish_dir_mode', 'no_intermediate'] // don't ask me why there is also 'conda-cache-dir', 'singularity-cache-dir', 'cloud-process' def parameter_diff = params.keySet() - valid_params if (parameter_diff.size() != 0){ exit 1, "ERROR: Parameter(s) $parameter_diff is/are not valid in the pipeline!\n" @@ -291,7 +291,9 @@ def helpMSG() { ${c_yellow}Miscellaneous:${c_reset} --cleanup_work_dir deletes all files in the work directory after a successful completion of a run [default: $params.cleanup_work_dir] - ${c_dim}warning: if ture, the option will prevent the use of the resume feature!${c_reset} + ${c_dim}warning: if true, the option will prevent the use of the resume feature!${c_reset} + --no_intermediate do not save intermediate .bam/fastq/etc files into the `results/intermediate/` directory [default: $params.cleanup_work_dir] + saves a lot of disk space, especially if used with the `--cleanup_work_dir` argument. ${c_yellow}Profile:${c_reset} You can merge different profiles for different setups, e.g. diff --git a/modules/alignment_processing.nf b/modules/alignment_processing.nf index c13f0dd..85cc74f 100644 --- a/modules/alignment_processing.nf +++ b/modules/alignment_processing.nf @@ -55,6 +55,7 @@ process filter_soft_clipped_alignments { mode: params.publish_dir_mode, pattern: "${name}*.bam{,.bai}", overwrite: false, + enabled: !params.no_intermediate, saveAs: { fn -> fn.startsWith("keep_") ? "map-to-keep/soft-clipped/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/soft-clipped/${fn}" } @@ -91,6 +92,7 @@ process filter_true_dcs_alignments { mode: params.publish_dir_mode, pattern: "${name}*.bam{,.bai}", overwrite: false, + enabled: !params.no_intermediate, saveAs: { fn -> fn.startsWith("keep_") ? "map-to-keep/strict-dcs/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/strict-dcs/${fn}" } @@ -133,6 +135,7 @@ process fastq_from_bam { mode: params.publish_dir_mode, pattern: "*.gz", overwrite: false, + enabled: !params.no_intermediate, saveAs: { fn -> fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '').replaceAll(~'_merged', '')}" : "map-to-remove/${fn.replaceAll(~'_merged', '')}" } @@ -192,6 +195,7 @@ process idxstats_from_bam { mode: params.publish_dir_mode, pattern: "*.sorted.idxstats.tsv", overwrite: false, + enabled: !params.no_intermediate, saveAs: { fn -> fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" } @@ -221,6 +225,7 @@ process flagstats_from_bam { mode: params.publish_dir_mode, pattern: "*.sorted.flagstats.txt", overwrite: false, + enabled: !params.no_intermediate, saveAs: { fn -> fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" } @@ -270,6 +275,7 @@ process index_bam { mode: params.publish_dir_mode, pattern: "*.sorted.bam{,.bai}", overwrite: false, + enabled: !params.no_intermediate, saveAs: { fn -> fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" } diff --git a/modules/bbmap.nf b/modules/bbmap.nf index 4326ac4..4aabe4d 100644 --- a/modules/bbmap.nf +++ b/modules/bbmap.nf @@ -5,10 +5,11 @@ process bbduk { path: "${params.output}/intermediate/${map_target}", mode: params.publish_dir_mode, pattern: "*.{clean,contamination}.fastq.gz", + enabled: !params.no_intermediate, overwrite: false ) - - // When using `--keep`, we need to do further processing before we have + + // When using `--keep`, we need to do further processing before we have // the final clean and removed data sets. if ( !params.keep ) { publishDir ( @@ -23,7 +24,7 @@ process bbduk { } ) } - + input: tuple val(name), path(reads) path db diff --git a/modules/utils.nf b/modules/utils.nf index 00ebaf1..dd6a917 100644 --- a/modules/utils.nf +++ b/modules/utils.nf @@ -144,6 +144,7 @@ process bbduk_stats { mode: params.publish_dir_mode, pattern: "*.bbduk_stats.tsv", overwrite: false, + enabled: !params.no_intermediate, saveAs: { fn -> fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" } diff --git a/nextflow.config b/nextflow.config index 344106f..870397b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -47,6 +47,7 @@ params { // cleanup cleanup_work_dir = false + no_intermediate = false } // see https://www.nextflow.io/docs/latest/config.html?highlight=cleanup#miscellaneous From bb499ac48c8019ed8098ef117a7073f5952e87b0 Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Wed, 22 Nov 2023 12:01:08 +0100 Subject: [PATCH 13/26] Add singularity/ dir to .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index eec86d5..dbcfd8c 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ work/ results/ centrifuge-cloud/ conda/ -.vscode/ \ No newline at end of file +singularity/ +.vscode/ From 14840fb50ce59b11f49c4733e0a3ddb1f286addf Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Wed, 22 Nov 2023 20:33:19 +0100 Subject: [PATCH 14/26] Tweak filter_fastq_by_name to work properly, avoid reading and writing to the same file in a pipe --- modules/utils.nf | 11 ++++++++--- workflows/keep_wf.nf | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/modules/utils.nf b/modules/utils.nf index dd6a917..4f6088b 100644 --- a/modules/utils.nf +++ b/modules/utils.nf @@ -84,6 +84,7 @@ process get_read_names_fastx { process filter_fastq_by_name { label 'minimap2' // We don't need minimap2 but the container has pigz + stageInMode 'copy' // When using --keep, this is where the final cleaned fastq file is // generated. If not, it's generated by the fastq_from_bam process or @@ -114,18 +115,22 @@ process filter_fastq_by_name { tuple val(name), val(mapped), path(reads_mapped, includeInputs: true), emit: mapped_no_keep tuple val(name), val(unmapped), path(reads_unmapped, includeInputs: true), emit: unmapped_keep + // The commands below are in a very specific ordering, because we are + // modifying the input files: + // 1. First move reads that mapped to keep from host mapped to unmapped + // 2. Then remove those same reads from the host mapped set script: if ( params.lib_pairedness == 'paired' ) { """ - zcat ${reads_mapped[0]} | paste - - - - | grep -v -F -f ${keep_read_name_list} | tr "\t" "\n" | pigz -fc -p ${task.cpus} > ${reads_mapped[0]} zcat ${reads_mapped[0]} | paste - - - - | grep -F -f ${keep_read_name_list} | tr "\t" "\n" | pigz -fc -p ${task.cpus} >> ${reads_unmapped[0]} - zcat ${reads_mapped[1]} | paste - - - - | grep -v -F -f ${keep_read_name_list} | tr "\t" "\n" | pigz -fc -p ${task.cpus} > ${reads_mapped[1]} + zcat ${reads_mapped[0]} | paste - - - - | grep -v -F -f ${keep_read_name_list} | tr "\t" "\n" | pigz -fc -p ${task.cpus} > ${reads_mapped[0]} && mv ${reads_mapped[0]}{.tmp,} zcat ${reads_mapped[1]} | paste - - - - | grep -F -f ${keep_read_name_list} | tr "\t" "\n" | pigz -fc -p ${task.cpus} >> ${reads_unmapped[1]} + zcat ${reads_mapped[1]} | paste - - - - | grep -v -F -f ${keep_read_name_list} | tr "\t" "\n" | pigz -fc -p ${task.cpus} > ${reads_mapped[1]} && mv ${reads_mapped[1]}{.tmp,} """ } else if ( params.lib_pairedness == 'single' ) { """ - zcat ${reads_mapped} | paste - - - - | grep -v -F -f ${keep_read_name_list} | tr "\t" "\n" | pigz -fc -p ${task.cpus} > ${reads_mapped} zcat ${reads_mapped} | paste - - - - | grep -F -f ${keep_read_name_list} | tr "\t" "\n" | pigz -fc -p ${task.cpus} >> ${reads_unmapped} + zcat ${reads_mapped} | paste - - - - | grep -v -F -f ${keep_read_name_list} | tr "\t" "\n" | pigz -fc -p ${task.cpus} > ${reads_mapped}.tmp && mv ${reads_mapped}{.tmp,} """ } else { error "Invalid mode: ${params.lib_pairedness}" diff --git a/workflows/keep_wf.nf b/workflows/keep_wf.nf index 12605a4..3054a40 100644 --- a/workflows/keep_wf.nf +++ b/workflows/keep_wf.nf @@ -12,7 +12,7 @@ workflow keep { main: keep_map(input, keep_reference, dcs_ends_bed, 'map-to-keep') - + if ( params.bbduk ) { keep_reads_fastx = keep_map.out.out_reads.filter{ it[1] == 'mapped' } get_read_names_fastx(keep_reads_fastx.map{it -> [it[0], it[2]]}) From bb074a7c6042c59e3907dfaabc29c62295fc04ad Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Wed, 22 Nov 2023 21:00:47 +0100 Subject: [PATCH 15/26] Replace scary zcat/paste/grep/tr with seqkit grep --- modules/utils.nf | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/modules/utils.nf b/modules/utils.nf index 4f6088b..53d5694 100644 --- a/modules/utils.nf +++ b/modules/utils.nf @@ -83,7 +83,7 @@ process get_read_names_fastx { } process filter_fastq_by_name { - label 'minimap2' // We don't need minimap2 but the container has pigz + label 'seqkit' // We don't need minimap2 but the container has pigz stageInMode 'copy' // When using --keep, this is where the final cleaned fastq file is @@ -122,15 +122,15 @@ process filter_fastq_by_name { script: if ( params.lib_pairedness == 'paired' ) { """ - zcat ${reads_mapped[0]} | paste - - - - | grep -F -f ${keep_read_name_list} | tr "\t" "\n" | pigz -fc -p ${task.cpus} >> ${reads_unmapped[0]} - zcat ${reads_mapped[0]} | paste - - - - | grep -v -F -f ${keep_read_name_list} | tr "\t" "\n" | pigz -fc -p ${task.cpus} > ${reads_mapped[0]} && mv ${reads_mapped[0]}{.tmp,} - zcat ${reads_mapped[1]} | paste - - - - | grep -F -f ${keep_read_name_list} | tr "\t" "\n" | pigz -fc -p ${task.cpus} >> ${reads_unmapped[1]} - zcat ${reads_mapped[1]} | paste - - - - | grep -v -F -f ${keep_read_name_list} | tr "\t" "\n" | pigz -fc -p ${task.cpus} > ${reads_mapped[1]} && mv ${reads_mapped[1]}{.tmp,} + seqkit grep --pattern-file ${keep_read_name_list} ${reads_mapped[0]} | gzip >> ${reads_unmapped[0]} + seqkit grep --invert-match --pattern-file ${keep_read_name_list} ${reads_mapped[0]} | gzip > ${reads_mapped[0]}.tmp && mv ${reads_mapped[0]}{.tmp,} + seqkit grep --pattern-file ${keep_read_name_list} ${reads_mapped[1]} | gzip >> ${reads_unmapped[1]} + seqkit grep --invert-match --pattern-file ${keep_read_name_list} ${reads_mapped[1]} | gzip > ${reads_mapped[1]}.tmp && mv ${reads_mapped[1]}{.tmp,} """ } else if ( params.lib_pairedness == 'single' ) { """ - zcat ${reads_mapped} | paste - - - - | grep -F -f ${keep_read_name_list} | tr "\t" "\n" | pigz -fc -p ${task.cpus} >> ${reads_unmapped} - zcat ${reads_mapped} | paste - - - - | grep -v -F -f ${keep_read_name_list} | tr "\t" "\n" | pigz -fc -p ${task.cpus} > ${reads_mapped}.tmp && mv ${reads_mapped}{.tmp,} + seqkit grep --pattern-file ${keep_read_name_list} ${reads_mapped} | gzip >> ${reads_unmapped} + seqkit grep --invert-match --pattern-file ${keep_read_name_list} ${reads_mapped} | gzip > ${reads_mapped}.tmp && mv ${reads_mapped}{.tmp,} """ } else { error "Invalid mode: ${params.lib_pairedness}" From 40276ea52097b3704349c5d6d5d8b17ce48a3af3 Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Wed, 22 Nov 2023 21:16:02 +0100 Subject: [PATCH 16/26] Add clarifying comments to the filter_fastq_by_name process --- modules/utils.nf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/utils.nf b/modules/utils.nf index 53d5694..6352558 100644 --- a/modules/utils.nf +++ b/modules/utils.nf @@ -84,6 +84,9 @@ process get_read_names_fastx { process filter_fastq_by_name { label 'seqkit' // We don't need minimap2 but the container has pigz + + // Needed because we want to modify the input files and pass them on as + // output stageInMode 'copy' // When using --keep, this is where the final cleaned fastq file is @@ -119,6 +122,8 @@ process filter_fastq_by_name { // modifying the input files: // 1. First move reads that mapped to keep from host mapped to unmapped // 2. Then remove those same reads from the host mapped set + // If these two steps are done in the opposite order the results will be + // wrong. script: if ( params.lib_pairedness == 'paired' ) { """ From dc59d2821b262db6af77f15a3548eba7e7289690 Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Wed, 22 Nov 2023 21:45:14 +0100 Subject: [PATCH 17/26] Remove overwrite: false from publishDir directives --- modules/alignment_processing.nf | 7 ------- modules/bbmap.nf | 4 +--- modules/utils.nf | 1 - 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/modules/alignment_processing.nf b/modules/alignment_processing.nf index 85cc74f..21b557f 100644 --- a/modules/alignment_processing.nf +++ b/modules/alignment_processing.nf @@ -54,7 +54,6 @@ process filter_soft_clipped_alignments { path: "${params.output}/intermediate", mode: params.publish_dir_mode, pattern: "${name}*.bam{,.bai}", - overwrite: false, enabled: !params.no_intermediate, saveAs: { fn -> fn.startsWith("keep_") ? "map-to-keep/soft-clipped/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/soft-clipped/${fn}" @@ -91,7 +90,6 @@ process filter_true_dcs_alignments { path: "${params.output}/intermediate", mode: params.publish_dir_mode, pattern: "${name}*.bam{,.bai}", - overwrite: false, enabled: !params.no_intermediate, saveAs: { fn -> fn.startsWith("keep_") ? "map-to-keep/strict-dcs/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/strict-dcs/${fn}" @@ -134,7 +132,6 @@ process fastq_from_bam { path: "${params.output}/intermediate", mode: params.publish_dir_mode, pattern: "*.gz", - overwrite: false, enabled: !params.no_intermediate, saveAs: { fn -> fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '').replaceAll(~'_merged', '')}" : "map-to-remove/${fn.replaceAll(~'_merged', '')}" @@ -146,7 +143,6 @@ process fastq_from_bam { if ( !params.keep ) { publishDir ( path: params.output, - overwrite: false, mode: params.publish_dir_mode, pattern: "*.gz", saveAs: { fn -> @@ -194,7 +190,6 @@ process idxstats_from_bam { path: "${params.output}/intermediate", mode: params.publish_dir_mode, pattern: "*.sorted.idxstats.tsv", - overwrite: false, enabled: !params.no_intermediate, saveAs: { fn -> fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" @@ -224,7 +219,6 @@ process flagstats_from_bam { path: "${params.output}/intermediate", mode: params.publish_dir_mode, pattern: "*.sorted.flagstats.txt", - overwrite: false, enabled: !params.no_intermediate, saveAs: { fn -> fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" @@ -274,7 +268,6 @@ process index_bam { path: "${params.output}/intermediate", mode: params.publish_dir_mode, pattern: "*.sorted.bam{,.bai}", - overwrite: false, enabled: !params.no_intermediate, saveAs: { fn -> fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" diff --git a/modules/bbmap.nf b/modules/bbmap.nf index 4aabe4d..46c56f9 100644 --- a/modules/bbmap.nf +++ b/modules/bbmap.nf @@ -5,8 +5,7 @@ process bbduk { path: "${params.output}/intermediate/${map_target}", mode: params.publish_dir_mode, pattern: "*.{clean,contamination}.fastq.gz", - enabled: !params.no_intermediate, - overwrite: false + enabled: !params.no_intermediate ) // When using `--keep`, we need to do further processing before we have @@ -14,7 +13,6 @@ process bbduk { if ( !params.keep ) { publishDir ( path: params.output, - overwrite: false, mode: params.publish_dir_mode, pattern: "*.gz", saveAs: { fn -> diff --git a/modules/utils.nf b/modules/utils.nf index 6352558..5d94aad 100644 --- a/modules/utils.nf +++ b/modules/utils.nf @@ -153,7 +153,6 @@ process bbduk_stats { path: "${params.output}/intermediate", mode: params.publish_dir_mode, pattern: "*.bbduk_stats.tsv", - overwrite: false, enabled: !params.no_intermediate, saveAs: { fn -> fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" From 39bd52e9cc71e311b6637ad1c296f067702e94d5 Mon Sep 17 00:00:00 2001 From: MarieLataretu Date: Sun, 10 Dec 2023 11:03:40 +0100 Subject: [PATCH 18/26] fix stub touch - output is expecting the suffix .sorted.bam --- modules/alignment_processing.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/alignment_processing.nf b/modules/alignment_processing.nf index 21b557f..0f908f7 100644 --- a/modules/alignment_processing.nf +++ b/modules/alignment_processing.nf @@ -257,7 +257,7 @@ process sort_bam { """ stub: """ - touch ${bam.baseName}.bam + touch ${bam.baseName}.sorted.bam """ } From d29a5ef63ceba64df11bd20ed6fc2a24d0ddea22 Mon Sep 17 00:00:00 2001 From: MarieLataretu Date: Sun, 10 Dec 2023 11:10:34 +0100 Subject: [PATCH 19/26] added seqkit to citations --- CITATIONS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CITATIONS.md b/CITATIONS.md index 0771f59..8d0e428 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -29,6 +29,10 @@ > Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. doi: 10.1093/bioinformatics/btp352. Epub 2009 Jun 8. PubMed PMID: 19505943; PubMed Central PMCID: PMC2723002. +- [SeqKit](https://pubmed.ncbi.nlm.nih.gov/27706213/) + + > Shen W, Le S, Li Y, Hu F. SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation. PLoS One. 2016 Oct 5;11(10):e0163962. doi: 10.1371/journal.pone.0163962. PMID: 27706213; PMCID: PMC5051824. + - [QUAST](https://pubmed.ncbi.nlm.nih.gov/23422339/) > Gurevich A, Saveliev V, Vyahhi N, Tesler G. QUAST: quality assessment tool for genome assemblies. Bioinformatics. 2013 Apr 15;29(8):1072-5. doi: 10.1093/bioinformatics/btt086. Epub 2013 Feb 19. PMID: 23422339; PMCID: PMC3624806. From d2ae118956e91633b7f70140e674157e82bb1489 Mon Sep 17 00:00:00 2001 From: martin Date: Sun, 10 Dec 2023 16:58:35 +0100 Subject: [PATCH 20/26] Add -keep to help mssg. Reformated --help a bit. Removed old profiles for ARA and EBI. --- clean.nf | 55 ++++++++++++++++++++++---------------------- nextflow.config | 61 ------------------------------------------------- 2 files changed, 27 insertions(+), 89 deletions(-) diff --git a/clean.nf b/clean.nf index aab7973..e68d5f5 100755 --- a/clean.nf +++ b/clean.nf @@ -248,7 +248,7 @@ def helpMSG() { ${c_dim}required format: name,path for --input_type nano and --input_type fasta; name,pathR1,pathR2 for --illumina input_type; name,path for --input_type illumina_single_end${c_reset} ${c_yellow}Decontamination options:${c_reset} - ${c_green}--host${c_reset} comma separated list of reference genomes for decontamination, downloaded based on this parameter [default: $params.host] + ${c_green}--host${c_reset} Comma separated list of reference genomes for decontamination, downloaded based on this parameter [default: $params.host] ${c_dim}Currently supported are: - hsa [Ensembl: Homo_sapiens.GRCh38.dna.primary_assembly] - mmu [Ensembl: Mus_musculus.GRCm38.dna.primary_assembly] @@ -256,44 +256,46 @@ def helpMSG() { - gga [NCBI: Gallus_gallus.GRCg6a.dna.toplevel] - cli [NCBI: GCF_000337935.1_Cliv_1.0_genomic] - eco [Ensembl: Escherichia_coli_k_12.ASM80076v1.dna.toplevel]${c_reset} - ${c_green}--control${c_reset} comma separated list of common controls used in Illumina or Nanopore sequencing [default: $params.control] + ${c_green}--control${c_reset} Comma separated list of common controls used in Illumina or Nanopore sequencing [default: $params.control] ${c_dim}Currently supported are: - phix [Illumina: enterobacteria_phage_phix174_sensu_lato_uid14015, NC_001422] - dcs [ONT DNA-Seq: a positive control (3.6 kb standard amplicon mapping the 3' end of the Lambda genome)] - eno [ONT RNA-Seq: a positive control (yeast ENO2 Enolase II of strain S288C, YHR174W)]${c_reset} - ${c_green}--own ${c_reset} use your own FASTA sequences (comma separated list of files) for decontamination, e.g. host.fasta.gz,spike.fasta [default: $params.own] - ${c_green}--rm_rrna ${c_reset} clean your data from rRNA [default: $params.rm_rrna] - ${c_green}--bbduk${c_reset} add this flag to use bbduk instead of minimap2 for decontamination of short reads [default: $params.bbduk] - ${c_green}--bbduk_kmer${c_reset} set kmer for bbduk [default: $params.bbduk_kmer] - ${c_green}--bbduk_qin${c_reset} set quality ASCII encoding for bbduk [default: $params.bbduk_qin; options are: 64, 33, auto] - ${c_green}--reads_rna${c_reset} add this flag for noisy direct RNA-Seq Nanopore data [default: $params.reads_rna] - - ${c_green}--min_clip${c_reset} filter mapped reads by soft-clipped length (left + right). If >= 1 total - number; if < 1 relative to read length - ${c_green}--dcs_strict${c_reset} filter out alignments that cover artificial ends of the ONT DCS to discriminate between Lambda Phage and DCS + ${c_green}--own ${c_reset} Use your own FASTA sequences (comma separated list of files) for decontamination, e.g. host.fasta.gz,spike.fasta [default: $params.own] + ${c_green}--keep ${c_reset} Use your own FASTA sequences (comma separated list of files) to explicitly keep mapped reads, e.g. target.fasta.gz,important.fasta [default: $params.keep] + Reads are assigned to a combined index for decontamination and keeping. The use of this parameter can prevent + false positive hits and the accidental removal of reads due to (poor quality) mappings. + ${c_green}--rm_rrna ${c_reset} Clean your data from rRNA [default: $params.rm_rrna] + ${c_green}--bbduk${c_reset} Add this flag to use bbduk instead of minimap2 for decontamination of short reads [default: $params.bbduk] + ${c_green}--bbduk_kmer${c_reset} Set kmer for bbduk [default: $params.bbduk_kmer] + ${c_green}--bbduk_qin${c_reset} Set quality ASCII encoding for bbduk [default: $params.bbduk_qin; options are: 64, 33, auto] + ${c_green}--reads_rna${c_reset} Add this flag for noisy direct RNA-Seq Nanopore data [default: $params.reads_rna] + + ${c_green}--min_clip${c_reset} Filter mapped reads by soft-clipped length (left + right). If >= 1 total number; if < 1 relative to read length + ${c_green}--dcs_strict${c_reset} Filter out alignments that cover artificial ends of the ONT DCS to discriminate between Lambda Phage and DCS ${c_yellow}Compute options:${c_reset} - --cores max cores per process for local use [default $params.cores] - --max_cores max cores used on the machine for local use [default $params.max_cores] - --memory max memory for local use, enter in this format '8.GB' [default: $params.memory] - --output name of the result folder [default: $params.output] + --cores Max cores per process for local use [default $params.cores] + --max_cores Max cores used on the machine for local use [default $params.max_cores] + --memory Max memory for local use, enter in this format '8.GB' [default: $params.memory] + --output Name of the result folder [default: $params.output] ${c_dim}Nextflow options: - -with-report rep.html cpu / ram usage (may cause errors) - -with-dag chart.html generates a flowchart for the process tree - -with-timeline time.html timeline (may cause errors) + -with-report rep.html CPU / RAM usage (may cause errors) + -with-dag chart.html Generates a flowchart for the process tree + -with-timeline time.html Timeline (may cause errors) ${c_yellow}Computing:${c_reset} In particular for execution of the workflow on a HPC (LSF, SLURM) adjust the following parameters: - --databases defines the path where databases are stored [default: $params.databases] - --condaCacheDir defines the path where environments (conda) are cached [default: $params.condaCacheDir] - --singularityCacheDir defines the path where images (singularity) are cached [default: $params.singularityCacheDir] + --databases Defines the path where databases are stored [default: $params.databases] + --condaCacheDir Defines the path where environments (conda) are cached [default: $params.condaCacheDir] + --singularityCacheDir Defines the path where images (singularity) are cached [default: $params.singularityCacheDir] ${c_yellow}Miscellaneous:${c_reset} - --cleanup_work_dir deletes all files in the work directory after a successful completion of a run [default: $params.cleanup_work_dir] + --cleanup_work_dir Deletes all files in the work directory after a successful completion of a run [default: $params.cleanup_work_dir] ${c_dim}warning: if true, the option will prevent the use of the resume feature!${c_reset} - --no_intermediate do not save intermediate .bam/fastq/etc files into the `results/intermediate/` directory [default: $params.cleanup_work_dir] - saves a lot of disk space, especially if used with the `--cleanup_work_dir` argument. + --no_intermediate Do not save intermediate .bam/fastq/etc files into the `results/intermediate/` directory [default: $params.cleanup_work_dir] + Saves a lot of disk space, especially if used with the `--cleanup_work_dir` argument. ${c_yellow}Profile:${c_reset} You can merge different profiles for different setups, e.g. @@ -313,9 +315,6 @@ def helpMSG() { conda mamba - ebi (lsf,singularity; preconfigured for the EBI cluster) - yoda (lsf,singularity; preconfigured for the EBI YODA cluster) - ara (slurm,conda; preconfigured for the ARA cluster) gcloud (use this as template for your own GCP setup) ${c_reset} """.stripIndent() diff --git a/nextflow.config b/nextflow.config index 870397b..a5f7316 100644 --- a/nextflow.config +++ b/nextflow.config @@ -130,7 +130,6 @@ profiles { includeConfig 'configs/conda.config' } - // pre-merged standard { params.cloudProcess = false @@ -139,66 +138,6 @@ profiles { includeConfig 'configs/container.config' } - ebi { - params.databases = "/hps/nobackup2/production/metagenomics/$USER/nextflow-databases/" - params.cachedir = "/hps/nobackup2/singularity/$USER" - - workDir = "/hps/nobackup2/production/metagenomics/$USER/nextflow-work-$USER" - executor { - name = "lsf" - queueSize = 200 - } - params.cloudProcess = true - process.cache = "lenient" - includeConfig 'configs/node.config' - - singularity { - enabled = true - autoMounts = true - cacheDir = params.cachedir - } - includeConfig 'configs/container.config' - } - - yoda { - params.databases = "/hps/nobackup2/metagenomics/$USER/nextflow-databases/" - params.cachedir = "/hps/nobackup2/metagenomics/$USER/singularity" - - workDir = "/hps/nobackup2/metagenomics/$USER/nextflow-work-$USER" - executor { - name = "lsf" - queueSize = 200 - } - params.cloudProcess = true - process.cache = "lenient" - includeConfig 'configs/node.config' - - singularity { - enabled = true - autoMounts = true - cacheDir = params.cachedir - } - includeConfig 'configs/container.config' - } - - ara { - params.cloudProcess = true - workDir = "/beegfs/rna-hta/$USER/work" - params.databases = "/beegfs/rna-hta/nextflow-clean-autodownload/" - conda { cacheDir = "/beegfs/rna-hta/$USER/nextflow-conda-cache" } - process { - clusterOptions = '--partition=s_standard,s_fat,b_standard,b_fat' - withLabel: smallTask { executor = 'local' } - } - executor { - name = "slurm" - queueSize = 200 - } - process.cache = "lenient" - includeConfig 'configs/node.config' - includeConfig 'configs/conda.config' - } - // cloud configs node { docker { enabled = true } From 95ce2eeb7c6b29ae8f7c1e826e4b5e0cc7323e03 Mon Sep 17 00:00:00 2001 From: martin Date: Sun, 10 Dec 2023 17:15:35 +0100 Subject: [PATCH 21/26] adjusted the README a bit --- README.md | 64 +++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index d0d8674..2ed3d59 100644 --- a/README.md +++ b/README.md @@ -110,41 +110,41 @@ Included in this repository are: ## Results -Running the pipeline will create a directory called `results/` in the current directory with some or all of the following directories and files: +Running the pipeline will create a directory called `results/` in the current directory with some or all of the following directories and files (plus additional failes for indices, ...): ```text results/ - clean/ - .fastq.gz - removed/ - .fastq.gz - intermedate/ - map-to-remove/ - mapped.fastq.gz - unmapped.fastq.gz - mapped.bam - unmapped.bam - dcs-strict/ - no-dcs.bam - true-dcs.bam - false-dcs.bam - soft-clipped/ - soft-clipped.bam - passed-clipped.bam - map-to-keep/ - mapped.fastq.gz - unmapped.fastq.gz - mapped.bam - unmapped.bam - dcs-strict/ - no-dcs.bam - true-dcs.bam - false-dcs.bam - soft-clipped/ - soft-clipped.bam - passed-clipped.bam - logs/\*.html - qc/multiqc_report.html +├── clean/ +│ └── .fastq.gz +├── removed/ +│ └── .fastq.gz +├── intermediate/ +│ ├── map-to-remove/ +│ │ ├── .mapped.fastq.gz +│ │ ├── .unmapped.fastq.gz +│ │ ├── .mapped.bam +│ │ ├── .unmapped.bam +│ │ ├── strict-dcs/ +│ │ │ ├── .no-dcs.bam +│ │ │ ├── .true-dcs.bam +│ │ │ └── .false-dcs.bam +│ │ └── soft-clipped/ +│ │ ├── .soft-clipped.bam +│ │ └── .passed-clipped.bam +│ └── map-to-keep/ +│ ├── .mapped.fastq.gz +│ ├── .unmapped.fastq.gz +│ ├── .mapped.bam +│ ├── .unmapped.bam +│ ├── strict-dcs/ +│ │ ├── .no-dcs.bam +│ │ ├── .true-dcs.bam +│ │ └── .false-dcs.bam +│ └── soft-clipped/ +│ ├── .soft-clipped.bam +│ └── .passed-clipped.bam +├── logs/*.html +└── qc/multiqc_report.html ``` The most important files you are likely interested in are `results/clean/.fastq.gz`, which are the "cleaned" reads. These are the input reads that *do not* map to the host, control, own fasta or rRNA files (or the subset of these that you provided), plus those reads that map to the "keep" sequence if you used the `--keep` option. Any files that were removed from your input fasta file are placed in `results/removed/.fastq.gz`. From 975e05ab49dc7d4ed6471e1f0ddaa8639ef5d02c Mon Sep 17 00:00:00 2001 From: martin Date: Sun, 10 Dec 2023 22:33:43 +0100 Subject: [PATCH 22/26] change sed command for --keep and add empty BED channel for phix control --- clean.nf | 1 + modules/prepare_contamination.nf | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/clean.nf b/clean.nf index e68d5f5..2ba3dc3 100755 --- a/clean.nf +++ b/clean.nf @@ -120,6 +120,7 @@ if ( params.input_type == 'illumina' ) { if ( params.control ) { if ( 'phix' in params.control.split(',') ) { illuminaControlFastaChannel = Channel.fromPath( workflow.projectDir + '/data/controls/phix.fa.gz' , checkIfExists: true ) + nanoControlBedChannel = [] } else { illuminaControlFastaChannel = Channel.empty() } if ( 'dcs' in params.control.split(',') ) { nanoControlFastaChannel = Channel.fromPath( workflow.projectDir + '/data/controls/dcs.fa.gz' , checkIfExists: true ) diff --git a/modules/prepare_contamination.nf b/modules/prepare_contamination.nf index 45cb44e..0991342 100644 --- a/modules/prepare_contamination.nf +++ b/modules/prepare_contamination.nf @@ -61,8 +61,8 @@ process check_own { """ # -L for following a symbolic link if ! ( file -L $fasta | grep -q 'BGZF; gzip compatible\\|gzip compressed' ); then - sed -i -e '\$a\\' ${fasta} - bgzip -@ ${task.cpus} < ${fasta} > ${fasta}.gz + sed -e '\$a\\' ${fasta} > ${fasta}.tmp + bgzip -@ ${task.cpus} < ${fasta}.tmp > ${fasta}.gz # now $fasta'.gz' else mv ${fasta} ${fasta}.tmp From 814e37b9726272aa2d05d2e222e2bff374d1ec65 Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Mon, 11 Dec 2023 14:56:54 +0100 Subject: [PATCH 23/26] Publish host genome and index for loading into IGV. Skipped if intermediate files are skipped. --- modules/prepare_contamination.nf | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/modules/prepare_contamination.nf b/modules/prepare_contamination.nf index 0991342..85657b2 100644 --- a/modules/prepare_contamination.nf +++ b/modules/prepare_contamination.nf @@ -77,18 +77,33 @@ process check_own { process concat_contamination { label 'minimap2' - + + publishDir ( + path: "${params.output}/intermediate", + mode: params.publish_dir_mode, + pattern: "db.fa.gz", + enabled: !params.no_intermediate, + saveAs: { "host.fa.gz" } + ) + publishDir ( + path: "${params.output}/intermediate", + mode: params.publish_dir_mode, + pattern: "db.fa.fai", + enabled: !params.no_intermediate, + saveAs: { "host.fa.fai" } + ) + input: path fastas output: path 'db.fa.gz', emit: fa path 'db.fa.fai', emit: fai - + script: len = fastas.collect().size() """ - if [[ ${len} -gt 1 ]] + if [[ ${len} -gt 1 ]] then for FASTA in ${fastas} do From 5d87ac41e68ea44a9848e554bbdb433dfe486fb4 Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Mon, 11 Dec 2023 17:53:30 +0100 Subject: [PATCH 24/26] Switch to seqkit for check_own and concat_contamination --- envs/seqkit.yaml | 2 ++ modules/prepare_contamination.nf | 31 ++++++------------------------- 2 files changed, 8 insertions(+), 25 deletions(-) diff --git a/envs/seqkit.yaml b/envs/seqkit.yaml index 3dc07d1..d69fd7c 100644 --- a/envs/seqkit.yaml +++ b/envs/seqkit.yaml @@ -4,3 +4,5 @@ channels: - conda-forge dependencies: - seqkit==2.6.1 + - tabix==1.11 + - samtools==1.18 diff --git a/modules/prepare_contamination.nf b/modules/prepare_contamination.nf index 85657b2..5eca4ec 100644 --- a/modules/prepare_contamination.nf +++ b/modules/prepare_contamination.nf @@ -49,7 +49,7 @@ process download_host { } process check_own { - label 'minimap2' + label 'seqkit' input: path fasta @@ -59,15 +59,7 @@ process check_own { script: """ - # -L for following a symbolic link - if ! ( file -L $fasta | grep -q 'BGZF; gzip compatible\\|gzip compressed' ); then - sed -e '\$a\\' ${fasta} > ${fasta}.tmp - bgzip -@ ${task.cpus} < ${fasta}.tmp > ${fasta}.gz - # now $fasta'.gz' - else - mv ${fasta} ${fasta}.tmp - zcat ${fasta}.tmp | sed -e '\$a\\' | bgzip -@ ${task.cpus} -c > ${fasta}.gz - fi + seqkit seq ${fasta} -o ${fasta}.gz """ stub: """ @@ -76,7 +68,7 @@ process check_own { } process concat_contamination { - label 'minimap2' + label 'seqkit' publishDir ( path: "${params.output}/intermediate", @@ -101,21 +93,10 @@ process concat_contamination { path 'db.fa.fai', emit: fai script: - len = fastas.collect().size() """ - if [[ ${len} -gt 1 ]] - then - for FASTA in ${fastas} - do - NAME="\${FASTA%%.*}" - zcat \$FASTA | awk -v n=\$NAME '/>/{sub(">","&"n"_")}1' | bgzip -@ ${task.cpus} -c >> db.fa.gz - done - else - mv ${fastas} db.fa.gz - fi - - samtools faidx db.fa.gz - mv db.fa.gz.fai db.fa.fai + # Combine input files, rename duplicate sequences (by id) if found, and compress + seqkit seq ${fastas} | seqkit rename | bgzip -@ ${task.cpus} -c > db.fa.gz + samtools faidx db.fa.gz --gzi-idx db.fa.fai """ stub: """ From 4228c4b9f4f824a061ce631557aef3ce62a61e03 Mon Sep 17 00:00:00 2001 From: martin Date: Mon, 11 Dec 2023 20:21:57 +0100 Subject: [PATCH 25/26] bump seqkit container --- configs/container.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/container.config b/configs/container.config index 6d310ac..f713429 100644 --- a/configs/container.config +++ b/configs/container.config @@ -7,5 +7,5 @@ process { withLabel: nanoplot { container = 'nanozoo/nanoplot:1.32.0--1ae6f5d' } withLabel: quast { container = 'nanozoo/quast:5.0.2--e7f0cfe' } withLabel: bed_samtools { container = 'nanozoo/bed_samtools:2.30.0--cc7d1b9' } - withLabel: seqkit { container = 'quay.io/biocontainers/seqkit:2.6.1--h9ee0642_0' } + withLabel: seqkit { container = 'nanozoo/seqkit:2.6.1--022e008' } } From 26bd1d09438f44f4177585888530982de234d698 Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Wed, 13 Dec 2023 12:20:52 +0100 Subject: [PATCH 26/26] Update fastq_from_bam to publish illumina data properly. Regexes are geting crazy, need to rethink this. --- modules/alignment_processing.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/alignment_processing.nf b/modules/alignment_processing.nf index 0f908f7..ca3b693 100644 --- a/modules/alignment_processing.nf +++ b/modules/alignment_processing.nf @@ -152,6 +152,8 @@ process fastq_from_bam { fn.matches('.*.mapped_merged.fast[aq].gz$') ? "removed/${fn}".replaceAll(~'.mapped_merged(.fast[aq].gz)$', '$1') : fn.matches('.*.unmapped_merged_merged.fast[aq].gz$') ? "clean/${fn}".replaceAll(~'.unmapped_merged_merged(.fast[aq].gz)$', '$1') : fn.matches('.*.soft-clipped_merged.fast[aq].gz$') ? "clean/${fn}".replaceAll(~'.soft-clipped_merged(.fast[aq].gz)$', '$1') : + fn.matches('.*.unmapped_(1|2|singleton).fast[aq].gz$') ? "clean/${fn}".replaceAll(~'.unmapped_(1|2|singleton)(.fast[aq].gz)$', '_$1$2') : + fn.matches('.*.mapped_(1|2|singleton).fast[aq].gz$') ? "removed/${fn}".replaceAll(~'.mapped_(1|2|singleton)(.fast[aq].gz)$', '_$1$2') : fn } )