From 49202ab4de718eafdd6fc9e07a46919ece7c8872 Mon Sep 17 00:00:00 2001 From: d4straub Date: Tue, 19 Nov 2024 14:35:12 +0100 Subject: [PATCH] Add param --quality_type --- CHANGELOG.md | 2 ++ conf/modules.config | 6 ++++-- modules/local/dada2_denoising.nf | 21 ++++++++++++++++++--- nextflow.config | 1 + nextflow_schema.json | 8 ++++++++ 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ecf1f6b..7c690054 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` +- [#801](https://github.com/nf-core/ampliseq/pull/801) - Parameter `--quality_type` allows specifying the type of quality scores in raw read data, by default `Auto` (i.e. default behavior did not change) + ### `Changed` ### `Fixed` diff --git a/conf/modules.config b/conf/modules.config index 5f940e57..0eafbd97 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -139,7 +139,8 @@ process { max_len = params.max_len ?: "Inf" withName: DADA2_FILTNTRIM { ext.args = [ - 'maxN = 0, truncQ = 2, trimRight = 0, minQ = 0, rm.lowcomplex = 0, orient.fwd = NULL, matchIDs = FALSE, id.sep = "\\\\s", id.field = NULL, n = 1e+05, OMP = TRUE, qualityType = "Auto"', + 'maxN = 0, truncQ = 2, trimRight = 0, minQ = 0, rm.lowcomplex = 0, orient.fwd = NULL, matchIDs = FALSE, id.sep = "\\\\s", id.field = NULL, n = 1e+05, OMP = TRUE', + "qualityType = \"${params.quality_type}\"", params.pacbio || params.iontorrent || params.single_end ? "maxEE = ${params.max_ee}" : "maxEE = c(${params.max_ee}, ${params.max_ee})", params.pacbio ? "trimLeft = 0, minLen = ${params.min_len}, maxLen = $max_len, rm.phix = FALSE" : params.iontorrent ? "trimLeft = 15, minLen = ${params.min_len}, maxLen = $max_len, rm.phix = TRUE" : @@ -179,7 +180,8 @@ process { ext.seed = "${params.seed}" ext.prefix = { meta.region ? "region-${meta.region}_run-${meta.run}" : "${meta.run}" } ext.args = [ - 'nbases = 1e8, nreads = NULL, randomize = TRUE, MAX_CONSIST = 10, OMEGA_C = 0, qualityType = "Auto"', + 'nbases = 1e8, nreads = NULL, randomize = TRUE, MAX_CONSIST = 10, OMEGA_C = 0', + "qualityType = \"${params.quality_type}\"", params.pacbio ? "errorEstimationFunction = PacBioErrfun" : "errorEstimationFunction = loessErrfun" ].join(',').replaceAll('(,)*$', "") publishDir = [ diff --git a/modules/local/dada2_denoising.nf b/modules/local/dada2_denoising.nf index 637bd898..8e551250 100644 --- a/modules/local/dada2_denoising.nf +++ b/modules/local/dada2_denoising.nf @@ -24,6 +24,7 @@ process DADA2_DENOISING { script: def prefix = task.ext.prefix ?: "prefix" + def quality_type = task.ext.quality_type ?: "Auto" def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' if (!meta.single_end) { @@ -39,9 +40,17 @@ process DADA2_DENOISING { #denoising sink(file = "${prefix}.dada.log") - dadaFs <- dada(filtFs, err = errF, $args, multithread = $task.cpus) + if ("${quality_type}" == "Auto") { + # Avoid using memory-inefficient derepFastq() if not necessary + dadaFs <- dada(filtFs, err = errF, $args, multithread = $task.cpus) + dadaRs <- dada(filtRs, err = errR, $args, multithread = $task.cpus) + } else { + derepFs <- derepFastq(filtFs, qualityType="${quality_type}") + dadaFs <- dada(derepFs, err = errF, $args, multithread = $task.cpus) + derepRs <- derepFastq(filtRs, qualityType="${quality_type}") + dadaRs <- dada(derepRs, err = errR, $args, multithread = $task.cpus) + } saveRDS(dadaFs, "${prefix}_1.dada.rds") - dadaRs <- dada(filtRs, err = errR, $args, multithread = $task.cpus) saveRDS(dadaRs, "${prefix}_2.dada.rds") sink(file = NULL) @@ -66,7 +75,13 @@ process DADA2_DENOISING { #denoising sink(file = "${prefix}.dada.log") - dadaFs <- dada(filtFs, err = errF, $args, multithread = $task.cpus) + if ("${quality_type}" == "Auto") { + # Avoid using memory-inefficient derepFastq() if not necessary + dadaFs <- dada(filtFs, err = errF, $args, multithread = $task.cpus) + } else { + derepFs <- derepFastq(filtFs, qualityType="${quality_type}") + dadaFs <- dada(derepFs, err = errF, $args, multithread = $task.cpus) + } saveRDS(dadaFs, "${prefix}.dada.rds") sink(file = NULL) diff --git a/nextflow.config b/nextflow.config index 1982ab26..fa91dcab 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,6 +17,7 @@ params { extension = "/*_R{1,2}_001.fastq.gz" pacbio = false iontorrent = false + quality_type = "Auto" FW_primer = null RV_primer = null classifier = null diff --git a/nextflow_schema.json b/nextflow_schema.json index e0b263b2..3c09f481 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -118,6 +118,14 @@ "help_text": "This will cause the pipeline to\n- not truncate input reads if not `--trunclenf` and `--trunclenr` are overwriting defaults\n- remove reverse complement primers from the end of reads in case the read length exceeds the amplicon length", "fa_icon": "fas fa-align-justify" }, + "quality_type": { + "type": "string", + "default": "Auto", + "description": "Type of quality scores in raw read data", + "help_text": "From R package 'ShortRead' function 'readFastq': Representation to be used for quality scores, must be one of `Auto` (infer automatically), `FastqQuality` (Phred-like base 33 encoding), `SFastqQuality` (Illumina base 64 encoding).", + "enum": ["Auto", "FastqQuality", "SFastqQuality"], + "fa_icon": "fab fa-amilia" + }, "multiple_sequencing_runs": { "type": "boolean", "description": "If using `--input_folder`: samples were sequenced in multiple sequencing runs",