From 49202ab4de718eafdd6fc9e07a46919ece7c8872 Mon Sep 17 00:00:00 2001
From: d4straub <daniel.straub@uni-tuebingen.de>
Date: Tue, 19 Nov 2024 14:35:12 +0100
Subject: [PATCH] Add param --quality_type

---
 CHANGELOG.md                     |  2 ++
 conf/modules.config              |  6 ++++--
 modules/local/dada2_denoising.nf | 21 ++++++++++++++++++---
 nextflow.config                  |  1 +
 nextflow_schema.json             |  8 ++++++++
 5 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1ecf1f6b..7c690054 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Added`
 
+- [#801](https://github.com/nf-core/ampliseq/pull/801) - Parameter `--quality_type` allows specifying the type of quality scores in raw read data; the default is `Auto`, so default behavior is unchanged
+
 ### `Changed`
 
 ### `Fixed`
diff --git a/conf/modules.config b/conf/modules.config
index 5f940e57..0eafbd97 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -139,7 +139,8 @@ process {
     max_len = params.max_len ?: "Inf"
     withName: DADA2_FILTNTRIM {
         ext.args = [
-            'maxN = 0, truncQ = 2, trimRight = 0, minQ = 0, rm.lowcomplex = 0, orient.fwd = NULL, matchIDs = FALSE, id.sep = "\\\\s", id.field = NULL, n = 1e+05, OMP = TRUE, qualityType = "Auto"',
+            'maxN = 0, truncQ = 2, trimRight = 0, minQ = 0, rm.lowcomplex = 0, orient.fwd = NULL, matchIDs = FALSE, id.sep = "\\\\s", id.field = NULL, n = 1e+05, OMP = TRUE',
+            "qualityType = \"${params.quality_type}\"",
             params.pacbio || params.iontorrent || params.single_end ? "maxEE = ${params.max_ee}" : "maxEE = c(${params.max_ee}, ${params.max_ee})",
             params.pacbio ? "trimLeft = 0, minLen = ${params.min_len}, maxLen = $max_len, rm.phix = FALSE" :
                 params.iontorrent ? "trimLeft = 15, minLen = ${params.min_len}, maxLen = $max_len, rm.phix = TRUE" :
@@ -179,7 +180,8 @@ process {
         ext.seed = "${params.seed}"
         ext.prefix = { meta.region ? "region-${meta.region}_run-${meta.run}" : "${meta.run}" }
         ext.args = [
-            'nbases = 1e8, nreads = NULL, randomize = TRUE, MAX_CONSIST = 10, OMEGA_C = 0, qualityType = "Auto"',
+            'nbases = 1e8, nreads = NULL, randomize = TRUE, MAX_CONSIST = 10, OMEGA_C = 0',
+            "qualityType = \"${params.quality_type}\"",
             params.pacbio ? "errorEstimationFunction = PacBioErrfun" : "errorEstimationFunction = loessErrfun"
         ].join(',').replaceAll('(,)*$', "")
         publishDir = [
diff --git a/modules/local/dada2_denoising.nf b/modules/local/dada2_denoising.nf
index 637bd898..8e551250 100644
--- a/modules/local/dada2_denoising.nf
+++ b/modules/local/dada2_denoising.nf
@@ -24,6 +24,7 @@ process DADA2_DENOISING {
 
     script:
     def prefix = task.ext.prefix ?: "prefix"
+    def quality_type = task.ext.quality_type ?: params.quality_type ?: "Auto" // ext override first, then the pipeline param, so --quality_type reaches dada() even when no ext.quality_type is configured
     def args = task.ext.args ?: ''
     def args2 = task.ext.args2 ?: ''
     if (!meta.single_end) {
@@ -39,9 +40,17 @@ process DADA2_DENOISING {
 
         #denoising
         sink(file = "${prefix}.dada.log")
-        dadaFs <- dada(filtFs, err = errF, $args, multithread = $task.cpus)
+        if ("${quality_type}" == "Auto") {
+            # Avoid using memory-inefficient derepFastq() if not necessary
+            dadaFs <- dada(filtFs, err = errF, $args, multithread = $task.cpus)
+            dadaRs <- dada(filtRs, err = errR, $args, multithread = $task.cpus)
+        } else {
+            derepFs <- derepFastq(filtFs, qualityType="${quality_type}")
+            dadaFs <- dada(derepFs, err = errF, $args, multithread = $task.cpus)
+            derepRs <- derepFastq(filtRs, qualityType="${quality_type}")
+            dadaRs <- dada(derepRs, err = errR, $args, multithread = $task.cpus)
+        }
         saveRDS(dadaFs, "${prefix}_1.dada.rds")
-        dadaRs <- dada(filtRs, err = errR, $args, multithread = $task.cpus)
         saveRDS(dadaRs, "${prefix}_2.dada.rds")
         sink(file = NULL)
 
@@ -66,7 +75,13 @@ process DADA2_DENOISING {
 
         #denoising
         sink(file = "${prefix}.dada.log")
-        dadaFs <- dada(filtFs, err = errF, $args, multithread = $task.cpus)
+        if ("${quality_type}" == "Auto") {
+            # Avoid using memory-inefficient derepFastq() if not necessary
+            dadaFs <- dada(filtFs, err = errF, $args, multithread = $task.cpus)
+        } else {
+            derepFs <- derepFastq(filtFs, qualityType="${quality_type}")
+            dadaFs <- dada(derepFs, err = errF, $args, multithread = $task.cpus)
+        }
         saveRDS(dadaFs, "${prefix}.dada.rds")
         sink(file = NULL)
 
diff --git a/nextflow.config b/nextflow.config
index 1982ab26..fa91dcab 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -17,6 +17,7 @@ params {
     extension                  = "/*_R{1,2}_001.fastq.gz"
     pacbio                     = false
     iontorrent                 = false
+    quality_type               = "Auto"
     FW_primer                  = null
     RV_primer                  = null
     classifier                 = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index e0b263b2..3c09f481 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -118,6 +118,14 @@
                     "help_text": "This will cause the pipeline to\n- not truncate input reads if not `--trunclenf` and `--trunclenr` are overwriting defaults\n- remove reverse complement primers from the end of reads in case the read length exceeds the amplicon length",
                     "fa_icon": "fas fa-align-justify"
                 },
+                "quality_type": {
+                    "type": "string",
+                    "default": "Auto",
+                    "description": "Type of quality scores in raw read data",
+                    "help_text": "From R package 'ShortRead' function 'readFastq': Representation to be used for quality scores, must be one of `Auto` (infer automatically), `FastqQuality` (Phred-like base 33 encoding), `SFastqQuality` (Illumina base 64 encoding).",
+                    "enum": ["Auto", "FastqQuality", "SFastqQuality"],
+                    "fa_icon": "fab fa-amilia"
+                },
                 "multiple_sequencing_runs": {
                     "type": "boolean",
                     "description": "If using `--input_folder`: samples were sequenced in multiple sequencing runs",