Try adding nextflow strict #461

Merged: 16 commits, Apr 4, 2024
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### `Added`

- [#454](https://github.com/nf-core/taxprofiler/pull/454) Updated to nf-core pipeline template v2.13.1 (added by @LilyAnderssonLee & @sofstam)
- [#461](https://github.com/nf-core/taxprofiler/pull/461) Turned on 'strict' Nextflow evaluation for all runs (added by @jfy133)
- [#461](https://github.com/nf-core/taxprofiler/pull/461) Optimised database decompression so each compressed input database is untarred only once and shared amongst runs with different parameters (added by @jfy133)
- [#461](https://github.com/nf-core/taxprofiler/pull/461) Added new parameter to optionally save uncompressed databases (added by @jfy133)

### `Fixed`

17 changes: 17 additions & 0 deletions conf/modules.config
@@ -18,6 +18,15 @@ process {
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]

withName: UNTAR {
ext.prefix = { "${archive.simpleName}" }
publishDir = [
path: { "${params.outdir}/untar/databases" },
mode: params.publish_dir_mode,
enabled: params.save_untarred_databases
]
}
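The `ext.prefix = { "${archive.simpleName}" }` line names each untarred output directory after its archive file. Nextflow's `simpleName` strips everything from the first dot onwards, which a rough Python equivalent (a sketch, not the pipeline's code) makes concrete:

```python
def simple_name(path):
    """Mimic Nextflow's Path.simpleName: the file name up to the first '.'.

    So 'k2_standard.tar.gz' yields 'k2_standard', not 'k2_standard.tar'.
    """
    return path.rsplit("/", 1)[-1].split(".", 1)[0]
```

So a database archive `malt95.tar.gz` would be published as a directory named `malt95` under `untar/databases`.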

withName: FASTQC {
ext.args = '--quiet'
ext.prefix = { "${meta.id}_${meta.run_accession}_raw" }
@@ -512,6 +521,14 @@ process {
]
}

withName: KRAKENTOOLS_KREPORT2KRONA {
publishDir = [
enabled: false,
mode: params.publish_dir_mode,
pattern: '*.txt'
]
}

withName: KRONA_CLEANUP {
ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}" }
publishDir = [
16 changes: 16 additions & 0 deletions docs/output.md
@@ -10,6 +10,7 @@ The directories listed below will be created in the results directory after the

The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:

- [UNTAR](#untar) - Decompressed input databases (optionally saved)
- [FastQC](#fastqc) - Raw read QC
- [falco](#fastqc) - Alternative to FastQC for raw read QC
- [fastp](#fastp) - Adapter trimming for Illumina data
@@ -40,6 +41,21 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

![](images/taxprofiler_tube.png)

### untar

untar is used in nf-core/taxprofiler to decompress various input files ending in `.tar.gz`. This process is mainly used for decompressing input database archive files.

<details markdown="1">
<summary>Output files</summary>

- `untar/`
- `database/`
- `<database_file_name>`: directory containing contents of the decompressed archive

</details>

This directory will only be present if `--save_untarred_databases` is supplied. The decompressed directories can be moved to a central 'cache' location, allowing users to re-use the same databases across runs and avoid the unnecessary computational cost of decompressing the archives every time.
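One way to re-use such a cache is to point the database sheet's path column at the cached directory instead of the original archive. A minimal Python sketch, assuming a `db_path` column name (check your own sheet's header, which may differ):

```python
import pathlib


def point_sheet_at_cache(rows, cache_dir):
    """Rewrite archive paths (.tar.gz) to their cached untarred directories."""
    updated = []
    for row in rows:
        path = pathlib.Path(row["db_path"])
        if path.name.endswith(".tar.gz"):
            # the untarred directory is named after the archive minus its extensions
            cached = pathlib.Path(cache_dir) / path.name[: -len(".tar.gz")]
            row = {**row, "db_path": str(cached)}
        updated.append(row)
    return updated
```

Entries that already point at plain directories are left untouched, so a sheet can mix cached and still-compressed databases.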

### FastQC or Falco

<details markdown="1">
8 changes: 6 additions & 2 deletions nextflow.config
@@ -6,6 +6,8 @@
----------------------------------------------------------------------------------------
*/

nextflow.enable.strict = true

// Global default params, used in configs
params {

@@ -30,6 +32,7 @@ params {
email_on_fail = null
plaintext_email = false
monochrome_logs = false
monochromeLogs = false // required so that nf-validation and nextflow.enable.strict work nicely together
hook_url = null
help = false
version = false
@@ -51,12 +54,13 @@ params {
// Schema validation default options
validationFailUnrecognisedParams = false
validationLenientMode = false
-    validationSchemaIgnoreParams = 'genomes,igenomes_base,fasta'
+    validationSchemaIgnoreParams = 'genomes,igenomes_base,fasta,monochromeLogs'
validationShowHiddenParams = false
validate_params = true

// Databases
-    databases = null
+    databases               = null
+    save_untarred_databases = false

// FASTQ preprocessing
skip_preprocessing_qc = false
6 changes: 6 additions & 0 deletions nextflow_schema.json
@@ -34,6 +34,12 @@
"description": "Path to comma-separated file containing information about databases and profiling parameters for each taxonomic profiler",
"help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/dev/usage#full-database-sheet).\n\nProfilers will only be executed if a corresponding database are supplied. \n\nWe recommend storing this database sheet somewhere centrally and accessible by others members of your lab/institutions, as this file will likely be regularly reused."
},
"save_untarred_databases": {
"type": "boolean",
"fa_icon": "fas fa-database",
"description": "Specify to save decompressed user-supplied TAR archives of databases",
"help_text": "If input databases are supplied as gzipped TAR archives, in some cases you may want to move and re-use these for future runs. Specifying this parameter will save these to `--outdir results/` under a directory called `untar`."
},
"outdir": {
"type": "string",
"format": "directory-path",
34 changes: 25 additions & 9 deletions workflows/taxprofiler.nf
@@ -155,21 +155,37 @@ workflow TAXPROFILER {
skip: true
}
     // Filter the channel to untar only those databases for tools that are selected to be run by the user.
-    ch_input_untar = ch_dbs_for_untar.untar
+    // Also, to ensure each file is untarred only once, group together all databases of one file
+    ch_inputdb_untar = ch_dbs_for_untar.untar
         .filter { db_meta, db_path ->
             params[ "run_${db_meta.tool}" ]
         }
-    UNTAR ( ch_input_untar )
-
-    ch_final_dbs = ch_dbs_for_untar.skip.mix( UNTAR.out.untar )
-    ch_final_dbs
-        .map { db_meta, db ->
-            def corrected_db_params = db_meta.db_params == null ? '' : db_meta.db_params
-            db_meta.db_params = corrected_db_params
-            [ db_meta, db ]
-        }
+        .groupTuple(by: 1)
+        .map {
+            meta, dbfile ->
+                def new_meta = [ 'id': dbfile.baseName ] + [ 'meta': meta ]
+                [ new_meta, dbfile ]
+        }
+
+    // Untar the databases
+    UNTAR ( ch_inputdb_untar )
+    ch_versions = ch_versions.mix( UNTAR.out.versions.first() )
+
+    // Spread out the untarred and shared databases
+    ch_outputdb_from_untar = UNTAR.out.untar
+        .map {
+            meta, db ->
+                [ meta.meta, db ]
+        }
+        .transpose(by: 0)
+
+    ch_final_dbs = ch_dbs_for_untar.skip
+        .mix( ch_outputdb_from_untar )
+        .map { db_meta, db ->
+            def corrected_db_params = db_meta.db_params ? [ db_params: db_meta.db_params ] : [ db_params: '' ]
+            [ db_meta + corrected_db_params, db ]
+        }
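The group-then-spread pattern in this hunk can be illustrated outside Nextflow. The sketch below (Python, with hypothetical field names mirroring the channel shapes) groups database entries by archive path so that each archive maps to exactly one untar task, then fans the shared untarred directory back out to every original database entry, much as `groupTuple` and `transpose` do here:

```python
from collections import defaultdict


def group_by_archive(entries):
    """One untar task per archive: collect all db metas that share a .tar.gz."""
    groups = defaultdict(list)
    for meta, archive in entries:
        groups[archive].append(meta)
    # id is derived from the archive file name; grouped metas are kept for later
    return [
        ({"id": archive.rsplit("/", 1)[-1].split(".", 1)[0], "meta": metas}, archive)
        for archive, metas in groups.items()
    ]


def spread_untarred(untarred):
    """Fan each shared untarred directory back out, one pair per original meta."""
    return [
        (meta, db_dir)
        for grouped_meta, db_dir in untarred
        for meta in grouped_meta["meta"]
    ]
```

Two databases defined on the same archive thus trigger a single decompression but still emit two downstream entries, each keeping its own `db_params`.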

/*
MODULE: Run FastQC
*/