From 6563d8f18d6e7ab0c7a34ac3df1e1055f2cdc58e Mon Sep 17 00:00:00 2001 From: adamrtalbot <12817534+adamrtalbot@users.noreply.github.com> Date: Sat, 2 Nov 2024 12:26:01 +0000 Subject: [PATCH] Adds fq/lint for early validation of FASTQs Validating FASTQs early prevents running the pipeline on invalid FASTQ files, which will make the pipeline more efficient at achieving its ultimate objective of checking FASTQ validity. It adds 3 more parameters: - `--skip_linting` which disables the linting of FASTQs - `--fq_lint_args` which is a string of arguments to pass to the linting tool - `--continue_with_lint_fail` which is a boolean to determine whether to continue if the linting fails Between these three options the user has a high degree of control over how the pipeline lints, which should handle most use cases. Closes #31 --- conf/modules.config | 9 ++ modules.json | 5 ++ modules/nf-core/fq/lint/environment.yml | 5 ++ modules/nf-core/fq/lint/main.nf | 33 +++++++ modules/nf-core/fq/lint/meta.yml | 43 +++++++++ modules/nf-core/fq/lint/tests/main.nf.test | 63 +++++++++++++ .../nf-core/fq/lint/tests/main.nf.test.snap | 25 ++++++ modules/nf-core/fq/lint/tests/tags.yml | 2 + nextflow.config | 7 ++ nextflow_schema.json | 44 +++++++++- tests/rnaseq.main.nf.test | 88 +++++++++++++++++++ tests/rnaseq.main.nf.test.config | 8 ++ workflows/seqinspector.nf | 20 +++++ 13 files changed, 348 insertions(+), 4 deletions(-) create mode 100644 modules/nf-core/fq/lint/environment.yml create mode 100644 modules/nf-core/fq/lint/main.nf create mode 100644 modules/nf-core/fq/lint/meta.yml create mode 100644 modules/nf-core/fq/lint/tests/main.nf.test create mode 100644 modules/nf-core/fq/lint/tests/main.nf.test.snap create mode 100644 modules/nf-core/fq/lint/tests/tags.yml create mode 100644 tests/rnaseq.main.nf.test create mode 100644 tests/rnaseq.main.nf.test.config diff --git a/conf/modules.config b/conf/modules.config index d3c597b..01f75ef 100644 --- a/conf/modules.config +++ 
b/conf/modules.config @@ -18,6 +18,15 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] + withName: 'FQ_LINT' { + ext.args = { params.fq_lint_args } + errorStrategy = { + task.exitStatus in ((130..145) + 104) ? 'retry' : + params.continue_with_lint_fail ? 'ignore' : + 'finish' + } + } + withName: SEQTK_SAMPLE { ext.args = '-s100' } diff --git a/modules.json b/modules.json index 7e57ea1..c78482c 100644 --- a/modules.json +++ b/modules.json @@ -10,6 +10,11 @@ "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, + "fq/lint": { + "branch": "master", + "git_sha": "a1abf90966a2a4016d3c3e41e228bfcbd4811ccc", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", diff --git a/modules/nf-core/fq/lint/environment.yml b/modules/nf-core/fq/lint/environment.yml new file mode 100644 index 0000000..74b1460 --- /dev/null +++ b/modules/nf-core/fq/lint/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::fq=0.12.0 diff --git a/modules/nf-core/fq/lint/main.nf b/modules/nf-core/fq/lint/main.nf new file mode 100644 index 0000000..943314c --- /dev/null +++ b/modules/nf-core/fq/lint/main.nf @@ -0,0 +1,33 @@ +process FQ_LINT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/fq:0.12.0--h9ee0642_0': + 'biocontainers/fq:0.12.0--h9ee0642_0' }" + + input: + tuple val(meta), path(fastq) + + output: + tuple val(meta), path("*.fq_lint.txt"), emit: lint + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + fq lint \\ + $args \\ + $fastq > ${prefix}.fq_lint.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fq: \$(echo \$(fq lint --version | sed 's/fq-lint //g')) + END_VERSIONS + """ +} diff --git a/modules/nf-core/fq/lint/meta.yml b/modules/nf-core/fq/lint/meta.yml new file mode 100644 index 0000000..7240fb5 --- /dev/null +++ b/modules/nf-core/fq/lint/meta.yml @@ -0,0 +1,43 @@ +name: "fq_lint" +description: fq lint is a FASTQ file pair validator. +keywords: + - lint + - fastq + - validate +tools: + - "fq": + description: "fq is a library to generate and validate FASTQ file pairs." + homepage: "https://github.com/stjude-rust-labs/fq" + documentation: "https://github.com/stjude-rust-labs/fq" + tool_dev_url: "https://github.com/stjude-rust-labs/fq" + licence: ["MIT"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fastq: + type: file + description: FASTQ file list + pattern: "*.fastq{,.gz}" +output: + - lint: + - meta: + type: file + description: Lint output + pattern: "*.fq_lint.txt" + - "*.fq_lint.txt": + type: file + description: Lint output + pattern: "*.fq_lint.txt" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@adamrtalbot" +maintainers: + - "@adamrtalbot" diff --git a/modules/nf-core/fq/lint/tests/main.nf.test b/modules/nf-core/fq/lint/tests/main.nf.test new file mode 100644 index 0000000..ec2eaf8 --- /dev/null +++ b/modules/nf-core/fq/lint/tests/main.nf.test @@ -0,0 +1,63 @@ +nextflow_process { + + name "Test Process FQ_LINT" + script "../main.nf" + process "FQ_LINT" + + tag "modules" + tag "modules_nfcore" + tag "fq" + tag "fq/lint" + + test("test_fq_lint_success") { + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert process.out.lint.get(0).get(1) ==~ ".*/test.fq_lint.txt" }, + { assert path(process.out.lint.get(0).get(1)).getText().contains("fq-lint start") }, + { assert path(process.out.lint.get(0).get(1)).getText().contains("read 100 records") }, + { assert path(process.out.lint.get(0).get(1)).getText().contains("fq-lint end") }, + ) + } + + } + + test("test_fq_lint_fail") { + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + 
file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert !process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } + +} diff --git a/modules/nf-core/fq/lint/tests/main.nf.test.snap b/modules/nf-core/fq/lint/tests/main.nf.test.snap new file mode 100644 index 0000000..fec8e52 --- /dev/null +++ b/modules/nf-core/fq/lint/tests/main.nf.test.snap @@ -0,0 +1,25 @@ +{ + "test_fq_lint_fail": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "lint": [ + + ], + "versions": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-19T16:37:02.133847389" + } +} \ No newline at end of file diff --git a/modules/nf-core/fq/lint/tests/tags.yml b/modules/nf-core/fq/lint/tests/tags.yml new file mode 100644 index 0000000..9c9c323 --- /dev/null +++ b/modules/nf-core/fq/lint/tests/tags.yml @@ -0,0 +1,2 @@ +fq/lint: + - modules/nf-core/fq/lint/** diff --git a/nextflow.config b/nextflow.config index 38eb312..8ddfe42 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,13 @@ params { // Input options input = null sample_size = 0 + + // Options + skip_linting = false + fq_lint_args = "" + continue_with_lint_fail = false + + // References genome = null fasta = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 49742b2..3d1c62d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -31,7 +34,6 @@ }, "outdir": { "type": "string", - "default": null, "format": "directory-path", "description": "The output directory where the results will be saved. 
You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" @@ -50,6 +52,30 @@ } } }, + "validation_options": { + "title": "Validation options", + "type": "object", + "description": "Options for validating and screening FASTQ files.", + "default": "", + "properties": { + "skip_linting": { + "type": "boolean", + "default": false, + "description": "Skip linting the FASTQs before performing QC on the sequences", + "help_text": "By default, FASTQ files are linted with FQ early in the pipeline. If they fail validation, the pipeline will terminate, preventing expensive quality control steps being performed on the other samples. If `--continue_with_lint_fail` is enabled, quality control will be performed on the remaining samples. Set this parameter to skip linting entirely." + }, + "fq_lint_args": { + "type": "string", + "description": "Arguments to pass to FQ lint", + "help_text": "Arguments to pass to FQ lint. This can be used to disable overly strict linting. See https://github.com/stjude-rust-labs/fq?tab=readme-ov-file#lint for more information." + }, + "continue_with_lint_fail": { + "type": "boolean", + "description": "Whether to continue with the pipeline if linting fails for a single sample.", + "help_text": "If set to true, the pipeline will continue with the remaining samples if linting fails for a single sample. If set to false, the pipeline will terminate if linting fails for a single sample." + } + } + }, "reference_genome_options": { "title": "Reference genome options", "type": "object", @@ -156,7 +182,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -233,6 +266,9 @@ { "$ref": "#/$defs/input_output_options" }, + { + "$ref": "#/$defs/validation_options" + }, { "$ref": "#/$defs/reference_genome_options" }, @@ -243,4 +279,4 @@ "$ref": "#/$defs/generic_options" } ] -} +} \ No newline at end of file diff --git a/tests/rnaseq.main.nf.test b/tests/rnaseq.main.nf.test new file mode 100644 index 0000000..3994533 --- /dev/null +++ b/tests/rnaseq.main.nf.test @@ -0,0 +1,88 @@ +nextflow_pipeline { + + name "Test Workflow main.nf on NovaSeq6000 data" + script "../main.nf" + tag "seqinspector" + tag "PIPELINE" + + test("rnaseq data test fail linting") { + + when { + config "./rnaseq.main.nf.test.config" + params { + outdir = "$outputDir" + } + } + + then { + assertAll( + // Linting should fail! + { assert workflow.failed }, + ) + } + } + + test("rnaseq data test skip linting") { + + when { + config "./rnaseq.main.nf.test.config" + params { + outdir = "$outputDir" + skip_linting = true + } + } + + then { + assertAll( + // Linting should fail! 
+ { assert workflow.failed }, + ) + } + } + + test("rnaseq data test ignore linting") { + + when { + config "./rnaseq.main.nf.test.config" + params { + outdir = "$outputDir" + continue_with_lint_fail = true + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_general_stats.txt") + ) + }, + ) + } + } + + test("rnaseq data test add args to fq/lint") { + + when { + config "./rnaseq.main.nf.test.config" + params { + outdir = "$outputDir" + fq_lint_args = "--disable-validator P001" + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_general_stats.txt") + ) + }, + ) + } + } +} diff --git a/tests/rnaseq.main.nf.test.config b/tests/rnaseq.main.nf.test.config new file mode 100644 index 0000000..f85acb5 --- /dev/null +++ b/tests/rnaseq.main.nf.test.config @@ -0,0 +1,8 @@ +// Load the basic test config +includeConfig 'nextflow.config' + +// Load the correct samplesheet for that test +params { + input = params.pipelines_testdata_base_path + '626c8fab639062eade4b10747e919341cbf9b41a/samplesheet/v3.10/samplesheet_test.csv' + +} diff --git a/workflows/seqinspector.nf b/workflows/seqinspector.nf index 7a2dfae..ceb35fc 100644 --- a/workflows/seqinspector.nf +++ b/workflows/seqinspector.nf @@ -5,7 +5,9 @@ */ include { SEQTK_SAMPLE } from '../modules/nf-core/seqtk/sample/main' +include { FQ_LINT } from '../modules/nf-core/fq/lint/main' include { FASTQC } from '../modules/nf-core/fastqc/main' + include { MULTIQC as MULTIQC_GLOBAL } from '../modules/nf-core/multiqc/main' include { MULTIQC as 
MULTIQC_PER_TAG } from '../modules/nf-core/multiqc/main' @@ -32,6 +34,24 @@ workflow SEQINSPECTOR { ch_multiqc_extra_files = Channel.empty() ch_multiqc_reports = Channel.empty() + // + // MODULE: Run FQ_LINT to catch early errors + // + if ( !params.skip_linting ) { + FQ_LINT ( + ch_samplesheet + ) + ch_versions = ch_versions.mix(FQ_LINT.out.versions.first()) + // This catches all FASTQs that pass linting + // If you use an error strategy that allows FQ_LINT to fail, + // only valid FASTQ files will be passed to the next module + ch_samplesheet = FQ_LINT.out.lint + .join(ch_samplesheet) + .map { meta, fq_lint, reads -> + [meta, reads] + } + } + // // MODULE: Run Seqtk sample to perform subsampling //