diff --git a/CHANGELOG.md b/CHANGELOG.md index 6db05e56..17d95eea 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#220](https://github.com/nf-core/demultiplex/pull/220) Added kraken2. - [#221](https://github.com/nf-core/demultiplex/pull/221) Added checkqc_config to pipeline schema. - [#225](https://github.com/nf-core/demultiplex/pull/225) Added test profile for multi-lane samples, updated handling of such samples and adapter trimming. +- [#234](https://github.com/nf-core/demultiplex/pull/234) Added module for samplesheet validation. - [#236](https://github.com/nf-core/demultiplex/pull/236) Add samplesheet generation. ### `Changed` diff --git a/CITATIONS.md b/CITATIONS.md index 425cedfb..b3497f60 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -22,6 +22,8 @@ - [CheckQC](https://github.com/Molmed/checkQC) +- [samshee](https://github.com/lit-regensburg/samshee) + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index 85601c83..d9932471 100755 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ On release, automated continuous integration tests run the pipeline on a full-si 4. [Falco](#falco) - Raw read QC 5. [md5sum](#md5sum) - Creates an MD5 (128-bit) checksum of every fastq. 6. [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline +7. [samshee](#samshee) - Validates Illumina v2 samplesheets. 
def validate_samplesheet(filename, custom_schema_file=None):
    """Validate an Illumina v2 sample sheet with samshee.

    The sheet is always checked with samshee's built-in Illumina v2 schema
    and logic validators. When ``custom_schema_file`` is given, the JSON
    schema stored in that file is applied as one additional validator.

    The outcome is reported on stdout, either as
    ``Validation successful for FILENAME`` or as
    ``Validation failed: REASON``.

    Args:
        filename: Path of the sample sheet to validate.
        custom_schema_file: Optional path of a JSON schema file. Any falsy
            value (``None`` or an empty string) means "no custom schema".

    Returns:
        ``True`` when the sheet was read and passed every validator,
        ``False`` when reading or validating it raised an exception.
    """
    # The built-in validators always run; the custom one is appended last.
    validators = [illuminasamplesheetv2schema, illuminasamplesheetv2logic]

    if custom_schema_file:
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(custom_schema_file, "r", encoding="utf-8") as handle:
            custom_schema = json.load(handle)

        def custom_validator(doc):
            """Validate ``doc`` against the user-supplied JSON schema."""
            return validate(doc, custom_schema)

        validators.append(custom_validator)

    try:
        read_samplesheetv2(filename, validation=validators)
    except Exception as err:
        # Deliberately broad: every read, parse or validation error is
        # reported through the same "Validation failed:" message.
        print(f"Validation failed: {err}")
        return False

    print(f"Validation successful for {filename}")
    return True


if __name__ == "__main__":
    if len(sys.argv) < 2 or len(sys.argv) > 3:
        print("Usage: validate_samplesheet.py SAMPLESHEET.csv [custom_schema.json]")
        sys.exit(1)
    samplesheet_file = sys.argv[1]
    schema_file = sys.argv[2] if len(sys.argv) == 3 else None

    # NOTE: the exit status intentionally stays 0 when validation fails. The
    # SAMSHEE Nextflow module captures this script's output and detects a
    # failure by looking for the "Validation failed:" marker. Other callers
    # should use the boolean returned by validate_samplesheet().
    validate_samplesheet(samplesheet_file, schema_file)
'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/b2fq-samplesheet.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/b2fq-samplesheet.csv' demultiplexer = 'bases2fastq' } diff --git a/conf/test_bcl2fastq.config b/conf/test_bcl2fastq.config index ecb2adff..ce880444 100755 --- a/conf/test_bcl2fastq.config +++ b/conf/test_bcl2fastq.config @@ -20,9 +20,9 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/flowcell_input.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/flowcell_input.csv' demultiplexer = 'bcl2fastq' - skip_tools = "checkqc" + skip_tools = "checkqc,samshee" } diff --git a/conf/test_checkqc.config b/conf/test_checkqc.config index a9a88846..7dc7fbb5 100644 --- a/conf/test_checkqc.config +++ b/conf/test_checkqc.config @@ -16,9 +16,9 @@ params { // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/samplesheet_full.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/samplesheet_full.csv' demultiplexer = 'bcl2fastq' - skip_tools = "fastp,falco,md5sum,multiqc" + skip_tools = "fastp,falco,md5sum,multiqc,samshee" checkqc_config = "${projectDir}/assets/checkqc_config.yaml" } diff --git a/conf/test_fqtk.config b/conf/test_fqtk.config index 40ce7665..f097b8b2 100644 --- a/conf/test_fqtk.config +++ b/conf/test_fqtk.config @@ -20,6 +20,6 @@ params { max_time = '1.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/fqtk-samplesheet.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/fqtk-samplesheet.csv' demultiplexer = 'fqtk' } diff --git a/conf/test_full.config b/conf/test_full.config index c45a689c..40209e9a 100644 --- 
a/conf/test_full.config +++ b/conf/test_full.config @@ -13,6 +13,9 @@ params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/samplesheet_full.csv' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/samplesheet_full.csv' demultiplexer = 'bcl2fastq' + skip_tools = 'samshee' } diff --git a/conf/test_kraken.config b/conf/test_kraken.config index 4adfef71..f13f7fb8 100644 --- a/conf/test_kraken.config +++ b/conf/test_kraken.config @@ -13,8 +13,11 @@ params { config_profile_name = 'Test full kraken profile' config_profile_description = 'Full test dataset to check pipeline function with kraken' - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/samplesheet_full.csv' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/samplesheet_full.csv' demultiplexer = 'bcl2fastq' kraken_db = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/genome/db/kraken2.tar.gz' + skip_tools = 'samshee' } diff --git a/conf/test_mkfastq.config b/conf/test_mkfastq.config index 1cc9914c..7990e1c8 100644 --- a/conf/test_mkfastq.config +++ b/conf/test_mkfastq.config @@ -20,6 +20,7 @@ params { max_time = '1.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/mkfastq-samplesheet.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/mkfastq-samplesheet.csv' demultiplexer = 'mkfastq' + skip_tools = 'samshee' } diff --git a/conf/test_pe.config b/conf/test_pe.config index e3fef5c8..84ca95a0 100644 --- a/conf/test_pe.config +++ b/conf/test_pe.config @@ -13,7 +13,9 @@ params { config_profile_name = 'Paired end test profile' 
config_profile_description = 'Paired end test dataset to check pipeline function' - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/pe_samplesheet.csv' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/pe_samplesheet.csv' demultiplexer = 'bcl2fastq' - skip_tools = "checkqc" + skip_tools = "checkqc,samshee" } diff --git a/conf/test_sgdemux.config b/conf/test_sgdemux.config index 1ee949b5..00ed472e 100644 --- a/conf/test_sgdemux.config +++ b/conf/test_sgdemux.config @@ -20,6 +20,6 @@ params { max_time = '1.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/sgdemux-samplesheet.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/sgdemux-samplesheet.csv' demultiplexer = 'sgdemux' } diff --git a/conf/test_two_lanes.config b/conf/test_two_lanes.config index d3385173..6dff2efc 100644 --- a/conf/test_two_lanes.config +++ b/conf/test_two_lanes.config @@ -15,7 +15,7 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function with multiple lanes' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/two_lane_samplesheet.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/two_lane_samplesheet.csv' demultiplexer = 'bclconvert' skip_tools = "checkqc" } diff --git a/conf/test_uncompressed.config b/conf/test_uncompressed.config index 1072387b..3d0c2ab7 100644 --- a/conf/test_uncompressed.config +++ b/conf/test_uncompressed.config @@ -20,8 +20,9 @@ params { max_time = '6.h' // Input data - input = 'https://github.com/nf-core/test-datasets/raw/demultiplex/samplesheet/1.3.0/uncompressed-samplesheet.csv' + input = 'https://github.com/nf-core/test-datasets/raw/demultiplex/samplesheet/1.3.0/uncompressed-samplesheet.csv' demultiplexer = 
'bclconvert' + skip_tools = 'samshee' } diff --git a/docs/usage.md b/docs/usage.md index ba370802..c3efd5b8 100755 --- a/docs/usage.md +++ b/docs/usage.md @@ -118,10 +118,20 @@ genome: 'GRCh37' You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). -## Optional parameters +### Optional parameters + +## checkQC If you are running this pipeline with the bcl2fastq demultiplexer, the checkqc module is run. In this case, the default run will include the default config file for checkqc, but you can additionally provide your own checkqc config file using the parameter `--checkqc_config` and a path to a `yml`. See an example of a config file in the [checkqc repository](https://github.com/Molmed/checkQC/blob/dfba84ec63e1df60c0f84ccc96a154a330b28ce4/checkQC/default_config/config.yaml). +### Trimming + +The trimming process in our demultiplexing pipeline has been updated to ensure compatibility with 10x Genomics recommendations. By default, trimming in the pipeline is performed using fastp, which reliably auto-detects and removes adapter sequences without the need for storing adapter sequences. As users can also supply adapter sequences in a samplesheet and thereby triggering trimming in any `bcl2fastq` or `bclconvert` subworkflows, we have added a new parameter, `remove_adapter`, which is set to true by default. When `remove_adapter` is true, the pipeline automatically removes any adapter sequences listed in the `[Settings]` section of the Illumina sample sheet, replacing them with an empty string in order to not provoke this behaviour. This approach aligns with 10x Genomics' guidelines, as they advise against pre-processing FASTQ reads before inputting them into their software pipelines. If the `remove_adapter` setting is true but no adapter is removed, a warning will be displayed; however, this does not necessarily indicate an error, as some sample sheets may already lack these adapter sequences. 
Users can disable this behavior by setting `--remove_adapter false` in the command line, though this is not recommended. + +## samshee (Samplesheet validator) + +samshee ensures the integrity of Illumina v2 Sample Sheets by allowing users to apply custom validation rules. The module can be used together with the parameter `--validator_schema`, which accepts a JSON schema validator file. Users can specify this file to enforce additional validation rules beyond the default ones provided by the tool. To use this feature, simply provide the path to the JSON schema validator file via the `--validator_schema` parameter in the pipeline configuration. This enables tailored validation of Sample Sheets to meet specific requirements or standards relevant to your sequencing workflow. For more information about the tool or how to write the schema JSON file, please refer to [Samshee on GitHub](https://github.com/lit-regensburg/samshee). + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: @@ -217,10 +227,6 @@ A pipeline might not always support every possible argument or option of a parti To learn how to provide additional arguments to a particular tool of the pipeline, please see the [customising tool arguments](https://nf-co.re/docs/usage/configuration#customising-tool-arguments) section of the nf-core website. -### Trimming - -The trimming process in our demultiplexing pipeline has been updated to ensure compatibility with 10x Genomics recommendations. 
By default, trimming in the pipeline is performed using fastp, which reliably auto-detects and removes adapter sequences without the need for storing adapter sequences. As users can also supply adapter sequences in a samplesheet and thereby triggering trimming in any `bcl2fastq` or `bclconvert` subworkflows, we have added a new parameter, `remove_adapter`, which is set to true by default. When `remove_adapter` is true, the pipeline automatically removes any adapter sequences listed in the `[Settings]` section of the Illumina sample sheet, replacing them with an empty string in order to not provoke this behaviour. This approach aligns with 10x Genomics' guidelines, as they advise against pre-processing FASTQ reads before inputting them into their software pipelines. If the `remove_adapter` setting is true but no adapter is removed, a warning will be displayed; however, this does not necessarily indicate an error, as some sample sheets may already lack these adapter sequences. Users can disable this behavior by setting `--remove_adapter false` in the command line, though this is not recommended. - ### nf-core/configs In most cases, you will only need to create a custom config as a one-off but if you and others within your organisation are likely to be running nf-core pipelines regularly and need to use the same settings regularly it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this please can you test that the config file works with your pipeline of choice using the `-c` parameter. You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile. 
diff --git a/modules/local/csv2tsv.nf b/modules/local/csv2tsv.nf index 0dd5a720..a35681d3 100644 --- a/modules/local/csv2tsv.nf +++ b/modules/local/csv2tsv.nf @@ -9,6 +9,7 @@ process CSV2TSV { output: tuple val(meta), path('samplesheet.tsv'), val(fastq_readstructure_pairs), emit: ch_output + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when @@ -16,5 +17,10 @@ process CSV2TSV { script: """ sed 's/,/\t/g' ${sample_sheet} > samplesheet.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sed: \$( sed --version | grep "sed (GNU sed) " | sed -e "s/sed (GNU sed) //g" ) + END_VERSIONS """ } diff --git a/modules/local/samshee/README.md b/modules/local/samshee/README.md new file mode 100644 index 00000000..3e8745bc --- /dev/null +++ b/modules/local/samshee/README.md @@ -0,0 +1,84 @@ +# Guide to Writing a `validation.json` Schema File + +## Introduction + +A JSON schema defines the structure and constraints of JSON data. This guide will help you create a `validation.json` schema file for use with Samshee to perform additional checks on Illumina® Sample Sheet v2 files. + +## JSON Schema Basics + +JSON Schema is a powerful tool for validating the structure of JSON data. It allows you to specify required fields, data types, and constraints. Here are some common components: + +- **`$schema`**: Declares the JSON Schema version being used. +- **`type`**: Specifies the data type (e.g., `object`, `array`, `string`, `number`). +- **`properties`**: Defines the properties of an object and their constraints. +- **`required`**: Lists properties that must be present in the object. +- **`items`**: Specifies the schema for items in an array. 
+ +## Example Schema + +Here’s an example of a `validation.json` schema file for an Illumina® Sample Sheet: + +```json +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "Header": { + "type": "object", + "properties": { + "InvestigatorName": { + "type": "string" + }, + "ExperimentName": { + "type": "string" + } + }, + "required": ["InvestigatorName", "ExperimentName"] + }, + "Reads": { + "type": "object", + "properties": { + "Read1Cycles": { + "type": "integer", + "minimum": 1 + }, + "Read2Cycles": { + "type": "integer", + "minimum": 1 + } + }, + "required": ["Read1Cycles", "Read2Cycles"] + }, + "BCLConvert": { + "type": "object", + "properties": { + "Index": { + "type": "string", + "pattern": "^[ACGT]{8}$" + } + } + } + }, + "required": ["Header", "Reads"] +} +``` + +### Explanation of the Example + +- **`$schema`**: Specifies the JSON Schema version (draft-07). +- **`type`**: Defines the main type as `object`. +- **`properties`**: Lists the properties of the object: +- **`Header`**: An object with required `InvestigatorName` and `ExperimentName` fields. +- **`Reads`**: An object with required `Read1Cycles` and `Read2Cycles` fields that must be integers greater than or equal to 1. +- **`BCLConvert`**: An object with an optional `Index` field that must be a string matching a pattern for 8-base indices. +- **`required`**: Lists required properties at the top level. + +### Tips for Writing JSON Schemas + +1. **Start Simple**: Begin with basic constraints and gradually add complexity. +2. **Use Online Validators**: Validate your schema using online tools to ensure it adheres to the JSON Schema specification. +3. **Refer to Schema Documentation**: Consult the [JSON Schema documentation](https://json-schema.org/) for detailed guidance. + +### Conclusion + +By defining a JSON schema, you can enforce specific rules and ensure that your Illumina® Sample Sheet v2 files meet your required structure and constraints. 
/*
 * SAMSHEE: validate an Illumina v2 sample sheet with samshee.
 *
 * Inputs:
 *   - tuple val(meta), path(samplesheet): sample metadata and the sheet to check
 *   - path(validator_schema): optional JSON schema with extra rules; an empty
 *     value results in an empty schema argument, which the script treats as
 *     "no custom schema"
 * Output:
 *   - versions.yml with the samshee and python versions
 *
 * The process produces no data output: its only purpose is to fail, and so
 * stop the pipeline, when the sample sheet is invalid.
 */
process SAMSHEE {
    tag "$meta.id"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'oras://community.wave.seqera.io/library/python_pip_samshee:84a770c9853c725d' :
        'community.wave.seqera.io/library/python_pip_samshee:e8a5c47ec32efa42' }"

    input:
    tuple val(meta), path(samplesheet)
    path(validator_schema) //optional

    output:
    // Module is meant to stop the pipeline if validation fails
    path "versions.yml", emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def arg_validator_schema = validator_schema ? "${validator_schema}" : ""
    """
    # Run the validator and keep both its output and its exit status.
    # The '|| status=...' form stops an errexit shell from aborting here,
    # before the captured output has been printed.
    status=0
    output=\$(validate_samplesheet.py "${samplesheet}" "${arg_validator_schema}" 2>&1) || status=\$?

    # Always surface the validator output in the task log
    echo "\$output"

    # Fail on a non-zero exit status or on the failure marker that
    # validate_samplesheet.py prints when validation does not pass
    if [ "\$status" -ne 0 ] || [[ "\$output" == *"Validation failed:"* ]]; then
        exit 1
    fi

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        samshee: \$( python -m pip show --version samshee | grep "Version" | sed -e "s/Version: //g" )
        python: \$( python --version | sed -e "s/Python //g" )
    END_VERSIONS
    """

    stub:
    """
    #Generate minimal samplesheet
    cat <<-END_SAMPLE_SHEET > minimal_samplesheet.csv
    [Header]
    FileFormatVersion,2
    RunName,Run_001
    Instrument Type,NextSeq 1000
    InstrumentPlatform,NextSeq 1000

    [Reads]
    Read1Cycles,150
    Read2Cycles,150
    Index1Cycles,8
    Index2Cycles,8

    [Settings]

    [Data]
    Sample_ID,Sample_Name,Description,Sample_Project
    Sample1,Sample1,,
    END_SAMPLE_SHEET


    #Generate minimal schema validator file
    cat <<-END_SCHEMA > minimal_schema.json
    {
        "type": "object",
        "properties": {
            "Header": {
                "type": "object",
                "properties": {
                    "FileFormatVersion": { "type": "integer" },
                    "RunName": { "type": "string" },
                    "Instrument Type": { "type": "string" },
                    "InstrumentPlatform": { "type": "string" }
                },
                "required": ["FileFormatVersion", "RunName", "Instrument Type", "InstrumentPlatform"]
            },
            "Reads": {
                "type": "object",
                "properties": {
                    "Read1Cycles": { "type": "integer" },
                    "Read2Cycles": { "type": "integer" },
                    "Index1Cycles": { "type": "integer" },
                    "Index2Cycles": { "type": "integer" }
                },
                "required": ["Read1Cycles", "Read2Cycles", "Index1Cycles", "Index2Cycles"]
            },
            "Settings": {
                "type": "object"
            },
            "Data": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "Sample_ID": { "type": "string" },
                        "Sample_Name": { "type": "string" },
                        "Description": { "type": "string" },
                        "Sample_Project": { "type": "string" }
                    },
                    "required": ["Sample_ID", "Sample_Name", "Description", "Sample_Project"]
                }
            }
        },
        "required": ["Header", "Reads", "Settings", "Data"]
    }
    END_SCHEMA

    # Run the validator on the generated files, keeping output and exit status
    status=0
    output=\$(validate_samplesheet.py minimal_samplesheet.csv minimal_schema.json 2>&1) || status=\$?

    # Always surface the validator output in the task log
    echo "\$output"

    # Fail on a non-zero exit status or on the validator's failure marker
    if [ "\$status" -ne 0 ] || [[ "\$output" == *"Validation failed:"* ]]; then
        exit 1
    fi

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        samshee: \$( python -m pip show --version samshee | grep "Version" | sed -e "s/Version: //g" )
        python: \$( python --version | sed -e "s/Python //g" )
    END_VERSIONS
    """
}
// nf-test suite for the local SAMSHEE module (Illumina v2 samplesheet validation)
nextflow_process {

    name "Test Process samshee"
    script "../main.nf"
    config "./nextflow.config"
    process "SAMSHEE"

    tag "modules"

    // Real run: validate a NextSeq2000 sample sheet without a custom schema
    test("test samplesheet") {

        when {
            process {
                """
                input[0] = [ [ id: 'test', lane:1 ], file("https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/testdata/NextSeq2000/SampleSheet.csv", checkIfExists: true) ]
                input[1] = []
                """
            }
        }

        then {
            assertAll(
                { assert process.success }
            )
        }

    }

    // Stub run: the stub block of the module generates its own sheet and schema
    test("stub") {

        options "-stub"

        when {
            process {
                """
                // input[0] must match `tuple val(meta), path(samplesheet)`: exactly two elements
                input[0] = [ [ id: 'test', lane:1 ], file("https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/testdata/NextSeq2000/SampleSheet.csv", checkIfExists: true) ]
                input[1] = []
                """
            }
        }

        then {
            assertAll(
                { assert process.success }
            )
        }

    }

}
downstream_pipeline = "default" // enum string [rnaseq, atacseq, taxprofiler, default] // Options: CheckQC - checkqc_config = [] // file .yaml + checkqc_config = [] // file .yaml + + // Options: Illumina samplesheet validator + validator_schema = null // file .json // MultiQC options multiqc_config = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 3d4acd22..7ab8d009 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -19,7 +19,7 @@ "skip_tools": { "type": "string", "default": "[]", - "description": "Comma-separated list of tools to skip (fastp,falco,multiqc)" + "description": "Comma-separated list of tools to skip (fastp,fastqc,kraken,multiqc,checkqc,falco,md5sum,samshee)" }, "sample_size": { "type": "integer", @@ -31,6 +31,11 @@ "format": "path", "description": "Path to Kraken2 DB to use for screening" }, + "validator_schema": { + "type": "string", + "format": "file-path", + "description": "Path to Illumina v2 samplesheet validator .json file" + }, "downstream_pipeline": { "type": "string", "description": "Name of downstream nf-core pipeline (one of: rnaseq, atacseq, taxprofiler or default). 
Used to produce the input samplesheet for that pipeline.", diff --git a/subworkflows/local/fqtk_demultiplex/main.nf b/subworkflows/local/fqtk_demultiplex/main.nf index df4f5478..9066bcd5 100644 --- a/subworkflows/local/fqtk_demultiplex/main.nf +++ b/subworkflows/local/fqtk_demultiplex/main.nf @@ -21,11 +21,14 @@ workflow FQTK_DEMULTIPLEX { // Generate meta for each fastq ch_fastq_with_meta = generate_fastq_meta(FQTK.out.sample_fastq) + // Add versions to versions channel + ch_versions = FQTK.out.versions.mix(CSV2TSV.out.versions) + emit: fastq = ch_fastq_with_meta metrics = FQTK.out.metrics unassigned = FQTK.out.most_frequent_unmatched - versions = FQTK.out.versions + versions = ch_versions } /* diff --git a/tests/pipeline/bcl2fastq.nf.test b/tests/pipeline/bcl2fastq.nf.test index 027784f9..cedada98 100644 --- a/tests/pipeline/bcl2fastq.nf.test +++ b/tests/pipeline/bcl2fastq.nf.test @@ -12,7 +12,7 @@ nextflow_pipeline { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/flowcell_input.csv' demultiplexer = 'bcl2fastq' outdir = "$outputDir" - skip_tools = "checkqc" + skip_tools = "checkqc,samshee" } } diff --git a/tests/pipeline/bcl2fastq.nf.test.snap b/tests/pipeline/bcl2fastq.nf.test.snap index 58ce544d..2e77ad49 100644 --- a/tests/pipeline/bcl2fastq.nf.test.snap +++ b/tests/pipeline/bcl2fastq.nf.test.snap @@ -5,9 +5,9 @@ ], "meta": { "nf-test": "0.9.0", - "nextflow": "23.10.0" + "nextflow": "24.04.4" }, - "timestamp": "2024-08-07T17:06:30.73934962" + "timestamp": "2024-08-08T20:43:41.988053048" }, "bcl2fastq": { "content": [ @@ -66,9 +66,9 @@ ], "meta": { "nf-test": "0.9.0", - "nextflow": "23.10.0" + "nextflow": "24.04.4" }, - "timestamp": "2024-08-07T17:06:30.762919351" + "timestamp": "2024-08-08T20:43:42.035230933" }, "multiqc": { "content": [ @@ -78,8 +78,8 @@ ], "meta": { "nf-test": "0.9.0", - "nextflow": "23.10.0" + "nextflow": "24.04.4" }, - "timestamp": "2024-08-07T17:06:30.754480121" + "timestamp": 
"2024-08-08T20:43:42.016587995" } } \ No newline at end of file diff --git a/tests/pipeline/fqtk.nf.test.snap b/tests/pipeline/fqtk.nf.test.snap index 32800060..acb4a9b3 100644 --- a/tests/pipeline/fqtk.nf.test.snap +++ b/tests/pipeline/fqtk.nf.test.snap @@ -11,7 +11,7 @@ }, "software_versions": { "content": [ - "{FALCO={falco=1.2.1}, FASTP={fastp=0.23.4}, FQTK={fqtk=0.2.1}, MD5SUM={md5sum=8.3}, UNTAR_FLOWCELL={untar=1.34}, Workflow={nf-core/demultiplex=v1.5.0dev}}" + "{CSV2TSV={sed=4.8}, FALCO={falco=1.2.1}, FASTP={fastp=0.23.4}, FQTK={fqtk=0.2.1}, MD5SUM={md5sum=8.3}, UNTAR_FLOWCELL={untar=1.34}, Workflow={nf-core/demultiplex=v1.5.0dev}}" ], "meta": { "nf-test": "0.8.4", @@ -19,4 +19,4 @@ }, "timestamp": "2024-08-02T19:57:17.122084549" } -} \ No newline at end of file +} diff --git a/tests/pipeline/kraken.nf.test b/tests/pipeline/kraken.nf.test index 73ccc70c..a5d2c357 100644 --- a/tests/pipeline/kraken.nf.test +++ b/tests/pipeline/kraken.nf.test @@ -12,7 +12,7 @@ nextflow_pipeline { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/flowcell_input.csv' demultiplexer = 'bcl2fastq' outdir = "$outputDir" - skip_tools = "checkqc" + skip_tools = "checkqc,samshee" kraken_db = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/genome/db/kraken2.tar.gz' } } diff --git a/tests/pipeline/skip_tools.nf.test b/tests/pipeline/skip_tools.nf.test index 2f0d1cc7..6ffc1ecd 100644 --- a/tests/pipeline/skip_tools.nf.test +++ b/tests/pipeline/skip_tools.nf.test @@ -41,7 +41,7 @@ nextflow_pipeline { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/flowcell_input.csv' demultiplexer = 'bclconvert' outdir = "$outputDir" - skip_tools = "fastp" + skip_tools = "fastp,samshee" } } @@ -69,7 +69,7 @@ nextflow_pipeline { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/flowcell_input.csv' demultiplexer = 'bclconvert' outdir 
= "$outputDir" - skip_tools = "fastqc" + skip_tools = "fastqc,samshee" } } @@ -97,7 +97,7 @@ nextflow_pipeline { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/flowcell_input.csv' demultiplexer = 'bclconvert' outdir = "$outputDir" - skip_tools = "fastp,fastqc" + skip_tools = "fastp,fastqc,samshee" } } @@ -125,7 +125,7 @@ nextflow_pipeline { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/flowcell_input.csv' demultiplexer = 'bclconvert' outdir = "$outputDir" - skip_tools = "multiqc" + skip_tools = "multiqc,samshee" } } diff --git a/tests/pipeline/test_pe.nf.test b/tests/pipeline/test_pe.nf.test index 8b9ae237..9449a8a0 100644 --- a/tests/pipeline/test_pe.nf.test +++ b/tests/pipeline/test_pe.nf.test @@ -11,7 +11,7 @@ nextflow_pipeline { params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/demultiplex/samplesheet/1.3.0/pe_samplesheet.csv' demultiplexer = 'bcl2fastq' - skip_tools = "checkqc" + skip_tools = "checkqc,samshee" outdir = "$outputDir" } } diff --git a/workflows/demultiplex.nf b/workflows/demultiplex.nf index 1f8576a1..fad8d5e5 100644 --- a/workflows/demultiplex.nf +++ b/workflows/demultiplex.nf @@ -28,6 +28,11 @@ include { UNTAR as UNTAR_FLOWCELL } from '../modules/nf-core/untar/main' include { UNTAR as UNTAR_KRAKEN_DB } from '../modules/nf-core/untar/main' include { MD5SUM } from '../modules/nf-core/md5sum/main' +// +// MODULE: Local modules +// +include { SAMSHEE } from '../modules/local/samshee/main' + // // FUNCTION // @@ -58,10 +63,11 @@ workflow DEMULTIPLEX { // Channel inputs - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() - ch_multiqc_reports = Channel.empty() - checkqc_config = params.checkqc_config ? 
Channel.fromPath(params.checkqc_config, checkIfExists: true) : [] // file checkqc_config.yaml + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + ch_multiqc_reports = Channel.empty() + checkqc_config = params.checkqc_config ? Channel.fromPath(params.checkqc_config, checkIfExists: true) : [] // file checkqc_config.yaml + ch_validator_schema = params.validator_schema ? Channel.fromPath(params.validator_schema, checkIfExists: true) : [] // file validator_schema.json // Remove adapter from Illumina samplesheet to avoid adapter trimming in demultiplexer tools if (params.remove_adapter && (params.demultiplexer in ["bcl2fastq", "bclconvert", "mkfastq"])) { @@ -86,6 +92,15 @@ workflow DEMULTIPLEX { } } + // RUN samplesheet_validator samshee + if (!("samshee" in skip_tools) && (params.demultiplexer in ["bcl2fastq", "bclconvert", "mkfastq"])){ + SAMSHEE ( + ch_samplesheet.map{ meta, samplesheet, flowcell, lane -> [meta,samplesheet] }, + ch_validator_schema + ) + ch_versions = ch_versions.mix(SAMSHEE.out.versions) + } + // Convenience ch_samplesheet.dump(tag: 'DEMULTIPLEX::inputs', {FormattingService.prettyFormat(it)})