From 4afad38d7298e7b760098c656bf137d124004244 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Fri, 1 Nov 2024 15:52:02 +0100 Subject: [PATCH 1/8] add germline benchmark --- conf/benchmark.config | 18 + main.nf | 4 +- modules.json | 15 + .../nf-core/bcftools/index/environment.yml | 5 + modules/nf-core/bcftools/index/main.nf | 51 ++ modules/nf-core/bcftools/index/meta.yml | 63 ++ .../nf-core/bcftools/index/tests/main.nf.test | 113 +++ .../bcftools/index/tests/main.nf.test.snap | 120 ++++ .../bcftools/index/tests/nextflow.config | 3 + modules/nf-core/bcftools/index/tests/tags.yml | 2 + .../nf-core/rtgtools/format/environment.yml | 5 + modules/nf-core/rtgtools/format/main.nf | 66 ++ modules/nf-core/rtgtools/format/meta.yml | 58 ++ .../nf-core/rtgtools/vcfeval/environment.yml | 5 + modules/nf-core/rtgtools/vcfeval/main.nf | 90 +++ modules/nf-core/rtgtools/vcfeval/meta.yml | 199 +++++ .../rtgtools/vcfeval/tests/main.nf.test | 113 +++ .../rtgtools/vcfeval/tests/main.nf.test.snap | 677 ++++++++++++++++++ .../rtgtools/vcfeval/tests/nextflow.config | 4 + nextflow.config | 5 + nextflow_schema.json | 15 +- subworkflows/local/vcf_benchmark/main.nf | 62 ++ workflows/sarek/main.nf | 29 + 23 files changed, 1719 insertions(+), 3 deletions(-) create mode 100644 conf/benchmark.config create mode 100644 modules/nf-core/bcftools/index/environment.yml create mode 100644 modules/nf-core/bcftools/index/main.nf create mode 100644 modules/nf-core/bcftools/index/meta.yml create mode 100644 modules/nf-core/bcftools/index/tests/main.nf.test create mode 100644 modules/nf-core/bcftools/index/tests/main.nf.test.snap create mode 100644 modules/nf-core/bcftools/index/tests/nextflow.config create mode 100644 modules/nf-core/bcftools/index/tests/tags.yml create mode 100644 modules/nf-core/rtgtools/format/environment.yml create mode 100644 modules/nf-core/rtgtools/format/main.nf create mode 100644 modules/nf-core/rtgtools/format/meta.yml create mode 100644 
modules/nf-core/rtgtools/vcfeval/environment.yml create mode 100644 modules/nf-core/rtgtools/vcfeval/main.nf create mode 100644 modules/nf-core/rtgtools/vcfeval/meta.yml create mode 100644 modules/nf-core/rtgtools/vcfeval/tests/main.nf.test create mode 100644 modules/nf-core/rtgtools/vcfeval/tests/main.nf.test.snap create mode 100644 modules/nf-core/rtgtools/vcfeval/tests/nextflow.config create mode 100644 subworkflows/local/vcf_benchmark/main.nf diff --git a/conf/benchmark.config b/conf/benchmark.config new file mode 100644 index 0000000000..d0b5d8807c --- /dev/null +++ b/conf/benchmark.config @@ -0,0 +1,18 @@ +params { + config_profile_name = 'Full benchmark profile for germline VC' + config_profile_description = 'Full benchmark dataset to check germline VC pipeline ' + + // Input data for full size test + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/csv/NA12878_Agilent_full_test.csv' + + // Other params + tools = 'strelka,freebayes,haplotypecaller,deepvariant,manta,tiddit,cnvkit,vep,snpeff' + intervals = 's3://ngi-igenomes/test-data/sarek/Agilent_v7.bed' + wes = true + trim_fastq = true + + // Benchmark + benchmark = true + truth_vcf = 'https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz' + truth_bed = 'https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.bed' +} diff --git a/main.nf b/main.nf index ac3a22a98a..58796d4d5b 100755 --- a/main.nf +++ b/main.nf @@ -305,7 +305,9 @@ workflow NFCORE_SAREK { vep_extra_files, vep_fasta, vep_genome, - vep_species + vep_species, + truth_vcf, + truth_bed ) emit: multiqc_report = SAREK.out.multiqc_report // channel: /path/to/multiqc_report.html diff --git a/modules.json b/modules.json index 75fec0dbc7..a2ac97d136 100644 --- a/modules.json +++ b/modules.json @@ -21,6 +21,11 @@ "git_sha": "d1e0ec7670fa77905a378627232566ce54c3c26d", "installed_by": 
["modules"] }, + "bcftools/index": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, "bcftools/mpileup": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", @@ -354,6 +359,16 @@ "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["bam_ngscheckmate"] }, + "rtgtools/format": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, + "rtgtools/vcfeval": { + "branch": "master", + "git_sha": "83e2df1e4ec594beb8a575b4db0b4197900f4ebd", + "installed_by": ["modules"] + }, "samblaster": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", diff --git a/modules/nf-core/bcftools/index/environment.yml b/modules/nf-core/bcftools/index/environment.yml new file mode 100644 index 0000000000..5c00b116ad --- /dev/null +++ b/modules/nf-core/bcftools/index/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::bcftools=1.20 diff --git a/modules/nf-core/bcftools/index/main.nf b/modules/nf-core/bcftools/index/main.nf new file mode 100644 index 0000000000..408e584c8c --- /dev/null +++ b/modules/nf-core/bcftools/index/main.nf @@ -0,0 +1,51 @@ +process BCFTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0': + 'biocontainers/bcftools:1.20--h8b25389_0' }" + + input: + tuple val(meta), path(vcf) + + output: + tuple val(meta), path("*.csi"), optional:true, emit: csi + tuple val(meta), path("*.tbi"), optional:true, emit: tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + bcftools \\ + index \\ + $args \\ + --threads $task.cpus \\ + $vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--tbi") || args.contains("-t") ? "tbi" : + "csi" + """ + touch ${vcf}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/index/meta.yml b/modules/nf-core/bcftools/index/meta.yml new file mode 100644 index 0000000000..6897d1bd12 --- /dev/null +++ b/modules/nf-core/bcftools/index/meta.yml @@ -0,0 +1,63 @@ +name: bcftools_index +description: Index VCF tools +keywords: + - vcf + - index + - bcftools + - csi + - tbi +tools: + - bcftools: + description: BCFtools is a set of utilities that manipulate variant calls in the + Variant Call Format (VCF) and its binary counterpart BCF. All commands work + transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed. Most + commands accept VCF, bgzipped VCF and BCF with filetype detected automatically + even when streaming from a pipe. Indexed VCF and BCF will work in all situations. + Un-indexed VCF and BCF and streams will work in most, but not all situations. 
+ homepage: https://samtools.github.io/bcftools/ + documentation: https://samtools.github.io/bcftools/howtos/index.html + tool_dev_url: https://github.com/samtools/bcftools + doi: "10.1093/gigascience/giab008" + licence: ["MIT", "GPL-3.0-or-later"] + identifier: biotools:bcftools +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: VCF file (optionally GZIPPED) + pattern: "*.{vcf,vcf.gz}" +output: + - csi: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.csi": + type: file + description: Default VCF file index file + pattern: "*.csi" + - tbi: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.tbi": + type: file + description: Alternative VCF file index file for larger files (activated with + -t parameter) + pattern: "*.tbi" + - versions: + - versions.yml: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/bcftools/index/tests/main.nf.test b/modules/nf-core/bcftools/index/tests/main.nf.test new file mode 100644 index 0000000000..9b3748533c --- /dev/null +++ b/modules/nf-core/bcftools/index/tests/main.nf.test @@ -0,0 +1,113 @@ +nextflow_process { + + name "Test Process BCFTOOLS_INDEX" + script "../main.nf" + process "BCFTOOLS_INDEX" + + tag "modules" + tag "modules_nfcore" + tag "bcftools" + tag "bcftools/index" + + test("sarscov2 - vcf - csi") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.csi.collect { it.collect { it instanceof Map ? 
it : file(it).name } }, + process.out.versions).match() + } + ) + } + + } + + test("sarscov2 - vcf - tbi") { + + config "./nextflow.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.tbi.collect { it.collect { it instanceof Map ? it : file(it).name } }, + process.out.versions).match() + } + ) + } + + } + + test("sarscov2 - vcf - csi - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - vcf - tbi - stub") { + + config "./nextflow.config" + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/index/tests/main.nf.test.snap b/modules/nf-core/bcftools/index/tests/main.nf.test.snap new file mode 100644 index 0000000000..b6f7b7003c --- /dev/null +++ b/modules/nf-core/bcftools/index/tests/main.nf.test.snap @@ -0,0 +1,120 @@ +{ + "sarscov2 - vcf - csi - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.vcf.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,b4ea0f633dba7f5992fbf41b518f98e9" + ], + "csi": [ + [ + { + "id": "test" + }, + "test.vcf.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tbi": [ + + ], + 
"versions": [ + "versions.yml:md5,b4ea0f633dba7f5992fbf41b518f98e9" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-03T11:43:09.347303629" + }, + "sarscov2 - vcf - tbi": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.vcf.gz.tbi" + ] + ], + [ + "versions.yml:md5,b4ea0f633dba7f5992fbf41b518f98e9" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-03T11:42:46.38669168" + }, + "sarscov2 - vcf - tbi - stub": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test" + }, + "test.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,b4ea0f633dba7f5992fbf41b518f98e9" + ], + "csi": [ + + ], + "tbi": [ + [ + { + "id": "test" + }, + "test.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,b4ea0f633dba7f5992fbf41b518f98e9" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-03T11:43:32.494612317" + }, + "sarscov2 - vcf - csi": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.vcf.gz.csi" + ] + ], + [ + "versions.yml:md5,b4ea0f633dba7f5992fbf41b518f98e9" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-03T11:42:33.652109509" + } +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/index/tests/nextflow.config b/modules/nf-core/bcftools/index/tests/nextflow.config new file mode 100644 index 0000000000..db83f7e5a4 --- /dev/null +++ b/modules/nf-core/bcftools/index/tests/nextflow.config @@ -0,0 +1,3 @@ +process { + ext.args = '--tbi' +} diff --git a/modules/nf-core/bcftools/index/tests/tags.yml b/modules/nf-core/bcftools/index/tests/tags.yml new file mode 100644 index 0000000000..b4c349be93 --- /dev/null +++ b/modules/nf-core/bcftools/index/tests/tags.yml @@ -0,0 +1,2 @@ +bcftools/index: + - "modules/nf-core/bcftools/index/**" diff --git 
a/modules/nf-core/rtgtools/format/environment.yml b/modules/nf-core/rtgtools/format/environment.yml new file mode 100644 index 0000000000..8aa83cdc41 --- /dev/null +++ b/modules/nf-core/rtgtools/format/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::rtg-tools=3.12.1 diff --git a/modules/nf-core/rtgtools/format/main.nf b/modules/nf-core/rtgtools/format/main.nf new file mode 100644 index 0000000000..802d3b20a3 --- /dev/null +++ b/modules/nf-core/rtgtools/format/main.nf @@ -0,0 +1,66 @@ +process RTGTOOLS_FORMAT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/rtg-tools:3.12.1--hdfd78af_0': + 'biocontainers/rtg-tools:3.12.1--hdfd78af_0' }" + + input: + tuple val(meta), path(input1), path(input2), path(sam_rg) + + output: + tuple val(meta), path("*.sdf"), emit: sdf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def single = meta.containsKey("single_end") ? meta.single_end : true + + def input = single ? "${input1}" : "--left ${input1} --right ${input2}" + def rg = sam_rg ? "--sam-rg ${sam_rg}" : "" + + def avail_mem = "3G" + if (!task.memory) { + log.info '[RTG format] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' 
+ } else { + avail_mem = (task.memory.mega*0.8).intValue() + "M" + } + + """ + rtg RTG_MEM=${avail_mem} format \\ + ${args} \\ + ${rg} \\ + --output ${prefix}.sdf \\ + ${input} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + rtg-tools: \$(echo \$(rtg version | head -n 1 | awk '{print \$4}')) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + def avail_mem = "3G" + if (!task.memory) { + log.info '[RTG format] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + "M" + } + """ + touch ${prefix}.sdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + rtg-tools: \$(echo \$(rtg version | head -n 1 | awk '{print \$4}')) + END_VERSIONS + """ +} diff --git a/modules/nf-core/rtgtools/format/meta.yml b/modules/nf-core/rtgtools/format/meta.yml new file mode 100644 index 0000000000..e09aff3af6 --- /dev/null +++ b/modules/nf-core/rtgtools/format/meta.yml @@ -0,0 +1,58 @@ +name: "rtgtools_format" +description: Converts the contents of sequence data files (FASTA/FASTQ/SAM/BAM) into + the RTG Sequence Data File (SDF) format. +keywords: + - rtg + - fasta + - fastq + - bam + - sam +tools: + - "rtgtools": + description: "RealTimeGenomics Tools -- Utilities for accurate VCF comparison + and manipulation" + homepage: "https://www.realtimegenomics.com/products/rtg-tools" + documentation: "https://github.com/RealTimeGenomics/rtg-tools" + tool_dev_url: "https://github.com/RealTimeGenomics/rtg-tools" + licence: ["BSD"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input1: + type: file + description: FASTA, FASTQ, BAM or SAM file. 
This should be the left input file + when using paired end FASTQ/FASTA data + pattern: "*.{fasta,fa,fna,fastq,fastq.gz,fq,fq.gz,bam,sam}" + - input2: + type: file + description: The right input file when using paired end FASTQ/FASTA data + pattern: "*.{fasta,fa,fna,fastq,fastq.gz,fq,fq.gz}" + - sam_rg: + type: file + description: A file containing a single readgroup header as a SAM header. This + can also be supplied as a string in `task.ext.args` as `--sam-rg `. + pattern: "*.{txt,sam}" +output: + - sdf: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.sdf": + type: directory + description: The sequence dictionary format folder created from the input file(s) + pattern: "*.sdf" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" diff --git a/modules/nf-core/rtgtools/vcfeval/environment.yml b/modules/nf-core/rtgtools/vcfeval/environment.yml new file mode 100644 index 0000000000..8aa83cdc41 --- /dev/null +++ b/modules/nf-core/rtgtools/vcfeval/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::rtg-tools=3.12.1 diff --git a/modules/nf-core/rtgtools/vcfeval/main.nf b/modules/nf-core/rtgtools/vcfeval/main.nf new file mode 100644 index 0000000000..330a1f3d96 --- /dev/null +++ b/modules/nf-core/rtgtools/vcfeval/main.nf @@ -0,0 +1,90 @@ +process RTGTOOLS_VCFEVAL { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/rtg-tools:3.12.1--hdfd78af_0': + 'biocontainers/rtg-tools:3.12.1--hdfd78af_0' }" + + input: + tuple val(meta), path(query_vcf), path(query_vcf_tbi), path(truth_vcf), path(truth_vcf_tbi), path(truth_bed), path(regions_bed) + tuple val(meta2), path(sdf) + + output: + tuple val(meta), path("*.tp.vcf.gz") , emit: tp_vcf + tuple val(meta), path("*.tp.vcf.gz.tbi") , emit: tp_tbi + tuple val(meta), path("*.fn.vcf.gz") , emit: fn_vcf + tuple val(meta), path("*.fn.vcf.gz.tbi") , emit: fn_tbi + tuple val(meta), path("*.fp.vcf.gz") , emit: fp_vcf + tuple val(meta), path("*.fp.vcf.gz.tbi") , emit: fp_tbi + tuple val(meta), path("*.tp-baseline.vcf.gz") , emit: baseline_vcf + tuple val(meta), path("*.tp-baseline.vcf.gz.tbi") , emit: baseline_tbi + tuple val(meta), path("*.snp_roc.tsv.gz") , emit: snp_roc + tuple val(meta), path("*.non_snp_roc.tsv.gz") , emit: non_snp_roc + tuple val(meta), path("*.weighted_roc.tsv.gz") , emit: weighted_roc + tuple val(meta), path("*.summary.txt") , emit: summary + tuple val(meta), path("*.phasing.txt") , emit: phasing + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def bed_regions = regions_bed ? "--bed-regions=${regions_bed}" : "" + def eval_regions = truth_bed ? "--evaluation-regions=${truth_bed}" : "" + def truth_index = truth_vcf_tbi ? "" : "rtg index ${truth_vcf}" + def query_index = query_vcf_tbi ? "" : "rtg index ${query_vcf}" + def avail_mem = task.memory.toGiga() + "G" + + """ + ${truth_index} + ${query_index} + + rtg RTG_MEM=$avail_mem vcfeval \\ + ${args} \\ + --baseline=${truth_vcf} \\ + ${bed_regions} \\ + ${eval_regions} \\ + --calls=${query_vcf} \\ + --output=output \\ + --template=${sdf} \\ + --threads=${task.cpus} + + cd output/ + mv done progress .. + for f in * ; do mv "\$f" "../${prefix}.\$f" ; done + cd .. 
+ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + rtg-tools: \$(echo \$(rtg version | head -n 1 | awk '{print \$4}')) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + echo | gzip > ${prefix}.tp.vcf.gz + touch ${prefix}.tp.vcf.gz.tbi + echo | gzip > ${prefix}.fn.vcf.gz + touch ${prefix}.fn.vcf.gz.tbi + echo | gzip > ${prefix}.fp.vcf.gz + touch ${prefix}.fp.vcf.gz.tbi + echo | gzip > ${prefix}.tp-baseline.vcf.gz + touch ${prefix}.tp-baseline.vcf.gz.tbi + echo | gzip > ${prefix}.snp_roc.tsv.gz + echo | gzip > ${prefix}.non_snp_roc.tsv.gz + echo | gzip > ${prefix}.weighted_roc.tsv.gz + touch ${prefix}.summary.txt + touch ${prefix}.phasing.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + rtg-tools: \$(echo \$(rtg version | head -n 1 | awk '{print \$4}')) + END_VERSIONS + """ +} diff --git a/modules/nf-core/rtgtools/vcfeval/meta.yml b/modules/nf-core/rtgtools/vcfeval/meta.yml new file mode 100644 index 0000000000..4c59bab521 --- /dev/null +++ b/modules/nf-core/rtgtools/vcfeval/meta.yml @@ -0,0 +1,199 @@ +name: "rtgtools_vcfeval" +description: The VCFeval tool of RTG tools. It is used to evaluate called variants + for agreement with a baseline variant set +keywords: + - benchmarking + - vcf + - rtg-tools +tools: + - "rtgtools": + description: "RealTimeGenomics Tools -- Utilities for accurate VCF comparison + and manipulation" + homepage: "https://www.realtimegenomics.com/products/rtg-tools" + documentation: "https://github.com/RealTimeGenomics/rtg-tools" + tool_dev_url: "https://github.com/RealTimeGenomics/rtg-tools" + licence: ["BSD"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - query_vcf: + type: file + description: A VCF with called variants to benchmark against the standard + pattern: "*.{vcf,vcf.gz}" + - query_vcf_tbi: + type: file + description: The index of the VCF file with called variants to benchmark against the standard + pattern: "*.{vcf.gz.tbi, vcf.tbi}" + - truth_vcf: + type: file + description: A standard VCF to compare against + pattern: "*.{vcf,vcf.gz}" + - truth_vcf_tbi: + type: file + description: The index of the standard VCF to compare against + pattern: "*.{vcf.gz.tbi, vcf.tbi}" + - truth_bed: + type: file + description: A BED file containing the strict regions where VCFeval should + only evaluate the fully overlapping variants (optional) + This input should be used to provide the golden truth BED files. + pattern: "*.bed" + - regions_bed: + type: file + description: A BED file containing the regions where VCFeval will evaluate every + fully and partially overlapping variant (optional) + This input should be used to provide the regions used by the analysis + pattern: "*.bed" + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - sdf: + type: file + description: The SDF (RTG Sequence Data File) folder of the reference genome +output: + - tp_vcf: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.tp.vcf.gz": + type: file + description: A VCF file for the true positive variants + pattern: "*.tp.vcf.gz" + - tp_tbi: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.tp.vcf.gz.tbi": + type: file + description: The index of the VCF file for the true positive variants + pattern: "*.tp.vcf.gz.tbi" + - fn_vcf: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.fn.vcf.gz": + type: file + description: A VCF file for the false negative variants + pattern: "*.fn.vcf.gz" + - fn_tbi: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.fn.vcf.gz.tbi": + type: file + description: The index of the VCF file for the false negative variants + pattern: "*.fn.vcf.gz.tbi" + - fp_vcf: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.fp.vcf.gz": + type: file + description: A VCF file for the false positive variants + pattern: "*.fp.vcf.gz" + - fp_tbi: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.fp.vcf.gz.tbi": + type: file + description: The index of the VCF file for the false positive variants + pattern: "*.fp.vcf.gz.tbi" + - baseline_vcf: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.tp-baseline.vcf.gz": + type: file + description: A VCF file for the true positive variants from the baseline + pattern: "*.tp-baseline.vcf.gz" + - baseline_tbi: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.tp-baseline.vcf.gz.tbi": + type: file + description: The index of the VCF file for the true positive variants from the + baseline + pattern: "*.tp-baseline.vcf.gz.tbi" + - snp_roc: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.snp_roc.tsv.gz": + type: file + description: TSV files containing ROC data for the SNPs + pattern: "*.snp_roc.tsv.gz" + - non_snp_roc: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.non_snp_roc.tsv.gz": + type: file + description: TSV files containing ROC data for all variants except SNPs + pattern: "*.non_snp_roc.tsv.gz" + - weighted_roc: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.weighted_roc.tsv.gz": + type: file + description: TSV files containing weighted ROC data for all variants + pattern: "*.weighted_roc.tsv.gz" + - summary: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.summary.txt": + type: file + description: A TXT file containing the summary of the evaluation + pattern: "*.summary.txt" + - phasing: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.phasing.txt": + type: file + description: A TXT file containing the data on the phasing + pattern: "*.phasing.txt" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" diff --git a/modules/nf-core/rtgtools/vcfeval/tests/main.nf.test b/modules/nf-core/rtgtools/vcfeval/tests/main.nf.test new file mode 100644 index 0000000000..55abc84275 --- /dev/null +++ b/modules/nf-core/rtgtools/vcfeval/tests/main.nf.test @@ -0,0 +1,113 @@ +nextflow_process { + + name "Test Process RTGTOOLS_VCFEVAL" + script "../main.nf" + process "RTGTOOLS_VCFEVAL" + + tag "modules" + tag "modules_nfcore" + tag "rtgtools" + tag "rtgtools/vcfeval" + tag "untar" + + setup { + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = Channel.value([ + [id:'test'], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome_sdf.tar.gz', checkIfExists:true) + ]) + """ + } + } + } + + test("homo_sapiens - [vcf, tbi, truth, truth_tbi, truth_bed, regions_bed], sdf") { + 
+ when { + process { + """ + input[0] = Channel.of([ + [id:'test'], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.vcf.gz', checkIfExists:true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.vcf.gz.tbi', checkIfExists:true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.ann.vcf.gz', checkIfExists:true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.ann.vcf.gz.tbi', checkIfExists:true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.bed', checkIfExists:true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/multi_intervals.bed', checkIfExists:true) + ]) + input[1] = UNTAR.out.untar + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("homo_sapiens - [vcf, [], truth, [], [], []], sdf") { + + when { + process { + """ + input[0] = Channel.of([ + [id:'test'], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.vcf.gz', checkIfExists:true), + [], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.ann.vcf.gz', checkIfExists:true), + [], + [], + [] + ]) + input[1] = UNTAR.out.untar + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("homo_sapiens - [vcf, tbi, truth, truth_tbi, truth_bed, regions_bed], sdf - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([ + [id:'test'], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.vcf.gz', 
checkIfExists:true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.vcf.gz.tbi', checkIfExists:true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.ann.vcf.gz', checkIfExists:true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.ann.vcf.gz.tbi', checkIfExists:true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.bed', checkIfExists:true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/multi_intervals.bed', checkIfExists:true) + ]) + input[1] = UNTAR.out.untar + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/rtgtools/vcfeval/tests/main.nf.test.snap b/modules/nf-core/rtgtools/vcfeval/tests/main.nf.test.snap new file mode 100644 index 0000000000..4f39e2d464 --- /dev/null +++ b/modules/nf-core/rtgtools/vcfeval/tests/main.nf.test.snap @@ -0,0 +1,677 @@ +{ + "homo_sapiens - [vcf, tbi, truth, truth_tbi, truth_bed, regions_bed], sdf": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.tp.vcf.gz:md5,5171021307097220337dbcaccc860495" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.tp.vcf.gz.tbi:md5,092a7a3162e7cff25d273525751eb284" + ] + ], + "10": [ + [ + { + "id": "test" + }, + "test.weighted_roc.tsv.gz:md5,de36bf613b3dacf4a043311336bb4a94" + ] + ], + "11": [ + [ + { + "id": "test" + }, + "test.summary.txt:md5,f4c8df93c8bdab603036bbc27b4a28c3" + ] + ], + "12": [ + [ + { + "id": "test" + }, + "test.phasing.txt:md5,31988234bee208cacb3de90dabe1797f" + ] + ], + "13": [ + "versions.yml:md5,a228f0d9e8b205b4cc7c485151a77bb0" + ], + "2": [ + [ + { + "id": "test" + }, + "test.fn.vcf.gz:md5,fc419367818700d47df073615aeb9077" + ] + ], + "3": [ + [ + { + "id": "test" + }, + 
"test.fn.vcf.gz.tbi:md5,092a7a3162e7cff25d273525751eb284" + ] + ], + "4": [ + [ + { + "id": "test" + }, + "test.fp.vcf.gz:md5,5171021307097220337dbcaccc860495" + ] + ], + "5": [ + [ + { + "id": "test" + }, + "test.fp.vcf.gz.tbi:md5,092a7a3162e7cff25d273525751eb284" + ] + ], + "6": [ + [ + { + "id": "test" + }, + "test.tp-baseline.vcf.gz:md5,fc419367818700d47df073615aeb9077" + ] + ], + "7": [ + [ + { + "id": "test" + }, + "test.tp-baseline.vcf.gz.tbi:md5,092a7a3162e7cff25d273525751eb284" + ] + ], + "8": [ + [ + { + "id": "test" + }, + "test.snp_roc.tsv.gz:md5,11d7393a16c25ac0a092382fecafee9b" + ] + ], + "9": [ + [ + { + "id": "test" + }, + "test.non_snp_roc.tsv.gz:md5,eb0910409b8b088655defbd152103b81" + ] + ], + "baseline_tbi": [ + [ + { + "id": "test" + }, + "test.tp-baseline.vcf.gz.tbi:md5,092a7a3162e7cff25d273525751eb284" + ] + ], + "baseline_vcf": [ + [ + { + "id": "test" + }, + "test.tp-baseline.vcf.gz:md5,fc419367818700d47df073615aeb9077" + ] + ], + "fn_tbi": [ + [ + { + "id": "test" + }, + "test.fn.vcf.gz.tbi:md5,092a7a3162e7cff25d273525751eb284" + ] + ], + "fn_vcf": [ + [ + { + "id": "test" + }, + "test.fn.vcf.gz:md5,fc419367818700d47df073615aeb9077" + ] + ], + "fp_tbi": [ + [ + { + "id": "test" + }, + "test.fp.vcf.gz.tbi:md5,092a7a3162e7cff25d273525751eb284" + ] + ], + "fp_vcf": [ + [ + { + "id": "test" + }, + "test.fp.vcf.gz:md5,5171021307097220337dbcaccc860495" + ] + ], + "non_snp_roc": [ + [ + { + "id": "test" + }, + "test.non_snp_roc.tsv.gz:md5,eb0910409b8b088655defbd152103b81" + ] + ], + "phasing": [ + [ + { + "id": "test" + }, + "test.phasing.txt:md5,31988234bee208cacb3de90dabe1797f" + ] + ], + "snp_roc": [ + [ + { + "id": "test" + }, + "test.snp_roc.tsv.gz:md5,11d7393a16c25ac0a092382fecafee9b" + ] + ], + "summary": [ + [ + { + "id": "test" + }, + "test.summary.txt:md5,f4c8df93c8bdab603036bbc27b4a28c3" + ] + ], + "tp_tbi": [ + [ + { + "id": "test" + }, + "test.tp.vcf.gz.tbi:md5,092a7a3162e7cff25d273525751eb284" + ] + ], + "tp_vcf": [ + [ + { + "id": 
"test" + }, + "test.tp.vcf.gz:md5,5171021307097220337dbcaccc860495" + ] + ], + "versions": [ + "versions.yml:md5,a228f0d9e8b205b4cc7c485151a77bb0" + ], + "weighted_roc": [ + [ + { + "id": "test" + }, + "test.weighted_roc.tsv.gz:md5,de36bf613b3dacf4a043311336bb4a94" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-30T15:17:31.564974666" + }, + "homo_sapiens - [vcf, [], truth, [], [], []], sdf": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.tp.vcf.gz:md5,5125ee41457c9d93f46b19e32788edb4" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.tp.vcf.gz.tbi:md5,a0e9ac2d38c04bd591ab8f857c5c9133" + ] + ], + "10": [ + [ + { + "id": "test" + }, + "test.weighted_roc.tsv.gz:md5,5dfacd641b080cc8ad22eebec015c698" + ] + ], + "11": [ + [ + { + "id": "test" + }, + "test.summary.txt:md5,f33feb32f84958fb931063044fba369b" + ] + ], + "12": [ + [ + { + "id": "test" + }, + "test.phasing.txt:md5,133677dbd8be657439ea2b03fdfb8795" + ] + ], + "13": [ + "versions.yml:md5,a228f0d9e8b205b4cc7c485151a77bb0" + ], + "2": [ + [ + { + "id": "test" + }, + "test.fn.vcf.gz:md5,df96e4e4014cdb3050cb6f221f0cdca9" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test.fn.vcf.gz.tbi:md5,092a7a3162e7cff25d273525751eb284" + ] + ], + "4": [ + [ + { + "id": "test" + }, + "test.fp.vcf.gz:md5,d4bfa2c7271351ca19589f0f57f210b7" + ] + ], + "5": [ + [ + { + "id": "test" + }, + "test.fp.vcf.gz.tbi:md5,092a7a3162e7cff25d273525751eb284" + ] + ], + "6": [ + [ + { + "id": "test" + }, + "test.tp-baseline.vcf.gz:md5,920af25c3c18a438b11440702562fa35" + ] + ], + "7": [ + [ + { + "id": "test" + }, + "test.tp-baseline.vcf.gz.tbi:md5,95938320b425e28cf06c45ab45ad0360" + ] + ], + "8": [ + [ + { + "id": "test" + }, + "test.snp_roc.tsv.gz:md5,85edc0101bb9e8d3edc11abe4fdcda93" + ] + ], + "9": [ + [ + { + "id": "test" + }, + "test.non_snp_roc.tsv.gz:md5,30283ede3bcc5dd247f8a84bf345bf9a" + ] + ], + "baseline_tbi": [ + [ + { + "id": "test" + }, + 
"test.tp-baseline.vcf.gz.tbi:md5,95938320b425e28cf06c45ab45ad0360" + ] + ], + "baseline_vcf": [ + [ + { + "id": "test" + }, + "test.tp-baseline.vcf.gz:md5,920af25c3c18a438b11440702562fa35" + ] + ], + "fn_tbi": [ + [ + { + "id": "test" + }, + "test.fn.vcf.gz.tbi:md5,092a7a3162e7cff25d273525751eb284" + ] + ], + "fn_vcf": [ + [ + { + "id": "test" + }, + "test.fn.vcf.gz:md5,df96e4e4014cdb3050cb6f221f0cdca9" + ] + ], + "fp_tbi": [ + [ + { + "id": "test" + }, + "test.fp.vcf.gz.tbi:md5,092a7a3162e7cff25d273525751eb284" + ] + ], + "fp_vcf": [ + [ + { + "id": "test" + }, + "test.fp.vcf.gz:md5,d4bfa2c7271351ca19589f0f57f210b7" + ] + ], + "non_snp_roc": [ + [ + { + "id": "test" + }, + "test.non_snp_roc.tsv.gz:md5,30283ede3bcc5dd247f8a84bf345bf9a" + ] + ], + "phasing": [ + [ + { + "id": "test" + }, + "test.phasing.txt:md5,133677dbd8be657439ea2b03fdfb8795" + ] + ], + "snp_roc": [ + [ + { + "id": "test" + }, + "test.snp_roc.tsv.gz:md5,85edc0101bb9e8d3edc11abe4fdcda93" + ] + ], + "summary": [ + [ + { + "id": "test" + }, + "test.summary.txt:md5,f33feb32f84958fb931063044fba369b" + ] + ], + "tp_tbi": [ + [ + { + "id": "test" + }, + "test.tp.vcf.gz.tbi:md5,a0e9ac2d38c04bd591ab8f857c5c9133" + ] + ], + "tp_vcf": [ + [ + { + "id": "test" + }, + "test.tp.vcf.gz:md5,5125ee41457c9d93f46b19e32788edb4" + ] + ], + "versions": [ + "versions.yml:md5,a228f0d9e8b205b4cc7c485151a77bb0" + ], + "weighted_roc": [ + [ + { + "id": "test" + }, + "test.weighted_roc.tsv.gz:md5,5dfacd641b080cc8ad22eebec015c698" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-30T15:18:04.344989466" + }, + "homo_sapiens - [vcf, tbi, truth, truth_tbi, truth_bed, regions_bed], sdf - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.tp.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.tp.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "10": [ + [ + { + "id": "test" + }, + 
"test.weighted_roc.tsv.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "11": [ + [ + { + "id": "test" + }, + "test.summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "12": [ + [ + { + "id": "test" + }, + "test.phasing.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "13": [ + "versions.yml:md5,a228f0d9e8b205b4cc7c485151a77bb0" + ], + "2": [ + [ + { + "id": "test" + }, + "test.fn.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test.fn.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + { + "id": "test" + }, + "test.fp.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "5": [ + [ + { + "id": "test" + }, + "test.fp.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + [ + { + "id": "test" + }, + "test.tp-baseline.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "7": [ + [ + { + "id": "test" + }, + "test.tp-baseline.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "8": [ + [ + { + "id": "test" + }, + "test.snp_roc.tsv.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "9": [ + [ + { + "id": "test" + }, + "test.non_snp_roc.tsv.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "baseline_tbi": [ + [ + { + "id": "test" + }, + "test.tp-baseline.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "baseline_vcf": [ + [ + { + "id": "test" + }, + "test.tp-baseline.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "fn_tbi": [ + [ + { + "id": "test" + }, + "test.fn.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "fn_vcf": [ + [ + { + "id": "test" + }, + "test.fn.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "fp_tbi": [ + [ + { + "id": "test" + }, + "test.fp.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "fp_vcf": [ + [ + { + "id": "test" + }, + "test.fp.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "non_snp_roc": [ + [ + { + "id": "test" + }, + 
"test.non_snp_roc.tsv.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "phasing": [ + [ + { + "id": "test" + }, + "test.phasing.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "snp_roc": [ + [ + { + "id": "test" + }, + "test.snp_roc.tsv.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "summary": [ + [ + { + "id": "test" + }, + "test.summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tp_tbi": [ + [ + { + "id": "test" + }, + "test.tp.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tp_vcf": [ + [ + { + "id": "test" + }, + "test.tp.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,a228f0d9e8b205b4cc7c485151a77bb0" + ], + "weighted_roc": [ + [ + { + "id": "test" + }, + "test.weighted_roc.tsv.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-30T15:23:21.165461388" + } +} \ No newline at end of file diff --git a/modules/nf-core/rtgtools/vcfeval/tests/nextflow.config b/modules/nf-core/rtgtools/vcfeval/tests/nextflow.config new file mode 100644 index 0000000000..7563521514 --- /dev/null +++ b/modules/nf-core/rtgtools/vcfeval/tests/nextflow.config @@ -0,0 +1,4 @@ +process { + withName: UNTAR { + } +} diff --git a/nextflow.config b/nextflow.config index f01ed136d2..e847dc54ea 100644 --- a/nextflow.config +++ b/nextflow.config @@ -101,6 +101,11 @@ params { vep_spliceregion = null // spliceregion plugin disabled within VEP vep_version = "111.0-0" // Should be updated when we update VEP, needs this to get full path to some plugins + // Benchmark + benchmark = false // No benchmarking + truth_vcf = null // No truth vcf + truth_bed = null // No truth bed + // MultiQC options multiqc_config = null multiqc_title = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 2e66ccdf53..fbec35282f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -327,7 +327,7 @@ }, "cf_ploidy": { "type": 
"string", - "default": 2, + "default": "2", "fa_icon": "fas fa-bacon", "help_text": "In case of doubt, you can set different values and Control-FREEC will select the one that explains most observed CNAs Example: ploidy=2 , ploidy=2,3,4. For more details, see the [manual](http://boevalab.inf.ethz.ch/FREEC/tutorial.html).", "description": "Genome ploidy used by ControlFREEC" @@ -1028,5 +1028,16 @@ { "$ref": "#/$defs/generic_options" } - ] + ], + "properties": { + "benchmark": { + "type": "boolean" + }, + "truth_vcf": { + "type": "string" + }, + "truth_bed": { + "type": "string" + } + } } diff --git a/subworkflows/local/vcf_benchmark/main.nf b/subworkflows/local/vcf_benchmark/main.nf new file mode 100644 index 0000000000..c2853f1f51 --- /dev/null +++ b/subworkflows/local/vcf_benchmark/main.nf @@ -0,0 +1,62 @@ +// +// RUN BENCHMARKING OF SMALL GERMLINE VARIANTS +// Taken from nf-core/variantbenchmarking +// + +include { RTGTOOLS_FORMAT } from '../../../modules/nf-core/rtgtools/format/main' +include { RTGTOOLS_VCFEVAL } from '../../../modules/nf-core/rtgtools/vcfeval/main' +include { BCFTOOLS_INDEX as INDEX_QUERY } from '../../../modules/nf-core/bcftools/index/main.nf' +include { BCFTOOLS_INDEX as INDEX_TRUTH } from '../../../modules/nf-core/bcftools/index/main.nf' + +workflow VCF_BENCHMARK { + + take: + input_ch // channel: [val(meta),test_vcf] + input_truth // channel: [val(meta),truth_vcf,truth_bed] + fasta // reference channel [val(meta), ref.fa] + fai // reference channel [val(meta), ref.fa.fai] + + main: + versions = Channel.empty() + summary_reports = Channel.empty() + + query_vcf_tbi = Channel.empty() + truth_vcf_tbi = Channel.empty() + INDEX_TRUTH(input_ch.map { meta, test_vcf, truth_vcf, bed -> [ meta, test_vcf ] }) + INDEX_QUERY(input_ch.map { meta, test_vcf, truth_vcf, bed -> [ meta, truth_vcf ] }) + + versions = versions.mix(INDEX_TRUTH.out.versions) + versions = versions.mix(INDEX_QUERY.out.versions) + + query_vcf_tbi = 
query_vcf_tbi.mix(INDEX_QUERY.out.tbi) + truth_vcf_tbi = truth_vcf_tbi.mix(INDEX_TRUTH.out.tbi) + + // Use rtgtools format to generate sdf file if necessary + RTGTOOLS_FORMAT( + fasta.map { meta, fasta -> [ meta, fasta, [], [] ] } + ) + versions = versions.mix(RTGTOOLS_FORMAT.out.versions) + sdf = RTGTOOLS_FORMAT.out.sdf + + // apply rtgtools eval method + RTGTOOLS_VCFEVAL( + input_ch.map { meta, query_vcf, query_vcf_tbi, truth_vcf, truth_vcf_tbi, bed -> + [ meta, query_vcf, query_vcf_tbi, truth_vcf, truth_vcf_tbi, bed, [] ] + }, + sdf + ) + versions = versions.mix(RTGTOOLS_VCFEVAL.out.versions.first()) + + // collect summary reports + RTGTOOLS_VCFEVAL.out.summary + .map { meta, file -> tuple([vartype: meta.vartype] + [benchmark_tool: "rtgtools"], file) } + .groupTuple() + .set{ report } + + summary_reports = summary_reports.mix(report) + + emit: + summary_reports + versions + + } diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf index 6ece09f9a6..5ab9b7ca6a 100644 --- a/workflows/sarek/main.nf +++ b/workflows/sarek/main.nf @@ -74,6 +74,9 @@ include { BAM_VARIANT_CALLING_TUMOR_ONLY_ALL } from '../../subwor // Variant calling on tumor/normal pair include { BAM_VARIANT_CALLING_SOMATIC_ALL } from '../../subworkflows/local/bam_variant_calling_somatic_all/main' +// Benchmarking on germline small variants +include { VCF_BENCHMARK } from '../../subworkflows/local/vcf_benchmark/main' + // POST VARIANTCALLING: e.g. 
merging include { POST_VARIANTCALLING } from '../../subworkflows/local/post_variantcalling/main' @@ -139,6 +142,8 @@ workflow SAREK { vep_fasta vep_genome vep_species + truth_vcf + truth_bed main: @@ -892,6 +897,30 @@ workflow SAREK { } } + if (params.benchmark) { + + vcf_to_benchmark = Channel.empty() + vcf_to_benchmark = vcf_to_benchmark.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_deepvariant) + vcf_to_benchmark = vcf_to_benchmark.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_freebayes) + vcf_to_benchmark = vcf_to_benchmark.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_haplotypecaller) + vcf_to_benchmark = vcf_to_benchmark.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_manta) + vcf_to_benchmark = vcf_to_benchmark.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_sentieon_dnascope) + vcf_to_benchmark = vcf_to_benchmark.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_sentieon_haplotyper) + vcf_to_benchmark = vcf_to_benchmark.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_strelka) + vcf_to_benchmark = vcf_to_benchmark.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_tiddit) + vcf_to_benchmark = vcf_to_benchmark.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_mpileup) + + VCF_BENCHMARK( + vcf_to_benchmark.map{meta, vcf -> [ meta + [ file_name: vcf.baseName ], vcf ] }, + params.truth_vcf.map{meta, vcf -> [ meta, vcf, params.truth_bed ] }, + fasta, + fasta_fai + ) + + versions = versions.mix(VCF_BENCHMARK.out.versions) + + } + // // Collate and save software versions // From 66de48976280214934db69b46581885177d958f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Fri, 1 Nov 2024 16:02:17 +0100 Subject: [PATCH 2/8] less tools --- conf/benchmark.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/benchmark.config b/conf/benchmark.config index d0b5d8807c..f9c1df0a11 100644 --- a/conf/benchmark.config +++ b/conf/benchmark.config @@ -6,7 +6,7 @@ params { input = 
'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/csv/NA12878_Agilent_full_test.csv' // Other params - tools = 'strelka,freebayes,haplotypecaller,deepvariant,manta,tiddit,cnvkit,vep,snpeff' + tools = 'strelka,freebayes,vep' intervals = 's3://ngi-igenomes/test-data/sarek/Agilent_v7.bed' wes = true trim_fastq = true From dce4b9dd5da760bbd2e7f1e93c48ce519007aa02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Fri, 1 Nov 2024 16:37:32 +0100 Subject: [PATCH 3/8] try to solve channel error --- main.nf | 4 ++++ subworkflows/local/vcf_benchmark/main.nf | 23 +++++++++++++++++------ workflows/sarek/main.nf | 3 ++- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/main.nf b/main.nf index 58796d4d5b..11c6151a02 100755 --- a/main.nf +++ b/main.nf @@ -147,6 +147,10 @@ workflow NFCORE_SAREK { known_snps, pon) + // Gather Benchmark data from params + truth_vcf = params.truth_vcf ? Channel.fromPath(params.truth_vcf).map{ it -> [ [id:'truth_vcf'], it ] }.collect() : Channel.empty() + truth_bed = params.truth_bed ? Channel.fromPath(params.truth_bed).map{ it -> [ [id:'truth_bed'], it ] }.collect() : Channel.empty() + // Gather built indices or get them from the params // Built from the fasta file: dict = params.dict ? 
Channel.fromPath(params.dict).map{ it -> [ [id:'dict'], it ] }.collect() diff --git a/subworkflows/local/vcf_benchmark/main.nf b/subworkflows/local/vcf_benchmark/main.nf index c2853f1f51..b7fd687aa4 100644 --- a/subworkflows/local/vcf_benchmark/main.nf +++ b/subworkflows/local/vcf_benchmark/main.nf @@ -12,7 +12,8 @@ workflow VCF_BENCHMARK { take: input_ch // channel: [val(meta),test_vcf] - input_truth // channel: [val(meta),truth_vcf,truth_bed] + input_truth // channel: [val(meta),truth_vcf] + truth_bed // channel: [val(meta),truth_bed] fasta // reference channel [val(meta), ref.fa] fai // reference channel [val(meta), ref.fa.fai] @@ -22,8 +23,9 @@ workflow VCF_BENCHMARK { query_vcf_tbi = Channel.empty() truth_vcf_tbi = Channel.empty() - INDEX_TRUTH(input_ch.map { meta, test_vcf, truth_vcf, bed -> [ meta, test_vcf ] }) - INDEX_QUERY(input_ch.map { meta, test_vcf, truth_vcf, bed -> [ meta, truth_vcf ] }) + + INDEX_TRUTH( input_ch ) + INDEX_QUERY( input_truth ) versions = versions.mix(INDEX_TRUTH.out.versions) versions = versions.mix(INDEX_QUERY.out.versions) @@ -38,10 +40,19 @@ workflow VCF_BENCHMARK { versions = versions.mix(RTGTOOLS_FORMAT.out.versions) sdf = RTGTOOLS_FORMAT.out.sdf - // apply rtgtools eval method + // Combine input_ch and input_truth with query_vcf_tbi and truth_vcf_tbi + combined_inputs = input_ch.combine(input_truth) + .combine(truth_bed) + .combine(query_vcf_tbi) + .combine(truth_vcf_tbi) + + // Apply rtgtools eval method RTGTOOLS_VCFEVAL( - input_ch.map { meta, query_vcf, query_vcf_tbi, truth_vcf, truth_vcf_tbi, bed -> - [ meta, query_vcf, query_vcf_tbi, truth_vcf, truth_vcf_tbi, bed, [] ] + combined_inputs.map { ch, truth, bed, query_tbi, truth_tbi -> + def (meta, query_vcf) = ch + def (meta_truth, truth_vcf) = truth + def (meta_bed, truth_bed) = bed + [ meta, query_vcf, query_tbi, truth_vcf, truth_tbi, truth_bed ] }, sdf ) diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf index 5ab9b7ca6a..2cdf40f40d 100644 --- 
a/workflows/sarek/main.nf +++ b/workflows/sarek/main.nf @@ -912,7 +912,8 @@ workflow SAREK { VCF_BENCHMARK( vcf_to_benchmark.map{meta, vcf -> [ meta + [ file_name: vcf.baseName ], vcf ] }, - params.truth_vcf.map{meta, vcf -> [ meta, vcf, params.truth_bed ] }, + truth_vcf + truth_bed fasta, fasta_fai ) From 6e4055b4b20f19febb0ac05528f22390ddc07d05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Fri, 1 Nov 2024 16:41:10 +0100 Subject: [PATCH 4/8] fix small typo missing , --- workflows/sarek/main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf index 2cdf40f40d..16f2981f5c 100644 --- a/workflows/sarek/main.nf +++ b/workflows/sarek/main.nf @@ -912,8 +912,8 @@ workflow SAREK { VCF_BENCHMARK( vcf_to_benchmark.map{meta, vcf -> [ meta + [ file_name: vcf.baseName ], vcf ] }, - truth_vcf - truth_bed + truth_vcf, + truth_bed, fasta, fasta_fai ) From 1df684066323ea8977c9351e993690a062ea179b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Fri, 1 Nov 2024 16:46:10 +0100 Subject: [PATCH 5/8] fix indexing --- subworkflows/local/vcf_benchmark/main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/vcf_benchmark/main.nf b/subworkflows/local/vcf_benchmark/main.nf index b7fd687aa4..4f293340fc 100644 --- a/subworkflows/local/vcf_benchmark/main.nf +++ b/subworkflows/local/vcf_benchmark/main.nf @@ -24,8 +24,8 @@ workflow VCF_BENCHMARK { query_vcf_tbi = Channel.empty() truth_vcf_tbi = Channel.empty() - INDEX_TRUTH( input_ch ) - INDEX_QUERY( input_truth ) + INDEX_TRUTH( input_truth ) + INDEX_QUERY( input_ch ) versions = versions.mix(INDEX_TRUTH.out.versions) versions = versions.mix(INDEX_QUERY.out.versions) From fc50e44c831a2d5bb98713175e178abd6b6057e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Fri, 1 Nov 2024 16:59:45 +0100 Subject: [PATCH 6/8] update schema --- nextflow_schema.json | 66 
+++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index fbec35282f..9c119e98dd 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -11,7 +11,10 @@ "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", "help_text": "Specify input samplesheet, step and output folder.", - "required": ["step", "outdir"], + "required": [ + "step", + "outdir" + ], "properties": { "input": { "description": "Path to comma-separated file containing information about the samples in the experiment.", @@ -215,7 +218,12 @@ "type": "string", "default": "bwa-mem", "fa_icon": "fas fa-puzzle-piece", - "enum": ["bwa-mem", "bwa-mem2", "dragmap", "sentieon-bwamem"], + "enum": [ + "bwa-mem", + "bwa-mem2", + "dragmap", + "sentieon-bwamem" + ], "description": "Specify aligner to be used to map reads to reference genome.", "help_text": "Sarek will build missing indices automatically if not provided. Set `--bwa false` if indices should be (re-)built.\nIf DragMap is selected as aligner, it is recommended to skip baserecalibration with `--skip_tools baserecalibrator`. For more info see [here](https://gatk.broadinstitute.org/hc/en-us/articles/4407897446939--How-to-Run-germline-single-sample-short-variant-discovery-in-DRAGEN-mode)." }, @@ -515,7 +523,11 @@ "type": "string", "default": "vcf", "description": "VEP output-file format.", - "enum": ["json", "tab", "vcf"], + "enum": [ + "json", + "tab", + "vcf" + ], "help_text": "Sets the format of the output-file from VEP. 
Available formats: json, tab and vcf.", "fa_icon": "fas fa-table" }, @@ -593,7 +605,10 @@ "type": "string", "description": "ASCAT genome.", "help_text": "Must be set to run ASCAT, either hg19 or hg38.\n\nIf you use AWS iGenomes, this has already been set for you appropriately.", - "enum": ["hg19", "hg38"] + "enum": [ + "hg19", + "hg38" + ] }, "ascat_alleles": { "type": "string", @@ -912,7 +927,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email": { @@ -995,6 +1017,26 @@ "hidden": true } } + }, + "benchmark_options": { + "title": "Benchmark options", + "type": "object", + "description": "Options for benchmarking germline small variant calls against a truth set.", + "default": "", + "properties": { + "benchmark": { + "type": "boolean", + "description": "Run benchmarking of germline small variant calls against a truth set." + }, + "truth_vcf": { + "type": "string", + "description": "Path or URL to the truth VCF file used for benchmarking." + }, + "truth_bed": { + "type": "string", + "description": "Path or URL to the truth BED file used together with the truth VCF." + } + } } }, "allOf": [ @@ -1027,17 +1069,9 @@ }, { "$ref": "#/$defs/generic_options" - } - ], - "properties": { - "benchmark": { - "type": "boolean" }, - "truth_vcf": { - "type": "string" - }, - "truth_bed": { - "type": "string" + { + "$ref": "#/$defs/benchmark_options" } - } + ] } From 13c8adf133d089433c391cf16437015d65d33cf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Fri, 1 Nov 2024 17:09:15 +0100 Subject: [PATCH 7/8] change channels --- subworkflows/local/vcf_benchmark/main.nf | 5 ++++- 1 file changed, 4 insertions(+), 
1 deletion(-) diff --git a/subworkflows/local/vcf_benchmark/main.nf b/subworkflows/local/vcf_benchmark/main.nf index 4f293340fc..25723a39f5 100644 --- a/subworkflows/local/vcf_benchmark/main.nf +++ b/subworkflows/local/vcf_benchmark/main.nf @@ -40,8 +40,11 @@ workflow VCF_BENCHMARK { versions = versions.mix(RTGTOOLS_FORMAT.out.versions) sdf = RTGTOOLS_FORMAT.out.sdf + query_ch = Channel.from(input_ch) + truth_ch = Channel.from(input_truth) + // Combine input_ch and input_truth with query_vcf_tbi and truth_vcf_tbi - combined_inputs = input_ch.combine(input_truth) + combined_inputs = query_ch.combine(truth_ch) .combine(truth_bed) .combine(query_vcf_tbi) .combine(truth_vcf_tbi) From 77e713a93fe86978f0e789d765356eed9792fef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Fri, 1 Nov 2024 17:13:32 +0100 Subject: [PATCH 8/8] should work --- subworkflows/local/vcf_benchmark/main.nf | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/vcf_benchmark/main.nf b/subworkflows/local/vcf_benchmark/main.nf index 25723a39f5..17e3017a39 100644 --- a/subworkflows/local/vcf_benchmark/main.nf +++ b/subworkflows/local/vcf_benchmark/main.nf @@ -18,6 +18,7 @@ workflow VCF_BENCHMARK { fai // reference channel [val(meta), ref.fa.fai] main: + input_ch.view() versions = Channel.empty() summary_reports = Channel.empty() @@ -40,11 +41,8 @@ workflow VCF_BENCHMARK { versions = versions.mix(RTGTOOLS_FORMAT.out.versions) sdf = RTGTOOLS_FORMAT.out.sdf - query_ch = Channel.from(input_ch) - truth_ch = Channel.from(input_truth) - // Combine input_ch and input_truth with query_vcf_tbi and truth_vcf_tbi - combined_inputs = query_ch.combine(truth_ch) + combined_inputs = input_ch.combine(input_truth) .combine(truth_bed) .combine(query_vcf_tbi) .combine(truth_vcf_tbi)