From 305f87a1e0cf7a2a549275d8f5713bdd42d0c1f9 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 1 May 2024 08:20:42 +0000 Subject: [PATCH 01/15] Skip the subworkflow if the nohit list is empty --- subworkflows/local/run_blastn.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/run_blastn.nf b/subworkflows/local/run_blastn.nf index cc1fa6c5..d1ab58da 100644 --- a/subworkflows/local/run_blastn.nf +++ b/subworkflows/local/run_blastn.nf @@ -31,7 +31,7 @@ workflow RUN_BLASTN { // Subset of sequences with no hits SEQTK_SUBSEQ ( fasta, - NOHIT_LIST.out.nohitlist.map { meta, nohit -> nohit } + NOHIT_LIST.out.nohitlist.map { meta, nohit -> nohit } . filter { it.size() > 0 } ) ch_versions = ch_versions.mix ( SEQTK_SUBSEQ.out.versions.first() ) From 2737952ceea54bb10ca594200bd0e3fb92531a76 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 1 May 2024 15:04:00 +0000 Subject: [PATCH 02/15] Fixed the documentation --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 31512bc0..c7b92970 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,8 @@ It takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome 4. Run BUSCO ([`busco`](https://busco.ezlab.org/)) 5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit)) 6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond)) -7. Run BLASTn against extracted BUSCO genes ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/)) -8. Run BLASTx against extracted BUSCO genes ([`blast/blastx`](https://www.ncbi.nlm.nih.gov/books/NBK131777/)) +7. Run Diamond BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond)) +8. Run BLASTn against sequences still with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/)) 9. 
Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit)) 10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit)) 11. Imports analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit)) From 948e139f0e798de1d508e5d0a9664b2ce0669141 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 9 May 2024 10:15:14 +0100 Subject: [PATCH 03/15] Increased the resources for blastn --- conf/base.config | 6 ++++++ modules/nf-core/blast/blastn/blast-blastn.diff | 13 ++++++++++--- modules/nf-core/blast/blastn/main.nf | 1 - 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/conf/base.config b/conf/base.config index 8f51f7f8..65058d5e 100644 --- a/conf/base.config +++ b/conf/base.config @@ -104,6 +104,12 @@ process { time = { check_max( 3.h * Math.ceil(meta.genome_size/1000000000) * task.attempt, 'time') } } + withName: "BLAST_BLASTN" { + cpus = { check_max( 24 * task.attempt, 'cpus' ) } + memory = { check_max( 100.MB * task.attempt, 'memory' ) } + time = { check_max( 12.h * task.attempt, 'time' ) } + } + withName:CUSTOM_DUMPSOFTWAREVERSIONS { cache = false } diff --git a/modules/nf-core/blast/blastn/blast-blastn.diff b/modules/nf-core/blast/blastn/blast-blastn.diff index 1695c793..449f7240 100644 --- a/modules/nf-core/blast/blastn/blast-blastn.diff +++ b/modules/nf-core/blast/blastn/blast-blastn.diff @@ -1,7 +1,14 @@ Changes in module 'nf-core/blast/blastn' --- modules/nf-core/blast/blastn/main.nf +++ modules/nf-core/blast/blastn/main.nf -@@ -10,6 +10,7 @@ +@@ -1,6 +1,5 @@ + process BLAST_BLASTN { + tag "$meta.id" +- label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+@@ -10,6 +9,7 @@ input: tuple val(meta) , path(fasta) tuple val(meta2), path(db) @@ -9,7 +16,7 @@ Changes in module 'nf-core/blast/blastn' output: tuple val(meta), path('*.txt'), emit: txt -@@ -23,6 +24,7 @@ +@@ -23,6 +23,7 @@ def prefix = task.ext.prefix ?: "${meta.id}" def is_compressed = fasta.getExtension() == "gz" ? true : false def fasta_name = is_compressed ? fasta.getBaseName() : fasta @@ -17,7 +24,7 @@ Changes in module 'nf-core/blast/blastn' """ if [ "${is_compressed}" == "true" ]; then -@@ -39,6 +41,7 @@ +@@ -39,6 +40,7 @@ -num_threads ${task.cpus} \\ -db \$DB \\ -query ${fasta_name} \\ diff --git a/modules/nf-core/blast/blastn/main.nf b/modules/nf-core/blast/blastn/main.nf index 065ad7cd..368e7bcc 100644 --- a/modules/nf-core/blast/blastn/main.nf +++ b/modules/nf-core/blast/blastn/main.nf @@ -1,6 +1,5 @@ process BLAST_BLASTN { tag "$meta.id" - label 'process_medium' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? From a25e4bba1fc24e5094c51fa46838b9d681765cf6 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 13 May 2024 10:32:31 +0100 Subject: [PATCH 04/15] Version bump --- CHANGELOG.md | 8 ++++++++ nextflow.config | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 240bcd13..e9bfd4c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[0.5.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.0)] – Snorlax – [] + +General tidy up of the configuration and the pipeline + +### Enhancements & fixes + +- Increased the resources for blastn + ## [[0.4.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.4.0)] – Buneary – [2024-04-17] The pipeline has now been validated on dozens of genomes, up to 11 Gbp. 
diff --git a/nextflow.config b/nextflow.config index 83aaaafc..47bd59bc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,7 +17,7 @@ params { mask = false fetchngs_samplesheet = false - // Reference options + // Reference options fasta = null accession = null taxon = null @@ -248,7 +248,7 @@ manifest { description = """Quality assessment of genome assemblies""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '0.4.0' + version = '0.5.0' doi = '10.5281/zenodo.7949058' } From 162b5faf29749ce9d98d604645704373d752ac31 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 13 May 2024 10:41:06 +0100 Subject: [PATCH 05/15] Removed an unused parameter --- CHANGELOG.md | 9 +++++++++ nextflow.config | 1 - nextflow_schema.json | 7 ------- subworkflows/local/busco_diamond_blastp.nf | 8 +++++--- workflows/blobtoolkit.nf | 11 ++--------- 5 files changed, 16 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9bfd4c7..876db76e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,15 @@ General tidy up of the configuration and the pipeline ### Enhancements & fixes - Increased the resources for blastn +- Removed the `--taxa_file` option. The taxon now has to be provided through `--taxon`, which accepts both names and integers. + +### Parameters + +| Old parameter | New parameter | +| ------------- | ------------- | +| --taxa_file | | + +> **NB:** Parameter has been **updated** if both old and new parameter information is present.
**NB:** Parameter has been **added** if just the new parameter information is present.
**NB:** Parameter has been **removed** if new parameter information isn't present. ## [[0.4.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.4.0)] – Buneary – [2024-04-17] diff --git a/nextflow.config b/nextflow.config index 47bd59bc..65d14441 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,7 +21,6 @@ params { fasta = null accession = null taxon = null - taxa_file = null // Output options image_format = 'png' diff --git a/nextflow_schema.json b/nextflow_schema.json index 97c84534..7cd7e72d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -102,13 +102,6 @@ "description": "Define the location and parameters to work with databases.", "required": ["blastp", "blastx", "blastn", "taxdump"], "properties": { - "taxa_file": { - "type": "string", - "format": "file-path", - "description": "Path to file containing the BUSCO lineages for the genome species", - "help_text": "If this file is not included, the relevant BUSCO lineages are automatically calculated using the taxon parameter.", - "fa_icon": "fas fa-file-alt" - }, "busco": { "type": "string", "format": "directory-path", diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index c3ebe104..74c9c3aa 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -12,7 +12,7 @@ include { RESTRUCTUREBUSCODIR } from '../../modules/local/restructurebusco workflow BUSCO_DIAMOND { take: fasta // channel: [ val(meta), path(fasta) ] - taxon_taxa // channel: [ val(meta, val(taxon), path(taxa) ] + taxon // channel: val(taxon) busco_db // channel: path(busco_db) blastp // channel: path(blastp_db) outext // channel: val(out_format) @@ -24,9 +24,11 @@ workflow BUSCO_DIAMOND { // - // Fetch BUSCO lineages for taxon (or taxa) + // Fetch BUSCO lineages for taxon // - GOAT_TAXONSEARCH ( taxon_taxa ) + GOAT_TAXONSEARCH ( + fasta.combine(taxon).map { meta, fasta, taxon -> [ meta, taxon, [] ] } + ) ch_versions = 
ch_versions.mix ( GOAT_TAXONSEARCH.out.versions.first() ) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index f25da1eb..7a8dd939 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -17,7 +17,7 @@ WorkflowBlobtoolkit.initialise(params, log) // Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.taxa_file, params.taxdump, params.busco, params.blastp, params.blastx ] +def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.taxdump, params.busco, params.blastp, params.blastx ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters @@ -128,16 +128,9 @@ workflow BLOBTOOLKIT { // // SUBWORKFLOW: Run BUSCO using lineages fetched from GOAT, then run diamond_blastp // - if (params.taxa_file) { - ch_taxa = Channel.from(params.taxa_file) - ch_taxon_taxa = PREPARE_GENOME.out.genome.combine(ch_taxon).combine(ch_taxa).map { meta, fasta, taxon, taxa -> [ meta, taxon, taxa ] } - } else { - ch_taxon_taxa = PREPARE_GENOME.out.genome.combine(ch_taxon).map { meta, fasta, taxon -> [ meta, taxon, [] ] } - } - BUSCO_DIAMOND ( PREPARE_GENOME.out.genome, - ch_taxon_taxa, + ch_taxon, ch_busco_db, ch_blastp, params.blastp_outext, From afb27d32961bed65a91895aefc5d87a92f553373 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 17 May 2024 14:20:33 +0100 Subject: [PATCH 06/15] Removed trailing whitespace --- subworkflows/local/blobtools.nf | 4 ++-- subworkflows/local/busco_diamond_blastp.nf | 6 +++--- subworkflows/local/collate_stats.nf | 2 +- subworkflows/local/coverage_stats.nf | 6 +++--- subworkflows/local/minimap_alignment.nf | 4 ++-- subworkflows/local/prepare_genome.nf | 2 +- subworkflows/local/run_blastn.nf | 12 ++++++------ subworkflows/local/run_blastx.nf | 2 +- 8 files changed, 19 insertions(+), 19 
deletions(-) diff --git a/subworkflows/local/blobtools.nf b/subworkflows/local/blobtools.nf index 8411ad24..747bc9fa 100644 --- a/subworkflows/local/blobtools.nf +++ b/subworkflows/local/blobtools.nf @@ -28,14 +28,14 @@ workflow BLOBTOOLS { ch_versions = ch_versions.mix ( BLOBTOOLKIT_METADATA.out.versions.first() ) - // + // // Create Blobtools dataset files // BLOBTOOLKIT_CREATEBLOBDIR ( windowstats, busco, blastp, BLOBTOOLKIT_METADATA.out.yaml, taxdump ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_CREATEBLOBDIR.out.versions.first() ) - // + // // Update Blobtools dataset files // BLOBTOOLKIT_UPDATEBLOBDIR ( BLOBTOOLKIT_CREATEBLOBDIR.out.blobdir, blastx, blastn, taxdump ) diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index 74c9c3aa..e22ce3f8 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -30,7 +30,7 @@ workflow BUSCO_DIAMOND { fasta.combine(taxon).map { meta, fasta, taxon -> [ meta, taxon, [] ] } ) ch_versions = ch_versions.mix ( GOAT_TAXONSEARCH.out.versions.first() ) - + // // Get NCBI species ID @@ -110,7 +110,7 @@ workflow BUSCO_DIAMOND { // // Align BUSCO genes against the BLASTp database - // + // BLOBTOOLKIT_EXTRACTBUSCOS.out.genes | filter { it[1].size() > 140 } | set { ch_busco_genes } @@ -143,7 +143,7 @@ workflow BUSCO_DIAMOND { emit: - first_table = ch_first_table // channel: [ val(meta), path(full_table) ] + first_table = ch_first_table // channel: [ val(meta), path(full_table) ] all_tables = ch_indexed_buscos // channel: [ val(meta), path(full_tables) ] blastp_txt = DIAMOND_BLASTP.out.txt // channel: [ val(meta), path(txt) ] taxon_id = ch_taxid // channel: taxon_id diff --git a/subworkflows/local/collate_stats.nf b/subworkflows/local/collate_stats.nf index 08bc43c9..b986188d 100644 --- a/subworkflows/local/collate_stats.nf +++ b/subworkflows/local/collate_stats.nf @@ -8,7 +8,7 @@ include { BLOBTOOLKIT_WINDOWSTATS } from 
'../../modules/local/blobtoolkit/window workflow COLLATE_STATS { - take: + take: busco // channel: [ val(meta), path(full_table) ] bed // channel: [ val(meta), path(bed) ] freq // channel: [ val(meta), path(freq) ] diff --git a/subworkflows/local/coverage_stats.nf b/subworkflows/local/coverage_stats.nf index 79b39a8a..78be4620 100644 --- a/subworkflows/local/coverage_stats.nf +++ b/subworkflows/local/coverage_stats.nf @@ -10,8 +10,8 @@ include { CREATE_BED } from '../../modules/local/create_bed' workflow COVERAGE_STATS { - take: - input // channel: [ val(meta), path(aln) ] + take: + input // channel: [ val(meta), path(aln) ] fasta // channel: [ val(meta), path(fasta) ] @@ -57,7 +57,7 @@ workflow COVERAGE_STATS { CREATE_BED ( FASTAWINDOWS.out.mononuc ) ch_versions = ch_versions.mix ( CREATE_BED.out.versions.first() ) - + // Calculate coverage BLOBTK_DEPTH ( ch_bam_csi ) ch_versions = ch_versions.mix ( BLOBTK_DEPTH.out.versions.first() ) diff --git a/subworkflows/local/minimap_alignment.nf b/subworkflows/local/minimap_alignment.nf index 1d6263b3..0c25f4c7 100644 --- a/subworkflows/local/minimap_alignment.nf +++ b/subworkflows/local/minimap_alignment.nf @@ -1,4 +1,4 @@ -// +// // Optional alignment subworkflow using Minimap2 // @@ -52,7 +52,7 @@ workflow MINIMAP2_ALIGNMENT { // Align with Minimap2 MINIMAP2_HIC ( ch_input.hic, fasta, true, false, false ) ch_versions = ch_versions.mix(MINIMAP2_HIC.out.versions.first()) - + MINIMAP2_ILMN ( ch_input.illumina, fasta, true, false, false ) ch_versions = ch_versions.mix(MINIMAP2_ILMN.out.versions.first()) diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index 0b426fae..a1f03980 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -48,7 +48,7 @@ workflow PREPARE_GENOME { ch_fasta = ch_genome } - + emit: genome = ch_fasta // channel: [ meta, path(genome) ] versions = ch_versions // channel: [ versions.yml ] diff --git 
a/subworkflows/local/run_blastn.nf b/subworkflows/local/run_blastn.nf index d1ab58da..1ea64b82 100644 --- a/subworkflows/local/run_blastn.nf +++ b/subworkflows/local/run_blastn.nf @@ -12,8 +12,8 @@ include { BLOBTOOLKIT_UNCHUNK } from '../../modules/local/blobtoolkit/u workflow RUN_BLASTN { - take: - blast_table // channel: [ val(meta), path(blast_table) ] + take: + blast_table // channel: [ val(meta), path(blast_table) ] fasta // channel: [ val(meta), path(fasta) ] blastn // channel: [ val(meta), path(blastn_db) ] taxon_id // channel: val(taxon_id) @@ -27,16 +27,16 @@ workflow RUN_BLASTN { // Get list of sequence ids with no hits in diamond blastx search NOHIT_LIST ( blast_table, fasta ) ch_versions = ch_versions.mix ( NOHIT_LIST.out.versions.first() ) - + // Subset of sequences with no hits SEQTK_SUBSEQ ( fasta, NOHIT_LIST.out.nohitlist.map { meta, nohit -> nohit } . filter { it.size() > 0 } ) ch_versions = ch_versions.mix ( SEQTK_SUBSEQ.out.versions.first() ) - - - // Split long contigs into chunks + + + // Split long contigs into chunks // create chunks BLOBTOOLKIT_CHUNK ( SEQTK_SUBSEQ.out.sequences, [[],[]] ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_CHUNK.out.versions.first() ) diff --git a/subworkflows/local/run_blastx.nf b/subworkflows/local/run_blastx.nf index 1bad6f6d..b26d29bf 100644 --- a/subworkflows/local/run_blastx.nf +++ b/subworkflows/local/run_blastx.nf @@ -31,7 +31,7 @@ workflow RUN_BLASTX { // DIAMOND_BLASTX ( BLOBTOOLKIT_CHUNK.out.chunks, blastx, outext, cols) ch_versions = ch_versions.mix ( DIAMOND_BLASTX.out.versions.first() ) - + // // Unchunk chunked blastx results From 4a226cfabca4939941c46ce64407b89a4d28b671 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 17 May 2024 14:42:56 +0100 Subject: [PATCH 07/15] Channel.of(...).first() is the same as Channel.value(...) 
--- workflows/blobtoolkit.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 7a8dd939..9f45e274 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -22,11 +22,11 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check mandatory parameters if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } -if (params.fasta && params.accession) { ch_fasta = Channel.of([ [ 'id': params.accession ], params.fasta ]).first() } else { exit 1, 'Genome fasta file and accession must be specified!' } +if (params.fasta && params.accession) { ch_fasta = Channel.value([ [ 'id': params.accession ], params.fasta ]) } else { exit 1, 'Genome fasta file and accession must be specified!' } if (params.taxon) { ch_taxon = Channel.of(params.taxon) } else { exit 1, 'NCBI Taxon ID not specified!' } -if (params.blastp && params.accession) { ch_blastp = Channel.of([ [ 'id': params.accession ], params.blastp ]).first() } else { exit 1, 'Diamond BLASTp database and accession must be specified!' } -if (params.blastx && params.accession) { ch_blastx = Channel.of([ [ 'id': params.accession ], params.blastx ]).first() } else { exit 1, 'Diamond BLASTx database and accession must be specified!' } -if (params.blastn && params.accession) { ch_blastn = Channel.of([ [ 'id': params.accession ], params.blastn ]).first() } else { exit 1, 'BLASTn database not specified!' } +if (params.blastp && params.accession) { ch_blastp = Channel.value([ [ 'id': params.accession ], params.blastp ]) } else { exit 1, 'Diamond BLASTp database and accession must be specified!' } +if (params.blastx && params.accession) { ch_blastx = Channel.value([ [ 'id': params.accession ], params.blastx ]) } else { exit 1, 'Diamond BLASTx database and accession must be specified!' 
} +if (params.blastn && params.accession) { ch_blastn = Channel.value([ [ 'id': params.accession ], params.blastn ]) } else { exit 1, 'BLASTn database not specified!' } if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' } if (params.fetchngs_samplesheet && !params.align) { exit 1, '--align not specified, even though the input samplesheet is a nf-core/fetchngs one - i.e has fastq files!' } From 0b12ad436da170eed7905ce9c8cf182a5deb6053 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 17 May 2024 14:46:58 +0100 Subject: [PATCH 08/15] The accession is not appropriate there --- workflows/blobtoolkit.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 9f45e274..b1bedcac 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -24,9 +24,9 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.fasta && params.accession) { ch_fasta = Channel.value([ [ 'id': params.accession ], params.fasta ]) } else { exit 1, 'Genome fasta file and accession must be specified!' } if (params.taxon) { ch_taxon = Channel.of(params.taxon) } else { exit 1, 'NCBI Taxon ID not specified!' } -if (params.blastp && params.accession) { ch_blastp = Channel.value([ [ 'id': params.accession ], params.blastp ]) } else { exit 1, 'Diamond BLASTp database and accession must be specified!' } -if (params.blastx && params.accession) { ch_blastx = Channel.value([ [ 'id': params.accession ], params.blastx ]) } else { exit 1, 'Diamond BLASTx database and accession must be specified!' } -if (params.blastn && params.accession) { ch_blastn = Channel.value([ [ 'id': params.accession ], params.blastn ]) } else { exit 1, 'BLASTn database not specified!' 
} +if (params.blastp) { ch_blastp = Channel.value([ [ 'id': file(params.blastp).baseName ], params.blastp ]) } else { exit 1, 'Diamond BLASTp database must be specified!' } +if (params.blastx) { ch_blastx = Channel.value([ [ 'id': file(params.blastx).baseName ], params.blastx ]) } else { exit 1, 'Diamond BLASTx database must be specified!' } +if (params.blastn) { ch_blastn = Channel.value([ [ 'id': file(params.blastn).baseName ], params.blastn ]) } else { exit 1, 'BLASTn database not specified!' } if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' } if (params.fetchngs_samplesheet && !params.align) { exit 1, '--align not specified, even though the input samplesheet is a nf-core/fetchngs one - i.e has fastq files!' } From 043a1202b8e7dcbac69dc3a31d3964cd5423b87b Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 17 May 2024 15:28:14 +0100 Subject: [PATCH 09/15] Removed some parameters that could in fact not be changed --- CHANGELOG.md | 12 ++++-- nextflow.config | 4 -- nextflow_schema.json | 24 ------------ subworkflows/local/busco_diamond_blastp.nf | 5 ++- subworkflows/local/run_blastx.nf | 5 ++- workflows/blobtoolkit.nf | 44 ++++++++++------------ 6 files changed, 34 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 876db76e..86425753 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,13 +10,17 @@ General tidy up of the configuration and the pipeline ### Enhancements & fixes - Increased the resources for blastn -- Removed the `--taxa_file` option. The taxon now has to be provided through `--taxon`, which accepts both names and integers. 
+- Removed some options that were not used or not needed ### Parameters -| Old parameter | New parameter | -| ------------- | ------------- | -| --taxa_file | | +| Old parameter | New parameter | +| --------------- | ------------- | +| --taxa_file | | +| --blastp_outext | | +| --blastp_cols | | +| --blastx_outext | | +| --blastx_cols | | > **NB:** Parameter has been **updated** if both old and new parameter information is present.
**NB:** Parameter has been **added** if just the new parameter information is present.
**NB:** Parameter has been **removed** if new parameter information isn't present. diff --git a/nextflow.config b/nextflow.config index 65d14441..db5ef388 100644 --- a/nextflow.config +++ b/nextflow.config @@ -31,10 +31,6 @@ params { blastp = null blastx = null blastn = null - blastp_outext = 'txt' - blastp_cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' - blastx_outext = 'txt' - blastx_cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' // MultiQC options multiqc_config = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 7cd7e72d..8fd05062 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -108,30 +108,6 @@ "description": "Local directory where clade-specific BUSCO lineage datasets are stored", "fa_icon": "fas fa-folder-open" }, - "blastp_cols": { - "type": "string", - "description": "When blastp_outext is 'txt', this is the list of columns that Diamond BLAST should print.", - "default": "qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" - }, - "blastp_outext": { - "type": "string", - "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], - "description": "Extension (file format) of the output file from Diamond BLAST.", - "fa_icon": "fas fa-file-circle-question", - "default": "txt" - }, - "blastx_cols": { - "type": "string", - "description": "When blastx_outext is 'txt', this is the list of columns that Diamond BLAST should print.", - "default": "qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" - }, - "blastx_outext": { - "type": "string", - "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], - "description": "Extension (file format) of the output file from Diamond BLAST.", - "fa_icon": "fas fa-file-circle-question", - "default": "txt" - }, "blastp": { "type": 
"string", "format": "file-path", diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index e22ce3f8..2efb2728 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -15,8 +15,6 @@ workflow BUSCO_DIAMOND { taxon // channel: val(taxon) busco_db // channel: path(busco_db) blastp // channel: path(blastp_db) - outext // channel: val(out_format) - cols // channel: val(column_names) main: @@ -115,6 +113,9 @@ workflow BUSCO_DIAMOND { | filter { it[1].size() > 140 } | set { ch_busco_genes } + // Hardcoded to match the format expected by blobtools + def outext = 'txt' + def cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' DIAMOND_BLASTP ( ch_busco_genes, blastp, outext, cols ) ch_versions = ch_versions.mix ( DIAMOND_BLASTP.out.versions.first() ) diff --git a/subworkflows/local/run_blastx.nf b/subworkflows/local/run_blastx.nf index b26d29bf..ed2df41f 100644 --- a/subworkflows/local/run_blastx.nf +++ b/subworkflows/local/run_blastx.nf @@ -11,8 +11,6 @@ workflow RUN_BLASTX { fasta // channel: [ val(meta), path(fasta) ] table // channel: [ val(meta), path(busco_table) ] blastx // channel: [ val(meta), path(blastx_db) ] - outext // channel: val(out_format) - cols // channel: val(column_names) main: @@ -29,6 +27,9 @@ workflow RUN_BLASTX { // // Run diamond_blastx // + // Hardcoded to match the format expected by blobtools + def outext = 'txt' + def cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' DIAMOND_BLASTX ( BLOBTOOLKIT_CHUNK.out.chunks, blastx, outext, cols) ch_versions = ch_versions.mix ( DIAMOND_BLASTX.out.versions.first() ) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index b1bedcac..231adfa6 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -108,7 +108,7 @@ workflow BLOBTOOLKIT { INPUT_CHECK ( 
ch_input, PREPARE_GENOME.out.genome, ch_yaml ) ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions ) - // + // // SUBWORKFLOW: Optional read alignment // if ( params.align ) { @@ -120,7 +120,7 @@ workflow BLOBTOOLKIT { } // - // SUBWORKFLOW: Calculate genome coverage and statistics + // SUBWORKFLOW: Calculate genome coverage and statistics // COVERAGE_STATS ( ch_aligned, PREPARE_GENOME.out.genome ) ch_versions = ch_versions.mix ( COVERAGE_STATS.out.versions ) @@ -128,25 +128,21 @@ workflow BLOBTOOLKIT { // // SUBWORKFLOW: Run BUSCO using lineages fetched from GOAT, then run diamond_blastp // - BUSCO_DIAMOND ( - PREPARE_GENOME.out.genome, + BUSCO_DIAMOND ( + PREPARE_GENOME.out.genome, ch_taxon, - ch_busco_db, - ch_blastp, - params.blastp_outext, - params.blastp_cols + ch_busco_db, + ch_blastp, ) ch_versions = ch_versions.mix ( BUSCO_DIAMOND.out.versions ) - + // // SUBWORKFLOW: Diamond blastx search of assembly contigs against the UniProt reference proteomes // - RUN_BLASTX ( + RUN_BLASTX ( PREPARE_GENOME.out.genome, BUSCO_DIAMOND.out.first_table, ch_blastx, - params.blastx_outext, - params.blastx_cols ) ch_versions = ch_versions.mix ( RUN_BLASTX.out.versions ) @@ -154,29 +150,29 @@ workflow BLOBTOOLKIT { // // SUBWORKFLOW: Run blastn search on sequences that had no blastx hits // - RUN_BLASTN ( - RUN_BLASTX.out.blastx_out, - PREPARE_GENOME.out.genome, - ch_blastn, + RUN_BLASTN ( + RUN_BLASTX.out.blastx_out, + PREPARE_GENOME.out.genome, + ch_blastn, BUSCO_DIAMOND.out.taxon_id ) - + // // SUBWORKFLOW: Collate genome statistics by various window sizes // - COLLATE_STATS ( + COLLATE_STATS ( BUSCO_DIAMOND.out.all_tables, - COVERAGE_STATS.out.bed, - COVERAGE_STATS.out.freq, - COVERAGE_STATS.out.mononuc, - COVERAGE_STATS.out.cov + COVERAGE_STATS.out.bed, + COVERAGE_STATS.out.freq, + COVERAGE_STATS.out.mononuc, + COVERAGE_STATS.out.cov ) ch_versions = ch_versions.mix ( COLLATE_STATS.out.versions ) // // SUBWORKFLOW: Create BlobTools dataset // - BLOBTOOLS ( + 
BLOBTOOLS ( INPUT_CHECK.out.config, COLLATE_STATS.out.window_tsv, BUSCO_DIAMOND.out.all_tables, @@ -186,7 +182,7 @@ workflow BLOBTOOLKIT { ch_taxdump ) ch_versions = ch_versions.mix ( BLOBTOOLS.out.versions ) - + // // SUBWORKFLOW: Generate summary and static images // From 5670deb385b2b1e99ee2f29a5d8828713ecafc37 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 17 May 2024 15:36:58 +0100 Subject: [PATCH 10/15] The accession number is in fact mutually exclusive of the yaml file --- nextflow_schema.json | 2 +- workflows/blobtoolkit.nf | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 8fd05062..b392e2a5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -75,7 +75,7 @@ "type": "object", "fa_icon": "fas fa-dna", "description": "Reference genome related files and options required for the workflow.", - "required": ["taxon", "accession", "fasta"], + "required": ["taxon", "fasta"], "properties": { "taxon": { "type": ["string", "integer"], diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 231adfa6..07ba51bc 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -22,7 +22,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check mandatory parameters if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } -if (params.fasta && params.accession) { ch_fasta = Channel.value([ [ 'id': params.accession ], params.fasta ]) } else { exit 1, 'Genome fasta file and accession must be specified!' } +if (params.fasta) { ch_fasta = Channel.value([ [ 'id': params.accession ?: file(params.fasta).baseName ], file(params.fasta) ]) } else { exit 1, 'Genome fasta file must be specified!' } if (params.taxon) { ch_taxon = Channel.of(params.taxon) } else { exit 1, 'NCBI Taxon ID not specified!' 
} if (params.blastp) { ch_blastp = Channel.value([ [ 'id': file(params.blastp).baseName ], params.blastp ]) } else { exit 1, 'Diamond BLASTp database must be specified!' } if (params.blastx) { ch_blastx = Channel.value([ [ 'id': file(params.blastx).baseName ], params.blastx ]) } else { exit 1, 'Diamond BLASTx database must be specified!' } @@ -32,7 +32,9 @@ if (params.fetchngs_samplesheet && !params.align) { exit 1, '--align not specifi // Create channel for optional parameters if (params.busco) { ch_busco_db = Channel.fromPath(params.busco) } else { ch_busco_db = Channel.empty() } -if (params.yaml && params.accession) { ch_yaml = Channel.of([ [ 'id': params.accession ], params.yaml ]) } else { ch_yaml = Channel.empty() } +if (params.yaml) { ch_yaml = Channel.fromPath(params.yaml) } else { ch_yaml = Channel.empty() } +if (params.yaml && params.accession) { exit 1, '--yaml cannot be provided at the same time as --accession !' } +if (!params.yaml && !params.accession) { exit 1, '--yaml and --accession are both mising. Pick one !' } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 083dbb8dcbc9603f681057c445c7456156bb07bf Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 19 May 2024 10:33:18 +0100 Subject: [PATCH 11/15] This should be value channel --- workflows/blobtoolkit.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 07ba51bc..85297029 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -23,7 +23,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check mandatory parameters if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.fasta) { ch_fasta = Channel.value([ [ 'id': params.accession ?: file(params.fasta).baseName ], file(params.fasta) ]) } else { exit 1, 'Genome fasta file must be specified!' 
} -if (params.taxon) { ch_taxon = Channel.of(params.taxon) } else { exit 1, 'NCBI Taxon ID not specified!' } +if (params.taxon) { ch_taxon = Channel.value(params.taxon) } else { exit 1, 'NCBI Taxon ID not specified!' } if (params.blastp) { ch_blastp = Channel.value([ [ 'id': file(params.blastp).baseName ], params.blastp ]) } else { exit 1, 'Diamond BLASTp database must be specified!' } if (params.blastx) { ch_blastx = Channel.value([ [ 'id': file(params.blastx).baseName ], params.blastx ]) } else { exit 1, 'Diamond BLASTx database must be specified!' } if (params.blastn) { ch_blastn = Channel.value([ [ 'id': file(params.blastn).baseName ], params.blastn ]) } else { exit 1, 'BLASTn database not specified!' } From 9e36a7a948e8846361ac2c69c7beb9ccea79ec3f Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 19 May 2024 10:30:20 +0100 Subject: [PATCH 12/15] Improved the type and handling of the busco_db channel --- subworkflows/local/busco_diamond_blastp.nf | 2 +- workflows/blobtoolkit.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index 2efb2728..bd50a37a 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -70,7 +70,7 @@ workflow BUSCO_DIAMOND { ch_fasta_with_lineage, "genome", ch_fasta_with_lineage.map { it[0].lineage_name }, - busco_db.collect().ifEmpty([]), + busco_db, [], ) ch_versions = ch_versions.mix ( BUSCO.out.versions.first() ) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 85297029..1449df3d 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -31,7 +31,7 @@ if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI T if (params.fetchngs_samplesheet && !params.align) { exit 1, '--align not specified, even though the input samplesheet is a nf-core/fetchngs one - i.e has fastq files!' 
} // Create channel for optional parameters -if (params.busco) { ch_busco_db = Channel.fromPath(params.busco) } else { ch_busco_db = Channel.empty() } +if (params.busco) { ch_busco_db = Channel.fromPath(params.busco).first() } else { ch_busco_db = Channel.value([]) } if (params.yaml) { ch_yaml = Channel.fromPath(params.yaml) } else { ch_yaml = Channel.empty() } if (params.yaml && params.accession) { exit 1, '--yaml cannot be provided at the same time as --accession !' } if (!params.yaml && !params.accession) { exit 1, '--yaml and --accession are both mising. Pick one !' } From 5c996021816361aa6e93a6453c8eb75342420f02 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 20 May 2024 09:25:23 +0100 Subject: [PATCH 13/15] Nicer name, even for compressed files --- workflows/blobtoolkit.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 1449df3d..e3c0884e 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -22,7 +22,7 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check mandatory parameters if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } -if (params.fasta) { ch_fasta = Channel.value([ [ 'id': params.accession ?: file(params.fasta).baseName ], file(params.fasta) ]) } else { exit 1, 'Genome fasta file must be specified!' } +if (params.fasta) { ch_fasta = Channel.value([ [ 'id': params.accession ?: file(params.fasta.replace(".gz", "")).baseName ], file(params.fasta) ]) } else { exit 1, 'Genome fasta file must be specified!' } if (params.taxon) { ch_taxon = Channel.value(params.taxon) } else { exit 1, 'NCBI Taxon ID not specified!' } if (params.blastp) { ch_blastp = Channel.value([ [ 'id': file(params.blastp).baseName ], params.blastp ]) } else { exit 1, 'Diamond BLASTp database must be specified!' 
} if (params.blastx) { ch_blastx = Channel.value([ [ 'id': file(params.blastx).baseName ], params.blastx ]) } else { exit 1, 'Diamond BLASTx database must be specified!' } From b0e4e57140f4c8dc2ac562431487be4a574e52a2 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 20 May 2024 17:32:36 +0100 Subject: [PATCH 14/15] This module is not used --- workflows/blobtoolkit.nf | 5 ----- 1 file changed, 5 deletions(-) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index e3c0884e..5c1d946c 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -53,11 +53,6 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// -// MODULE: Loaded from modules/local/ -// -include { BLOBTOOLKIT_CONFIG } from '../modules/local/blobtoolkit/config' - // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // From 9b1ecc35e1bb9176873ccedf878398c22c54c020 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 21 May 2024 19:38:17 +0100 Subject: [PATCH 15/15] typo Co-authored-by: Guoying Qi <729395+gq1@users.noreply.github.com> --- subworkflows/local/busco_diamond_blastp.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index bd50a37a..59c65a24 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -113,7 +113,7 @@ workflow BUSCO_DIAMOND { | filter { it[1].size() > 140 } | set { ch_busco_genes } - // Hardocded to match the format expected by blobtools + // Hardcoded to match the format expected by blobtools def outext = 'txt' def cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' DIAMOND_BLASTP ( ch_busco_genes, blastp, outext, cols )