diff --git a/subworkflows/nf-core/fasta_gxf_busco_plot/main.nf b/subworkflows/nf-core/fasta_gxf_busco_plot/main.nf
new file mode 100644
index 00000000000..a04c722cfe0
--- /dev/null
+++ b/subworkflows/nf-core/fasta_gxf_busco_plot/main.nf
@@ -0,0 +1,182 @@
+include { BUSCO_BUSCO as BUSCO_ASSEMBLY         } from '../../../modules/nf-core/busco/busco/main'
+include { BUSCO_GENERATEPLOT as PLOT_ASSEMBLY   } from '../../../modules/nf-core/busco/generateplot/main'
+include { GFFREAD as EXTRACT_PROTEINS           } from '../../../modules/nf-core/gffread/main'
+include { BUSCO_BUSCO as BUSCO_ANNOTATION       } from '../../../modules/nf-core/busco/busco/main'
+include { BUSCO_GENERATEPLOT as PLOT_ANNOTATION } from '../../../modules/nf-core/busco/generateplot/main'
+
+//
+// Runs BUSCO for each fasta and lineage combination and plots the short summaries.
+// If val_mode is 'geno' or 'genome', sequences are also extracted with GFFREAD from the
+// annotation in ch_gxf whose meta.id matches the fasta, evaluated with BUSCO in
+// 'proteins' mode and plotted separately. GFFREAD options come from the caller's config.
+//
+workflow FASTA_GXF_BUSCO_PLOT {
+
+    take:
+    ch_fasta                    // channel: [ val(meta), fasta ]
+    ch_gxf                      // channel: [ val(meta2), gxf ]; gxf ~ gff | gff3 | gtf
+                                //
+                                // meta and meta2 should have same id
+
+    val_mode                    // val(mode); BUSCO mode to apply to ch_fasta
+                                // - genome, for genome assemblies (DNA)
+                                // - transcriptome, for transcriptome assemblies (DNA)
+                                // - proteins, for annotated gene sets (protein)
+                                //
+                                // If mode is genome, annotations from ch_gxf are evaluated with
+                                // mode proteins, otherwise, evaluation of the annotations is skipped
+                                //
+    val_lineages                // [ val(lineage) ]
+    val_busco_lineages_path     // val(path); Optional; Set to [] if not needed
+    val_busco_config            // val(path); Optional; Set to [] if not needed
+
+    main:
+    ch_versions = Channel.empty()
+    // Optional inputs are wrapped as [ [] ] so that combine() appends a single []
+    // placeholder to each tuple instead of adding nothing
+    ch_db_path = val_busco_lineages_path
+        ? Channel.of(file(val_busco_lineages_path, checkIfExists: true))
+        : Channel.of( [ [] ] )
+    ch_config_path = val_busco_config
+        ? Channel.of(file(val_busco_config, checkIfExists: true))
+        : Channel.of( [ [] ] )
+
+    // MODULE: BUSCO_BUSCO as BUSCO_ASSEMBLY
+    ch_busco_assembly_inputs = ch_fasta
+        | combine(
+            Channel.of(val_mode)
+        )
+        | combine(
+            Channel.fromList(val_lineages)
+        )
+        | map { meta, fasta, mode, lineage ->
+            [
+                meta + [ mode:mode, lineage:lineage ],
+                fasta, mode, lineage
+            ]
+        }
+        | combine(
+            ch_db_path
+        )
+        | combine(
+            ch_config_path
+        )
+
+    BUSCO_ASSEMBLY(
+        ch_busco_assembly_inputs.map { meta, fasta, _mode, _lineage, _db, _config -> [ meta, fasta ] },
+        ch_busco_assembly_inputs.map { _meta, _fasta, mode, _lineage, _db, _config -> mode },
+        ch_busco_assembly_inputs.map { _meta, _fasta, _mode, lineage, _db, _config -> lineage },
+        ch_busco_assembly_inputs.map { _meta, _fasta, _mode, _lineage, db, _config -> db },
+        ch_busco_assembly_inputs.map { _meta, _fasta, _mode, _lineage, _db, config -> config }
+    )
+
+    ch_assembly_batch_summary = BUSCO_ASSEMBLY.out.batch_summary
+    ch_assembly_short_summaries_txt = BUSCO_ASSEMBLY.out.short_summaries_txt
+    ch_assembly_short_summaries_json = BUSCO_ASSEMBLY.out.short_summaries_json
+    ch_assembly_full_table = BUSCO_ASSEMBLY.out.full_table
+    ch_versions = ch_versions.mix(BUSCO_ASSEMBLY.out.versions.first())
+
+    // MODULE: BUSCO_GENERATEPLOT as PLOT_ASSEMBLY
+    ch_assembly_plot_summary = ch_assembly_short_summaries_txt
+        | map { meta, txt ->
+            // String minus drops the first '_odb', e.g. 'bacteria_odb10' -> 'bacteria10'
+            def lineage_name = meta.lineage - '_odb'
+            [
+                "short_summary.specific.${meta.lineage}.${meta.id}_${lineage_name}.txt",
+                txt.text
+            ]
+        }
+        | collectFile
+
+    PLOT_ASSEMBLY( ch_assembly_plot_summary.collect() )
+
+    ch_assembly_png = PLOT_ASSEMBLY.out.png
+    ch_versions = ch_versions.mix(PLOT_ASSEMBLY.out.versions)
+
+    // MODULE: GFFREAD as EXTRACT_PROTEINS
+    ch_gffread_inputs = val_mode !in [ 'geno', 'genome' ]
+        ? Channel.empty()
+        : ch_fasta
+        | map { meta, fasta -> [ meta.id, meta, fasta ] }
+        | join(
+            ch_gxf.map { meta2, gxf -> [ meta2.id, gxf ] }
+            // Join with the matching annotation (same meta.id)
+            // to allow one annotation per fasta
+        )
+        | map { _id, meta, fasta, gxf -> [ meta, gxf, fasta ] }
+    EXTRACT_PROTEINS(
+        ch_gffread_inputs.map { meta, gxf, _fasta -> [ meta, gxf ] },
+        ch_gffread_inputs.map { _meta, _gxf, fasta -> fasta }
+    )
+
+    ch_proteins = EXTRACT_PROTEINS.out.gffread_fasta
+    ch_versions = ch_versions.mix(EXTRACT_PROTEINS.out.versions.first())
+
+    // MODULE: BUSCO_BUSCO as BUSCO_ANNOTATION
+    ch_busco_annotation_inputs = ch_proteins
+        | combine(
+            Channel.of('proteins')
+        )
+        | combine(
+            Channel.fromList(val_lineages)
+        )
+        | map { meta, fasta, mode, lineage ->
+            [
+                meta + [ mode:mode, lineage:lineage ],
+                fasta, mode, lineage
+            ]
+        }
+        | combine(
+            ch_db_path
+        )
+        | combine(
+            ch_config_path
+        )
+
+    BUSCO_ANNOTATION(
+        ch_busco_annotation_inputs.map { meta, fasta, _mode, _lineage, _db, _config -> [ meta, fasta ] },
+        ch_busco_annotation_inputs.map { _meta, _fasta, mode, _lineage, _db, _config -> mode },
+        ch_busco_annotation_inputs.map { _meta, _fasta, _mode, lineage, _db, _config -> lineage },
+        ch_busco_annotation_inputs.map { _meta, _fasta, _mode, _lineage, db, _config -> db },
+        ch_busco_annotation_inputs.map { _meta, _fasta, _mode, _lineage, _db, config -> config }
+    )
+
+    ch_annotation_batch_summary = BUSCO_ANNOTATION.out.batch_summary
+    ch_annotation_short_summaries_txt = BUSCO_ANNOTATION.out.short_summaries_txt
+    ch_annotation_short_summaries_json = BUSCO_ANNOTATION.out.short_summaries_json
+    ch_annotation_full_table = BUSCO_ANNOTATION.out.full_table
+    ch_versions = ch_versions.mix(BUSCO_ANNOTATION.out.versions.first())
+
+    // MODULE: BUSCO_GENERATEPLOT as PLOT_ANNOTATION
+    ch_annotation_plot_summary = ch_annotation_short_summaries_txt
+        | map { meta, txt ->
+            // String minus drops the first '_odb', e.g. 'bacteria_odb10' -> 'bacteria10'
+            def lineage_name = meta.lineage - '_odb'
+            [
+                "short_summary.specific.${meta.lineage}.${meta.id}_${lineage_name}.proteins.txt",
+                txt.text
+            ]
+        }
+        | collectFile
+
+    PLOT_ANNOTATION( ch_annotation_plot_summary.collect() )
+
+    ch_annotation_png = PLOT_ANNOTATION.out.png
+    ch_versions = ch_versions.mix(PLOT_ANNOTATION.out.versions)
+
+
+    emit:
+    assembly_batch_summary = ch_assembly_batch_summary // channel: [ meta3, txt ]; meta3 ~ meta + [ val(mode), val(lineage) ]
+    assembly_short_summaries_txt = ch_assembly_short_summaries_txt // channel: [ meta3, txt ]
+    assembly_short_summaries_json = ch_assembly_short_summaries_json // channel: [ meta3, json ]
+    assembly_full_table = ch_assembly_full_table // channel: [ meta3, tsv ]
+    assembly_plot_summary_txt = ch_assembly_plot_summary // channel: [ txt ]
+    assembly_png = ch_assembly_png // channel: [ png ]
+    annotation_batch_summary = ch_annotation_batch_summary // channel: [ meta3, txt ]
+    annotation_short_summaries_txt = ch_annotation_short_summaries_txt // channel: [ meta3, txt ]
+    annotation_short_summaries_json = ch_annotation_short_summaries_json // channel: [ meta3, json ]
+    annotation_full_table = ch_annotation_full_table // channel: [ meta3, tsv ]
+    annotation_plot_summary_txt = ch_annotation_plot_summary // channel: [ txt ]
+    annotation_png = ch_annotation_png // channel: [ png ]
+    versions = ch_versions // channel: [ versions.yml ]
+}
diff --git a/subworkflows/nf-core/fasta_gxf_busco_plot/meta.yml b/subworkflows/nf-core/fasta_gxf_busco_plot/meta.yml
new file mode 100644
index 00000000000..bf0e437f326
--- /dev/null
+++ b/subworkflows/nf-core/fasta_gxf_busco_plot/meta.yml
@@ -0,0 +1,133 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: "fasta_gxf_busco_plot"
+description: |
+  Runs BUSCO for input assemblies and their annotations in GFF/GFF3/GTF format, and creates summary plots using `BUSCO/generate_plot.py` script
+keywords:
+  - genome
+  - annotation
+  - busco
+  - plot
+components:
+  - busco/busco
+  - busco/generateplot
+  - gffread
+input:
+  - ch_fasta:
+      type: file
+      description: |
+        Channel containing FASTA files
+        Structure: [ val(meta), fasta ]
+      pattern: "*.{fa,faa,fsa,fas,fasta}(.gz)?"
+  - ch_gxf:
+      type: file
+      description: |
+        Channel containing GFF/GFF3/GTF files
+        Structure: [ val(meta2), gxf ]
+      pattern: "*.{gff,gff3,gtf}"
+  - val_mode:
+      type: string
+      description: |
+        String containing BUSCO mode to apply to ch_fasta files
+        Structure: val(mode)
+  - val_lineages:
+      type: array
+      description: |
+        Array of strings representing BUSCO lineage datasets
+        Structure: [ val(lineage) ]
+  - val_busco_lineages_path:
+      type: path
+      description: |
+        Path where BUSCO lineages are located or downloaded if not already there. If this input is `[]`,
+        BUSCO will download the datasets in the task work directory
+        Structure: val(busco_lineages_path)
+  - val_busco_config:
+      type: path
+      description: |
+        Path to BUSCO config. It is optional and can be set to `[]`
+        Structure: val(busco_config)
+output:
+  - assembly_batch_summary:
+      type: file
+      description: |
+        Channel containing BUSCO batch summaries corresponding to fasta files
+        Structure: [ val(meta), txt ]
+      pattern: "*.txt"
+  - assembly_short_summaries_txt:
+      type: file
+      description: |
+        Channel containing BUSCO short summaries corresponding to fasta files
+        Structure: [ val(meta), txt ]
+      pattern: "*.txt"
+  - assembly_short_summaries_json:
+      type: file
+      description: |
+        Channel containing BUSCO short summaries corresponding to fasta files
+        Structure: [ val(meta), json ]
+      pattern: "*.json"
+  - assembly_full_table:
+      type: file
+      description: |
+        Channel containing complete results in a tabular format with scores and lengths of BUSCO matches,
+        and coordinates (for genome mode) or gene/protein IDs (for transcriptome or protein modes)
+        Structure: [ val(meta), tsv ]
+      pattern: "*.tsv"
+  - assembly_plot_summary_txt:
+      type: file
+      description: |
+        Channel containing BUSCO short summaries corresponding to fasta files renamed to include lineage in sample id
+        Structure: [ txt ]
+      pattern: "*.txt"
+  - assembly_png:
+      type: file
+      description: |
+        Channel containing summary plot for assemblies
+        Structure: png
+      pattern: "*.png"
+  - annotation_batch_summary:
+      type: file
+      description: |
+        Channel containing BUSCO batch summaries corresponding to annotation files
+        Structure: [ val(meta), txt ]
+      pattern: "*.txt"
+  - annotation_short_summaries_txt:
+      type: file
+      description: |
+        Channel containing BUSCO short summaries corresponding to annotation files
+        Structure: [ val(meta), txt ]
+      pattern: "*.txt"
+  - annotation_short_summaries_json:
+      type: file
+      description: |
+        Channel containing BUSCO short summaries corresponding to annotation files
+        Structure: [ val(meta), json ]
+      pattern: "*.json"
+  - annotation_full_table:
+      type: file
+      description: |
+        Channel containing complete results in a tabular format with scores and lengths of BUSCO matches,
+        protein IDs in protein mode
+        Structure: [ val(meta), tsv ]
+      pattern: "*.tsv"
+  - annotation_plot_summary_txt:
+      type: file
+      description: |
+        Channel containing BUSCO short summaries corresponding to annotation files renamed to include lineage in sample id
+        Structure: [ txt ]
+      pattern: "*.txt"
+  - annotation_png:
+      type: file
+      description: |
+        Channel containing summary plot for annotations
+        Structure: png
+      pattern: "*.png"
+  - versions:
+      type: file
+      description: |
+        File containing software versions
+        Structure: [ path(versions.yml) ]
+      pattern: "versions.yml"
+authors:
+  - "@GallVp"
+maintainers:
+  - "@GallVp"
+  - "@FernandoDuarteF"
diff --git a/subworkflows/nf-core/fasta_gxf_busco_plot/tests/main.nf.test b/subworkflows/nf-core/fasta_gxf_busco_plot/tests/main.nf.test
new file mode 100644
index 00000000000..0154f9c1476
--- /dev/null
+++ b/subworkflows/nf-core/fasta_gxf_busco_plot/tests/main.nf.test
@@ -0,0 +1,114 @@
+nextflow_workflow {
+
+    name "Test Subworkflow FASTA_GXF_BUSCO_PLOT"
+    script "../main.nf"
+    workflow "FASTA_GXF_BUSCO_PLOT"
+    config './nextflow.config'
+
+    tag "subworkflows"
+    tag "subworkflows_nfcore"
+    tag "subworkflows/fasta_gxf_busco_plot"
+    tag "busco"
+    tag "busco/busco"
+    tag "busco/generateplot"
+    tag "gffread"
+
+    test("candidatus_portiera_aleyrodidarum - bacteroides_fragilis - genome") {
+
+        when {
+
+            params {
+                extract_proteins_args = '-y'
+            }
+
+            workflow {
+                """
+                input[0] = Channel.of(
+                    [
+                        [ id:'test' ], // meta map
+                        file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/genome.fasta', checkIfExists: true)
+                    ],
+                    [
+                        [ id:'test2' ], // meta map
+                        file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true)
+                    ]
+                )
+                input[1] = Channel.of(
+                    [
+                        [ id:'test' ], // meta map
+                        file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/gff/test1.gff', checkIfExists: true)
+                    ]
+                )
+                input[2] = 'genome'
+                input[3] = [ 'bacteria_odb10', 'archaea_odb10' ]
+                input[4] = []
+                input[5] = []
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success},
+                { assert snapshot(
+                    workflow.out.assembly_batch_summary,
+                    workflow.out.annotation_batch_summary,
+                    workflow.out.versions,
+                    ).match()
+                },
+
+                { assert workflow.out.assembly_png != null },
+                { assert workflow.out.annotation_png != null },
+
+                { assert workflow.out.assembly_short_summaries_json != null },
+                { assert workflow.out.assembly_short_summaries_txt != null },
+                { assert workflow.out.annotation_short_summaries_json != null },
+                { assert workflow.out.annotation_short_summaries_txt != null }
+            )
+        }
+    }
+
+    test("candidatus_portiera_aleyrodidarum - bacteroides_fragilis - genome - stub") {
+
+        options '-stub'
+
+        when {
+
+            params {
+                extract_proteins_args = '-y'
+            }
+
+            workflow {
+                """
+                input[0] = Channel.of(
+                    [
+                        [ id:'test' ], // meta map
+                        file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/genome.fasta', checkIfExists: true)
+                    ],
+                    [
+                        [ id:'test2' ], // meta map
+                        file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true)
+                    ]
+                )
+                input[1] = Channel.of(
+                    [
+                        [ id:'test' ], // meta map
+                        file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/gff/test1.gff', checkIfExists: true)
+                    ]
+                )
+                input[2] = 'genome'
+                input[3] = [ 'bacteria_odb10', 'archaea_odb10' ]
+                input[4] = []
+                input[5] = []
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success},
+                { assert snapshot(workflow.out).match()}
+            )
+        }
+    }
+}
\ No newline at end of file
diff --git a/subworkflows/nf-core/fasta_gxf_busco_plot/tests/main.nf.test.snap b/subworkflows/nf-core/fasta_gxf_busco_plot/tests/main.nf.test.snap
new file mode 100644
index 00000000000..89604dc25c0
--- /dev/null
+++ b/subworkflows/nf-core/fasta_gxf_busco_plot/tests/main.nf.test.snap
@@ -0,0 +1,255 @@
+{
+    "candidatus_portiera_aleyrodidarum - bacteroides_fragilis - genome - stub": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "mode": "genome",
+                            "lineage": "archaea_odb10"
+                        },
+                        "test-archaea_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ],
+                    [
+                        {
+                            "id": "test",
+                            "mode": "genome",
+                            "lineage": "bacteria_odb10"
+                        },
+                        "test-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ],
+                    [
+                        {
+                            "id": "test2",
+                            "mode": "genome",
+                            "lineage": "archaea_odb10"
+                        },
+                        "test2-archaea_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ],
+                    [
+                        {
+                            "id": "test2",
+                            "mode": "genome",
+                            "lineage": "bacteria_odb10"
+                        },
+                        "test2-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "1": [
+
+                ],
+                "10": [
+
+                ],
+                "11": [
+
+                ],
+                "12": [
+                    "versions.yml:md5,36b11c442943567e471af0abd474a10b",
+                    "versions.yml:md5,9435355f913e283f60b4fb7ef77dd52a",
+                    "versions.yml:md5,e9d65e2f2f13175e99c5b7f4ae1013b9"
+                ],
+                "2": [
+
+                ],
+                "3": [
+
+                ],
+                "4": [
+
+                ],
+                "5": [
+
+                ],
+                "6": [
+                    [
+                        {
+                            "id": "test",
+                            "mode": "proteins",
+                            "lineage": "archaea_odb10"
+                        },
+                        "test-archaea_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ],
+                    [
+                        {
+                            "id": "test",
+                            "mode": "proteins",
+                            "lineage": "bacteria_odb10"
+                        },
+                        "test-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "7": [
+
+                ],
+                "8": [
+
+                ],
+                "9": [
+
+                ],
+                "annotation_batch_summary": [
+                    [
+                        {
+                            "id": "test",
+                            "mode": "proteins",
+                            "lineage": "archaea_odb10"
+                        },
+                        "test-archaea_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ],
+                    [
+                        {
+                            "id": "test",
+                            "mode": "proteins",
+                            "lineage": "bacteria_odb10"
+                        },
+                        "test-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "annotation_full_table": [
+
+                ],
+                "annotation_plot_summary_txt": [
+
+                ],
+                "annotation_png": [
+
+                ],
+                "annotation_short_summaries_json": [
+
+                ],
+                "annotation_short_summaries_txt": [
+
+                ],
+                "assembly_batch_summary": [
+                    [
+                        {
+                            "id": "test",
+                            "mode": "genome",
+                            "lineage": "archaea_odb10"
+                        },
+                        "test-archaea_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ],
+                    [
+                        {
+                            "id": "test",
+                            "mode": "genome",
+                            "lineage": "bacteria_odb10"
+                        },
+                        "test-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ],
+                    [
+                        {
+                            "id": "test2",
+                            "mode": "genome",
+                            "lineage": "archaea_odb10"
+                        },
+                        "test2-archaea_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ],
+                    [
+                        {
+                            "id": "test2",
+                            "mode": "genome",
+                            "lineage": "bacteria_odb10"
+                        },
+                        "test2-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "assembly_full_table": [
+
+                ],
+                "assembly_plot_summary_txt": [
+
+                ],
+                "assembly_png": [
+
+                ],
+                "assembly_short_summaries_json": [
+
+                ],
+                "assembly_short_summaries_txt": [
+
+                ],
+                "versions": [
+                    "versions.yml:md5,36b11c442943567e471af0abd474a10b",
+                    "versions.yml:md5,9435355f913e283f60b4fb7ef77dd52a",
+                    "versions.yml:md5,e9d65e2f2f13175e99c5b7f4ae1013b9"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.0",
+            "nextflow": "24.10.1"
+        },
+        "timestamp": "2024-11-25T10:31:03.980517"
+    },
+    "candidatus_portiera_aleyrodidarum - bacteroides_fragilis - genome": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test",
+                        "mode": "genome",
+                        "lineage": "archaea_odb10"
+                    },
+                    "test-archaea_odb10-busco.batch_summary.txt:md5,1397d74518a776ad75b16a843bc5b6c1"
+                ],
+                [
+                    {
+                        "id": "test",
+                        "mode": "genome",
+                        "lineage": "bacteria_odb10"
+                    },
+                    "test-bacteria_odb10-busco.batch_summary.txt:md5,a1186bc25448ac1949bf7790810f7161"
+                ],
+                [
+                    {
+                        "id": "test2",
+                        "mode": "genome",
+                        "lineage": "archaea_odb10"
+                    },
+                    "test2-archaea_odb10-busco.batch_summary.txt:md5,946582b353a8dba7d6452a71856eca06"
+                ],
+                [
+                    {
+                        "id": "test2",
+                        "mode": "genome",
+                        "lineage": "bacteria_odb10"
+                    },
+                    "test2-bacteria_odb10-busco.batch_summary.txt:md5,21b3fb771cf36be917cc451540d999be"
+                ]
+            ],
+            [
+                [
+                    {
+                        "id": "test",
+                        "mode": "proteins",
+                        "lineage": "archaea_odb10"
+                    },
+                    "test-archaea_odb10-busco.batch_summary.txt:md5,95172bd5b1a30e632fc79084ea0ca585"
+                ],
+                [
+                    {
+                        "id": "test",
+                        "mode": "proteins",
+                        "lineage": "bacteria_odb10"
+                    },
+                    "test-bacteria_odb10-busco.batch_summary.txt:md5,995127c0caecb36205dbf21aa2f9f8a8"
+                ]
+            ],
+            [
+                "versions.yml:md5,05d8022e3afb0d5642ed17147b991730",
+                "versions.yml:md5,36b11c442943567e471af0abd474a10b",
+                "versions.yml:md5,53987b35fc275297efdaf525937fdca3",
+                "versions.yml:md5,9435355f913e283f60b4fb7ef77dd52a",
+                "versions.yml:md5,e9d65e2f2f13175e99c5b7f4ae1013b9"
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-05-13T16:39:04.376704"
+    }
+}
\ No newline at end of file
diff --git a/subworkflows/nf-core/fasta_gxf_busco_plot/tests/nextflow.config b/subworkflows/nf-core/fasta_gxf_busco_plot/tests/nextflow.config
new file mode 100644
index 00000000000..12b9956c2b2
--- /dev/null
+++ b/subworkflows/nf-core/fasta_gxf_busco_plot/tests/nextflow.config
@@ -0,0 +1,5 @@
+process {
+    withName: EXTRACT_PROTEINS {
+        ext.args = params.extract_proteins_args
+    }
+}