diff --git a/CHANGELOG.md b/CHANGELOG.md index 2dfc228..c1bea00 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,12 +3,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v1.0dev - [date] +## v1.0dev - [2024-11-27] -Initial release of ecoflow/genomeqc, created with the [nf-core](https://nf-co.re/) template. +Initial release of nf-core/genomeqc, created with the [nf-core](https://nf-co.re/) template. ### `Added` +1. Now using `FASTA_GXF_BUSCO_PLOT` subworkflow from nf-core/modules [#77](https://github.com/nf-core/genomeqc/issues/77) + ### `Fixed` ### `Dependencies` diff --git a/README.md b/README.md index 4af6489..a237202 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,7 @@ ecoflow/genomeqc was originally written by Chris Wyatt, Fernando Duarte. We thank the following people for their extensive assistance in the development of this pipeline: - [Stephen Turner](https://github.com/stephenturner/) ([Colossal Biosciences](https://colossal.com/)) +- [Usman Rashid](https://github.com/gallvp) ([The New Zealand Institute for Plant and Food Research](https://www.plantandfood.com/en-nz/)) diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 6571ba6..e32dd0d 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -12,3 +12,7 @@ report_section_order: export_plots: true disable_version_detection: true + +extra_fn_clean_exts: + - type: regex + pattern: "^short_summary\\.specific\\..*_odb10\\." 
diff --git a/conf/modules.config b/conf/modules.config index 606d411..e910873 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -31,7 +31,7 @@ process { ] } - withName: 'GFFREAD' { + withName: '.*:FASTA_GXF_BUSCO_PLOT:EXTRACT_PROTEINS' { ext.args = '-y -S' publishDir = [ path: { "${params.outdir}/gffread" }, @@ -57,7 +57,7 @@ process { ] } - withName: 'BUSCO_BUSCO' { + withName: '.*:GENOME:BUSCO_BUSCO' { publishDir = [ path: { "${params.outdir}/busco" }, mode: params.publish_dir_mode, @@ -133,4 +133,42 @@ process { ] } + withName: '.*:FASTA_GXF_BUSCO_PLOT:BUSCO_ASSEMBLY' { + ext.args = '--metaeuk' + publishDir = [ + path: { "${params.outdir}/busco/fasta" }, + mode: params.publish_dir_mode, + pattern: 'short_summary.*.txt', + saveAs: { filename -> "short_summary.specific.${meta.id}.${lineage}.txt" } + ] + } + + withName: '.*:FASTA_GXF_BUSCO_PLOT:PLOT_ASSEMBLY' { + ext.prefix = 'busco_figure' + publishDir = [ + path: { "${params.outdir}/busco/fasta" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + + withName: '.*:FASTA_GXF_BUSCO_PLOT:BUSCO_ANNOTATION' { + ext.args = '--metaeuk' + publishDir = [ + path: { "${params.outdir}/busco/gff" }, + mode: params.publish_dir_mode, + pattern: 'short_summary.*.txt', + saveAs: { filename -> "short_summary.specific.${meta.id}.${lineage}.txt" } + ] + } + + withName: '.*:FASTA_GXF_BUSCO_PLOT:PLOT_ANNOTATION' { + ext.prefix = 'busco_figure' + publishDir = [ + path: { "${params.outdir}/busco/gff" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? 
null : filename } + ] + } + } diff --git a/main.nf b/main.nf old mode 100644 new mode 100755 diff --git a/modules.json b/modules.json index 834e202..a1142ec 100644 --- a/modules.json +++ b/modules.json @@ -18,7 +18,12 @@ "busco/busco": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": ["fasta_gxf_busco_plot", "modules"] + }, + "busco/generateplot": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["fasta_gxf_busco_plot"] }, "fastavalidator": { "branch": "master", @@ -33,7 +38,7 @@ "gffread": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": ["fasta_gxf_busco_plot", "modules"] }, "merqury/merqury": { "branch": "master", @@ -109,6 +114,11 @@ "git_sha": "ab80a04707104a4baf39341581dfbced5da05479", "installed_by": ["subworkflows"] }, + "fasta_gxf_busco_plot": { + "branch": "master", + "git_sha": "3628d826c68eea367143e403412133054f607650", + "installed_by": ["subworkflows"] + }, "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", diff --git a/modules/nf-core/busco/generateplot/environment.yml b/modules/nf-core/busco/generateplot/environment.yml new file mode 100644 index 0000000..766c0f4 --- /dev/null +++ b/modules/nf-core/busco/generateplot/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::busco=5.7.1 diff --git a/modules/nf-core/busco/generateplot/main.nf b/modules/nf-core/busco/generateplot/main.nf new file mode 100644 index 0000000..6a4b339 --- /dev/null +++ b/modules/nf-core/busco/generateplot/main.nf @@ -0,0 +1,45 @@ +process BUSCO_GENERATEPLOT { + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ 
workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/busco:5.7.1--pyhdfd78af_0': + 'biocontainers/busco:5.7.1--pyhdfd78af_0' }" + + input: + path short_summary_txt, stageAs: 'busco/*' + + output: + path '*.png' , emit: png + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: 'busco_figure' + """ + generate_plot.py \\ + $args \\ + -wd busco + + mv ./busco/busco_figure.png ${prefix}.png + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + busco: \$( busco --version 2>&1 | sed 's/^BUSCO //' ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: 'busco_figure' + """ + touch ${prefix}.png + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + busco: \$( busco --version 2>&1 | sed 's/^BUSCO //' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/busco/generateplot/meta.yml b/modules/nf-core/busco/generateplot/meta.yml new file mode 100644 index 0000000..72ad2c9 --- /dev/null +++ b/modules/nf-core/busco/generateplot/meta.yml @@ -0,0 +1,40 @@ +name: "busco_generateplot" +description: BUSCO plot generation tool +keywords: + - genome + - fasta + - annotation + - busco + - transcriptome + - quality control +tools: + - busco: + description: BUSCO provides measures for quantitative assessment of genome assembly, + gene set, and transcriptome completeness based on evolutionarily informed expectations + of gene content from near-universal single-copy orthologs selected from OrthoDB. 
+ homepage: https://busco.ezlab.org/ + documentation: https://busco.ezlab.org/busco_userguide.html + tool_dev_url: https://gitlab.com/ezlab/busco + doi: "10.1007/978-1-4939-9173-0_14" + licence: ["MIT"] + identifier: biotools:busco +input: + - - short_summary_txt: + type: file + description: One or more short summary txt files from BUSCO + pattern: "short_summary.*.txt" +output: + - png: + - "*.png": + type: file + description: A summary plot in png format + pattern: "*.png" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/nf-core/busco/generateplot/tests/main.nf.test b/modules/nf-core/busco/generateplot/tests/main.nf.test new file mode 100644 index 0000000..0bf1ff1 --- /dev/null +++ b/modules/nf-core/busco/generateplot/tests/main.nf.test @@ -0,0 +1,72 @@ +nextflow_process { + + name "Test Process BUSCO_GENERATEPLOT" + script "../main.nf" + process "BUSCO_GENERATEPLOT" + + tag "modules" + tag "modules_nfcore" + tag "busco" + tag "busco/busco" + tag "busco/generateplot" + + test("bacteroides_fragilis-genome_fna_gz") { + + setup { + run("BUSCO_BUSCO") { + script "../../busco" + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true) + ] + input[1] = 'genome' + input[2] = 'bacteria_odb10' + input[3] = [] + input[4] = [] + """ + } + } + } + + when { + process { + """ + input[0] = BUSCO_BUSCO.out.short_summaries_txt.map { meta, summary -> summary } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match("versions") }, + { assert process.out.png != null } // PNGs with same data but different meta-data. Not sure how to get around this, yet! 
+ ) + } + + } + + test("stub") { + + options "-stub" + + when { + process { + """ + input[0] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/busco/generateplot/tests/main.nf.test.snap b/modules/nf-core/busco/generateplot/tests/main.nf.test.snap new file mode 100644 index 0000000..d9773ec --- /dev/null +++ b/modules/nf-core/busco/generateplot/tests/main.nf.test.snap @@ -0,0 +1,37 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,726fa3440ea3a0b2e9d032d7e4d25e74" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-03T15:40:01.523993" + }, + "stub": { + "content": [ + { + "0": [ + "busco_figure.png:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "1": [ + "versions.yml:md5,726fa3440ea3a0b2e9d032d7e4d25e74" + ], + "png": [ + "busco_figure.png:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "versions": [ + "versions.yml:md5,726fa3440ea3a0b2e9d032d7e4d25e74" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-03T15:40:11.864276" + } +} \ No newline at end of file diff --git a/modules/nf-core/busco/generateplot/tests/tags.yml b/modules/nf-core/busco/generateplot/tests/tags.yml new file mode 100644 index 0000000..b6548a6 --- /dev/null +++ b/modules/nf-core/busco/generateplot/tests/tags.yml @@ -0,0 +1,2 @@ +busco/generateplot: + - "modules/nf-core/busco/generateplot/**" diff --git a/subworkflows/local/genome_and_annotation.nf b/subworkflows/local/genome_and_annotation.nf index 6841231..feeb10e 100644 --- a/subworkflows/local/genome_and_annotation.nf +++ b/subworkflows/local/genome_and_annotation.nf @@ -1,14 +1,14 @@ include { AGAT_CONVERTSPGXF2GXF } from '../../modules/nf-core/agat/convertspgxf2gxf' include { 
LONGEST } from '../../modules/local/longest' -include { BUSCO_BUSCO } from '../../modules/nf-core/busco/busco/main' include { QUAST } from '../../modules/nf-core/quast/main' include { AGAT_SPSTATISTICS } from '../../modules/nf-core/agat/spstatistics/main' include { PLOT_BUSCO_IDEOGRAM } from '../../modules/local/plot_busco_ideogram' -include { GFFREAD } from '../../modules/nf-core/gffread/main' include { ORTHOFINDER } from '../../modules/nf-core/orthofinder/main' include { FASTAVALIDATOR } from '../../modules/nf-core/fastavalidator/main' +include { FASTA_GXF_BUSCO_PLOT } from '../../subworkflows/nf-core/fasta_gxf_busco_plot/main' + workflow GENOME_AND_ANNOTATION { take: @@ -82,14 +82,18 @@ workflow GENOME_AND_ANNOTATION { ch_tree_data = ch_tree_data.mix(QUAST.out.tsv.map { tuple -> tuple[1] }) // - // MODULE: Run GFFREAD + // SUBWORKFLOW: FASTA_GXF_BUSCO_PLOT // - - GFFREAD ( + FASTA_GXF_BUSCO_PLOT ( + ch_input.fasta, ch_input.gff_filt, - ch_input.fasta.map { meta, fasta -> fasta} + 'genome', // mode + [ params.busco_lineage ], // List expected + params.busco_lineages_path, + params.busco_config ) - ch_versions = ch_versions.mix(GFFREAD.out.versions.first()) + + ch_versions = ch_versions.mix(FASTA_GXF_BUSCO_PLOT.out.versions) // // MODULE: Run fasta validator @@ -97,7 +101,7 @@ workflow GENOME_AND_ANNOTATION { // Shoud we keep this? 
FASTAVALIDATOR( - GFFREAD.out.gffread_fasta + FASTA_GXF_BUSCO_PLOT.out.proteins ) // @@ -105,7 +109,7 @@ workflow GENOME_AND_ANNOTATION { // // Prepare orthofinder input channel - ortho_ch = GFFREAD.out.gffread_fasta + ortho_ch = FASTA_GXF_BUSCO_PLOT.out.proteins | map { meta, fasta -> fasta // We only need the fastas } @@ -121,27 +125,12 @@ workflow GENOME_AND_ANNOTATION { ) ch_versions = ch_versions.mix(ORTHOFINDER.out.versions) - // - // MODULE: Run BUSCO - // - - //GFFREAD.out.gffread_fasta.collect().view() - - BUSCO_BUSCO ( - GFFREAD.out.gffread_fasta, - "proteins", // hardcoded - params.busco_lineage, - params.busco_lineages_path ?: [], - params.busco_config ?: [] - ) - ch_versions = ch_versions.mix(BUSCO_BUSCO.out.versions.first()) - // // Plot BUSCO ideogram // // Prepare BUSCO output - BUSCO_BUSCO.out.full_table.map { meta, full_tables -> full_tables }.view() + FASTA_GXF_BUSCO_PLOT.out.annotation_full_table.map { meta, full_tables -> full_tables }.view() //BUSCO_BUSCO.out.full_table.map { meta, full_tables -> full_tables.collect { it -> it.toString().split('/')[-2] } }.view() @@ -151,7 +140,7 @@ workflow GENOME_AND_ANNOTATION { // [meta.id, lineages, full_tables] // } - ch_busco_full_table = BUSCO_BUSCO.out.full_table + ch_busco_full_table = FASTA_GXF_BUSCO_PLOT.out.annotation_full_table .map { meta, full_tables -> def lineages = full_tables.toString().split('/')[-2].replaceAll('run_', '').replaceAll('_odb\\d+', '') [meta.id, lineages, full_tables] @@ -185,14 +174,14 @@ workflow GENOME_AND_ANNOTATION { PLOT_BUSCO_IDEOGRAM ( ch_plot_input )//removed this temporarily:, ch_karyotype - ch_tree_data = ch_tree_data.mix(BUSCO_BUSCO.out.batch_summary.collect { meta, file -> file }) + ch_tree_data = ch_tree_data.mix(FASTA_GXF_BUSCO_PLOT.out.annotation_batch_summary.collect { meta, file -> file }) emit: orthofinder = ORTHOFINDER.out.orthofinder // channel: [ val(meta), [folder] ] //busco = BUSCO_BUSCO.out.batch_summary.collect { meta, file -> file } tree_data = 
ch_tree_data.flatten().collect() - busco_mq = BUSCO_BUSCO.out.short_summaries_txt.map { meta, file -> file } + busco_mq = FASTA_GXF_BUSCO_PLOT.out.annotation_short_summaries_txt.map { meta, file -> file } quast_mq = QUAST.out.results.map { meta, file -> file } versions = ch_versions // channel: [ versions.yml ] diff --git a/subworkflows/nf-core/fasta_gxf_busco_plot/main.nf b/subworkflows/nf-core/fasta_gxf_busco_plot/main.nf new file mode 100644 index 0000000..f0af720 --- /dev/null +++ b/subworkflows/nf-core/fasta_gxf_busco_plot/main.nf @@ -0,0 +1,174 @@ +include { BUSCO_BUSCO as BUSCO_ASSEMBLY } from '../../../modules/nf-core/busco/busco/main' +include { BUSCO_GENERATEPLOT as PLOT_ASSEMBLY } from '../../../modules/nf-core/busco/generateplot/main' +include { GFFREAD as EXTRACT_PROTEINS } from '../../../modules/nf-core/gffread/main' +include { BUSCO_BUSCO as BUSCO_ANNOTATION } from '../../../modules/nf-core/busco/busco/main' +include { BUSCO_GENERATEPLOT as PLOT_ANNOTATION } from '../../../modules/nf-core/busco/generateplot/main' + +workflow FASTA_GXF_BUSCO_PLOT { + + take: + ch_fasta // channel: [ val(meta), fasta ] + ch_gxf // channel: [ val(meta2), gxf ]; gxf ~ gff | gff3 | gtf + // + // meta and meta2 should have same id + + val_mode // val(mode); BUSCO mode to apply to ch_fasta + // - genome, for genome assemblies (DNA) + // - transcriptome, for transcriptome assemblies (DNA) + // - proteins, for annotated gene sets (protein) + // + // If mode is genome, annotations from ch_gxf are evaluated with + // mode proteins, otherwise, evaluation of the annotations is skipped + // + val_lineages // [ val(lineage) ] + val_busco_lineages_path // val(path); Optional; Set to [] if not needed + val_busco_config // val(path); Optional; Set to [] if not needed + + main: + ch_versions = Channel.empty() + ch_db_path = val_busco_lineages_path + ? Channel.of(file(val_busco_lineages_path, checkIfExists: true)) + : Channel.of( [ [] ] ) + ch_config_path = val_busco_config + ? 
Channel.of(file(val_busco_config, checkIfExists: true)) + : Channel.of( [ [] ] ) + + // MODULE: BUSCO_BUSCO as BUSCO_ASSEMBLY + ch_busco_assembly_inputs = ch_fasta + | combine( + Channel.of(val_mode) + ) + | combine( + Channel.fromList(val_lineages) + ) + | map { meta, fasta, mode, lineage -> + [ + meta + [ mode:mode, lineage:lineage ], + fasta, mode, lineage + ] + } + | combine( + ch_db_path + ) + | combine( + ch_config_path + ) + + BUSCO_ASSEMBLY( + ch_busco_assembly_inputs.map { meta, fasta, _mode, _lineage, _db, _config -> [ meta, fasta ] }, + ch_busco_assembly_inputs.map { _meta, _fasta, mode, _lineage, _db, _config -> mode }, + ch_busco_assembly_inputs.map { _meta, _fasta, _mode, lineage, _db, _config -> lineage }, + ch_busco_assembly_inputs.map { _meta, _fasta, _mode, _lineage, db, _config -> db }, + ch_busco_assembly_inputs.map { _meta, _fasta, _mode, _lineage, _db, config -> config } + ) + + ch_assembly_batch_summary = BUSCO_ASSEMBLY.out.batch_summary + ch_assembly_short_summaries_txt = BUSCO_ASSEMBLY.out.short_summaries_txt + ch_assembly_short_summaries_json = BUSCO_ASSEMBLY.out.short_summaries_json + ch_assembly_full_table = BUSCO_ASSEMBLY.out.full_table + ch_versions = ch_versions.mix(BUSCO_ASSEMBLY.out.versions.first()) + + // MODULE: BUSCO_GENERATEPLOT as PLOT_ASSEMBLY + ch_assembly_plot_summary = ch_assembly_short_summaries_txt + | map { meta, txt -> + def lineage_name = meta.lineage - '_odb' + [ + "short_summary.specific.${meta.lineage}.${meta.id}_${lineage_name}.txt", + txt.text + ] + } + | collectFile + + PLOT_ASSEMBLY( ch_assembly_plot_summary.collect() ) + + ch_assembly_png = PLOT_ASSEMBLY.out.png + ch_versions = ch_versions.mix(PLOT_ASSEMBLY.out.versions) + + // MODULE: GFFREAD as EXTRACT_PROTEINS + ch_gffread_inputs = val_mode !in [ 'geno', 'genome' ] + ? 
Channel.empty() + : ch_fasta + | map { meta, fasta -> [ meta.id, meta, fasta ] } + | join( + ch_gxf.map { meta2, gxf -> [ meta2.id, gxf ] } + // Join with matching annotation + // to allow one annotations per fasta + ) + | map { _id, meta, fasta, gxf -> [ meta, gxf, fasta ] } + EXTRACT_PROTEINS( + ch_gffread_inputs.map { meta, gxf, _fasta -> [ meta, gxf ] }, + ch_gffread_inputs.map { _meta, _gxf, fasta -> fasta } + ) + + ch_proteins = EXTRACT_PROTEINS.out.gffread_fasta + ch_versions = ch_versions.mix(EXTRACT_PROTEINS.out.versions.first()) + + // MODULE: BUSCO_BUSCO as BUSCO_ANNOTATION + ch_busco_annotation_inputs = ch_proteins + | combine( + Channel.of('proteins') + ) + | combine( + Channel.fromList(val_lineages) + ) + | map { meta, fasta, mode, lineage -> + [ + meta + [ mode:mode, lineage:lineage ], + fasta, mode, lineage + ] + } + | combine( + ch_db_path + ) + | combine( + ch_config_path + ) + + BUSCO_ANNOTATION( + ch_busco_annotation_inputs.map { meta, fasta, _mode, _lineage, _db, _config -> [ meta, fasta ] }, + ch_busco_annotation_inputs.map { _meta, _fasta, mode, _lineage, _db, _config -> mode }, + ch_busco_annotation_inputs.map { _meta, _fasta, _mode, lineage, _db, _config -> lineage }, + ch_busco_annotation_inputs.map { _meta, _fasta, _mode, _lineage, db, _config -> db }, + ch_busco_annotation_inputs.map { _meta, _fasta, _mode, _lineage, _db, config -> config } + ) + + ch_annotation_batch_summary = BUSCO_ANNOTATION.out.batch_summary + ch_annotation_short_summaries_txt = BUSCO_ANNOTATION.out.short_summaries_txt + ch_annotation_short_summaries_json = BUSCO_ANNOTATION.out.short_summaries_json + ch_annotation_full_table = BUSCO_ANNOTATION.out.full_table + ch_versions = ch_versions.mix(BUSCO_ANNOTATION.out.versions.first()) + + // MODULE: BUSCO_GENERATEPLOT as PLOT_ANNOTATION + ch_annotation_plot_summary = ch_annotation_short_summaries_txt + | map { meta, txt -> + def lineage_name = meta.lineage - '_odb' + [ + 
"short_summary.specific.${meta.lineage}.${meta.id}_${lineage_name}.proteins.txt", + txt.text + ] + } + | collectFile + + PLOT_ANNOTATION( ch_annotation_plot_summary.collect() ) + + ch_annotation_png = PLOT_ANNOTATION.out.png + ch_versions = ch_versions.mix(PLOT_ANNOTATION.out.versions) + + + emit: + assembly_batch_summary = ch_assembly_batch_summary // channel: [ meta3, txt ]; meta3 ~ meta + [ val(mode), val(lineage) ] + assembly_short_summaries_txt = ch_assembly_short_summaries_txt // channel: [ meta3, txt ] + assembly_short_summaries_json = ch_assembly_short_summaries_json // channel: [ meta3, json ] + assembly_full_table = ch_assembly_full_table // channel: [ meta3, tsv ] + assembly_plot_summary_txt = ch_assembly_plot_summary // channel: [ text ] + assembly_png = ch_assembly_png // channel: [ png ] + + proteins = ch_proteins // channel: [ meta, faa ] + annotation_batch_summary = ch_annotation_batch_summary // channel: [ meta3, txt ] + annotation_short_summaries_txt = ch_annotation_short_summaries_txt // channel: [ meta3, txt ] + annotation_short_summaries_json = ch_annotation_short_summaries_json // channel: [ meta3, json ] + annotation_full_table = ch_annotation_full_table // channel: [ meta3, tsv ] + annotation_plot_summary_txt = ch_annotation_plot_summary // channel: [ txt ] + annotation_png = ch_annotation_png // channel: [ png ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/fasta_gxf_busco_plot/meta.yml b/subworkflows/nf-core/fasta_gxf_busco_plot/meta.yml new file mode 100644 index 0000000..bf0e437 --- /dev/null +++ b/subworkflows/nf-core/fasta_gxf_busco_plot/meta.yml @@ -0,0 +1,131 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "fasta_gxf_busco_plot" +description: | + Runs BUSCO for input assemblies and their annotations in GFF/GFF3/GTF format, and creates summary plots using `BUSCO/generate_plot.py` script +keywords: + - genome + 
- annotation + - busco + - plot +components: + - busco/busco + - busco/generateplot + - gffread +input: + - ch_fasta: + type: file + description: | + Channel containing FASTA files + Structure:[ val(meta), fasta ] + pattern: "*.{fa,faa,fsa,fas,fasta}(.gz)?" + - ch_gxf: + type: file + description: | + Channel containing GFF/GFF3/GTF files + Structure:[ val(meta2), gxf ] + pattern: "*.{gff,gff3,gtf}" + - val_mode: + type: string + description: | + String containing BUSCO mode to apply to ch_fasta files + Structure:val(mode) + - val_lineages: + type: array + description: | + Array of strings representing BUSCO lineage datasets + Structure:[ val(lineage) ] + - val_busco_lineages_path: + type: path + description: | + Path where BUSCO lineages are located or downloaded if not already there. If this input is `[]`, + BUSCO will download the datasets in the task work directory + Structure:val(busco_lineages_path) + - val_busco_config: + type: path + description: | + Path to BUSCO config. It is optional and can be set to `[]` + Structure:val(busco_config) +output: + - assembly_batch_summary: + type: file + description: | + Channel containing BUSCO batch summaries corresponding to fasta files + Structure: [ val(meta), txt ] + pattern: "*.txt" + - assembly_short_summaries_txt: + type: file + description: | + Channel containing BUSCO short summaries corresponding to fasta files + Structure: [ val(meta), txt ] + pattern: "*.txt" + - assembly_short_summaries_json: + type: file + description: | + Channel containing BUSCO short summaries corresponding to fasta files + Structure: [ val(meta), json ] + pattern: "*.json" + - assembly_full_table: + description: | + Channel containing complete results in a tabular format with scores and lengths of BUSCO matches, + and coordinates (for genome mode) or gene/protein IDs (for transcriptome or protein modes) + Structure: [ val(meta), tsv ] + pattern: "*.tsv" + - assembly_plot_summary_txt: + type: file + description: | + Channel containing 
BUSCO short summaries corresponding to fasta files renamed to include lineage in sample id + Structure: [ txt ] + pattern: "*.txt" + - assembly_png: + type: file + description: | + Channel containing summary plot for assemblies + Structure: png + pattern: "*.png" + - annotation_batch_summary: + type: file + description: | + Channel containing BUSCO batch summaries corresponding to annotation files + Structure: [ val(meta), txt ] + pattern: "*.txt" + - annotation_short_summaries_txt: + type: file + description: | + Channel containing BUSCO short summaries corresponding to annotation files + Structure: [ val(meta), txt ] + pattern: "*.txt" + - annotation_short_summaries_json: + type: file + description: | + Channel containing BUSCO short summaries corresponding to annotation files + Structure: [ val(meta), json ] + pattern: "*.json" + - annotation_full_table: + description: | + Channel containing complete results in a tabular format with scores and lengths of BUSCO matches, + protein IDs in protein mode + Structure: [ val(meta), tsv ] + pattern: "*.tsv" + - annotation_plot_summary_txt: + type: file + description: | + Channel containing BUSCO short summaries corresponding to annotation files renamed to include lineage in sample id + Structure: [ txt ] + pattern: "*.txt" + - annotation_png: + type: file + description: | + Channel containing summary plot for annotations + Structure: png + pattern: "*.png" + - versions: + type: file + description: | + File containing software versions + Structure: [ path(versions.yml) ] + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" + - "@FernandoDuarteF" diff --git a/subworkflows/nf-core/fasta_gxf_busco_plot/tests/main.nf.test b/subworkflows/nf-core/fasta_gxf_busco_plot/tests/main.nf.test new file mode 100644 index 0000000..0154f9c --- /dev/null +++ b/subworkflows/nf-core/fasta_gxf_busco_plot/tests/main.nf.test @@ -0,0 +1,114 @@ +nextflow_workflow { + + name "Test Subworkflow FASTA_GXF_BUSCO_PLOT" + script 
"../main.nf" + workflow "FASTA_GXF_BUSCO_PLOT" + config './nextflow.config' + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/fasta_gxf_busco_plot" + tag "busco" + tag "busco/busco" + tag "busco/generateplot" + tag "gffread" + + test("candidatus_portiera_aleyrodidarum - bacteroides_fragilis - genome") { + + when { + + params { + extract_proteins_args = '-y' + } + + workflow { + """ + input[0] = Channel.of( + [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/genome.fasta', checkIfExists: true) + ], + [ + [ id:'test2' ], // meta map + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true) + ] + ) + input[1] = Channel.of( + [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/gff/test1.gff', checkIfExists: true) + ] + ) + input[2] = 'genome' + input[3] = [ 'bacteria_odb10', 'archaea_odb10' ] + input[4] = [] + input[5] = [] + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot( + workflow.out.assembly_batch_summary, + workflow.out.annotation_batch_summary, + workflow.out.versions, + ).match() + }, + + { assert workflow.out.assembly_png != null }, + { assert workflow.out.annotation_png != null }, + + { assert workflow.out.assembly_short_summaries_json != null }, + { assert workflow.out.assembly_short_summaries_txt != null }, + { assert workflow.out.annotation_short_summaries_json != null }, + { assert workflow.out.annotation_short_summaries_txt != null } + ) + } + } + + test("candidatus_portiera_aleyrodidarum - bacteroides_fragilis - genome - stub") { + + options '-stub' + + when { + + params { + extract_proteins_args = '-y' + } + + workflow { + """ + input[0] = Channel.of( + [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 
'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/genome.fasta', checkIfExists: true) + ], + [ + [ id:'test2' ], // meta map + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true) + ] + ) + input[1] = Channel.of( + [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/gff/test1.gff', checkIfExists: true) + ] + ) + input[2] = 'genome' + input[3] = [ 'bacteria_odb10', 'archaea_odb10' ] + input[4] = [] + input[5] = [] + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out).match()} + ) + } + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/fasta_gxf_busco_plot/tests/main.nf.test.snap b/subworkflows/nf-core/fasta_gxf_busco_plot/tests/main.nf.test.snap new file mode 100644 index 0000000..89604dc --- /dev/null +++ b/subworkflows/nf-core/fasta_gxf_busco_plot/tests/main.nf.test.snap @@ -0,0 +1,255 @@ +{ + "candidatus_portiera_aleyrodidarum - bacteroides_fragilis - genome - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "mode": "genome", + "lineage": "archaea_odb10" + }, + "test-archaea_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "test", + "mode": "genome", + "lineage": "bacteria_odb10" + }, + "test-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "test2", + "mode": "genome", + "lineage": "archaea_odb10" + }, + "test2-archaea_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "test2", + "mode": "genome", + "lineage": "bacteria_odb10" + }, + "test2-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "10": [ + + ], + "11": [ + + ], + "12": [ + "versions.yml:md5,36b11c442943567e471af0abd474a10b", + "versions.yml:md5,9435355f913e283f60b4fb7ef77dd52a", 
+ "versions.yml:md5,e9d65e2f2f13175e99c5b7f4ae1013b9" + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + + ], + "6": [ + [ + { + "id": "test", + "mode": "proteins", + "lineage": "archaea_odb10" + }, + "test-archaea_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "test", + "mode": "proteins", + "lineage": "bacteria_odb10" + }, + "test-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + + ], + "8": [ + + ], + "9": [ + + ], + "annotation_batch_summary": [ + [ + { + "id": "test", + "mode": "proteins", + "lineage": "archaea_odb10" + }, + "test-archaea_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "test", + "mode": "proteins", + "lineage": "bacteria_odb10" + }, + "test-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "annotation_full_table": [ + + ], + "annotation_plot_summary_txt": [ + + ], + "annotation_png": [ + + ], + "annotation_short_summaries_json": [ + + ], + "annotation_short_summaries_txt": [ + + ], + "assembly_batch_summary": [ + [ + { + "id": "test", + "mode": "genome", + "lineage": "archaea_odb10" + }, + "test-archaea_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "test", + "mode": "genome", + "lineage": "bacteria_odb10" + }, + "test-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "test2", + "mode": "genome", + "lineage": "archaea_odb10" + }, + "test2-archaea_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "test2", + "mode": "genome", + "lineage": "bacteria_odb10" + }, + "test2-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "assembly_full_table": [ + + ], + "assembly_plot_summary_txt": [ + + ], + "assembly_png": [ + + ], + "assembly_short_summaries_json": [ + + ], + "assembly_short_summaries_txt": [ + + ], + 
"versions": [ + "versions.yml:md5,36b11c442943567e471af0abd474a10b", + "versions.yml:md5,9435355f913e283f60b4fb7ef77dd52a", + "versions.yml:md5,e9d65e2f2f13175e99c5b7f4ae1013b9" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.10.1" + }, + "timestamp": "2024-11-25T10:31:03.980517" + }, + "candidatus_portiera_aleyrodidarum - bacteroides_fragilis - genome": { + "content": [ + [ + [ + { + "id": "test", + "mode": "genome", + "lineage": "archaea_odb10" + }, + "test-archaea_odb10-busco.batch_summary.txt:md5,1397d74518a776ad75b16a843bc5b6c1" + ], + [ + { + "id": "test", + "mode": "genome", + "lineage": "bacteria_odb10" + }, + "test-bacteria_odb10-busco.batch_summary.txt:md5,a1186bc25448ac1949bf7790810f7161" + ], + [ + { + "id": "test2", + "mode": "genome", + "lineage": "archaea_odb10" + }, + "test2-archaea_odb10-busco.batch_summary.txt:md5,946582b353a8dba7d6452a71856eca06" + ], + [ + { + "id": "test2", + "mode": "genome", + "lineage": "bacteria_odb10" + }, + "test2-bacteria_odb10-busco.batch_summary.txt:md5,21b3fb771cf36be917cc451540d999be" + ] + ], + [ + [ + { + "id": "test", + "mode": "proteins", + "lineage": "archaea_odb10" + }, + "test-archaea_odb10-busco.batch_summary.txt:md5,95172bd5b1a30e632fc79084ea0ca585" + ], + [ + { + "id": "test", + "mode": "proteins", + "lineage": "bacteria_odb10" + }, + "test-bacteria_odb10-busco.batch_summary.txt:md5,995127c0caecb36205dbf21aa2f9f8a8" + ] + ], + [ + "versions.yml:md5,05d8022e3afb0d5642ed17147b991730", + "versions.yml:md5,36b11c442943567e471af0abd474a10b", + "versions.yml:md5,53987b35fc275297efdaf525937fdca3", + "versions.yml:md5,9435355f913e283f60b4fb7ef77dd52a", + "versions.yml:md5,e9d65e2f2f13175e99c5b7f4ae1013b9" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-13T16:39:04.376704" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/fasta_gxf_busco_plot/tests/nextflow.config b/subworkflows/nf-core/fasta_gxf_busco_plot/tests/nextflow.config new 
file mode 100644 index 0000000..12b9956 --- /dev/null +++ b/subworkflows/nf-core/fasta_gxf_busco_plot/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: EXTRACT_PROTEINS { + ext.args = params.extract_proteins_args + } +}