Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added subworkflow fasta_gxf_busco_plot #7051

Merged
merged 9 commits into from
Nov 26, 2024
172 changes: 172 additions & 0 deletions subworkflows/nf-core/fasta_gxf_busco_plot/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
include { BUSCO_BUSCO as BUSCO_ASSEMBLY } from '../../../modules/nf-core/busco/busco/main'
include { BUSCO_GENERATEPLOT as PLOT_ASSEMBLY } from '../../../modules/nf-core/busco/generateplot/main'
include { GFFREAD as EXTRACT_PROTEINS } from '../../../modules/nf-core/gffread/main'
include { BUSCO_BUSCO as BUSCO_ANNOTATION } from '../../../modules/nf-core/busco/busco/main'
include { BUSCO_GENERATEPLOT as PLOT_ANNOTATION } from '../../../modules/nf-core/busco/generateplot/main'

//
// Runs BUSCO on every input FASTA for every requested lineage, plots the
// resulting short summaries and, when the mode is genome, additionally
// extracts proteins from the matching GFF/GTF annotation with GFFREAD,
// runs BUSCO in 'proteins' mode on them and plots those summaries as well.
//
workflow FASTA_GXF_BUSCO_PLOT {

    take:
    ch_fasta                    // channel: [ val(meta), fasta ]
    ch_gxf                      // channel: [ val(meta2), gxf ]; gxf ~ gff | gff3 | gtf
                                //
                                // meta and meta2 should have same id

    val_mode                    // val(mode); BUSCO mode to apply to ch_fasta
                                // - genome, for genome assemblies (DNA)
                                // - transcriptome, for transcriptome assemblies (DNA)
                                // - proteins, for annotated gene sets (protein)
                                //
                                // If mode is genome, annotations from ch_gxf are evaluated with
                                // mode proteins, otherwise, evaluation of the annotations is skipped
                                //
    val_lineages                // [ val(lineage) ]
    val_busco_lineages_path     // val(path); Optional; Set to [] if not needed
    val_busco_config            // val(path); Optional; Set to [] if not needed

    main:
    ch_versions                 = Channel.empty()

    // Optional inputs: when not provided, a single [] placeholder is emitted
    // so that the `combine` calls below still yield one tuple per
    // fasta x lineage combination, with [] in the db / config position
    ch_db_path                  = val_busco_lineages_path
                                ? Channel.of(file(val_busco_lineages_path, checkIfExists: true))
                                : Channel.of( [ [] ] )
    ch_config_path              = val_busco_config
                                ? Channel.of(file(val_busco_config, checkIfExists: true))
                                : Channel.of( [ [] ] )

    // MODULE: BUSCO_BUSCO as BUSCO_ASSEMBLY
    // Build one tuple [ meta3, fasta, mode, lineage, db, config ] per
    // fasta x lineage, where meta3 is meta extended with mode and lineage
    ch_busco_assembly_inputs    = ch_fasta
                                | combine(
                                    Channel.of(val_mode)
                                )
                                | combine(
                                    Channel.fromList(val_lineages)
                                )
                                | map { meta, fasta, mode, lineage ->
                                    [
                                        meta + [ mode:mode, lineage:lineage ],
                                        fasta, mode, lineage
                                    ]
                                }
                                | combine(
                                    ch_db_path
                                )
                                | combine(
                                    ch_config_path
                                )

    // Each process input is sliced from the same combined channel
    BUSCO_ASSEMBLY(
        ch_busco_assembly_inputs.map { meta, fasta, _mode, _lineage, _db, _config -> [ meta, fasta ] },
        ch_busco_assembly_inputs.map { _meta, _fasta, mode, _lineage, _db, _config -> mode },
        ch_busco_assembly_inputs.map { _meta, _fasta, _mode, lineage, _db, _config -> lineage },
        ch_busco_assembly_inputs.map { _meta, _fasta, _mode, _lineage, db, _config -> db },
        ch_busco_assembly_inputs.map { _meta, _fasta, _mode, _lineage, _db, config -> config }
    )

    ch_assembly_batch_summary           = BUSCO_ASSEMBLY.out.batch_summary
    ch_assembly_short_summaries_txt     = BUSCO_ASSEMBLY.out.short_summaries_txt
    ch_assembly_short_summaries_json    = BUSCO_ASSEMBLY.out.short_summaries_json
    ch_assembly_full_table              = BUSCO_ASSEMBLY.out.full_table
    ch_versions                         = ch_versions.mix(BUSCO_ASSEMBLY.out.versions.first())

    // MODULE: BUSCO_GENERATEPLOT as PLOT_ASSEMBLY
    // Re-write each short summary under a file name that carries both the
    // sample id and the lineage, so summaries from different lineages
    // get distinct names before being collected for plotting
    ch_assembly_plot_summary    = ch_assembly_short_summaries_txt
                                | map { meta, txt ->
                                    // String minus drops the first '_odb', e.g. 'x_odb10' -> 'x10'
                                    def lineage_name = meta.lineage - '_odb'
                                    [
                                        "short_summary.specific.${meta.lineage}.${meta.id}_${lineage_name}.txt",
                                        txt.text
                                    ]
                                }
                                | collectFile

    PLOT_ASSEMBLY( ch_assembly_plot_summary.collect() )

    ch_assembly_png             = PLOT_ASSEMBLY.out.png
    ch_versions                 = ch_versions.mix(PLOT_ASSEMBLY.out.versions)

    // MODULE: GFFREAD as EXTRACT_PROTEINS
    // Annotations are only evaluated in genome mode; for any other mode the
    // channel is empty and all annotation steps below receive no input
    ch_gffread_inputs           = val_mode !in [ 'geno', 'genome' ]
                                ? Channel.empty()
                                : ch_fasta
                                | map { meta, fasta -> [ meta.id, meta, fasta ] }
                                | join(
                                    ch_gxf.map { meta2, gxf -> [ meta2.id, gxf ] }
                                    // Join with matching annotation
                                    // to allow one annotation per fasta
                                )
                                | map { _id, meta, fasta, gxf -> [ meta, gxf, fasta ] }

    EXTRACT_PROTEINS(
        ch_gffread_inputs.map { meta, gxf, _fasta -> [ meta, gxf ] },
        ch_gffread_inputs.map { _meta, _gxf, fasta -> fasta }
    )

    ch_proteins                 = EXTRACT_PROTEINS.out.gffread_fasta
    ch_versions                 = ch_versions.mix(EXTRACT_PROTEINS.out.versions.first())

    // MODULE: BUSCO_BUSCO as BUSCO_ANNOTATION
    // Same input construction as for BUSCO_ASSEMBLY, with the mode fixed
    // to 'proteins'
    ch_busco_annotation_inputs  = ch_proteins
                                | combine(
                                    Channel.of('proteins')
                                )
                                | combine(
                                    Channel.fromList(val_lineages)
                                )
                                | map { meta, fasta, mode, lineage ->
                                    [
                                        meta + [ mode:mode, lineage:lineage ],
                                        fasta, mode, lineage
                                    ]
                                }
                                | combine(
                                    ch_db_path
                                )
                                | combine(
                                    ch_config_path
                                )

    BUSCO_ANNOTATION(
        ch_busco_annotation_inputs.map { meta, fasta, _mode, _lineage, _db, _config -> [ meta, fasta ] },
        ch_busco_annotation_inputs.map { _meta, _fasta, mode, _lineage, _db, _config -> mode },
        ch_busco_annotation_inputs.map { _meta, _fasta, _mode, lineage, _db, _config -> lineage },
        ch_busco_annotation_inputs.map { _meta, _fasta, _mode, _lineage, db, _config -> db },
        ch_busco_annotation_inputs.map { _meta, _fasta, _mode, _lineage, _db, config -> config }
    )

    ch_annotation_batch_summary         = BUSCO_ANNOTATION.out.batch_summary
    ch_annotation_short_summaries_txt   = BUSCO_ANNOTATION.out.short_summaries_txt
    ch_annotation_short_summaries_json  = BUSCO_ANNOTATION.out.short_summaries_json
    ch_annotation_full_table            = BUSCO_ANNOTATION.out.full_table
    ch_versions                         = ch_versions.mix(BUSCO_ANNOTATION.out.versions.first())

    // MODULE: BUSCO_GENERATEPLOT as PLOT_ANNOTATION
    // Same renaming as for the assembly summaries, with a '.proteins' suffix
    ch_annotation_plot_summary  = ch_annotation_short_summaries_txt
                                | map { meta, txt ->
                                    // String minus drops the first '_odb', e.g. 'x_odb10' -> 'x10'
                                    def lineage_name = meta.lineage - '_odb'
                                    [
                                        "short_summary.specific.${meta.lineage}.${meta.id}_${lineage_name}.proteins.txt",
                                        txt.text
                                    ]
                                }
                                | collectFile

    PLOT_ANNOTATION( ch_annotation_plot_summary.collect() )

    ch_annotation_png           = PLOT_ANNOTATION.out.png
    ch_versions                 = ch_versions.mix(PLOT_ANNOTATION.out.versions)


    emit:
    assembly_batch_summary          = ch_assembly_batch_summary             // channel: [ meta3, txt ]; meta3 ~ meta + [ val(mode), val(lineage) ]
    assembly_short_summaries_txt    = ch_assembly_short_summaries_txt       // channel: [ meta3, txt ]
    assembly_short_summaries_json   = ch_assembly_short_summaries_json      // channel: [ meta3, json ]
    assembly_full_table             = ch_assembly_full_table                // channel: [ meta3, tsv ]
    assembly_plot_summary_txt       = ch_assembly_plot_summary              // channel: [ txt ]
    assembly_png                    = ch_assembly_png                       // channel: [ png ]
    annotation_batch_summary        = ch_annotation_batch_summary           // channel: [ meta3, txt ]
    annotation_short_summaries_txt  = ch_annotation_short_summaries_txt     // channel: [ meta3, txt ]
    annotation_short_summaries_json = ch_annotation_short_summaries_json    // channel: [ meta3, json ]
    annotation_full_table           = ch_annotation_full_table              // channel: [ meta3, tsv ]
    annotation_plot_summary_txt     = ch_annotation_plot_summary            // channel: [ txt ]
    annotation_png                  = ch_annotation_png                     // channel: [ png ]
    versions                        = ch_versions                           // channel: [ versions.yml ]
}
131 changes: 131 additions & 0 deletions subworkflows/nf-core/fasta_gxf_busco_plot/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
name: "fasta_gxf_busco_plot"
description: |
  Runs BUSCO for input assemblies and their annotations in GFF/GFF3/GTF format, and creates summary plots using `BUSCO/generate_plot.py` script
keywords:
  - genome
  - annotation
  - busco
  - plot
components:
  - busco/busco
  - busco/generateplot
  - gffread
input:
  - ch_fasta:
      type: file
      description: |
        Channel containing FASTA files
        Structure: [ val(meta), fasta ]
      pattern: "*.{fa,faa,fsa,fas,fasta}(.gz)?"
  - ch_gxf:
      type: file
      description: |
        Channel containing GFF/GFF3/GTF files
        Structure: [ val(meta2), gxf ]
      pattern: "*.{gff,gff3,gtf}"
  - val_mode:
      type: string
      description: |
        String containing BUSCO mode to apply to ch_fasta files
        Structure: val(mode)
  - val_lineages:
      type: array
      description: |
        Array of strings representing BUSCO lineage datasets
        Structure: [ val(lineage) ]
  - val_busco_lineages_path:
      type: path
      description: |
        Path where BUSCO lineages are located or downloaded if not already there. If this input is `[]`,
        BUSCO will download the datasets in the task work directory
        Structure: val(busco_lineages_path)
  - val_busco_config:
      type: path
      description: |
        Path to BUSCO config. It is optional and can be set to `[]`
        Structure: val(busco_config)
output:
  - assembly_batch_summary:
      type: file
      description: |
        Channel containing BUSCO batch summaries corresponding to fasta files
        Structure: [ val(meta), txt ]
      pattern: "*.txt"
  - assembly_short_summaries_txt:
      type: file
      description: |
        Channel containing BUSCO short summaries corresponding to fasta files
        Structure: [ val(meta), txt ]
      pattern: "*.txt"
  - assembly_short_summaries_json:
      type: file
      description: |
        Channel containing BUSCO short summaries corresponding to fasta files
        Structure: [ val(meta), json ]
      pattern: "*.json"
  # `type: file` added for consistency with the other output entries
  - assembly_full_table:
      type: file
      description: |
        Channel containing complete results in a tabular format with scores and lengths of BUSCO matches,
        and coordinates (for genome mode) or gene/protein IDs (for transcriptome or protein modes)
        Structure: [ val(meta), tsv ]
      pattern: "*.tsv"
  - assembly_plot_summary_txt:
      type: file
      description: |
        Channel containing BUSCO short summaries corresponding to fasta files renamed to include lineage in sample id
        Structure: [ txt ]
      pattern: "*.txt"
  - assembly_png:
      type: file
      description: |
        Channel containing summary plot for assemblies
        Structure: png
      pattern: "*.png"
  - annotation_batch_summary:
      type: file
      description: |
        Channel containing BUSCO batch summaries corresponding to annotation files
        Structure: [ val(meta), txt ]
      pattern: "*.txt"
  - annotation_short_summaries_txt:
      type: file
      description: |
        Channel containing BUSCO short summaries corresponding to annotation files
        Structure: [ val(meta), txt ]
      pattern: "*.txt"
  - annotation_short_summaries_json:
      type: file
      description: |
        Channel containing BUSCO short summaries corresponding to annotation files
        Structure: [ val(meta), json ]
      pattern: "*.json"
  # `type: file` added for consistency with the other output entries
  - annotation_full_table:
      type: file
      description: |
        Channel containing complete results in a tabular format with scores and lengths of BUSCO matches,
        and protein IDs (protein mode)
        Structure: [ val(meta), tsv ]
      pattern: "*.tsv"
  - annotation_plot_summary_txt:
      type: file
      description: |
        Channel containing BUSCO short summaries corresponding to annotation files renamed to include lineage in sample id
        Structure: [ txt ]
      pattern: "*.txt"
  - annotation_png:
      type: file
      description: |
        Channel containing summary plot for annotations
        Structure: png
      pattern: "*.png"
  - versions:
      type: file
      description: |
        File containing software versions
        Structure: [ path(versions.yml) ]
      pattern: "versions.yml"
authors:
  - "@GallVp"
maintainers:
  - "@GallVp"
  - "@FernandoDuarteF"
Loading
Loading