diff --git a/CHANGELOG.md b/CHANGELOG.md index e4d0ca6..9937888 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,12 +3,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v0.6.0 - [10-Dec-2024] +## v0.6.0 - [16-Dec-2024] ### 'Added' 1. Added cDNA and CDS outputs to /annotations/ directory [#118](https://github.com/Plant-Food-Research-Open/genepal/issues/118) 2. Added parameter `add_attrs_to_proteins_cds_fastas` +3. Added parameter `filter_genes_by_aa_length` with default set to `24` which allows removal of genes with ORFs shorter than 24 amino acids [#125](https://github.com/Plant-Food-Research-Open/genepal/issues/125) ### `Fixed` diff --git a/README.md b/README.md index 51f3a3e..af463db 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,9 @@ - Optionally, remove models without any EggNOG-mapper hits - [EggNOG-mapper](https://github.com/eggnogdb/eggnog-mapper): Add functional annotation to gff - [GenomeTools](https://github.com/genometools/genometools): GFF format validation -- [GffRead](https://github.com/gpertea/gffread): Extraction of protein sequences +- [GffRead](https://github.com/gpertea/gffread) + - Extraction of protein sequences + - Optionally, remove models with ORFs shorter than `N` amino acids - [OrthoFinder](https://github.com/davidemms/OrthoFinder): Perform phylogenetic orthology inference across genomes - [GffCompare](https://github.com/gpertea/gffcompare): Compare and benchmark against an existing annotation - [BUSCO](https://gitlab.com/ezlab/busco): Completeness statistics for genome and annotation through proteins diff --git a/conf/modules.config b/conf/modules.config index 44e6123..2a14621 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -240,6 +240,10 @@ process { // SUBWORKFLOW: GFF_MERGE_CLEANUP ext.prefix = { "${meta.id}.liftoff.braker" } } + withName: '.*:GFF_MERGE_CLEANUP:FILTER_BY_ORF_SIZE' { + ext.args = 
params.filter_genes_by_aa_length ? "--no-pseudo --keep-genes -C -l ${ ( params.filter_genes_by_aa_length + 1 ) * 3 }" : '' + } + withName: '.*:GFF_MERGE_CLEANUP:GT_GFF3' { ext.args = '-tidy -retainids -sort' } diff --git a/docs/output.md b/docs/output.md index f4793b5..40b546c 100644 --- a/docs/output.md +++ b/docs/output.md @@ -169,8 +169,8 @@ If more than one genome is included in the pipeline, [ORTHOFINDER](https://githu - `Y/` - `Y.gt.gff3`: Final annotation file for genome `Y` which contains gene models and their functional annotations - `Y.pep.fasta`: Protein sequences for the gene models - - 'Y.cdna.fasta': cDNA sequences for the gene models - - 'Y.cds.fasta': Coding sequences for the gene models + - `Y.cdna.fasta`: cDNA sequences for the gene models + - `Y.cds.fasta`: Coding sequences for the gene models diff --git a/docs/parameters.md b/docs/parameters.md index 9297c4a..7ccd67a 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -59,12 +59,13 @@ A Nextflow pipeline for consensus, phased and pan-genome annotation. 
## Post-annotation filtering options -| Parameter | Description | Type | Default | Required | Hidden | -| ----------------------------- | ----------------------------------------------------------------- | --------- | ------- | -------- | ------ | -| `allow_isoforms` | Allow multiple isoforms for gene models | `boolean` | True | | | -| `enforce_full_intron_support` | Require every model to have external evidence for all its introns | `boolean` | True | | | -| `filter_liftoff_by_hints` | Use BRAKER hints to filter Liftoff models | `boolean` | True | | | -| `eggnogmapper_purge_nohits` | Purge transcripts which do not have a hit against eggnog | `boolean` | | | | +| Parameter | Description | Type | Default | Required | Hidden | +| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ | +| `allow_isoforms` | Allow multiple isoforms for gene models | `boolean` | True | | | +| `enforce_full_intron_support` | Require every model to have external evidence for all its introns | `boolean` | True | | | +| `filter_liftoff_by_hints` | Use BRAKER hints to filter Liftoff models | `boolean` | True | | | +| `eggnogmapper_purge_nohits` | Purge transcripts which do not have a hit against eggnog | `boolean` | | | | +| `filter_genes_by_aa_length` | Filter genes with open reading frames shorter than the specified number of amino acids excluding the stop codon. If set to `null`, this filter step is skipped. 
| `integer` | 24 | | | ## Annotation output options diff --git a/modules/local/tests/gffread/main.nf.test b/modules/local/tests/gffread/main.nf.test new file mode 100644 index 0000000..60e588b --- /dev/null +++ b/modules/local/tests/gffread/main.nf.test @@ -0,0 +1,38 @@ +nextflow_process { + + name "Test Process GFFREAD" + script "../../../nf-core/gffread/main.nf" + config "./nextflow.config" + process "GFFREAD" + + tag "gffread" + tag "modules_nfcore" + tag "modules" + + test("filter by length") { + + when { + process { + """ + input[0] = [ + [id: 'test'], + file("$baseDir" + '/modules/local/tests/gffread/testdata/t.gff', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert file(process.out.gffread_gff[0][1]).text.contains('gene19851') }, + { assert file(process.out.gffread_gff[0][1]).text.contains('gene19851.t1') }, + { assert ! file(process.out.gffread_gff[0][1]).text.contains('gene19851.t2') } // This is the only transcript which is being knocked out + ) + } + + } + +} diff --git a/modules/local/tests/gffread/main.nf.test.snap b/modules/local/tests/gffread/main.nf.test.snap new file mode 100644 index 0000000..261f436 --- /dev/null +++ b/modules/local/tests/gffread/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "filter by length": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test" + }, + "test.gff3:md5,59a7d6ff7123589ef2b90b20043a347c" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + + ], + "gffread_gff": [ + [ + { + "id": "test" + }, + "test.gff3:md5,59a7d6ff7123589ef2b90b20043a347c" + ] + ], + "gtf": [ + + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.04.4" + }, + "timestamp": "2024-12-11T21:11:59.953464" + } +} \ No newline at end of file diff --git 
a/modules/local/tests/gffread/nextflow.config b/modules/local/tests/gffread/nextflow.config new file mode 100644 index 0000000..734d066 --- /dev/null +++ b/modules/local/tests/gffread/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: GFFREAD { + ext.args = '--no-pseudo --keep-genes -C -l 72' + } +} diff --git a/modules/local/tests/gffread/testdata/t.gff b/modules/local/tests/gffread/testdata/t.gff new file mode 100644 index 0000000..6b1c076 --- /dev/null +++ b/modules/local/tests/gffread/testdata/t.gff @@ -0,0 +1,47 @@ +##gff-version 3 +### +chr23 AUGUSTUS gene 16515075 16516672 . - . ID=gene19849;description=Protein%20of%20unknown%20function%20%28DUF1635%29 +chr23 AUGUSTUS mRNA 16515075 16516597 1 - . ID=gene19849.t1;Parent=gene19849;description=Protein%20of%20unknown%20function%20%28DUF1635%29 +chr23 AUGUSTUS exon 16515075 16515794 . - . ID=gene19849.t1.exon1;Parent=gene19849.t1 +chr23 AUGUSTUS CDS 16515075 16515794 1 - 0 ID=gene19849.t1.cds1;Parent=gene19849.t1 +chr23 AUGUSTUS exon 16516562 16516597 . - . ID=gene19849.t1.exon2;Parent=gene19849.t1 +chr23 AUGUSTUS CDS 16516562 16516597 1 - 0 ID=gene19849.t1.cds2;Parent=gene19849.t1 +chr23 gmst mRNA 16515075 16516672 . - . ID=gene19849.t2;Parent=gene19849;description=Protein%20of%20unknown%20function%20%28DUF1635%29 +chr23 gmst exon 16515075 16515794 50.2 - 0 ID=gene19849.t2.exon1;Parent=gene19849.t2 +chr23 gmst CDS 16515075 16515794 50.2 - 0 ID=gene19849.t2.cds1;Parent=gene19849.t2 +chr23 gmst exon 16516562 16516672 50.2 - 0 ID=gene19849.t2.exon2;Parent=gene19849.t2 +chr23 gmst CDS 16516562 16516672 50.2 - 0 ID=gene19849.t2.cds2;Parent=gene19849.t2 +### +chr23 gmst gene 16530414 16531453 . - . ID=gene19850;description=Myb-like%20DNA-binding%20domain +chr23 gmst mRNA 16530414 16531453 . - . 
ID=gene19850.t1;Parent=gene19850;description=Myb-like%20DNA-binding%20domain +chr23 gmst exon 16530414 16531041 42.7 - 1 ID=gene19850.t1.exon1;Parent=gene19850.t1 +chr23 gmst CDS 16530414 16531041 42.7 - 1 ID=gene19850.t1.cds1;Parent=gene19850.t1 +chr23 gmst exon 16531197 16531453 42.7 - 0 ID=gene19850.t1.exon2;Parent=gene19850.t1 +chr23 gmst CDS 16531197 16531453 42.7 - 0 ID=gene19850.t1.cds2;Parent=gene19850.t1 +### +chr23 AUGUSTUS gene 16530414 16531542 . - . ID=gene19851;description=Differing%20isoform%20descriptions +chr23 AUGUSTUS mRNA 16530414 16531542 1 - . ID=gene19851.t1;Parent=gene19851;description=Myb-like%20DNA-binding%20domain +chr23 AUGUSTUS exon 16530414 16530721 . - . ID=gene19851.t1.exon1;Parent=gene19851.t1 +chr23 AUGUSTUS CDS 16530414 16530721 1 - 2 ID=gene19851.t1.cds1;Parent=gene19851.t1 +chr23 AUGUSTUS exon 16530824 16531041 . - . ID=gene19851.t1.exon2;Parent=gene19851.t1 +chr23 AUGUSTUS CDS 16530824 16531041 1 - 1 ID=gene19851.t1.cds2;Parent=gene19851.t1 +chr23 AUGUSTUS exon 16531197 16531326 . - . ID=gene19851.t1.exon3;Parent=gene19851.t1 +chr23 AUGUSTUS CDS 16531197 16531326 1 - 2 ID=gene19851.t1.cds3;Parent=gene19851.t1 +chr23 AUGUSTUS exon 16531428 16531542 . - . ID=gene19851.t1.exon4;Parent=gene19851.t1 +chr23 AUGUSTUS CDS 16531428 16531542 1 - 0 ID=gene19851.t1.cds4;Parent=gene19851.t1 +chr23 GeneMark.hmm3 mRNA 16531514 16531542 . - . ID=gene19851.t2;Parent=gene19851;description=Hypothetical%20protein%20%7C%20no%20eggnog%20hit +chr23 GeneMark.hmm3 exon 16531514 16531542 . - 0 ID=gene19851.t2.exon1;Parent=gene19851.t2 +chr23 GeneMark.hmm3 CDS 16531514 16531542 . - 0 ID=gene19851.t2.cds1;Parent=gene19851.t2 +### +chr23 AUGUSTUS gene 16539401 16545431 . + . ID=gene19852;description=nuclease%20HARBI1 +chr23 AUGUSTUS mRNA 16539401 16545431 1 + . ID=gene19852.t1;Parent=gene19852;description=nuclease%20HARBI1 +chr23 AUGUSTUS exon 16539401 16539509 . + . 
ID=gene19852.t1.exon1;Parent=gene19852.t1 +chr23 AUGUSTUS CDS 16539401 16539509 1 + 0 ID=gene19852.t1.cds1;Parent=gene19852.t1 +chr23 AUGUSTUS exon 16544386 16545431 . + . ID=gene19852.t1.exon2;Parent=gene19852.t1 +chr23 AUGUSTUS CDS 16544386 16545431 1 + 2 ID=gene19852.t1.cds2;Parent=gene19852.t1 +### +chr23 AUGUSTUS gene 16556338 16556796 . + . ID=gene19853;description=Zinc%20finger%20protein +chr23 AUGUSTUS mRNA 16556338 16556796 1 + . ID=gene19853.t1;Parent=gene19853;description=Zinc%20finger%20protein +chr23 AUGUSTUS exon 16556338 16556796 . + . ID=gene19853.t1.exon1;Parent=gene19853.t1 +chr23 AUGUSTUS CDS 16556338 16556796 1 + 0 ID=gene19853.t1.cds1;Parent=gene19853.t1 +### diff --git a/nextflow.config b/nextflow.config index 363f0c5..c3ce861 100644 --- a/nextflow.config +++ b/nextflow.config @@ -54,6 +54,7 @@ params { enforce_full_intron_support = true filter_liftoff_by_hints = true eggnogmapper_purge_nohits = false + filter_genes_by_aa_length = 24 // Annotation output options braker_save_outputs = false diff --git a/nextflow_schema.json b/nextflow_schema.json index b7b5cc4..1012531 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -272,6 +272,13 @@ "type": "boolean", "description": "Purge transcripts which do not have a hit against eggnog", "fa_icon": "fas fa-question-circle" + }, + "filter_genes_by_aa_length": { + "type": "integer", + "default": 24, + "fa_icon": "fas fa-hashtag", + "description": "Filter genes with open reading frames shorter than the specified number of amino acids excluding the stop codon. 
If set to `null`, this filter step is skipped.", + "minimum": 3 } } }, diff --git a/pfr/params.json b/pfr/params.json index 7e993bf..9983398 100644 --- a/pfr/params.json +++ b/pfr/params.json @@ -32,8 +32,9 @@ "enforce_full_intron_support": true, "filter_liftoff_by_hints": true, "eggnogmapper_purge_nohits": false, + "filter_genes_by_aa_length": 24, "braker_save_outputs": false, - "add_attrs_to_proteins_fasta": false, + "add_attrs_to_proteins_cds_fastas": false, "busco_skip": false, "busco_lineage_datasets": "embryophyta_odb10" } diff --git a/subworkflows/local/gff_eggnogmapper.nf b/subworkflows/local/gff_eggnogmapper.nf index 841a243..8e402d4 100644 --- a/subworkflows/local/gff_eggnogmapper.nf +++ b/subworkflows/local/gff_eggnogmapper.nf @@ -16,8 +16,8 @@ workflow GFF_EGGNOGMAPPER { | join(ch_fasta) GFF2FASTA_FOR_EGGNOGMAPPER( - ch_gffread_inputs.map { meta, gff, fasta -> [ meta, gff ] }, - ch_gffread_inputs.map { meta, gff, fasta -> fasta } + ch_gffread_inputs.map { meta, gff, _fasta -> [ meta, gff ] }, + ch_gffread_inputs.map { _meta, _gff, fasta -> fasta } ) ch_gffread_fasta = GFF2FASTA_FOR_EGGNOGMAPPER.out.gffread_fasta @@ -30,9 +30,9 @@ workflow GFF_EGGNOGMAPPER { | combine(Channel.fromPath(db_folder)) EGGNOGMAPPER( - ch_eggnogmapper_inputs.map { meta, fasta, db -> [ meta, fasta ] }, + ch_eggnogmapper_inputs.map { meta, fasta, _db -> [ meta, fasta ] }, [], - ch_eggnogmapper_inputs.map { meta, fasta, db -> db }, + ch_eggnogmapper_inputs.map { _meta, _fasta, db -> db }, [ [], [] ] ) diff --git a/subworkflows/local/gff_merge_cleanup.nf b/subworkflows/local/gff_merge_cleanup.nf index fc6c75e..8a77eda 100644 --- a/subworkflows/local/gff_merge_cleanup.nf +++ b/subworkflows/local/gff_merge_cleanup.nf @@ -1,18 +1,20 @@ include { AGAT_SPMERGEANNOTATIONS } from '../../modules/nf-core/agat/spmergeannotations/main' include { GT_GFF3 } from '../../modules/nf-core/gt/gff3/main' +include { GFFREAD as FILTER_BY_ORF_SIZE } from '../../modules/nf-core/gffread/main' include { 
AGAT_CONVERTSPGXF2GXF } from '../../modules/nf-core/agat/convertspgxf2gxf/main' workflow GFF_MERGE_CLEANUP { take: ch_braker_gff // Channel: [ meta, gff ] ch_liftoff_gff // Channel: [ meta, gff ] + val_filter_by_aa_length // val(null|Integer) main: ch_versions = Channel.empty() ch_gff_branch = ch_braker_gff | join(ch_liftoff_gff, remainder:true) - | branch { meta, braker_gff, liftoff_gff -> + | branch { _meta, braker_gff, liftoff_gff -> both : ( braker_gff && liftoff_gff ) braker_only : ( braker_gff && ( ! liftoff_gff ) ) liftoff_only: ( ( ! braker_gff ) && liftoff_gff ) @@ -25,12 +27,25 @@ workflow GFF_MERGE_CLEANUP { ) ch_merged_gff = AGAT_SPMERGEANNOTATIONS.out.gff - | mix ( ch_gff_branch.liftoff_only.map { meta, braker_gff, liftoff_gff -> [ meta, liftoff_gff ] } ) - | mix ( ch_gff_branch.braker_only.map { meta, braker_gff, liftoff_gff -> [ meta, braker_gff ] } ) + | mix ( ch_gff_branch.liftoff_only.map { meta, _braker_gff, liftoff_gff -> [ meta, liftoff_gff ] } ) + | mix ( ch_gff_branch.braker_only.map { meta, braker_gff, _liftoff_gff -> [ meta, braker_gff ] } ) ch_versions = ch_versions.mix(AGAT_SPMERGEANNOTATIONS.out.versions.first()) + // MODULE: GFFREAD as FILTER_BY_ORF_SIZE + ch_filter_input = ch_merged_gff + | branch { + filter: val_filter_by_aa_length != null + pass: val_filter_by_aa_length == null + } + + FILTER_BY_ORF_SIZE ( ch_filter_input.filter, [] ) + + ch_filtered_gff = FILTER_BY_ORF_SIZE.out.gffread_gff + | mix ( ch_filter_input.pass ) + ch_versions = ch_versions.mix(FILTER_BY_ORF_SIZE.out.versions.first()) + // MODULE: GT_GFF3 - GT_GFF3 ( ch_merged_gff ) + GT_GFF3 ( ch_filtered_gff ) ch_gt_gff = GT_GFF3.out.gt_gff3 ch_versions = ch_versions.mix(GT_GFF3.out.versions.first()) diff --git a/tests/minimal/main.nf.test b/tests/minimal/main.nf.test index cce8a77..5f1d1af 100644 --- a/tests/minimal/main.nf.test +++ b/tests/minimal/main.nf.test @@ -38,6 +38,8 @@ nextflow_pipeline { ['**'] ) + def summary_stats = (Map) new 
groovy.json.JsonSlurper().parseText(file("$outputDir/genepal_data/summary_stats.json").text) + assertAll( { assert workflow.success}, { assert snapshot( @@ -46,6 +48,7 @@ nextflow_pipeline { 'versions': removeNextflowVersion("$outputDir/pipeline_info/genepal_software_mqc_versions.yml"), 'stable paths': stable_path, 'stable names': getRelativePath(stable_name, outputDir), + 'summary_stats': summary_stats ] ).match() } ) diff --git a/tests/minimal/main.nf.test.snap b/tests/minimal/main.nf.test.snap index 48dae90..96c8444 100644 --- a/tests/minimal/main.nf.test.snap +++ b/tests/minimal/main.nf.test.snap @@ -2,7 +2,7 @@ "profile - test": { "content": [ { - "successful tasks": 20, + "successful tasks": 21, "versions": { "AGAT_CONVERTSPGFF2GTF": { "agat": "v1.4.0" @@ -37,6 +37,9 @@ "FASTAVALIDATOR": { "py_fasta_validator": 0.6 }, + "FILTER_BY_ORF_SIZE": { + "gffread": "0.12.7" + }, "FINAL_GFF_CHECK": { "genometools": "1.6.5" }, @@ -67,9 +70,9 @@ "stable paths": [ "a_thaliana.cdna.fasta:md5,12b9bef973e488640aec8c04ba3882fe", "a_thaliana.cds.fasta:md5,b81060419355a590560f92aec8536281", - "a_thaliana.gt.gff3:md5,8ab16549095f605ff8715ac4a3de58ed", + "a_thaliana.gt.gff3:md5,528459cf9596523bf66de99d24c37e20", "a_thaliana.pep.fasta:md5,4994c0393ca0245a1c57966d846d101e", - "a_thaliana.gff3:md5,d23d16cd86499d48a30ffb981ed27891", + "a_thaliana.gff3:md5,30adac1b21d7aaed6ca7fb71ab33f32d", "summary_stats.json:md5,007ba5cf2b7a2fd395a27d9458ca2d2e" ], "stable names": [ @@ -87,13 +90,26 @@ "genepal_report.html", "multiqc_report.html", "pipeline_info" - ] + ], + "summary_stats": { + "stats": [ + { + "ID": "a_thaliana", + "Genes": 252, + "mRNA": 265, + "CDS": 1340, + "Exons": 1340, + "Intron": 1075, + "Non canon splice sites": 18 + } + ] + } } ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.04.2" + "nextflow": "24.04.4" }, - "timestamp": "2024-12-05T07:51:43.818374" + "timestamp": "2024-12-12T09:36:52.952048" } -} +} \ No newline at end of file diff --git a/tests/stub/main.nf.test.snap 
b/tests/stub/main.nf.test.snap index 4516d50..7ed6f6e 100644 --- a/tests/stub/main.nf.test.snap +++ b/tests/stub/main.nf.test.snap @@ -2,7 +2,7 @@ "full - stub": { "content": [ { - "successful tasks": 162, + "successful tasks": 166, "versions": { "AGAT_CONVERTSPGFF2GTF": { "agat": "v1.4.0" @@ -70,6 +70,9 @@ "FASTP": { "fastp": "0.23.4" }, + "FILTER_BY_ORF_SIZE": { + "gffread": "0.12.7" + }, "FINAL_GFF_CHECK": { "genometools": "1.6.5" }, @@ -203,8 +206,8 @@ ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.04.2" + "nextflow": "24.04.4" }, - "timestamp": "2024-12-05T07:56:38.915238" + "timestamp": "2024-12-11T21:51:12.841395" } -} +} \ No newline at end of file diff --git a/workflows/genepal.nf b/workflows/genepal.nf index 538fcfe..6ee525b 100644 --- a/workflows/genepal.nf +++ b/workflows/genepal.nf @@ -178,7 +178,8 @@ workflow GENEPAL { // SUBWORKFLOW: GFF_MERGE_CLEANUP GFF_MERGE_CLEANUP( ch_braker_purged_gff, - ch_liftoff_gff3 + ch_liftoff_gff3, + params.filter_genes_by_aa_length ) ch_merged_gff = GFF_MERGE_CLEANUP.out.gff