From e494dda4f156deb1cfb6890725e2cbc9522491dc Mon Sep 17 00:00:00 2001 From: Usman Rashid <usman@smme.edu.pk> Date: Thu, 19 Sep 2024 14:09:08 +1200 Subject: [PATCH] Now gff files for circular molecules can have end coordinates greater than the sequence length --- CHANGELOG.md | 3 +- modules.json | 2 +- .../gff3_gt_gff3_gff3validator_stat/main.nf | 49 ++++++++++++++- .../tests/main.nf.test | 33 +++++++++- .../tests/main.nf.test.snap | 61 +++++++++++++++++++ 5 files changed, 142 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d8c13200..1a7a0783 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 1. Made the `hic` param pattern more flexible as `^SR\w+$|^\S+\{1,2\}[\w\.]*\.f(ast)?q\.gz$` [#130](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/130) 2. Fixed flowchart syntax to remove '\n' [#132](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/132) -3. Updated modules to remove Bioconda `defaults` channel +3. Updated modules to remove Bioconda `defaults` channel [#135](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/135) +4. Now gff files for circular molecules can have end coordinates greater than the sequence length [#129](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/129) ### `Dependencies` diff --git a/modules.json b/modules.json index f20e4b05..86cf046a 100644 --- a/modules.json +++ b/modules.json @@ -131,7 +131,7 @@ }, "gff3_gt_gff3_gff3validator_stat": { "branch": "main", - "git_sha": "775762619b57101ca800269b6ecda0b915fb9913", + "git_sha": "58c5f9e695b9e03d43e4c59d9339af7c93f0acbe", "installed_by": ["subworkflows"] } } diff --git a/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/main.nf b/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/main.nf index 07608710..291f675e 100644 --- a/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/main.nf +++ b/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/main.nf @@ -129,19 +129,62 @@ def checkGff3FastaCorrespondence(meta, gff3File, faiFile) { def end = parts[4].toInteger() def seqLength = sequenceLengths[name].toInteger() - if (start > seqLength || end > seqLength) { + if ( start > seqLength ) { return [ meta, [], // success log [ "Failed to validate gff3: ${gff3File.name}", - "Coordinates exceed sequence length in GFF3 file:", + "Start coordinates exceed sequence length in the GFF3 file:", + "Sequence: $name", + "Sequence length: $seqLength", + "Start: $start" + ] // error log + ] + } + + if ( end > seqLength ) { + + // Check if the sequence is defined as a circular region + // Otherwise, fail + def regionLine = gff3Lines.find { + def _parts = it.split('\t') + + _parts[0] == "$name" && _parts[2] == 'region' + } + + if ( ! regionLine ) { + return [ + meta, + [], // success log + [ + "Failed to validate gff3: ${gff3File.name}", + "End coordinates exceed sequence length and the sequence attributes are also missing in GFF3 file:", + "Sequence: $name", + "Sequence length: $seqLength", + "End: $end" + ] // error log + ] + } + + def regionAtts = regionLine.split('\t')[8] + def isCircular = regionAtts.contains('circular=true') + + // Models on circular molecules are allowed to exceed sequence length + if ( isCircular ) { continue } + + return [ + meta, + [], // success log + [ + "Failed to validate gff3: ${gff3File.name}", + "End coordinates exceed length of a non-circular sequence in GFF3 file:", "Sequence: $name", "Sequence length: $seqLength", - "Start: $start", "End: $end" ] // error log ] + } } diff --git a/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test b/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test index 6098d7da..d8a32cdf 100644 --- a/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test +++ b/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test @@ -114,7 +114,38 @@ nextflow_workflow { workflow.out.valid_gff3, workflow.out.versions).match() }, - { assert path(workflow.out.log_for_invalid_gff3[0][1]).text.contains('Coordinates exceed sequence length in GFF3 file') } + { assert path(workflow.out.log_for_invalid_gff3[0][1]).text.contains('Start coordinates exceed sequence length in the GFF3 file') } + ) + } + } + + test("sarscov2 - fasta - circular_region - pass") { + + when { + workflow { + """ + def circular_gff = new File('circular_gff.gff') + circular_gff.text = [ + '##gff-version 3', + 'MT192765.1 Genbank region 1 29829 . + . circular=true', + 'MT192765.1 Genbank gene 29551 39667 . + . ID=gene1', + 'MT192765.1 Genbank CDS 29551 39667 . + 0 Parent=gene1' + ].join('\\n') + + input[0] = Channel.of([ [ id:'test' ], // meta map + circular_gff.toPath() + ]) + input[1] = Channel.of([ [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out).match() } ) } } diff --git a/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test.snap b/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test.snap index e81b4459..660f7e0b 100644 --- a/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test.snap +++ b/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test.snap @@ -63,6 +63,67 @@ }, "timestamp": "2024-07-29T16:22:06.684959" }, + "sarscov2 - fasta - circular_region - pass": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gt.gff3:md5,b3bb01b18b8eeac28922ab55c5c6c939" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.yml:md5,545b8e290cfa8a93fd0ff01ad9daee08" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,0cb9519e626e5128d8495cf29b7d59ff", + "versions.yml:md5,80555fe6e28e9564cb534f5478842286", + "versions.yml:md5,8a418ac34d045b0cdac812eb2dc9c106", + "versions.yml:md5,c89b081a13c68acc5326e43ca9104344" + ], + "gff3_stats": [ + [ + { + "id": "test" + }, + "test.yml:md5,545b8e290cfa8a93fd0ff01ad9daee08" + ] + ], + "log_for_invalid_gff3": [ + + ], + "valid_gff3": [ + [ + { + "id": "test" + }, + "test.gt.gff3:md5,b3bb01b18b8eeac28922ab55c5c6c939" + ] + ], + "versions": [ + "versions.yml:md5,0cb9519e626e5128d8495cf29b7d59ff", + "versions.yml:md5,80555fe6e28e9564cb534f5478842286", + "versions.yml:md5,8a418ac34d045b0cdac812eb2dc9c106", + "versions.yml:md5,c89b081a13c68acc5326e43ca9104344" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-19T13:53:32.901064" + }, "sarscov2-genome_gff3-homo_sapiens-genome_fasta-correspondence_fail": { "content": [ [