From e494dda4f156deb1cfb6890725e2cbc9522491dc Mon Sep 17 00:00:00 2001
From: Usman Rashid <usman@smme.edu.pk>
Date: Thu, 19 Sep 2024 14:09:08 +1200
Subject: [PATCH] Now gff files for circular molecules can have end coordinates
 greater than the sequence length

---
 CHANGELOG.md                                  |  3 +-
 modules.json                                  |  2 +-
 .../gff3_gt_gff3_gff3validator_stat/main.nf   | 49 ++++++++++++++-
 .../tests/main.nf.test                        | 33 +++++++++-
 .../tests/main.nf.test.snap                   | 61 +++++++++++++++++++
 5 files changed, 142 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d8c13200..1a7a0783 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,7 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 1. Made the `hic` param pattern more flexible as `^SR\w+$|^\S+\{1,2\}[\w\.]*\.f(ast)?q\.gz$` [#130](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/130)
 2. Fixed flowchart syntax to remove '\n' [#132](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/132)
-3. Updated modules to remove Bioconda `defaults` channel
+3. Updated modules to remove Bioconda `defaults` channel [#135](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/135)
+4. GFF files for circular molecules can now have end coordinates greater than the sequence length [#129](https://github.com/Plant-Food-Research-Open/assemblyqc/issues/129)
 
 ### `Dependencies`
 
diff --git a/modules.json b/modules.json
index f20e4b05..86cf046a 100644
--- a/modules.json
+++ b/modules.json
@@ -131,7 +131,7 @@
                     },
                     "gff3_gt_gff3_gff3validator_stat": {
                         "branch": "main",
-                        "git_sha": "775762619b57101ca800269b6ecda0b915fb9913",
+                        "git_sha": "58c5f9e695b9e03d43e4c59d9339af7c93f0acbe",
                         "installed_by": ["subworkflows"]
                     }
                 }
diff --git a/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/main.nf b/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/main.nf
index 07608710..291f675e 100644
--- a/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/main.nf
+++ b/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/main.nf
@@ -129,19 +129,62 @@ def checkGff3FastaCorrespondence(meta, gff3File, faiFile) {
         def end = parts[4].toInteger()
         def seqLength = sequenceLengths[name].toInteger()
 
-        if (start > seqLength || end > seqLength) {
+        if ( start > seqLength ) {
             return [
                 meta,
                 [], // success log
                 [
                     "Failed to validate gff3: ${gff3File.name}",
-                    "Coordinates exceed sequence length in GFF3 file:",
+                    "Start coordinates exceed sequence length in the GFF3 file:",
+                    "Sequence: $name",
+                    "Sequence length: $seqLength",
+                    "Start: $start"
+                ] // error log
+            ]
+        }
+
+        if ( end > seqLength ) {
+
+            // Check if the sequence is defined as a circular region
+            // Otherwise, fail
+            def regionLine = gff3Lines.find {
+                def _parts = it.split('\t')
+
+                _parts[0] == "$name" && _parts[2] == 'region'
+            }
+
+            if ( ! regionLine ) {
+                return [
+                    meta,
+                    [], // success log
+                    [
+                        "Failed to validate gff3: ${gff3File.name}",
+                        "End coordinates exceed sequence length and the sequence attributes are also missing in GFF3 file:",
+                        "Sequence: $name",
+                        "Sequence length: $seqLength",
+                        "End: $end"
+                    ] // error log
+                ]
+            }
+
+            def regionAtts = regionLine.split('\t')[8]
+            def isCircular = regionAtts.contains('circular=true')
+
+            // Models on circular molecules are allowed to exceed sequence length
+            if ( isCircular ) { continue }
+
+            return [
+                meta,
+                [], // success log
+                [
+                    "Failed to validate gff3: ${gff3File.name}",
+                    "End coordinates exceed length of a non-circular sequence in GFF3 file:",
                     "Sequence: $name",
                     "Sequence length: $seqLength",
-                    "Start: $start",
                     "End: $end"
                 ] // error log
             ]
+
         }
     }
 
diff --git a/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test b/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test
index 6098d7da..d8a32cdf 100644
--- a/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test
+++ b/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test
@@ -114,7 +114,38 @@ nextflow_workflow {
                     workflow.out.valid_gff3,
                     workflow.out.versions).match()
                 },
-                { assert path(workflow.out.log_for_invalid_gff3[0][1]).text.contains('Coordinates exceed sequence length in GFF3 file') }
+                { assert path(workflow.out.log_for_invalid_gff3[0][1]).text.contains('Start coordinates exceed sequence length in the GFF3 file') }
+            )
+        }
+    }
+
+    test("sarscov2 - fasta - circular_region - pass") {
+
+        when {
+            workflow {
+                """
+                def circular_gff = new File('circular_gff.gff')
+                circular_gff.text = [
+                    '##gff-version 3',
+                    'MT192765.1	Genbank	region	1	29829	.	+	.	circular=true',
+                    'MT192765.1	Genbank	gene	29551	39667	.	+	.	ID=gene1',
+                    'MT192765.1	Genbank	CDS	29551	39667	.	+	0	Parent=gene1'
+                ].join('\\n')
+
+                input[0] = Channel.of([ [ id:'test' ], // meta map
+                    circular_gff.toPath()
+                ])
+                input[1] = Channel.of([ [ id:'test' ],
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+                ])
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success},
+                { assert snapshot(workflow.out).match() }
             )
         }
     }
diff --git a/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test.snap b/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test.snap
index e81b4459..660f7e0b 100644
--- a/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test.snap
+++ b/subworkflows/gallvp/gff3_gt_gff3_gff3validator_stat/tests/main.nf.test.snap
@@ -63,6 +63,67 @@
         },
         "timestamp": "2024-07-29T16:22:06.684959"
     },
+    "sarscov2 - fasta - circular_region - pass": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.gt.gff3:md5,b3bb01b18b8eeac28922ab55c5c6c939"
+                    ]
+                ],
+                "1": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.yml:md5,545b8e290cfa8a93fd0ff01ad9daee08"
+                    ]
+                ],
+                "2": [
+                    
+                ],
+                "3": [
+                    "versions.yml:md5,0cb9519e626e5128d8495cf29b7d59ff",
+                    "versions.yml:md5,80555fe6e28e9564cb534f5478842286",
+                    "versions.yml:md5,8a418ac34d045b0cdac812eb2dc9c106",
+                    "versions.yml:md5,c89b081a13c68acc5326e43ca9104344"
+                ],
+                "gff3_stats": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.yml:md5,545b8e290cfa8a93fd0ff01ad9daee08"
+                    ]
+                ],
+                "log_for_invalid_gff3": [
+                    
+                ],
+                "valid_gff3": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.gt.gff3:md5,b3bb01b18b8eeac28922ab55c5c6c939"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,0cb9519e626e5128d8495cf29b7d59ff",
+                    "versions.yml:md5,80555fe6e28e9564cb534f5478842286",
+                    "versions.yml:md5,8a418ac34d045b0cdac812eb2dc9c106",
+                    "versions.yml:md5,c89b081a13c68acc5326e43ca9104344"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.0",
+            "nextflow": "24.04.4"
+        },
+        "timestamp": "2024-09-19T13:53:32.901064"
+    },
     "sarscov2-genome_gff3-homo_sapiens-genome_fasta-correspondence_fail": {
         "content": [
             [