From fb224a73e5dffbc87de568af09520f3c20157884 Mon Sep 17 00:00:00 2001
From: jemten <jemten@users.noreply.github.com>
Date: Fri, 13 Dec 2024 16:11:44 +0100
Subject: [PATCH 1/4] fixing parsing of chrM

---
 genmod/annotate_regions/get_features.py      |  4 +--
 genmod/annotate_regions/parse_annotations.py | 34 +++++++++++---------
 genmod/annotate_variants/annotate.py         | 18 ++++++-----
 genmod/utils/get_priority.py                 | 31 +++++++++---------
 pytest.ini                                   |  2 +-
 tests/annotate_regions/test_bed_parser.py    | 27 ++++++++++++++--
 tests/conftest.py                            |  6 ++--
 tests/fixtures/test_vcf_regions.vcf          |  2 +-
 tests/fixtures/test_vcf_regions_with_chr.vcf |  2 +-
 tests/functionality/test_annotate_variant.py |  9 +++---
 tests/utils/test_get_priority.py             |  6 +++-
 11 files changed, 86 insertions(+), 55 deletions(-)

diff --git a/genmod/annotate_regions/get_features.py b/genmod/annotate_regions/get_features.py
index 5f8ddd1..d867cc8 100755
--- a/genmod/annotate_regions/get_features.py
+++ b/genmod/annotate_regions/get_features.py
@@ -15,13 +15,13 @@
 
 def get_region(chrom, start, end, region_trees):
     """Check if a position overlapps any regions
-    
+
     Arguments:
        chrom (str): The chromosome
        start (int): The start position for the feature
        end (int): The stop position for the feature
        region_trees (dict): A dictionary with chromosomes as keys and interval trees as values
-    
+
     Returns:
         regions (set): The regions that the variant ovelapps
     """
diff --git a/genmod/annotate_regions/parse_annotations.py b/genmod/annotate_regions/parse_annotations.py
index e00be29..d00b461 100755
--- a/genmod/annotate_regions/parse_annotations.py
+++ b/genmod/annotate_regions/parse_annotations.py
@@ -3,7 +3,7 @@
 """
 annotation_parser.py
 
-This script will parse a file with intervals in .bed format and build 
+This script will parse a file with intervals in .bed format and build
 one interval tree for each chromosome.
 These intervals typically represents genes
 
@@ -29,28 +29,28 @@
 
 def get_interval(start, stop, value):
     """Create an interval instance
-    
+
         Args:
             start(int)
             stop(int)
             value
-        
+
         Returns:
             interval(intervaltree.Interval)
-    
+
     """
     interval = Interval(start, stop, value)
     return interval
 
 def build_region_trees(bed_lines, padding):
     """Build region trees for each chromosome
-    
-        Build a dictionary with chromosomes as keys and interval trees as 
+
+        Build a dictionary with chromosomes as keys and interval trees as
         values.
-    
+
         Args:
             bed_lines(iterable): An iterable with bed formated lines
-            padding (int): Defines what should be considered upstream 
+            padding (int): Defines what should be considered upstream
                            and downstream variants
     """
     region_trees = {}
@@ -71,14 +71,14 @@ def build_region_trees(bed_lines, padding):
 def bed_parser(bed_lines, padding=4000):
     """
     Parse a file in the bed format.
-    
+
     Arguments:
         bed_lines(iterable): An iterable with bed formated lines
-        padding (int): Defines what should be considered upstream 
+        padding (int): Defines what should be considered upstream
                        and downstream variants
-    
+
     Yields:
-        region(dict): 
+        region(dict):
                     {
                         'chrom': str,
                         'start': int,
@@ -93,22 +93,24 @@ def bed_parser(bed_lines, padding=4000):
             feature_id = str(index)
             # Get the coordinates for the region:
             chrom = line[0].lstrip('chr')
-            if chrom == 'MT':
+            if chrom in ['MT', 'M']:
+                # Only represent the mitochondrial chromosome as M
+                chrom = 'M'
                 feature_start = int(line[1])
                 feature_stop = int(line[2])
             else:
                 feature_start = max(int(line[1]) - padding, 0)
                 feature_stop = int(line[2])
-                
+
             # Get the feature id
             if len(line) > 3:
                 feature_id = line [3]
-            
+
             region = {
                 'chrom': chrom,
                 'start': feature_start,
                 'stop': feature_stop,
                 'symbol': feature_id
             }
-            
+
             yield region
diff --git a/genmod/annotate_variants/annotate.py b/genmod/annotate_variants/annotate.py
index 388309b..37b5c63 100644
--- a/genmod/annotate_variants/annotate.py
+++ b/genmod/annotate_variants/annotate.py
@@ -1,7 +1,7 @@
 import logging
 
 from genmod.annotate_regions.get_features import get_region
-from genmod.annotate_variants.read_tabix_files import (get_frequencies, 
+from genmod.annotate_variants.read_tabix_files import (get_frequencies,
      get_spidex_score, get_cadd_scores)
 
 logger = logging.getLogger(__name__)
@@ -12,29 +12,31 @@ def annotate_variant(variant, annotation_arguments):
     chrom = variant_info[0]
     if chrom.startswith(('chr', 'CHR', 'Chr')):
         chrom = chrom[3:]
+    elif chrom == 'MT':
+        chrom = 'M'
     pos = int(variant_info[1])
     ref = variant_info[3]
     alt = variant_info[4]
-    
+
     info = variant_info[7]
     if info == '.':
         info = []
     else:
         info = info.split(';')
-    
+
     ## TODO this needs to be handeled different for SV:s
     start = pos
     # This is a construct so that there will not be inconsistent genetic regions
     end = pos + 1
     # end = pos + max(len(ref), len(alt))
-    
+
     #Check which annotations that are available
     regions = None
     if 'region_trees' in annotation_arguments:
         regions = get_region(chrom, start, end, annotation_arguments['region_trees'])
         if regions:
             info.append("Annotation={0}".format(','.join(regions)))
-    
+
     if 'exac' in annotation_arguments:
         reader = annotation_arguments['exac']
         frequencies = get_frequencies(reader, chrom, start, alt)
@@ -76,7 +78,7 @@ def annotate_variant(variant, annotation_arguments):
         info_string = ';'.join(info)
     else:
         info_string = '.'
-    
+
     variant_info[7] = info_string
-    
-    return '\t'.join(variant_info)
\ No newline at end of file
+
+    return '\t'.join(variant_info)
diff --git a/genmod/utils/get_priority.py b/genmod/utils/get_priority.py
index 6f20e78..6d0b31e 100644
--- a/genmod/utils/get_priority.py
+++ b/genmod/utils/get_priority.py
@@ -4,21 +4,21 @@
 def get_chromosome_priority(chrom, chrom_dict={}):
     """
     Return the chromosome priority
-    
+
     Arguments:
         chrom (str): The cromosome name from the vcf
         chrom_dict (dict): A map of chromosome names and theis priority
-    
+
     Return:
         priority (str): The priority for this chromosom
     """
     priority = '0'
-    
+
     chrom = chrom.lstrip('chr')
-    
+
     if chrom_dict:
         priority = chrom_dict.get(chrom, '0')
-    
+
     else:
         try:
             if int(chrom) < 23:
@@ -28,38 +28,38 @@ def get_chromosome_priority(chrom, chrom_dict={}):
                 priority = '23'
             elif chrom == 'Y':
                 priority = '24'
-            elif chrom == 'MT':
+            elif chrom in ['MT', 'M']:
                 priority = '25'
             else:
                 priority = '26'
-    
+
     return priority
 
 def get_rank_score(variant_line=None, variant_dict=None, family_id=0, rank_score_type: str = 'RankScore'):
     """
     Return the rank score priority for a certain family.
-    
+
     If no family is given the first family found is used
-    
+
     Arguments:
         variant_line (str): A vcf variant line
         variant_dict (dict): A variant dictionary
         family_id (str): A family id
         rank_score_type(str): Return rank score based on raw or normalized format
         See the genmod.score_variants.rank_score_variant_definitions for more info.
-    
+
     Return:
         rank_score (str): The rank score for this variant
     """
     if rank_score_type not in RANK_SCORE_TYPE_NAMES:
         raise ValueError('Unknown rank_score_type', rank_score_type)
-    
+
     rank_score = -100
     raw_entry = None
-    
+
     if variant_line:
         variant_line = variant_line.split("\t")
-    
+
         for info_annotation in variant_line[7].split(';'):
             info_annotation = info_annotation.split('=')
             key = None
@@ -69,10 +69,10 @@ def get_rank_score(variant_line=None, variant_dict=None, family_id=0, rank_score
             if key == rank_score_type:
                 raw_entry = value
                 break
-    
+
     elif variant_dict:
         raw_entry: str = variant_dict['info_dict'].get(rank_score_type)
-    
+
     if raw_entry:
         for family_annotation in raw_entry.split(','):
             family_annotation = family_annotation.split(':')
@@ -86,4 +86,3 @@ def get_rank_score(variant_line=None, variant_dict=None, family_id=0, rank_score
                 rank_score = float(family_annotation[1])
 
     return str(rank_score)
-    
\ No newline at end of file
diff --git a/pytest.ini b/pytest.ini
index a8cde3d..313c0bf 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -2,4 +2,4 @@
 [pytest]
 minversion = 6.0
 addopts = -ra -q
-testpaths = []
\ No newline at end of file
+testpaths = tests
diff --git a/tests/annotate_regions/test_bed_parser.py b/tests/annotate_regions/test_bed_parser.py
index 477d180..351f726 100644
--- a/tests/annotate_regions/test_bed_parser.py
+++ b/tests/annotate_regions/test_bed_parser.py
@@ -12,6 +12,29 @@ def test_bed_parser(bed_lines):
     for index, region in enumerate(bed_parser(bed_lines)):
     # THEN assert the symbols are found in the bed lines
         assert region['symbol'] in symbols
-    
+
     assert index+1 == nr_regions
-        
\ No newline at end of file
+
+def test_bed_parser_with_mt_chromosome():
+    # Given a bed line with the mitochondrial chromosome encoded as MT
+    bed_lines = [
+        "MT\t1000\t5000\tgene_MT"
+    ]
+    # Then the output region should not be padded
+    expected_output = [
+        {'chrom': 'M', 'start': 1000, 'stop': 5000, 'symbol': 'gene_MT'}
+    ]
+    result = list(bed_parser(bed_lines, padding=4000))
+    assert result == expected_output
+
+def test_bed_parser_with_chrm_chromosome():
+    # Given a bed line with the mitochondrial chromosome encoded as chrM instead of MT
+    bed_lines = [
+        "chrM\t1000\t5000\tgene_chrM"
+    ]
+    # Then the output region should not be padded and the chrom encoded as M
+    expected_output = [
+        {'chrom': 'M', 'start': 1000, 'stop': 5000, 'symbol': 'gene_chrM'}
+    ]
+    result = list(bed_parser(bed_lines, padding=4000))
+    assert result == expected_output
diff --git a/tests/conftest.py b/tests/conftest.py
index a884d19..1ab9fd4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -46,7 +46,7 @@ def header(request, vcf_path):
     with open(vcf_path, 'r') as variant_file:
         for line in variant_file:
             line = line.rstrip()
-        
+
             if line.startswith('#'):
                 if line.startswith('##'):
                     head.parse_meta_data(line)
@@ -54,7 +54,7 @@ def header(request, vcf_path):
                     head.parse_header_line(line)
             else:
                 break
-    
+
     return head
 
 
@@ -72,4 +72,4 @@ def bed_lines(request):
 @pytest.fixture(scope='function')
 def ensembl_file(request):
     """Return the path to ensembl file with region defenitions"""
-    return ensembl_path
\ No newline at end of file
+    return ensembl_path
diff --git a/tests/fixtures/test_vcf_regions.vcf b/tests/fixtures/test_vcf_regions.vcf
index b13f977..14c03c1 100644
--- a/tests/fixtures/test_vcf_regions.vcf
+++ b/tests/fixtures/test_vcf_regions.vcf
@@ -18,4 +18,4 @@
 10	76154074	.	C	G	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60
 10	76154076	.	G	C	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/0:10,10:60	0/1:10,10:60	./.	0/0:10,10:60	0/1:10,10:60
 X	302253	.	CCCTCCTGCCCCT	C	100	PASS	MQ=1;Annotation=PPP2R3B	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
-MT	302253	.	CCCTCCTGCCCCT	C	100	PASS	MQ=1	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
+MT	4336	.	T	C	100	PASS	MQ=1	GT:AD:GQ	0/0:321,0:60	1/1:0,341:54	1/1:0,341:54	0/0:321,0:60	1/1:0,341:54	1/1:0,341:54
diff --git a/tests/fixtures/test_vcf_regions_with_chr.vcf b/tests/fixtures/test_vcf_regions_with_chr.vcf
index 51eb0ef..ee012d5 100644
--- a/tests/fixtures/test_vcf_regions_with_chr.vcf
+++ b/tests/fixtures/test_vcf_regions_with_chr.vcf
@@ -18,4 +18,4 @@ chr10	76154073	.	T	G	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	0/0:10,10:60	0/0:10,1
 chr10	76154074	.	C	G	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60
 chr10	76154076	.	G	C	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/0:10,10:60	0/1:10,10:60	./.	0/0:10,10:60	0/1:10,10:60
 chrX	302253	.	CCCTCCTGCCCCT	C	100	PASS	MQ=1;Annotation=PPP2R3B	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
-chrM	302253	.	CCCTCCTGCCCCT	C	100	PASS	MQ=1	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
+chrM	4336	.	T	C	100	PASS	MQ=1	GT:AD:GQ	0/0:321,0:60	1/1:0,341:54	1/1:0,341:54	0/0:321,0:60	1/1:0,341:54	1/1:0,341:54
diff --git a/tests/functionality/test_annotate_variant.py b/tests/functionality/test_annotate_variant.py
index d040a3b..67445bd 100644
--- a/tests/functionality/test_annotate_variant.py
+++ b/tests/functionality/test_annotate_variant.py
@@ -2,6 +2,7 @@
 from click.testing import CliRunner
 
 VCF_FILE = "tests/fixtures/test_vcf_regions.vcf"
+VCF_FILE_38 = "tests/fixtures/test_vcf_regions_with_chr.vcf"
 EMPTY_VCF_FILE = "tests/fixtures/empty.vcf"
 THOUSAND_G_FILE = "tests/fixtures/annotate_variant/small_1000G.vcf.gz"
 CADD_FILE = "tests/fixtures/annotate_variant/small_CADD.tsv.gz"
@@ -60,7 +61,7 @@ def test_genmod_annotate_thousand_g():
             '--thousand-g',
             THOUSAND_G_FILE
         ])
-    
+
     assert result.exit_code == 0
 
 def test_genmod_annotate_cadd():
@@ -73,7 +74,7 @@ def test_genmod_annotate_cadd():
             '--cadd-file',
             CADD_FILE
         ])
-    
+
     assert result.exit_code == 0
 
 def test_genmod_annotate_multiple_cadd():
@@ -87,8 +88,8 @@ def test_genmod_annotate_multiple_cadd():
             CADD_FILE,
             '--cadd-file',
             CADD_1000G_FILE
-            
+
         ])
-    
+
     assert result.exit_code == 0
 
diff --git a/tests/utils/test_get_priority.py b/tests/utils/test_get_priority.py
index 4608e0d..ee6f0ed 100644
--- a/tests/utils/test_get_priority.py
+++ b/tests/utils/test_get_priority.py
@@ -17,6 +17,10 @@ def test_get_MT_priority():
     """docstring for test_get_MT_priority"""
     assert get_chromosome_priority(chrom='MT', chrom_dict={}) == '25'
 
+def test_get_chrM_priority():
+    """docstring for test_get_MT_priority"""
+    assert get_chromosome_priority(chrom='M', chrom_dict={}) == '25'
+
 def test_get_OTHER_priority():
     """docstring for test_get_MT_priority"""
     assert get_chromosome_priority(chrom='GL37', chrom_dict={}) == '26'
@@ -28,4 +32,4 @@ def test_get_chr_prority():
 def test_get_custom_prority():
     """docstring for test_get_chr_prority"""
     assert get_chromosome_priority(chrom='AHA_1', chrom_dict={'AHA_1':2, 'AHA_2':3}) == 2
-    
+

From 151013db63b9dc2905bc3ae3b7a8a55aa9724671 Mon Sep 17 00:00:00 2001
From: jemten <jemten@users.noreply.github.com>
Date: Fri, 13 Dec 2024 16:26:52 +0000
Subject: [PATCH 2/4] update test vcf

---
 tests/fixtures/test_vcf_regions.vcf          | 2 +-
 tests/fixtures/test_vcf_regions_with_chr.vcf | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/fixtures/test_vcf_regions.vcf b/tests/fixtures/test_vcf_regions.vcf
index 14c03c1..645054d 100644
--- a/tests/fixtures/test_vcf_regions.vcf
+++ b/tests/fixtures/test_vcf_regions.vcf
@@ -18,4 +18,4 @@
 10	76154074	.	C	G	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60
 10	76154076	.	G	C	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/0:10,10:60	0/1:10,10:60	./.	0/0:10,10:60	0/1:10,10:60
 X	302253	.	CCCTCCTGCCCCT	C	100	PASS	MQ=1;Annotation=PPP2R3B	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
-MT	4336	.	T	C	100	PASS	MQ=1	GT:AD:GQ	0/0:321,0:60	1/1:0,341:54	1/1:0,341:54	0/0:321,0:60	1/1:0,341:54	1/1:0,341:54
+MT	4336	.	T	C	100	PASS	MQ=1;Annotation=MT-TQ	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
diff --git a/tests/fixtures/test_vcf_regions_with_chr.vcf b/tests/fixtures/test_vcf_regions_with_chr.vcf
index ee012d5..b0069e7 100644
--- a/tests/fixtures/test_vcf_regions_with_chr.vcf
+++ b/tests/fixtures/test_vcf_regions_with_chr.vcf
@@ -18,4 +18,4 @@ chr10	76154073	.	T	G	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	0/0:10,10:60	0/0:10,1
 chr10	76154074	.	C	G	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60
 chr10	76154076	.	G	C	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/0:10,10:60	0/1:10,10:60	./.	0/0:10,10:60	0/1:10,10:60
 chrX	302253	.	CCCTCCTGCCCCT	C	100	PASS	MQ=1;Annotation=PPP2R3B	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
-chrM	4336	.	T	C	100	PASS	MQ=1	GT:AD:GQ	0/0:321,0:60	1/1:0,341:54	1/1:0,341:54	0/0:321,0:60	1/1:0,341:54	1/1:0,341:54
+chrM	4336	.	T	C	100	PASS	MQ=1;Annotation=MT-TQ	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60

From d89f397f66587e806eb13d8c8ea73057d1b9535b Mon Sep 17 00:00:00 2001
From: jemten <jemten@users.noreply.github.com>
Date: Fri, 13 Dec 2024 17:41:27 +0100
Subject: [PATCH 3/4] update changelog

---
 CHANGELOG.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d29c9b3..0ff373d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,7 +13,8 @@ Try to use the following format:
 - The optional fields Source and Version are allowed in the VCF header([#106](https://github.com/Clinical-Genomics/genmod/pull/106))
 - Released the constraint on Python 3.8 (collections, pkg_resources to importlib, tests) ([#142](https://github.com/Clinical-Genomics/genmod/pull/142))
 - Update annotation examples ([#144](https://github.com/Clinical-Genomics/genmod/pull/144))
-- Updated documentation with warning about compounds only being scored within the first family in the VCF ([#151](https://github.com/Clinical-Genomics/genmod/pull/151)) 
+- Updated documentation with warning about compounds only being scored within the first family in the VCF ([#151](https://github.com/Clinical-Genomics/genmod/pull/151))
+- genmod annotate for mitochondrial variants when using the `chrM` notation ([#157](https://github.com/Clinical-Genomics/genmod/pull/157))
 
 ## [3.9]
 - Fixed wrong models when chromosome X was named `chrX` and not `X` ([#135](https://github.com/Clinical-Genomics/genmod/pull/135))

From 4e7dab4fbfde6b546e3526f2a631b58ae713b4d4 Mon Sep 17 00:00:00 2001
From: jemten <jemten@users.noreply.github.com>
Date: Fri, 13 Dec 2024 16:45:59 +0000
Subject: [PATCH 4/4] Remove unused VCF_FILE_38 variable from
 test_annotate_variant.py

---
 tests/functionality/test_annotate_variant.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/functionality/test_annotate_variant.py b/tests/functionality/test_annotate_variant.py
index 67445bd..5bd8d1a 100644
--- a/tests/functionality/test_annotate_variant.py
+++ b/tests/functionality/test_annotate_variant.py
@@ -2,7 +2,6 @@
 from click.testing import CliRunner
 
 VCF_FILE = "tests/fixtures/test_vcf_regions.vcf"
-VCF_FILE_38 = "tests/fixtures/test_vcf_regions_with_chr.vcf"
 EMPTY_VCF_FILE = "tests/fixtures/empty.vcf"
 THOUSAND_G_FILE = "tests/fixtures/annotate_variant/small_1000G.vcf.gz"
 CADD_FILE = "tests/fixtures/annotate_variant/small_CADD.tsv.gz"