Clinical-Genomics · jemten · Dec 13, 2024 · Dec 13, 2024 · Dec 13, 2024 · Dec 13, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,7 +13,8 @@ Try to use the following format:
 - The optional fields Source and Version are allowed in the VCF header([#106](https://github.com/Clinical-Genomics/genmod/pull/106))
 - Released the constraint on Python 3.8 (collections, pkg_resources to importlib, tests) ([#142](https://github.com/Clinical-Genomics/genmod/pull/142))
 - Update annotation examples ([#144](https://github.com/Clinical-Genomics/genmod/pull/144))
-- Updated documentation with warning about compounds only being scored within the first family in the VCF ([#151](https://github.com/Clinical-Genomics/genmod/pull/151)) 
+- Updated documentation with warning about compounds only being scored within the first family in the VCF ([#151](https://github.com/Clinical-Genomics/genmod/pull/151))
+- genmod annotate for mitochondrial variants when using the `chrM` notation ([#157](https://github.com/Clinical-Genomics/genmod/pull/157))
 
 ## [3.9]
 - Fixed wrong models when chromosome X was named `chrX` and not `X` ([#135](https://github.com/Clinical-Genomics/genmod/pull/135))

diff --git a/genmod/annotate_regions/get_features.py b/genmod/annotate_regions/get_features.py
@@ -15,13 +15,13 @@
 
 def get_region(chrom, start, end, region_trees):
     """Check if a position overlapps any regions
-    
+
     Arguments:
        chrom (str): The chromosome
        start (int): The start position for the feature
        end (int): The stop position for the feature
        region_trees (dict): A dictionary with chromosomes as keys and interval trees as values
-    
+
     Returns:
         regions (set): The regions that the variant ovelapps
     """

diff --git a/genmod/annotate_regions/parse_annotations.py b/genmod/annotate_regions/parse_annotations.py
@@ -3,7 +3,7 @@
 """
 annotation_parser.py
 
-This script will parse a file with intervals in .bed format and build 
+This script will parse a file with intervals in .bed format and build
 one interval tree for each chromosome.
 These intervals typically represents genes
 
@@ -29,28 +29,28 @@
 
 def get_interval(start, stop, value):
     """Create an interval instance
-    
+
         Args:
             start(int)
             stop(int)
             value
-        
+
         Returns:
             interval(intervaltree.Interval)
-    
+
     """
     interval = Interval(start, stop, value)
     return interval
 
 def build_region_trees(bed_lines, padding):
     """Build region trees for each chromosome
-    
-        Build a dictionary with chromosomes as keys and interval trees as 
+
+        Build a dictionary with chromosomes as keys and interval trees as
         values.
-    
+
         Args:
             bed_lines(iterable): An iterable with bed formated lines
-            padding (int): Defines what should be considered upstream 
+            padding (int): Defines what should be considered upstream
                            and downstream variants
     """
     region_trees = {}
@@ -71,14 +71,14 @@ def build_region_trees(bed_lines, padding):
 def bed_parser(bed_lines, padding=4000):
     """
     Parse a file in the bed format.
-    
+
     Arguments:
         bed_lines(iterable): An iterable with bed formated lines
-        padding (int): Defines what should be considered upstream 
+        padding (int): Defines what should be considered upstream
                        and downstream variants
-    
+
     Yields:
-        region(dict): 
+        region(dict):
                     {
                         'chrom': str,
                         'start': int,
@@ -93,22 +93,24 @@ def bed_parser(bed_lines, padding=4000):
             feature_id = str(index)
             # Get the coordinates for the region:
             chrom = line[0].lstrip('chr')
-            if chrom == 'MT':
+            if chrom in ['MT', 'M']:
+                # Only represent the mitochondrial chromosome as M
+                chrom = 'M'
                 feature_start = int(line[1])
                 feature_stop = int(line[2])
             else:
                 feature_start = max(int(line[1]) - padding, 0)
                 feature_stop = int(line[2])
-                
+
             # Get the feature id
             if len(line) > 3:
                 feature_id = line [3]
-            
+
             region = {
                 'chrom': chrom,
                 'start': feature_start,
                 'stop': feature_stop,
                 'symbol': feature_id
             }
-            
+
             yield region
diff --git a/genmod/annotate_variants/annotate.py b/genmod/annotate_variants/annotate.py
@@ -1,7 +1,7 @@
 import logging
 
 from genmod.annotate_regions.get_features import get_region
-from genmod.annotate_variants.read_tabix_files import (get_frequencies, 
+from genmod.annotate_variants.read_tabix_files import (get_frequencies,
      get_spidex_score, get_cadd_scores)
 
 logger = logging.getLogger(__name__)
@@ -12,29 +12,31 @@ def annotate_variant(variant, annotation_arguments):
     chrom = variant_info[0]
     if chrom.startswith(('chr', 'CHR', 'Chr')):
         chrom = chrom[3:]
+    elif chrom == 'MT':
+        chrom = 'M'
     pos = int(variant_info[1])
     ref = variant_info[3]
     alt = variant_info[4]
-    
+
     info = variant_info[7]
     if info == '.':
         info = []
     else:
         info = info.split(';')
-    
+
     ## TODO this needs to be handeled different for SV:s
     start = pos
     # This is a construct so that there will not be inconsistent genetic regions
     end = pos + 1
     # end = pos + max(len(ref), len(alt))
-    
+
     #Check which annotations that are available
     regions = None
     if 'region_trees' in annotation_arguments:
         regions = get_region(chrom, start, end, annotation_arguments['region_trees'])
         if regions:
             info.append("Annotation={0}".format(','.join(regions)))
-    
+
     if 'exac' in annotation_arguments:
         reader = annotation_arguments['exac']
         frequencies = get_frequencies(reader, chrom, start, alt)
@@ -76,7 +78,7 @@ def annotate_variant(variant, annotation_arguments):
         info_string = ';'.join(info)
     else:
         info_string = '.'
-    
+
     variant_info[7] = info_string
-    
-    return '\t'.join(variant_info)
+
+    return '\t'.join(variant_info)
diff --git a/genmod/utils/get_priority.py b/genmod/utils/get_priority.py
@@ -4,21 +4,21 @@
 def get_chromosome_priority(chrom, chrom_dict={}):
     """
     Return the chromosome priority
-    
+
     Arguments:
         chrom (str): The cromosome name from the vcf
         chrom_dict (dict): A map of chromosome names and theis priority
-    
+
     Return:
         priority (str): The priority for this chromosom
     """
     priority = '0'
-    
+
     chrom = chrom.lstrip('chr')
-    
+
     if chrom_dict:
         priority = chrom_dict.get(chrom, '0')
-    
+
     else:
         try:
             if int(chrom) < 23:
@@ -28,38 +28,38 @@ def get_chromosome_priority(chrom, chrom_dict={}):
                 priority = '23'
             elif chrom == 'Y':
                 priority = '24'
-            elif chrom == 'MT':
+            elif chrom in ['MT', 'M']:
                 priority = '25'
             else:
                 priority = '26'
-    
+
     return priority
 
 def get_rank_score(variant_line=None, variant_dict=None, family_id=0, rank_score_type: str = 'RankScore'):
     """
     Return the rank score priority for a certain family.
-    
+
     If no family is given the first family found is used
-    
+
     Arguments:
         variant_line (str): A vcf variant line
         variant_dict (dict): A variant dictionary
         family_id (str): A family id
         rank_score_type(str): Return rank score based on raw or normalized format
         See the genmod.score_variants.rank_score_variant_definitions for more info.
-    
+
     Return:
         rank_score (str): The rank score for this variant
     """
     if rank_score_type not in RANK_SCORE_TYPE_NAMES:
         raise ValueError('Unknown rank_score_type', rank_score_type)
-    
+
     rank_score = -100
     raw_entry = None
-    
+
     if variant_line:
         variant_line = variant_line.split("\t")
-    
+
         for info_annotation in variant_line[7].split(';'):
             info_annotation = info_annotation.split('=')
             key = None
@@ -69,10 +69,10 @@ def get_rank_score(variant_line=None, variant_dict=None, family_id=0, rank_score
             if key == rank_score_type:
                 raw_entry = value
                 break
-    
+
     elif variant_dict:
         raw_entry: str = variant_dict['info_dict'].get(rank_score_type)
-    
+
     if raw_entry:
         for family_annotation in raw_entry.split(','):
             family_annotation = family_annotation.split(':')
@@ -86,4 +86,3 @@ def get_rank_score(variant_line=None, variant_dict=None, family_id=0, rank_score
                 rank_score = float(family_annotation[1])
 
     return str(rank_score)
-
diff --git a/pytest.ini b/pytest.ini
@@ -2,4 +2,4 @@
 [pytest]
 minversion = 6.0
 addopts = -ra -q
-testpaths = []
+testpaths = tests
diff --git a/tests/annotate_regions/test_bed_parser.py b/tests/annotate_regions/test_bed_parser.py
@@ -12,6 +12,29 @@ def test_bed_parser(bed_lines):
     for index, region in enumerate(bed_parser(bed_lines)):
     # THEN assert the symbols are found in the bed lines
         assert region['symbol'] in symbols
-    
+
     assert index+1 == nr_regions
-
+
+def test_bed_parser_with_mt_chromosome():
+    # Given a bed line with the mitochondrial chromosome encoded as MT
+    bed_lines = [
+        "MT\t1000\t5000\tgene_MT"
+    ]
+    # Then the output region should not be padded
+    expected_output = [
+        {'chrom': 'M', 'start': 1000, 'stop': 5000, 'symbol': 'gene_MT'}
+    ]
+    result = list(bed_parser(bed_lines, padding=4000))
+    assert result == expected_output
+
+def test_bed_parser_with_chrm_chromosome():
+    # Given a bed line with the mitochondrial chromosome encoded as chrM instead of MT
+    bed_lines = [
+        "chrM\t1000\t5000\tgene_chrM"
+    ]
+    # Then the output region should not be padded and the chrom encoded as M
+    expected_output = [
+        {'chrom': 'M', 'start': 1000, 'stop': 5000, 'symbol': 'gene_chrM'}
+    ]
+    result = list(bed_parser(bed_lines, padding=4000))
+    assert result == expected_output
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -46,15 +46,15 @@ def header(request, vcf_path):
     with open(vcf_path, 'r') as variant_file:
         for line in variant_file:
             line = line.rstrip()
-        
+
             if line.startswith('#'):
                 if line.startswith('##'):
                     head.parse_meta_data(line)
                 else:
                     head.parse_header_line(line)
             else:
                 break
-    
+
     return head
 
 
@@ -72,4 +72,4 @@ def bed_lines(request):
 @pytest.fixture(scope='function')
 def ensembl_file(request):
     """Return the path to ensembl file with region defenitions"""
-    return ensembl_path
+    return ensembl_path
diff --git a/tests/fixtures/test_vcf_regions.vcf b/tests/fixtures/test_vcf_regions.vcf
@@ -18,4 +18,4 @@
 10	76154074	.	C	G	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60
 10	76154076	.	G	C	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/0:10,10:60	0/1:10,10:60	./.	0/0:10,10:60	0/1:10,10:60
 X	302253	.	CCCTCCTGCCCCT	C	100	PASS	MQ=1;Annotation=PPP2R3B	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
-MT	302253	.	CCCTCCTGCCCCT	C	100	PASS	MQ=1	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
+MT	4336	.	T	C	100	PASS	MQ=1;Annotation=MT-TQ	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
diff --git a/tests/fixtures/test_vcf_regions_with_chr.vcf b/tests/fixtures/test_vcf_regions_with_chr.vcf
@@ -18,4 +18,4 @@ chr10	76154073	.	T	G	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	0/0:10,10:60	0/0:10,1
 chr10	76154074	.	C	G	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60
 chr10	76154076	.	G	C	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/0:10,10:60	0/1:10,10:60	./.	0/0:10,10:60	0/1:10,10:60
 chrX	302253	.	CCCTCCTGCCCCT	C	100	PASS	MQ=1;Annotation=PPP2R3B	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
-chrM	302253	.	CCCTCCTGCCCCT	C	100	PASS	MQ=1	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
+chrM	4336	.	T	C	100	PASS	MQ=1;Annotation=MT-TQ	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
diff --git a/tests/functionality/test_annotate_variant.py b/tests/functionality/test_annotate_variant.py
@@ -60,7 +60,7 @@ def test_genmod_annotate_thousand_g():
             '--thousand-g',
             THOUSAND_G_FILE
         ])
-    
+
     assert result.exit_code == 0
 
 def test_genmod_annotate_cadd():
@@ -73,7 +73,7 @@ def test_genmod_annotate_cadd():
             '--cadd-file',
             CADD_FILE
         ])
-    
+
     assert result.exit_code == 0
 
 def test_genmod_annotate_multiple_cadd():
@@ -87,8 +87,8 @@ def test_genmod_annotate_multiple_cadd():
             CADD_FILE,
             '--cadd-file',
             CADD_1000G_FILE
-            
+
         ])
-    
+
     assert result.exit_code == 0
 
diff --git a/tests/utils/test_get_priority.py b/tests/utils/test_get_priority.py
@@ -17,6 +17,10 @@ def test_get_MT_priority():
     """docstring for test_get_MT_priority"""
     assert get_chromosome_priority(chrom='MT', chrom_dict={}) == '25'
 
+def test_get_chrM_priority():
+    """docstring for test_get_MT_priority"""
+    assert get_chromosome_priority(chrom='M', chrom_dict={}) == '25'
+
 def test_get_OTHER_priority():
     """docstring for test_get_MT_priority"""
     assert get_chromosome_priority(chrom='GL37', chrom_dict={}) == '26'
@@ -28,4 +32,4 @@ def test_get_chr_prority():
 def test_get_custom_prority():
     """docstring for test_get_chr_prority"""
     assert get_chromosome_priority(chrom='AHA_1', chrom_dict={'AHA_1':2, 'AHA_2':3}) == 2
-    
+