From fb224a73e5dffbc87de568af09520f3c20157884 Mon Sep 17 00:00:00 2001 From: jemten Date: Fri, 13 Dec 2024 16:11:44 +0100 Subject: [PATCH 1/4] fixing parsing of chrM --- genmod/annotate_regions/get_features.py | 4 +-- genmod/annotate_regions/parse_annotations.py | 34 +++++++++++--------- genmod/annotate_variants/annotate.py | 18 ++++++----- genmod/utils/get_priority.py | 31 +++++++++--------- pytest.ini | 2 +- tests/annotate_regions/test_bed_parser.py | 27 ++++++++++++++-- tests/conftest.py | 6 ++-- tests/fixtures/test_vcf_regions.vcf | 2 +- tests/fixtures/test_vcf_regions_with_chr.vcf | 2 +- tests/functionality/test_annotate_variant.py | 9 +++--- tests/utils/test_get_priority.py | 6 +++- 11 files changed, 86 insertions(+), 55 deletions(-) diff --git a/genmod/annotate_regions/get_features.py b/genmod/annotate_regions/get_features.py index 5f8ddd1..d867cc8 100755 --- a/genmod/annotate_regions/get_features.py +++ b/genmod/annotate_regions/get_features.py @@ -15,13 +15,13 @@ def get_region(chrom, start, end, region_trees): """Check if a position overlapps any regions - + Arguments: chrom (str): The chromosome start (int): The start position for the feature end (int): The stop position for the feature region_trees (dict): A dictionary with chromosomes as keys and interval trees as values - + Returns: regions (set): The regions that the variant ovelapps """ diff --git a/genmod/annotate_regions/parse_annotations.py b/genmod/annotate_regions/parse_annotations.py index e00be29..d00b461 100755 --- a/genmod/annotate_regions/parse_annotations.py +++ b/genmod/annotate_regions/parse_annotations.py @@ -3,7 +3,7 @@ """ annotation_parser.py -This script will parse a file with intervals in .bed format and build +This script will parse a file with intervals in .bed format and build one interval tree for each chromosome. These intervals typically represents genes @@ -29,28 +29,28 @@ def get_interval(start, stop, value): """Create an interval instance - + Args: start(int) stop(int) value - + Returns: interval(intervaltree.Interval) - + """ interval = Interval(start, stop, value) return interval def build_region_trees(bed_lines, padding): """Build region trees for each chromosome - - Build a dictionary with chromosomes as keys and interval trees as + + Build a dictionary with chromosomes as keys and interval trees as values. - + Args: bed_lines(iterable): An iterable with bed formated lines - padding (int): Defines what should be considered upstream + padding (int): Defines what should be considered upstream and downstream variants """ region_trees = {} @@ -71,14 +71,14 @@ def build_region_trees(bed_lines, padding): def bed_parser(bed_lines, padding=4000): """ Parse a file in the bed format. - + Arguments: bed_lines(iterable): An iterable with bed formated lines - padding (int): Defines what should be considered upstream + padding (int): Defines what should be considered upstream and downstream variants - + Yields: - region(dict): + region(dict): { 'chrom': str, 'start': int, @@ -93,22 +93,24 @@ def bed_parser(bed_lines, padding=4000): feature_id = str(index) # Get the coordinates for the region: chrom = line[0].lstrip('chr') - if chrom == 'MT': + if chrom in ['MT', 'M']: + # Only represent the mitochondrial chromosome as M + chrom = 'M' feature_start = int(line[1]) feature_stop = int(line[2]) else: feature_start = max(int(line[1]) - padding, 0) feature_stop = int(line[2]) - + # Get the feature id if len(line) > 3: feature_id = line [3] - + region = { 'chrom': chrom, 'start': feature_start, 'stop': feature_stop, 'symbol': feature_id } - + yield region diff --git a/genmod/annotate_variants/annotate.py b/genmod/annotate_variants/annotate.py index 388309b..37b5c63 100644 --- a/genmod/annotate_variants/annotate.py +++ b/genmod/annotate_variants/annotate.py @@ -1,7 +1,7 @@ import logging from genmod.annotate_regions.get_features import get_region -from genmod.annotate_variants.read_tabix_files import (get_frequencies, +from genmod.annotate_variants.read_tabix_files import (get_frequencies, get_spidex_score, get_cadd_scores) logger = logging.getLogger(__name__) @@ -12,29 +12,31 @@ def annotate_variant(variant, annotation_arguments): chrom = variant_info[0] if chrom.startswith(('chr', 'CHR', 'Chr')): chrom = chrom[3:] + elif chrom == 'MT': + chrom = 'M' pos = int(variant_info[1]) ref = variant_info[3] alt = variant_info[4] - + info = variant_info[7] if info == '.': info = [] else: info = info.split(';') - + ## TODO this needs to be handeled different for SV:s start = pos # This is a construct so that there will not be inconsistent genetic regions end = pos + 1 # end = pos + max(len(ref), len(alt)) - + #Check which annotations that are available regions = None if 'region_trees' in annotation_arguments: regions = get_region(chrom, start, end, annotation_arguments['region_trees']) if regions: info.append("Annotation={0}".format(','.join(regions))) - + if 'exac' in annotation_arguments: reader = annotation_arguments['exac'] frequencies = get_frequencies(reader, chrom, start, alt) @@ -76,7 +78,7 @@ def annotate_variant(variant, annotation_arguments): info_string = ';'.join(info) else: info_string = '.' - + variant_info[7] = info_string - - return '\t'.join(variant_info) \ No newline at end of file + + return '\t'.join(variant_info) diff --git a/genmod/utils/get_priority.py b/genmod/utils/get_priority.py index 6f20e78..6d0b31e 100644 --- a/genmod/utils/get_priority.py +++ b/genmod/utils/get_priority.py @@ -4,21 +4,21 @@ def get_chromosome_priority(chrom, chrom_dict={}): """ Return the chromosome priority - + Arguments: chrom (str): The cromosome name from the vcf chrom_dict (dict): A map of chromosome names and theis priority - + Return: priority (str): The priority for this chromosom """ priority = '0' - + chrom = chrom.lstrip('chr') - + if chrom_dict: priority = chrom_dict.get(chrom, '0') - + else: try: if int(chrom) < 23: @@ -28,38 +28,38 @@ def get_chromosome_priority(chrom, chrom_dict={}): priority = '23' elif chrom == 'Y': priority = '24' - elif chrom == 'MT': + elif chrom in ['MT', 'M']: priority = '25' else: priority = '26' - + return priority def get_rank_score(variant_line=None, variant_dict=None, family_id=0, rank_score_type: str = 'RankScore'): """ Return the rank score priority for a certain family. - + If no family is given the first family found is used - + Arguments: variant_line (str): A vcf variant line variant_dict (dict): A variant dictionary family_id (str): A family id rank_score_type(str): Return rank score based on raw or normalized format See the genmod.score_variants.rank_score_variant_definitions for more info. - + Return: rank_score (str): The rank score for this variant """ if rank_score_type not in RANK_SCORE_TYPE_NAMES: raise ValueError('Unknown rank_score_type', rank_score_type) - + rank_score = -100 raw_entry = None - + if variant_line: variant_line = variant_line.split("\t") - + for info_annotation in variant_line[7].split(';'): info_annotation = info_annotation.split('=') key = None @@ -69,10 +69,10 @@ def get_rank_score(variant_line=None, variant_dict=None, family_id=0, rank_score if key == rank_score_type: raw_entry = value break - + elif variant_dict: raw_entry: str = variant_dict['info_dict'].get(rank_score_type) - + if raw_entry: for family_annotation in raw_entry.split(','): family_annotation = family_annotation.split(':') @@ -86,4 +86,3 @@ def get_rank_score(variant_line=None, variant_dict=None, family_id=0, rank_score rank_score = float(family_annotation[1]) return str(rank_score) - \ No newline at end of file diff --git a/pytest.ini b/pytest.ini index a8cde3d..313c0bf 100644 --- a/pytest.ini +++ b/pytest.ini @@ -2,4 +2,4 @@ [pytest] minversion = 6.0 addopts = -ra -q -testpaths = [] \ No newline at end of file +testpaths = tests diff --git a/tests/annotate_regions/test_bed_parser.py b/tests/annotate_regions/test_bed_parser.py index 477d180..351f726 100644 --- a/tests/annotate_regions/test_bed_parser.py +++ b/tests/annotate_regions/test_bed_parser.py @@ -12,6 +12,29 @@ def test_bed_parser(bed_lines): for index, region in enumerate(bed_parser(bed_lines)): # THEN assert the symbols are found in the bed lines assert region['symbol'] in symbols - + assert index+1 == nr_regions - \ No newline at end of file + +def test_bed_parser_with_mt_chromosome(): + # Given a bed line with the mitochondrial chromosome encoded as MT + bed_lines = [ + "MT\t1000\t5000\tgene_MT" + ] + # Then the output region should not be padded + expected_output = [ + {'chrom': 'M', 'start': 1000, 'stop': 5000, 'symbol': 'gene_MT'} + ] + result = list(bed_parser(bed_lines, padding=4000)) + assert result == expected_output + +def test_bed_parser_with_chrm_chromosome(): + # Given a bed line with the mitochondrial chromosome encoded as chrM instead of MT + bed_lines = [ + "chrM\t1000\t5000\tgene_chrM" + ] + # Then the output region should not be padded and the chrom encoded as M + expected_output = [ + {'chrom': 'M', 'start': 1000, 'stop': 5000, 'symbol': 'gene_chrM'} + ] + result = list(bed_parser(bed_lines, padding=4000)) + assert result == expected_output diff --git a/tests/conftest.py b/tests/conftest.py index a884d19..1ab9fd4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -46,7 +46,7 @@ def header(request, vcf_path): with open(vcf_path, 'r') as variant_file: for line in variant_file: line = line.rstrip() - + if line.startswith('#'): if line.startswith('##'): head.parse_meta_data(line) @@ -54,7 +54,7 @@ def header(request, vcf_path): head.parse_header_line(line) else: break - + return head @@ -72,4 +72,4 @@ def bed_lines(request): @pytest.fixture(scope='function') def ensembl_file(request): """Return the path to ensembl file with region defenitions""" - return ensembl_path \ No newline at end of file + return ensembl_path diff --git a/tests/fixtures/test_vcf_regions.vcf b/tests/fixtures/test_vcf_regions.vcf index b13f977..14c03c1 100644 --- a/tests/fixtures/test_vcf_regions.vcf +++ b/tests/fixtures/test_vcf_regions.vcf @@ -18,4 +18,4 @@ 10 76154074 . C G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 10 76154076 . G C 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/0:10,10:60 0/1:10,10:60 ./. 0/0:10,10:60 0/1:10,10:60 X 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1;Annotation=PPP2R3B GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60 -MT 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1 GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60 +MT 4336 . T C 100 PASS MQ=1 GT:AD:GQ 0/0:321,0:60 1/1:0,341:54 1/1:0,341:54 0/0:321,0:60 1/1:0,341:54 1/1:0,341:54 diff --git a/tests/fixtures/test_vcf_regions_with_chr.vcf b/tests/fixtures/test_vcf_regions_with_chr.vcf index 51eb0ef..ee012d5 100644 --- a/tests/fixtures/test_vcf_regions_with_chr.vcf +++ b/tests/fixtures/test_vcf_regions_with_chr.vcf @@ -18,4 +18,4 @@ chr10 76154073 . T G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ 0/0:10,10:60 0/0:10,1 chr10 76154074 . C G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 chr10 76154076 . G C 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/0:10,10:60 0/1:10,10:60 ./. 0/0:10,10:60 0/1:10,10:60 chrX 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1;Annotation=PPP2R3B GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60 -chrM 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1 GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60 +chrM 4336 . T C 100 PASS MQ=1 GT:AD:GQ 0/0:321,0:60 1/1:0,341:54 1/1:0,341:54 0/0:321,0:60 1/1:0,341:54 1/1:0,341:54 diff --git a/tests/functionality/test_annotate_variant.py b/tests/functionality/test_annotate_variant.py index d040a3b..67445bd 100644 --- a/tests/functionality/test_annotate_variant.py +++ b/tests/functionality/test_annotate_variant.py @@ -2,6 +2,7 @@ from click.testing import CliRunner VCF_FILE = "tests/fixtures/test_vcf_regions.vcf" +VCF_FILE_38 = "tests/fixtures/test_vcf_regions_with_chr.vcf" EMPTY_VCF_FILE = "tests/fixtures/empty.vcf" THOUSAND_G_FILE = "tests/fixtures/annotate_variant/small_1000G.vcf.gz" CADD_FILE = "tests/fixtures/annotate_variant/small_CADD.tsv.gz" @@ -60,7 +61,7 @@ def test_genmod_annotate_thousand_g(): '--thousand-g', THOUSAND_G_FILE ]) - + assert result.exit_code == 0 def test_genmod_annotate_cadd(): @@ -73,7 +74,7 @@ def test_genmod_annotate_cadd(): '--cadd-file', CADD_FILE ]) - + assert result.exit_code == 0 def test_genmod_annotate_multiple_cadd(): @@ -87,8 +88,8 @@ def test_genmod_annotate_multiple_cadd(): CADD_FILE, '--cadd-file', CADD_1000G_FILE - + ]) - + assert result.exit_code == 0 diff --git a/tests/utils/test_get_priority.py b/tests/utils/test_get_priority.py index 4608e0d..ee6f0ed 100644 --- a/tests/utils/test_get_priority.py +++ b/tests/utils/test_get_priority.py @@ -17,6 +17,10 @@ def test_get_MT_priority(): """docstring for test_get_MT_priority""" assert get_chromosome_priority(chrom='MT', chrom_dict={}) == '25' +def test_get_chrM_priority(): + """docstring for test_get_MT_priority""" + assert get_chromosome_priority(chrom='M', chrom_dict={}) == '25' + def test_get_OTHER_priority(): """docstring for test_get_MT_priority""" assert get_chromosome_priority(chrom='GL37', chrom_dict={}) == '26' @@ -28,4 +32,4 @@ def test_get_chr_prority(): def test_get_custom_prority(): """docstring for test_get_chr_prority""" assert get_chromosome_priority(chrom='AHA_1', chrom_dict={'AHA_1':2, 'AHA_2':3}) == 2 - + From 151013db63b9dc2905bc3ae3b7a8a55aa9724671 Mon Sep 17 00:00:00 2001 From: jemten Date: Fri, 13 Dec 2024 16:26:52 +0000 Subject: [PATCH 2/4] update test vcf --- tests/fixtures/test_vcf_regions.vcf | 2 +- tests/fixtures/test_vcf_regions_with_chr.vcf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/fixtures/test_vcf_regions.vcf b/tests/fixtures/test_vcf_regions.vcf index 14c03c1..645054d 100644 --- a/tests/fixtures/test_vcf_regions.vcf +++ b/tests/fixtures/test_vcf_regions.vcf @@ -18,4 +18,4 @@ 10 76154074 . C G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 10 76154076 . G C 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/0:10,10:60 0/1:10,10:60 ./. 0/0:10,10:60 0/1:10,10:60 X 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1;Annotation=PPP2R3B GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60 -MT 4336 . T C 100 PASS MQ=1 GT:AD:GQ 0/0:321,0:60 1/1:0,341:54 1/1:0,341:54 0/0:321,0:60 1/1:0,341:54 1/1:0,341:54 +MT 4336 . T C 100 PASS MQ=1;Annotation=MT-TQ GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60 diff --git a/tests/fixtures/test_vcf_regions_with_chr.vcf b/tests/fixtures/test_vcf_regions_with_chr.vcf index ee012d5..b0069e7 100644 --- a/tests/fixtures/test_vcf_regions_with_chr.vcf +++ b/tests/fixtures/test_vcf_regions_with_chr.vcf @@ -18,4 +18,4 @@ chr10 76154073 . T G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ 0/0:10,10:60 0/0:10,1 chr10 76154074 . C G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 chr10 76154076 . G C 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/0:10,10:60 0/1:10,10:60 ./. 0/0:10,10:60 0/1:10,10:60 chrX 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1;Annotation=PPP2R3B GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60 -chrM 4336 . T C 100 PASS MQ=1 GT:AD:GQ 0/0:321,0:60 1/1:0,341:54 1/1:0,341:54 0/0:321,0:60 1/1:0,341:54 1/1:0,341:54 +chrM 4336 . T C 100 PASS MQ=1;Annotation=MT-TQ GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60 From d89f397f66587e806eb13d8c8ea73057d1b9535b Mon Sep 17 00:00:00 2001 From: jemten Date: Fri, 13 Dec 2024 17:41:27 +0100 Subject: [PATCH 3/4] update changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d29c9b3..0ff373d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,8 @@ Try to use the following format: - The optional fields Source and Version are allowed in the VCF header([#106](https://github.com/Clinical-Genomics/genmod/pull/106)) - Released the constraint on Python 3.8 (collections, pkg_resources to importlib, tests) ([#142](https://github.com/Clinical-Genomics/genmod/pull/142)) - Update annotation examples ([#144](https://github.com/Clinical-Genomics/genmod/pull/144)) -- Updated documentation with warning about compounds only being scored within the first family in the VCF ([#151](https://github.com/Clinical-Genomics/genmod/pull/151)) +- Updated documentation with warning about compounds only being scored within the first family in the VCF ([#151](https://github.com/Clinical-Genomics/genmod/pull/151)) +- genmod annotate for mitochondrial variants when using the `chrM` notation ([#157](https://github.com/Clinical-Genomics/genmod/pull/157)) ## [3.9] - Fixed wrong models when chromosome X was named `chrX` and not `X` ([#135](https://github.com/Clinical-Genomics/genmod/pull/135)) From 4e7dab4fbfde6b546e3526f2a631b58ae713b4d4 Mon Sep 17 00:00:00 2001 From: jemten Date: Fri, 13 Dec 2024 16:45:59 +0000 Subject: [PATCH 4/4] Remove unused VCF_FILE_38 variable from test_annotate_variant.py --- tests/functionality/test_annotate_variant.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/functionality/test_annotate_variant.py b/tests/functionality/test_annotate_variant.py index 67445bd..5bd8d1a 100644 --- a/tests/functionality/test_annotate_variant.py +++ b/tests/functionality/test_annotate_variant.py @@ -2,7 +2,6 @@ from click.testing import CliRunner VCF_FILE = "tests/fixtures/test_vcf_regions.vcf" -VCF_FILE_38 = "tests/fixtures/test_vcf_regions_with_chr.vcf" EMPTY_VCF_FILE = "tests/fixtures/empty.vcf" THOUSAND_G_FILE = "tests/fixtures/annotate_variant/small_1000G.vcf.gz" CADD_FILE = "tests/fixtures/annotate_variant/small_CADD.tsv.gz"