Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Annotation update for chrM #157

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ Try to use the following format:
- The optional fields Source and Version are allowed in the VCF header([#106](https://github.com/Clinical-Genomics/genmod/pull/106))
- Released the constraint on Python 3.8 (collections, pkg_resources to importlib, tests) ([#142](https://github.com/Clinical-Genomics/genmod/pull/142))
- Update annotation examples ([#144](https://github.com/Clinical-Genomics/genmod/pull/144))
- Updated documentation with warning about compounds only being scored within the first family in the VCF ([#151](https://github.com/Clinical-Genomics/genmod/pull/151))
- Updated documentation with warning about compounds only being scored within the first family in the VCF ([#151](https://github.com/Clinical-Genomics/genmod/pull/151))
- genmod annotate for mitochondrial variants when using the `chrM` notation ([#157](https://github.com/Clinical-Genomics/genmod/pull/157))

## [3.9]
- Fixed wrong models when chromosome X was named `chrX` and not `X` ([#135](https://github.com/Clinical-Genomics/genmod/pull/135))
Expand Down
4 changes: 2 additions & 2 deletions genmod/annotate_regions/get_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@

def get_region(chrom, start, end, region_trees):
"""Check if a position overlapps any regions

Arguments:
chrom (str): The chromosome
start (int): The start position for the feature
end (int): The stop position for the feature
region_trees (dict): A dictionary with chromosomes as keys and interval trees as values

Returns:
regions (set): The regions that the variant ovelapps
"""
Expand Down
34 changes: 18 additions & 16 deletions genmod/annotate_regions/parse_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""
annotation_parser.py

This script will parse a file with intervals in .bed format and build
This script will parse a file with intervals in .bed format and build
one interval tree for each chromosome.
These intervals typically represents genes

Expand All @@ -29,28 +29,28 @@

def get_interval(start, stop, value):
"""Create an interval instance

Args:
start(int)
stop(int)
value

Returns:
interval(intervaltree.Interval)

"""
interval = Interval(start, stop, value)
return interval

def build_region_trees(bed_lines, padding):
"""Build region trees for each chromosome
Build a dictionary with chromosomes as keys and interval trees as

Build a dictionary with chromosomes as keys and interval trees as
values.

Args:
bed_lines(iterable): An iterable with bed formated lines
padding (int): Defines what should be considered upstream
padding (int): Defines what should be considered upstream
and downstream variants
"""
region_trees = {}
Expand All @@ -71,14 +71,14 @@ def build_region_trees(bed_lines, padding):
def bed_parser(bed_lines, padding=4000):
"""
Parse a file in the bed format.

Arguments:
bed_lines(iterable): An iterable with bed formated lines
padding (int): Defines what should be considered upstream
padding (int): Defines what should be considered upstream
and downstream variants

Yields:
region(dict):
region(dict):
{
'chrom': str,
'start': int,
Expand All @@ -93,22 +93,24 @@ def bed_parser(bed_lines, padding=4000):
feature_id = str(index)
# Get the coordinates for the region:
chrom = line[0].lstrip('chr')
if chrom == 'MT':
if chrom in ['MT', 'M']:
# Only represent the mitochondrial chromosome as M
chrom = 'M'
feature_start = int(line[1])
feature_stop = int(line[2])
else:
feature_start = max(int(line[1]) - padding, 0)
feature_stop = int(line[2])

# Get the feature id
if len(line) > 3:
feature_id = line [3]

region = {
'chrom': chrom,
'start': feature_start,
'stop': feature_stop,
'symbol': feature_id
}

yield region
18 changes: 10 additions & 8 deletions genmod/annotate_variants/annotate.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging

from genmod.annotate_regions.get_features import get_region
from genmod.annotate_variants.read_tabix_files import (get_frequencies,
from genmod.annotate_variants.read_tabix_files import (get_frequencies,
get_spidex_score, get_cadd_scores)

logger = logging.getLogger(__name__)
Expand All @@ -12,29 +12,31 @@ def annotate_variant(variant, annotation_arguments):
chrom = variant_info[0]
if chrom.startswith(('chr', 'CHR', 'Chr')):
chrom = chrom[3:]
elif chrom == 'MT':
chrom = 'M'
pos = int(variant_info[1])
ref = variant_info[3]
alt = variant_info[4]

info = variant_info[7]
if info == '.':
info = []
else:
info = info.split(';')

## TODO this needs to be handeled different for SV:s
start = pos
# This is a construct so that there will not be inconsistent genetic regions
end = pos + 1
# end = pos + max(len(ref), len(alt))

#Check which annotations that are available
regions = None
if 'region_trees' in annotation_arguments:
regions = get_region(chrom, start, end, annotation_arguments['region_trees'])
if regions:
info.append("Annotation={0}".format(','.join(regions)))

if 'exac' in annotation_arguments:
reader = annotation_arguments['exac']
frequencies = get_frequencies(reader, chrom, start, alt)
Expand Down Expand Up @@ -76,7 +78,7 @@ def annotate_variant(variant, annotation_arguments):
info_string = ';'.join(info)
else:
info_string = '.'

variant_info[7] = info_string
return '\t'.join(variant_info)

return '\t'.join(variant_info)
31 changes: 15 additions & 16 deletions genmod/utils/get_priority.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,21 @@
def get_chromosome_priority(chrom, chrom_dict={}):
"""
Return the chromosome priority

Arguments:
chrom (str): The cromosome name from the vcf
chrom_dict (dict): A map of chromosome names and theis priority

Return:
priority (str): The priority for this chromosom
"""
priority = '0'

chrom = chrom.lstrip('chr')

if chrom_dict:
priority = chrom_dict.get(chrom, '0')

else:
try:
if int(chrom) < 23:
Expand All @@ -28,38 +28,38 @@ def get_chromosome_priority(chrom, chrom_dict={}):
priority = '23'
elif chrom == 'Y':
priority = '24'
elif chrom == 'MT':
elif chrom in ['MT', 'M']:
priority = '25'
else:
priority = '26'

return priority

def get_rank_score(variant_line=None, variant_dict=None, family_id=0, rank_score_type: str = 'RankScore'):
"""
Return the rank score priority for a certain family.

If no family is given the first family found is used

Arguments:
variant_line (str): A vcf variant line
variant_dict (dict): A variant dictionary
family_id (str): A family id
rank_score_type(str): Return rank score based on raw or normalized format
See the genmod.score_variants.rank_score_variant_definitions for more info.

Return:
rank_score (str): The rank score for this variant
"""
if rank_score_type not in RANK_SCORE_TYPE_NAMES:
raise ValueError('Unknown rank_score_type', rank_score_type)

rank_score = -100
raw_entry = None

if variant_line:
variant_line = variant_line.split("\t")

for info_annotation in variant_line[7].split(';'):
info_annotation = info_annotation.split('=')
key = None
Expand All @@ -69,10 +69,10 @@ def get_rank_score(variant_line=None, variant_dict=None, family_id=0, rank_score
if key == rank_score_type:
raw_entry = value
break

elif variant_dict:
raw_entry: str = variant_dict['info_dict'].get(rank_score_type)

if raw_entry:
for family_annotation in raw_entry.split(','):
family_annotation = family_annotation.split(':')
Expand All @@ -86,4 +86,3 @@ def get_rank_score(variant_line=None, variant_dict=None, family_id=0, rank_score
rank_score = float(family_annotation[1])

return str(rank_score)

2 changes: 1 addition & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
[pytest]
minversion = 6.0
addopts = -ra -q
testpaths = []
testpaths = tests
27 changes: 25 additions & 2 deletions tests/annotate_regions/test_bed_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,29 @@ def test_bed_parser(bed_lines):
for index, region in enumerate(bed_parser(bed_lines)):
# THEN assert the symbols are found in the bed lines
assert region['symbol'] in symbols

assert index+1 == nr_regions


def test_bed_parser_with_mt_chromosome():
# Given a bed line with the mitochondrial chromosome encoded as MT
bed_lines = [
"MT\t1000\t5000\tgene_MT"
]
# Then the output region should not be padded
expected_output = [
{'chrom': 'M', 'start': 1000, 'stop': 5000, 'symbol': 'gene_MT'}
]
result = list(bed_parser(bed_lines, padding=4000))
assert result == expected_output

def test_bed_parser_with_chrm_chromosome():
# Given a bed line with the mitochondrial chromosome encoded as chrM instead of MT
bed_lines = [
"chrM\t1000\t5000\tgene_chrM"
]
# Then the output region should not be padded and the chrom encoded as M
expected_output = [
{'chrom': 'M', 'start': 1000, 'stop': 5000, 'symbol': 'gene_chrM'}
]
result = list(bed_parser(bed_lines, padding=4000))
assert result == expected_output
6 changes: 3 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,15 @@ def header(request, vcf_path):
with open(vcf_path, 'r') as variant_file:
for line in variant_file:
line = line.rstrip()

if line.startswith('#'):
if line.startswith('##'):
head.parse_meta_data(line)
else:
head.parse_header_line(line)
else:
break

return head


Expand All @@ -72,4 +72,4 @@ def bed_lines(request):
@pytest.fixture(scope='function')
def ensembl_file(request):
"""Return the path to ensembl file with region defenitions"""
return ensembl_path
return ensembl_path
2 changes: 1 addition & 1 deletion tests/fixtures/test_vcf_regions.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@
10 76154074 . C G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60
10 76154076 . G C 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/0:10,10:60 0/1:10,10:60 ./. 0/0:10,10:60 0/1:10,10:60
X 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1;Annotation=PPP2R3B GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60
MT 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1 GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60
MT 4336 . T C 100 PASS MQ=1;Annotation=MT-TQ GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60
2 changes: 1 addition & 1 deletion tests/fixtures/test_vcf_regions_with_chr.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ chr10 76154073 . T G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ 0/0:10,10:60 0/0:10,1
chr10 76154074 . C G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60
chr10 76154076 . G C 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/0:10,10:60 0/1:10,10:60 ./. 0/0:10,10:60 0/1:10,10:60
chrX 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1;Annotation=PPP2R3B GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60
chrM 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1 GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60
chrM 4336 . T C 100 PASS MQ=1;Annotation=MT-TQ GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60
8 changes: 4 additions & 4 deletions tests/functionality/test_annotate_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def test_genmod_annotate_thousand_g():
'--thousand-g',
THOUSAND_G_FILE
])

assert result.exit_code == 0

def test_genmod_annotate_cadd():
Expand All @@ -73,7 +73,7 @@ def test_genmod_annotate_cadd():
'--cadd-file',
CADD_FILE
])

assert result.exit_code == 0

def test_genmod_annotate_multiple_cadd():
Expand All @@ -87,8 +87,8 @@ def test_genmod_annotate_multiple_cadd():
CADD_FILE,
'--cadd-file',
CADD_1000G_FILE

])

assert result.exit_code == 0

6 changes: 5 additions & 1 deletion tests/utils/test_get_priority.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ def test_get_MT_priority():
"""docstring for test_get_MT_priority"""
assert get_chromosome_priority(chrom='MT', chrom_dict={}) == '25'

def test_get_chrM_priority():
"""docstring for test_get_MT_priority"""
assert get_chromosome_priority(chrom='M', chrom_dict={}) == '25'

def test_get_OTHER_priority():
"""docstring for test_get_MT_priority"""
assert get_chromosome_priority(chrom='GL37', chrom_dict={}) == '26'
Expand All @@ -28,4 +32,4 @@ def test_get_chr_prority():
def test_get_custom_prority():
"""docstring for test_get_chr_prority"""
assert get_chromosome_priority(chrom='AHA_1', chrom_dict={'AHA_1':2, 'AHA_2':3}) == 2

Loading