Skip to content

Commit

Permalink
Ensure chrX and X are treated equally
Browse files Browse the repository at this point in the history
  • Loading branch information
fellen31 committed Sep 24, 2024
1 parent d8090a2 commit 2c4c2f8
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 13 deletions.
2 changes: 1 addition & 1 deletion genmod/annotate_models/genetic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def check_genetic_models(variant_batch, families, phased = False,

# Only check X-linked for the variants in the X-chromosome:
# For X-linked we do not need to check the other models
if variant['CHROM'] == 'X':
if variant['CHROM'] in ['X', 'chrX']:
if check_X_recessive(variant, family, strict):
variant['inheritance_models'][family_id]['XR'] = True
for individual_id in individuals:
Expand Down
21 changes: 21 additions & 0 deletions tests/fixtures/test_vcf_regions_with_chr.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
##fileformat=VCFv4.1
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=Annotation,Number=.,Type=String,Description="Annotates what feature(s) this variant belongs to.">
##contig=<ID=1,length=249250621,assembly=b37>
##reference=file:///humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT father mother proband father_2 mother_2 proband_2
chr1 879537 . T C 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60
chr1 879541 . G A 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ ./. 0/1:10,10:60 1/1:10,10:60 ./. 0/1:10,10:60 0/1:10,10:60
chr1 879595 . C T 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 0/0:10,10:60 1/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60
chr1 879676 . G A 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 1/1:10,10:60 1/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60
chr1 879911 . G A 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60
chr1 880012 . A G 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60
chr1 880086 . T C 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60
chr1 880199 . G A 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60
chr1 880217 . T G 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60
chr10 76154051 . A G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60
chr10 76154073 . T G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60
chr10 76154074 . C G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60
chr10 76154076 . G C 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/0:10,10:60 0/1:10,10:60 ./. 0/0:10,10:60 0/1:10,10:60
chrX 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1;Annotation=PPP2R3B GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60
chrM 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1 GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60
98 changes: 86 additions & 12 deletions tests/functionality/test_annotate_models.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from build.lib.genmod.annotate_models import genetic_models
from genmod.commands import models_command
from tempfile import NamedTemporaryFile
from typing import Dict, Union
from click.testing import CliRunner
from genmod.vcf_tools import HeaderParser, get_variant_dict, get_info_dict

ANNOTATED_VCF_FILE = "tests/fixtures/test_vcf_annotated.vcf"
VCF_FILE = "tests/fixtures/test_vcf_regions.vcf"
VCF_FILE_WITH_CHR = "tests/fixtures/test_vcf_regions_with_chr.vcf"
FAMILY_FILE = "tests/fixtures/recessive_trio.ped"
BAD_FAMILY_FILE = "tests/fixtures/annotate_models/one_ind.ped"
EMPTY_VCF_FILE = "tests/fixtures/empty.vcf"
Expand All @@ -11,6 +16,65 @@
from genmod.log import init_log
init_log(logger, loglevel="INFO")

def _parse_variant_file(file_path: str) -> HeaderParser:
"""
Parse VCF header fields
:param file_path: VCF to be read
:raises ValueError: in case file is empty
"""
with open(file_path, 'r') as variant_file:
head = HeaderParser()
for line_index, line in enumerate(variant_file):
line = line.rstrip()
if line.startswith('#'):
if line.startswith('##'):
head.parse_meta_data(line)
else:
head.parse_header_line(line)
else:
break
if line_index == 0:
raise ValueError('Expected contents in file, got none')
return head


def _generate_variants_from_file(file_path: str) -> Dict[str, Union[str, int, float]]:
"""
Yield variants from VCF file.
:param file_path: VCF to be read
"""
header = _parse_variant_file(file_path=file_path)
with open(file_path, 'r') as variant_file:
for line in variant_file:
if line.startswith('#'):
continue
variant: Dict[str, str] = get_variant_dict(line, header.header)
variant['info_dict'] = get_info_dict(variant['INFO'])
yield variant

def _generate_genetic_models_string_from_file(file_path: str) -> str:
"""
Yield genetic model string from VCF.
:param file_path: VCF to be read
"""
for variant in _generate_variants_from_file(file_path=file_path):
genetic_models_entry: str = variant['info_dict'].get('GeneticModels', '')
for family_genetic_models in genetic_models_entry.split(','):
family_genetic_models = family_genetic_models.split(':')
if len(family_genetic_models) > 1: # Not all variants will have a model
genetic_models: str = family_genetic_models[1]
yield genetic_models

def _run_model_command(vcf_file):
"""Helper function to run models_command and return output as a list."""
runner = CliRunner()
result = runner.invoke(models_command, [vcf_file, '-f', FAMILY_FILE])
assert result.exit_code == 0, f"Command failed with exit code: {result.exit_code}"

with NamedTemporaryFile(delete=False) as temp_file:
temp_file.write(result.stdout_bytes)
temp_file.seek(0) # Move back to the start of the file
return list(_generate_genetic_models_string_from_file(temp_file.name))

def test_genmod_annotate_models_no_family():
"""docstring for test_genmod_annotate_models"""
Expand All @@ -29,8 +93,7 @@ def test_genmod_annotate_models():
FAMILY_FILE
]
)

print(result.output)
print(result.output)
assert result.exit_code == 0

def test_genmod_annotate_models_empty_vcf():
Expand All @@ -43,7 +106,6 @@ def test_genmod_annotate_models_empty_vcf():
]
)

print(result.output)
assert result.exit_code == 0

def test_annotate_models_already_annotated():
Expand All @@ -59,13 +121,25 @@ def test_annotate_models_already_annotated():
assert result.exit_code == 1

def test_annotate_models_lacking_ind():
"""docstring for test_genmod_annotate_models"""
runner = CliRunner()
result = runner.invoke(models_command, [
VCF_FILE,
'-f',
BAD_FAMILY_FILE
]
)
"""docstring for test_genmod_annotate_models"""
runner = CliRunner()
result = runner.invoke(models_command, [
VCF_FILE,
'-f',
BAD_FAMILY_FILE
]
)

assert result.exit_code == 1

def test_annotate_models_chr_prefix():
"""Test that genetic models are identical for VCF with and without 'chr' prefix."""

assert result.exit_code == 1
# Get models from both VCF files
models_list = _run_model_command(VCF_FILE)
models_list_with_chr = _run_model_command(VCF_FILE_WITH_CHR)
print(models_list)
print(models_list_with_chr)
# Assert that the lists of models are identical
assert len(models_list) > 0 and len(models_list_with_chr) > 0, "No models in VCFs"
assert models_list == models_list_with_chr, "Models differ between VCF files."

0 comments on commit 2c4c2f8

Please sign in to comment.