Skip to content

Commit

Permalink
Merge pull request #135 from fellen31/fix-chr
Browse files Browse the repository at this point in the history
Ensure chrX and X are treated equally
  • Loading branch information
fellen31 authored Oct 8, 2024
2 parents 68bc1ed + bd0d89d commit 5bee959
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 44 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ Please add a new candidate release at the top after changing the latest one. Fee

Try to use the following format:

## [unreleased]
- Fixed wrong models when chromosome X was named `chrX` and not `X`

## [3.8.3]

- Fixed unstable compounds order in models output ([#134](https://github.com/Clinical-Genomics/genmod/pull/134)))
Expand Down
2 changes: 1 addition & 1 deletion genmod/annotate_models/genetic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def check_genetic_models(variant_batch, families, phased = False,

# Only check X-linked for the variants in the X-chromosome:
# For X-linked we do not need to check the other models
if variant['CHROM'] == 'X':
if variant['CHROM'] in ['X', 'chrX']:
if check_X_recessive(variant, family, strict):
variant['inheritance_models'][family_id]['XR'] = True
for individual_id in individuals:
Expand Down
21 changes: 21 additions & 0 deletions tests/fixtures/test_vcf_regions_with_chr.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
##fileformat=VCFv4.1
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=Annotation,Number=.,Type=String,Description="Annotates what feature(s) this variant belongs to.">
##contig=<ID=1,length=249250621,assembly=b37>
##reference=file:///humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT father mother proband father_2 mother_2 proband_2
chr1 879537 . T C 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60
chr1 879541 . G A 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ ./. 0/1:10,10:60 1/1:10,10:60 ./. 0/1:10,10:60 0/1:10,10:60
chr1 879595 . C T 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 0/0:10,10:60 1/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60
chr1 879676 . G A 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 1/1:10,10:60 1/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60
chr1 879911 . G A 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60
chr1 880012 . A G 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60
chr1 880086 . T C 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60
chr1 880199 . G A 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60
chr1 880217 . T G 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60
chr10 76154051 . A G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60
chr10 76154073 . T G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60
chr10 76154074 . C G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60
chr10 76154076 . G C 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/0:10,10:60 0/1:10,10:60 ./. 0/0:10,10:60 0/1:10,10:60
chrX 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1;Annotation=PPP2R3B GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60
chrM 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1 GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60
43 changes: 41 additions & 2 deletions tests/functionality/test_annotate_models.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,44 @@
from genmod.commands import models_command
from tempfile import NamedTemporaryFile
from typing import Dict, Union
from click.testing import CliRunner

ANNOTATED_VCF_FILE = "tests/fixtures/test_vcf_annotated.vcf"
VCF_FILE = "tests/fixtures/test_vcf_regions.vcf"
VCF_FILE_WITH_CHR = "tests/fixtures/test_vcf_regions_with_chr.vcf"
FAMILY_FILE = "tests/fixtures/recessive_trio.ped"
BAD_FAMILY_FILE = "tests/fixtures/annotate_models/one_ind.ped"
EMPTY_VCF_FILE = "tests/fixtures/empty.vcf"

from genmod import logger
from genmod.log import init_log
from test_utils import generate_variants_from_file

init_log(logger, loglevel="INFO")

def _generate_genetic_models_string_from_file(file_path: str) -> str:
"""
Yield genetic model string from VCF.
:param file_path: VCF to be read
"""
for variant in generate_variants_from_file(file_path=file_path):
genetic_models_entry: str = variant['info_dict'].get('GeneticModels', '')
for family_genetic_models in genetic_models_entry.split(','):
family_genetic_models = family_genetic_models.split(':')
if len(family_genetic_models) > 1: # Not all variants will have a model
genetic_models: str = family_genetic_models[1]
yield genetic_models

def _run_model_command(vcf_file):
"""Helper function to run models_command and return output as a list."""
runner = CliRunner()
result = runner.invoke(models_command, [vcf_file, '-f', FAMILY_FILE])
assert result.exit_code == 0, f"Command failed with exit code: {result.exit_code}"

with NamedTemporaryFile(delete=False) as temp_file:
temp_file.write(result.stdout_bytes)
temp_file.seek(0) # Move back to the start of the file
return list(_generate_genetic_models_string_from_file(temp_file.name))

def test_genmod_annotate_models_no_family():
"""docstring for test_genmod_annotate_models"""
Expand Down Expand Up @@ -42,7 +70,7 @@ def test_genmod_annotate_models_empty_vcf():
FAMILY_FILE
]
)

print(result.output)
assert result.exit_code == 0

Expand All @@ -67,5 +95,16 @@ def test_annotate_models_lacking_ind():
BAD_FAMILY_FILE
]
)

assert result.exit_code == 1

def test_annotate_models_chr_prefix():
"""Test that genetic models are identical for VCF with and without 'chr' prefix."""

# Get models from both VCF files
models_list = _run_model_command(VCF_FILE)
models_list_with_chr = _run_model_command(VCF_FILE_WITH_CHR)

assert result.exit_code == 1
# Assert that the lists of models are identical
assert len(models_list) > 0 and len(models_list_with_chr) > 0, "No models in VCFs"
assert models_list == models_list_with_chr, "Models differ between VCF files."
43 changes: 2 additions & 41 deletions tests/functionality/test_score_variants_ranks_score_is_float.py
Original file line number Diff line number Diff line change
@@ -1,58 +1,19 @@
import pytest
from tempfile import NamedTemporaryFile
from typing import Dict, Union
from click.testing import CliRunner

from genmod.commands import score_command, score_compounds_command
from genmod.vcf_tools import HeaderParser, get_variant_dict, get_info_dict
from test_utils import generate_variants_from_file

ANNOTATED_VCF_FILE = "tests/fixtures/test_vcf_annotated.vcf"
SCORE_CONFIG = "tests/fixtures/score_variants/genmod_example.ini"


def _parse_variant_file(file_path: str) -> HeaderParser:
"""
Parse VCF header fields
:param file_path: VCF to be read
:raises ValueError: in case file is empty
"""
with open(file_path, 'r') as variant_file:
head = HeaderParser()
for line_index, line in enumerate(variant_file):
line = line.rstrip()
if line.startswith('#'):
if line.startswith('##'):
head.parse_meta_data(line)
else:
head.parse_header_line(line)
else:
break
if line_index == 0:
raise ValueError('Expected contents in file, got none')
return head


def _generate_variants_from_file(file_path: str) -> Dict[str, Union[str, int, float]]:
"""
Yield variants from VCF file.
:param file_path: VCF to be read
"""
header = _parse_variant_file(file_path=file_path)
with open(file_path, 'r') as variant_file:
for line in variant_file:
if line.startswith('#'):
continue
variant: Dict[str, str] = get_variant_dict(line, header.header)
variant['info_dict'] = get_info_dict(variant['INFO'])
yield variant


def _generate_rank_score_strings_from_file(file_path: str) -> str:
"""
Yield rank score strings from VCF.
:param file_path: VCF to be read
"""
for variant in _generate_variants_from_file(file_path=file_path):
for variant in generate_variants_from_file(file_path=file_path):
rank_score_entry: str = variant['info_dict'].get('RankScore', '')
for family_rank_score in rank_score_entry.split(','):
family_rank_score = family_rank_score.split(':')
Expand Down
38 changes: 38 additions & 0 deletions tests/functionality/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from typing import Dict, Union
from genmod.vcf_tools import HeaderParser, get_variant_dict, get_info_dict

def parse_variant_file(file_path: str) -> HeaderParser:
"""
Parse VCF header fields
:param file_path: VCF to be read
:raises ValueError: in case file is empty
"""
with open(file_path, 'r') as variant_file:
head = HeaderParser()
for line_index, line in enumerate(variant_file):
line = line.rstrip()
if line.startswith('#'):
if line.startswith('##'):
head.parse_meta_data(line)
else:
head.parse_header_line(line)
else:
break
if line_index == 0:
raise ValueError('Expected contents in file, got none')
return head

def generate_variants_from_file(file_path: str) -> Dict[str, Union[str, int, float]]:
"""
Yield variants from VCF file.
:param file_path: VCF to be read
"""
header = parse_variant_file(file_path=file_path)
with open(file_path, 'r') as variant_file:
for line in variant_file:
if line.startswith('#'):
continue
variant: Dict[str, str] = get_variant_dict(line, header.header)
variant['info_dict'] = get_info_dict(variant['INFO'])
yield variant

0 comments on commit 5bee959

Please sign in to comment.