Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure chrX and X are treated equally #135

Merged
merged 2 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ Please add a new candidate release at the top after changing the latest one. Fee

Try to use the following format:

## [unreleased]
- Fixed wrong models when chromosome X was named `chrX` and not `X`

## [3.8.3]

- Fixed unstable compounds order in models output ([#134](https://github.com/Clinical-Genomics/genmod/pull/134)))
Expand Down
2 changes: 1 addition & 1 deletion genmod/annotate_models/genetic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def check_genetic_models(variant_batch, families, phased = False,

# Only check X-linked for the variants in the X-chromosome:
# For X-linked we do not need to check the other models
if variant['CHROM'] == 'X':
if variant['CHROM'] in ['X', 'chrX']:
if check_X_recessive(variant, family, strict):
variant['inheritance_models'][family_id]['XR'] = True
for individual_id in individuals:
Expand Down
21 changes: 21 additions & 0 deletions tests/fixtures/test_vcf_regions_with_chr.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
##fileformat=VCFv4.1
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=Annotation,Number=.,Type=String,Description="Annotates what feature(s) this variant belongs to.">
##contig=<ID=1,length=249250621,assembly=b37>
##reference=file:///humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT father mother proband father_2 mother_2 proband_2
chr1 879537 . T C 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60
chr1 879541 . G A 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ ./. 0/1:10,10:60 1/1:10,10:60 ./. 0/1:10,10:60 0/1:10,10:60
chr1 879595 . C T 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 0/0:10,10:60 1/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60
chr1 879676 . G A 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 1/1:10,10:60 1/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60
chr1 879911 . G A 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60
chr1 880012 . A G 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60
chr1 880086 . T C 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60
chr1 880199 . G A 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60
chr1 880217 . T G 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60
chr10 76154051 . A G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60
chr10 76154073 . T G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60
chr10 76154074 . C G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60
chr10 76154076 . G C 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/0:10,10:60 0/1:10,10:60 ./. 0/0:10,10:60 0/1:10,10:60
chrX 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1;Annotation=PPP2R3B GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60
chrM 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1 GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60
43 changes: 41 additions & 2 deletions tests/functionality/test_annotate_models.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,44 @@
from genmod.commands import models_command
from tempfile import NamedTemporaryFile
from typing import Dict, Union
from click.testing import CliRunner

ANNOTATED_VCF_FILE = "tests/fixtures/test_vcf_annotated.vcf"
VCF_FILE = "tests/fixtures/test_vcf_regions.vcf"
VCF_FILE_WITH_CHR = "tests/fixtures/test_vcf_regions_with_chr.vcf"
FAMILY_FILE = "tests/fixtures/recessive_trio.ped"
BAD_FAMILY_FILE = "tests/fixtures/annotate_models/one_ind.ped"
EMPTY_VCF_FILE = "tests/fixtures/empty.vcf"

from genmod import logger
from genmod.log import init_log
from test_utils import generate_variants_from_file

init_log(logger, loglevel="INFO")

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One suggestion would be to move the variant generation methods into a fixture in the functionality module, so that we reduce duplicate code, related fellen31@bc15a2f

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review @torbjorgen, I'll look into moving it into a fixture, and just ping you afterwards so you can make sure it looks ok.

def _generate_genetic_models_string_from_file(file_path: str) -> str:
"""
Yield genetic model string from VCF.
:param file_path: VCF to be read
"""
for variant in generate_variants_from_file(file_path=file_path):
genetic_models_entry: str = variant['info_dict'].get('GeneticModels', '')
for family_genetic_models in genetic_models_entry.split(','):
family_genetic_models = family_genetic_models.split(':')
if len(family_genetic_models) > 1: # Not all variants will have a model
genetic_models: str = family_genetic_models[1]
yield genetic_models

def _run_model_command(vcf_file):
"""Helper function to run models_command and return output as a list."""
runner = CliRunner()
result = runner.invoke(models_command, [vcf_file, '-f', FAMILY_FILE])
assert result.exit_code == 0, f"Command failed with exit code: {result.exit_code}"

with NamedTemporaryFile(delete=False) as temp_file:
temp_file.write(result.stdout_bytes)
temp_file.seek(0) # Move back to the start of the file
return list(_generate_genetic_models_string_from_file(temp_file.name))

def test_genmod_annotate_models_no_family():
"""docstring for test_genmod_annotate_models"""
Expand Down Expand Up @@ -42,7 +70,7 @@ def test_genmod_annotate_models_empty_vcf():
FAMILY_FILE
]
)

print(result.output)
assert result.exit_code == 0

Expand All @@ -67,5 +95,16 @@ def test_annotate_models_lacking_ind():
BAD_FAMILY_FILE
]
)

assert result.exit_code == 1

def test_annotate_models_chr_prefix():
"""Test that genetic models are identical for VCF with and without 'chr' prefix."""

# Get models from both VCF files
models_list = _run_model_command(VCF_FILE)
models_list_with_chr = _run_model_command(VCF_FILE_WITH_CHR)

assert result.exit_code == 1
# Assert that the lists of models are identical
assert len(models_list) > 0 and len(models_list_with_chr) > 0, "No models in VCFs"
assert models_list == models_list_with_chr, "Models differ between VCF files."
43 changes: 2 additions & 41 deletions tests/functionality/test_score_variants_ranks_score_is_float.py
Original file line number Diff line number Diff line change
@@ -1,58 +1,19 @@
import pytest
from tempfile import NamedTemporaryFile
from typing import Dict, Union
from click.testing import CliRunner

from genmod.commands import score_command, score_compounds_command
from genmod.vcf_tools import HeaderParser, get_variant_dict, get_info_dict
from test_utils import generate_variants_from_file

ANNOTATED_VCF_FILE = "tests/fixtures/test_vcf_annotated.vcf"
SCORE_CONFIG = "tests/fixtures/score_variants/genmod_example.ini"


def _parse_variant_file(file_path: str) -> HeaderParser:
"""
Parse VCF header fields
:param file_path: VCF to be read
:raises ValueError: in case file is empty
"""
with open(file_path, 'r') as variant_file:
head = HeaderParser()
for line_index, line in enumerate(variant_file):
line = line.rstrip()
if line.startswith('#'):
if line.startswith('##'):
head.parse_meta_data(line)
else:
head.parse_header_line(line)
else:
break
if line_index == 0:
raise ValueError('Expected contents in file, got none')
return head


def _generate_variants_from_file(file_path: str) -> Dict[str, Union[str, int, float]]:
"""
Yield variants from VCF file.
:param file_path: VCF to be read
"""
header = _parse_variant_file(file_path=file_path)
with open(file_path, 'r') as variant_file:
for line in variant_file:
if line.startswith('#'):
continue
variant: Dict[str, str] = get_variant_dict(line, header.header)
variant['info_dict'] = get_info_dict(variant['INFO'])
yield variant


def _generate_rank_score_strings_from_file(file_path: str) -> str:
"""
Yield rank score strings from VCF.
:param file_path: VCF to be read
"""
for variant in _generate_variants_from_file(file_path=file_path):
for variant in generate_variants_from_file(file_path=file_path):
rank_score_entry: str = variant['info_dict'].get('RankScore', '')
for family_rank_score in rank_score_entry.split(','):
family_rank_score = family_rank_score.split(':')
Expand Down
38 changes: 38 additions & 0 deletions tests/functionality/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from typing import Dict, Union
from genmod.vcf_tools import HeaderParser, get_variant_dict, get_info_dict

def parse_variant_file(file_path: str) -> HeaderParser:
"""
Parse VCF header fields
:param file_path: VCF to be read
:raises ValueError: in case file is empty
"""
with open(file_path, 'r') as variant_file:
head = HeaderParser()
for line_index, line in enumerate(variant_file):
line = line.rstrip()
if line.startswith('#'):
if line.startswith('##'):
head.parse_meta_data(line)
else:
head.parse_header_line(line)
else:
break
if line_index == 0:
raise ValueError('Expected contents in file, got none')
return head

def generate_variants_from_file(file_path: str) -> Dict[str, Union[str, int, float]]:
"""
Yield variants from VCF file.
:param file_path: VCF to be read
"""
header = parse_variant_file(file_path=file_path)
with open(file_path, 'r') as variant_file:
for line in variant_file:
if line.startswith('#'):
continue
variant: Dict[str, str] = get_variant_dict(line, header.header)
variant['info_dict'] = get_info_dict(variant['INFO'])
yield variant

Loading