From 6677c92e5e856312d7ad6e59e346d427ace7f995 Mon Sep 17 00:00:00 2001 From: fellen31 Date: Tue, 24 Sep 2024 13:24:48 +0200 Subject: [PATCH] Ensure chrX and X are treated equally --- genmod/annotate_models/genetic_models.py | 2 +- tests/fixtures/test_vcf_regions_with_chr.vcf | 21 +++++ tests/functionality/test_annotate_models.py | 94 +++++++++++++++++--- 3 files changed, 106 insertions(+), 11 deletions(-) create mode 100644 tests/fixtures/test_vcf_regions_with_chr.vcf diff --git a/genmod/annotate_models/genetic_models.py b/genmod/annotate_models/genetic_models.py index 706a578..15e2ecd 100755 --- a/genmod/annotate_models/genetic_models.py +++ b/genmod/annotate_models/genetic_models.py @@ -144,7 +144,7 @@ def check_genetic_models(variant_batch, families, phased = False, # Only check X-linked for the variants in the X-chromosome: # For X-linked we do not need to check the other models - if variant['CHROM'] == 'X': + if variant['CHROM'] in ['X', 'chrX']: if check_X_recessive(variant, family, strict): variant['inheritance_models'][family_id]['XR'] = True for individual_id in individuals: diff --git a/tests/fixtures/test_vcf_regions_with_chr.vcf b/tests/fixtures/test_vcf_regions_with_chr.vcf new file mode 100644 index 0000000..51eb0ef --- /dev/null +++ b/tests/fixtures/test_vcf_regions_with_chr.vcf @@ -0,0 +1,21 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##contig= +##reference=file:///humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT father mother proband father_2 mother_2 proband_2 +chr1 879537 . T C 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 +chr1 879541 . G A 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ ./. 0/1:10,10:60 1/1:10,10:60 ./. 0/1:10,10:60 0/1:10,10:60 +chr1 879595 . C T 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 0/0:10,10:60 1/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 +chr1 879676 . G A 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 1/1:10,10:60 1/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 +chr1 879911 . G A 100 PASS MQ=1;Annotation=SAMD11,NOC2L GT:AD:GQ 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 +chr1 880012 . A G 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 +chr1 880086 . T C 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 +chr1 880199 . G A 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 +chr1 880217 . T G 100 PASS MQ=1;Annotation=NOC2L GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 +chr10 76154051 . A G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 +chr10 76154073 . T G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 +chr10 76154074 . C G 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 +chr10 76154076 . G C 100 PASS MQ=1;Annotation=ADK GT:AD:GQ ./. 0/0:10,10:60 0/1:10,10:60 ./. 0/0:10,10:60 0/1:10,10:60 +chrX 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1;Annotation=PPP2R3B GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60 +chrM 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1 GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60 diff --git a/tests/functionality/test_annotate_models.py b/tests/functionality/test_annotate_models.py index c38cdb1..df19179 100644 --- a/tests/functionality/test_annotate_models.py +++ b/tests/functionality/test_annotate_models.py @@ -1,8 +1,12 @@ from genmod.commands import models_command +from tempfile import NamedTemporaryFile +from typing import Dict, Union from click.testing import CliRunner +from genmod.vcf_tools import HeaderParser, get_variant_dict, get_info_dict ANNOTATED_VCF_FILE = "tests/fixtures/test_vcf_annotated.vcf" VCF_FILE = "tests/fixtures/test_vcf_regions.vcf" +VCF_FILE_WITH_CHR = "tests/fixtures/test_vcf_regions_with_chr.vcf" FAMILY_FILE = "tests/fixtures/recessive_trio.ped" BAD_FAMILY_FILE = "tests/fixtures/annotate_models/one_ind.ped" EMPTY_VCF_FILE = "tests/fixtures/empty.vcf" @@ -11,6 +15,65 @@ from genmod.log import init_log init_log(logger, loglevel="INFO") +def _parse_variant_file(file_path: str) -> HeaderParser: + """ + Parse VCF header fields + :param file_path: VCF to be read + :raises ValueError: in case file is empty + """ + with open(file_path, 'r') as variant_file: + head = HeaderParser() + for line_index, line in enumerate(variant_file): + line = line.rstrip() + if line.startswith('#'): + if line.startswith('##'): + head.parse_meta_data(line) + else: + head.parse_header_line(line) + else: + break + if line_index == 0: + raise ValueError('Expected contents in file, got none') + return head + + +def _generate_variants_from_file(file_path: str) -> Dict[str, Union[str, int, float]]: + """ + Yield variants from VCF file. + :param file_path: VCF to be read + """ + header = _parse_variant_file(file_path=file_path) + with open(file_path, 'r') as variant_file: + for line in variant_file: + if line.startswith('#'): + continue + variant: Dict[str, str] = get_variant_dict(line, header.header) + variant['info_dict'] = get_info_dict(variant['INFO']) + yield variant + +def _generate_genetic_models_string_from_file(file_path: str) -> str: + """ + Yield genetic model string from VCF. + :param file_path: VCF to be read + """ + for variant in _generate_variants_from_file(file_path=file_path): + genetic_models_entry: str = variant['info_dict'].get('GeneticModels', '') + for family_genetic_models in genetic_models_entry.split(','): + family_genetic_models = family_genetic_models.split(':') + if len(family_genetic_models) > 1: # Not all variants will have a model + genetic_models: str = family_genetic_models[1] + yield genetic_models + +def _run_model_command(vcf_file): + """Helper function to run models_command and return output as a list.""" + runner = CliRunner() + result = runner.invoke(models_command, [vcf_file, '-f', FAMILY_FILE]) + assert result.exit_code == 0, f"Command failed with exit code: {result.exit_code}" + + with NamedTemporaryFile(delete=False) as temp_file: + temp_file.write(result.stdout_bytes) + temp_file.seek(0) # Move back to the start of the file + return list(_generate_genetic_models_string_from_file(temp_file.name)) def test_genmod_annotate_models_no_family(): """docstring for test_genmod_annotate_models""" @@ -42,7 +105,7 @@ def test_genmod_annotate_models_empty_vcf(): FAMILY_FILE ] ) - + print(result.output) assert result.exit_code == 0 @@ -59,13 +122,24 @@ def test_annotate_models_already_annotated(): assert result.exit_code == 1 def test_annotate_models_lacking_ind(): - """docstring for test_genmod_annotate_models""" - runner = CliRunner() - result = runner.invoke(models_command, [ - VCF_FILE, - '-f', - BAD_FAMILY_FILE - ] - ) + """docstring for test_genmod_annotate_models""" + runner = CliRunner() + result = runner.invoke(models_command, [ + VCF_FILE, + '-f', + BAD_FAMILY_FILE + ] + ) + + assert result.exit_code == 1 + +def test_annotate_models_chr_prefix(): + """Test that genetic models are identical for VCF with and without 'chr' prefix.""" + + # Get models from both VCF files + models_list = _run_model_command(VCF_FILE) + models_list_with_chr = _run_model_command(VCF_FILE_WITH_CHR) - assert result.exit_code == 1 \ No newline at end of file + # Assert that the lists of models are identical + assert len(models_list) > 0 and len(models_list_with_chr) > 0, "No models in VCFs" + assert models_list == models_list_with_chr, "Models differ between VCF files."