Merge pull request #135 from fellen31/fix-chr

Ensure chrX and X are treated equally
Clinical-Genomics · Oct 8, 2024 · 5bee959 · 5bee959
2 parents 68bc1ed + bd0d89d
commit 5bee959
Show file tree

Hide file tree

Showing 6 changed files with 106 additions and 44 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,9 @@ Please add a new candidate release at the top after changing the latest one. Fee
 
 Try to use the following format:
 
+## [unreleased]
+- Fixed wrong models when chromosome X was named `chrX` and not `X` 
+
 ## [3.8.3]
 
 - Fixed unstable compounds order in models output ([#134](https://github.com/Clinical-Genomics/genmod/pull/134)))

diff --git a/genmod/annotate_models/genetic_models.py b/genmod/annotate_models/genetic_models.py
@@ -144,7 +144,7 @@ def check_genetic_models(variant_batch, families, phased = False,
 
             # Only check X-linked for the variants in the X-chromosome:
             # For X-linked we do not need to check the other models
-            if variant['CHROM'] == 'X':
+            if variant['CHROM'] in ['X', 'chrX']:
                 if check_X_recessive(variant, family, strict):
                     variant['inheritance_models'][family_id]['XR'] = True
                     for individual_id in individuals:

diff --git a/tests/fixtures/test_vcf_regions_with_chr.vcf b/tests/fixtures/test_vcf_regions_with_chr.vcf
@@ -0,0 +1,21 @@
+##fileformat=VCFv4.1
+##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
+##INFO=<ID=Annotation,Number=.,Type=String,Description="Annotates what feature(s) this variant belongs to.">
+##contig=<ID=1,length=249250621,assembly=b37>
+##reference=file:///humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	father	mother	proband	father_2	mother_2	proband_2
+chr1	879537	.	T	C	100	PASS	MQ=1;Annotation=SAMD11,NOC2L	GT:AD:GQ	0/1:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60
+chr1	879541	.	G	A	100	PASS	MQ=1;Annotation=SAMD11,NOC2L	GT:AD:GQ	./.	0/1:10,10:60	1/1:10,10:60	./.	0/1:10,10:60	0/1:10,10:60
+chr1	879595	.	C	T	100	PASS	MQ=1;Annotation=SAMD11,NOC2L	GT:AD:GQ	0/1:10,10:60	0/0:10,10:60	1/1:10,10:60	0/1:10,10:60	0/0:10,10:60	0/1:10,10:60
+chr1	879676	.	G	A	100	PASS	MQ=1;Annotation=SAMD11,NOC2L	GT:AD:GQ	0/1:10,10:60	1/1:10,10:60	1/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60
+chr1	879911	.	G	A	100	PASS	MQ=1;Annotation=SAMD11,NOC2L	GT:AD:GQ	0/1:10,10:60	0/0:10,10:60	0/1:10,10:60	0/1:10,10:60	0/0:10,10:60	0/1:10,10:60
+chr1	880012	.	A	G	100	PASS	MQ=1;Annotation=NOC2L	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	0/1:10,10:60	0/0:10,10:60	0/1:10,10:60	0/1:10,10:60
+chr1	880086	.	T	C	100	PASS	MQ=1;Annotation=NOC2L	GT:AD:GQ	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60
+chr1	880199	.	G	A	100	PASS	MQ=1;Annotation=NOC2L	GT:AD:GQ	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60
+chr1	880217	.	T	G	100	PASS	MQ=1;Annotation=NOC2L	GT:AD:GQ	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60
+chr10	76154051	.	A	G	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	0/1:10,10:60	0/0:10,10:60	0/1:10,10:60	0/1:10,10:60
+chr10	76154073	.	T	G	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60
+chr10	76154074	.	C	G	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60
+chr10	76154076	.	G	C	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/0:10,10:60	0/1:10,10:60	./.	0/0:10,10:60	0/1:10,10:60
+chrX	302253	.	CCCTCCTGCCCCT	C	100	PASS	MQ=1;Annotation=PPP2R3B	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
+chrM	302253	.	CCCTCCTGCCCCT	C	100	PASS	MQ=1	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
diff --git a/tests/functionality/test_annotate_models.py b/tests/functionality/test_annotate_models.py
@@ -1,16 +1,44 @@
 from genmod.commands import models_command
+from tempfile import NamedTemporaryFile
+from typing import Dict, Union
 from click.testing import CliRunner
 
 ANNOTATED_VCF_FILE = "tests/fixtures/test_vcf_annotated.vcf"
 VCF_FILE = "tests/fixtures/test_vcf_regions.vcf"
+VCF_FILE_WITH_CHR = "tests/fixtures/test_vcf_regions_with_chr.vcf"
 FAMILY_FILE = "tests/fixtures/recessive_trio.ped"
 BAD_FAMILY_FILE = "tests/fixtures/annotate_models/one_ind.ped"
 EMPTY_VCF_FILE = "tests/fixtures/empty.vcf"
 
 from genmod import logger
 from genmod.log import init_log
+from test_utils import generate_variants_from_file
+
 init_log(logger, loglevel="INFO")
 
+def _generate_genetic_models_string_from_file(file_path: str) -> str:
+    """
+    Yield genetic model string from VCF.
+    :param file_path: VCF to be read
+    """
+    for variant in generate_variants_from_file(file_path=file_path):
+        genetic_models_entry: str = variant['info_dict'].get('GeneticModels', '')
+        for family_genetic_models in genetic_models_entry.split(','):
+            family_genetic_models = family_genetic_models.split(':')
+            if len(family_genetic_models) > 1: # Not all variants will have a model
+                genetic_models: str = family_genetic_models[1]
+                yield genetic_models
+
+def _run_model_command(vcf_file):
+    """Helper function to run models_command and return output as a list."""
+    runner = CliRunner()
+    result = runner.invoke(models_command, [vcf_file, '-f', FAMILY_FILE])
+    assert result.exit_code == 0, f"Command failed with exit code: {result.exit_code}"
+
+    with NamedTemporaryFile(delete=False) as temp_file:
+        temp_file.write(result.stdout_bytes)
+        temp_file.seek(0)  # Move back to the start of the file
+        return list(_generate_genetic_models_string_from_file(temp_file.name))
 
 def test_genmod_annotate_models_no_family():
     """docstring for test_genmod_annotate_models"""
@@ -42,7 +70,7 @@ def test_genmod_annotate_models_empty_vcf():
         FAMILY_FILE 
         ]
     )
-    
+
     print(result.output)
     assert result.exit_code == 0
 
@@ -67,5 +95,16 @@ def test_annotate_models_lacking_ind():
         BAD_FAMILY_FILE 
         ]
     )
+
+    assert result.exit_code == 1
+
+def test_annotate_models_chr_prefix():
+    """Test that genetic models are identical for VCF with and without 'chr' prefix."""
+
+    # Get models from both VCF files
+    models_list = _run_model_command(VCF_FILE)
+    models_list_with_chr = _run_model_command(VCF_FILE_WITH_CHR)
 
-    assert result.exit_code == 1
+    # Assert that the lists of models are identical
+    assert len(models_list) > 0 and len(models_list_with_chr) > 0, "No models in VCFs"
+    assert models_list == models_list_with_chr, "Models differ between VCF files."
diff --git a/tests/functionality/test_score_variants_ranks_score_is_float.py b/tests/functionality/test_score_variants_ranks_score_is_float.py
@@ -1,58 +1,19 @@
 import pytest
 from tempfile import NamedTemporaryFile
-from typing import Dict, Union
 from click.testing import CliRunner
 
 from genmod.commands import score_command, score_compounds_command
-from genmod.vcf_tools import HeaderParser, get_variant_dict, get_info_dict
+from test_utils import generate_variants_from_file 
 
 ANNOTATED_VCF_FILE = "tests/fixtures/test_vcf_annotated.vcf"
 SCORE_CONFIG = "tests/fixtures/score_variants/genmod_example.ini"
 
-
-def _parse_variant_file(file_path: str) -> HeaderParser:
-    """
-    Parse VCF header fields
-    :param file_path: VCF to be read
-    :raises ValueError: in case file is empty
-    """
-    with open(file_path, 'r') as variant_file:
-        head = HeaderParser()
-        for line_index, line in enumerate(variant_file):
-            line = line.rstrip()
-            if line.startswith('#'):
-                if line.startswith('##'):
-                    head.parse_meta_data(line)
-                else:
-                    head.parse_header_line(line)
-            else:
-                break
-        if line_index == 0:
-            raise ValueError('Expected contents in file, got none')
-    return head
-
-
-def _generate_variants_from_file(file_path: str) -> Dict[str, Union[str, int, float]]:
-    """
-    Yield variants from VCF file.
-    :param file_path: VCF to be read
-    """
-    header = _parse_variant_file(file_path=file_path)
-    with open(file_path, 'r') as variant_file:
-        for line in variant_file:
-            if line.startswith('#'):
-                continue
-            variant: Dict[str, str] = get_variant_dict(line, header.header)
-            variant['info_dict'] = get_info_dict(variant['INFO'])
-            yield variant
-
-
 def _generate_rank_score_strings_from_file(file_path: str) -> str:
     """
     Yield rank score strings from VCF.
     :param file_path: VCF to be read
     """
-    for variant in _generate_variants_from_file(file_path=file_path):
+    for variant in generate_variants_from_file(file_path=file_path):
         rank_score_entry: str = variant['info_dict'].get('RankScore', '')
         for family_rank_score in rank_score_entry.split(','):
             family_rank_score = family_rank_score.split(':')

diff --git a/tests/functionality/test_utils.py b/tests/functionality/test_utils.py
@@ -0,0 +1,38 @@
+from typing import Dict, Union
+from genmod.vcf_tools import HeaderParser, get_variant_dict, get_info_dict
+
+def parse_variant_file(file_path: str) -> HeaderParser:
+    """
+    Parse VCF header fields
+    :param file_path: VCF to be read
+    :raises ValueError: in case file is empty
+    """
+    with open(file_path, 'r') as variant_file:
+        head = HeaderParser()
+        for line_index, line in enumerate(variant_file):
+            line = line.rstrip()
+            if line.startswith('#'):
+                if line.startswith('##'):
+                    head.parse_meta_data(line)
+                else:
+                    head.parse_header_line(line)
+            else:
+                break
+        if line_index == 0:
+            raise ValueError('Expected contents in file, got none')
+    return head
+
+def generate_variants_from_file(file_path: str) -> Dict[str, Union[str, int, float]]:
+    """
+    Yield variants from VCF file.
+    :param file_path: VCF to be read
+    """
+    header = parse_variant_file(file_path=file_path)
+    with open(file_path, 'r') as variant_file:
+        for line in variant_file:
+            if line.startswith('#'):
+                continue
+            variant: Dict[str, str] = get_variant_dict(line, header.header)
+            variant['info_dict'] = get_info_dict(variant['INFO'])
+            yield variant
+