Merge master branch

Clinical-Genomics · Oct 18, 2024 · 5127fb0 · 5127fb0
2 parents 98935d6 + cc7cdf0
commit 5127fb0
Show file tree

Hide file tree

Showing 11 changed files with 172 additions and 48 deletions.
diff --git a/.github/workflows/build_and_publish.yml b/.github/workflows/build_and_publish.yml
@@ -0,0 +1,39 @@
+name: Publish to PyPI
+
+on:
+ release:
+  types:
+   - created
+
+jobs:
+ build-n-publish:
+  name: Build and publish Python distribution to PyPI
+  runs-on: ubuntu-latest
+  permissions:
+    id-token: write
+  steps:
+   - name: Check out git repository
+     uses: actions/checkout@v4
+
+   - name: Set up Python 3.8
+     uses: actions/setup-python@v5
+     with:
+      python-version: 3.8
+
+   - name: Install build tools
+     run: >-
+        python -m
+        pip install
+        wheel
+        twine
+        --user
+
+   - name: Build a binary wheel and a source tarball
+     run: >-
+        python
+        setup.py
+        sdist
+        bdist_wheel
+
+   - name: Publish distribution 📦 to PyPI
+     uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/keep_a_changelog.yml b/.github/workflows/keep_a_changelog.yml
@@ -0,0 +1,15 @@
+name: "Changelog Reminder"
+on:
+  pull_request:
+      types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled]
+
+jobs:
+  # Enforces the update of a changelog file on every pull request
+  changelog:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - uses: dangoslen/changelog-enforcer@v3
+      with:
+        changeLogPath: 'CHANGELOG.md'
+        skipLabels: 'Skip-Changelog'
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,9 +8,16 @@ Please add a new candidate release at the top after changing the latest one. Fee
 
 Try to use the following format:
 
-## [3.8.1]
+## [unreleased]
+- Fixed wrong models when chromosome X was named `chrX` and not `X` 
+- Added GitHub Actions workflows for automatic publishing to PyPI on release, and keep a changelog reminder ([#136](https://github.com/Clinical-Genomics/genmod/pull/136))
 - Optional user defined threshold and penalty for compound scoring
 
+## [3.8.3]
+
+- Fixed unstable compounds order in models output ([#134](https://github.com/Clinical-Genomics/genmod/pull/134))
+- Added `six` to requirements.txt and setup.py ([#134](https://github.com/Clinical-Genomics/genmod/pull/134))
+
 ## [3.8.0]
 - Rank score normalisation
 
@@ -25,4 +32,4 @@ Try to use the following format:
 - Adds support for escaped characters in FORMAT description header strings
 
 ### Fixed
-- Documentation about cli options `strict` and `phased`
+- Documentation about cli options `strict` and `phased`
diff --git a/genmod/annotate_models/genetic_models.py b/genmod/annotate_models/genetic_models.py
@@ -144,7 +144,7 @@ def check_genetic_models(variant_batch, families, phased = False,
 
             # Only check X-linked for the variants in the X-chromosome:
             # For X-linked we do not need to check the other models
-            if variant['CHROM'] == 'X':
+            if variant['CHROM'] in ['X', 'chrX']:
                 if check_X_recessive(variant, family, strict):
                     variant['inheritance_models'][family_id]['XR'] = True
                     for individual_id in individuals:

diff --git a/genmod/score_variants/compound_scorer.py b/genmod/score_variants/compound_scorer.py
@@ -270,6 +270,8 @@ def run(self):
                             new_compound = "{0}>{1}".format(compound_id, compound_score)
                             scored_compound_list.append(new_compound)
 
+                        # Sort compound variants lexicographically
+                        scored_compound_list.sort()
                         new_compound_string = "{0}:{1}".format(
                             compound_family_id, '|'.join(scored_compound_list))
 

diff --git a/requirements.txt b/requirements.txt
@@ -7,3 +7,4 @@ configobj == 5.0.8
 intervaltree == 3.1.0
 extract_vcf == 0.5
 vcftoolbox == 1.5.1
+six == 1.16.0
diff --git a/setup.py b/setup.py
@@ -20,7 +20,7 @@
     long_description = 'Tool for annotating patterns of genetic inheritance in Variant Call Format (VCF) files.'
 
 setup(name='genmod',
-    version='3.8.2',
+    version='3.8.3',
     description='Annotate genetic inheritance models in variant files',
     author = 'Mans Magnusson',
     author_email = '[email protected]',
@@ -36,7 +36,8 @@
         'configobj >= 5.0.8',
         'intervaltree >= 3.1.0',
         'extract_vcf >= 0.5',
-        'vcftoolbox >= 1.5.1'
+        'vcftoolbox >= 1.5.1',
+        'six >= 1.16.0',
     ],
     packages=find_packages(
         exclude=('tests*', 'docs', 'examples', 'configs')

diff --git a/tests/fixtures/test_vcf_regions_with_chr.vcf b/tests/fixtures/test_vcf_regions_with_chr.vcf
@@ -0,0 +1,21 @@
+##fileformat=VCFv4.1
+##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
+##INFO=<ID=Annotation,Number=.,Type=String,Description="Annotates what feature(s) this variant belongs to.">
+##contig=<ID=1,length=249250621,assembly=b37>
+##reference=file:///humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	father	mother	proband	father_2	mother_2	proband_2
+chr1	879537	.	T	C	100	PASS	MQ=1;Annotation=SAMD11,NOC2L	GT:AD:GQ	0/1:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60
+chr1	879541	.	G	A	100	PASS	MQ=1;Annotation=SAMD11,NOC2L	GT:AD:GQ	./.	0/1:10,10:60	1/1:10,10:60	./.	0/1:10,10:60	0/1:10,10:60
+chr1	879595	.	C	T	100	PASS	MQ=1;Annotation=SAMD11,NOC2L	GT:AD:GQ	0/1:10,10:60	0/0:10,10:60	1/1:10,10:60	0/1:10,10:60	0/0:10,10:60	0/1:10,10:60
+chr1	879676	.	G	A	100	PASS	MQ=1;Annotation=SAMD11,NOC2L	GT:AD:GQ	0/1:10,10:60	1/1:10,10:60	1/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60
+chr1	879911	.	G	A	100	PASS	MQ=1;Annotation=SAMD11,NOC2L	GT:AD:GQ	0/1:10,10:60	0/0:10,10:60	0/1:10,10:60	0/1:10,10:60	0/0:10,10:60	0/1:10,10:60
+chr1	880012	.	A	G	100	PASS	MQ=1;Annotation=NOC2L	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	0/1:10,10:60	0/0:10,10:60	0/1:10,10:60	0/1:10,10:60
+chr1	880086	.	T	C	100	PASS	MQ=1;Annotation=NOC2L	GT:AD:GQ	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60
+chr1	880199	.	G	A	100	PASS	MQ=1;Annotation=NOC2L	GT:AD:GQ	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60
+chr1	880217	.	T	G	100	PASS	MQ=1;Annotation=NOC2L	GT:AD:GQ	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60
+chr10	76154051	.	A	G	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	0/1:10,10:60	0/0:10,10:60	0/1:10,10:60	0/1:10,10:60
+chr10	76154073	.	T	G	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60	0/0:10,10:60	0/0:10,10:60	0/1:10,10:60
+chr10	76154074	.	C	G	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60	0/1:10,10:60
+chr10	76154076	.	G	C	100	PASS	MQ=1;Annotation=ADK	GT:AD:GQ	./.	0/0:10,10:60	0/1:10,10:60	./.	0/0:10,10:60	0/1:10,10:60
+chrX	302253	.	CCCTCCTGCCCCT	C	100	PASS	MQ=1;Annotation=PPP2R3B	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
+chrM	302253	.	CCCTCCTGCCCCT	C	100	PASS	MQ=1	GT:AD:GQ	0/0:10,10:60	0/1:10,10:60	1/1:10,10:60	0/0:10,10:60	1/1:10,10:60	1/1:10,10:60
diff --git a/tests/functionality/test_annotate_models.py b/tests/functionality/test_annotate_models.py
@@ -1,16 +1,44 @@
 from genmod.commands import models_command
+from tempfile import NamedTemporaryFile
+from typing import Dict, Union
 from click.testing import CliRunner
 
 ANNOTATED_VCF_FILE = "tests/fixtures/test_vcf_annotated.vcf"
 VCF_FILE = "tests/fixtures/test_vcf_regions.vcf"
+VCF_FILE_WITH_CHR = "tests/fixtures/test_vcf_regions_with_chr.vcf"
 FAMILY_FILE = "tests/fixtures/recessive_trio.ped"
 BAD_FAMILY_FILE = "tests/fixtures/annotate_models/one_ind.ped"
 EMPTY_VCF_FILE = "tests/fixtures/empty.vcf"
 
 from genmod import logger
 from genmod.log import init_log
+from test_utils import generate_variants_from_file
+
 init_log(logger, loglevel="INFO")
 
+def _generate_genetic_models_string_from_file(file_path: str) -> str:
+    """
+    Yield genetic model string from VCF.
+    :param file_path: VCF to be read
+    """
+    for variant in generate_variants_from_file(file_path=file_path):
+        genetic_models_entry: str = variant['info_dict'].get('GeneticModels', '')
+        for family_genetic_models in genetic_models_entry.split(','):
+            family_genetic_models = family_genetic_models.split(':')
+            if len(family_genetic_models) > 1: # Not all variants will have a model
+                genetic_models: str = family_genetic_models[1]
+                yield genetic_models
+
+def _run_model_command(vcf_file):
+    """Helper function to run models_command and return output as a list."""
+    runner = CliRunner()
+    result = runner.invoke(models_command, [vcf_file, '-f', FAMILY_FILE])
+    assert result.exit_code == 0, f"Command failed with exit code: {result.exit_code}"
+
+    with NamedTemporaryFile(delete=False) as temp_file:
+        temp_file.write(result.stdout_bytes)
+        temp_file.seek(0)  # Move back to the start of the file
+        return list(_generate_genetic_models_string_from_file(temp_file.name))
 
 def test_genmod_annotate_models_no_family():
     """docstring for test_genmod_annotate_models"""
@@ -42,7 +70,7 @@ def test_genmod_annotate_models_empty_vcf():
         FAMILY_FILE 
         ]
     )
-    
+
     print(result.output)
     assert result.exit_code == 0
 
@@ -67,5 +95,16 @@ def test_annotate_models_lacking_ind():
         BAD_FAMILY_FILE 
         ]
     )
+
+    assert result.exit_code == 1
+
+def test_annotate_models_chr_prefix():
+    """Test that genetic models are identical for VCF with and without 'chr' prefix."""
+
+    # Get models from both VCF files
+    models_list = _run_model_command(VCF_FILE)
+    models_list_with_chr = _run_model_command(VCF_FILE_WITH_CHR)
 
-    assert result.exit_code == 1
+    # Assert that the lists of models are identical
+    assert len(models_list) > 0 and len(models_list_with_chr) > 0, "No models in VCFs"
+    assert models_list == models_list_with_chr, "Models differ between VCF files."
diff --git a/tests/functionality/test_score_variants_ranks_score_is_float.py b/tests/functionality/test_score_variants_ranks_score_is_float.py
@@ -1,58 +1,19 @@
 import pytest
 from tempfile import NamedTemporaryFile
-from typing import Dict, Union
 from click.testing import CliRunner
 
 from genmod.commands import score_command, score_compounds_command
-from genmod.vcf_tools import HeaderParser, get_variant_dict, get_info_dict
+from test_utils import generate_variants_from_file 
 
 ANNOTATED_VCF_FILE = "tests/fixtures/test_vcf_annotated.vcf"
 SCORE_CONFIG = "tests/fixtures/score_variants/genmod_example.ini"
 
-
-def _parse_variant_file(file_path: str) -> HeaderParser:
-    """
-    Parse VCF header fields
-    :param file_path: VCF to be read
-    :raises ValueError: in case file is empty
-    """
-    with open(file_path, 'r') as variant_file:
-        head = HeaderParser()
-        for line_index, line in enumerate(variant_file):
-            line = line.rstrip()
-            if line.startswith('#'):
-                if line.startswith('##'):
-                    head.parse_meta_data(line)
-                else:
-                    head.parse_header_line(line)
-            else:
-                break
-        if line_index == 0:
-            raise ValueError('Expected contents in file, got none')
-    return head
-
-
-def _generate_variants_from_file(file_path: str) -> Dict[str, Union[str, int, float]]:
-    """
-    Yield variants from VCF file.
-    :param file_path: VCF to be read
-    """
-    header = _parse_variant_file(file_path=file_path)
-    with open(file_path, 'r') as variant_file:
-        for line in variant_file:
-            if line.startswith('#'):
-                continue
-            variant: Dict[str, str] = get_variant_dict(line, header.header)
-            variant['info_dict'] = get_info_dict(variant['INFO'])
-            yield variant
-
-
 def _generate_rank_score_strings_from_file(file_path: str) -> str:
     """
     Yield rank score strings from VCF.
     :param file_path: VCF to be read
     """
-    for variant in _generate_variants_from_file(file_path=file_path):
+    for variant in generate_variants_from_file(file_path=file_path):
         rank_score_entry: str = variant['info_dict'].get('RankScore', '')
         for family_rank_score in rank_score_entry.split(','):
             family_rank_score = family_rank_score.split(':')

diff --git a/tests/functionality/test_utils.py b/tests/functionality/test_utils.py
@@ -0,0 +1,38 @@
+from typing import Dict, Union
+from genmod.vcf_tools import HeaderParser, get_variant_dict, get_info_dict
+
+def parse_variant_file(file_path: str) -> HeaderParser:
+    """
+    Parse VCF header fields
+    :param file_path: VCF to be read
+    :raises ValueError: in case file is empty
+    """
+    with open(file_path, 'r') as variant_file:
+        head = HeaderParser()
+        for line_index, line in enumerate(variant_file):
+            line = line.rstrip()
+            if line.startswith('#'):
+                if line.startswith('##'):
+                    head.parse_meta_data(line)
+                else:
+                    head.parse_header_line(line)
+            else:
+                break
+        if line_index == 0:
+            raise ValueError('Expected contents in file, got none')
+    return head
+
+def generate_variants_from_file(file_path: str) -> Dict[str, Union[str, int, float]]:
+    """
+    Yield variants from VCF file.
+    :param file_path: VCF to be read
+    """
+    header = parse_variant_file(file_path=file_path)
+    with open(file_path, 'r') as variant_file:
+        for line in variant_file:
+            if line.startswith('#'):
+                continue
+            variant: Dict[str, str] = get_variant_dict(line, header.header)
+            variant['info_dict'] = get_info_dict(variant['INFO'])
+            yield variant
+