From d24ffc33ae8cb4027a4c3d794d265fb2f4805dac Mon Sep 17 00:00:00 2001 From: Daniel Nilsson Date: Wed, 27 Nov 2024 15:41:11 +0100 Subject: [PATCH] Ruff lint, format, sort imports --- .github/workflows/lint_and_fix.yml | 0 genmod/annotate_models/__init__.py | 4 +- genmod/annotate_models/fix_variant.py | 84 +- genmod/annotate_models/genetic_models.py | 425 +++++---- genmod/annotate_models/make_haploblocks.py | 145 ++-- genmod/annotate_models/model_score.py | 42 +- genmod/annotate_models/models/__init__.py | 6 +- .../annotate_models/models/compound_model.py | 109 +-- .../annotate_models/models/dominant_model.py | 40 +- .../annotate_models/models/recessive_model.py | 31 +- genmod/annotate_models/models/x_models.py | 63 +- genmod/annotate_models/variant_annotator.py | 124 +-- genmod/annotate_regions/get_features.py | 5 +- genmod/annotate_regions/parse_annotations.py | 79 +- genmod/annotate_variants/__init__.py | 3 +- genmod/annotate_variants/add_annotations.py | 119 +-- genmod/annotate_variants/annotate.py | 84 +- genmod/annotate_variants/read_tabix_files.py | 91 +- genmod/annotations/__init__.py | 8 +- genmod/commands/__init__.py | 8 +- genmod/commands/analyze.py | 816 +++++++++--------- genmod/commands/annotate_models.py | 333 +++---- genmod/commands/annotate_variant.py | 228 ++--- genmod/commands/base.py | 61 +- genmod/commands/filter_variants.py | 117 +-- genmod/commands/genmod_sort.py | 115 +-- genmod/commands/score_compounds.py | 144 ++-- genmod/commands/score_variants.py | 215 ++--- genmod/commands/summarize_variants.py | 153 ++-- genmod/commands/utils.py | 70 +- genmod/errors/__init__.py | 2 +- genmod/errors/warning.py | 12 +- genmod/log.py | 16 +- genmod/score_variants/__init__.py | 10 +- .../cap_rank_score_to_min_bound.py | 12 +- genmod/score_variants/check_plugins.py | 39 +- genmod/score_variants/compound_scorer.py | 249 +++--- genmod/score_variants/config_parser.py | 353 ++++---- .../rank_score_variant_definitions.py | 6 +- genmod/score_variants/score_function.py | 148 ++-- genmod/score_variants/score_variant.py | 130 +-- genmod/utils/__init__.py | 112 +-- genmod/utils/check_individuals.py | 15 +- genmod/utils/get_batches.py | 107 +-- genmod/utils/get_features.py | 54 +- genmod/utils/get_priority.py | 70 +- genmod/utils/is_number.py | 6 +- genmod/utils/pair_generator.py | 26 +- genmod/utils/variant_printer.py | 72 +- genmod/vcf_tools/__init__.py | 26 +- genmod/vcf_tools/add_metadata.py | 115 +-- genmod/vcf_tools/add_variant_information.py | 90 +- genmod/vcf_tools/check_info_header.py | 11 +- genmod/vcf_tools/genotype.py | 68 +- genmod/vcf_tools/get_genotypes.py | 38 +- genmod/vcf_tools/header_parser.py | 292 +++---- genmod/vcf_tools/parse_variant.py | 123 +-- genmod/vcf_tools/print_headers.py | 11 +- genmod/vcf_tools/print_variants.py | 85 +- genmod/vcf_tools/sort_variants.py | 51 +- tests/annotate_regions/test_bed_parser.py | 12 +- .../test_build_region_trees.py | 25 +- tests/annotate_regions/test_get_interval.py | 11 +- .../annotate_variants/test_add_annotations.py | 41 +- tests/conftest.py | 42 +- tests/functionality/test_annotate_models.py | 61 +- tests/functionality/test_annotate_variant.py | 65 +- tests/functionality/test_filter_variants.py | 6 +- tests/functionality/test_score_variants.py | 33 +- ...est_score_variants_ranks_score_is_float.py | 63 +- tests/functionality/test_sort_variants.py | 42 +- tests/functionality/test_utils.py | 20 +- tests/genetic_models/test_dominant_model.py | 157 ++-- tests/genetic_models/test_x_dominant.py | 297 +++---- tests/genetic_models/test_x_recessive.py | 217 ++--- tests/score_variants/test_category_score.py | 257 +++--- tests/score_variants/test_config_parser.py | 114 +-- .../score_variants/test_rankscore_capping.py | 26 +- tests/score_variants/test_score_function.py | 78 +- tests/utils/test_check_individuals.py | 16 +- tests/utils/test_check_vep_annotation.py | 32 +- tests/utils/test_generate_pairs.py | 21 +- tests/utils/test_get_annotation.py | 54 +- tests/utils/test_get_batches.py | 136 +-- tests/utils/test_get_priority.py | 21 +- tests/utils/test_get_rank_score.py | 79 +- tests/utils/test_is_number.py | 13 +- tests/utils/test_variant_printer.py | 88 +- .../test_get_frequencies.py | 34 +- .../test_get_haploblocks.py | 141 +-- .../test_get_tabix_records.py | 24 +- tests/vcf_tools/test_genotype.py | 89 +- tests/vcf_tools/test_header_parser.py | 73 +- tests/vcf_tools/test_parse_variant.py | 38 +- tests/vcf_tools/test_sorting.py | 65 +- 95 files changed, 4302 insertions(+), 4260 deletions(-) create mode 100644 .github/workflows/lint_and_fix.yml diff --git a/.github/workflows/lint_and_fix.yml b/.github/workflows/lint_and_fix.yml new file mode 100644 index 0000000..e69de29 diff --git a/genmod/annotate_models/__init__.py b/genmod/annotate_models/__init__.py index 3d0a70c..e5f5831 100644 --- a/genmod/annotate_models/__init__.py +++ b/genmod/annotate_models/__init__.py @@ -1,7 +1,7 @@ from __future__ import absolute_import -from .make_haploblocks import get_haploblocks +from .fix_variant import make_print_version from .genetic_models import check_genetic_models +from .make_haploblocks import get_haploblocks from .model_score import get_model_score -from .fix_variant import make_print_version from .variant_annotator import VariantAnnotator diff --git a/genmod/annotate_models/fix_variant.py b/genmod/annotate_models/fix_variant.py index 27a1868..0f462d9 100644 --- a/genmod/annotate_models/fix_variant.py +++ b/genmod/annotate_models/fix_variant.py @@ -1,90 +1,78 @@ from . import get_model_score + def make_print_version(variant, families): """ Get the variants ready for printing - + This function collects the annotations added and merge them in the INFO dict. - + Arguments: variant (dict): A variant dictionary - + """ - - variant_id = variant['variant_id'] - vcf_info = variant['INFO'].split(';') - - feature_list = variant.get('annotation', set()) - + + variant_id = variant["variant_id"] + vcf_info = variant["INFO"].split(";") + + feature_list = variant.get("annotation", set()) + # variant[compounds] is a dictionary with family id as keys and a set of compounds as values - compounds = variant.get('compounds', dict()) + compounds = variant.get("compounds", dict()) # Here we store the compound strings that should be added to the variant: family_compound_strings = [] - genetic_models = variant.get('inheritance_models', {}) + genetic_models = variant.get("inheritance_models", {}) # We need to check if compounds have already been annotated. - if 'Compounds' not in variant['info_dict']: - + if "Compounds" not in variant["info_dict"]: for family_id in compounds: - if (genetic_models[family_id].get('AR_comp') or genetic_models[family_id].get('AR_comp_dn')): - compound_string = '' + if genetic_models[family_id].get("AR_comp") or genetic_models[family_id].get( + "AR_comp_dn" + ): + compound_string = "" compound_set = compounds[family_id] - #We do not want reference to itself as a compound: + # We do not want reference to itself as a compound: compound_set.discard(variant_id) # If there are any compounds for the family: if compounds[family_id]: - compound_string = '|'.join(compound_set) - family_compound_strings.append(':'.join([family_id, compound_string])) - + compound_string = "|".join(compound_set) + family_compound_strings.append(":".join([family_id, compound_string])) + if len(family_compound_strings) > 0: - vcf_info.append('Compounds=' + ','.join(family_compound_strings)) - + vcf_info.append("Compounds=" + ",".join(family_compound_strings)) + # Check if any genetic models are followed - if 'GeneticModels' not in variant['info_dict']: + if "GeneticModels" not in variant["info_dict"]: # Here we store the compound strings that should be added to the variant: family_model_strings = [] model_scores = {} for family_id in genetic_models: - model_string = '' + model_string = "" model_list = [] for model in genetic_models[family_id]: if genetic_models[family_id][model]: model_list.append(model) - model_string = '|'.join(model_list) + model_string = "|".join(model_list) if len(model_list) > 0: - family_model_strings.append(':'.join( - [family_id, model_string])) - + family_model_strings.append(":".join([family_id, model_string])) + model_scores[family_id] = str( - get_model_score(families[family_id].individuals, variant)) - + get_model_score(families[family_id].individuals, variant) + ) + if len(family_model_strings) > 0: - vcf_info.append( - 'GeneticModels={0}'.format( - ','.join(family_model_strings))) + vcf_info.append("GeneticModels={0}".format(",".join(family_model_strings))) model_score_list = [] for family_id in model_scores: if model_scores[family_id]: if float(model_scores[family_id]) > 0: - model_score_list.append( - ':'.join( - [ - family_id, - model_scores[family_id] - ] - ) - ) + model_score_list.append(":".join([family_id, model_scores[family_id]])) if len(model_score_list) > 0: - vcf_info.append( - 'ModelScore=' + - ','.join(model_score_list) - ) - - - - variant['INFO'] = ';'.join(vcf_info) + vcf_info.append("ModelScore=" + ",".join(model_score_list)) + + variant["INFO"] = ";".join(vcf_info) return variant diff --git a/genmod/annotate_models/genetic_models.py b/genmod/annotate_models/genetic_models.py index 15e2ecd..caecb83 100755 --- a/genmod/annotate_models/genetic_models.py +++ b/genmod/annotate_models/genetic_models.py @@ -14,8 +14,8 @@ - Autosomal Recessive De Novo(AR_DN) - Autosomal Recesive Compound(AR_comp). -In this model a variant must imply affected status, otherwise it can not be -dominant. All sick has to be ay least heterozygote for the variant and all +In this model a variant must imply affected status, otherwise it can not be +dominant. All sick has to be ay least heterozygote for the variant and all healthy can not have it. We will assume that each individual that we have information about is present @@ -24,10 +24,10 @@ is the individual sick? - - If the individual is homozygote alternative then AD/AD-denovo + - If the individual is homozygote alternative then AD/AD-denovo and AR/AR-denovo are ok - - If the individual is is heterozygote then AD/AD-denovo are ok + - If the individual is is heterozygote then AD/AD-denovo are ok but AR/AR-denovo are not ok - If the individual is homozygote reference no model is ok @@ -40,7 +40,7 @@ - If the individual is homozygote alternative then no model is ok - - If the individual is heterozygote then AR/AR-denove are ok but + - If the individual is heterozygote then AR/AR-denove are ok but AD/AD-denovo are not ok - If the individual is homozygote referense all models are ok @@ -59,414 +59,397 @@ Copyright (c) 2013 __MyCompanyName__. All rights reserved. """ -from __future__ import print_function, absolute_import +from __future__ import absolute_import, print_function +import logging import os import sys -import logging from datetime import datetime - from pprint import pprint as pp -from .models import (check_dominant, check_recessive, check_compounds, -check_X_recessive, check_X_dominant) - from genmod.utils import generate_pairs -def check_genetic_models(variant_batch, families, phased = False, - strict = False): +from .models import ( + check_compounds, + check_dominant, + check_recessive, + check_X_dominant, + check_X_recessive, +) + + +def check_genetic_models(variant_batch, families, phased=False, strict=False): """ - Check and annotate which genetic models that are followed for the variants + Check and annotate which genetic models that are followed for the variants in a batch - + See more description in header of file or documentation. - + Arguments: variant_batch (dict): A dictionary with variant ids as keys and variant dictionaries as values - families (dict): A dictionary with family ids as keys and Family + families (dict): A dictionary with family ids as keys and Family objects as values phased (bool): If the variants are phased - strict (bool): If the strict mode should be used when checking the + strict (bool): If the strict mode should be used when checking the genetic models - + """ - # A variant batch is a dictionary on the form + # A variant batch is a dictionary on the form # {variant_id:variant_dict, variant_2_id:variant_dict_2, ...} logger = logging.getLogger(__name__) - intervals = variant_batch.pop('haploblocks', {}) - + intervals = variant_batch.pop("haploblocks", {}) + # We check the genetic models for one family at a time for family_id in families: - logger.debug("Checking genetic models for family {0}".format( - family_id - )) + logger.debug("Checking genetic models for family {0}".format(family_id)) family = families[family_id] individuals = family.individuals - + compound_candidates = [] compound_pairs = [] - + for variant_id in variant_batch: inheritance_models = { - 'XR' : False, - 'XR_dn' : False, - 'XD' : False, - 'XD_dn' : False, - 'AD' : False, - 'AD_dn' : False, - 'AR_hom' : False, - 'AR_hom_dn' : False, - 'AR_comp' : False, - 'AR_comp_dn' : False + "XR": False, + "XR_dn": False, + "XD": False, + "XD_dn": False, + "AD": False, + "AD_dn": False, + "AR_hom": False, + "AR_hom_dn": False, + "AR_comp": False, + "AR_comp_dn": False, } - + variant = variant_batch[variant_id] # save the compound pairs for a variant in a set - if 'compounds' in variant: - variant['compounds'][family_id] = set() + if "compounds" in variant: + variant["compounds"][family_id] = set() else: - variant['compounds'] = {family_id : set()} - + variant["compounds"] = {family_id: set()} + # Add information of models followed: - if 'inheritance_models' in variant: - variant['inheritance_models'][family_id] = inheritance_models + if "inheritance_models" in variant: + variant["inheritance_models"][family_id] = inheritance_models else: - variant['inheritance_models'] = {family_id: inheritance_models} - - - # If the variant is in a genetic region we check for compound + variant["inheritance_models"] = {family_id: inheritance_models} + + # If the variant is in a genetic region we check for compound # candidates - if variant.get('compound_candidate',True): - + if variant.get("compound_candidate", True): if check_compound_candidate(variant, family, strict): compound_candidates.append(variant_id) - + # Only check X-linked for the variants in the X-chromosome: # For X-linked we do not need to check the other models - if variant['CHROM'] in ['X', 'chrX']: + if variant["CHROM"] in ["X", "chrX"]: if check_X_recessive(variant, family, strict): - variant['inheritance_models'][family_id]['XR'] = True + variant["inheritance_models"][family_id]["XR"] = True for individual_id in individuals: individual = individuals[individual_id] if individual.has_parents: check_parents( - 'X_recessive', - individual_id, - family, - variant, - strict=strict + "X_recessive", individual_id, family, variant, strict=strict ) - + if check_X_dominant(variant, family, strict): - variant['inheritance_models'][family_id]['XD'] = True + variant["inheritance_models"][family_id]["XD"] = True for individual_id in family.individuals: individual = individuals[individual_id] if individual.has_parents: check_parents( - 'X_dominant', - individual_id, - family, - variant, - strict=strict + "X_dominant", individual_id, family, variant, strict=strict ) # If variant is not on X: else: # Check the dominant model: if check_dominant(variant, family, strict): - variant['inheritance_models'][family_id]['AD'] = True + variant["inheritance_models"][family_id]["AD"] = True for individual_id in individuals: individual = individuals[individual_id] if individual.has_parents: - check_parents( - 'dominant', - individual_id, - family, - variant, - strict=strict - ) - + check_parents("dominant", individual_id, family, variant, strict=strict) + # Check the recessive model: if check_recessive(variant, family, strict): - variant['inheritance_models'][family_id]['AR_hom'] = True + variant["inheritance_models"][family_id]["AR_hom"] = True for individual_id in individuals: individual = individuals[individual_id] if individual.has_parents: check_parents( - 'recessive', - individual_id, - family, - variant, - strict=strict + "recessive", individual_id, family, variant, strict=strict ) - + # Now check the compound models: - + if len(compound_candidates) > 1: for pair in generate_pairs(compound_candidates): - # If the variants in the pair belong to the same gene we check for compounds: + # If the variants in the pair belong to the same gene we check for compounds: variant_1 = variant_batch[pair[0]] variant_2 = variant_batch[pair[1]] # Check that the pair is in the same feature: - if variant_1['annotation'].intersection(variant_2['annotation']): + if variant_1["annotation"].intersection(variant_2["annotation"]): if len(individuals) == 1: - variant_1['compounds'][family_id].add(pair[1]) - variant_2['compounds'][family_id].add(pair[0]) - variant_1['inheritance_models'][family_id]['AR_comp'] = True - variant_2['inheritance_models'][family_id]['AR_comp'] = True - # We know from check_compound_candidates that all variants are present in all affected + variant_1["compounds"][family_id].add(pair[1]) + variant_2["compounds"][family_id].add(pair[0]) + variant_1["inheritance_models"][family_id]["AR_comp"] = True + variant_2["inheritance_models"][family_id]["AR_comp"] = True + # We know from check_compound_candidates that all variants are present in all affected elif check_compounds(variant_1, variant_2, family, intervals, phased): parents_found = False for individual_id in individuals: individual = individuals[individual_id] if individual.has_parents: check_parents( - model='compound', - individual_id=individual_id, - family=family, - variant=variant_1, + model="compound", + individual_id=individual_id, + family=family, + variant=variant_1, variant_2=variant_2, - strict=strict + strict=strict, ) parents_found = True if not parents_found: - variant_1['inheritance_models'][family_id]['AR_comp'] = True - variant_2['inheritance_models'][family_id]['AR_comp'] = True - - if (variant_1['inheritance_models'][family_id]['AR_comp'] or - variant_1['inheritance_models'][family_id]['AR_comp_dn']): - - variant_1['compounds'][family_id].add(pair[1]) - - if (variant_2['inheritance_models'][family_id]['AR_comp'] or - variant_2['inheritance_models'][family_id]['AR_comp_dn']): - - variant_2['compounds'][family_id].add(pair[0]) + variant_1["inheritance_models"][family_id]["AR_comp"] = True + variant_2["inheritance_models"][family_id]["AR_comp"] = True + + if ( + variant_1["inheritance_models"][family_id]["AR_comp"] + or variant_1["inheritance_models"][family_id]["AR_comp_dn"] + ): + variant_1["compounds"][family_id].add(pair[1]) + + if ( + variant_2["inheritance_models"][family_id]["AR_comp"] + or variant_2["inheritance_models"][family_id]["AR_comp_dn"] + ): + variant_2["compounds"][family_id].add(pair[0]) return + def check_compound_candidate(variant, family, strict): """ - Sort out the variants that are potential compound candidates. + Sort out the variants that are potential compound candidates. This function is used to reduce the number of potential candidates - for the future analysis. It will go through all variants in a - batch(gene or other feature) and filter out those variants that not - fit the model. Returns a bool depending on if the variant is a + for the future analysis. It will go through all variants in a + batch(gene or other feature) and filter out those variants that not + fit the model. Returns a bool depending on if the variant is a potential compound candidate. - + Cases: Affected: - - If individual is affected it needs to be heterozygpte + - If individual is affected it needs to be heterozygpte otherwise it can not be a compound candidate - + Healthy: - - Can not be hom. alt for any variant in a potential + - Can not be hom. alt for any variant in a potential compound pair. - + If strict: Affected must be heterozygote - + Args: variant : A variant dictionary. - family : A family object with information about the family + family : A family object with information about the family members for this analysis - + Returns: - bool: depending on if the variant is a potential compound + bool: depending on if the variant is a potential compound candidate according to therules stated above - + """ # This is the case when the variant is located in an uninteresting region(non gene region): - + for individual_id in family.individuals: individual = family.individuals[individual_id] - individual_genotype = variant['genotypes'][individual_id] - + individual_genotype = variant["genotypes"][individual_id] + # No individuals can be homo_alt if individual_genotype.homo_alt: return False - + if individual.affected: # Affected have to be heterozygote for compounds if not individual_genotype.heterozygote: return False # If both parents are healthy none of them can have both variants - + mother_id = individual.mother father_id = individual.father - - if mother_id != '0': - mother_genotype = variant['genotypes'][mother_id] + + if mother_id != "0": + mother_genotype = variant["genotypes"][mother_id] mother = family.individuals[mother_id] - - if father_id != '0': - father_genotype = variant['genotypes'][father_id] + + if father_id != "0": + father_genotype = variant["genotypes"][father_id] father = family.individuals[father_id] - - # If both parents exist and both are healthy, + + # If both parents exist and both are healthy, # both can not have the variant - if mother_id != '0' and father_id != '0': + if mother_id != "0" and father_id != "0": if mother.healthy and father.healthy: - if (mother_genotype.has_variant and - father_genotype.has_variant): + if mother_genotype.has_variant and father_genotype.has_variant: return False # We have now significantly reduced the number # of compound candidates. # In the next step we check if pairs of compounds # follow the compound inheritance pattern. - + return True -def check_parents(model, individual_id, family, variant, variant_2={}, - strict = False): +def check_parents(model, individual_id, family, variant, variant_2={}, strict=False): """ - Check if information in the parents can tell us if model is - de novo or not. + Check if information in the parents can tell us if model is + de novo or not. Model IN ['recessive', 'compound', 'dominant', 'X_recessive', 'X_dominant']. - If the expected pattern of a variant is followed in the family, + If the expected pattern of a variant is followed in the family, de novo will be False. Otherwise de novo will be True. - - - If only one parent is present then we can never exclude denovo for + + + If only one parent is present then we can never exclude denovo for heterozygous inheritance patterns. If strict and one parent we will never say it is denovo - + Args: - model : String, one of 'recessive', 'compound', 'dominant', + model : String, one of 'recessive', 'compound', 'dominant', 'X_recessive', 'X_dominant' individual_id : String that represents the individual id family : A family object variant : A dictionary that represents the variant variant_2 : If compound pair this is the second variant strict : Bool - + """ sex = family.individuals[individual_id].sex family_id = family.family_id - + mother_genotype = False father_genotype = False - + parent_genotypes = [] mother_id = family.individuals[individual_id].mother father_id = family.individuals[individual_id].father - - if mother_id != '0': - mother_genotype = variant['genotypes'][mother_id] + + if mother_id != "0": + mother_genotype = variant["genotypes"][mother_id] mother_phenotype = family.get_phenotype(mother_id) parent_genotypes.append(mother_genotype) - - if father_id != '0': - father_genotype = variant['genotypes'][father_id] + + if father_id != "0": + father_genotype = variant["genotypes"][father_id] father_phenotype = family.get_phenotype(father_id) parent_genotypes.append(father_genotype) - if model == 'recessive': - # If a parent is homozygote or if both parents are heterozygote + if model == "recessive": + # If a parent is homozygote or if both parents are heterozygote # the variant is not denovo. # If strict we know from before that both parents are genotyped if len(parent_genotypes) == 2: - if not (mother_genotype.has_variant and - father_genotype.has_variant): - variant['inheritance_models'][family_id]['AR_hom_dn'] = True - # If both parents are called but none of the above is - # fullfilled it is pure denovo - if (mother_genotype.genotyped and father_genotype.genotyped): - variant['inheritance_models'][family_id]['AR_hom'] = False + if not (mother_genotype.has_variant and father_genotype.has_variant): + variant["inheritance_models"][family_id]["AR_hom_dn"] = True + # If both parents are called but none of the above is + # fullfilled it is pure denovo + if mother_genotype.genotyped and father_genotype.genotyped: + variant["inheritance_models"][family_id]["AR_hom"] = False elif not strict: - variant['inheritance_models'][family_id]['AR_hom_dn'] = True - - elif model == 'dominant': + variant["inheritance_models"][family_id]["AR_hom_dn"] = True + + elif model == "dominant": # If none of the parents carry variant it is de novo if len(parent_genotypes) == 2: if not (mother_genotype.has_variant or father_genotype.has_variant): - variant['inheritance_models'][family_id]['AD_dn'] = True - # If both parents are called but none of them carry the variant it is denovo + variant["inheritance_models"][family_id]["AD_dn"] = True + # If both parents are called but none of them carry the variant it is denovo if mother_genotype.genotyped and father_genotype.genotyped: - variant['inheritance_models'][family_id]['AD'] = False + variant["inheritance_models"][family_id]["AD"] = False else: for parent in parent_genotypes: if not parent.has_variant: - variant['inheritance_models'][family_id]['AD_dn'] = True - variant['inheritance_models'][family_id]['AD'] = False - - elif model == 'X_recessive': - #If the individual is a male we only need to check if the mother carry the variant: + variant["inheritance_models"][family_id]["AD_dn"] = True + variant["inheritance_models"][family_id]["AD"] = False + + elif model == "X_recessive": + # If the individual is a male we only need to check if the mother carry the variant: if sex == 1: if mother_genotype: if not mother_genotype.has_variant: - variant['inheritance_models'][family_id]['XR_dn'] = True + variant["inheritance_models"][family_id]["XR_dn"] = True if mother_genotype.genotyped: - variant['inheritance_models'][family_id]['XR'] = False + variant["inheritance_models"][family_id]["XR"] = False elif not strict: - variant['inheritance_models'][family_id]['XR_dn'] = True - - #If female, both parents must have the variant otherwise denovo is true + variant["inheritance_models"][family_id]["XR_dn"] = True + + # If female, both parents must have the variant otherwise denovo is true elif sex == 2: if len(parent_genotypes) == 2: if not (mother_genotype.has_variant and father_genotype.has_variant): - variant['inheritance_models'][family_id]['XR_dn'] = True - #If both parents are genotyped but they both are not carriers XR is not true - if (mother_genotype.genotyped and father_genotype.genotyped): - variant['inheritance_models'][family_id]['XR'] = False + variant["inheritance_models"][family_id]["XR_dn"] = True + # If both parents are genotyped but they both are not carriers XR is not true + if mother_genotype.genotyped and father_genotype.genotyped: + variant["inheritance_models"][family_id]["XR"] = False elif not strict: - variant['inheritance_models'][family_id]['XR_dn'] = True - - elif model == 'X_dominant': - #If the individual is a male we only need to look at the mother: + variant["inheritance_models"][family_id]["XR_dn"] = True + + elif model == "X_dominant": + # If the individual is a male we only need to look at the mother: if sex == 1: if mother_genotype: if not mother_genotype.has_variant: - variant['inheritance_models'][family_id]['XD_dn'] = True + variant["inheritance_models"][family_id]["XD_dn"] = True if mother_genotype.genotyped: - variant['inheritance_models'][family_id]['XD'] = False - #If female, one of the parents must have the variant otherwise denovo is true + variant["inheritance_models"][family_id]["XD"] = False + # If female, one of the parents must have the variant otherwise denovo is true elif sex == 2: if len(parent_genotypes) == 2: if not (mother_genotype.has_variant or father_genotype.has_variant): - variant['inheritance_models'][family_id]['XD_dn'] = True + variant["inheritance_models"][family_id]["XD_dn"] = True if mother_genotype.genotyped and father_genotype.genotyped: - variant['inheritance_models'][family_id]['XD'] = False + variant["inheritance_models"][family_id]["XD"] = False elif not strict: - variant['inheritance_models'][family_id]['XD_dn'] = True - - elif model == 'compound': + variant["inheritance_models"][family_id]["XD_dn"] = True + elif model == "compound": mother_genotype_2 = None father_genotype_2 = None parent_genotypes_2 = [] - if mother_id != '0': - mother_genotype_2 = variant_2['genotypes'][mother_id] + if mother_id != "0": + mother_genotype_2 = variant_2["genotypes"][mother_id] parent_genotypes_2.append(mother_genotype_2) - if father_id != '0': - father_genotype_2 = variant_2['genotypes'][father_id] + if father_id != "0": + father_genotype_2 = variant_2["genotypes"][father_id] parent_genotypes_2.append(father_genotype_2) - + # One of the variants must come from father and one from mother - if (len(parent_genotypes) == 2 and len(parent_genotypes_2) == 2): - + if len(parent_genotypes) == 2 and len(parent_genotypes_2) == 2: # If both parents are genotyped and one of them are homozygote reference for both variants # the pair will be considered AR compound de novo - - if ((mother_genotype.genotyped and mother_genotype_2.genotyped) and - father_genotype.genotyped and father_genotype_2.genotyped): + if ( + (mother_genotype.genotyped and mother_genotype_2.genotyped) + and father_genotype.genotyped + and father_genotype_2.genotyped + ): # if not both parents have one of the variants it is de novo - if not ((mother_genotype.has_variant or mother_genotype_2.has_variant) and - (father_genotype.has_variant or father_genotype_2.has_variant)): - variant['inheritance_models'][family_id]['AR_comp_dn'] = True - variant_2['inheritance_models'][family_id]['AR_comp_dn'] = True - + if not ( + (mother_genotype.has_variant or mother_genotype_2.has_variant) + and (father_genotype.has_variant or father_genotype_2.has_variant) + ): + variant["inheritance_models"][family_id]["AR_comp_dn"] = True + variant_2["inheritance_models"][family_id]["AR_comp_dn"] = True + else: - - variant['inheritance_models'][family_id]['AR_comp'] = True - variant_2['inheritance_models'][family_id]['AR_comp'] = True - + variant["inheritance_models"][family_id]["AR_comp"] = True + variant_2["inheritance_models"][family_id]["AR_comp"] = True + elif not strict: - variant['inheritance_models'][family_id]['AR_comp_dn'] = True - variant_2['inheritance_models'][family_id]['AR_comp_dn'] = True - variant['inheritance_models'][family_id]['AR_comp'] = True - variant_2['inheritance_models'][family_id]['AR_comp'] = True - + variant["inheritance_models"][family_id]["AR_comp_dn"] = True + variant_2["inheritance_models"][family_id]["AR_comp_dn"] = True + variant["inheritance_models"][family_id]["AR_comp"] = True + variant_2["inheritance_models"][family_id]["AR_comp"] = True + return diff --git a/genmod/annotate_models/make_haploblocks.py b/genmod/annotate_models/make_haploblocks.py index 6a462c8..99a5d45 100644 --- a/genmod/annotate_models/make_haploblocks.py +++ b/genmod/annotate_models/make_haploblocks.py @@ -1,126 +1,119 @@ -from __future__ import (print_function) +from __future__ import print_function + import logging + from interval_tree import IntervalTree + def get_haploblocks(variant_batch, individuals): """ Take a variant batch and return the haploblocks for each of the individuals. The haploblocks are dictionaries with individual trees as keys. - - If a variant is phased it is denoted in the genotype call with a pipe + + If a variant is phased it is denoted in the genotype call with a pipe instead of a backslash. Unphased call: '0/1' Phased call: '0|1' A collection of censequtive phased variants makes a haploblock. The haploblocks are broken when a unphased call is seen. - + Arguments: - variant_batch (dict): variant_batch is a dictionary with variant_id:s + variant_batch (dict): variant_batch is a dictionary with variant_id:s as keys and variant dictionaries as values. individuals (list): A list with strings that represents the individual ids """ logger = logging.getLogger(__name__) logger.debug("Init haploblocks") - haploblocks = {ind_id:[] for ind_id in individuals} + haploblocks = {ind_id: [] for ind_id in individuals} logger.debug("Haploblocks: {0}".format(haploblocks)) - haploblock_starts = {ind_id:None for ind_id in individuals} + haploblock_starts = {ind_id: None for ind_id in individuals} logger.debug("Set beginning to True") # This variable indicates if we are in a haploblock - in_haploblock = {ind_id:False for ind_id in individuals} + in_haploblock = {ind_id: False for ind_id in individuals} haploblock_id = 1 interval_trees = {} for variant_id in variant_batch: logger.debug("Variant: {0}".format(variant_id)) - + variant = variant_batch[variant_id] - + for ind_id in individuals: - raw_gt_call = variant.get(ind_id, './.') + raw_gt_call = variant.get(ind_id, "./.") # If the variant is phased we must check if it is the start of # a haploblock or in the middle of one - if '|' in raw_gt_call: + if "|" in raw_gt_call: # Check if we are already in a haploblock - logger.debug("Variant {0} is phased for individual {1}.".format( - variant_id, ind_id)) - + logger.debug("Variant {0} is phased for individual {1}.".format(variant_id, ind_id)) + if not in_haploblock[ind_id]: - logger.debug("Setting haploblock start to: {0}"\ - " for individual {1}".format( - variant['POS'], ind_id)) - - haploblock_starts[ind_id] = int(variant['POS']) - logger.debug("Setting is_haploblock to True for individual {0}".format( - ind_id - )) + logger.debug( + "Setting haploblock start to: {0}" " for individual {1}".format( + variant["POS"], ind_id + ) + ) + + haploblock_starts[ind_id] = int(variant["POS"]) + logger.debug("Setting is_haploblock to True for individual {0}".format(ind_id)) in_haploblock[ind_id] = True - + # Here the variant is not phased else: - #If call is not passed we consider it to be on same - # haploblock(GATK recommendations) - if variant.get('FILTER', '.') == 'PASS': + # If call is not passed we consider it to be on same + # haploblock(GATK recommendations) + if variant.get("FILTER", ".") == "PASS": # The intervals is a list wiht start, stop and id if in_haploblock[ind_id]: - logger.debug("Creating a haploblock for individual {0}"\ - " with start:{1}, stop:{2} and id:{3}".format( - ind_id, - haploblock_starts[ind_id], - int(variant['POS']) - 1, - str(haploblock_id) - )) + logger.debug( + "Creating a haploblock for individual {0}" + " with start:{1}, stop:{2} and id:{3}".format( + ind_id, + haploblock_starts[ind_id], + int(variant["POS"]) - 1, + str(haploblock_id), + ) + ) haploblocks[ind_id].append( - [ - haploblock_starts[ind_id], - int(variant['POS']) - 1, - str(haploblock_id) - ] + [haploblock_starts[ind_id], int(variant["POS"]) - 1, str(haploblock_id)] ) haploblock_id += 1 - logger.debug("Setting haploblock id to {0}".format( - haploblock_id - )) + logger.debug("Setting haploblock id to {0}".format(haploblock_id)) in_haploblock[ind_id] = False - logger.debug("Setting is_haploblock to False for individual {0}".format( - ind_id - )) - + logger.debug( + "Setting is_haploblock to False for individual {0}".format(ind_id) + ) + for ind_id in individuals: - #Check if we have just finished an interval - + # Check if we have just finished an interval + if in_haploblock[ind_id]: - logger.debug("Creating a haploblock for individual {0}"\ - " with start:{1}, stop:{2} and id:{3}".format( - ind_id, - haploblock_starts[ind_id], - int(variant['POS']) - 1, - str(haploblock_id) - )) + logger.debug( + "Creating a haploblock for individual {0}" + " with start:{1}, stop:{2} and id:{3}".format( + ind_id, haploblock_starts[ind_id], int(variant["POS"]) - 1, str(haploblock_id) + ) + ) haploblocks[ind_id].append( - [ - haploblock_starts[ind_id], - int(variant['POS']), - str(haploblock_id) - ] - ) - + [haploblock_starts[ind_id], int(variant["POS"]), str(haploblock_id)] + ) + haploblock_id += 1 - logger.debug("Setting haploblock id to {0}".format( - haploblock_id - )) - + logger.debug("Setting haploblock id to {0}".format(haploblock_id)) + # Create interval trees of the haploblocks if haploblocks[ind_id]: - logger.debug("Creating IntervalTree for individual {0} with haploblocks:{1}, "\ - "start:{2}, stop:{3}".format( - ind_id, haploblocks[ind_id], - haploblocks[ind_id][0][0]-1, haploblocks[ind_id][-1][1]+1 - )) + logger.debug( + "Creating IntervalTree for individual {0} with haploblocks:{1}, " + "start:{2}, stop:{3}".format( + ind_id, + haploblocks[ind_id], + haploblocks[ind_id][0][0] - 1, + haploblocks[ind_id][-1][1] + 1, + ) + ) interval_trees[ind_id] = IntervalTree( - haploblocks[ind_id], - haploblocks[ind_id][0][0]-1, - haploblocks[ind_id][-1][1]+1 - ) + haploblocks[ind_id], haploblocks[ind_id][0][0] - 1, haploblocks[ind_id][-1][1] + 1 + ) logger.debug("Interval tree created") - + return interval_trees diff --git a/genmod/annotate_models/model_score.py b/genmod/annotate_models/model_score.py index 9df20d2..18147d8 100644 --- a/genmod/annotate_models/model_score.py +++ b/genmod/annotate_models/model_score.py @@ -1,43 +1,45 @@ -import logging -import operator import functools +import logging import math +import operator + def get_model_score(individuals, variant): """ Return the model score for this variant. - + The score is a estimation on how likely the genetic inheritance pattern is for these individuals. Scores are based on the genotype calls from variant calling. - + Arguments: individuals (list): A list with individual id:s variant (dict): A variant dictionary - + Returns: model_score (float): The model score - + """ logger = logging.getLogger(__name__) - + model_score = 0 genotype_scores = [] - + for individual in individuals: - logger.debug("Checking gt call for individual {0}".format( - individual - )) - - gt_call = variant.get('genotypes', {}).get(individual, None) + logger.debug("Checking gt call for individual {0}".format(individual)) + + gt_call = variant.get("genotypes", {}).get(individual, None) if gt_call: if gt_call.genotype_quality > 0: gq = min(gt_call.genotype_quality, 99) - genotype_scores.append(10**-(float(gq)/10)) - + genotype_scores.append(10 ** -(float(gq) / 10)) + if len(genotype_scores) > 0: - model_score = ( - round(-10*math.log10(1-functools.reduce( - operator.mul, [1-score for score in genotype_scores])))) - - return int(model_score) \ No newline at end of file + model_score = round( + -10 + * math.log10( + 1 - functools.reduce(operator.mul, [1 - score for score in genotype_scores]) + ) + ) + + return int(model_score) diff --git a/genmod/annotate_models/models/__init__.py b/genmod/annotate_models/models/__init__.py index 3ba9af7..337b55e 100644 --- a/genmod/annotate_models/models/__init__.py +++ b/genmod/annotate_models/models/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- -from __future__ import (print_function, absolute_import) +from __future__ import absolute_import, print_function +from .compound_model import check_compounds from .dominant_model import check_dominant from .recessive_model import check_recessive -from .compound_model import check_compounds -from .x_models import check_X_recessive, check_X_dominant +from .x_models import check_X_dominant, check_X_recessive diff --git a/genmod/annotate_models/models/compound_model.py b/genmod/annotate_models/models/compound_model.py index 95fbf13..918e7e1 100755 --- a/genmod/annotate_models/models/compound_model.py +++ b/genmod/annotate_models/models/compound_model.py @@ -12,116 +12,117 @@ from __future__ import print_function +import logging import os import sys -import logging + def check_compounds(variant_1, variant_2, family, intervals, phased): """ - Check if two variants of a pair follow the compound heterozygous model. - - At this stage we know: + Check if two variants of a pair follow the compound heterozygous model. + + At this stage we know: - None of the individuals are homozygote alternative for the variants - All affected individuals are heterozygote for both variants. - - We do not allow healthy individuals to be heterozygote for both variants + + We do not allow healthy individuals to be heterozygote for both variants in the pair - (ref. + (ref. http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0070151) - - If the individuals are phased we will only consider pairs that are on + + If the individuals are phased we will only consider pairs that are on different alleles in affected individuals. - + Args: variant_1, variant_2: Variants in a potential compound pair family: A family object with the individuals intervals: A interval tree that describes the phased intervals phased: A bool that tells if the individuals are phased - + Returns: bool: depending on if the pair follow the rules stated above - + """ # Check in all individuals what genotypes that are in the trio based of the # individual picked. logger = logging.getLogger(__name__) - + for individual_id in family.individuals: logger.debug("Check compounds for individual {0}".format(individual_id)) individual = family.individuals[individual_id] - - genotype_1 = variant_1['genotypes'][individual_id] - genotype_2 = variant_2['genotypes'][individual_id] - + + genotype_1 = variant_1["genotypes"][individual_id] + genotype_2 = variant_2["genotypes"][individual_id] + if individual.has_parents: mother_id = individual.mother father_id = individual.father - - if mother_id != '0': + + if mother_id != "0": # mother_genotype_1 = variant_1['genotypes'][mother_id] # mother_genotype_2 = variant_2['genotypes'][mother_id] mother = family.individuals[mother_id] - - if father_id != '0': + + if father_id != "0": # father_genotype_1 = variant_1['genotypes'][father_id] # father_genotype_2 = variant_2['genotypes'][father_id] father = family.individuals[father_id] - - if mother_id != '0' and mother.healthy: - if (variant_1['genotypes'][mother_id].has_variant and - variant_2['genotypes'][mother_id].has_variant): + + if mother_id != "0" and mother.healthy: + if ( + variant_1["genotypes"][mother_id].has_variant + and variant_2["genotypes"][mother_id].has_variant + ): return False - - if father_id != '0' and father.healthy: - if (variant_1['genotypes'][father_id].has_variant and - variant_2['genotypes'][father_id].has_variant): + + if father_id != "0" and father.healthy: + if ( + variant_1["genotypes"][father_id].has_variant + and variant_2["genotypes"][father_id].has_variant + ): return False - - #check if variants are in the same phased interval: - + + # check if variants are in the same phased interval: + if phased: variant_1_interval = intervals[individual_id].find_range( - [ - int(variant_1['POS']), - int(variant_1['POS']) - ] - ) + [int(variant_1["POS"]), int(variant_1["POS"])] + ) variant_2_interval = intervals[individual_id].find_range( - [ - int(variant_2['POS']), - int(variant_2['POS']) - ] - ) - + [int(variant_2["POS"]), int(variant_2["POS"])] + ) + # If phased a healthy individual can have both variants if they are on # the same haploblock # if not phased: - + if individual.healthy: if genotype_1.heterozygote and genotype_2.heterozygote: return False # The case where the individual is affected - # We know since ealier that all affected are heterozygotes - #for these variants + # We know since ealier that all affected are heterozygotes + # for these variants # So we only need to know if the variants are on the same phase elif individual.affected: - #If the individual is sick and phased it has to have one variant on + # If the individual is sick and phased it has to have one variant on # each allele if phased: # Variants need to be in the same phased interval, othervise we # do not have any extra info if variant_1_interval == variant_2_interval: - # If they are in the same interval they can not be on same - #allele - if ((genotype_1.allele_1 == genotype_2.allele_1) or - (genotype_1.allele_2 == genotype_2.allele_2)): + # If they are in the same interval they can not be on same + # allele + if (genotype_1.allele_1 == genotype_2.allele_1) or ( + genotype_1.allele_2 == genotype_2.allele_2 + ): return False - + return True + def main(): pass -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/genmod/annotate_models/models/dominant_model.py b/genmod/annotate_models/models/dominant_model.py index bbbaa66..d76306d 100755 --- a/genmod/annotate_models/models/dominant_model.py +++ b/genmod/annotate_models/models/dominant_model.py @@ -16,11 +16,12 @@ logger = logging.getLogger(__name__) + def check_dominant(variant, family, strict=False): """ - Check if the variant follows the autosomal dominant (AD) pattern in + Check if the variant follows the autosomal dominant (AD) pattern in this family. - + A variant is following the dominant patttern if: Healthy: - Can not have the variant in any form. @@ -28,40 +29,40 @@ def check_dominant(variant, family, strict=False): if strict: - Have to be homozygote reference - No call will return false - + Affected: - Has to be heterozygote for this position. - If no call we can not exclude dominant. if strict: - Have to be heterozygote - No call will return false - + No affection status: We can not tell if variant follows the model or not. - + SPECIAL CASE: If the variants is annotated with incomplete penetrance we allow healthy individuals to be carriers (i.e. healthy individuals can be het.) - + Args: variant: variant dictionary. family: A family object with the individuals strict: A boolean that tells if strict analyzis should be performed. - + Return: bool: depending on if the model is followed in these indivduals - + """ - - for individual in family.individuals: - # Check in all individuals what genotypes that are in the trio based + + for individual in family.individuals: + # Check in all individuals what genotypes that are in the trio based # of the individual picked. - logger.debug("Checking autosomal dominant pattern for variant {0},"\ - " individual: {1}".format( - variant.get('variant_id', None), - individual) + logger.debug( + "Checking autosomal dominant pattern for variant {0}," " individual: {1}".format( + variant.get("variant_id", None), individual + ) ) - individual_genotype = variant['genotypes'][individual] + individual_genotype = variant["genotypes"][individual] if strict: if not individual_genotype.genotyped: return False @@ -69,18 +70,17 @@ def check_dominant(variant, family, strict=False): if family.individuals[individual].healthy: logger.debug("Individual {0} is healthy".format(individual)) if individual_genotype.has_variant: - if variant.get('reduced_penetrance', False): + if variant.get("reduced_penetrance", False): if individual_genotype.homo_alt: return False else: return False - + elif family.individuals[individual].affected: logger.debug("Individual {0} is affected".format(individual)) # The case when the individual is sick if individual_genotype.genotyped: if not individual_genotype.heterozygote: return False - - return True + return True diff --git a/genmod/annotate_models/models/recessive_model.py b/genmod/annotate_models/models/recessive_model.py index 1ede602..26b6e14 100755 --- a/genmod/annotate_models/models/recessive_model.py +++ b/genmod/annotate_models/models/recessive_model.py @@ -14,11 +14,12 @@ import os import sys + def check_recessive(variant, family, strict): """ - Check if the variant follows the autosomal recessive homozygote (AR_hom) + Check if the variant follows the autosomal recessive homozygote (AR_hom) pattern in this family. - + A variant is following the AR_hom pattern if: Healthy: - Can not be homozygote alternative. @@ -26,47 +27,45 @@ def check_recessive(variant, family, strict): if strict: - Have to be homozygote reference or heterozygote. - No call will return False - + Affected: - Have to be homozygote alternative. - If no call we can not exclude AR if strict: - Have to be homozygote alternative - No call will return false - + No affection status: We can not tell if variant follows the model or not. - + Args: variant: variant dictionary. family: A family object with the individuals strict: A boolean that tells if strict analyzis should be performed. - + Return: bool: depending on if the model is followed in these indivduals - + """ for individual in family.individuals: - individual_genotype = variant['genotypes'][individual] + individual_genotype = variant["genotypes"][individual] if strict: if not individual_genotype.genotyped: return False # The case where the individual is healthy: if family.individuals[individual].healthy: - # If the individual is healthy and homozygote alt the model is broken. + # If the individual is healthy and homozygote alt the model is broken. if individual_genotype.genotyped: if individual_genotype.homo_alt: return False - + # The case when the individual is sick: elif family.individuals[individual].affected: - # In the case of a sick individual it must be homozygote alternative - # for Autosomal recessive to be true. - # Also, we can not exclude the model if no call. + # In the case of a sick individual it must be homozygote alternative + # for Autosomal recessive to be true. + # Also, we can not exclude the model if no call. if individual_genotype.genotyped: if not individual_genotype.homo_alt: return False - - return True - + return True diff --git a/genmod/annotate_models/models/x_models.py b/genmod/annotate_models/models/x_models.py index 1e5e13e..dc5ef5f 100755 --- a/genmod/annotate_models/models/x_models.py +++ b/genmod/annotate_models/models/x_models.py @@ -10,57 +10,59 @@ """ from __future__ import print_function + import logging logger = logging.getLogger(__name__) + def check_X_recessive(variant, family, strict=False): """ - Check if the variant follows the x linked heterozygous (XR) pattern of + Check if the variant follows the x linked heterozygous (XR) pattern of inheritance in this family. - + A variant is following the XR pattern if: - + Healthy: - Can not be homozygote alternative - If no call we can not exclude XR - Males can not have variant at all. This is added since sometimes males - get called as heterozygotes but this should not be possible since + get called as heterozygotes but this should not be possible since they only have one copy of the X chromosome. if strict: - Have to be homozygote reference(if male) or heterozygote(if female). - No call will return False - + Affected: - Have to be homozygote alternative(or heterozygote if male). - If no call we can not exclude AR if strict: - Have to be homozygote alternative(or heterozygote if male) - No call will return false - + No affection status: We can not tell if variant follows the model or not. - + Args: variant: variant dictionary. family: A Family object with the individuals strict: A boolean that tells if strict analyzis should be performed. - + Return: bool: depending on if the model is followed in these indivduals - + """ - + for individual in family.individuals: # Get the genotype for this variant for this individual - individual_genotype = variant['genotypes'][individual] - + individual_genotype = variant["genotypes"][individual] + if strict: if not individual_genotype.genotyped: return False # The case where the individual is healthy if family.individuals[individual].healthy: - # If individual is healthy and homozygote alternative + # If individual is healthy and homozygote alternative # the variant can not be deleterious: if individual_genotype.genotyped: if individual_genotype.homo_alt: @@ -69,64 +71,65 @@ def check_X_recessive(variant, family, strict=False): if family.individuals[individual].sex == 1: if individual_genotype.has_variant: return False - + # The case when the individual is sick elif family.individuals[individual].affected: - #If the individual is sick and homozygote ref it can not be x-recessive + # If the individual is sick and homozygote ref it can not be x-recessive if individual_genotype.genotyped: if individual_genotype.homo_ref: return False - # Women have to be hom alt to be sick (almost allways carriers) + # Women have to be hom alt to be sick (almost allways carriers) elif family.individuals[individual].sex == 2: if not individual_genotype.homo_alt: return False return True + def check_X_dominant(variant, family, strict=False): """ - Check if the variant follows the x linked dominant (XD) pattern of + Check if the variant follows the x linked dominant (XD) pattern of inheritance in this family. A variant is following the XD pattern if: - + Healthy: - Can not be homozygote alternative - - Healthy females can be heterozygotes. This is possible since there + - Healthy females can be heterozygotes. This is possible since there are several documented diseases where only one allele at a time is expressed during development. - If no call we can not exclude XR if strict: - Have to be homozygote reference (or heterozygote womens). - No call will return False - + Affected: - Have to be heterozygote. - If no call we can not exclude AR if strict: - Have to be heterozygote or homozygote(for males) - No call will return false - + No affection status: We can not tell if variant follows the model or not. - + Args: variant: variant dictionary. family: A family object with the individuals strict: A boolean that tells if strict analyzis should be performed. - + Return: bool: depending on if the model is followed in these indivduals - + """ for individual in family.individuals: # Get the genotype for this variant for this individual - individual_genotype = variant['genotypes'][individual] - + individual_genotype = variant["genotypes"][individual] + if strict: if not individual_genotype.genotyped: return False # The case where the individual is healthy if family.individuals[individual].healthy: - # Healthy womans can be carriers but not homozygote: + # Healthy womans can be carriers but not homozygote: if individual_genotype.genotyped: if family.individuals[individual].sex == 2: if individual_genotype.homo_alt: @@ -135,11 +138,11 @@ def check_X_dominant(variant, family, strict=False): elif family.individuals[individual].sex == 1: if individual_genotype.has_variant: return False - + # The case when the individual is sick elif family.individuals[individual].affected: - # If the individual is sick and homozygote ref it - # can not be x-linked-dominant + # If the individual is sick and homozygote ref it + # can not be x-linked-dominant if individual_genotype.genotyped: if individual_genotype.homo_ref: return False diff --git a/genmod/annotate_models/variant_annotator.py b/genmod/annotate_models/variant_annotator.py index 99fa71f..c6a43c5 100755 --- a/genmod/annotate_models/variant_annotator.py +++ b/genmod/annotate_models/variant_annotator.py @@ -3,41 +3,48 @@ """ variant_consumer.py -Consumes batches of variants and annotates them. Each batch is a dictionary +Consumes batches of variants and annotates them. Each batch is a dictionary with variant_id:s as keys and dictionaries with variant information. The variants will get different annotations depending on input - + Created by MÃ¥ns Magnusson on 2013-03-01. Copyright (c) 2013 __MyCompanyName__. All rights reserved. """ -from __future__ import (division, print_function, absolute_import) +from __future__ import absolute_import, division, print_function import logging - from multiprocessing import Process from genmod.vcf_tools import get_genotypes -from . import (get_haploblocks, check_genetic_models, get_model_score, - make_print_version) - + +from . import check_genetic_models, get_haploblocks, get_model_score, make_print_version + class VariantAnnotator(Process): """ - Annotates variant in batches from the task queue and puts the result in + Annotates variant in batches from the task queue and puts the result in the results queue. """ - - def __init__(self, task_queue, results_queue, families, individuals, - phased=False, strict=False, vep=False, - reduced_penetrance_genes = set()): + + def __init__( + self, + task_queue, + results_queue, + families, + individuals, + phased=False, + strict=False, + vep=False, + reduced_penetrance_genes=set(), + ): """ Initialize the VariantAnnotator - - Consume variant batches from the task queue, annotate them with the - genetic inheritance patterns that they follow and put them in the + + Consume variant batches from the task queue, annotate them with the + genetic inheritance patterns that they follow and put them in the results queue. - + Arguments: task_queue (Queue) results_queue (Queue) @@ -50,24 +57,23 @@ def __init__(self, task_queue, results_queue, families, individuals, """ Process.__init__(self) self.logger = logging.getLogger(__name__) - + self.proc_name = self.name - - self.logger.info("Setting up variant_annotator: {0}".format( - self.proc_name)) - + + self.logger.info("Setting up variant_annotator: {0}".format(self.proc_name)) + self.logger.debug("Setting up task queue") self.task_queue = task_queue - + self.logger.debug("Setting up results queue") self.results_queue = results_queue - + # The families that should be annotated self.families = families self.logger.debug("Families found: {0}".format(self.families)) self.individuals = individuals self.logger.debug("Individuals found: {0}".format(self.individuals)) - + # Settings for the annotation self.phased = phased self.logger.debug("Setting phased to {0}".format(self.phased)) @@ -76,76 +82,72 @@ def __init__(self, task_queue, results_queue, families, individuals, self.vep = vep self.logger.debug("Setting vep to {0}".format(self.vep)) self.reduced_penetrance = reduced_penetrance_genes - def run(self): """Run the consuming""" - self.logger.info('%s: Starting!' % self.proc_name) + self.logger.info("%s: Starting!" % self.proc_name) # Check if there are any batches in the queue while True: # A batch is a dictionary with varints on the form {variant_id:variant_dict} self.logger.debug("Getting task from task_queue") variant_batch = self.task_queue.get() - + if variant_batch is None: - self.logger.info('No more batches') + self.logger.info("No more batches") self.task_queue.task_done() - self.logger.info('{0}: Exiting'.format(self.proc_name)) + self.logger.info("{0}: Exiting".format(self.proc_name)) break - + # We are now going to check the genetic models for the variants in # the batch - + for variant_id in variant_batch: variant = variant_batch[variant_id] - variant['genotypes'] = get_genotypes(variant, self.individuals) - + variant["genotypes"] = get_genotypes(variant, self.individuals) + # Check if the variant is in a gene with reduced penetrance - if variant.get('annotation', set()).intersection(self.reduced_penetrance): - self.logger.debug("Setting reduced_penetrance to True for"\ - " variant: {0}".format(variant_id)) - - variant['reduced_penetrance'] = True - + if variant.get("annotation", set()).intersection(self.reduced_penetrance): + self.logger.debug( + "Setting reduced_penetrance to True for" " variant: {0}".format(variant_id) + ) + + variant["reduced_penetrance"] = True + if len(variant_batch) > 1: - #If the variant are phased we want to find out which - #haploblocks they belong to for compounds + # If the variant are phased we want to find out which + # haploblocks they belong to for compounds if self.phased: self.logger.debug("Get haploblocks for variant batch") - variant_batch['haploblocks'] = get_haploblocks( - variant_batch, self.individuals - ) - - # We only need to check compound candidates if there is + variant_batch["haploblocks"] = get_haploblocks(variant_batch, self.individuals) + + # We only need to check compound candidates if there is # more than one variant in the batch for variant_id in variant_batch: self.logger.debug("Check compound candidates") variant = variant_batch[variant_id] - - variant['compound_candidate'] = False - - if variant['annotation']: - variant['compound_candidate'] = True + + variant["compound_candidate"] = False + + if variant["annotation"]: + variant["compound_candidate"] = True self.logger.debug("Set compound_candidate to True") - # Check the genetic models for all variants in the batch check_genetic_models( - variant_batch = variant_batch, - families = self.families, - phased = self.phased, - strict = self.strict, - ) + variant_batch=variant_batch, + families=self.families, + phased=self.phased, + strict=self.strict, + ) # # Now we want to make versions of the variants that are ready for printing. for variant_id in variant_batch: variant = make_print_version( - variant=variant_batch[variant_id], - families=self.families + variant=variant_batch[variant_id], families=self.families ) self.logger.debug("Putting variant in results_queue") self.results_queue.put(variant) - + self.task_queue.task_done() - + return diff --git a/genmod/annotate_regions/get_features.py b/genmod/annotate_regions/get_features.py index 5f8ddd1..6592a3d 100755 --- a/genmod/annotate_regions/get_features.py +++ b/genmod/annotate_regions/get_features.py @@ -13,15 +13,16 @@ logger = logging.getLogger(__name__) + def get_region(chrom, start, end, region_trees): """Check if a position overlapps any regions - + Arguments: chrom (str): The chromosome start (int): The start position for the feature end (int): The stop position for the feature region_trees (dict): A dictionary with chromosomes as keys and interval trees as values - + Returns: regions (set): The regions that the variant ovelapps """ diff --git a/genmod/annotate_regions/parse_annotations.py b/genmod/annotate_regions/parse_annotations.py index e00be29..b9cf606 100755 --- a/genmod/annotate_regions/parse_annotations.py +++ b/genmod/annotate_regions/parse_annotations.py @@ -3,7 +3,7 @@ """ annotation_parser.py -This script will parse a file with intervals in .bed format and build +This script will parse a file with intervals in .bed format and build one interval tree for each chromosome. These intervals typically represents genes @@ -16,6 +16,7 @@ Created by MÃ¥ns Magnusson on 2016-12-23. Copyright (c) 2016 __MonsoInc__. All rights reserved. """ + import logging try: @@ -23,42 +24,44 @@ except: import pickle -from intervaltree import (Interval, IntervalTree) +from intervaltree import Interval, IntervalTree logger = logging.getLogger(__name__) + def get_interval(start, stop, value): """Create an interval instance - - Args: - start(int) - stop(int) - value - - Returns: - interval(intervaltree.Interval) - + + Args: + start(int) + stop(int) + value + + Returns: + interval(intervaltree.Interval) + """ interval = Interval(start, stop, value) return interval + def build_region_trees(bed_lines, padding): """Build region trees for each chromosome - - Build a dictionary with chromosomes as keys and interval trees as - values. - - Args: - bed_lines(iterable): An iterable with bed formated lines - padding (int): Defines what should be considered upstream - and downstream variants + + Build a dictionary with chromosomes as keys and interval trees as + values. + + Args: + bed_lines(iterable): An iterable with bed formated lines + padding (int): Defines what should be considered upstream + and downstream variants """ region_trees = {} for region in bed_parser(bed_lines, padding): - chrom = region['chrom'] - start = region['start'] - stop = region['stop'] - symbol = region['symbol'] + chrom = region["chrom"] + start = region["start"] + stop = region["stop"] + symbol = region["symbol"] if chrom not in region_trees: region_trees[chrom] = IntervalTree() @@ -71,14 +74,14 @@ def build_region_trees(bed_lines, padding): def bed_parser(bed_lines, padding=4000): """ Parse a file in the bed format. - + Arguments: bed_lines(iterable): An iterable with bed formated lines - padding (int): Defines what should be considered upstream + padding (int): Defines what should be considered upstream and downstream variants - + Yields: - region(dict): + region(dict): { 'chrom': str, 'start': int, @@ -88,27 +91,27 @@ def bed_parser(bed_lines, padding=4000): """ genes = {} for index, line in enumerate(bed_lines): - if not line.startswith('#') and len(line) > 1: + if not line.startswith("#") and len(line) > 1: line = line.rstrip().split() feature_id = str(index) # Get the coordinates for the region: - chrom = line[0].lstrip('chr') - if chrom == 'MT': + chrom = line[0].lstrip("chr") + if chrom == "MT": feature_start = int(line[1]) feature_stop = int(line[2]) else: feature_start = max(int(line[1]) - padding, 0) feature_stop = int(line[2]) - + # Get the feature id if len(line) > 3: - feature_id = line [3] - + feature_id = line[3] + region = { - 'chrom': chrom, - 'start': feature_start, - 'stop': feature_stop, - 'symbol': feature_id + "chrom": chrom, + "start": feature_start, + "stop": feature_stop, + "symbol": feature_id, } - + yield region diff --git a/genmod/annotate_variants/__init__.py b/genmod/annotate_variants/__init__.py index 2314de8..df8eb69 100644 --- a/genmod/annotate_variants/__init__.py +++ b/genmod/annotate_variants/__init__.py @@ -1,4 +1,3 @@ from __future__ import absolute_import -from .read_tabix_files import (get_frequencies, get_cadd_scores, - get_spidex_score, get_cosmic) +from .read_tabix_files import get_cadd_scores, get_cosmic, get_frequencies, get_spidex_score diff --git a/genmod/annotate_variants/add_annotations.py b/genmod/annotate_variants/add_annotations.py index b00bef5..cb2a08b 100644 --- a/genmod/annotate_variants/add_annotations.py +++ b/genmod/annotate_variants/add_annotations.py @@ -1,122 +1,131 @@ import logging -from genmod.vcf_tools import (add_metadata) +from genmod.vcf_tools import add_metadata logger = logging.getLogger(__name__) + def add_regions(header): """Add region annotations to header""" logger.info("Adding 'Annotation' to vcf header") add_metadata( header, - 'info', - 'Annotation', - annotation_number='.', - entry_type='String', - description='Annotates what feature(s) this variant belongs to.' + "info", + "Annotation", + annotation_number=".", + entry_type="String", + description="Annotates what feature(s) this variant belongs to.", ) return + def add_exac(header): """Add exac annotation to vcf header""" - logger.info("Adding 'EXACAF' to vcf header") + logger.info("Adding 'EXACAF' to vcf header") add_metadata( header, - 'info', - 'EXACAF', - annotation_number='1', - entry_type='Float', - description="Frequency in the ExAC database." + "info", + "EXACAF", + annotation_number="1", + entry_type="Float", + description="Frequency in the ExAC database.", ) return + def add_exac_max(header): """Add exac annotation to vcf header""" - logger.info("Adding 'EXAC_MAX_AF' to vcf header") + logger.info("Adding 'EXAC_MAX_AF' to vcf header") add_metadata( header, - 'info', - 'EXAC_MAX_AF', - annotation_number='1', - entry_type='Float', - description="The max af for ExAC populations." + "info", + "EXAC_MAX_AF", + annotation_number="1", + entry_type="Float", + description="The max af for ExAC populations.", ) return + def add_thousandg(header): """Add thousand genomes annotation to vcf header""" - logger.info("Adding '1000GAF' to vcf header") + logger.info("Adding '1000GAF' to vcf header") add_metadata( header, - 'info', - '1000GAF', - annotation_number='1', - entry_type='Float', - description="Frequency in the 1000G database." + "info", + "1000GAF", + annotation_number="1", + entry_type="Float", + description="Frequency in the 1000G database.", ) return + def add_thousandg_max(header): """Add thousand genomes max annotation to vcf header""" - logger.info("Adding '1000G_MAX_AF' to vcf header") + logger.info("Adding '1000G_MAX_AF' to vcf header") add_metadata( header, - 'info', - '1000G_MAX_AF', - annotation_number='1', - entry_type='Float', - description="The max af for thousand genomes populations." + "info", + "1000G_MAX_AF", + annotation_number="1", + entry_type="Float", + description="The max af for thousand genomes populations.", ) return + def add_spidex(header): """Add spidex annotation to vcf header""" - logger.info("Adding 'SPIDEX' to vcf header") + logger.info("Adding 'SPIDEX' to vcf header") add_metadata( header, - 'info', - 'SPIDEX', - annotation_number='1', - entry_type='Float', - description="Z score from the spidex database." + "info", + "SPIDEX", + annotation_number="1", + entry_type="Float", + description="Z score from the spidex database.", ) return + def add_cadd(header): """Add cadd annotation to vcf header""" - logger.info("Adding 'CADD' to vcf header") + logger.info("Adding 'CADD' to vcf header") add_metadata( header, - 'info', - 'CADD', - annotation_number='1', - entry_type='Integer', - description="The CADD relative score for this alternative." + "info", + "CADD", + annotation_number="1", + entry_type="Integer", + description="The CADD relative score for this alternative.", ) return + def add_cadd_raw(header): """Add cadd annotation to vcf header""" - logger.info("Adding 'CADD' to vcf header") + logger.info("Adding 'CADD' to vcf header") add_metadata( header, - 'info', - 'CADD_raw', - annotation_number='1', - entry_type='Float', - description="The CADD raw score(s) for this alternative(s)." + "info", + "CADD_raw", + annotation_number="1", + entry_type="Float", + description="The CADD raw score(s) for this alternative(s).", ) return + def add_cosmic(header): """Add cosmic annotation to vcf header""" - logger.info("Adding 'COSMIC' to vcf header") + logger.info("Adding 'COSMIC' to vcf header") add_metadata( header, - 'info', - 'COSMIC', - annotation_number='0', - entry_type='Flag', - description="If variant is in COSMIC database." + "info", + "COSMIC", + annotation_number="0", + entry_type="Flag", + description="If variant is in COSMIC database.", ) - return \ No newline at end of file + return diff --git a/genmod/annotate_variants/annotate.py b/genmod/annotate_variants/annotate.py index 388309b..e065882 100644 --- a/genmod/annotate_variants/annotate.py +++ b/genmod/annotate_variants/annotate.py @@ -1,82 +1,86 @@ import logging from genmod.annotate_regions.get_features import get_region -from genmod.annotate_variants.read_tabix_files import (get_frequencies, - get_spidex_score, get_cadd_scores) +from genmod.annotate_variants.read_tabix_files import ( + get_cadd_scores, + get_frequencies, + get_spidex_score, +) logger = logging.getLogger(__name__) + def annotate_variant(variant, annotation_arguments): """Annotate a variant based on what arguments that are passed""" - variant_info = variant.rstrip().split('\t') + variant_info = variant.rstrip().split("\t") chrom = variant_info[0] - if chrom.startswith(('chr', 'CHR', 'Chr')): + if chrom.startswith(("chr", "CHR", "Chr")): chrom = chrom[3:] pos = int(variant_info[1]) ref = variant_info[3] alt = variant_info[4] - + info = variant_info[7] - if info == '.': + if info == ".": info = [] else: - info = info.split(';') - + info = info.split(";") + ## TODO this needs to be handeled different for SV:s start = pos # This is a construct so that there will not be inconsistent genetic regions end = pos + 1 # end = pos + max(len(ref), len(alt)) - - #Check which annotations that are available + + # Check which annotations that are available regions = None - if 'region_trees' in annotation_arguments: - regions = get_region(chrom, start, end, annotation_arguments['region_trees']) + if "region_trees" in annotation_arguments: + regions = get_region(chrom, start, end, annotation_arguments["region_trees"]) if regions: - info.append("Annotation={0}".format(','.join(regions))) - - if 'exac' in annotation_arguments: - reader = annotation_arguments['exac'] + info.append("Annotation={0}".format(",".join(regions))) + + if "exac" in annotation_arguments: + reader = annotation_arguments["exac"] frequencies = get_frequencies(reader, chrom, start, alt) - if 'AF' in frequencies: - info.append("EXACAF={0}".format(frequencies['AF'])) - if annotation_arguments.get('max_af'): - if 'MAX_AF' in frequencies: - info.append("EXAC_MAX_AF={0}".format(frequencies['MAX_AF'])) + if "AF" in frequencies: + info.append("EXACAF={0}".format(frequencies["AF"])) + if annotation_arguments.get("max_af"): + if "MAX_AF" in frequencies: + info.append("EXAC_MAX_AF={0}".format(frequencies["MAX_AF"])) - if 'thousand_g' in annotation_arguments: - reader = annotation_arguments['thousand_g'] + if "thousand_g" in annotation_arguments: + reader = annotation_arguments["thousand_g"] frequencies = get_frequencies(reader, chrom, start, alt) - if 'AF' in frequencies: - info.append("1000GAF={0}".format(frequencies['AF'])) - if annotation_arguments.get('max_af'): - if 'MAX_AF' in frequencies: - info.append("1000G_MAX_AF={0}".format(frequencies['MAX_AF'])) + if "AF" in frequencies: + info.append("1000GAF={0}".format(frequencies["AF"])) + if annotation_arguments.get("max_af"): + if "MAX_AF" in frequencies: + info.append("1000G_MAX_AF={0}".format(frequencies["MAX_AF"])) - if 'spidex' in annotation_arguments: - reader = annotation_arguments['spidex'] + if "spidex" in annotation_arguments: + reader = annotation_arguments["spidex"] spidex_score = get_spidex_score(reader, chrom, start, alt) if spidex_score: info.append("SPIDEX={0}".format(spidex_score)) - if 'cadd_files' in annotation_arguments: - readers = annotation_arguments['cadd_files'] + if "cadd_files" in annotation_arguments: + readers = annotation_arguments["cadd_files"] cadd_scores = {} for reader in readers: if not cadd_scores: cadd_scores = get_cadd_scores(reader, chrom, start, alt) if cadd_scores: - info.append("CADD={0}".format(cadd_scores['cadd_phred'])) - if annotation_arguments.get('cadd_raw'): - info.append("CADD_raw={0}".format(cadd_scores['cadd_raw'])) + info.append("CADD={0}".format(cadd_scores["cadd_phred"])) + if annotation_arguments.get("cadd_raw"): + info.append("CADD_raw={0}".format(cadd_scores["cadd_raw"])) # Rebuild the info string if len(info) > 0: - info_string = ';'.join(info) + info_string = ";".join(info) else: - info_string = '.' - + info_string = "." + variant_info[7] = info_string - - return '\t'.join(variant_info) \ No newline at end of file + + return "\t".join(variant_info) diff --git a/genmod/annotate_variants/read_tabix_files.py b/genmod/annotate_variants/read_tabix_files.py index 3ea06c0..1f27f6b 100644 --- a/genmod/annotate_variants/read_tabix_files.py +++ b/genmod/annotate_variants/read_tabix_files.py @@ -1,33 +1,34 @@ -import os import logging +import os import tabix - from tabix import TabixError logger = logging.getLogger(__name__) + def get_tabixhandle(path): """Check if a file is zipped and that the index exists - If something looks wierd raise a TabixError + If something looks wierd raise a TabixError """ - if not path.endswith('.gz'): + if not path.endswith(".gz"): raise TabixError("File {0} does not end with '.gz'".format(path)) - index_file = path + '.tbi' + index_file = path + ".tbi" if not os.path.isfile(index_file): raise TabixError("No index could be found for {0}".format(path)) - + return tabix.open(path) + def get_tabix_records(tabix_reader, chrom, start): """Get the tabix records for some given coordinates - + Args: tabix_reader (Tabix.reader): A Tabix object chrom (str): The chromosome of the position start (str): The start position of the variant alt (str): The alternative sequence - + Returns: records (Iterable): The overlapping records found """ @@ -36,30 +37,30 @@ def get_tabix_records(tabix_reader, chrom, start): logger.debug("Looking for records with chr:%s, pos:%s" % (chrom, start)) tabix_key = int(start) try: - records = tabix_reader.query(chrom, tabix_key-1, tabix_key) + records = tabix_reader.query(chrom, tabix_key - 1, tabix_key) except TypeError: - records = tabix_reader.query(str(chrom), tabix_key-1, tabix_key) + records = tabix_reader.query(str(chrom), tabix_key - 1, tabix_key) except TabixError: try: - records = tabix_reader.query('chr'+chrom, tabix_key-1, tabix_key) - except TabixError: - logger.info("Chromosome {0} does not seem to exist in {1}".format( - chrom, tabix_reader)) + records = tabix_reader.query("chr" + chrom, tabix_key - 1, tabix_key) + except TabixError: + logger.info("Chromosome {0} does not seem to exist in {1}".format(chrom, tabix_reader)) except: pass return records + def get_frequencies(tabix_reader, chrom, start, alt): """ Return the frequencies from a tabix indexed vcf file. - + Arguments: tabix_reader (Tabix.reader): A Tabix object chrom (str): The chromosome of the position start (str): The start position of the variant alt (str): The alternative sequence - + Returns: frequencies (dict): A dictionary with relevant frequencies """ @@ -68,65 +69,67 @@ def get_frequencies(tabix_reader, chrom, start, alt): frequencies = {} for record in records: - logger.debug("Found record: %s" % '\t'.join(record)) - #We can get multiple rows so need to check each one - #We also need to check each one of the alternatives per row - for i,alternative in enumerate(record[4].split(',')): + logger.debug("Found record: %s" % "\t".join(record)) + # We can get multiple rows so need to check each one + # We also need to check each one of the alternatives per row + for i, alternative in enumerate(record[4].split(",")): if alternative == alt: - for info in record[7].split(';'): - info = info.split('=') - if info[0] in ['AF', 'MAX_AF']: - freqs = info[-1].split(',') + for info in record[7].split(";"): + info = info.split("=") + if info[0] in ["AF", "MAX_AF"]: + freqs = info[-1].split(",") frequencies[info[0]] = freqs[i] return frequencies + def get_spidex_score(tabix_reader, chrom, start, alt): """ Return the record from a spidex file. - + Arguments: tabix_reader (Tabix.reader): A Tabix object chrom (str): The chromosome of the position start (str): The start position of the variant alt (str): The alternative sequence - + Returns: spidex_score float: The spidex z scores for this position - + """ records = get_tabix_records(tabix_reader, chrom, start) spidex_score = None - + for record in records: if record[3] == alt: - #We need to send both cadd values + # We need to send both cadd values spidex_score = float(record[5]) - + logger.debug("Found spidex score: %s" % str(spidex_score)) return spidex_score + def get_cosmic(tabix_reader, chrom, start, alt): """ Return if record exists in cosmic database. - + Arguments: tabix_reader (Tabix.reader): A Tabix object chrom (str): The chromosome of the position start (str): The start position of the variant alt (str): The alternative sequence - + Returns: in_cosmic (bool): If variant is in COSMIC - + """ records = get_tabix_records(tabix_reader, chrom, start) in_cosmic = False - + for record in records: if record[4] == alt: - #We need to send both cadd values + # We need to send both cadd values in_cosmic = True if in_cosmic: logger.debug("Variant was found in COSMIC") @@ -135,28 +138,28 @@ def get_cosmic(tabix_reader, chrom, start, alt): return in_cosmic + def get_cadd_scores(tabix_reader, chrom, start, alt): """ Return the record from a cadd file. - + Arguments: tabix_reader (Tabix.reader): A Tabix object chrom (str): The chromosome of the position start (str): The start position of the variant alternatives (str): The alternative sequence - + Returns: cadd_scores (dict): The cadd scores for this position - + """ cadd_scores = {} - records = get_tabix_records(tabix_reader, chrom, start) + records = get_tabix_records(tabix_reader, chrom, start) # CADD values are only for snps: for record in records: if record[3] == alt: - #We need to send both cadd values - cadd_scores['cadd_raw'] = record[-2] - cadd_scores['cadd_phred'] = record[-1] - - return cadd_scores + # We need to send both cadd values + cadd_scores["cadd_raw"] = record[-2] + cadd_scores["cadd_phred"] = record[-1] + return cadd_scores diff --git a/genmod/annotations/__init__.py b/genmod/annotations/__init__.py index 73f3ba0..889c073 100644 --- a/genmod/annotations/__init__.py +++ b/genmod/annotations/__init__.py @@ -1,7 +1,7 @@ from importlib_resources import files -ensembl_file_37 = 'annotations/ensembl_genes_37.txt.gz' -ensembl_file_38 = 'annotations/ensembl_genes_38.txt.gz' +ensembl_file_37 = "annotations/ensembl_genes_37.txt.gz" +ensembl_file_38 = "annotations/ensembl_genes_38.txt.gz" -ensembl_path_37 = files('genmod').joinpath(ensembl_file_37) -ensembl_path_38 = files('genmod').joinpath(ensembl_file_38) +ensembl_path_37 = files("genmod").joinpath(ensembl_file_37) +ensembl_path_38 = files("genmod").joinpath(ensembl_file_38) diff --git a/genmod/commands/__init__.py b/genmod/commands/__init__.py index a8015b2..578fdc2 100644 --- a/genmod/commands/__init__.py +++ b/genmod/commands/__init__.py @@ -1,8 +1,8 @@ from __future__ import absolute_import -from .genmod_sort import sort as sort_command from .annotate_models import models as models_command -from .score_variants import score as score_command -from .score_compounds import compound as score_compounds_command from .annotate_variant import annotate as annotate_variant_command -from .filter_variants import filter as filter_command \ No newline at end of file +from .filter_variants import filter as filter_command +from .genmod_sort import sort as sort_command +from .score_compounds import compound as score_compounds_command +from .score_variants import score as score_command diff --git a/genmod/commands/analyze.py b/genmod/commands/analyze.py index 0f9380f..ebfef21 100755 --- a/genmod/commands/analyze.py +++ b/genmod/commands/analyze.py @@ -11,8 +11,9 @@ from __future__ import print_function -import sys import os +import sys + import click try: @@ -27,167 +28,170 @@ from genmod.utils import print_headers -# This is an ad hoc solution to remove huge mostly uninteresting genes. +# This is an ad hoc solution to remove huge mostly uninteresting genes. # Please modify this set for your own needs -PROBLEMATIC_GENES = set(['MIR6077-1', - 'MIR6077-2', - 'MIR4315-1', - 'MIR4315-2', - 'LINC00623', - 'LINC00869', - 'NBPF8', - 'NBPF9', - 'NBPF20', - 'PPIAL4A', - 'PPIAL4B', - 'PPIAL4C', - 'PDE4DIP', - 'LOC100132057', - 'LOC100288162', - 'SRGAP2D', - 'FAM272C', - 'SNAR-A3', - 'SNAR-A4', - 'SNAR-A5', - 'SNAR-A6', - 'SNAR-A7', - 'SNAR-A8', - 'SNAR-A9', - 'SNAR-A10', - 'SNAR-A11', - 'SNAR-A14', - 'GLUD1P7', - ]) +PROBLEMATIC_GENES = set( + [ + "MIR6077-1", + "MIR6077-2", + "MIR4315-1", + "MIR4315-2", + "LINC00623", + "LINC00869", + "NBPF8", + "NBPF9", + "NBPF20", + "PPIAL4A", + "PPIAL4B", + "PPIAL4C", + "PDE4DIP", + "LOC100132057", + "LOC100288162", + "SRGAP2D", + "FAM272C", + "SNAR-A3", + "SNAR-A4", + "SNAR-A5", + "SNAR-A6", + "SNAR-A7", + "SNAR-A8", + "SNAR-A9", + "SNAR-A10", + "SNAR-A11", + "SNAR-A14", + "GLUD1P7", + ] +) + def check_families(variant_file): """Loop through the vcf file and check which families that are found.""" families = set([]) - if variant_file == '-': - variant_parser = VCFParser(fsock = sys.stdin) + if variant_file == "-": + variant_parser = VCFParser(fsock=sys.stdin) else: - variant_parser = VCFParser(infile = variant_file) + variant_parser = VCFParser(infile=variant_file) for variant in variant_parser: - genetic_models = variant['info_dict'].get('GeneticModels', None) + genetic_models = variant["info_dict"].get("GeneticModels", None) if genetic_models: for family_models in genetic_models: - family = family_models.split(':')[0] + family = family_models.split(":")[0] families.add(family) return families -def print_results(variant_dict, - outfile, - vcf_header, - family_id, - score_key='CADD', - freq_key='1000G_freq', - mode = 'homozygote', - silent=False): +def print_results( + variant_dict, + outfile, + vcf_header, + family_id, + score_key="CADD", + freq_key="1000G_freq", + mode="homozygote", + silent=False, +): """Print the variants to a results file or stdout.""" - - score_dict = {} # A dictionary with {variant_id: score}. Score is usually cadd score or rank score + + score_dict = {} # A dictionary with {variant_id: score}. Score is usually cadd score or rank score # for variant_id, variant in sorted(variant_dict.items(), key = lambda sort_key: float(sort_key[1]['info_dict'].get('CADD', '0')), reverse=True): column_width = 12 length_of_output = 20 for variant_id in variant_dict: # Get the score for each variant: max_score = max( - [ - float(score) for score in - variant_dict[variant_id]['info_dict'].get( - score_key, - '0') - ] - ) - if mode == 'compound': + [float(score) for score in variant_dict[variant_id]["info_dict"].get(score_key, "0")] + ) + if mode == "compound": # If we look at compounds we want to consider the combined score - family_compounds = compound_dict[variant_id]['info_dict'].get('Compounds', None) + family_compounds = compound_dict[variant_id]["info_dict"].get("Compounds", None) if compounds: for family in family_compounds: - splitted_compounds = family.split(':') + splitted_compounds = family.split(":") if splitted_compounds[0] == family_id: - compounds = splitted_compounds[1].split('|') - + compounds = splitted_compounds[1].split("|") + for variant_2_id in compounds: if variant_2_id in variant_dict: max_score_2 = max( - [ - float(score) for score in - variant_dict[variant_2_id]['info_dict'].get( - score_key, - '0') - ] - ) + [ + float(score) + for score in variant_dict[variant_2_id]["info_dict"].get( + score_key, "0" + ) + ] + ) if max_score_2 > 10: # print(variant_dict[variant_2_id]) variant_pair = (variant_id, variant_2_id) - max_score = (max_score + max_score_2)/2 + max_score = (max_score + max_score_2) / 2 already_scored = [set(var_pair) for var_pair in list(score_dict.keys())] if set(variant_pair) not in already_scored: score_dict[variant_pair] = max_score else: score_dict[variant_id] = max_score - - if mode == 'compound': - print('\nCompound analysis:\n') - if mode == 'dominant': - print('\nDominant analysis:\n') - if mode == 'homozygote': - print('\nHomozygote analysis:\n') - if mode == 'denovo': - print('\nDe novo analysis:\n') - if mode == 'xlinked': - print('\nX-linked analysis:\n') - header = ['Chrom', - 'Position', - 'Reference', - 'Alternative', - 'Cadd score', - '1000GMAF', - 'Annotation' - ] - - print(''.join(word.ljust(column_width) for word in header)) - + + if mode == "compound": + print("\nCompound analysis:\n") + if mode == "dominant": + print("\nDominant analysis:\n") + if mode == "homozygote": + print("\nHomozygote analysis:\n") + if mode == "denovo": + print("\nDe novo analysis:\n") + if mode == "xlinked": + print("\nX-linked analysis:\n") + header = [ + "Chrom", + "Position", + "Reference", + "Alternative", + "Cadd score", + "1000GMAF", + "Annotation", + ] + + print("".join(word.ljust(column_width) for word in header)) + i = 0 - - with open(outfile , mode='a', encoding='utf-8') as f: + + with open(outfile, mode="a", encoding="utf-8") as f: for variant_id in sorted(score_dict, key=score_dict.get, reverse=True): - if mode == 'compound': + if mode == "compound": if i < length_of_output: - print('Pair %s' % (i+1)) + print("Pair %s" % (i + 1)) for compound_id in variant_id: - print_line = [variant_dict[compound_id]['CHROM'], - variant_dict[compound_id]['POS'], - variant_dict[compound_id]['REF'], - variant_dict[compound_id]['ALT'], - variant_dict[compound_id]['info_dict'].get(score_key, '-'), - variant_dict[compound_id]['info_dict'].get(freq_key, '-'), - variant_dict[compound_id]['info_dict'].get('Annotation', '-') - ] + print_line = [ + variant_dict[compound_id]["CHROM"], + variant_dict[compound_id]["POS"], + variant_dict[compound_id]["REF"], + variant_dict[compound_id]["ALT"], + variant_dict[compound_id]["info_dict"].get(score_key, "-"), + variant_dict[compound_id]["info_dict"].get(freq_key, "-"), + variant_dict[compound_id]["info_dict"].get("Annotation", "-"), + ] if i < length_of_output: - print(''.join(word.ljust(column_width) for word in print_line)) - print_line = [variant_dict[compound_id].get(entry, '-') for entry in vcf_header] - f.write('\t'.join(print_line)+'\n') + print("".join(word.ljust(column_width) for word in print_line)) + print_line = [variant_dict[compound_id].get(entry, "-") for entry in vcf_header] + f.write("\t".join(print_line) + "\n") else: - print_line = [variant_dict[variant_id]['CHROM'], - variant_dict[variant_id]['POS'], - variant_dict[variant_id]['REF'], - variant_dict[variant_id]['ALT'], - variant_dict[variant_id]['info_dict'].get(score_key, ['-'])[0], - variant_dict[variant_id]['info_dict'].get(freq_key, ['-'])[0], - variant_dict[variant_id]['info_dict'].get('Annotation', ['-'])[0] - ] + print_line = [ + variant_dict[variant_id]["CHROM"], + variant_dict[variant_id]["POS"], + variant_dict[variant_id]["REF"], + variant_dict[variant_id]["ALT"], + variant_dict[variant_id]["info_dict"].get(score_key, ["-"])[0], + variant_dict[variant_id]["info_dict"].get(freq_key, ["-"])[0], + variant_dict[variant_id]["info_dict"].get("Annotation", ["-"])[0], + ] # Print the highest ranked variants to screen: if i < length_of_output: - print(''.join(word.ljust(column_width) for word in print_line)) - print_line = [variant_dict[variant_id].get(entry, '-') for entry in vcf_header] - f.write('\t'.join(print_line)+'\n') + print("".join(word.ljust(column_width) for word in print_line)) + print_line = [variant_dict[variant_id].get(entry, "-") for entry in vcf_header] + f.write("\t".join(print_line) + "\n") i += 1 - - return + return ### This is for analyzing the variants ### @@ -195,42 +199,41 @@ def print_results(variant_dict, def make_models(list_of_models): """Make a dictionary of the prefered models. - If no models are specified all are considered interesting.""" + If no models are specified all are considered interesting.""" model_set = set() # If no models are specified we allow all models if len(list_of_models) == 0: - list_of_models = ['AR', 'AD', 'X'] - + list_of_models = ["AR", "AD", "X"] + for model in list_of_models: - if 'AR' in model: - model_set.add('AR_hom') - model_set.add('AR_hom_dn') - model_set.add('AR_comp') - model_set.add('AR_comp_dn') - if 'AD' in model: - model_set.add('AD') - model_set.add('AD_dn') - if 'X' in model: - model_set.add('XR') - model_set.add('XR_dn') - model_set.add('XD') - model_set.add('XD_dn') + if "AR" in model: + model_set.add("AR_hom") + model_set.add("AR_hom_dn") + model_set.add("AR_comp") + model_set.add("AR_comp_dn") + if "AD" in model: + model_set.add("AD") + model_set.add("AD_dn") + if "X" in model: + model_set.add("XR") + model_set.add("XR_dn") + model_set.add("XD") + model_set.add("XD_dn") return model_set - def remove_inacurate_compounds(compound_dict, family_id): """If the second variant in a compound pair does not meet the requirements they should not be considered.""" - + for variant_id in list(compound_dict.keys()): # Get the compounds for the variant - family_compounds = compound_dict[variant_id]['info_dict'].get('Compounds', None) + family_compounds = compound_dict[variant_id]["info_dict"].get("Compounds", None) if compounds: - for family in family_compounds.split(','): - splitted_compounds = family.split(':') + for family in family_compounds.split(","): + splitted_compounds = family.split(":") if splitted_compounds[0] == family_id: - compounds = splitted_compounds[1].split('|') - compound_set = set(compounds) + compounds = splitted_compounds[1].split("|") + compound_set = set(compounds) for compound in compounds: # If requrements are not met it has never been placed in compound dict if compound not in compound_dict: @@ -241,89 +244,82 @@ def remove_inacurate_compounds(compound_dict, family_id): return -def covered_in_all(variant, coverage_treshold = 7): +def covered_in_all(variant, coverage_treshold=7): """Check if the variant is covered in all individuals.""" - for individual in variant['genotypes']: - if variant['genotypes'][individual].quality_depth < coverage_treshold: + for individual in variant["genotypes"]: + if variant["genotypes"][individual].quality_depth < coverage_treshold: return False return True - -def get_interesting_variants(variant_parser, family_id, dominant_dict, - homozygote_dict, compound_dict, x_linked_dict, dominant_dn_dict, - freq_treshold, freq_keyword, cadd_treshold, cadd_keyword, gq_treshold, - coverage, exclude_problematic): + +def get_interesting_variants( + variant_parser, + family_id, + dominant_dict, + homozygote_dict, + compound_dict, + x_linked_dict, + dominant_dn_dict, + freq_treshold, + freq_keyword, + cadd_treshold, + cadd_keyword, + gq_treshold, + coverage, + exclude_problematic, +): """Collect the interesting variants in their dictionarys. add RankScore.""" - - inheritance_keyword = 'GeneticModels' - - - de_novo_set = set(['AD_dn', 'AR_hom_dn', 'AR_comp_dn', 'XD_dn', 'XR_dn']) - dominant_set = set(['AD']) - homozygote_set = set(['AR_hom']) - compound_set = set(['AR_comp']) - x_linked_set = set(['XD', 'XR']) - dominant_dn_set = set(['AD_dn']) - - + + inheritance_keyword = "GeneticModels" + + de_novo_set = set(["AD_dn", "AR_hom_dn", "AR_comp_dn", "XD_dn", "XR_dn"]) + dominant_set = set(["AD"]) + homozygote_set = set(["AR_hom"]) + compound_set = set(["AR_comp"]) + x_linked_set = set(["XD", "XR"]) + dominant_dn_set = set(["AD_dn"]) + for variant in variant_parser: - annotation = set(variant['info_dict'].get('Annotation', '')) + annotation = set(variant["info_dict"].get("Annotation", "")) models_found = set([]) - - family_models = variant['info_dict'].get(inheritance_keyword, None) + + family_models = variant["info_dict"].get(inheritance_keyword, None) if family_models: - #This is a string on the form 'fam_1:AR_hom,fam_2:AR_hom|AR_hom_dn + # This is a string on the form 'fam_1:AR_hom,fam_2:AR_hom|AR_hom_dn for family_info in family_models: - splitted_family = family_info.split(':') + splitted_family = family_info.split(":") if splitted_family[0] == family_id: - models_found = set(splitted_family[1].split('|')) - - maf = min( - [ - float(frequency) for frequency in - variant['info_dict'].get( - freq_keyword, - '0' - ) - ] - ) - cadd_score = max( - [ - float(cscore) for cscore in - variant['info_dict'].get( - cadd_keyword, - '0' - ) - ] - ) - - variant_id = variant['variant_id'] - + models_found = set(splitted_family[1].split("|")) + + maf = min([float(frequency) for frequency in variant["info_dict"].get(freq_keyword, "0")]) + cadd_score = max([float(cscore) for cscore in variant["info_dict"].get(cadd_keyword, "0")]) + + variant_id = variant["variant_id"] + # There is a list of huge genes that becomes problematic when analysing single individuals - + interesting = True - + if not models_found: interesting = False - + if exclude_problematic: if annotation.intersection(PROBLEMATIC_GENES): interesting = False - + # if not covered_in_all(variant, coverage): # interesting = False - - if not variant['FILTER'] == 'PASS': + + if not variant["FILTER"] == "PASS": interesting = False - - if not float(variant['QUAL']) > gq_treshold: + + if not float(variant["QUAL"]) > gq_treshold: interesting = False - - + if interesting: # Check if cadd score is available: if cadd_score > cadd_treshold: - # Check if MAF is below treshold: + # Check if MAF is below treshold: if maf < freq_treshold: # First we look at the variants that are not dn: if models_found.intersection(dominant_set): @@ -336,273 +332,274 @@ def get_interesting_variants(variant_parser, family_id, dominant_dict, x_linked_dict[variant_id] = variant if models_found.intersection(dominant_dn_set): dominant_dn_dict[variant_id] = variant - + return @click.command() -@click.argument('variant_file', - nargs=1, - type=click.Path(exists=True), - metavar=' or "-"' -) -@click.option('-t' ,'--family_type', - type=click.Choice(['ped', 'alt', 'cmms', 'mip']), - default='ped', - help="""If the analysis use one of the known setups, - please specify which one.""" +@click.argument("variant_file", nargs=1, type=click.Path(exists=True), metavar=' or "-"') +@click.option( + "-t", + "--family_type", + type=click.Choice(["ped", "alt", "cmms", "mip"]), + default="ped", + help="""If the analysis use one of the known setups, + please specify which one.""", ) # @click.option('-c', '--config_file', # type=click.Path(exists=True), # help="""Specify the path to a config file.""" # ) -@click.option('--frequency_treshold', '-freq', - default=0.02, - nargs=1, - help="""Specify maf treshold for variants to be considered. - Default 0.02""" +@click.option( + "--frequency_treshold", + "-freq", + default=0.02, + nargs=1, + help="""Specify maf treshold for variants to be considered. + Default 0.02""", ) -@click.option('--frequency_keyword', '-freqkey', - default='1000G_freq', - nargs=1, - help="""Specify keyword for frequency in vcf. - Default 1000G_freq""" +@click.option( + "--frequency_keyword", + "-freqkey", + default="1000G_freq", + nargs=1, + help="""Specify keyword for frequency in vcf. + Default 1000G_freq""", ) -@click.option('--cadd_treshold', '-cadd', - default=12.0, - nargs=1, - help="""Specify the cadd treshold for variants to be - considered. Default 12.0""" +@click.option( + "--cadd_treshold", + "-cadd", + default=12.0, + nargs=1, + help="""Specify the cadd treshold for variants to be + considered. Default 12.0""", ) -@click.option('--cadd_keyword', '-caddkey', - default='CADD', - nargs=1, - help="""Specify keyword for CADD scores in vcf. - Default CADD""" +@click.option( + "--cadd_keyword", + "-caddkey", + default="CADD", + nargs=1, + help="""Specify keyword for CADD scores in vcf. + Default CADD""", ) -@click.option('--coverage', '-cov', - default=7, - nargs=1, - help="""Specify minimum read depth in all individuals for - variant to be considered. Default 7""" +@click.option( + "--coverage", + "-cov", + default=7, + nargs=1, + help="""Specify minimum read depth in all individuals for + variant to be considered. Default 7""", ) -@click.option('--gq_treshold', '-gq', - default=20, - nargs=1, - help="""Specify genotype quality treshold for variants - to be considered. Default 20.""" +@click.option( + "--gq_treshold", + "-gq", + default=20, + nargs=1, + help="""Specify genotype quality treshold for variants + to be considered. Default 20.""", ) # @click.option('-p', '--patterns', # type=click.Choice(['AR', 'AD', 'X']), # multiple=True, # help='Specify the inheritance patterns. Default is all patterns' # ) -@click.option('-o', '--outdir', - type=click.Path(exists=True), - default=os.getcwd(), - help="""Specify the path to a directory where results - should be stored. Default is ./""" +@click.option( + "-o", + "--outdir", + type=click.Path(exists=True), + default=os.getcwd(), + help="""Specify the path to a directory where results + should be stored. Default is ./""", ) -@click.option('-s', '--silent', - is_flag=True, - help='Do not output variants.' +@click.option("-s", "--silent", is_flag=True, help="Do not output variants.") +@click.option( + "-exclude", + "--exclude_problematic", + is_flag=True, + help="""Exclude problematic genes. This flag is preferable + if analysis of only one individual.""", ) -@click.option('-exclude', '--exclude_problematic', - is_flag=True, - help="""Exclude problematic genes. This flag is preferable - if analysis of only one individual.""" -) -@click.option('-v', '--verbose', - is_flag=True, - help='Increase output verbosity.' -) -def analyze(variant_file, family_type, frequency_treshold, frequency_keyword, - cadd_treshold, cadd_keyword, coverage, gq_treshold, outdir, silent, - exclude_problematic, verbose): - """Analyze the annotated variants in a VCF file. - - If there are multiple families in the ped one analysis per family will - be done. The variants are analyzed in five different categories based - on what inheritance patterns that are followed. - The differen analysies are: - - AR compound\n - AR homozygote\n - Dominant\n - X linked\n - Dominant dn\n - - Which variants to be considered are specified in the command line. - Defaults are (based on a rare disease assumption): - - MAF < 0.02\n - CADD score > 12\n - Coverage in all individuals > 7\n - Call quality > 20\n - - The highest scoring variants of each category is printed to screen. - The full list of each category is printed to new vcf files in a - directory specified by the user. Default current dir. - File names are the same like the input vcf with the name of the - analysis appended. - +@click.option("-v", "--verbose", is_flag=True, help="Increase output verbosity.") +def analyze( + variant_file, + family_type, + frequency_treshold, + frequency_keyword, + cadd_treshold, + cadd_keyword, + coverage, + gq_treshold, + outdir, + silent, + exclude_problematic, + verbose, +): + """Analyze the annotated variants in a VCF file. + + If there are multiple families in the ped one analysis per family will + be done. The variants are analyzed in five different categories based + on what inheritance patterns that are followed. + The differen analysies are: + + AR compound\n + AR homozygote\n + Dominant\n + X linked\n + Dominant dn\n + + Which variants to be considered are specified in the command line. + Defaults are (based on a rare disease assumption): + + MAF < 0.02\n + CADD score > 12\n + Coverage in all individuals > 7\n + Call quality > 20\n + + The highest scoring variants of each category is printed to screen. + The full list of each category is printed to new vcf files in a + directory specified by the user. Default current dir. + File names are the same like the input vcf with the name of the + analysis appended. + """ - + start_time_analysis = datetime.now() - - # configs = ConfigObj(config_file) + + # configs = ConfigObj(config_file) # prefered_models = make_models([]) - - inheritance_keyword = 'GeneticModels' + + inheritance_keyword = "GeneticModels" families = check_families(variant_file) file_name = os.path.splitext(os.path.split(variant_file)[-1])[0] - - + # if config_file: # frequency_treshold = float(configs.get('frequency', {}).get('rare', frequency_treshold)) # freq_keyword = configs.get('frequency', {}).get('keyword', freq_keyword) # inheritance_patterns = [pattern for pattern in configs.get('inheritance', {}).get('patterns',[])] # inheritance_keyword = configs.get('inheritance', {}).get('keyword',inheritance_keyword) # prefered_models = make_models(inheritance_patterns) - - if variant_file == '-': - variant_parser = VCFParser(fsock = sys.stdin) + + if variant_file == "-": + variant_parser = VCFParser(fsock=sys.stdin) else: - variant_parser = VCFParser(infile = variant_file) - + variant_parser = VCFParser(infile=variant_file) + for family_id in families: - print('Analysis for family: %s' % family_id) - + print("Analysis for family: %s" % family_id) + head = variant_parser.metadata - + dominant_dict = {} homozygote_dict = {} compound_dict = {} x_linked_dict = {} dominant_dn_dict = {} - - - get_interesting_variants(variant_parser, - family_id, - dominant_dict, - homozygote_dict, - compound_dict, - x_linked_dict, - dominant_dn_dict, - frequency_treshold, - frequency_keyword, - cadd_treshold, - cadd_keyword, - gq_treshold, - coverage, - exclude_problematic) - + + get_interesting_variants( + variant_parser, + family_id, + dominant_dict, + homozygote_dict, + compound_dict, + x_linked_dict, + dominant_dn_dict, + frequency_treshold, + frequency_keyword, + cadd_treshold, + cadd_keyword, + gq_treshold, + coverage, + exclude_problematic, + ) + remove_inacurate_compounds(compound_dict, family_id) - + if len(dominant_dict) > 0: - dominant_file = os.path.join( - outdir, - file_name+'_dominant_analysis.vcf' - ) - + dominant_file = os.path.join(outdir, file_name + "_dominant_analysis.vcf") + print_headers(head, dominant_file) - + print_results( - dominant_dict, - dominant_file, - family_id, - variant_parser.header, - cadd_keyword, - frequency_keyword, - mode='dominant' - ) - + dominant_dict, + dominant_file, + family_id, + variant_parser.header, + cadd_keyword, + frequency_keyword, + mode="dominant", + ) + if len(homozygote_dict) > 0: - homozygote_file = os.path.join( - outdir, - file_name+'_homozygote_analysis.vcf' - ) + homozygote_file = os.path.join(outdir, file_name + "_homozygote_analysis.vcf") print_headers(head, homozygote_file) - + print_results( - homozygote_dict, - homozygote_file, - family_id, - variant_parser.header, - cadd_keyword, - frequency_keyword, - mode='homozygote' - ) - + homozygote_dict, + homozygote_file, + family_id, + variant_parser.header, + cadd_keyword, + frequency_keyword, + mode="homozygote", + ) + if len(compound_dict) > 0: - compound_file = os.path.join( - outdir, - file_name+'_compound_analysis.vcf' - ) + compound_file = os.path.join(outdir, file_name + "_compound_analysis.vcf") print_headers(head, compound_file) - + print_results( - compound_dict, - compound_file, - family_id, - variant_parser.header, - cadd_keyword, - frequency_keyword, - mode='compound' - ) - + compound_dict, + compound_file, + family_id, + variant_parser.header, + cadd_keyword, + frequency_keyword, + mode="compound", + ) + if len(x_linked_dict) > 0: - xlinked_file = os.path.join( - outdir, - file_name+'_x_linked_analysis.vcf' - ) + xlinked_file = os.path.join(outdir, file_name + "_x_linked_analysis.vcf") print_headers(head, xlinked_file) - + print_results( - x_linked_dict, - xlinked_file, - family_id, - variant_parser.header, - cadd_keyword, - frequency_keyword, - mode='xlinked' - ) - + x_linked_dict, + xlinked_file, + family_id, + variant_parser.header, + cadd_keyword, + frequency_keyword, + mode="xlinked", + ) + if len(dominant_dn_dict) > 0: - dominant_dn_file = os.path.join( - outdir, - file_name+'_ad_denovo_analysis.vcf' - ) + dominant_dn_file = os.path.join(outdir, file_name + "_ad_denovo_analysis.vcf") print_headers(head, dominant_dn_file) - + print_results( - dominant_dn_dict, - dominant_dn_file, - family_id, - variant_parser.header, - cadd_keyword, - frequency_keyword, - mode='denovo' - ) - - print('') - - print('Number of interesting Dominant variants: %s' % - len(dominant_dict)) - print('Number of interesting Homozygote variants: %s' % - len(homozygote_dict)) - print('Number of interesting Compound variants: %s' % - len(compound_dict)) - print('Number of interesting X-linked variants: %s' % - len(x_linked_dict)) - print('Number of interesting Autosomal Dominant de novo variants: %s' % - len(dominant_dn_dict)) - + dominant_dn_dict, + dominant_dn_file, + family_id, + variant_parser.header, + cadd_keyword, + frequency_keyword, + mode="denovo", + ) + + print("") + + print("Number of interesting Dominant variants: %s" % len(dominant_dict)) + print("Number of interesting Homozygote variants: %s" % len(homozygote_dict)) + print("Number of interesting Compound variants: %s" % len(compound_dict)) + print("Number of interesting X-linked variants: %s" % len(x_linked_dict)) + print( + "Number of interesting Autosomal Dominant de novo variants: %s" % len(dominant_dn_dict) + ) + # pp(compound_dict) - - print('Time for analysis: %s' % str(datetime.now() - start_time_analysis)) + + print("Time for analysis: %s" % str(datetime.now() - start_time_analysis)) # print_headers(variant_parser.metadata, outfile=outfile) - + # dominant_results = NamedTemporaryFile(delete=False) # dominant_results.close() # @@ -612,8 +609,7 @@ def analyze(variant_file, family_type, frequency_treshold, frequency_keyword, # print(dominant_results) # print(outfile) # print_variants(dominant_results.name, outfile, silent) - - -if __name__ == '__main__': - analyze() \ No newline at end of file + +if __name__ == "__main__": + analyze() diff --git a/genmod/commands/annotate_models.py b/genmod/commands/annotate_models.py index 18746e3..23a8b8a 100755 --- a/genmod/commands/annotate_models.py +++ b/genmod/commands/annotate_models.py @@ -11,78 +11,99 @@ Copyright (c) 2015 __MoonsoInc__. All rights reserved. """ -from __future__ import (print_function) +from __future__ import print_function -import sys -import os -import click import inspect +import itertools import logging +import os import shutil -import itertools - -from multiprocessing import JoinableQueue, Manager, cpu_count, util +import sys from codecs import open from datetime import datetime +from multiprocessing import JoinableQueue, Manager, cpu_count, util from tempfile import NamedTemporaryFile +import click from ped_parser import FamilyParser -from genmod import (__version__) +from genmod import __version__ +from genmod.annotate_models import VariantAnnotator +from genmod.utils import VariantPrinter, check_individuals, get_batches +from genmod.vcf_tools import HeaderParser, add_metadata, print_headers, print_variant, sort_variants -from genmod.utils import (get_batches, VariantPrinter, check_individuals) -from genmod.annotate_models import (VariantAnnotator) -from genmod.vcf_tools import (add_metadata, print_headers, sort_variants, -print_variant, HeaderParser) - -from .utils import (temp_dir, silent, outfile, processes, variant_file, - family_file, family_type, get_file_handle) +from .utils import ( + family_file, + family_type, + get_file_handle, + outfile, + processes, + silent, + temp_dir, + variant_file, +) logger = logging.getLogger(__name__) util.abstract_sockets_supported = False -@click.command('models', short_help="Annotate inheritance") + +@click.command("models", short_help="Annotate inheritance") @variant_file @family_file @family_type -@click.option('-r', '--reduced_penetrance','--reduced-penetrance', - nargs=1, - type=click.File('r'), - metavar='', - help='File with gene ids that have reduced penetrance.' +@click.option( + "-r", + "--reduced_penetrance", + "--reduced-penetrance", + nargs=1, + type=click.File("r"), + metavar="", + help="File with gene ids that have reduced penetrance.", ) -@click.option('--vep', - is_flag=True, - help='If variants are annotated with the Variant Effect Predictor.' +@click.option( + "--vep", is_flag=True, help="If variants are annotated with the Variant Effect Predictor." ) -@click.option('--phased', - is_flag=True, - help='If data is phased use this flag.' +@click.option("--phased", is_flag=True, help="If data is phased use this flag.") +@click.option( + "-s", + "--strict", + is_flag=True, + help="If strict model annotations should be used(see documentation).", ) -@click.option('-s' ,'--strict', - is_flag=True, - help='If strict model annotations should be used(see documentation).' -) -@click.option('-w' ,'--whole_gene','--whole-gene', - is_flag=True, - help='DEPRECATED FLAG - on by default' +@click.option( + "-w", "--whole_gene", "--whole-gene", is_flag=True, help="DEPRECATED FLAG - on by default" ) @processes @silent -@click.option('-k' ,'--keyword', - default="Annotation", - help="""What annotation keyword that should be used when - searching for features.""" +@click.option( + "-k", + "--keyword", + default="Annotation", + help="""What annotation keyword that should be used when + searching for features.""", ) @outfile @temp_dir @click.pass_context -def models(context, variant_file, family_file, family_type, reduced_penetrance, - vep, keyword, phased, strict, silent, processes, outfile, - temp_dir, whole_gene): +def models( + context, + variant_file, + family_file, + family_type, + reduced_penetrance, + vep, + keyword, + phased, + strict, + silent, + processes, + outfile, + temp_dir, + whole_gene, +): """ - Annotate genetic models for vcf variants. - + Annotate genetic models for vcf variants. + Checks what patterns of inheritance that are followed in a VCF file. The analysis is family based so each family that are specified in the family file and exists in the variant file will get it's own annotation. @@ -93,44 +114,35 @@ def models(context, variant_file, family_file, family_type, reduced_penetrance, ######### This is for logging the command line string ######### frame = inspect.currentframe() args, _, _, values = inspect.getargvalues(frame) - argument_list = [ - i+'='+str(values[i]) for i in values if values[i] and - i not in ['frame'] - ] - + argument_list = [i + "=" + str(values[i]) for i in values if values[i] and i not in ["frame"]] + variant_file = get_file_handle(variant_file) ########################################################################### - + logger.info("Running GENMOD annotate models version {0}".format(__version__)) - logger.debug("Arguments: {0}".format(', '.join(argument_list))) - + logger.debug("Arguments: {0}".format(", ".join(argument_list))) + reduced_penetrance_genes = set() nr_reduced_penetrance_genes = 0 if reduced_penetrance: logger.info("Found file with genes that have reduced penetrance") for line in reduced_penetrance: - if not line.startswith('#'): + if not line.startswith("#"): nr_reduced_penetrance_genes += 1 gene_id = line.rstrip().split()[0] - logger.debug("Adding gene {0} to reduced penetrance genes".format( - gene_id - )) - reduced_penetrance_genes.add( - gene_id - ) - - logger.info("Found {0} genes with reduced penetrance".format( - nr_reduced_penetrance_genes)) - - + logger.debug("Adding gene {0} to reduced penetrance genes".format(gene_id)) + reduced_penetrance_genes.add(gene_id) + + logger.info("Found {0} genes with reduced penetrance".format(nr_reduced_penetrance_genes)) + if not family_file: logger.warning("Please provide a family file with -f/--family_file") context.abort() - + logger.info("Setting up a family parser") family_parser = FamilyParser(family_file, family_type) logger.debug("Family parser done") - + families = {} logger.info("Check if the familys have any affected") for family_id in family_parser.families: @@ -140,42 +152,46 @@ def models(context, variant_file, family_file, family_type, reduced_penetrance, ind_obj = family_obj.individuals[ind_id] if ind_obj.affected: found_affected = True - + if found_affected: families[family_id] = family_obj else: - logger.warning("No affected individuals found for family {0}."\ - " Skipping family.".format(family_id)) - + logger.warning( + "No affected individuals found for family {0}." " Skipping family.".format( + family_id + ) + ) + if not families: logger.warning("Please provide at least one family with affected individuals") context.abort() # The individuals in the ped file must be present in the variant file: - logger.info("Families used in analysis: {0}".format( - ','.join(list(families.keys())))) - logger.info("Individuals included in analysis: {0}".format( - ','.join(list(family_parser.individuals.keys())))) - - + logger.info("Families used in analysis: {0}".format(",".join(list(families.keys())))) + logger.info( + "Individuals included in analysis: {0}".format( + ",".join(list(family_parser.individuals.keys())) + ) + ) + head = HeaderParser() - + for line in variant_file: line = line.rstrip() - if line.startswith('#'): - if line.startswith('##'): + if line.startswith("#"): + if line.startswith("##"): head.parse_meta_data(line) else: head.parse_header_line(line) else: break - - #Add the first variant to the iterator - if not line.startswith('#'): + + # Add the first variant to the iterator + if not line.startswith("#"): variant_file = itertools.chain([line], variant_file) else: print_headers(head=head, outfile=outfile, silent=silent) sys.exit(0) - + if vep: if not "CSQ" in head.info_dict: logger.warning("vep flag is used but there is no CSQ field specified in header") @@ -190,77 +206,75 @@ def models(context, variant_file, family_file, family_type, reduced_penetrance, context.abort() else: logger.info("Using {0} annotation".format(keyword)) - - + if "GeneticModels" in head.info_dict: - logger.warning("Genetic models are already annotated according to vcf"\ - " header.") + logger.warning("Genetic models are already annotated according to vcf" " header.") context.abort() - + logger.info("Adding genmod version to vcf header") head.add_version_tracking( - info_id='genmod', - version=__version__, - date=datetime.now().strftime("%Y-%m-%d %H:%M"), - command_line=' '.join(argument_list) - ) - + info_id="genmod", + version=__version__, + date=datetime.now().strftime("%Y-%m-%d %H:%M"), + command_line=" ".join(argument_list), + ) + logger.debug("Version added") logger.info("Adding genetic models to vcf header") add_metadata( head, - 'info', - 'GeneticModels', - annotation_number='.', - entry_type='String', - description="':'-separated list of genetic models for this variant." + "info", + "GeneticModels", + annotation_number=".", + entry_type="String", + description="':'-separated list of genetic models for this variant.", ) - + logger.debug("Genetic models added") logger.info("Adding model score to vcf header") add_metadata( head, - 'info', - 'ModelScore', - annotation_number='.', - entry_type='String', - description="PHRED score for genotype models." + "info", + "ModelScore", + annotation_number=".", + entry_type="String", + description="PHRED score for genotype models.", ) logger.debug("Model score added") - + logger.info("Adding Compounds to vcf header") add_metadata( head, - 'info', - 'Compounds', - annotation_number='.', - entry_type='String', - description=("List of compound pairs for this variant." - "The list is splitted on ',' family id is separated with compounds" - "with ':'. Compounds are separated with '|'.") + "info", + "Compounds", + annotation_number=".", + entry_type="String", + description=( + "List of compound pairs for this variant." + "The list is splitted on ',' family id is separated with compounds" + "with ':'. Compounds are separated with '|'." + ), ) logger.debug("Compounds added") - + vcf_individuals = head.individuals - logger.debug("Individuals found in vcf file: {}".format(', '.join(vcf_individuals))) - + logger.debug("Individuals found in vcf file: {}".format(", ".join(vcf_individuals))) + try: check_individuals(family_parser.individuals, vcf_individuals) except IOError as e: logger.error(e) - logger.info("Individuals in PED file: {0}".format( - ', '.join(family_parser.individuals))) - logger.info("Individuals in VCF file: {0}".format(', '.join(vcf_individuals))) - + logger.info("Individuals in PED file: {0}".format(", ".join(family_parser.individuals))) + logger.info("Individuals in VCF file: {0}".format(", ".join(vcf_individuals))) + context.abort() start_time_analysis = datetime.now() analysis_individuals = list(family_parser.individuals.keys()) - - logger.info("Individuals used in analysis: {0}".format( - ', '.join(analysis_individuals))) - + + logger.info("Individuals used in analysis: {0}".format(", ".join(analysis_individuals))) + ################################################################### ### The task queue is where all jobs(in this case batches that ### ### represents variants in a region) is put. The consumers will ### @@ -276,13 +290,12 @@ def models(context, variant_file, family_file, family_type, reduced_penetrance, results = Manager().Queue() num_model_checkers = processes - #Adapt the number of processes to the machine that run the analysis - logger.info('Number of CPU:s {}'.format(cpu_count())) - logger.info('Number of model checkers: {}'.format(num_model_checkers)) - + # Adapt the number of processes to the machine that run the analysis + logger.info("Number of CPU:s {}".format(cpu_count())) + logger.info("Number of model checkers: {}".format(num_model_checkers)) # These are the workers that do the heavy part of the analysis - logger.info('Seting up the workers') + logger.info("Seting up the workers") try: model_checkers = [ VariantAnnotator( @@ -293,24 +306,21 @@ def models(context, variant_file, family_file, family_type, reduced_penetrance, phased=phased, strict=strict, vep=vep, - reduced_penetrance_genes = reduced_penetrance_genes + reduced_penetrance_genes=reduced_penetrance_genes, ) for i in range(num_model_checkers) ] - logger.info('Starting the workers') + logger.info("Starting the workers") for worker in model_checkers: - logger.debug('Starting worker {0}'.format(worker)) + logger.debug("Starting worker {0}".format(worker)) worker.start() - + # This process prints the variants to temporary files - logger.info('Seting up the variant printer') + logger.info("Seting up the variant printer") if len(model_checkers) == 1: print_headers(head=head, outfile=outfile, silent=silent) variant_printer = VariantPrinter( - task_queue=results, - head=head, - mode='normal', - outfile = outfile + task_queue=results, head=head, mode="normal", outfile=outfile ) else: # We use a temp file to store the processed variants @@ -320,51 +330,45 @@ def models(context, variant_file, family_file, family_type, reduced_penetrance, else: temp_file = NamedTemporaryFile(delete=False) temp_file.close() - + variant_printer = VariantPrinter( - task_queue=results, - head=head, - mode='chromosome', - outfile = temp_file.name + task_queue=results, head=head, mode="chromosome", outfile=temp_file.name ) - - logger.info('Starting the variant printer process') + + logger.info("Starting the variant printer process") variant_printer.start() - + start_time_variant_parsing = datetime.now() - + # This process parses the original vcf and create batches to put in the variant queue: - logger.info('Start parsing the variants') + logger.info("Start parsing the variants") chromosome_list = get_batches( - variants = variant_file, - batch_queue = variant_queue, - header = head, - vep = vep, - annotation_keyword = keyword - ) - + variants=variant_file, + batch_queue=variant_queue, + header=head, + vep=vep, + annotation_keyword=keyword, + ) + logger.debug("Put stop signs in the variant queue") for i in range(num_model_checkers): variant_queue.put(None) - + variant_queue.join() results.put(None) variant_printer.join() - + if len(model_checkers) > 1: - sort_variants(infile=temp_file.name, mode='chromosome') + sort_variants(infile=temp_file.name, mode="chromosome") print_headers(head=head, outfile=outfile, silent=silent) - with open(temp_file.name, 'r', encoding='utf-8') as f: + with open(temp_file.name, "r", encoding="utf-8") as f: for line in f: print_variant( - variant_line=line, - outfile=outfile, - mode='modified', - silent=silent + variant_line=line, outfile=outfile, mode="modified", silent=silent ) - + except Exception as err: logger.warning(err) for worker in model_checkers: @@ -377,5 +381,4 @@ def models(context, variant_file, family_file, family_type, reduced_penetrance, os.remove(temp_file.name) logger.debug("Temp file removed") - logger.info('Time for whole analyis: {0}'.format( - str(datetime.now() - start_time_analysis))) + logger.info("Time for whole analyis: {0}".format(str(datetime.now() - start_time_analysis))) diff --git a/genmod/commands/annotate_variant.py b/genmod/commands/annotate_variant.py index db23794..0d49fd5 100755 --- a/genmod/commands/annotate_variant.py +++ b/genmod/commands/annotate_variant.py @@ -11,140 +11,163 @@ Copyright (c) 2015 __MoonsoInc__. All rights reserved. """ -import sys -import logging import itertools - -import click - +import logging +import sys from datetime import datetime +import click from tabix import TabixError from genmod import __version__ - -from genmod.vcf_tools import (HeaderParser, print_headers, print_variant) - -from genmod.annotations import (ensembl_path_37, ensembl_path_38) - -from genmod.annotate_regions.parse_annotations import (build_region_trees) -from genmod.annotate_variants.add_annotations import (add_regions, add_exac, - add_exac_max, add_thousandg, add_thousandg_max, add_spidex, add_cadd, - add_cadd_raw, add_cosmic) - -from genmod.annotate_variants.read_tabix_files import (get_tabixhandle) +from genmod.annotate_regions.parse_annotations import build_region_trees +from genmod.annotate_variants.add_annotations import ( + add_cadd, + add_cadd_raw, + add_cosmic, + add_exac, + add_exac_max, + add_regions, + add_spidex, + add_thousandg, + add_thousandg_max, +) from genmod.annotate_variants.annotate import annotate_variant - -from genmod.commands.utils import (outfile, silent, temp_dir, - variant_file, get_file_handle) +from genmod.annotate_variants.read_tabix_files import get_tabixhandle +from genmod.annotations import ensembl_path_37, ensembl_path_38 +from genmod.commands.utils import get_file_handle, outfile, silent, temp_dir, variant_file +from genmod.vcf_tools import HeaderParser, print_headers, print_variant logger = logging.getLogger(__name__) -@click.command('annotate', short_help="Annotate vcf variants") + +@click.command("annotate", short_help="Annotate vcf variants") @variant_file -@click.option('-r', '--annotate_regions','--annotate-regions', '--regions', - is_flag=True, - help='Annotate what regions a variant belongs to (eg. genes).' -) -@click.option('--region-file','--region_file', - type=click.Path(exists=True), - show_default=True, - help='Choose a bed file with regions that should be used.' +@click.option( + "-r", + "--annotate_regions", + "--annotate-regions", + "--regions", + is_flag=True, + help="Annotate what regions a variant belongs to (eg. genes).", ) -@click.option('--genome-build','-b', - type=click.Choice(['37','38']), - default='37', - show_default=True, - help='Choose what genome build to use.' +@click.option( + "--region-file", + "--region_file", + type=click.Path(exists=True), + show_default=True, + help="Choose a bed file with regions that should be used.", ) -@click.option('-c', '--cadd-file', '--cadd_file', - multiple = True, - type=click.Path(exists=True), - help="Specify the path to a bgzipped cadd file"\ - " (with index) with variant scores. This command can be"\ - " used multiple times if multiple cadd files." +@click.option( + "--genome-build", + "-b", + type=click.Choice(["37", "38"]), + default="37", + show_default=True, + help="Choose what genome build to use.", ) -@click.option('--thousand-g', '--thousand_g', - type=click.Path(exists=True), - help="Specify the path to a bgzipped vcf file"\ - " (with index) with 1000g variants" +@click.option( + "-c", + "--cadd-file", + "--cadd_file", + multiple=True, + type=click.Path(exists=True), + help="Specify the path to a bgzipped cadd file" + " (with index) with variant scores. This command can be" + " used multiple times if multiple cadd files.", ) -@click.option('--exac', - type=click.Path(exists=True), - help="Specify the path to a bgzipped vcf file"\ - " (with index) with exac variants." +@click.option( + "--thousand-g", + "--thousand_g", + type=click.Path(exists=True), + help="Specify the path to a bgzipped vcf file" " (with index) with 1000g variants", ) -@click.option('--cosmic', - type=click.Path(exists=True), - help="Specify the path to a bgzipped vcf file"\ - " (with index) with COSMIC variants." +@click.option( + "--exac", + type=click.Path(exists=True), + help="Specify the path to a bgzipped vcf file" " (with index) with exac variants.", ) -@click.option('--max-af', '--max_af', - is_flag=True, - help="If the MAX AF should be annotated" +@click.option( + "--cosmic", + type=click.Path(exists=True), + help="Specify the path to a bgzipped vcf file" " (with index) with COSMIC variants.", ) -@click.option('--spidex', - type=click.Path(exists=True), - help="Specify the path to a bgzipped tsv file"\ - " (with index) with spidex information." +@click.option("--max-af", "--max_af", is_flag=True, help="If the MAX AF should be annotated") +@click.option( + "--spidex", + type=click.Path(exists=True), + help="Specify the path to a bgzipped tsv file" " (with index) with spidex information.", ) -@click.option('--cadd-raw', '--cadd_raw', - is_flag=True, - help="""If the raw cadd scores should be annotated.""" +@click.option( + "--cadd-raw", "--cadd_raw", is_flag=True, help="""If the raw cadd scores should be annotated.""" ) @outfile @silent @temp_dir @click.pass_context -def annotate(context, variant_file, annotate_regions, region_file, cadd_file, - thousand_g, exac, spidex, outfile, silent, cadd_raw, cosmic, - max_af, temp_dir, genome_build): +def annotate( + context, + variant_file, + annotate_regions, + region_file, + cadd_file, + thousand_g, + exac, + spidex, + outfile, + silent, + cadd_raw, + cosmic, + max_af, + temp_dir, + genome_build, +): """ Annotate vcf variants. - + Annotate variants with a number of different sources. Please use --help for more info. """ regions = annotate_regions logger.info("Running genmod annotate_variant version {0}".format(__version__)) - + if not region_file: - if genome_build == '37': + if genome_build == "37": region_file = str(ensembl_path_37) - elif genome_build == '38': + elif genome_build == "38": region_file = str(ensembl_path_38) - + start_time_analysis = datetime.now() annotation_arguments = {} - + variants = get_file_handle(variant_file) - + logger.info("Initializing a Header Parser") head = HeaderParser() - + line = None for line in variants: line = line.rstrip() - if line.startswith('#'): - if line.startswith('##'): + if line.startswith("#"): + if line.startswith("##"): head.parse_meta_data(line) else: head.parse_header_line(line) else: break - - #Add the first variant back to the iterator + + # Add the first variant back to the iterator # If the vcf has no variants the last line will be a header - if not line.startswith('#'): + if not line.startswith("#"): variants = itertools.chain([line], variants) else: print_headers(head, outfile, silent) sys.exit(0) - + header_line = head.header - annotation_arguments['header_line'] = header_line - + annotation_arguments["header_line"] = header_line + try: if regions: logger.info("Loading annotations") @@ -152,59 +175,58 @@ def annotate(context, variant_file, annotate_regions, region_file, cadd_file, add_regions(head) regions_handle = get_file_handle(region_file) logger.debug("Adding region trees to arguments") - annotation_arguments['region_trees'] = build_region_trees(regions_handle, padding=4000) - + annotation_arguments["region_trees"] = build_region_trees(regions_handle, padding=4000) + if exac: logger.info("Annotating ExAC frequencies") logger.debug("Using ExAC file: {0}".format(exac)) - annotation_arguments['exac'] = get_tabixhandle(exac) + annotation_arguments["exac"] = get_tabixhandle(exac) add_exac(head) - + if thousand_g: logger.info("Annotating 1000G frequencies") logger.debug("Using 1000G file: {0}".format(thousand_g)) - annotation_arguments['thousand_g'] = get_tabixhandle(thousand_g) + annotation_arguments["thousand_g"] = get_tabixhandle(thousand_g) add_thousandg(head) - + if spidex: logger.info("Annotating Spidex z scores") logger.debug("Using Spidex file: {0}".format(spidex)) - annotation_arguments['spidex'] = get_tabixhandle(spidex) + annotation_arguments["spidex"] = get_tabixhandle(spidex) add_spidex(head) - + if cadd_file: logger.info("Annotating CADD scores") - logger.debug("Using CADD file(s): {0}".format(', '.join(cadd_file))) - annotation_arguments['cadd_files'] = [get_tabixhandle(cadd) for cadd in cadd_file] - + logger.debug("Using CADD file(s): {0}".format(", ".join(cadd_file))) + annotation_arguments["cadd_files"] = [get_tabixhandle(cadd) for cadd in cadd_file] + add_cadd(head) - + if cadd_raw: - annotation_arguments['cadd_raw'] = cadd_raw + annotation_arguments["cadd_raw"] = cadd_raw add_cadd_raw(head) - + if max_af: - annotation_arguments['max_af'] = max_af + annotation_arguments["max_af"] = max_af if thousand_g: add_thousandg_max(head) if exac: add_exac_max(head) - + if cosmic: logger.info("Annotating if variant is in COSMIC") logger.debug("Using COSMIC file: {0}".format(cosmic)) - annotation_arguments['cosmic'] = get_tabixhandle(cosmic) + annotation_arguments["cosmic"] = get_tabixhandle(cosmic) add_cosmic(head) except TabixError as err: logger.warning(err) context.abort() - + print_headers(head, outfile, silent) - + for variant in variants: print_variant( - variant_line = annotate_variant(variant, annotation_arguments), - outfile = outfile, - silent = silent + variant_line=annotate_variant(variant, annotation_arguments), + outfile=outfile, + silent=silent, ) - diff --git a/genmod/commands/base.py b/genmod/commands/base.py index 7626855..fc7aaaa 100755 --- a/genmod/commands/base.py +++ b/genmod/commands/base.py @@ -9,16 +9,23 @@ Copyright (c) 2013 __MyCompanyName__. All rights reserved. """ -from __future__ import (print_function) +from __future__ import print_function import click -from . import (sort_command, models_command, score_command, -score_compounds_command, annotate_variant_command, filter_command) # , sort, annotate, analyze, summarize_variants, score_variants) - from genmod import __version__ +from . import ( + annotate_variant_command, + filter_command, + models_command, + score_command, + score_compounds_command, + sort_command, +) + + def print_version(ctx, param, value): """Callback function for printing version and exiting Args: @@ -30,41 +37,41 @@ def print_version(ctx, param, value): """ if not value or ctx.resilient_parsing: return - click.echo('genmod version: ' + __version__) + click.echo("genmod version: " + __version__) ctx.exit() ### This is the main script ### + @click.group() -@click.option('--version', - is_flag=True, - callback=print_version, - expose_value=False, - is_eager=True +@click.option("--version", is_flag=True, callback=print_version, expose_value=False, is_eager=True) +@click.option( + "-l", + "--logfile", + type=click.Path(exists=False), + help="Path to log file. If none logging is " "printed to stderr.", ) -@click.option('-l', '--logfile', - type=click.Path(exists=False), - help=u"Path to log file. If none logging is "\ - "printed to stderr." -) -@click.option('-v', '--verbose', - count=True, - default=0, - help=u"Increase output verbosity. Can be used multiple times, eg. -vv" +@click.option( + "-v", + "--verbose", + count=True, + default=0, + help="Increase output verbosity. Can be used multiple times, eg. -vv", ) @click.pass_context def cli(context, logfile, verbose): """Tool for annotating and analyzing genetic variants in the vcf format.\n - For more information, please run: - genmod COMMAND --help \n - """ + For more information, please run: + genmod COMMAND --help \n + """ from genmod import logger - from genmod.log import init_log, LEVELS - loglevel = LEVELS.get(min(verbose,2), "WARNING") - + from genmod.log import LEVELS, init_log + + loglevel = LEVELS.get(min(verbose, 2), "WARNING") + init_log(logger, logfile, loglevel) - + cli.add_command(sort_command) cli.add_command(models_command) @@ -74,5 +81,5 @@ def cli(context, logfile, verbose): cli.add_command(filter_command) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/genmod/commands/filter_variants.py b/genmod/commands/filter_variants.py index 1a8d4d2..08f89af 100755 --- a/genmod/commands/filter_variants.py +++ b/genmod/commands/filter_variants.py @@ -10,101 +10,109 @@ Copyright (c) 2015 __MoonsoInc__. All rights reserved. """ -from __future__ import (print_function) +from __future__ import print_function -import sys -import os -import logging import itertools - -import click - +import logging +import os +import sys from codecs import open from datetime import datetime +import click from extract_vcf import Plugin from genmod import __version__ +from genmod.vcf_tools import ( + HeaderParser, + get_info_dict, + get_variant_dict, + print_headers, + print_variant, +) -from genmod.vcf_tools import (HeaderParser, get_variant_dict, get_info_dict, -print_variant, print_headers) - -from .utils import (variant_file, silent, outfile, get_file_handle) +from .utils import get_file_handle, outfile, silent, variant_file logger = logging.getLogger(__name__) + @click.command() @variant_file -@click.option('-a', '--annotation', - default='1000GAF', - help="Specify the info annotation to search for."\ - " Default 1000GAF" +@click.option( + "-a", + "--annotation", + default="1000GAF", + help="Specify the info annotation to search for." " Default 1000GAF", ) -@click.option('-t', '--threshold', - default=0.05, - help="""Threshold for filter variants. Default 0.05""" +@click.option( + "-t", "--threshold", default=0.05, help="""Threshold for filter variants. Default 0.05""" ) -@click.option('-d', '--discard', - is_flag=True, - help="If variants without the annotation should be"\ - " discarded" +@click.option( + "-d", + "--discard", + is_flag=True, + help="If variants without the annotation should be" " discarded", ) -@click.option('-g', '--greater', - is_flag=True, - help="If greater than threshold should be used instead of"\ - " less thatn threshold." +@click.option( + "-g", + "--greater", + is_flag=True, + help="If greater than threshold should be used instead of" " less thatn threshold.", ) @silent @outfile def filter(variant_file, annotation, threshold, discard, greater, silent, outfile): """ Filter vcf variants. - + Filter variants based on their annotation """ logger.info("Running genmod filter version {0}".format(__version__)) variant_file = get_file_handle(variant_file) start_time_analysis = datetime.now() - + logger.info("Initializing a Header Parser") head = HeaderParser() - + for line in variant_file: line = line.rstrip() - if line.startswith('#'): - if line.startswith('##'): + if line.startswith("#"): + if line.startswith("##"): head.parse_meta_data(line) else: head.parse_header_line(line) else: break - - #Add the first variant to the iterator + + # Add the first variant to the iterator variant_file = itertools.chain([line], variant_file) - + header_line = head.header - + if not annotation in head.info_dict: logger.warning("Annotation {0} not specified in header".format(annotation)) logger.info("Please check VCF file") logger.info("Exiting...") sys.exit(1) - + logger.info("Building a plugin from extract_vcf for {0}".format(annotation)) annotation_plugin = Plugin( - name=annotation, - field='INFO', + name=annotation, + field="INFO", info_key=annotation, - separators = [','], - record_rule = 'min', - data_type = 'float' + separators=[","], + record_rule="min", + data_type="float", + ) + logger.debug( + "Plugin=(field={0},info_key={1},separators={2},record_rule={3}" ",data_type={4})".format( + "INFO", annotation, "','", "min", "float" + ) ) - logger.debug("Plugin=(field={0},info_key={1},separators={2},record_rule={3}"\ - ",data_type={4})".format('INFO', annotation, "','", 'min', 'float')) - + print_headers(head=head, outfile=outfile, silent=silent) - + nr_of_variants = 0 nr_of_passed_variants = 0 for variant in variant_file: @@ -122,27 +130,22 @@ def filter(variant_file, annotation, threshold, discard, greater, silent, outfil else: if not discard: keep_variant = True - + if keep_variant: logger.debug("Keeping variant") nr_of_passed_variants += 1 - print_variant( - variant_line=variant, - outfile=outfile, - mode='vcf', - silent=silent - ) + print_variant(variant_line=variant, outfile=outfile, mode="vcf", silent=silent) else: logger.debug("Discarding variant") - logger.info("Number of variants in file {0}".format(nr_of_variants)) logger.info("Number of variants passing filter {0}".format(nr_of_passed_variants)) - logger.info("Number of variants filtered {0}".format( - nr_of_variants - nr_of_passed_variants)) + logger.info("Number of variants filtered {0}".format(nr_of_variants - nr_of_passed_variants)) -if __name__ == '__main__': - from genmod.log import init_log + +if __name__ == "__main__": from genmod import logger + from genmod.log import init_log + init_log(logger, loglevel="DEBUG") filter() diff --git a/genmod/commands/genmod_sort.py b/genmod/commands/genmod_sort.py index 24b3f5c..5a69ca6 100755 --- a/genmod/commands/genmod_sort.py +++ b/genmod/commands/genmod_sort.py @@ -11,44 +11,41 @@ from __future__ import print_function -import sys -import os -import click import logging - +import os +import sys from codecs import open -from tempfile import NamedTemporaryFile from datetime import datetime +from tempfile import NamedTemporaryFile - -from genmod.vcf_tools import (sort_variants, get_info_dict, print_variant, - HeaderParser, print_headers) - -from genmod.utils import (get_chromosome_priority, get_rank_score) +import click from genmod import __version__ +from genmod.utils import get_chromosome_priority, get_rank_score +from genmod.vcf_tools import ( + HeaderParser, + get_info_dict, + print_headers, + print_variant, + sort_variants, +) -from .utils import (variant_file, outfile, silent, temp_dir, get_file_handle) +from .utils import get_file_handle, outfile, silent, temp_dir, variant_file logger = logging.getLogger(__name__) + @click.command() @variant_file @outfile -@click.option('-f', '--family_id', - type=str, - help='Specify the family id for sorting.' -) +@click.option("-f", "--family_id", type=str, help="Specify the family id for sorting.") @silent @temp_dir -@click.option('-p', '--position', - is_flag=True, - help='If variants should be sorted by position.' -) +@click.option("-p", "--position", is_flag=True, help="If variants should be sorted by position.") def sort(variant_file, outfile, family_id, silent, position, temp_dir): """ Sort a VCF file based on rank score. - """ + """ head = HeaderParser() variant_file = get_file_handle(variant_file) logger.info("Running GENMOD sort version {0}".format(__version__)) @@ -61,91 +58,65 @@ def sort(variant_file, outfile, family_id, silent, position, temp_dir): temp_file = NamedTemporaryFile(delete=False) temp_file.close() # Open the temp file with codecs - temp_file_handle = open( - temp_file.name, - mode='w', - encoding='utf-8', - errors='replace' - ) + temp_file_handle = open(temp_file.name, mode="w", encoding="utf-8", errors="replace") logger.debug("Temp file created") logger.info("Printing variants to temp file") nr_variants = 0 # Print the variants with rank score in first column for line in variant_file: line = line.rstrip() - if line.startswith('#'): - if line.startswith('##'): + if line.startswith("#"): + if line.startswith("##"): head.parse_meta_data(line) else: head.parse_header_line(line) else: nr_variants += 1 - priority = '0' - + priority = "0" + if position: chrom = line.split()[0] priority = get_chromosome_priority(chrom) else: priority = get_rank_score(line) - - print_variant( - variant_line=line, - priority=priority, - outfile=temp_file_handle - ) - + + print_variant(variant_line=line, priority=priority, outfile=temp_file_handle) + temp_file_handle.close() - + logger.info("Variants printed to temp file") logger.info("Nr or variants in VCF file: {0}".format(nr_variants)) - - sort_mode = 'rank' - + + sort_mode = "rank" + if nr_variants == 0: logger.debug("Printing headers") - print_headers( - head = head, - outfile = outfile, - silent=silent - ) + print_headers(head=head, outfile=outfile, silent=silent) sys.exit(0) - - + if position: - sort_mode = 'chromosome' - + sort_mode = "chromosome" + logger.info("Sorting variants") - sort_variants( - infile = temp_file.name, - mode=sort_mode - ) + sort_variants(infile=temp_file.name, mode=sort_mode) logger.info("Variants sorted") logger.debug("Printing headers") - print_headers( - head = head, - outfile = outfile, - silent=silent - ) + print_headers(head=head, outfile=outfile, silent=silent) logger.debug("Headers printed") - + logger.info("Printing variants") - with open(temp_file.name, mode='r', encoding='utf-8', errors='replace') as f: + with open(temp_file.name, mode="r", encoding="utf-8", errors="replace") as f: for variant_line in f: - print_variant( - variant_line = variant_line, - outfile = outfile, - mode = 'modified', - silent=False - ) + print_variant(variant_line=variant_line, outfile=outfile, mode="modified", silent=False) logger.debug("Variants printed") - + logger.info("Removing temp file") os.remove(temp_file.name) logger.debug("Temp file removed") - - logger.info("Sorting done, time for sorting: {0}".format(datetime.now()-start)) + + logger.info("Sorting done, time for sorting: {0}".format(datetime.now() - start)) -if __name__ == '__main__': - sort() \ No newline at end of file +if __name__ == "__main__": + sort() diff --git a/genmod/commands/score_compounds.py b/genmod/commands/score_compounds.py index 3a27f3c..f8a5a60 100755 --- a/genmod/commands/score_compounds.py +++ b/genmod/commands/score_compounds.py @@ -11,60 +11,64 @@ from __future__ import print_function -import sys -import os -import click -import logging import itertools - -from multiprocessing import JoinableQueue, Manager, cpu_count, util +import logging +import os +import sys from codecs import open from datetime import datetime +from multiprocessing import JoinableQueue, Manager, cpu_count, util from tempfile import NamedTemporaryFile -from genmod.vcf_tools import (HeaderParser, add_metadata, print_headers, -sort_variants, print_variant) -from genmod.utils import (get_batches, VariantPrinter) -from genmod.score_variants import CompoundScorer +import click from genmod import __version__ +from genmod.score_variants import CompoundScorer +from genmod.utils import VariantPrinter, get_batches +from genmod.vcf_tools import HeaderParser, add_metadata, print_headers, print_variant, sort_variants -from .utils import (variant_file, silent, outfile, processes, temp_dir, - get_file_handle) +from .utils import get_file_handle, outfile, processes, silent, temp_dir, variant_file logger = logging.getLogger(__name__) util.abstract_sockets_supported = False -@click.command('compound', short_help="Score compounds") + +@click.command("compound", short_help="Score compounds") @variant_file @silent @outfile @processes @temp_dir -@click.option('--vep', - is_flag=True, - help='If variants are annotated with the Variant Effect Predictor.' +@click.option( + "--vep", is_flag=True, help="If variants are annotated with the Variant Effect Predictor." +) +@click.option( + "--threshold", + type=int, + help="Threshold for model-dependent penalty if no compounds with passing score", + default=9, ) -@click.option('--threshold', type=int, help="Threshold for model-dependent penalty if no compounds with passing score", default=9) -@click.option('--penalty', type=int, help="Penalty applied together with --threshold", default=6) +@click.option("--penalty", type=int, help="Penalty applied together with --threshold", default=6) @click.pass_context -def compound(context, variant_file, silent, outfile, vep, threshold: int, penalty: int, processes, temp_dir): +def compound( + context, variant_file, silent, outfile, vep, threshold: int, penalty: int, processes, temp_dir +): """ Score compound variants in a vcf file based on their rank score. """ - logger.info('Running GENMOD score_compounds, version: {0}'.format(__version__)) - + logger.info("Running GENMOD score_compounds, version: {0}".format(__version__)) + variant_file = get_file_handle(variant_file) - + start_time_analysis = datetime.now() logger.info("Initializing a Header Parser") head = HeaderParser() - + line = None for line in variant_file: line = line.rstrip() - if line.startswith('#'): - if line.startswith('##'): + if line.startswith("#"): + if line.startswith("##"): head.parse_meta_data(line) else: head.parse_header_line(line) @@ -72,8 +76,8 @@ def compound(context, variant_file, silent, outfile, vep, threshold: int, penalt break logger.info("Headers parsed") - - if not line.startswith('#'): + + if not line.startswith("#"): variant_file = itertools.chain([line], variant_file) else: print_headers(head=head, outfile=outfile, silent=silent) @@ -82,12 +86,14 @@ def compound(context, variant_file, silent, outfile, vep, threshold: int, penalt header_line = head.header individuals = head.individuals - add_metadata(head, - 'info', - 'CompoundsNormalized', - annotation_number='.', - entry_type='String', - description='Rank score as provided by compound analysis, based on RankScoreNormalized. family_id:rank_score') + add_metadata( + head, + "info", + "CompoundsNormalized", + annotation_number=".", + entry_type="String", + description="Rank score as provided by compound analysis, based on RankScoreNormalized. family_id:rank_score", + ) ################################################################### ### The task queue is where all jobs(in this case batches that ### @@ -101,12 +107,12 @@ def compound(context, variant_file, silent, outfile, vep, threshold: int, penalt results = Manager().Queue() num_scorers = processes - #Adapt the number of processes to the machine that run the analysis - logger.info('Number of CPU:s {}'.format(cpu_count())) - logger.info('Number of model checkers: {}'.format(num_scorers)) + # Adapt the number of processes to the machine that run the analysis + logger.info("Number of CPU:s {}".format(cpu_count())) + logger.info("Number of model checkers: {}".format(num_scorers)) # These are the workers that do the heavy part of the analysis - logger.info('Seting up the workers') + logger.info("Seting up the workers") compound_scorers = [ CompoundScorer( task_queue=variant_queue, @@ -117,16 +123,16 @@ def compound(context, variant_file, silent, outfile, vep, threshold: int, penalt ) for i in range(num_scorers) ] - + try: - logger.info('Starting the workers') + logger.info("Starting the workers") for worker in compound_scorers: - logger.debug('Starting worker {0}'.format(worker)) + logger.debug("Starting worker {0}".format(worker)) worker.start() - + # This process prints the variants to temporary files - logger.info('Seting up the variant printer') - + logger.info("Seting up the variant printer") + # We use a temp file to store the processed variants logger.debug("Build a tempfile for printing the variants") if temp_dir: @@ -134,48 +140,40 @@ def compound(context, variant_file, silent, outfile, vep, threshold: int, penalt else: temp_file = NamedTemporaryFile(delete=False) temp_file.close() - + variant_printer = VariantPrinter( - task_queue=results, - head=head, - mode='chromosome', - outfile = temp_file.name + task_queue=results, head=head, mode="chromosome", outfile=temp_file.name ) - - logger.info('Starting the variant printer process') + + logger.info("Starting the variant printer process") variant_printer.start() - + start_time_variant_parsing = datetime.now() - + # This process parses the original vcf and create batches to put in the variant queue: chromosome_list = get_batches( - variants = variant_file, - batch_queue = variant_queue, - header = head, - vep = vep, - results_queue=results - ) - + variants=variant_file, + batch_queue=variant_queue, + header=head, + vep=vep, + results_queue=results, + ) + logger.debug("Put stop signs in the variant queue") for i in range(num_scorers): variant_queue.put(None) - + variant_queue.join() results.put(None) variant_printer.join() - - sort_variants(infile=temp_file.name, mode='chromosome') - + + sort_variants(infile=temp_file.name, mode="chromosome") + print_headers(head=head, outfile=outfile, silent=silent) - - with open(temp_file.name, 'r', encoding='utf-8') as f: + + with open(temp_file.name, "r", encoding="utf-8") as f: for line in f: - print_variant( - variant_line=line, - outfile=outfile, - mode='modified', - silent=silent - ) + print_variant(variant_line=line, outfile=outfile, mode="modified", silent=silent) except Exception as e: logger.warning(e) for worker in compound_scorers: @@ -186,7 +184,5 @@ def compound(context, variant_file, silent, outfile, vep, threshold: int, penalt logger.info("Removing temp file") os.remove(temp_file.name) logger.debug("Temp file removed") - - logger.info('Time for whole analyis: {0}'.format( - str(datetime.now() - start_time_analysis))) + logger.info("Time for whole analyis: {0}".format(str(datetime.now() - start_time_analysis))) diff --git a/genmod/commands/score_variants.py b/genmod/commands/score_variants.py index e2a7931..04e6b5f 100755 --- a/genmod/commands/score_variants.py +++ b/genmod/commands/score_variants.py @@ -11,106 +11,127 @@ from __future__ import print_function -import sys -import os -import click -import logging import itertools - +import logging +import os +import sys from codecs import open from datetime import datetime -from validate import ValidateError +import click from ped_parser import FamilyParser - -from genmod.vcf_tools import (add_metadata, print_variant, add_vcf_info, -print_headers, HeaderParser, get_variant_dict, get_info_dict) - -from genmod.score_variants import (ConfigParser, get_category_score, -check_plugins, as_normalized_max_min, RANK_SCORE_TYPES) +from validate import ValidateError from genmod import __version__ +from genmod.score_variants import ( + RANK_SCORE_TYPES, + ConfigParser, + as_normalized_max_min, + check_plugins, + get_category_score, +) +from genmod.vcf_tools import ( + HeaderParser, + add_metadata, + add_vcf_info, + get_info_dict, + get_variant_dict, + print_headers, + print_variant, +) -from .utils import (variant_file, family_file, family_type, silent, outfile, get_file_handle) +from .utils import family_file, family_type, get_file_handle, outfile, silent, variant_file logger = logging.getLogger(__name__) -@click.command('score', short_help="Score variants") +@click.command("score", short_help="Score variants") @variant_file -@click.option('-i', '--family_id', - default='1', +@click.option( + "-i", + "--family_id", + default="1", ) @family_file @family_type @silent -@click.option('--skip_plugin_check', - is_flag=True, - help='If continue even if plugins does not exist in vcf.' +@click.option( + "--skip_plugin_check", is_flag=True, help="If continue even if plugins does not exist in vcf." ) -@click.option('-r', '--rank_results', - is_flag=True, - help="Add a info field that shows how the different categories"\ - " contribute to the rank score." +@click.option( + "-r", + "--rank_results", + is_flag=True, + help="Add a info field that shows how the different categories" + " contribute to the rank score.", ) @outfile -@click.option('-c', '--score_config', - type=click.Path(exists=True), - help="The plug-in config file(.ini)" +@click.option( + "-c", "--score_config", type=click.Path(exists=True), help="The plug-in config file(.ini)" ) @click.pass_context -def score(context, variant_file, family_id, family_file, family_type, score_config, -silent, skip_plugin_check, rank_results, outfile): +def score( + context, + variant_file, + family_id, + family_file, + family_type, + score_config, + silent, + skip_plugin_check, + rank_results, + outfile, +): """ Score variants in a vcf file using a Weighted Sum Model. - - The specific scores should be defined in a config file, see examples on + + The specific scores should be defined in a config file, see examples on github. - """ - logger.info('Running GENMOD score, version: {0}'.format(__version__)) - + """ + logger.info("Running GENMOD score, version: {0}".format(__version__)) + logger.info("Checking family id") - + variant_file = get_file_handle(variant_file) - + if family_file: logger.info("Setting up a family parser") family_parser = FamilyParser(family_file, family_type) logger.debug("Family parser done") family_id = list(family_parser.families.keys())[0] - + logger.info("Family used in analysis: {0}".format(family_id)) - + ## Check the score config: if not score_config: logger.warning("Please provide a score config file.") context.abort() - + logger.debug("Parsing config file") - + try: config_parser = ConfigParser(score_config) except ValidateError as e: logger.error(e.message) context.abort() - + score_categories = list(config_parser.categories.keys()) logger.debug("Config parsed succesfully") logger.info("Initializing a Header Parser") head = HeaderParser() - + for line in variant_file: line = line.rstrip() - if line.startswith('#'): - if line.startswith('##'): + if line.startswith("#"): + if line.startswith("##"): head.parse_meta_data(line) else: head.parse_header_line(line) else: break - + logger.info("Check if all score plugins exist in vcf ...") if not check_plugins(config_parser, head): if not skip_plugin_check: @@ -118,62 +139,60 @@ def score(context, variant_file, family_id, family_file, family_type, score_conf context.abort() else: logger.info("All plugins are defined in vcf") - + csq_format = head.vep_columns - #Add the first variant to the iterator - if not line.startswith('#'): + # Add the first variant to the iterator + if not line.startswith("#"): variant_file = itertools.chain([line], variant_file) else: print_headers(head=head, outfile=outfile, silent=silent) sys.exit(0) - + header_line = head.header - + if "RankScore" in head.info_dict: logger.warning("Variants already scored according to VCF header") logger.info("Please check VCF file") context.abort() for rank_score_type, rank_score_description in RANK_SCORE_TYPES.items(): - add_metadata(head, - 'info', - rank_score_type, - annotation_number='.', - entry_type='String', - description=rank_score_description) + add_metadata( + head, + "info", + rank_score_type, + annotation_number=".", + entry_type="String", + description=rank_score_description, + ) add_metadata( head, - 'info', - 'RankScoreMinMax', - annotation_number='.', - entry_type='String', - description="The rank score MIN-MAX bounds. family_id:min:max." + "info", + "RankScoreMinMax", + annotation_number=".", + entry_type="String", + description="The rank score MIN-MAX bounds. family_id:min:max.", ) - + if rank_results: add_metadata( head, - 'info', - 'RankResult', - annotation_number='.', - entry_type='String', - description= '|'.join(score_categories) + "info", + "RankResult", + annotation_number=".", + entry_type="String", + description="|".join(score_categories), ) - - print_headers( - head=head, - outfile=outfile, - silent=silent - ) + + print_headers(head=head, outfile=outfile, silent=silent) start_scoring = datetime.now() last_twenty = datetime.now() nr_of_variants = 1 for line in variant_file: - if not line.startswith('#'): + if not line.startswith("#"): variant = get_variant_dict(line, header_line) - variant['info_dict'] = get_info_dict(variant['INFO']) + variant["info_dict"] = get_info_dict(variant["INFO"]) rank_score = 0 # This is for printing results to vcf: category_scores = [] @@ -182,65 +201,63 @@ def score(context, variant_file, family_id, family_file, family_type, score_conf category_scores_min: float = 0.0 for category in score_categories: category_score, category_score_min, category_score_max = get_category_score( - variant=variant, - category=category, - config_parser=config_parser, - csq_format=csq_format + variant=variant, + category=category, + config_parser=config_parser, + csq_format=csq_format, ) logger.debug("Adding category score {0} to rank_score".format(category_score)) rank_score += category_score logger.debug("Updating rank score to {0}".format(rank_score)) category_scores_min += category_score_min category_scores_max += category_score_max - + category_scores.append(str(category_score)) rank_score = float(rank_score) # Export rank score as float type # Normalize ranks score (across all categories) - rank_score_normalized: float = as_normalized_max_min(score=float(rank_score), - min_score_value=category_scores_min, - max_score_value=category_scores_max) + rank_score_normalized: float = as_normalized_max_min( + score=float(rank_score), + min_score_value=category_scores_min, + max_score_value=category_scores_max, + ) variant = add_vcf_info( - keyword = 'RankScore', + keyword="RankScore", variant_dict=variant, - annotation="{0}:{1}".format(family_id, rank_score) + annotation="{0}:{1}".format(family_id, rank_score), ) variant: dict = add_vcf_info( - keyword = 'RankScoreNormalized', + keyword="RankScoreNormalized", variant_dict=variant, - annotation="{0}:{1}".format(family_id, rank_score_normalized) + annotation="{0}:{1}".format(family_id, rank_score_normalized), ) variant: dict = add_vcf_info( - keyword = 'RankScoreMinMax', + keyword="RankScoreMinMax", variant_dict=variant, - annotation="{0}:{1}:{2}".format(family_id, category_scores_min, category_scores_max) + annotation="{0}:{1}:{2}".format( + family_id, category_scores_min, category_scores_max + ), ) - + if rank_results: variant = add_vcf_info( - keyword = 'RankResult', - variant_dict=variant, - annotation="|".join(category_scores) + keyword="RankResult", variant_dict=variant, annotation="|".join(category_scores) ) - print_variant( - variant_dict=variant, - header_line=header_line, - outfile=outfile, - silent=silent + variant_dict=variant, header_line=header_line, outfile=outfile, silent=silent ) nr_of_variants += 1 if nr_of_variants % 20000 == 0: logger.info("{0} variants scored.".format(nr_of_variants)) - logger.info("Last 20000 took {0} to score.".format(datetime.now()-last_twenty)) + logger.info("Last 20000 took {0} to score.".format(datetime.now() - last_twenty)) last_twenty = datetime.now() logger.info("Variants scored. Number of variants: {0}".format(nr_of_variants)) - logger.info("Time to score variants: {0}".format(datetime.now()-start_scoring)) + logger.info("Time to score variants: {0}".format(datetime.now() - start_scoring)) diff --git a/genmod/commands/summarize_variants.py b/genmod/commands/summarize_variants.py index 1955014..d9bd90c 100755 --- a/genmod/commands/summarize_variants.py +++ b/genmod/commands/summarize_variants.py @@ -4,11 +4,11 @@ analyze_variants.py Analyze the the variants in a vcf, the following will be printed: - + - How many variants found - How many mendelian violations - How many variants where not covered in all individuals. (Default depth 7) - - How many variants did not satisfy the base call + - How many variants did not satisfy the base call quality treshold. (Default 10) - How many variants followed each model: - AR_hom @@ -31,34 +31,36 @@ Copyright (c) 2014 __MoonsoInc__. All rights reserved. """ -from __future__ import (print_function, division) +from __future__ import division, print_function -import sys -import logging import json -import click +import logging +import sys +import click from vcf_parser import VCFParser from genmod.vcf_tools import HeaderParser -from .utils import (variant_file, family_file) +from .utils import family_file, variant_file + def get_inheritance_models(variant, family_id, inheritance_keyword): """Return the genetic models found for this family in this variant""" models_found = set([]) - family_models = variant['info_dict'].get(inheritance_keyword, None) + family_models = variant["info_dict"].get(inheritance_keyword, None) if family_models: - #This is a string on the form 'fam_1:AR_hom,fam_2:AR_hom|AR_hom_dn + # This is a string on the form 'fam_1:AR_hom,fam_2:AR_hom|AR_hom_dn for family_info in family_models: - splitted_family = family_info.split(':') + splitted_family = family_info.split(":") if splitted_family[0] == family_id: - models_found = set(splitted_family[1].split('|')) + models_found = set(splitted_family[1].split("|")) return models_found - + ### This is for analyzing the variants ### + @click.command() @variant_file @family_file @@ -66,36 +68,48 @@ def get_inheritance_models(variant, family_id, inheritance_keyword): # type=click.Path(exists=True), # help="""Specify the path to a config file.""" # ) -@click.option('--frequency_keyword', '-freqkey', - default='1000G_freq', - nargs=1, - help='Specify keyword for frequency in vcf. Default 1000G_freq' +@click.option( + "--frequency_keyword", + "-freqkey", + default="1000G_freq", + nargs=1, + help="Specify keyword for frequency in vcf. Default 1000G_freq", ) -@click.option('--frequency_treshold', '-freq', - default=0.05, - nargs=1, - help='Specify the ferquency treshold for variants to be considered. Default=0.05' +@click.option( + "--frequency_treshold", + "-freq", + default=0.05, + nargs=1, + help="Specify the ferquency treshold for variants to be considered. Default=0.05", ) -@click.option('--cadd_keyword', '-caddkey', - default='CADD', - nargs=1, - help='Specify keyword for CADD scores in vcf. Default CADD' +@click.option( + "--cadd_keyword", + "-caddkey", + default="CADD", + nargs=1, + help="Specify keyword for CADD scores in vcf. Default CADD", ) -@click.option('--gq_treshold', '-gq', - default=20.0, - nargs=1, - help='Specify the genotype quality treshold for variants to be considered. Default=50' +@click.option( + "--gq_treshold", + "-gq", + default=20.0, + nargs=1, + help="Specify the genotype quality treshold for variants to be considered. Default=50", ) -@click.option('--read_depth_treshold', '-depth', - default=10.0, - nargs=1, - help="""Specify the genotype quality treshold for variants to be considered. Default=10. - The read deth is taken from AD, so it is the sum of the quality reads from reference and alternative alleles.""" +@click.option( + "--read_depth_treshold", + "-depth", + default=10.0, + nargs=1, + help="""Specify the genotype quality treshold for variants to be considered. Default=10. + The read deth is taken from AD, so it is the sum of the quality reads from reference and alternative alleles.""", ) -@click.option('--cadd_treshold', '-cadd', - default=12.0, - nargs=1, - help='Specify the cadd treshold for variants to be considered. Default 12.0' +@click.option( + "--cadd_treshold", + "-cadd", + default=12.0, + nargs=1, + help="Specify the cadd treshold for variants to be considered. Default 12.0", ) # @click.option('-p', '--patterns', @@ -111,17 +125,25 @@ def get_inheritance_models(variant, family_id, inheritance_keyword): # is_flag=True, # help='Increase output verbosity.' # ) -def summarize(variant_file, family_file, frequency_treshold, frequency_keyword, - cadd_treshold, cadd_keyword, gq_treshold, read_depth_treshold): +def summarize( + variant_file, + family_file, + frequency_treshold, + frequency_keyword, + cadd_treshold, + cadd_keyword, + gq_treshold, + read_depth_treshold, +): """ Summarize the the variants in a vcf. - + There will be one result line per individual. - + - How many variants found\n - - How many variants did not satisfy the base call + - How many variants did not satisfy the base call quality treshold. (Default 20)\n - - How many variants where not covered in all individuals. + - How many variants where not covered in all individuals. (Default depth 10)\n - How many variants followed each model in each family:\n - AR_hom\n @@ -140,47 +162,39 @@ def summarize(variant_file, family_file, frequency_treshold, frequency_keyword, - How many no cadd score\n - How many indels\n - How many indels without cadd score\n - + """ logger = logging.getLogger(__name__) logger = logging.getLogger("genmod.commands.summarize_variants") - + head = HeaderParser() - + nr_of_variants = 0 - - header = ['sample_id', 'nr_of_variants'] - + + header = ["sample_id", "nr_of_variants"] + samples = {} - + logger.debug("Setting up a variant parser") - if variant_file == '-': - variant_parser = VCFParser( - fsock = sys.stdin, - check_info=False - ) + if variant_file == "-": + variant_parser = VCFParser(fsock=sys.stdin, check_info=False) else: - variant_parser = VCFParser( - infile = variant_file, - check_info=False - ) + variant_parser = VCFParser(infile=variant_file, check_info=False) logger.debug("Variant parser setup") - + head = variant_parser.metadata - + for sample_id in head.individuals: samples[sample_id] = {} samples[sample_id]["nr_of_variants"] = 0 - - + for variant in variant_parser: for sample_id in samples: samples[sample_id]["nr_of_variants"] += 1 - print(variant['genotypes'][sample_id].depth_of_coverage) - + print(variant["genotypes"][sample_id].depth_of_coverage) + print(json.dumps(samples)) - - + # inheritance_models = [ # 'AR_hom', # 'AR_hom_dn', @@ -336,5 +350,6 @@ def summarize(variant_file, family_file, frequency_treshold, frequency_keyword, # % (indel_no_cadd, indel_no_cadd/number_of_variants)) # print('Time for analysis: %s' % str(datetime.now()-analysis_start)) -if __name__ == '__main__': - summarize() \ No newline at end of file + +if __name__ == "__main__": + summarize() diff --git a/genmod/commands/utils.py b/genmod/commands/utils.py index 9839397..1c484df 100644 --- a/genmod/commands/utils.py +++ b/genmod/commands/utils.py @@ -1,53 +1,53 @@ -import sys - -from codecs import (open, getreader) import gzip +import sys +from codecs import getreader, open from multiprocessing import cpu_count + import click -variant_file = click.argument('variant_file', - type=click.Path(), - metavar=' or -') +variant_file = click.argument("variant_file", type=click.Path(), metavar=" or -") -outfile = click.option('-o', '--outfile', - type=click.File('w'), - help='Specify the path to a file where results should be stored.') +outfile = click.option( + "-o", + "--outfile", + type=click.File("w"), + help="Specify the path to a file where results should be stored.", +) -silent = click.option('-s', '--silent', - is_flag=True, - help='Do not print the variants.') +silent = click.option("-s", "--silent", is_flag=True, help="Do not print the variants.") -processes = click.option('-p', '--processes', +processes = click.option( + "-p", + "--processes", default=min(4, cpu_count()), - help='Define how many processes that should be use for annotation.') + help="Define how many processes that should be use for annotation.", +) -temp_dir = click.option('--temp_dir', - type=click.Path(exists=True), - help='Path to tempdir') +temp_dir = click.option("--temp_dir", type=click.Path(exists=True), help="Path to tempdir") -family_file = click.option('-f', '--family_file', - type=click.File('r'), - metavar='') +family_file = click.option("-f", "--family_file", type=click.File("r"), metavar="") -family_type = click.option('-t' ,'--family_type', - type=click.Choice(['ped', 'alt', 'cmms', 'mip']), - default='ped', - help='If the analysis use one of the known setups, please specify which one.' +family_type = click.option( + "-t", + "--family_type", + type=click.Choice(["ped", "alt", "cmms", "mip"]), + default="ped", + help="If the analysis use one of the known setups, please specify which one.", ) def get_file_handle(path): """Get a file handle""" - if path == '-': - if sys.version_info < (3,0): - sys.stdin = getreader('utf-8')(sys.stdin) - + if path == "-": + if sys.version_info < (3, 0): + sys.stdin = getreader("utf-8")(sys.stdin) + file_handle = sys.stdin - - elif path.endswith('.gz'): - file_handle = getreader('utf-8')(gzip.open(path, 'r'), errors='replace') - + + elif path.endswith(".gz"): + file_handle = getreader("utf-8")(gzip.open(path, "r"), errors="replace") + else: - file_handle = open(path, 'r') - - return file_handle \ No newline at end of file + file_handle = open(path, "r") + + return file_handle diff --git a/genmod/errors/__init__.py b/genmod/errors/__init__.py index 1540964..472d276 100644 --- a/genmod/errors/__init__.py +++ b/genmod/errors/__init__.py @@ -1,2 +1,2 @@ # -*- coding: utf-8 -*- -from genmod.errors.warning import warning \ No newline at end of file +from genmod.errors.warning import warning diff --git a/genmod/errors/warning.py b/genmod/errors/warning.py index edf88f4..6720b84 100755 --- a/genmod/errors/warning.py +++ b/genmod/errors/warning.py @@ -11,16 +11,18 @@ from __future__ import print_function, unicode_literals -import sys import os +import sys + def warning(*objs): """Prints the warning messages to std err""" print("WARNING: ", *objs, file=sys.stderr) - + + def main(): - print('This is a warning!') + print("This is a warning!") -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/genmod/log.py b/genmod/log.py index 4d02964..b75abf0 100644 --- a/genmod/log.py +++ b/genmod/log.py @@ -1,13 +1,14 @@ +import logging import os import sys -import logging LEVELS = { - 0 : 'WARNING', - 1 : 'INFO', - 2 : 'DEBUG', + 0: "WARNING", + 1: "INFO", + 2: "DEBUG", } + def init_log(logger, filename=None, loglevel=None): """ Initializes the log file in the proper format. @@ -18,7 +19,7 @@ def init_log(logger, filename=None, loglevel=None): be disabled. loglevel (str): Determines the level of the log output. """ - template = '[%(asctime)s] %(levelname)-8s: %(name)-25s: %(message)s' + template = "[%(asctime)s] %(levelname)-8s: %(name)-25s: %(message)s" formatter = logging.Formatter(template) if loglevel: @@ -26,11 +27,11 @@ def init_log(logger, filename=None, loglevel=None): # We will always print warnings and higher to stderr console = logging.StreamHandler() - console.setLevel('WARNING') + console.setLevel("WARNING") console.setFormatter(formatter) if filename: - file_handler = logging.FileHandler(filename, encoding='utf-8') + file_handler = logging.FileHandler(filename, encoding="utf-8") if loglevel: file_handler.setLevel(getattr(logging, loglevel)) file_handler.setFormatter(formatter) @@ -43,6 +44,7 @@ def init_log(logger, filename=None, loglevel=None): logger.addHandler(console) + def get_log_stream(logger): """ Returns a stream to the root log file. diff --git a/genmod/score_variants/__init__.py b/genmod/score_variants/__init__.py index 1dc2497..0cb4ddf 100644 --- a/genmod/score_variants/__init__.py +++ b/genmod/score_variants/__init__.py @@ -1,8 +1,8 @@ from __future__ import absolute_import -from .score_function import ScoreFunction -from .config_parser import ConfigParser -from .score_variant import get_category_score, as_normalized_max_min -from .compound_scorer import CompoundScorer from .check_plugins import check_plugins -from .rank_score_variant_definitions import RANK_SCORE_TYPES, RANK_SCORE_TYPE_NAMES \ No newline at end of file +from .compound_scorer import CompoundScorer +from .config_parser import ConfigParser +from .rank_score_variant_definitions import RANK_SCORE_TYPE_NAMES, RANK_SCORE_TYPES +from .score_function import ScoreFunction +from .score_variant import as_normalized_max_min, get_category_score diff --git a/genmod/score_variants/cap_rank_score_to_min_bound.py b/genmod/score_variants/cap_rank_score_to_min_bound.py index 04fc30f..c8286b1 100644 --- a/genmod/score_variants/cap_rank_score_to_min_bound.py +++ b/genmod/score_variants/cap_rank_score_to_min_bound.py @@ -1,10 +1,10 @@ -from genmod.score_variants.score_variant import MIN_SCORE_NORMALIZED from genmod.score_variants.rank_score_variant_definitions import RANK_SCORE_TYPE_NAMES +from genmod.score_variants.score_variant import MIN_SCORE_NORMALIZED -def cap_rank_score_to_min_bound(rank_score_type: str, - rank_score, - min_rank_score_value: float) -> float: +def cap_rank_score_to_min_bound( + rank_score_type: str, rank_score, min_rank_score_value: float +) -> float: """ Caps rank_score to fall withing MIN bound of normalized rank score, if it's outside valid range. Args: @@ -17,9 +17,9 @@ def cap_rank_score_to_min_bound(rank_score_type: str, """ if rank_score_type not in set(RANK_SCORE_TYPE_NAMES): - raise ValueError(f'Unknown rank score type {rank_score_type}') + raise ValueError(f"Unknown rank score type {rank_score_type}") - if rank_score_type == 'RankScoreNormalized': + if rank_score_type == "RankScoreNormalized": min_rank_score_value = MIN_SCORE_NORMALIZED if rank_score < min_rank_score_value: diff --git a/genmod/score_variants/check_plugins.py b/genmod/score_variants/check_plugins.py index c4b4d01..2abff93 100644 --- a/genmod/score_variants/check_plugins.py +++ b/genmod/score_variants/check_plugins.py @@ -2,37 +2,40 @@ logger = logging.getLogger(__name__) + def check_plugins(config_parser, head): """Check if the plugins exist in vcf file - - Args: - config_parser (ConfigObj): A config object with the plugins - head (HeaderParser): A vcf header object - - Returns: - bool: If all tests passed or not + + Args: + config_parser (ConfigObj): A config object with the plugins + head (HeaderParser): A vcf header object + + Returns: + bool: If all tests passed or not """ all_pass = True for plugin in config_parser.plugins: plugin_object = config_parser.plugins[plugin] logger.debug("Checking plugin {0}".format(plugin)) - if plugin_object.field == 'INFO': + if plugin_object.field == "INFO": info_key = plugin_object.info_key if info_key not in head.info_dict: - logger.warning("INFO field {0} is not in vcf INFO." - " This field will not be scored.".format(info_key)) + logger.warning( + "INFO field {0} is not in vcf INFO." " This field will not be scored.".format( + info_key + ) + ) all_pass = False else: - logger.debug("INFO field {0} was found in vcf INFO.".format( - info_key)) - if info_key == 'CSQ': + logger.debug("INFO field {0} was found in vcf INFO.".format(info_key)) + if info_key == "CSQ": csq_key = plugin_object.csq_key if csq_key not in head.vep_columns: - logger.warning("CSQ field {0} is not in csq annotation." - " This field will not be scored.".format(csq_key)) + logger.warning( + "CSQ field {0} is not in csq annotation." + " This field will not be scored.".format(csq_key) + ) all_pass = False else: - logger.debug("CSQ field {0} was found in csq annotation.".format( - csq_key)) + logger.debug("CSQ field {0} was found in csq annotation.".format(csq_key)) return all_pass - \ No newline at end of file diff --git a/genmod/score_variants/compound_scorer.py b/genmod/score_variants/compound_scorer.py index e90387c..a8da719 100755 --- a/genmod/score_variants/compound_scorer.py +++ b/genmod/score_variants/compound_scorer.py @@ -3,35 +3,40 @@ """ variant_consumer.py -Consumes batches of variants and annotates them. Each batch is a dictionary +Consumes batches of variants and annotates them. Each batch is a dictionary with variant_id:s as keys and dictionaries with variant information. The variants will get different annotations depending on input - + Created by MÃ¥ns Magnusson on 2013-03-01. Copyright (c) 2013 __MyCompanyName__. All rights reserved. """ -from __future__ import (division, print_function) +from __future__ import division, print_function -import sys -import os import logging -from typing import Dict, Tuple, Union, List +import os +import sys from multiprocessing import Process +from typing import Dict, List, Tuple, Union -from genmod.vcf_tools import (replace_vcf_info, add_vcf_info) - -from genmod.score_variants.score_variant import as_normalized_max_min, MIN_SCORE_NORMALIZED, MAX_SCORE_NORMALIZED -from genmod.score_variants.rank_score_variant_definitions import RANK_SCORE_TYPE_NAMES from genmod.score_variants.cap_rank_score_to_min_bound import cap_rank_score_to_min_bound +from genmod.score_variants.rank_score_variant_definitions import RANK_SCORE_TYPE_NAMES +from genmod.score_variants.score_variant import ( + MAX_SCORE_NORMALIZED, + MIN_SCORE_NORMALIZED, + as_normalized_max_min, +) +from genmod.vcf_tools import add_vcf_info, replace_vcf_info logger = logging.getLogger(__name__) -def get_rank_score(rank_score_type: str, - threshold: Union[int, float], - min_rank_score_value: float, - max_rank_score_value: float) -> Union[int, float]: +def get_rank_score( + rank_score_type: str, + threshold: Union[int, float], + min_rank_score_value: float, + max_rank_score_value: float, +) -> Union[int, float]: """ Return raw rank score or normalized rank score. @@ -43,20 +48,24 @@ def get_rank_score(rank_score_type: str, Returns: A rank score like value, possibly normalized """ - if rank_score_type == 'RankScore': + if rank_score_type == "RankScore": return threshold - elif rank_score_type == 'RankScoreNormalized': + elif rank_score_type == "RankScoreNormalized": # Normalize raw rank score - return as_normalized_max_min(score=float(threshold), - min_score_value=min_rank_score_value, - max_score_value=max_rank_score_value) - raise ValueError('Unknown RANK_SCORE_TYPE_NAMES config', rank_score_type) - - -def get_rank_score_as_magnitude(rank_score_type: str, - rank_score: Union[int, float], - min_rank_score_value: float, - max_rank_score_value: float) -> float: + return as_normalized_max_min( + score=float(threshold), + min_score_value=min_rank_score_value, + max_score_value=max_rank_score_value, + ) + raise ValueError("Unknown RANK_SCORE_TYPE_NAMES config", rank_score_type) + + +def get_rank_score_as_magnitude( + rank_score_type: str, + rank_score: Union[int, float], + min_rank_score_value: float, + max_rank_score_value: float, +) -> float: """ Returns rank score as a magnitude (delta), to make the rank score suitable for addition/subtraction operations. @@ -75,57 +84,59 @@ def get_rank_score_as_magnitude(rank_score_type: str, A value, magnitude, compatible with raw or normalized rank score values """ - if rank_score_type == 'RankScore': + if rank_score_type == "RankScore": return rank_score - elif rank_score_type == 'RankScoreNormalized': + elif rank_score_type == "RankScoreNormalized": normalized_rank_score: float = rank_score / (max_rank_score_value - min_rank_score_value) if not (MIN_SCORE_NORMALIZED <= normalized_rank_score <= MAX_SCORE_NORMALIZED): - raise ValueError(f'Failed to normalize to within expected bounds {normalized_rank_score}') + raise ValueError( + f"Failed to normalize to within expected bounds {normalized_rank_score}" + ) return normalized_rank_score - raise ValueError(f'Unknown rank score type {rank_score_type}') + raise ValueError(f"Unknown rank score type {rank_score_type}") + class CompoundScorer(Process): """ - Annotates variant in batches from the task queue and puts the result in + Annotates variant in batches from the task queue and puts the result in the results queue. """ - + def __init__(self, task_queue, results_queue, individuals, threshold: int, penalty: int): """ Initialize the VariantAnnotator - - Consume variant batches from the task queue, annotate them with the - genetic inheritance patterns that they follow and put them in the + + Consume variant batches from the task queue, annotate them with the + genetic inheritance patterns that they follow and put them in the results queue. - + Arguments: task_queue (Queue) results_queue (Queue) individuals (list) """ Process.__init__(self) - + self.proc_name = self.name - - logger.info("Setting up variant_annotator: {0}".format( - self.proc_name)) - + + logger.info("Setting up variant_annotator: {0}".format(self.proc_name)) + logger.debug("Setting up task queue") self.task_queue = task_queue - + logger.debug("Setting up results queue") self.results_queue = results_queue logger.debug("Setting up individuals") self.individuals = individuals - + self.threshold = threshold self.penalty = penalty if len(self.individuals) == 1: - self.models = ['AR_comp', 'AR_comp_dn', 'AD', 'AD_dn'] + self.models = ["AR_comp", "AR_comp_dn", "AD", "AD_dn"] else: - self.models = ['AR_comp', 'AR_comp_dn'] + self.models = ["AR_comp", "AR_comp_dn"] @staticmethod def _get_rankscore_normalization_bounds(variant_batch: Dict[str, Dict]) -> Dict[str, Tuple]: @@ -138,32 +149,39 @@ def _get_rankscore_normalization_bounds(variant_batch: Dict[str, Dict]) -> Dict[ """ variant_rankscore_normalization_bounds: dict = {} for variant_id in variant_batch: - entry_minmax: List[str] = variant_batch[variant_id]['info_dict']['RankScoreMinMax'].split(':') - rankscore_normalization_min_max: tuple = (float(entry_minmax[1]), float(entry_minmax[2])) + entry_minmax: List[str] = variant_batch[variant_id]["info_dict"][ + "RankScoreMinMax" + ].split(":") + rankscore_normalization_min_max: tuple = ( + float(entry_minmax[1]), + float(entry_minmax[2]), + ) if not rankscore_normalization_min_max[0] <= rankscore_normalization_min_max[1]: - raise ValueError(f'Invalid min-max normalization value expected MIN-MAX \ - {rankscore_normalization_min_max}') + raise ValueError(f"Invalid min-max normalization value expected MIN-MAX \ + {rankscore_normalization_min_max}") if variant_id in variant_rankscore_normalization_bounds.keys(): - raise KeyError(f'Cannot add variant ID to normalization data dict since it\'s already present \ - {variant_id}, {variant_rankscore_normalization_bounds}') - variant_rankscore_normalization_bounds.update({variant_id: rankscore_normalization_min_max}) + raise KeyError(f"Cannot add variant ID to normalization data dict since it's already present \ + {variant_id}, {variant_rankscore_normalization_bounds}") + variant_rankscore_normalization_bounds.update( + {variant_id: rankscore_normalization_min_max} + ) return variant_rankscore_normalization_bounds def run(self): """Run the consuming""" - logger.info('%s: Starting!' % self.proc_name) + logger.info("%s: Starting!" % self.proc_name) # Check if there are any batches in the queue while True: # A batch is a dictionary with varints on the form {variant_id:variant_dict} logger.debug("Getting task from task_queue") variant_batch = self.task_queue.get() - + if variant_batch is None: - logger.info('No more batches') + logger.info("No more batches") self.task_queue.task_done() - logger.info('{0}: Exiting'.format(self.proc_name)) + logger.info("{0}: Exiting".format(self.proc_name)) break - + # We need to save the compound scores in a dict and group them by family # This is a dictionary on the form {'variant_id: rank_score} rank_scores = {} @@ -175,14 +193,14 @@ def run(self): for variant_id in variant_batch: # First we store the scores for each variant in a dictionary variant = variant_batch[variant_id] - rank_score_entry = variant['info_dict'].get(f'{rank_score_type}', '') + rank_score_entry = variant["info_dict"].get(f"{rank_score_type}", "") # We need to loop through the families # This entry looks like :, : - for family_rank_score in rank_score_entry.split(','): - family_rank_score = family_rank_score.split(':') + for family_rank_score in rank_score_entry.split(","): + family_rank_score = family_rank_score.split(":") - #TODO check if correct family id + # TODO check if correct family id # Right now we assume that there is only one family in the vcf family_id = family_rank_score[0] rank_score = float(family_rank_score[-1]) @@ -190,23 +208,24 @@ def run(self): rank_scores[rank_score_type][variant_id] = rank_score # Per variant, find rank score max min values used for normalization - variant_rankscore_normalization_bounds: Dict[str, Tuple] = \ + variant_rankscore_normalization_bounds: Dict[str, Tuple] = ( self._get_rankscore_normalization_bounds(variant_batch) - - #We now have a dictionary with variant ids and rank scores, per rank_score_type + ) + + # We now have a dictionary with variant ids and rank scores, per rank_score_type for variant_id in variant_batch: # If the variants only follow AR_comp (and AD for single individual families) # we want to pennalise the score if the compounds have low scores variant = variant_batch[variant_id] - raw_compounds = variant['info_dict'].get('Compounds', None) + raw_compounds = variant["info_dict"].get("Compounds", None) for rank_score_type in RANK_SCORE_TYPE_NAMES: if raw_compounds: logger.debug("Scoring compound for variant %s" % variant_id) - #Variable to see if we should correct the rank score + # Variable to see if we should correct the rank score correct_score = True # First we check if the rank score should be corrected: - for family in variant['info_dict'].get('GeneticModels', '').split(','): - for model in family.split(':')[-1].split('|'): + for family in variant["info_dict"].get("GeneticModels", "").split(","): + for model in family.split(":")[-1].split("|"): # If the variant follows any model more than the specified it should # not be corrected if model not in self.models: @@ -225,40 +244,51 @@ def run(self): # family_id and compounds splitted with ':' # list of compounds splitted on '|' - #TODO Only checks first family now - family_compound_entry = raw_compounds.split(',')[0] - splitted_entry = family_compound_entry.split(':') + # TODO Only checks first family now + family_compound_entry = raw_compounds.split(",")[0] + splitted_entry = family_compound_entry.split(":") compound_family_id = splitted_entry[0] - compound_list = splitted_entry[-1].split('|') + compound_list = splitted_entry[-1].split("|") - logger.debug("Checking compounds for family {0}".format( - compound_family_id)) + logger.debug("Checking compounds for family {0}".format(compound_family_id)) - #Loop through compounds to check if they are only low scored + # Loop through compounds to check if they are only low scored for compound_id in compound_list: compound_rank_score = rank_scores[rank_score_type][compound_id] - if compound_rank_score > get_rank_score(rank_score_type=rank_score_type, - threshold=self.threshold, - min_rank_score_value=variant_rankscore_normalization_bounds[variant_id][0], - max_rank_score_value=variant_rankscore_normalization_bounds[variant_id][1] - ): + if compound_rank_score > get_rank_score( + rank_score_type=rank_score_type, + threshold=self.threshold, + min_rank_score_value=variant_rankscore_normalization_bounds[ + variant_id + ][0], + max_rank_score_value=variant_rankscore_normalization_bounds[ + variant_id + ][1], + ): only_low = False logger.debug("Setting only_low to {0}".format(only_low)) - if (correct_score and only_low): - logger.debug("correcting rank score for {0}".format( - variant_id)) - current_rank_score -= get_rank_score_as_magnitude(rank_score_type=rank_score_type, - rank_score=self.penalty, - min_rank_score_value=variant_rankscore_normalization_bounds[variant_id][0], - max_rank_score_value=variant_rankscore_normalization_bounds[variant_id][1] - ) + if correct_score and only_low: + logger.debug("correcting rank score for {0}".format(variant_id)) + current_rank_score -= get_rank_score_as_magnitude( + rank_score_type=rank_score_type, + rank_score=self.penalty, + min_rank_score_value=variant_rankscore_normalization_bounds[ + variant_id + ][0], + max_rank_score_value=variant_rankscore_normalization_bounds[ + variant_id + ][1], + ) # In case the current_rank_score falls outside normalization bounds after modification, # cap it to within the MIN normalization bound. - current_rank_score = cap_rank_score_to_min_bound(rank_score_type=rank_score_type, - rank_score=current_rank_score, - min_rank_score_value=variant_rankscore_normalization_bounds[variant_id][0] - ) + current_rank_score = cap_rank_score_to_min_bound( + rank_score_type=rank_score_type, + rank_score=current_rank_score, + min_rank_score_value=variant_rankscore_normalization_bounds[ + variant_id + ][0], + ) for compound_id in compound_list: logger.debug("Checking compound {0}".format(compound_id)) @@ -273,39 +303,48 @@ def run(self): # Sort compound variants lexicographically scored_compound_list.sort() new_compound_string = "{0}:{1}".format( - compound_family_id, '|'.join(scored_compound_list)) + compound_family_id, "|".join(scored_compound_list) + ) - current_rank_score = float(current_rank_score) # Export rank score as float type - new_rank_score_string = "{0}:{1}".format(compound_family_id, current_rank_score) + current_rank_score = float( + current_rank_score + ) # Export rank score as float type + new_rank_score_string = "{0}:{1}".format( + compound_family_id, current_rank_score + ) # variant['info_dict']['IndividualRankScore'] = current_rank_score_string - variant['info_dict'][f'{rank_score_type}'] = new_rank_score_string - variant['info_dict'][f'Compounds{rank_score_type.strip("RankScore")}'] = new_compound_string + variant["info_dict"][f"{rank_score_type}"] = new_rank_score_string + variant["info_dict"][f'Compounds{rank_score_type.strip("RankScore")}'] = ( + new_compound_string + ) variant = replace_vcf_info( - keyword=f'{rank_score_type}', - annotation = new_rank_score_string, - variant_dict=variant + keyword=f"{rank_score_type}", + annotation=new_rank_score_string, + variant_dict=variant, ) # CompoundsNormalized is not previously added to VCF. # For this case, perform an VCF INFO ADD operation, rather than a REPLACE keyword_compounds = f'Compounds{rank_score_type.strip("RankScore")}' fn_add_replace_vcf_info = replace_vcf_info - if not (keyword_compounds in variant['INFO'] and - keyword_compounds in variant['info_dict']): + if not ( + keyword_compounds in variant["INFO"] + and keyword_compounds in variant["info_dict"] + ): # In case INFO subfield is not previously added to VCF, # there's a need to do so now. fn_add_replace_vcf_info = add_vcf_info variant = fn_add_replace_vcf_info( keyword=keyword_compounds, annotation=new_compound_string, - variant_dict=variant + variant_dict=variant, ) logger.debug("Putting variant in results_queue") self.results_queue.put(variant) - + self.task_queue.task_done() - + return diff --git a/genmod/score_variants/config_parser.py b/genmod/score_variants/config_parser.py index 651ea8e..5ff54c0 100755 --- a/genmod/score_variants/config_parser.py +++ b/genmod/score_variants/config_parser.py @@ -25,121 +25,130 @@ from __future__ import print_function import logging + import click import configobj - +from extract_vcf import Plugin from six import string_types from validate import ValidateError -from extract_vcf import Plugin - from genmod.score_variants import ScoreFunction + class ConfigParser(configobj.ConfigObj): """ Class for holding information from config file. - + """ - def __init__(self, config_file, indent_type=' ', encoding='utf-8'): + + def __init__(self, config_file, indent_type=" ", encoding="utf-8"): super(ConfigParser, self).__init__( - infile=config_file, - indent_type=indent_type, - encoding=encoding, - ) + infile=config_file, + indent_type=indent_type, + encoding=encoding, + ) self.logger = logging.getLogger(__name__) self.logger = logging.getLogger("genmod.score_varaints.config_parser") - + self.categories = {} self.plugins = {} self.score_functions = {} - - self.vcf_columns = ['CHROM','POS','ID','REF','ALT', 'FILTER','QUAL', - 'FILTER','INFO','FORMAT','sample_id'] - - self.data_types = ['integer','float','flag','character','string'] - + + self.vcf_columns = [ + "CHROM", + "POS", + "ID", + "REF", + "ALT", + "FILTER", + "QUAL", + "FILTER", + "INFO", + "FORMAT", + "sample_id", + ] + + self.data_types = ["integer", "float", "flag", "character", "string"] + self.logger.info("Checking version and name") self.version_check() - self.version = float(self['Version']['version']) + self.version = float(self["Version"]["version"]) self.logger.debug("Set version to {0}".format(self.version)) - - self.name = self['Version']['name'] + + self.name = self["Version"]["name"] self.logger.debug("Set name to {0}".format(self.name)) - self.logger.info("Config name: {0}".format(self['Version']['name'])) - self.logger.info("Config version: {0}".format(self['Version']['version'])) - + self.logger.info("Config name: {0}".format(self["Version"]["name"])) + self.logger.info("Config version: {0}".format(self["Version"]["version"])) + for plugin in self.keys(): - if plugin != 'Version' and plugin != 'Categories': + if plugin != "Version" and plugin != "Categories": self.plugins[plugin] = None self.score_functions[plugin] = None - - self.logger.info("Found plugins: {0}".format( - ', '.join(list(self.plugins.keys())))) - + + self.logger.info("Found plugins: {0}".format(", ".join(list(self.plugins.keys())))) + self.logger.info("Checking categories") - for category in self['Categories']: + for category in self["Categories"]: self.logger.debug("Found category {0}".format(category)) self.categories[category] = {} - aggregation = self['Categories'][category].get('category_aggregation', 'min') + aggregation = self["Categories"][category].get("category_aggregation", "min") self.logger.debug("Setting aggregation to {0}".format(aggregation)) - self.categories[category]['category_aggregation'] = aggregation - self.categories[category]['plugins'] = [] - + self.categories[category]["category_aggregation"] = aggregation + self.categories[category]["plugins"] = [] + self.logger.info("Checking plugins") - + for plugin in self.keys(): - if plugin != 'Version' and plugin != 'Categories': + if plugin != "Version" and plugin != "Categories": self.logger.debug("Checking plugin: {0}".format(plugin)) self.check_plugin(plugin) self.logger.debug("Plugin {0} is ok".format(plugin)) plugin_info = self[plugin] - + string_rules = {} - if plugin_info['data_type'] == 'string': - self.logger.info("Checking string rules for plugin {0}".format( - plugin - )) + if plugin_info["data_type"] == "string": + self.logger.info("Checking string rules for plugin {0}".format(plugin)) string_rules = self.get_string_dict(plugin_info) - + self.logger.debug("Adding plugin {0} to ConfigParser".format(plugin)) - - category = plugin_info.get('category', None) - + + category = plugin_info.get("category", None) + self.plugins[plugin] = Plugin( - name=plugin, - field=plugin_info['field'], - data_type=plugin_info['data_type'], - separators=plugin_info.get('separators',[]), - info_key=plugin_info.get('info_key',None), + name=plugin, + field=plugin_info["field"], + data_type=plugin_info["data_type"], + separators=plugin_info.get("separators", []), + info_key=plugin_info.get("info_key", None), category=category, - csq_key=plugin_info.get('csq_key', None), - record_rule=plugin_info.get('record_rule', 'max'), - string_rules=string_rules + csq_key=plugin_info.get("csq_key", None), + record_rule=plugin_info.get("record_rule", "max"), + string_rules=string_rules, ) - + if category: if category in self.categories: - self.categories[category]['plugins'].append(plugin) - self.logger.debug("Adding {0} to category {1}".format( - plugin, category)) + self.categories[category]["plugins"].append(plugin) + self.logger.debug("Adding {0} to category {1}".format(plugin, category)) else: - self.logger.error("{0} have an undefined category {1}".format( - plugin, category)) - self.logger.info("Please add specifications for category {0}".format( - category - )) - raise ValidateError("Plugins must have a category"\ - " defined in 'Categories' section") + self.logger.error( + "{0} have an undefined category {1}".format(plugin, category) + ) + self.logger.info( + "Please add specifications for category {0}".format(category) + ) + raise ValidateError( + "Plugins must have a category" " defined in 'Categories' section" + ) else: self.logger.error("{0} is missing category".format(plugin)) raise ValidateError("Plugins must have a category") - + self.logger.info("Check score function for plugin {0}".format(plugin)) self.score_functions[plugin] = self.get_score_function(plugin_info) self.logger.debug("Added score function for plugin {0}".format(plugin)) - # # logger.info("Checking plugin scores") # for plugin in self.plugins: @@ -148,132 +157,121 @@ def __init__(self, config_file, indent_type=' ', encoding='utf-8'): # def get_score_function(self, plugin_info): """Convert the scoring information - + If data_type = String we use the string dict to get the score If data_type = Flag we only need to check if we have an annotation If data_type = Float or Int we can build a interval tree with score as data - + Arguments: plugin_info (dict): A dictionary with plugin information - + """ - + score_dict = {} - + for key in plugin_info: try: score_dict[key] = dict(plugin_info[key]) except ValueError: pass - match_type = plugin_info['data_type'] - + match_type = plugin_info["data_type"] + score_function = ScoreFunction(match_type=match_type) - + for score_name in score_dict: raw_info = score_dict[score_name] - - if score_name == 'not_reported': - not_reported_score = float(raw_info['score']) + + if score_name == "not_reported": + not_reported_score = float(raw_info["score"]) score_function.set_not_reported(not_reported_score) - - elif match_type == 'flag': - reported_score = float(raw_info['score']) + + elif match_type == "flag": + reported_score = float(raw_info["score"]) score_function.set_reported(reported_score) - - elif match_type == 'string': - string = raw_info['string'] - score = float(raw_info['score']) + + elif match_type == "string": + string = raw_info["string"] + score = float(raw_info["score"]) score_function.add_string_rule(key=string, score=score) - + else: - if raw_info['score'] == 'eq': + if raw_info["score"] == "eq": score_function.set_equal() - elif raw_info.get('value'): - score_function.add_value( - value = raw_info['value'], - score = raw_info['score'] - ) + elif raw_info.get("value"): + score_function.add_value(value=raw_info["value"], score=raw_info["score"]) else: - lower_bound = float(raw_info['lower']) - upper_bound = float(raw_info['upper']) - score = float(raw_info['score']) - score_function.add_interval( - lower=lower_bound, - upper=upper_bound, - score=score - ) - + lower_bound = float(raw_info["lower"]) + upper_bound = float(raw_info["upper"]) + score = float(raw_info["score"]) + score_function.add_interval(lower=lower_bound, upper=upper_bound, score=score) + return score_function - - + def get_string_dict(self, plugin_info): """Convert a section with information of priorities to a string dict. - + Arguments: plugin_info (dict): A dictionary with plugin information - + Return: string_dict (dict): A dictionary with strings as keys and integer that specifies their priorities as values """ string_info = [] string_dict = {} - + for key in plugin_info: try: - if key != 'not_reported': + if key != "not_reported": string_info.append(dict(plugin_info[key])) except ValueError: pass - + string_rules = {} - + for raw_info in string_info: try: - string = raw_info['string'] + string = raw_info["string"] except KeyError: - raise ValidateError("String information has to have a 'string'") try: - priority = raw_info['priority'] + priority = raw_info["priority"] except KeyError: raise ValidateError("String information has to have a 'priority'") try: priority = int(priority) except ValueError: raise ValidateError("'priority' has to be an integer") - + string_dict[string] = priority if len(string_dict) == 0: raise ValidateError("'string' entrys must have string rules defined") - + return string_dict - - - + def version_check(self): """ Check if the version entry is in the proper format """ try: - version_info = self['Version'] + version_info = self["Version"] except KeyError: - raise ValidateError('Config file has to have a Version section') + raise ValidateError("Config file has to have a Version section") try: - float(version_info['version']) + float(version_info["version"]) except KeyError: - raise ValidateError('Config file has to have a version section') + raise ValidateError("Config file has to have a version section") except ValueError: - raise ValidateError('Version has to be a float.') - + raise ValidateError("Version has to be a float.") + try: - version_info['name'] + version_info["name"] except KeyError: raise ValidateError("Config file has to have a name") return - + def check_plugin(self, plugin): """ Check if the section is in the proper format vcf format. @@ -285,114 +283,99 @@ def check_plugin(self, plugin): True is it is in the proper format """ - + vcf_section = self[plugin] - + try: - vcf_field = vcf_section['field'] - if not vcf_field in self.vcf_columns: + vcf_field = vcf_section["field"] + if not vcf_field in self.vcf_columns: raise ValidateError( - "field has to be in {0}\n" - "Wrong field name in plugin: {1}".format( + "field has to be in {0}\n" "Wrong field name in plugin: {1}".format( self.vcf_columns, plugin - )) - if vcf_field == 'INFO': + ) + ) + if vcf_field == "INFO": try: - info_key = vcf_section['info_key'] + info_key = vcf_section["info_key"] - if info_key == 'CSQ': + if info_key == "CSQ": try: - csq_key = vcf_section['csq_key'] + csq_key = vcf_section["csq_key"] except KeyError: try: - csq_key = vcf_section['vep_key'] + csq_key = vcf_section["vep_key"] except KeyError: raise ValidateError( - "CSQ entrys has to refer to an csq field.\n" - "Refer with keyword 'csq_key'\n" - "csq_key is missing in section: {0}".format( - plugin + "CSQ entrys has to refer to an csq field.\n" + "Refer with keyword 'csq_key'\n" + "csq_key is missing in section: {0}".format(plugin) ) - ) - except KeyError: raise ValidateError( "INFO entrys has to refer to an INFO field.\n" "Refer with keyword 'info_key'\n" - "info_key is missing in section: {0}".format( - plugin - ) - ) + "info_key is missing in section: {0}".format(plugin) + ) except KeyError: raise ValidateError( "Vcf entrys have to refer to a field in the VCF with keyword" - " 'field'.\nMissing keyword 'field' in plugin: {0}".format( - plugin - )) + " 'field'.\nMissing keyword 'field' in plugin: {0}".format(plugin) + ) try: - data_type = vcf_section['data_type'] + data_type = vcf_section["data_type"] if not data_type in self.data_types: raise ValidateError( - "data_type has to be in {0}\n" - "Wrong data_type in plugin: {1}".format( - self.data_types, plugin) + "data_type has to be in {0}\n" "Wrong data_type in plugin: {1}".format( + self.data_types, plugin ) + ) except KeyError: raise ValidateError( "Vcf entrys have to refer to a data type in the VCF with " "keyword 'data_type'.\n" "Missing data_type in plugin: {0}".format(plugin) - ) + ) + + separators = vcf_section.get("separators", None) - - separators = vcf_section.get('separators', None) - if separators: if len(separators) == 1: - self[plugin]['separators'] = list(separators) + self[plugin]["separators"] = list(separators) else: - if data_type != 'flag': + if data_type != "flag": raise ValidateError( "If data_type != flag the separators have to be defined.\n" "Missing separators in plugin: {0}".format(plugin) - ) - - - record_rule = vcf_section.get('record_rule', None) - + ) + + record_rule = vcf_section.get("record_rule", None) + if record_rule: - if not record_rule in ['min', 'max']: + if not record_rule in ["min", "max"]: raise ValidateError( - "Record rules have to be in {0}\n" - "Wrong record_rule in plugin: {1}".format( - ['min', 'max'], plugin) + "Record rules have to be in {0}\n" "Wrong record_rule in plugin: {1}".format( + ["min", "max"], plugin + ) ) else: self.logger.info("Setting record rule to default: 'max'") - + return True + @click.command() -@click.argument('config_file', - nargs=1, - type=click.Path(exists=True) -) -@click.option('-out', '--outfile', - nargs=1, - type=click.File('w') -) -@click.option('-l', '--loglevel', - type=click.Choice(['DEBUG', 'INFO', 'WARNING']), - default = 'INFO' -) +@click.argument("config_file", nargs=1, type=click.Path(exists=True)) +@click.option("-out", "--outfile", nargs=1, type=click.File("w")) +@click.option("-l", "--loglevel", type=click.Choice(["DEBUG", "INFO", "WARNING"]), default="INFO") def read_config(config_file, outfile, loglevel): """Parse the config file and print it to the output.""" from genmod import logger from genmod.log import init_log - init_log(logger, loglevel='DEBUG') + + init_log(logger, loglevel="DEBUG") logger.info("Reading Config File: {0}".format(config_file)) @@ -400,15 +383,11 @@ def read_config(config_file, outfile, loglevel): for plugin in config_reader.plugins: logger.info("Found plugin:{0}".format(plugin)) - logger.info("{0}: {1}".format( - plugin,config_reader.plugins[plugin]) - ) + logger.info("{0}: {1}".format(plugin, config_reader.plugins[plugin])) for category in config_reader.categories: - logger.info("Category {0}: {1}".format( - category, config_reader.categories[category] - )) + logger.info("Category {0}: {1}".format(category, config_reader.categories[category])) -if __name__ == '__main__': +if __name__ == "__main__": read_config() diff --git a/genmod/score_variants/rank_score_variant_definitions.py b/genmod/score_variants/rank_score_variant_definitions.py index b89a6f3..44c23e1 100644 --- a/genmod/score_variants/rank_score_variant_definitions.py +++ b/genmod/score_variants/rank_score_variant_definitions.py @@ -1,8 +1,8 @@ from typing import Dict, List # Types of rank scores provided by scoring, raw and normalized rank scores -RANK_SCORE_TYPES: Dict[str, str]= { - 'RankScore': 'The rank score for this variant in this family. family_id:rank_score.', - 'RankScoreNormalized': 'The normalized rank score in range(0, 1) for this variant in this family. family_id:rank_score.' +RANK_SCORE_TYPES: Dict[str, str] = { + "RankScore": "The rank score for this variant in this family. family_id:rank_score.", + "RankScoreNormalized": "The normalized rank score in range(0, 1) for this variant in this family. family_id:rank_score.", } RANK_SCORE_TYPE_NAMES: List[str] = list(RANK_SCORE_TYPES.keys()) diff --git a/genmod/score_variants/score_function.py b/genmod/score_variants/score_function.py index b3d08d0..ae155da 100755 --- a/genmod/score_variants/score_function.py +++ b/genmod/score_variants/score_function.py @@ -12,16 +12,17 @@ from __future__ import print_function import logging +from enum import Enum from typing import List from intervaltree import IntervalTree -from enum import Enum class ModeLookup(Enum): """ Class that abstracts ScoreFunction's method for score lookup """ + # Score lookup from value dict VALUE = 1 # Score lookup string dict @@ -34,11 +35,12 @@ class ModeLookup(Enum): class ScoreFunction(object): """Class for holding score functions""" + def __init__(self, match_type, equal=False): super(ScoreFunction, self).__init__() self.logger = logging.getLogger(__name__) self.logger.debug("Initializing match_type to:{0}".format(match_type)) - self.match_type = match_type #['integer','float','flag','character','string'] + self.match_type = match_type # ['integer','float','flag','character','string'] self.logger.debug("Initializing string_dict to:{}") self._string_dict = {} self.logger.debug("Initializing interval_tree") @@ -48,88 +50,86 @@ def __init__(self, match_type, equal=False): self.logger.debug("Initializing not_reported_score to 0") self._not_reported_score = 0 self.logger.debug("Initializing reported_score to 0") - self._reported_score = 0 # only for 'flag' + self._reported_score = 0 # only for 'flag' # If the score is the same as the value found: self.logger.debug("Initializing equal to {0}".format(equal)) self._equal = equal - + def add_interval(self, lower, upper, score): """Add an interval to the score function - - Args: - lower (int,float): The lower bound of the interval - upper (int,float): The upper bound of the interval - score (int,float): The score of the interval + + Args: + lower (int,float): The lower bound of the interval + upper (int,float): The upper bound of the interval + score (int,float): The score of the interval """ - self.logger.debug("Adding interval {0} to score function".format( - ','.join([str(lower), str(upper), str(score)]) - )) + self.logger.debug( + "Adding interval {0} to score function".format( + ",".join([str(lower), str(upper), str(score)]) + ) + ) ##TODO Check if intervals overlap self._interval_tree[lower:upper] = score - + return - + def add_string_rule(self, key, score): """Add the score for a string match - - Args: - key (str): The string that should be matched - score (int): The score for the match - + + Args: + key (str): The string that should be matched + score (int): The score for the match + """ - self.logger.debug("Adding string {0} with score {1} to string_dict".format( - key, str(score)) - ) + self.logger.debug("Adding string {0} with score {1} to string_dict".format(key, str(score))) self._string_dict[key.lower()] = score return def add_value(self, value, score): """Add the score for a value match - - Args: - value (number): The number that should be matched - score (int): The score for the match - + + Args: + value (number): The number that should be matched + score (int): The score for the match + """ - self.logger.debug("Adding value {0} with score {1} to value_dict".format( - value, str(score)) - ) + self.logger.debug("Adding value {0} with score {1} to value_dict".format(value, str(score))) self._value_dict[str(value)] = score return - + def get_score(self, value): """Take a value and return a score - - - If value is None we return the not_reported score - - If value is NOT None but does not have a rule we return 0 - - If Score function is a string comparison we match the string - - If value is a number (float or int): - - if operator is equal we return the number - - else return data of interval - - Args: - value (str): The value that we want to find the score for - - Return: - score (number): The score for this value + + - If value is None we return the not_reported score + - If value is NOT None but does not have a rule we return 0 + - If Score function is a string comparison we match the string + - If value is a number (float or int): + - if operator is equal we return the number + - else return data of interval + + Args: + value (str): The value that we want to find the score for + + Return: + score (number): The score for this value """ score = 0 - + if not value: self.logger.debug("No value found set score to not reported score") score = self._not_reported_score - - # Here we know there is a value - elif self.match_type == 'flag': + + # Here we know there is a value + elif self.match_type == "flag": self.logger.debug("Flag found set score reported score") score = self._reported_score - - elif self.match_type in ['string', 'char']: + + elif self.match_type in ["string", "char"]: score = self._string_dict.get(value.lower(), 0) - + # We know that match type must be any of integer or float else: - if self.match_type == 'float': + if self.match_type == "float": try: value = float(value) except ValueError: @@ -139,37 +139,36 @@ def get_score(self, value): value = int(value) except ValueError: raise ValueError("Value has to be a number") - + if self._equal: score = value - + else: if self._value_dict: score = float(self._value_dict.get(str(value), 0)) self.logger.debug("Got score from value dict") else: - #There should only be one interval matching + # There should only be one interval matching ##TODO Check if intervals overlap for interval in self._interval_tree[value]: score = interval.data self.logger.debug("Got score from interval tree") - + # For now we only allow integers as score ## TODO fix this ugly solution try: score = int(score) except error as e: score = int(float(score)) - - + return score - + def set_not_reported(self, value): """Set the not reported score - + Args: value (int, float): The not reported score - + """ self.logger.debug("Setting not_reported_score to {0}".format(value)) self._not_reported_score = float(value) @@ -177,17 +176,16 @@ def set_not_reported(self, value): def set_reported(self, value): """Set the reported score - + Args: value (int, float): The reported score """ self.logger.debug("Setting reported_score to {0}".format(value)) self._reported_score = float(value) return - + def set_equal(self): - """Set _equal to True - """ + """Set _equal to True""" self.logger.debug("Setting equal to True") self._equal = True return @@ -214,7 +212,9 @@ def _scoring_mode(self) -> ModeLookup: mode_str: bool = bool(self._string_dict) mode_tree: bool = bool(self._interval_tree) if sum([mode_value, mode_str, mode_tree]) > 1: - raise ValueError('Unable to accurately determine what mapping to use for determining score range') + raise ValueError( + "Unable to accurately determine what mapping to use for determining score range" + ) if mode_value: return ModeLookup.VALUE if mode_str: @@ -234,27 +234,31 @@ def score_range(self) -> List[float]: """ if self._scoring_mode == ModeLookup.UNBOUNDED_USER_DEFINED: # Invalid request to expect a known range from an unknown plugin config - raise ValueError('User supplied score values does not have a known score range') + raise ValueError("User supplied score values does not have a known score range") elif self._scoring_mode == ModeLookup.VALUE: - scores: list = [float(score_value) for score_value in self._value_dict.values()] # val -> score + scores: list = [ + float(score_value) for score_value in self._value_dict.values() + ] # val -> score elif self._scoring_mode == ModeLookup.STRING: - scores: list = [float(score_value) for score_value in self._string_dict.values()] # str -> score + scores: list = [ + float(score_value) for score_value in self._string_dict.values() + ] # str -> score elif self._scoring_mode == ModeLookup.TREE: scores: list = [] for interval in self._interval_tree.all_intervals: scores.append(interval.data) # tree.interval -> score else: - raise NotImplementedError('Unknown scoring mode', self._scoring_mode) + raise NotImplementedError("Unknown scoring mode", self._scoring_mode) # Append set_reported and set_not_reported scores (as they're part of score value set) scores.append(float(self._not_reported_score)) scores.append(float(self._reported_score)) if not isinstance(scores, list) and len(scores) > 0: - raise KeyError('Found no score values', scores) + raise KeyError("Found no score values", scores) for score_value in scores: if not isinstance(score_value, float): - raise TypeError('Invalid score type', score_value) + raise TypeError("Invalid score type", score_value) return scores @property diff --git a/genmod/score_variants/score_variant.py b/genmod/score_variants/score_variant.py index e8feeda..8c02ea0 100644 --- a/genmod/score_variants/score_variant.py +++ b/genmod/score_variants/score_variant.py @@ -4,7 +4,7 @@ """ import logging -from typing import Any, Tuple, List +from typing import Any, List, Tuple logger = logging.getLogger(__name__) @@ -12,68 +12,74 @@ MAX_SCORE_NORMALIZED: float = 1.0 -def get_plugin_score(variant, plugin_name, config_parser, csq_format=None) -> Tuple[Any, float, float]: +def get_plugin_score( + variant, plugin_name, config_parser, csq_format=None +) -> Tuple[Any, float, float]: """Return the score found for a plugin - - Args: - variant (dict): A variant dictionary - plugin_name (str): A plugin name - config_parser (ConfigParser): A config parser object with score functions - Returns: - category_score (float): The rank score for this variant - + + Args: + variant (dict): A variant dictionary + plugin_name (str): A plugin name + config_parser (ConfigParser): A config parser object with score functions + Returns: + category_score (float): The rank score for this variant + """ logger.debug("Checking scores for plugin {0}".format(plugin_name)) # This is a plugin to collect the correct value from the vcf plugin = config_parser.plugins[plugin_name] # This is the score function for this plugin score_function = config_parser.score_functions[plugin_name] - + value = plugin.get_value(variant_dict=variant, csq_format=csq_format) logger.debug("Found value {0} for plugin {1}".format(value, plugin_name)) - + # Score is allways a number score = score_function.get_score(value) logger.debug("Score is {0} for plugin {1}".format(score, plugin_name)) plugin_score_min: float = score_function.score_min plugin_score_max: float = score_function.score_max - logger.debug(f'Minmax scores for plugin {plugin_name} is ({plugin_score_min},{plugin_score_max})') - + logger.debug( + f"Minmax scores for plugin {plugin_name} is ({plugin_score_min},{plugin_score_max})" + ) + return score, plugin_score_min, plugin_score_max -def get_category_score(variant, category, config_parser, csq_format=None) -> Tuple[int, float, float]: +def get_category_score( + variant, category, config_parser, csq_format=None +) -> Tuple[int, float, float]: """Return the score for a given category. - A category (such as 'allele_frequency') groups multiple plugin scores. - This method selects final score based on category_aggregation [Categories] - rule (see genmod_example.ini) + A category (such as 'allele_frequency') groups multiple plugin scores. + This method selects final score based on category_aggregation [Categories] + rule (see genmod_example.ini) - Args: - variant (dict): A variant dictionary - category (str): A score category - config_parser (ConfigParser): A config parser object with score functions - Returns: - category_score, sum of min and max scores for this category + Args: + variant (dict): A variant dictionary + category (str): A score category + config_parser (ConfigParser): A config parser object with score functions + Returns: + category_score, sum of min and max scores for this category """ category_score = 0 logger.debug("Checking scores for category {0}".format(category)) - #We save all the scores for a category - category_aggregate = config_parser.categories[category]['category_aggregation'] + # We save all the scores for a category + category_aggregate = config_parser.categories[category]["category_aggregation"] category_scores = [] category_score_mins: List[float] = [] category_score_maxs: List[float] = [] - for plugin_name in config_parser.categories[category]['plugins']: + for plugin_name in config_parser.categories[category]["plugins"]: plugin_score, plugin_score_min, plugin_score_max = get_plugin_score( - variant = variant, - plugin_name = plugin_name, - config_parser = config_parser, - csq_format = csq_format - ) + variant=variant, + plugin_name=plugin_name, + config_parser=config_parser, + csq_format=csq_format, + ) category_scores.append(plugin_score) # Add the maximal and minimal score points that can be provided for this category category_score_mins.append(plugin_score_min) @@ -84,44 +90,39 @@ def get_category_score(variant, category, config_parser, csq_format=None) -> Tup category_score_max: float = 0.0 if category_scores: - - if category_aggregate == 'max' and category_scores: - logger.debug("Take the max score for category {0}".format( - category)) + if category_aggregate == "max" and category_scores: + logger.debug("Take the max score for category {0}".format(category)) category_score = max(category_scores) category_score_min = max(category_score_mins) category_score_max = max(category_score_maxs) - logger.debug("Max value is {0}".format( - category_score)) - elif category_aggregate == 'min' and category_scores: - logger.debug("Take the min score for category {0}".format( - category)) + logger.debug("Max value is {0}".format(category_score)) + elif category_aggregate == "min" and category_scores: + logger.debug("Take the min score for category {0}".format(category)) category_score = min(category_scores) category_score_min = min(category_score_mins) category_score_max = min(category_score_maxs) - logger.debug("Min value is {0}".format( - category_score)) - elif category_aggregate == 'sum' and category_scores: - logger.debug("Take the sum of scores score for category {0}".format( - category)) + logger.debug("Min value is {0}".format(category_score)) + elif category_aggregate == "sum" and category_scores: + logger.debug("Take the sum of scores score for category {0}".format(category)) category_score = sum(category_scores) category_score_min = sum(category_score_mins) category_score_max = sum(category_score_maxs) - logger.debug("Sum of scores is {0}".format( - category_score)) + logger.debug("Sum of scores is {0}".format(category_score)) else: logger.debug("No scores found for category {0}".format(category)) if not (category_score_min <= category_score <= category_score_max): - raise ValueError('Category score outside expected category score range', - category_score_min, category_score, category_score_max) + raise ValueError( + "Category score outside expected category score range", + category_score_min, + category_score, + category_score_max, + ) return category_score, category_score_min, category_score_max -def as_normalized_max_min(score: float, - min_score_value: float, - max_score_value: float) -> float: +def as_normalized_max_min(score: float, min_score_value: float, max_score_value: float) -> float: """ Performs max-min normalization of score to range (0, 1). Args: @@ -131,15 +132,24 @@ def as_normalized_max_min(score: float, Returns: Score in range (0, 1) """ - for key, value in {'score': score, - 'min_score_value': min_score_value, - 'max_score_value': max_score_value}.items(): + for key, value in { + "score": score, + "min_score_value": min_score_value, + "max_score_value": max_score_value, + }.items(): if not isinstance(value, float): - raise TypeError(f'Bad type {key} {type(value)} {value}') + raise TypeError(f"Bad type {key} {type(value)} {value}") if not max_score_value >= min_score_value: - raise ValueError('Inverted minmax values for normalization', max_score_value, min_score_value) + raise ValueError( + "Inverted minmax values for normalization", max_score_value, min_score_value + ) score_normalized: float = (score - min_score_value) / (max_score_value - min_score_value) if not (MIN_SCORE_NORMALIZED <= score_normalized <= MAX_SCORE_NORMALIZED): - raise ValueError('Failed to normalize to within expected bounds', - min_score_value, max_score_value, score, score_normalized) + raise ValueError( + "Failed to normalize to within expected bounds", + min_score_value, + max_score_value, + score, + score_normalized, + ) return score_normalized diff --git a/genmod/utils/__init__.py b/genmod/utils/__init__.py index 5f64098..c006657 100644 --- a/genmod/utils/__init__.py +++ b/genmod/utils/__init__.py @@ -1,67 +1,67 @@ # -*- coding: utf-8 -*- INTERESTING_SO_TERMS = set( - [ - u'transcript_ablation', - u'splice_donor_variant', - u'splice_acceptor_variant', - u'stop_gained', - u'start_lost', - u'frameshift_variant', - u'stop_lost', - u'initiator_codon_variant', - u'transcript_amplification', - u'inframe_insertion', - u'inframe_deletion', - u'protein_altering_variant', - u'missense_variant', - u'splice_region_variant', - u'incomplete_terminal_codon_variant', - u'stop_retained_variant', - u'synonymous_variant', - u'coding_sequence_variant', - u'mature_miRNA_variant', - u'5_prime_UTR_variant', - u'3_prime_UTR_variant', - u'non_coding_exon_variant', - u'non_coding_transcript_exon_variant', - u'non_coding_transcript_variant', - u'nc_transcript_variant', - u'intron_variant', - u'NMD_transcript_variant', - u'non_coding_transcript_variant', - u'upstream_gene_variant', - u'downstream_gene_variant', - ] + [ + "transcript_ablation", + "splice_donor_variant", + "splice_acceptor_variant", + "stop_gained", + "start_lost", + "frameshift_variant", + "stop_lost", + "initiator_codon_variant", + "transcript_amplification", + "inframe_insertion", + "inframe_deletion", + "protein_altering_variant", + "missense_variant", + "splice_region_variant", + "incomplete_terminal_codon_variant", + "stop_retained_variant", + "synonymous_variant", + "coding_sequence_variant", + "mature_miRNA_variant", + "5_prime_UTR_variant", + "3_prime_UTR_variant", + "non_coding_exon_variant", + "non_coding_transcript_exon_variant", + "non_coding_transcript_variant", + "nc_transcript_variant", + "intron_variant", + "NMD_transcript_variant", + "non_coding_transcript_variant", + "upstream_gene_variant", + "downstream_gene_variant", + ] ) EXONIC_SO_TERMS = set( - [ - u'transcript_ablation', - u'splice_donor_variant', - u'splice_acceptor_variant', - u'stop_gained', - u'frameshift_variant', - u'stop_lost', - u'start_lost', - u'initiator_codon_variant', - u'transcript_amplification', - u'inframe_insertion', - u'inframe_deletion', - u'missense_variant', - u'protein_altering_variant', - u'splice_region_variant', - u'incomplete_terminal_codon_variant', - u'stop_retained_variant', - u'synonymous_variant', - u'coding_sequence_variant', - ] + [ + "transcript_ablation", + "splice_donor_variant", + "splice_acceptor_variant", + "stop_gained", + "frameshift_variant", + "stop_lost", + "start_lost", + "initiator_codon_variant", + "transcript_amplification", + "inframe_insertion", + "inframe_deletion", + "missense_variant", + "protein_altering_variant", + "splice_region_variant", + "incomplete_terminal_codon_variant", + "stop_retained_variant", + "synonymous_variant", + "coding_sequence_variant", + ] ) -from .is_number import is_number from .check_individuals import check_individuals -from .pair_generator import generate_pairs -from .get_features import (get_annotation, check_vep_annotation) from .get_batches import get_batches -from .get_priority import (get_chromosome_priority, get_rank_score) +from .get_features import check_vep_annotation, get_annotation +from .get_priority import get_chromosome_priority, get_rank_score +from .is_number import is_number +from .pair_generator import generate_pairs from .variant_printer import VariantPrinter diff --git a/genmod/utils/check_individuals.py b/genmod/utils/check_individuals.py index bd9cca8..afbb4e6 100644 --- a/genmod/utils/check_individuals.py +++ b/genmod/utils/check_individuals.py @@ -1,18 +1,19 @@ - def check_individuals(ped_individuals, vcf_individuals): """ Check if the individuals from ped file is in vcf file - + Arguments: ped_individuals (iterator): An iterator with strings vcf_individuals (iterator): An iterator with strings - + Returns: bool: if the individuals exists """ - + for individual in ped_individuals: if individual not in vcf_individuals: - raise IOError("Individuals in PED file must exist in VCF file") # Raise proper exception here - - return True \ No newline at end of file + raise IOError( + "Individuals in PED file must exist in VCF file" + ) # Raise proper exception here + + return True diff --git a/genmod/utils/get_batches.py b/genmod/utils/get_batches.py index 893a243..e7018c4 100644 --- a/genmod/utils/get_batches.py +++ b/genmod/utils/get_batches.py @@ -1,30 +1,29 @@ -from __future__ import (print_function) +from __future__ import print_function import logging - -from datetime import datetime from collections import OrderedDict +from datetime import datetime -from genmod.utils import get_annotation -from genmod.vcf_tools import (get_variant_dict, get_variant_id, - get_info_dict, get_vep_dict) - +from genmod.utils import get_annotation +from genmod.vcf_tools import get_info_dict, get_variant_dict, get_variant_id, get_vep_dict logger = logging.getLogger(__name__) -def get_batches(variants, batch_queue, header, vep=False, results_queue=None, - annotation_keyword = 'Annotation'): + +def get_batches( + variants, batch_queue, header, vep=False, results_queue=None, annotation_keyword="Annotation" +): """ - Create variant batches based on their annotation and put them into the + Create variant batches based on their annotation and put them into the batch queue. - Variant batches are are dictionaries with variant_id as key and + Variant batches are are dictionaries with variant_id as key and variant_dict as value. get_batches will then use the annotation to search for sequences of variants with overlapping annotations. These are collected into one batch and gets put into a queue. - + Variants that are in between features will be in their own batch. Arguments: @@ -52,53 +51,49 @@ def get_batches(variants, batch_queue, header, vep=False, results_queue=None, start_parsing_time = datetime.now() start_chrom_time = start_parsing_time start_twenty_time = start_parsing_time - + nr_of_variants = 0 nr_of_batches = 0 - + header_line = header.header vep_header = header.vep_columns logger.info("Start parsing the variants") - + for line in variants: - if not line.startswith('#'): - + if not line.startswith("#"): variant = get_variant_dict(line, header_line) variant_id = get_variant_id(variant) - variant['variant_id'] = variant_id - variant['info_dict'] = get_info_dict(variant['INFO']) + variant["variant_id"] = variant_id + variant["info_dict"] = get_info_dict(variant["INFO"]) if vep: - variant['vep_info'] = get_vep_dict( - vep_string=variant['info_dict']['CSQ'], + variant["vep_info"] = get_vep_dict( + vep_string=variant["info_dict"]["CSQ"], vep_header=vep_header, - allele=variant['ALT'].split(',')[0] - ) - + allele=variant["ALT"].split(",")[0], + ) + logger.debug("Checking variant {0}".format(variant_id)) nr_of_variants += 1 - new_chrom = variant['CHROM'] - if new_chrom.startswith('chr'): + new_chrom = variant["CHROM"] + if new_chrom.startswith("chr"): new_chrom = new_chrom[3:] logger.debug("Update new chrom to {0}".format(new_chrom)) new_features = get_annotation( - variant = variant, - vep = vep, - annotation_key = annotation_keyword + variant=variant, vep=vep, annotation_key=annotation_keyword ) - logger.debug("Adding {0} to variant {1}".format( - ', '.join(new_features), variant_id - )) + logger.debug("Adding {0} to variant {1}".format(", ".join(new_features), variant_id)) - variant['annotation'] = new_features + variant["annotation"] = new_features if nr_of_variants % 20000 == 0: logger.info("{0} variants parsed".format(nr_of_variants)) - logger.info("Last 20.000 took {0} to parse.".format( - str(datetime.now() - start_twenty_time))) + logger.info( + "Last 20.000 took {0} to parse.".format(str(datetime.now() - start_twenty_time)) + ) start_twenty_time = datetime.now() if beginning: @@ -112,32 +107,34 @@ def get_batches(variants, batch_queue, header, vep=False, results_queue=None, current_chrom = new_chrom chromosomes.append(current_chrom) - logger.debug("Adding chr {0} to chromosomes".format(new_chrom)) + logger.debug("Adding chr {0} to chromosomes".format(new_chrom)) beginning = False logger.debug("Updating beginning to False") else: # If we should put the batch in the queue: - logger.debug("Updating send to True") + logger.debug("Updating send to True") send = True - + # Check if the variant ovelapps any features if len(new_features) != 0: # Check if the features overlap the previous variants features if new_features.intersection(current_features): - logger.debug("Set send to False since variant features overlap") + logger.debug("Set send to False since variant features overlap") send = False # If we are at a new chromosome we finish the current batch: if new_chrom != current_chrom: if current_chrom not in chromosomes: chromosomes.append(current_chrom) - logger.debug("Adding chr {0} to chromosomes".format(new_chrom)) + logger.debug("Adding chr {0} to chromosomes".format(new_chrom)) # New chromosome means new batch send = True - logger.info("Chromosome {0} parsed. Time to parse"\ - " chromosome: {1}".format( - current_chrom, datetime.now()-start_chrom_time)) + logger.info( + "Chromosome {0} parsed. Time to parse" " chromosome: {1}".format( + current_chrom, datetime.now() - start_chrom_time + ) + ) start_chrom_time = datetime.now() current_chrom = new_chrom @@ -147,13 +144,13 @@ def get_batches(variants, batch_queue, header, vep=False, results_queue=None, logger.debug("Adding batch in queue") batch_queue.put(batch) nr_of_batches += 1 - #Reset the variables + # Reset the variables current_features = new_features - logger.debug("Initializing empty batch") + logger.debug("Initializing empty batch") batch = {} else: current_features = current_features.union(new_features) - + # Add variant to batch batch[variant_id] = variant @@ -161,18 +158,22 @@ def get_batches(variants, batch_queue, header, vep=False, results_queue=None, logger.debug("Adding chr {0} to chromosomes".format(current_chrom)) chromosomes.append(current_chrom) - logger.info("Chromosome {0} parsed. Time to parse"\ - " chromosome: {0}".format( - current_chrom, datetime.now()-start_chrom_time)) + logger.info( + "Chromosome {0} parsed. Time to parse" " chromosome: {0}".format( + current_chrom, datetime.now() - start_chrom_time + ) + ) if len(batch) > 0: nr_of_batches += 1 batch_queue.put(batch) - logger.debug("Adding batch to queue") + logger.debug("Adding batch to queue") - logger.info("Variants parsed. Time to parse variants: {0}".format( - str(datetime.now() - start_parsing_time) - )) + logger.info( + "Variants parsed. Time to parse variants: {0}".format( + str(datetime.now() - start_parsing_time) + ) + ) logger.info("Number of variants in variant file: {0}".format(nr_of_variants)) logger.info("Number of batches created: {0}".format(nr_of_batches)) diff --git a/genmod/utils/get_features.py b/genmod/utils/get_features.py index ef18afd..dc88e81 100644 --- a/genmod/utils/get_features.py +++ b/genmod/utils/get_features.py @@ -1,68 +1,68 @@ -from __future__ import (print_function) +from __future__ import print_function import logging -from genmod.utils import INTERESTING_SO_TERMS, EXONIC_SO_TERMS +from genmod.utils import EXONIC_SO_TERMS, INTERESTING_SO_TERMS + def check_vep_annotation(variant): """ Return a set with the genes that vep has annotated this variant with. - + Vep annotates all variants but we are only interested in the exonic ones. The terms are specified in INTERESTING_SO_TERMS - + Arguments: variant (dict): A variant dictionary - + Returns: annotation (set): A set with genes """ - + annotation = set() # vep_info is a dictionary with genes as key and annotation as values ##TODO use extract_vcf to get the annotation here - - vep_info = variant.get('vep_info',{}) - + + vep_info = variant.get("vep_info", {}) + for allele in vep_info: - for vep_annotation in variant['vep_info'][allele]: - for consequence in vep_annotation.get('Consequence', {}).split('&'): - # These are the SO terms that indicate that the variant + for vep_annotation in variant["vep_info"][allele]: + for consequence in vep_annotation.get("Consequence", {}).split("&"): + # These are the SO terms that indicate that the variant # belongs to a gene if consequence in INTERESTING_SO_TERMS: - annotation.add(vep_annotation.get('Gene', '')) + annotation.add(vep_annotation.get("Gene", "")) return annotation + def get_annotation(variant, annotation_key="Annotation", vep=False): """ Return the features that a variant belongs to. - - Arguments: + + Arguments: variant (dict): A variant dictionary annotation_key (str): The name of the info field to search vep (bool): If variants are annotated with vep - - Returns: + + Returns: annotations (set): A set with annotated features """ logger = logging.getLogger(__name__) ##TODO use extract_vcf to get the annotation here - + annotation = set() - variant_id = variant.get('variant_id', '') + variant_id = variant.get("variant_id", "") logger.debug("Checking variant annotation for {0}".format(variant_id)) - # If the variant has already been annotated by genmod we do not need to + # If the variant has already been annotated by genmod we do not need to # check again if vep: logger.debug("Using vep annotation.") annotation = check_vep_annotation(variant) - + else: - info_dict = variant.get('info_dict', {}) + info_dict = variant.get("info_dict", {}) if info_dict.get(annotation_key, None): - annotation = set(info_dict[annotation_key].split(',')) - - logger.debug("Annotations found for {0}: {1}".format( - variant_id, ','.join(annotation) - )) + annotation = set(info_dict[annotation_key].split(",")) + + logger.debug("Annotations found for {0}: {1}".format(variant_id, ",".join(annotation))) return annotation diff --git a/genmod/utils/get_priority.py b/genmod/utils/get_priority.py index 6f20e78..7eb8749 100644 --- a/genmod/utils/get_priority.py +++ b/genmod/utils/get_priority.py @@ -1,67 +1,72 @@ import logging + from genmod.score_variants import RANK_SCORE_TYPE_NAMES + def get_chromosome_priority(chrom, chrom_dict={}): """ Return the chromosome priority - + Arguments: chrom (str): The cromosome name from the vcf chrom_dict (dict): A map of chromosome names and theis priority - + Return: priority (str): The priority for this chromosom """ - priority = '0' - - chrom = chrom.lstrip('chr') - + priority = "0" + + chrom = chrom.lstrip("chr") + if chrom_dict: - priority = chrom_dict.get(chrom, '0') - + priority = chrom_dict.get(chrom, "0") + else: try: if int(chrom) < 23: priority = chrom except ValueError: - if chrom == 'X': - priority = '23' - elif chrom == 'Y': - priority = '24' - elif chrom == 'MT': - priority = '25' + if chrom == "X": + priority = "23" + elif chrom == "Y": + priority = "24" + elif chrom == "MT": + priority = "25" else: - priority = '26' - + priority = "26" + return priority -def get_rank_score(variant_line=None, variant_dict=None, family_id=0, rank_score_type: str = 'RankScore'): + +def get_rank_score( + variant_line=None, variant_dict=None, family_id=0, rank_score_type: str = "RankScore" +): """ Return the rank score priority for a certain family. - + If no family is given the first family found is used - + Arguments: variant_line (str): A vcf variant line variant_dict (dict): A variant dictionary family_id (str): A family id rank_score_type(str): Return rank score based on raw or normalized format See the genmod.score_variants.rank_score_variant_definitions for more info. - + Return: rank_score (str): The rank score for this variant """ if rank_score_type not in RANK_SCORE_TYPE_NAMES: - raise ValueError('Unknown rank_score_type', rank_score_type) - + raise ValueError("Unknown rank_score_type", rank_score_type) + rank_score = -100 raw_entry = None - + if variant_line: variant_line = variant_line.split("\t") - - for info_annotation in variant_line[7].split(';'): - info_annotation = info_annotation.split('=') + + for info_annotation in variant_line[7].split(";"): + info_annotation = info_annotation.split("=") key = None if len(info_annotation) == 2: key = info_annotation[0] @@ -69,21 +74,20 @@ def get_rank_score(variant_line=None, variant_dict=None, family_id=0, rank_score if key == rank_score_type: raw_entry = value break - + elif variant_dict: - raw_entry: str = variant_dict['info_dict'].get(rank_score_type) - + raw_entry: str = variant_dict["info_dict"].get(rank_score_type) + if raw_entry: - for family_annotation in raw_entry.split(','): - family_annotation = family_annotation.split(':') + for family_annotation in raw_entry.split(","): + family_annotation = family_annotation.split(":") if family_id: # If we should sort on a certain family we look for the # correct id if family_id == family_annotation[0]: rank_score = float(family_annotation[1]) else: - # If no family id is given we choose the first family found + # If no family id is given we choose the first family found rank_score = float(family_annotation[1]) return str(rank_score) - \ No newline at end of file diff --git a/genmod/utils/is_number.py b/genmod/utils/is_number.py index 29fb304..8ad87e0 100755 --- a/genmod/utils/is_number.py +++ b/genmod/utils/is_number.py @@ -6,16 +6,17 @@ Created by MÃ¥ns Magnusson on 2013-04-09. Copyright (c) 2013 __MyCompanyName__. All rights reserved. """ + from __future__ import print_function def is_number(number): """ Returns true if the input is a number or False otherwise - + Arguments: number (obj): The object that should be checked - + """ try: float(number) @@ -23,4 +24,3 @@ def is_number(number): except ValueError: pass return False - diff --git a/genmod/utils/pair_generator.py b/genmod/utils/pair_generator.py index 623fa6a..d471d13 100755 --- a/genmod/utils/pair_generator.py +++ b/genmod/utils/pair_generator.py @@ -6,37 +6,35 @@ Class that takes a list of objects and return all unordered pairs as a generator. If only one object? Raise Exception - + Created by MÃ¥ns Magnusson on 2013-03-01. Copyright (c) 2013 __MyCompanyName__. All rights reserved. """ from __future__ import print_function - try: from collections.abc import Iterable except AttributeError: from collections import Iterable + def generate_pairs(objects): """ Yields all unordered pairs as tuples from the list of objects - + Arguments: list_of_objects (iterator): """ if not isinstance(objects, Iterable): - raise SyntaxError("objects has to be iterable. objects: {0}".format( - objects - )) + raise SyntaxError("objects has to be iterable. objects: {0}".format(objects)) if len(objects) < 2: - #TODO raise a proper exception here - raise SyntaxError('List must include at least 2 objects!."\ - " objects: {0}'.format(objects)) - - for i in range(len(objects)-1): - for j in range(i+1, len(objects)): - yield (objects[i], objects[j]) - + # TODO raise a proper exception here + raise SyntaxError( + 'List must include at least 2 objects!."\ + " objects: {0}'.format(objects) + ) + for i in range(len(objects) - 1): + for j in range(i + 1, len(objects)): + yield (objects[i], objects[j]) diff --git a/genmod/utils/variant_printer.py b/genmod/utils/variant_printer.py index a40e133..fae970e 100755 --- a/genmod/utils/variant_printer.py +++ b/genmod/utils/variant_printer.py @@ -9,30 +9,30 @@ Copyright (c) 2013 __MyCompanyName__. All rights reserved. """ -from __future__ import (print_function) +from __future__ import print_function -from multiprocessing import Process +import logging from codecs import open +from multiprocessing import Process from genmod.utils import get_chromosome_priority, get_rank_score from genmod.vcf_tools import print_variant -import logging class VariantPrinter(Process): """ Print variants to a temporary file. - - There are three modes for printing a variant + + There are three modes for printing a variant 'chromosome' and 'score' are used the file is going to be sorted. 'normal' means that the variants are just printed. - + 'chromosome': In this case the priority order of the chromosome is printed to the first position in the results file - + 'score': Here the Individual score of the variant is printed to the first position of the results file - + Args: task_queue : A variants queue with batches of variants head : The header line to specify what from the variant object to print @@ -40,9 +40,10 @@ class VariantPrinter(Process): chr_map : If mode='chromosome' we need a map to specify the sort order of the chromosomes. (Soon to come?) outfile : File that all variants should be printed to - + """ - def __init__(self, task_queue, head, mode='chromosome', outfile = None, silent=False): + + def __init__(self, task_queue, head, mode="chromosome", outfile=None, silent=False): Process.__init__(self) self.logger = logging.getLogger(__name__) self.task_queue = task_queue @@ -50,47 +51,48 @@ def __init__(self, task_queue, head, mode='chromosome', outfile = None, silent=F self.header = head.header self.mode = mode self.silent = silent - + def run(self): """Starts the printing""" # Print the results to a temporary file: number_of_finished = 0 proc_name = self.name - self.logger.info(('{0}: starting'.format(proc_name))) - + self.logger.info(("{0}: starting".format(proc_name))) + if self.outfile: if isinstance(self.outfile, str): - self.outfile = open(self.outfile, 'w+', encoding="utf-8") - + self.outfile = open(self.outfile, "w+", encoding="utf-8") + while True: - # A task is a variant dictionary - self.logger.debug(('{0} fetching next variant'.format(proc_name))) + self.logger.debug(("{0} fetching next variant".format(proc_name))) variant = self.task_queue.get() - + if self.task_queue.full(): - self.logger.warning('Variant queue full') - + self.logger.warning("Variant queue full") + if variant is None: - self.logger.info('All variants printed.') + self.logger.info("All variants printed.") if self.outfile: self.outfile.close() break - - self.logger.debug("Printing variant {0}".format(variant.get('variant_id', 'unknown'))) - + + self.logger.debug("Printing variant {0}".format(variant.get("variant_id", "unknown"))) + priority = None - - if self.mode == 'chromosome': - priority = get_chromosome_priority(variant['CHROM']) - elif self.mode == 'score': + if self.mode == "chromosome": + priority = get_chromosome_priority(variant["CHROM"]) + + elif self.mode == "score": priority = get_rank_score(variant_dict=variant) - - - print_variant(variant_dict=variant, header_line=self.header, - priority=priority, outfile=self.outfile, - silent=self.silent) - - return + print_variant( + variant_dict=variant, + header_line=self.header, + priority=priority, + outfile=self.outfile, + silent=self.silent, + ) + + return diff --git a/genmod/vcf_tools/__init__.py b/genmod/vcf_tools/__init__.py index 6174196..590b49b 100644 --- a/genmod/vcf_tools/__init__.py +++ b/genmod/vcf_tools/__init__.py @@ -1,16 +1,20 @@ from __future__ import absolute_import -from .header_parser import HeaderParser -from .add_variant_information import (add_vcf_info, replace_vcf_info) -from .add_metadata import (add_metadata, add_version_header, -add_annotation_header, add_exonic_header, add_model_score_header, -add_genetic_models_header, add_compounds_header) -from .print_headers import print_headers -from .print_variants import (print_variant, print_variant_for_sorting, -print_variant_dict) -from .sort_variants import sort_variants +from .add_metadata import ( + add_annotation_header, + add_compounds_header, + add_exonic_header, + add_genetic_models_header, + add_metadata, + add_model_score_header, + add_version_header, +) +from .add_variant_information import add_vcf_info, replace_vcf_info from .check_info_header import check_info -from .parse_variant import (get_variant_dict, get_info_dict, get_variant_id, -get_vep_dict) from .genotype import Genotype from .get_genotypes import get_genotypes +from .header_parser import HeaderParser +from .parse_variant import get_info_dict, get_variant_dict, get_variant_id, get_vep_dict +from .print_headers import print_headers +from .print_variants import print_variant, print_variant_dict, print_variant_for_sorting +from .sort_variants import sort_variants diff --git a/genmod/vcf_tools/add_metadata.py b/genmod/vcf_tools/add_metadata.py index ac1b352..8dcb159 100755 --- a/genmod/vcf_tools/add_metadata.py +++ b/genmod/vcf_tools/add_metadata.py @@ -12,16 +12,24 @@ from __future__ import print_function import logging - from datetime import datetime + from genmod import __version__ -def add_metadata(head, metadata_type, annotation_id, annotation_number='.', - entry_type=None, description=None, version=None, - command_line_string=''): + +def add_metadata( + head, + metadata_type, + annotation_id, + annotation_number=".", + entry_type=None, + description=None, + version=None, + command_line_string="", +): """ Add genmod metadata to the vcf header. - + Args: head : A vcf header object metadata_type(str): 'info' or 'version' @@ -33,36 +41,25 @@ def add_metadata(head, metadata_type, annotation_id, annotation_number='.', thousand_g : Bool. If thousand genome frequencies are annotated exac : Bool. If exac frequencies are added command_line_string : A string with the command that envoked genmod - + """ logger = logging.getLogger(__name__) - - if metadata_type == 'info': - logger.debug("Updating INFO header with {0}".format( - annotation_id - )) - head.add_info( - annotation_id, - annotation_number, - entry_type, - description - ) - elif metadata_type == 'version': - logger.debug("Updating version header with {0}".format( - annotation_id - )) + + if metadata_type == "info": + logger.debug("Updating INFO header with {0}".format(annotation_id)) + head.add_info(annotation_id, annotation_number, entry_type, description) + elif metadata_type == "version": + logger.debug("Updating version header with {0}".format(annotation_id)) return -def add_version_header(head, command_line_string = ""): + +def add_version_header(head, command_line_string=""): """Add Version information to the header""" head.add_version_tracking( - 'genmod', - __version__, - datetime.now().strftime("%Y-%m-%d %H:%M"), - command_line_string - ) + "genmod", __version__, datetime.now().strftime("%Y-%m-%d %H:%M"), command_line_string + ) return - + def add_annotation_header(head): """ @@ -70,68 +67,74 @@ def add_annotation_header(head): """ add_metadata( head, - 'info', - 'Annotation', - annotation_number='.', - entry_type='String', - description='Annotates what feature(s) this variant belongs to.' + "info", + "Annotation", + annotation_number=".", + entry_type="String", + description="Annotates what feature(s) this variant belongs to.", ) return + def add_exonic_header(head): """ Add the Exonic information to a vcf header """ add_metadata( head, - 'info', - 'Exonic', - annotation_number='0', - entry_type='Flag', - description='Indicates if the variant is exonic.' + "info", + "Exonic", + annotation_number="0", + entry_type="Flag", + description="Indicates if the variant is exonic.", ) return - + + def add_genetic_models_header(head): """ Add Genetic Models to vcf header """ add_metadata( head, - 'info', - 'GeneticModels', - annotation_number='.', - entry_type='String', - description="':'-separated list of genetic models for this variant." + "info", + "GeneticModels", + annotation_number=".", + entry_type="String", + description="':'-separated list of genetic models for this variant.", ) return + def add_model_score_header(head): """ Add Model Score to vcf header """ add_metadata( head, - 'info', - 'ModelScore', - annotation_number='1', - entry_type='Integer', - description="PHRED score for genotype models." + "info", + "ModelScore", + annotation_number="1", + entry_type="Integer", + description="PHRED score for genotype models.", ) return + def add_compounds_header(head): """ Add compounds to vcf header """ add_metadata( head, - 'info', - 'Compounds', - annotation_number='.', - entry_type='String', - description=("List of compound pairs for this variant." - "The list is splitted on ',' family id is separated with compounds" - "with ':'. Compounds are separated with '|'.") + "info", + "Compounds", + annotation_number=".", + entry_type="String", + description=( + "List of compound pairs for this variant." + "The list is splitted on ',' family id is separated with compounds" + "with ':'. Compounds are separated with '|'." + ), ) return diff --git a/genmod/vcf_tools/add_variant_information.py b/genmod/vcf_tools/add_variant_information.py index 43cbc6c..e7fd4f7 100755 --- a/genmod/vcf_tools/add_variant_information.py +++ b/genmod/vcf_tools/add_variant_information.py @@ -13,114 +13,116 @@ import logging + def replace_vcf_info(keyword, annotation, variant_line=None, variant_dict=None): """Replace the information of a info field of a vcf variant line. - - + + Arguments: variant_line (str): A vcf formatted variant line variant_dict (dict): A variant dictionary keyword (str): The info field key annotation (str): If the annotation is a key, value pair this is the string that represents the value - + Returns: variant_line (str): A annotated variant line """ logger = logging.getLogger(__name__) - - new_info = '{0}={1}'.format(keyword, annotation) - + + new_info = "{0}={1}".format(keyword, annotation) + logger.debug("Replacing the variant information {0}".format(new_info)) - + fixed_variant = None new_info_list = [] - + if variant_line: logger.debug("Adding information to a variant line") - splitted_variant = variant_line.rstrip('\n').split('\t') + splitted_variant = variant_line.rstrip("\n").split("\t") logger.debug("Adding information to splitted variant line") old_info = splitted_variant[7] - if old_info == '.': + if old_info == ".": new_info_string = new_info else: - splitted_info_string = old_info.split(';') + splitted_info_string = old_info.split(";") for info in splitted_info_string: - splitted_info_entry = info.split('=') + splitted_info_entry = info.split("=") if splitted_info_entry[0] == keyword: new_info_list.append(new_info) else: new_info_list.append(info) - new_info_string = ';'.join(new_info_list) - + new_info_string = ";".join(new_info_list) + splitted_variant[7] = new_info_string - - fixed_variant = '\t'.join(splitted_variant) - + + fixed_variant = "\t".join(splitted_variant) + elif variant_dict: logger.debug("Adding information to a variant dict") - old_info = variant_dict['INFO'] - - if old_info == '.': - variant_dict['INFO'] = new_info + old_info = variant_dict["INFO"] + + if old_info == ".": + variant_dict["INFO"] = new_info else: - for info in old_info.split(';'): - splitted_info_entry = info.split('=') + for info in old_info.split(";"): + splitted_info_entry = info.split("=") if splitted_info_entry[0] == keyword: new_info_list.append(new_info) else: new_info_list.append(info) - new_info_string = ';'.join(new_info_list) - - variant_dict['INFO'] = new_info_string + new_info_string = ";".join(new_info_list) + + variant_dict["INFO"] = new_info_string fixed_variant = variant_dict - + return fixed_variant + def add_vcf_info(keyword, variant_line=None, variant_dict=None, annotation=None): """ Add information to the info field of a vcf variant line. - + Arguments: variant_line (str): A vcf formatted variant line keyword (str): The info field key annotation (str): If the annotation is a key, value pair this is the string that represents the value - + Returns: fixed_variant : str if variant line, or dict if variant_dict """ logger = logging.getLogger(__name__) - + if annotation: annotation = str(annotation) - new_info = '{0}={1}'.format(keyword, annotation) + new_info = "{0}={1}".format(keyword, annotation) else: new_info = keyword - + logger.debug("Adding new variant information {0}".format(new_info)) - + fixed_variant = None - + if variant_line: logger.debug("Adding information to a variant line") - splitted_variant = variant_line.rstrip('\n').split('\t') + splitted_variant = variant_line.rstrip("\n").split("\t") logger.debug("Adding information to splitted variant line") old_info = splitted_variant[7] - if old_info == '.': + if old_info == ".": splitted_variant[7] = new_info else: splitted_variant[7] = "{0};{1}".format(splitted_variant[7], new_info) - - fixed_variant = '\t'.join(splitted_variant) - + + fixed_variant = "\t".join(splitted_variant) + elif variant_dict: logger.debug("Adding information to a variant dict") - old_info = variant_dict['INFO'] - if old_info == '.': - variant_dict['INFO'] = new_info + old_info = variant_dict["INFO"] + if old_info == ".": + variant_dict["INFO"] = new_info else: - variant_dict['INFO'] = "{0};{1}".format(old_info, new_info) + variant_dict["INFO"] = "{0};{1}".format(old_info, new_info) fixed_variant = variant_dict - + return fixed_variant diff --git a/genmod/vcf_tools/check_info_header.py b/genmod/vcf_tools/check_info_header.py index eed1648..f165710 100644 --- a/genmod/vcf_tools/check_info_header.py +++ b/genmod/vcf_tools/check_info_header.py @@ -1,11 +1,12 @@ import logging + def check_info(info_key, head): """Check if a info key exists in the vcf header - - Args: - info_key (str): The name of the info key - head (HeaderParser): A header parser object + + Args: + info_key (str): The name of the info key + head (HeaderParser): A header parser object """ logger = logging.getLogger(__name__) logger.debug("Checking if {0} is in vcf header.".format(info_key)) @@ -13,5 +14,5 @@ def check_info(info_key, head): logger.debug("{0} is specified in vcf header.".format(info_key)) return True logger.warning("{0} is not specified in vcf header.".format(info_key)) - + return False diff --git a/genmod/vcf_tools/genotype.py b/genmod/vcf_tools/genotype.py index 9f02e10..20f742a 100755 --- a/genmod/vcf_tools/genotype.py +++ b/genmod/vcf_tools/genotype.py @@ -15,7 +15,7 @@ - allele_1 STRING (Base on allele 1) - allele_2 STRING (Base on allele 2) - nocall BOOL - - heterozygote BOOL + - heterozygote BOOL - homo_alt BOOL (If individual is homozygote alternative) - homo_ref BOOL (If individual is homozygote reference) - has_variant BOOL (If individual is called and not homozygote reference) @@ -27,32 +27,33 @@ - phased BOOL If a variant is present, that is if homo_alt or heterozygote is true, then has_variant is True - + When dealing with phased data we will see the '|'-delimiter #TODO: Should we allow '1/2', '2/2' and so on? This type of call looses it's point when moving from vcf -> bed since bed files only have one kind of variant on each line. -For now we will only allow './.', '0/0', '0/1', '1/1' +For now we will only allow './.', '0/0', '0/1', '1/1' Created by MÃ¥ns Magnusson on 2014-06-30. Copyright (c) 2013 __MyCompanyName__. All rights reserved. """ -import sys import os +import sys class Genotype(object): """Holds information about a genotype""" + def __init__(self, **kwargs): - super(Genotype, self).__init__() + super(Genotype, self).__init__() # These are the different genotypes: - GT = kwargs.get('GT', './.') - AD = kwargs.get('AD', '.,.') - DP = kwargs.get('DP', '0') - GQ = kwargs.get('GQ', '0') - PL = kwargs.get('PL', None) + GT = kwargs.get("GT", "./.") + AD = kwargs.get("AD", ".,.") + DP = kwargs.get("DP", "0") + GQ = kwargs.get("GQ", "0") + PL = kwargs.get("PL", None) self.heterozygote = False self.allele_depth = False self.homo_alt = False @@ -63,24 +64,24 @@ def __init__(self, **kwargs): self.depth_of_coverage = 0 self.quality_depth = 0 self.genotype_quality = 0 - #Check phasing - if '|' in GT: + # Check phasing + if "|" in GT: self.phased = True - #Check the genotyping: - #This is the case when only one allele is present(eg. X-chromosome) and presented like '0' or '1': - if len(GT) < 3: + # Check the genotyping: + # This is the case when only one allele is present(eg. X-chromosome) and presented like '0' or '1': + if len(GT) < 3: self.allele_1 = GT - self.allele_2 = '.' + self.allele_2 = "." else: self.allele_1 = GT[0] self.allele_2 = GT[-1] # The genotype should allways be represented on the same form - self.genotype = self.allele_1 +'/'+ self.allele_2 - - if self.genotype != './.': + self.genotype = self.allele_1 + "/" + self.allele_2 + + if self.genotype != "./.": self.genotyped = True - #Check allele status - if self.genotype in ['0/0', './0', '0/.']: + # Check allele status + if self.genotype in ["0/0", "./0", "0/."]: self.homo_ref = True elif self.allele_1 == self.allele_2: self.homo_alt = True @@ -88,39 +89,38 @@ def __init__(self, **kwargs): else: self.heterozygote = True self.has_variant = True - #Check the allele depth: + # Check the allele depth: self.ref_depth = 0 self.alt_depth = 0 - - allele_depths = AD.split(',') - + + allele_depths = AD.split(",") + if len(allele_depths) > 1: if allele_depths[0].isdigit(): self.ref_depth = int(allele_depths[0]) if allele_depths[1].isdigit(): self.alt_depth = int(allele_depths[1]) - + self.quality_depth = self.ref_depth + self.alt_depth - #Check the depth of coverage: + # Check the depth of coverage: try: self.depth_of_coverage = int(DP) except ValueError: pass - #Check the genotype quality + # Check the genotype quality try: self.genotype_quality = float(GQ) except ValueError: pass - #Check the genotype likelihoods + # Check the genotype likelihoods self.phred_likelihoods = [] - + if PL: try: - self.phred_likelihoods = [int(score) for score in PL.split(',')] + self.phred_likelihoods = [int(score) for score in PL.split(",")] except ValueError: pass - + def __str__(self): """Specifies what will be printed when printing the object.""" - return self.allele_1+'/'+self.allele_2 - + return self.allele_1 + "/" + self.allele_2 diff --git a/genmod/vcf_tools/get_genotypes.py b/genmod/vcf_tools/get_genotypes.py index def9c84..48df65f 100644 --- a/genmod/vcf_tools/get_genotypes.py +++ b/genmod/vcf_tools/get_genotypes.py @@ -1,29 +1,29 @@ from genmod.vcf_tools import Genotype + def get_genotypes(variant, individuals): """Create genotype objects - - Create Genotype objects for all individuals and return them in a - dictionary - - Args: - variant (dict): A variant dict - individuals (list): A list with strings that are individual id:s - - Returns: - genotype_dict (dict): A dictionary with individual id:s as strings - and Genptype objects as keys + + Create Genotype objects for all individuals and return them in a + dictionary + + Args: + variant (dict): A variant dict + individuals (list): A list with strings that are individual id:s + + Returns: + genotype_dict (dict): A dictionary with individual id:s as strings + and Genptype objects as keys """ - gt_format = variant.get('FORMAT', '').split(':') - + gt_format = variant.get("FORMAT", "").split(":") + genotype_dict = {} - + for individual in individuals: - gt_info = variant[individual].split(':') + gt_info = variant[individual].split(":") gt_call = dict(zip(gt_format, gt_info)) - - #Create a genotype object for this individual + + # Create a genotype object for this individual genotype_dict[individual] = Genotype(**gt_call) - + return genotype_dict - \ No newline at end of file diff --git a/genmod/vcf_tools/header_parser.py b/genmod/vcf_tools/header_parser.py index 550ab42..f791af7 100644 --- a/genmod/vcf_tools/header_parser.py +++ b/genmod/vcf_tools/header_parser.py @@ -1,8 +1,7 @@ from __future__ import print_function -import sys import re - +import sys from logging import getLogger if sys.version_info < (2, 7): @@ -13,36 +12,39 @@ class HeaderParser(object): """Parses a file with family info and creates a family object with individuals.""" + def __init__(self): super(HeaderParser, self).__init__() self.logger = getLogger(__name__) - self.info_lines=[] - self.info_dict=OrderedDict() - #This is a dictionary cantaining specific information about the info fields - #It will have info name as key and then another dictionary with ID, Number, Type and Description + self.info_lines = [] + self.info_dict = OrderedDict() + # This is a dictionary cantaining specific information about the info fields + # It will have info name as key and then another dictionary with ID, Number, Type and Description self.extra_info = {} - - self.filter_lines=[] - self.filter_dict=OrderedDict() - - self.contig_lines=[] - self.contig_dict=OrderedDict() - - self.format_lines=[] - self.format_dict=OrderedDict() - - self.alt_lines=[] - self.alt_dict=OrderedDict() - - self.other_lines=[] - self.other_dict=OrderedDict() - - self.header=['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO'] - self.header_keys={'info' : ['ID', 'Number', 'Type', 'Description'], - 'form' : ['ID', 'Number', 'Type', 'Description'], - 'filt' : ['ID', 'Description'], - 'alt' : ['ID', 'Description'], - 'contig' : ['ID']} + + self.filter_lines = [] + self.filter_dict = OrderedDict() + + self.contig_lines = [] + self.contig_dict = OrderedDict() + + self.format_lines = [] + self.format_dict = OrderedDict() + + self.alt_lines = [] + self.alt_dict = OrderedDict() + + self.other_lines = [] + self.other_dict = OrderedDict() + + self.header = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] + self.header_keys = { + "info": ["ID", "Number", "Type", "Description"], + "form": ["ID", "Number", "Type", "Description"], + "filt": ["ID", "Description"], + "alt": ["ID", "Description"], + "contig": ["ID"], + } self.fileformat = None self.filedate = None self.reference = None @@ -51,147 +53,157 @@ def __init__(self): self.line_counter = 0 self.individuals = [] self.vep_columns = [] - self.info_pattern = re.compile(r'''\#\#INFO=< + self.info_pattern = re.compile( + r"""\#\#INFO=< ID=(?P[^,]+), Number=(?P-?\d+|\.|[AGR]), Type=(?PInteger|Float|Flag|Character|String), Description="(?P[^"]*)" (?:,Source="(?P[^"]+)")? (?:,Version="(?P[^"]+)")? - >''', re.VERBOSE) - self.filter_pattern = re.compile(r'''\#\#FILTER=< + >""", + re.VERBOSE, + ) + self.filter_pattern = re.compile( + r"""\#\#FILTER=< ID=(?P[^,]+), Description="(?P.+">$) - ''', re.VERBOSE) - self.contig_pattern = re.compile(r'''\#\#contig=< + """, + re.VERBOSE, + ) + self.contig_pattern = re.compile( + r"""\#\#contig=< ID=(?P[^,]+) .* - >''', re.VERBOSE) - self.format_pattern = re.compile(r'''\#\#FORMAT=< + >""", + re.VERBOSE, + ) + self.format_pattern = re.compile( + r"""\#\#FORMAT=< ID=(?P.+), Number=(?P-?\d+|\.|[AGR]), Type=(?P.+), Description="(?P.*)" - >''', re.VERBOSE) - self.alt_pattern = re.compile(r'''\#\#ALT=< + >""", + re.VERBOSE, + ) + self.alt_pattern = re.compile( + r"""\#\#ALT=< ID=(?P[^,]+), Description="(?P[^"]*)" - >''', re.VERBOSE) - self.meta_pattern = re.compile(r'''##(?P.+?)=(?P.+)''') - + >""", + re.VERBOSE, + ) + self.meta_pattern = re.compile(r"""##(?P.+?)=(?P.+)""") + def parse_meta_data(self, line): """Parse a vcf metadataline""" line = line.rstrip() self.logger.debug("Parsing metadata line:{0}".format(line)) - line_info = line[2:].split('=') + line_info = line[2:].split("=") match = False - - if line_info[0] == 'fileformat': + + if line_info[0] == "fileformat": self.logger.debug("Parsing fileformat") try: self.fileformat = line_info[1] self.logger.debug("Found fileformat {0}".format(self.fileformat)) except IndexError: raise SyntaxError("fileformat must have a value") - - elif line_info[0] == 'INFO': + + elif line_info[0] == "INFO": match = self.info_pattern.match(line) if not match: raise SyntaxError("One of the INFO lines is malformed:{0}".format(line)) - + matches = [ - match.group('id'), match.group('number'), - match.group('type'), match.group('desc'), - match.group('source'), match.group('version') + match.group("id"), + match.group("number"), + match.group("type"), + match.group("desc"), + match.group("source"), + match.group("version"), ] - + # extra_info is a dictionary to check the metadata about the INFO values: - self.extra_info[matches[0]] = dict( - zip(self.header_keys['info'][1:], matches[1:]) - ) - - info_line = dict(list(zip(self.header_keys['info'],matches))) - - if len(info_line['Description'].split('Format:')) > 1: - info_line['Format'] = [ - info.strip() for info in info_line['Description'].split('Format:') + self.extra_info[matches[0]] = dict(zip(self.header_keys["info"][1:], matches[1:])) + + info_line = dict(list(zip(self.header_keys["info"], matches))) + + if len(info_line["Description"].split("Format:")) > 1: + info_line["Format"] = [ + info.strip() for info in info_line["Description"].split("Format:") ][-1] self.info_lines.append(info_line) - + # Store the vep columns: - if info_line['ID'] == 'CSQ': - self.vep_columns = info_line.get('Format', '').split('|') - - self.info_dict[match.group('id')] = line - - elif line_info[0] == 'FILTER': + if info_line["ID"] == "CSQ": + self.vep_columns = info_line.get("Format", "").split("|") + + self.info_dict[match.group("id")] = line + + elif line_info[0] == "FILTER": match = self.filter_pattern.match(line) if not match: raise SyntaxError("One of the FILTER lines is malformed: {0}".format(line)) - matches = [match.group('id'), match.group('desc')] - self.filter_lines.append(dict( - list(zip(self.header_keys['filt'],matches))) - ) - self.filter_dict[match.group('id')] = line - - elif line_info[0] == 'contig': + matches = [match.group("id"), match.group("desc")] + self.filter_lines.append(dict(list(zip(self.header_keys["filt"], matches)))) + self.filter_dict[match.group("id")] = line + + elif line_info[0] == "contig": match = self.contig_pattern.match(line) if not match: raise SyntaxError("One of the contig lines is malformed: {0}".format(line)) - - matches = [match.group('id')] - self.contig_lines.append(dict( - list(zip(self.header_keys['contig'],matches))) - ) - self.contig_dict[match.group('id')] = line - - elif line_info[0] == 'FORMAT': + + matches = [match.group("id")] + self.contig_lines.append(dict(list(zip(self.header_keys["contig"], matches)))) + self.contig_dict[match.group("id")] = line + + elif line_info[0] == "FORMAT": match = self.format_pattern.match(line) if not match: raise SyntaxError("One of the FORMAT lines is malformed: {0}".format(line)) - + matches = [ - match.group('id'), match.group('number'), - match.group('type'), match.group('desc') + match.group("id"), + match.group("number"), + match.group("type"), + match.group("desc"), ] - self.format_lines.append(dict( - list(zip(self.header_keys['form'],matches))) - ) - self.format_dict[match.group('id')] = line - - elif line_info[0] == 'ALT': + self.format_lines.append(dict(list(zip(self.header_keys["form"], matches)))) + self.format_dict[match.group("id")] = line + + elif line_info[0] == "ALT": match = self.alt_pattern.match(line) if not match: raise SyntaxError("One of the ALT lines is malformed: {0}".format(line)) - - matches = [match.group('id'), match.group('desc')] - self.alt_lines.append(dict( - list(zip(self.header_keys['alt'],matches))) - ) - self.alt_dict[match.group('id')] = line - + + matches = [match.group("id"), match.group("desc")] + self.alt_lines.append(dict(list(zip(self.header_keys["alt"], matches)))) + self.alt_dict[match.group("id")] = line + else: match = self.meta_pattern.match(line) if not match: raise SyntaxError("One of the meta data lines is malformed: {0}".format(line)) - - self.other_lines.append({match.group('key'): match.group('val')}) - self.other_dict[match.group('key')] = line - + + self.other_lines.append({match.group("key"): match.group("val")}) + self.other_dict[match.group("key")] = line + def parse_header_line(self, line): """docstring for parse_header_line""" - self.header = line[1:].rstrip().split('\t') + self.header = line[1:].rstrip().split("\t") if len(self.header) < 9: self.header = line[1:].rstrip().split() self.individuals = self.header[9:] - + def print_header(self): """Returns a list with the header lines if proper format""" lines_to_print = [] - lines_to_print.append('##fileformat='+self.fileformat) + lines_to_print.append("##fileformat=" + self.fileformat) if self.filedate: - lines_to_print.append('##fileformat='+self.fileformat) - + lines_to_print.append("##fileformat=" + self.fileformat) + for filt in self.filter_dict: lines_to_print.append(self.filter_dict[filt]) for form in self.format_dict: @@ -204,17 +216,16 @@ def print_header(self): lines_to_print.append(self.alt_dict[alt]) for other in self.other_dict: lines_to_print.append(self.other_dict[other]) - lines_to_print.append('#'+ '\t'.join(self.header)) + lines_to_print.append("#" + "\t".join(self.header)) return lines_to_print - def add_fileformat(self, fileformat): """ Add fileformat line to the header. - + Arguments: fileformat (str): The id of the info line - + """ self.fileformat = fileformat self.logger.info("Adding fileformat to vcf: {0}".format(fileformat)) @@ -223,17 +234,15 @@ def add_fileformat(self, fileformat): def add_meta_line(self, key, value): """ Adds an arbitrary metadata line to the header. - + This must be a key value pair - + Arguments: key (str): The key of the metadata line value (str): The value of the metadata line - + """ - meta_line = '##{0}={1}'.format( - key, value - ) + meta_line = "##{0}={1}".format(key, value) self.logger.info("Adding meta line to vcf: {0}".format(meta_line)) self.parse_meta_data(meta_line) return @@ -241,13 +250,13 @@ def add_meta_line(self, key, value): def add_info(self, info_id, number, entry_type, description): """ Add an info line to the header. - + Arguments: info_id (str): The id of the info line number (str): Integer or any of [A,R,G,.] entry_type (str): Any of [Integer,Float,Flag,Character,String] description (str): A description of the info line - + """ info_line = '##INFO='.format( info_id, number, entry_type, description @@ -259,15 +268,13 @@ def add_info(self, info_id, number, entry_type, description): def add_filter(self, filter_id, description): """ Add a filter line to the header. - + Arguments: filter_id (str): The id of the filter line description (str): A description of the info line - + """ - filter_line = '##FILTER='.format( - filter_id, description - ) + filter_line = '##FILTER='.format(filter_id, description) self.logger.info("Adding filter line to vcf: {0}".format(filter_line)) self.parse_meta_data(filter_line) return @@ -275,13 +282,13 @@ def add_filter(self, filter_id, description): def add_format(self, format_id, number, entry_type, description): """ Add a format line to the header. - + Arguments: format_id (str): The id of the format line number (str): Integer or any of [A,R,G,.] entry_type (str): Any of [Integer,Float,Flag,Character,String] description (str): A description of the info line - + """ format_line = '##FORMAT='.format( format_id, number, entry_type, description @@ -293,15 +300,13 @@ def add_format(self, format_id, number, entry_type, description): def add_alt(self, alt_id, description): """ Add a alternative allele format field line to the header. - + Arguments: alt_id (str): The id of the alternative line description (str): A description of the info line - + """ - alt_line = '##ALT='.format( - alt_id, description - ) + alt_line = '##ALT='.format(alt_id, description) self.logger.info("Adding alternative allele line to vcf: {0}".format(alt_line)) self.parse_meta_data(alt_line) return @@ -309,15 +314,13 @@ def add_alt(self, alt_id, description): def add_contig(self, contig_id, length): """ Add a contig line to the header. - + Arguments: contig_id (str): The id of the alternative line length (str): A description of the info line - + """ - contig_line = '##contig='.format( - contig_id, length - ) + contig_line = "##contig=".format(contig_id, length) self.logger.info("Adding contig line to vcf: {0}".format(contig_line)) self.parse_meta_data(contig_line) return @@ -325,32 +328,31 @@ def add_contig(self, contig_id, length): def add_contig(self, contig_id, length): """ Add a contig line to the header. - + Arguments: contig_id (str): The id of the alternative line length (str): A description of the info line - + """ - contig_line = '##contig='.format( - contig_id, length - ) + contig_line = "##contig=".format(contig_id, length) self.logger.info("Adding contig line to vcf: {0}".format(contig_line)) self.parse_meta_data(contig_line) return - def add_version_tracking(self, info_id, version, date, command_line=''): + def add_version_tracking(self, info_id, version, date, command_line=""): """ - Add a line with information about which software that was run and when + Add a line with information about which software that was run and when to the header. - + Arguments: info_id (str): The id of the info line version (str): The version of the software used date (str): Date when software was run command_line (str): The command line that was used for run - + """ other_line = '##Software='.format( - info_id, version, date, command_line) + info_id, version, date, command_line + ) self.other_dict[info_id] = other_line return diff --git a/genmod/vcf_tools/parse_variant.py b/genmod/vcf_tools/parse_variant.py index c3b498a..35c1d4a 100644 --- a/genmod/vcf_tools/parse_variant.py +++ b/genmod/vcf_tools/parse_variant.py @@ -1,93 +1,98 @@ """Parse a variant line in different ways""" -import string + import logging +import string logging = logging.getLogger(__name__) + def get_variant_dict(variant_line, header_line): """Parse a variant line - - Split a variant line and map the fields on the header columns - - Args: - variant_line (str): A vcf variant line - header_line (list): A list with the header columns - Returns: - variant_dict (dict): A variant dictionary + + Split a variant line and map the fields on the header columns + + Args: + variant_line (str): A vcf variant line + header_line (list): A list with the header columns + Returns: + variant_dict (dict): A variant dictionary """ # logger.debug("Building variant dict from {0}".format(variant_line)) - - return dict(zip(header_line, variant_line.rstrip().split('\t'))) + + return dict(zip(header_line, variant_line.rstrip().split("\t"))) + def get_info_dict(info_line): """Parse a info field of a variant - - Make a dictionary from the info field of a vcf variant. - Keys are the info keys and values are the raw strings from the vcf - - Args: - info_line (str): The info field of a vcf variant - Returns: - info_dict (dict): A INFO dictionary + + Make a dictionary from the info field of a vcf variant. + Keys are the info keys and values are the raw strings from the vcf + + Args: + info_line (str): The info field of a vcf variant + Returns: + info_dict (dict): A INFO dictionary """ - + variant_info = {} - for raw_info in info_line.split(';'): - splitted_info = raw_info.split('=') + for raw_info in info_line.split(";"): + splitted_info = raw_info.split("=") if len(splitted_info) == 2: variant_info[splitted_info[0]] = splitted_info[1] else: variant_info[splitted_info[0]] = [] - + return variant_info + def get_variant_id(variant_dict): """Build a variant id - - The variant id is a string made of CHROM_POS_REF_ALT - - The alt field for svs needs some massage to work downstream. - - Args: - variant_dict (dict): A variant dictionary - - Returns: - variant_id (str) + + The variant id is a string made of CHROM_POS_REF_ALT + + The alt field for svs needs some massage to work downstream. + + Args: + variant_dict (dict): A variant dictionary + + Returns: + variant_id (str) """ - chrom = variant_dict['CHROM'] - pos = variant_dict['POS'] - ref = variant_dict['REF'] - #There are several symbols in structural variant calls that make - #things hard. We will strip those symbols + chrom = variant_dict["CHROM"] + pos = variant_dict["POS"] + ref = variant_dict["REF"] + # There are several symbols in structural variant calls that make + # things hard. We will strip those symbols bad_chars = "<>[]:" - alt = ''.join(c for c in variant_dict['ALT'] if c not in bad_chars) - return '_'.join([chrom,pos,ref,alt]) + alt = "".join(c for c in variant_dict["ALT"] if c not in bad_chars) + return "_".join([chrom, pos, ref, alt]) + def get_vep_dict(vep_string, vep_header, allele=None): """Make the vep annotation into a dictionary - - This dictionary will have the alleles as keys and a list of - dictionaries with vep annotations as values. - - Args: - vep_list (string): A string with the CSQ annotation - vep_header (list): A list with the vep header - allele (str): The allele that is annotated - - Return: - vep_dict (dict): A vep dict as described above - + + This dictionary will have the alleles as keys and a list of + dictionaries with vep annotations as values. + + Args: + vep_list (string): A string with the CSQ annotation + vep_header (list): A list with the vep header + allele (str): The allele that is annotated + + Return: + vep_dict (dict): A vep dict as described above + """ vep_dict = {} - for vep_annotation in vep_string.split(','): - inner_dict = dict(zip(vep_header, vep_annotation.split('|'))) - #If allele is annotated by vep we use that allele - if 'Allele' in inner_dict: - allele = inner_dict['Allele'] - + for vep_annotation in vep_string.split(","): + inner_dict = dict(zip(vep_header, vep_annotation.split("|"))) + # If allele is annotated by vep we use that allele + if "Allele" in inner_dict: + allele = inner_dict["Allele"] + if allele in vep_dict: vep_dict[allele].append(inner_dict) else: vep_dict[allele] = [inner_dict] - + return vep_dict diff --git a/genmod/vcf_tools/print_headers.py b/genmod/vcf_tools/print_headers.py index fa4dfc6..fdaca0f 100755 --- a/genmod/vcf_tools/print_headers.py +++ b/genmod/vcf_tools/print_headers.py @@ -11,25 +11,24 @@ from __future__ import print_function + def print_headers(head, outfile=None, silent=False): """ Print the vcf headers. - + If a result file is provided headers will be printed here, otherwise they are printed to stdout. - + Args: head (HeaderParser): A vcf header object outfile (FileHandle): A file handle silent (Bool): If nothing should be printed. - + """ for header_line in head.print_header(): - if outfile: - outfile.write(header_line+'\n') + outfile.write(header_line + "\n") else: if not silent: print(header_line) return - diff --git a/genmod/vcf_tools/print_variants.py b/genmod/vcf_tools/print_variants.py index 28ded42..01c7153 100755 --- a/genmod/vcf_tools/print_variants.py +++ b/genmod/vcf_tools/print_variants.py @@ -21,40 +21,49 @@ import locale import sys -# sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout) +# sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout) + def print_variant_dict(variant, header_line, outfile=None, silent=False): """Print a variant dictionary - - Prints a variant dictionary to a file or stdout - - Args: - variant (dict): A variant dictionary - header_line (list): A list with the header columns - outfile (FileHandle): A file handle - + + Prints a variant dictionary to a file or stdout + + Args: + variant (dict): A variant dictionary + header_line (list): A list with the header columns + outfile (FileHandle): A file handle + """ - print_line = [variant.get(entry, '.') for entry in header_line] - + print_line = [variant.get(entry, ".") for entry in header_line] + if outfile: - outfile.write('\t'.join(print_line) + '\n') + outfile.write("\t".join(print_line) + "\n") else: if not silent: - print('\t'.join(print_line)) - -def print_variant(variant_line=None, variant_dict=None, header_line=None, - priority=None, outfile=None, mode='vcf', silent=False): + print("\t".join(print_line)) + + +def print_variant( + variant_line=None, + variant_dict=None, + header_line=None, + priority=None, + outfile=None, + mode="vcf", + silent=False, +): """ Print a variant line. - - If a result file is provided the variante will be appended to the file, + + If a result file is provided the variante will be appended to the file, otherwise they are printed to stdout. - + There are two modes, 'vcf' or 'modified'. If 'vcf' we expect plain vcf variants and print them as they came in. - If 'modified' the first column has been used for sorting so we skip + If 'modified' the first column has been used for sorting so we skip that one. - + Args: variant_line (str): A vcf formatted variant line variant_dict (dict): A variant dictionary @@ -63,41 +72,42 @@ def print_variant(variant_line=None, variant_dict=None, header_line=None, outfile (FileHandle): An opened file_handle mode (str): 'vcf' or 'modified' silent (bool): Bool. If nothing should be printed. - + """ - + if variant_dict: if not header_line: raise IOError("Print line needs a header_line when printing variant dict.") - print_line = [variant_dict.get(entry, '.') for entry in header_line] - + print_line = [variant_dict.get(entry, ".") for entry in header_line] + else: - print_line = variant_line.rstrip().split('\t') - - if mode == 'modified': + print_line = variant_line.rstrip().split("\t") + + if mode == "modified": print_line = print_line[1:] - + elif priority: print_line = [priority] + print_line - print_string = '\t'.join(print_line) - + print_string = "\t".join(print_line) + # Only for python2 if not isinstance(print_string, str): - print_string = print_string.encode('utf-8') + print_string = print_string.encode("utf-8") if outfile: - outfile.write(print_string + '\n') - + outfile.write(print_string + "\n") + else: if not silent: print(print_string) return + def print_variant_for_sorting(variant_line, priority, outfile): """ Print the variants for sorting - + Arguments: variant_line (str): A vcf variant line prority (str): The priotiy for this variant @@ -105,6 +115,5 @@ def print_variant_for_sorting(variant_line, priority, outfile): family_id (str): The family Id for sorting on rank score """ variant_line = variant_line.rstrip().split("\t") - - outfile.write("{0}\t{1}\n".format(priority, '\t'.join(variant_line))) - + + outfile.write("{0}\t{1}\n".format(priority, "\t".join(variant_line))) diff --git a/genmod/vcf_tools/sort_variants.py b/genmod/vcf_tools/sort_variants.py index 3c90220..03a6c18 100755 --- a/genmod/vcf_tools/sort_variants.py +++ b/genmod/vcf_tools/sort_variants.py @@ -11,59 +11,58 @@ from __future__ import print_function -import sys -import os -import click import logging - -from subprocess import call +import os +import sys from datetime import datetime +from subprocess import call + +import click logger = logging.getLogger(__name__) -def sort_variants(infile, mode='chromosome'): + +def sort_variants(infile, mode="chromosome"): """ Sort variants based on rank score or chromosome. - + Uses unix sort to sort the variants and overwrites the infile. - + Args: infile : A string that is the path to a file mode : 'chromosome' or 'rank' outfile : The path to an outfile where the variants should be printed - + Returns: 0 if sorting was performed 1 if variants where not sorted """ command = [ - 'sort', - ] - if mode == 'chromosome': - command.append('-n') - command.append('-k1') - command.append('-k3') + "sort", + ] + if mode == "chromosome": + command.append("-n") + command.append("-k1") + command.append("-k3") - elif mode == 'rank': - command.append('-rn') - command.append('-k1') + elif mode == "rank": + command.append("-rn") + command.append("-k1") - command = command + [infile, '-o', infile] + command = command + [infile, "-o", infile] logger.info("Start sorting variants...") - logger.info("Sort command: {0}".format(' '.join(command))) + logger.info("Sort command: {0}".format(" ".join(command))) sort_start = datetime.now() - + try: call(command) except OSError as e: logger.warning("unix program 'sort' does not seem to exist on your system...") logger.warning("genmod needs unix sort to provide a sorted output.") - logger.warning("Output VCF will not be sorted since genmod can not find"\ - "unix sort") + logger.warning("Output VCF will not be sorted since genmod can not find" "unix sort") raise e - logger.info("Sorting done. Time to sort: {0}".format(datetime.now()-sort_start)) - - return + logger.info("Sorting done. Time to sort: {0}".format(datetime.now() - sort_start)) + return diff --git a/tests/annotate_regions/test_bed_parser.py b/tests/annotate_regions/test_bed_parser.py index 477d180..6791406 100644 --- a/tests/annotate_regions/test_bed_parser.py +++ b/tests/annotate_regions/test_bed_parser.py @@ -1,17 +1,17 @@ from genmod.annotate_regions.parse_annotations import bed_parser + def test_bed_parser(bed_lines): # GIVEN an iterable with bed lines nr_regions = 0 symbols = [] for line in bed_lines: - if not line.startswith('#'): + if not line.startswith("#"): nr_regions += 1 symbols.append(line.split()[3]) # WHEN parsing the bedlines for index, region in enumerate(bed_parser(bed_lines)): - # THEN assert the symbols are found in the bed lines - assert region['symbol'] in symbols - - assert index+1 == nr_regions - \ No newline at end of file + # THEN assert the symbols are found in the bed lines + assert region["symbol"] in symbols + + assert index + 1 == nr_regions diff --git a/tests/annotate_regions/test_build_region_trees.py b/tests/annotate_regions/test_build_region_trees.py index ba0d66c..510861e 100644 --- a/tests/annotate_regions/test_build_region_trees.py +++ b/tests/annotate_regions/test_build_region_trees.py @@ -1,5 +1,6 @@ -from intervaltree import IntervalTree from genmod.annotate_regions.parse_annotations import build_region_trees +from intervaltree import IntervalTree + def test_build_region_trees(bed_lines): # GIVEN some lines in the bed format @@ -16,31 +17,31 @@ def test_query_region_trees(): lines = [ "13\t1\t100\tHMGA1P6\tENSG00000233440\n", "13\t50\t200\tRNY3P4\tENSG00000207157\n", - "13\t300\t1000\tLINC00362\tENSG00000229483\n" + "13\t300\t1000\tLINC00362\tENSG00000229483\n", ] # WHEN building region trees region_trees = build_region_trees(lines, padding=0) - + # THEN make sure that we get the correct info when querying - interval_tree = region_trees['13'] - + interval_tree = region_trees["13"] + # There should be two results here assert len(interval_tree[50]) == 2 - + result = interval_tree[100] - #Intervals are half opened + # Intervals are half opened assert len(result) == 1 for interval in result: - assert interval.data == 'RNY3P4' + assert interval.data == "RNY3P4" result = interval_tree[100:100] - #Intervals without lenght does not match anythin + # Intervals without lenght does not match anythin assert len(result) == 0 - + result = interval_tree[250:600] for interval in result: - assert interval.data == 'LINC00362' - + assert interval.data == "LINC00362" + # Test none overlapping result = interval_tree[10000:11000] assert not result diff --git a/tests/annotate_regions/test_get_interval.py b/tests/annotate_regions/test_get_interval.py index 106492f..b89e7d3 100644 --- a/tests/annotate_regions/test_get_interval.py +++ b/tests/annotate_regions/test_get_interval.py @@ -1,16 +1,17 @@ from genmod.annotate_regions.parse_annotations import get_interval + def test_get_interval(): # GIVEN some coordinates and a symbol start = 1 stop = 10 - symbol = 'first' - + symbol = "first" + # WHEN building an interval interval = get_interval(start, stop, symbol) - + # THEN the interval should have the right properties - + assert interval.begin == start assert interval.end == stop - assert interval.data == symbol \ No newline at end of file + assert interval.data == symbol diff --git a/tests/annotate_variants/test_add_annotations.py b/tests/annotate_variants/test_add_annotations.py index 385b02c..71dfe7a 100644 --- a/tests/annotate_variants/test_add_annotations.py +++ b/tests/annotate_variants/test_add_annotations.py @@ -1,9 +1,18 @@ -from genmod.annotate_variants.add_annotations import (add_regions, add_exac, - add_exac_max, add_thousandg, add_thousandg_max, add_spidex, add_cadd, - add_cadd_raw, add_cosmic) +from genmod.annotate_variants.add_annotations import ( + add_cadd, + add_cadd_raw, + add_cosmic, + add_exac, + add_exac_max, + add_regions, + add_spidex, + add_thousandg, + add_thousandg_max, +) + def test_add_regions(header): - key = 'Annotation' + key = "Annotation" # GIVEN a header without key assert key not in header.info_dict # WHEN adding annotation @@ -11,8 +20,9 @@ def test_add_regions(header): # THEN the key should have been added to the header assert key in header.info_dict + def test_add_exac(header): - key = 'EXACAF' + key = "EXACAF" # GIVEN a header without key assert key not in header.info_dict # WHEN adding annotation @@ -20,8 +30,9 @@ def test_add_exac(header): # THEN the key should have been added to the header assert key in header.info_dict + def test_add_exac_max(header): - key = 'EXAC_MAX_AF' + key = "EXAC_MAX_AF" # GIVEN a header without key assert key not in header.info_dict # WHEN adding annotation @@ -29,8 +40,9 @@ def test_add_exac_max(header): # THEN the key should have been added to the header assert key in header.info_dict + def test_add_thousand_g(header): - key = '1000GAF' + key = "1000GAF" # GIVEN a header without key assert key not in header.info_dict # WHEN adding annotation @@ -38,8 +50,9 @@ def test_add_thousand_g(header): # THEN the key should have been added to the header assert key in header.info_dict + def test_add_thousand_g_max(header): - key = '1000G_MAX_AF' + key = "1000G_MAX_AF" # GIVEN a header without key assert key not in header.info_dict # WHEN adding annotation @@ -47,8 +60,9 @@ def test_add_thousand_g_max(header): # THEN the key should have been added to the header assert key in header.info_dict + def test_add_spidex(header): - key = 'SPIDEX' + key = "SPIDEX" # GIVEN a header without key assert key not in header.info_dict # WHEN adding annotation @@ -56,8 +70,9 @@ def test_add_spidex(header): # THEN the key should have been added to the header assert key in header.info_dict + def test_add_cadd(header): - key = 'CADD' + key = "CADD" # GIVEN a header without key assert key not in header.info_dict # WHEN adding annotation @@ -65,8 +80,9 @@ def test_add_cadd(header): # THEN the key should have been added to the header assert key in header.info_dict + def test_add_cadd_raw(header): - key = 'CADD_raw' + key = "CADD_raw" # GIVEN a header without key assert key not in header.info_dict # WHEN adding annotation @@ -74,8 +90,9 @@ def test_add_cadd_raw(header): # THEN the key should have been added to the header assert key in header.info_dict + def test_add_cosmic(header): - key = 'COSMIC' + key = "COSMIC" # GIVEN a header without key assert key not in header.info_dict # WHEN adding annotation diff --git a/tests/conftest.py b/tests/conftest.py index a884d19..90c8cad 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,75 +1,81 @@ import os -import pytest -import tabix from tempfile import NamedTemporaryFile +import pytest +import tabix from genmod.annotations import ensembl_path_37 as ensembl_path - from genmod.vcf_tools.header_parser import HeaderParser thousandg_path = "tests/fixtures/annotate_variant/small_1000G_maxAF.vcf.gz" thousandg_chr_path = "tests/fixtures/annotate_variant/small_1000G_chr.vcf.gz" vcf = "tests/fixtures/test_vcf.vcf" -@pytest.fixture(scope='function') + +@pytest.fixture(scope="function") def thousand_g_path(request): """Return the path to a bgzipped 1000G file""" return thousandg_path -@pytest.fixture(scope='function') + +@pytest.fixture(scope="function") def thousand_g_chr_path(request): """Return the path to a bgzipped 1000G file""" return thousandg_chr_path -@pytest.fixture(scope='function') + +@pytest.fixture(scope="function") def thousand_g_handle(request, thousand_g_path): """Return a tabix handle with a 1000G file""" thousand_g = tabix.open(thousand_g_path) return thousand_g -@pytest.fixture(scope='function') + +@pytest.fixture(scope="function") def thousand_g_chr_handle(request, thousand_g_chr_path): """Return a tabix handle with a 1000G file""" thousand_g = tabix.open(thousand_g_chr_path) return thousand_g -@pytest.fixture(scope='function') + +@pytest.fixture(scope="function") def vcf_path(request): """Return the path to a vcf file""" return vcf -@pytest.fixture(scope='function') + +@pytest.fixture(scope="function") def header(request, vcf_path): """Return the a header object""" head = HeaderParser() - with open(vcf_path, 'r') as variant_file: + with open(vcf_path, "r") as variant_file: for line in variant_file: line = line.rstrip() - - if line.startswith('#'): - if line.startswith('##'): + + if line.startswith("#"): + if line.startswith("##"): head.parse_meta_data(line) else: head.parse_header_line(line) else: break - + return head -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def bed_lines(request): """Return a iterable with bed lines""" lines = [ "#Chromosome/scaffold name\tStart (bp)\tGene End (bp)\tHGNC symbol\tGene ID", "13\t23708313\t23708703\tHMGA1P6\tENSG00000233440\n", "13\t23726725\t23726825\tRNY3P4\tENSG00000207157\n", - "13\t23743974\t23744736\tLINC00362\tENSG00000229483\n" + "13\t23743974\t23744736\tLINC00362\tENSG00000229483\n", ] return lines -@pytest.fixture(scope='function') + +@pytest.fixture(scope="function") def ensembl_file(request): """Return the path to ensembl file with region defenitions""" - return ensembl_path \ No newline at end of file + return ensembl_path diff --git a/tests/functionality/test_annotate_models.py b/tests/functionality/test_annotate_models.py index bbeac54..3fff16f 100644 --- a/tests/functionality/test_annotate_models.py +++ b/tests/functionality/test_annotate_models.py @@ -1,7 +1,8 @@ -from genmod.commands import models_command from tempfile import NamedTemporaryFile from typing import Dict, Union + from click.testing import CliRunner +from genmod.commands import models_command ANNOTATED_VCF_FILE = "tests/fixtures/test_vcf_annotated.vcf" VCF_FILE = "tests/fixtures/test_vcf_regions.vcf" @@ -16,95 +17,83 @@ init_log(logger, loglevel="INFO") + def _generate_genetic_models_string_from_file(file_path: str) -> str: """ Yield genetic model string from VCF. :param file_path: VCF to be read """ for variant in generate_variants_from_file(file_path=file_path): - genetic_models_entry: str = variant['info_dict'].get('GeneticModels', '') - for family_genetic_models in genetic_models_entry.split(','): - family_genetic_models = family_genetic_models.split(':') - if len(family_genetic_models) > 1: # Not all variants will have a model + genetic_models_entry: str = variant["info_dict"].get("GeneticModels", "") + for family_genetic_models in genetic_models_entry.split(","): + family_genetic_models = family_genetic_models.split(":") + if len(family_genetic_models) > 1: # Not all variants will have a model genetic_models: str = family_genetic_models[1] yield genetic_models + def _run_model_command(vcf_file): """Helper function to run models_command and return output as a list.""" runner = CliRunner() - result = runner.invoke(models_command, [vcf_file, '-f', FAMILY_FILE]) + result = runner.invoke(models_command, [vcf_file, "-f", FAMILY_FILE]) assert result.exit_code == 0, f"Command failed with exit code: {result.exit_code}" - + with NamedTemporaryFile(delete=False) as temp_file: temp_file.write(result.stdout_bytes) temp_file.seek(0) # Move back to the start of the file return list(_generate_genetic_models_string_from_file(temp_file.name)) + def test_genmod_annotate_models_no_family(): """docstring for test_genmod_annotate_models""" runner = CliRunner() result = runner.invoke(models_command, [VCF_FILE]) - + # This should fail since there is no family file assert result.exit_code == 1 + def test_genmod_annotate_models(): """docstring for test_genmod_annotate_models""" runner = CliRunner() - result = runner.invoke(models_command, [ - VCF_FILE, - '-f', - FAMILY_FILE - ] - ) - + result = runner.invoke(models_command, [VCF_FILE, "-f", FAMILY_FILE]) + print(result.output) assert result.exit_code == 0 + def test_genmod_annotate_models_empty_vcf(): """docstring for test_genmod_annotate_models""" runner = CliRunner() - result = runner.invoke(models_command, [ - EMPTY_VCF_FILE, - '-f', - FAMILY_FILE - ] - ) + result = runner.invoke(models_command, [EMPTY_VCF_FILE, "-f", FAMILY_FILE]) print(result.output) assert result.exit_code == 0 + def test_annotate_models_already_annotated(): """docstring for test_genmod_annotate_models""" runner = CliRunner() - result = runner.invoke(models_command, [ - ANNOTATED_VCF_FILE, - '-f', - FAMILY_FILE - ] - ) - + result = runner.invoke(models_command, [ANNOTATED_VCF_FILE, "-f", FAMILY_FILE]) + assert result.exit_code == 1 + def test_annotate_models_lacking_ind(): """docstring for test_genmod_annotate_models""" runner = CliRunner() - result = runner.invoke(models_command, [ - VCF_FILE, - '-f', - BAD_FAMILY_FILE - ] - ) + result = runner.invoke(models_command, [VCF_FILE, "-f", BAD_FAMILY_FILE]) assert result.exit_code == 1 + def test_annotate_models_chr_prefix(): """Test that genetic models are identical for VCF with and without 'chr' prefix.""" - + # Get models from both VCF files models_list = _run_model_command(VCF_FILE) models_list_with_chr = _run_model_command(VCF_FILE_WITH_CHR) - + # Assert that the lists of models are identical assert len(models_list) > 0 and len(models_list_with_chr) > 0, "No models in VCFs" assert models_list == models_list_with_chr, "Models differ between VCF files." diff --git a/tests/functionality/test_annotate_variant.py b/tests/functionality/test_annotate_variant.py index d040a3b..aaf51ed 100644 --- a/tests/functionality/test_annotate_variant.py +++ b/tests/functionality/test_annotate_variant.py @@ -1,5 +1,5 @@ -from genmod.commands.base import cli from click.testing import CliRunner +from genmod.commands.base import cli VCF_FILE = "tests/fixtures/test_vcf_regions.vcf" EMPTY_VCF_FILE = "tests/fixtures/empty.vcf" @@ -9,86 +9,55 @@ from genmod import logger from genmod.log import init_log + init_log(logger, loglevel="INFO") def test_genmod_annotate_features(): """docstring for test_genmod_annotate_models""" runner = CliRunner() - result = runner.invoke( - cli, [ - 'annotate', - VCF_FILE, - '-r' - ]) + result = runner.invoke(cli, ["annotate", VCF_FILE, "-r"]) assert result.exit_code == 0 + def test_genmod_annotate_features_empty_vcf(): """docstring for test_genmod_annotate_models""" runner = CliRunner() - result = runner.invoke( - cli, [ - 'annotate', - EMPTY_VCF_FILE, - '-r' - ]) + result = runner.invoke(cli, ["annotate", EMPTY_VCF_FILE, "-r"]) assert result.exit_code == 0 + def test_genmod_annotate_features_38(): """Test to annotate variants with the GRCh38 build""" runner = CliRunner() - result = runner.invoke( - cli, [ - 'annotate', - VCF_FILE, - '-r', - '-b', - '38' - ]) + result = runner.invoke(cli, ["annotate", VCF_FILE, "-r", "-b", "38"]) assert result.exit_code == 0 + def test_genmod_annotate_thousand_g(): """docstring for test_genmod_annotate_models""" runner = CliRunner() - result = runner.invoke( - cli, [ - 'annotate', - VCF_FILE, - '--thousand-g', - THOUSAND_G_FILE - ]) - + result = runner.invoke(cli, ["annotate", VCF_FILE, "--thousand-g", THOUSAND_G_FILE]) + assert result.exit_code == 0 + def test_genmod_annotate_cadd(): """docstring for test_genmod_annotate_models""" runner = CliRunner() - result = runner.invoke( - cli, [ - 'annotate', - VCF_FILE, - '--cadd-file', - CADD_FILE - ]) - + result = runner.invoke(cli, ["annotate", VCF_FILE, "--cadd-file", CADD_FILE]) + assert result.exit_code == 0 + def test_genmod_annotate_multiple_cadd(): """docstring for test_genmod_annotate_models""" runner = CliRunner() result = runner.invoke( - cli, [ - 'annotate', - VCF_FILE, - '--cadd-file', - CADD_FILE, - '--cadd-file', - CADD_1000G_FILE - - ]) - - assert result.exit_code == 0 + cli, ["annotate", VCF_FILE, "--cadd-file", CADD_FILE, "--cadd-file", CADD_1000G_FILE] + ) + assert result.exit_code == 0 diff --git a/tests/functionality/test_filter_variants.py b/tests/functionality/test_filter_variants.py index 1c023d6..a54fcc1 100644 --- a/tests/functionality/test_filter_variants.py +++ b/tests/functionality/test_filter_variants.py @@ -1,10 +1,11 @@ -from genmod.commands import filter_command from click.testing import CliRunner +from genmod.commands import filter_command ANNOTATED_VCF_FILE = "tests/fixtures/test_vcf_annotated.vcf" from genmod import logger from genmod.log import init_log + init_log(logger, loglevel="INFO") @@ -12,6 +13,5 @@ def test_genmod_filter(): """docstring for test_genmod_annotate_models""" runner = CliRunner() result = runner.invoke(filter_command, [ANNOTATED_VCF_FILE]) - - assert result.exit_code == 0 + assert result.exit_code == 0 diff --git a/tests/functionality/test_score_variants.py b/tests/functionality/test_score_variants.py index c523377..84b5d51 100644 --- a/tests/functionality/test_score_variants.py +++ b/tests/functionality/test_score_variants.py @@ -1,5 +1,5 @@ -from genmod.commands import score_command from click.testing import CliRunner +from genmod.commands import score_command ANNOTATED_VCF_FILE = "tests/fixtures/test_vcf_annotated.vcf" VCF_FILE = "tests/fixtures/test_vcf_regions.vcf" @@ -10,6 +10,7 @@ from genmod import logger from genmod.log import init_log + init_log(logger, loglevel="INFO") @@ -17,47 +18,37 @@ def test_genmod_score_no_config(): """docstring for test_genmod_annotate_models""" runner = CliRunner() result = runner.invoke(score_command, [ANNOTATED_VCF_FILE]) - + # This should fail since there is no family file assert result.exit_code == 1 + def test_genmod_score(): """docstring for test_genmod_annotate_models""" runner = CliRunner() - result = runner.invoke(score_command, [ - ANNOTATED_VCF_FILE, - '-c', - SCORE_CONFIG - ] - ) + result = runner.invoke(score_command, [ANNOTATED_VCF_FILE, "-c", SCORE_CONFIG]) print(result.output) assert result.exit_code == 0 + def test_genmod_score_empty_vcf(): """docstring for test_genmod_annotate_models""" runner = CliRunner() - result = runner.invoke(score_command, [ - EMPTY_VCF_FILE, - '-c', - SCORE_CONFIG - ] - ) + result = runner.invoke(score_command, [EMPTY_VCF_FILE, "-c", SCORE_CONFIG]) print(result.output) assert result.exit_code == 0 + def test_annotate_models_already_scored(): """docstring for test_genmod_annotate_models""" runner = CliRunner() - result = runner.invoke(score_command, [ - SCORED_VCF, - '-c', - SCORE_CONFIG - ] - ) + result = runner.invoke(score_command, [SCORED_VCF, "-c", SCORE_CONFIG]) assert result.exit_code == 1 + + # # def test_annotate_models_lacking_ind(): # """docstring for test_genmod_annotate_models""" @@ -69,4 +60,4 @@ def test_annotate_models_already_scored(): # ] # ) # -# assert result.exit_code == 1 \ No newline at end of file +# assert result.exit_code == 1 diff --git a/tests/functionality/test_score_variants_ranks_score_is_float.py b/tests/functionality/test_score_variants_ranks_score_is_float.py index b3c0d04..f797d24 100644 --- a/tests/functionality/test_score_variants_ranks_score_is_float.py +++ b/tests/functionality/test_score_variants_ranks_score_is_float.py @@ -1,22 +1,23 @@ -import pytest from tempfile import NamedTemporaryFile -from click.testing import CliRunner +import pytest +from click.testing import CliRunner from genmod.commands import score_command, score_compounds_command -from test_utils import generate_variants_from_file +from test_utils import generate_variants_from_file ANNOTATED_VCF_FILE = "tests/fixtures/test_vcf_annotated.vcf" SCORE_CONFIG = "tests/fixtures/score_variants/genmod_example.ini" + def _generate_rank_score_strings_from_file(file_path: str) -> str: """ Yield rank score strings from VCF. :param file_path: VCF to be read """ for variant in generate_variants_from_file(file_path=file_path): - rank_score_entry: str = variant['info_dict'].get('RankScore', '') - for family_rank_score in rank_score_entry.split(','): - family_rank_score = family_rank_score.split(':') + rank_score_entry: str = variant["info_dict"].get("RankScore", "") + for family_rank_score in rank_score_entry.split(","): + family_rank_score = family_rank_score.split(":") family_id: str = family_rank_score[0] rank_score: str = family_rank_score[1] yield rank_score @@ -29,12 +30,9 @@ def _check_rankscore_string_is_float(rank_score_string: str): :raises AssertionError: In case rank_score_string is not a float-type value """ # Check decimal point presence - assert '.' in rank_score_string + assert "." in rank_score_string # Check all numerical digits, might contain period and minus sign. - assert rank_score_string \ - .replace('.', '') \ - .replace('-','') \ - .isdigit() + assert rank_score_string.replace(".", "").replace("-", "").isdigit() # Check successfully parsed to float assert isinstance(float(rank_score_string), float) @@ -46,24 +44,24 @@ def test_check_rankscore_string_is_float(): # GIVEN some rank score strings # WHEN running the method under test # THEN expect it to behave as expected in positive case - _check_rankscore_string_is_float('0.0') - _check_rankscore_string_is_float('132.1') - _check_rankscore_string_is_float('132.10') - _check_rankscore_string_is_float('-10.0') + _check_rankscore_string_is_float("0.0") + _check_rankscore_string_is_float("132.1") + _check_rankscore_string_is_float("132.10") + _check_rankscore_string_is_float("-10.0") # THEN expect it to behave as expected in negative case with pytest.raises(AssertionError): - _check_rankscore_string_is_float('') + _check_rankscore_string_is_float("") with pytest.raises(AssertionError): - _check_rankscore_string_is_float('132') + _check_rankscore_string_is_float("132") with pytest.raises(AssertionError): - _check_rankscore_string_is_float('b') + _check_rankscore_string_is_float("b") with pytest.raises(AssertionError): - _check_rankscore_string_is_float('0') + _check_rankscore_string_is_float("0") with pytest.raises(AssertionError): - _check_rankscore_string_is_float('.') + _check_rankscore_string_is_float(".") with pytest.raises(AssertionError): - _check_rankscore_string_is_float('-') + _check_rankscore_string_is_float("-") def test_rankscore_is_float_type(): @@ -74,16 +72,11 @@ def test_rankscore_is_float_type(): # GIVEN some VCF file to be ranked runner = CliRunner() # WHEN computing rank score - result = runner.invoke(score_command, [ - ANNOTATED_VCF_FILE, - '-c', - SCORE_CONFIG - ] - ) + result = runner.invoke(score_command, [ANNOTATED_VCF_FILE, "-c", SCORE_CONFIG]) assert result.exit_code == 0 temporary_file = NamedTemporaryFile() - with open(temporary_file.name, 'w') as file: - file.write(result.stdout_bytes.decode('utf-8')) # Save processed VCF to file + with open(temporary_file.name, "w") as file: + file.write(result.stdout_bytes.decode("utf-8")) # Save processed VCF to file # THEN expect all rank scores to be float type for rank_score_string in _generate_rank_score_strings_from_file(file_path=temporary_file.name): _check_rankscore_string_is_float(rank_score_string) @@ -91,14 +84,16 @@ def test_rankscore_is_float_type(): # GIVEN some ranked VCF file, run compound scoring (which modify the RankScore) runner = CliRunner() # WHEN computing compound score - result = runner.invoke(score_compounds_command, [ - temporary_file.name, - ] + result = runner.invoke( + score_compounds_command, + [ + temporary_file.name, + ], ) assert result.exit_code == 0 temporary_file = NamedTemporaryFile() - with open(temporary_file.name, 'w') as file: - file.write(result.stdout_bytes.decode('utf-8')) # Save processed VCF to file + with open(temporary_file.name, "w") as file: + file.write(result.stdout_bytes.decode("utf-8")) # Save processed VCF to file # THEN expect all rank scores (including modified compound scores) to be float type for rank_score_string in _generate_rank_score_strings_from_file(file_path=temporary_file.name): _check_rankscore_string_is_float(rank_score_string) diff --git a/tests/functionality/test_sort_variants.py b/tests/functionality/test_sort_variants.py index d4dfe58..b63a779 100644 --- a/tests/functionality/test_sort_variants.py +++ b/tests/functionality/test_sort_variants.py @@ -1,5 +1,5 @@ -from genmod.commands import sort_command from click.testing import CliRunner +from genmod.commands import sort_command ANNOTATED_VCF_FILE = "tests/fixtures/test_vcf_annotated.vcf" VCF_FILE = "tests/fixtures/test_vcf_regions.vcf" @@ -10,6 +10,7 @@ from genmod import logger from genmod.log import init_log + init_log(logger, loglevel="INFO") @@ -21,51 +22,58 @@ # # This should fail since there is no family file # assert result.exit_code == 1 + def test_genmod_sort(): """docstring for test_genmod_annotate_models""" runner = CliRunner() - result = runner.invoke(sort_command, [ - SCORED_VCF, - ] + result = runner.invoke( + sort_command, + [ + SCORED_VCF, + ], ) print(result.output) assert result.exit_code == 0 + def test_genmod_sort_empty(): """docstring for test_genmod_annotate_models""" runner = CliRunner() - result = runner.invoke(sort_command, [ - EMPTY_VCF_FILE, - ] + result = runner.invoke( + sort_command, + [ + EMPTY_VCF_FILE, + ], ) print(result.output) assert result.exit_code == 0 + def test_genmod_sort_not_scored(): """docstring for test_genmod_annotate_models""" runner = CliRunner() - result = runner.invoke(sort_command, [ - ANNOTATED_VCF_FILE, - ] + result = runner.invoke( + sort_command, + [ + ANNOTATED_VCF_FILE, + ], ) print(result.output) assert result.exit_code == 0 + def test_genmod_sort_position(): """docstring for test_genmod_annotate_models""" runner = CliRunner() - result = runner.invoke(sort_command, [ - SCORED_VCF, - '-p' - ] - ) + result = runner.invoke(sort_command, [SCORED_VCF, "-p"]) print(result.output) assert result.exit_code == 0 + # def test_annotate_models_already_scored(): # """docstring for test_genmod_annotate_models""" # runner = CliRunner() @@ -76,7 +84,7 @@ def test_genmod_sort_position(): # ] # ) - # assert result.exit_code == 1 +# assert result.exit_code == 1 # # def test_annotate_models_lacking_ind(): # """docstring for test_genmod_annotate_models""" @@ -88,4 +96,4 @@ def test_genmod_sort_position(): # ] # ) # -# assert result.exit_code == 1 \ No newline at end of file +# assert result.exit_code == 1 diff --git a/tests/functionality/test_utils.py b/tests/functionality/test_utils.py index 29eda6d..ca68256 100644 --- a/tests/functionality/test_utils.py +++ b/tests/functionality/test_utils.py @@ -1,5 +1,7 @@ from typing import Dict, Union -from genmod.vcf_tools import HeaderParser, get_variant_dict, get_info_dict + +from genmod.vcf_tools import HeaderParser, get_info_dict, get_variant_dict + def parse_variant_file(file_path: str) -> HeaderParser: """ @@ -7,32 +9,32 @@ def parse_variant_file(file_path: str) -> HeaderParser: :param file_path: VCF to be read :raises ValueError: in case file is empty """ - with open(file_path, 'r') as variant_file: + with open(file_path, "r") as variant_file: head = HeaderParser() for line_index, line in enumerate(variant_file): line = line.rstrip() - if line.startswith('#'): - if line.startswith('##'): + if line.startswith("#"): + if line.startswith("##"): head.parse_meta_data(line) else: head.parse_header_line(line) else: break if line_index == 0: - raise ValueError('Expected contents in file, got none') + raise ValueError("Expected contents in file, got none") return head + def generate_variants_from_file(file_path: str) -> Dict[str, Union[str, int, float]]: """ Yield variants from VCF file. :param file_path: VCF to be read """ header = parse_variant_file(file_path=file_path) - with open(file_path, 'r') as variant_file: + with open(file_path, "r") as variant_file: for line in variant_file: - if line.startswith('#'): + if line.startswith("#"): continue variant: Dict[str, str] = get_variant_dict(line, header.header) - variant['info_dict'] = get_info_dict(variant['INFO']) + variant["info_dict"] = get_info_dict(variant["INFO"]) yield variant - diff --git a/tests/genetic_models/test_dominant_model.py b/tests/genetic_models/test_dominant_model.py index 1a0cd6a..b4e8242 100644 --- a/tests/genetic_models/test_dominant_model.py +++ b/tests/genetic_models/test_dominant_model.py @@ -1,159 +1,138 @@ from genmod.annotate_models.models import check_dominant from genmod.vcf_tools import Genotype - from ped_parser import FamilyParser FAMILY_FILE = "tests/fixtures/recessive_trio.ped" -def get_family(family_file = None, family_lines = None): - """Return a family object - - """ + +def get_family(family_file=None, family_lines=None): + """Return a family object""" family = None if family_file: - family = FamilyParser(open(family_file, 'r')) + family = FamilyParser(open(family_file, "r")) elif family_lines: family = FamilyParser(family_lines) - - return family + return family ################# Test affected ############### def test_dominant_affected_recessive_male(): - """Test a sick male - """ + """Test a sick male""" family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t2\n" + "1\tproband\t0\t0\t1\t2\n", ] - + family = get_family(family_lines=family_lines) - - recessive_variant = {'genotypes': {}} - recessive_variant['genotypes']['proband'] = Genotype(**{'GT':'0/1'}) - - assert check_dominant( - variant = recessive_variant, - family = family - ) == True + + recessive_variant = {"genotypes": {}} + recessive_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/1"}) + + assert check_dominant(variant=recessive_variant, family=family) == True + def test_dominant_affected_homozygote_male(): """Test an affected homozygote male""" - + family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t2\n" + "1\tproband\t0\t0\t1\t2\n", ] - + family = get_family(family_lines=family_lines) - - homozygote_variant = {'genotypes': {}} - homozygote_variant['genotypes']['proband'] = Genotype(**{'GT':'1/1'}) - - assert check_dominant( - variant = homozygote_variant, - family = family - ) == False + + homozygote_variant = {"genotypes": {}} + homozygote_variant["genotypes"]["proband"] = Genotype(**{"GT": "1/1"}) + + assert check_dominant(variant=homozygote_variant, family=family) == False def test_dominant_affected_male_ref_call(): """Test an affected ref call male""" - + family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t2\n" + "1\tproband\t0\t0\t1\t2\n", ] - + family = get_family(family_lines=family_lines) - - homozygote_variant = {'genotypes': {}} - homozygote_variant['genotypes']['proband'] = Genotype(**{'GT':'0/0'}) - - assert check_dominant( - variant = homozygote_variant, - family = family - ) == False + + homozygote_variant = {"genotypes": {}} + homozygote_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/0"}) + + assert check_dominant(variant=homozygote_variant, family=family) == False + def test_dominant_affected_no_call_male(): """Test a sick male with no gt call - + This should be true since there is no information that contradicts the model """ family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t2\n" + "1\tproband\t0\t0\t1\t2\n", ] - + family = get_family(family_lines=family_lines) - - no_call_variant = {'genotypes': {}} - no_call_variant['genotypes']['proband'] = Genotype(**{'GT':'./.'}) - - assert check_dominant( - variant = no_call_variant, - family = family - ) == True + + no_call_variant = {"genotypes": {}} + no_call_variant["genotypes"]["proband"] = Genotype(**{"GT": "./."}) + + assert check_dominant(variant=no_call_variant, family=family) == True + def test_dominant_affected_no_call_male_strict(): """Test a sick male with no gt call - + This should not be true since we allways need 'proof' for an inheritance pattern if strict mode. """ family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t2\n" + "1\tproband\t0\t0\t1\t2\n", ] - + family = get_family(family_lines=family_lines) - - no_call_variant = {'genotypes': {}} - no_call_variant['genotypes']['proband'] = Genotype(**{'GT':'./.'}) - - assert check_dominant( - variant = no_call_variant, - family = family, - strict = True - ) == False + + no_call_variant = {"genotypes": {}} + no_call_variant["genotypes"]["proband"] = Genotype(**{"GT": "./."}) + + assert check_dominant(variant=no_call_variant, family=family, strict=True) == False + ############### Test healthy ############## + def test_dominant_healthy_recessive_male(): - """Test a healthy recessive male - """ + """Test a healthy recessive male""" family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t1\n" + "1\tproband\t0\t0\t1\t1\n", ] - + family = get_family(family_lines=family_lines) - - recessive_variant = {'genotypes': {}} - recessive_variant['genotypes']['proband'] = Genotype(**{'GT':'0/1'}) - - assert check_dominant( - variant = recessive_variant, - family = family - ) == False + + recessive_variant = {"genotypes": {}} + recessive_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/1"}) + + assert check_dominant(variant=recessive_variant, family=family) == False + def test_dominant_healthy_recessive_male_reduced_penetrance(): """Test a healthy heterozygote female - + Females needs to bo hom alt to follow pattern """ family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t1\n" + "1\tproband\t0\t0\t1\t1\n", ] - + family = get_family(family_lines=family_lines) - - recessive_variant = {'genotypes': {}} - recessive_variant['genotypes']['proband'] = Genotype(**{'GT':'0/1'}) - recessive_variant['reduced_penetrance'] = True - - assert check_dominant( - variant = recessive_variant, - family = family - ) == True + + recessive_variant = {"genotypes": {}} + recessive_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/1"}) + recessive_variant["reduced_penetrance"] = True + + assert check_dominant(variant=recessive_variant, family=family) == True diff --git a/tests/genetic_models/test_x_dominant.py b/tests/genetic_models/test_x_dominant.py index d3f2e1e..337a1aa 100644 --- a/tests/genetic_models/test_x_dominant.py +++ b/tests/genetic_models/test_x_dominant.py @@ -1,286 +1,251 @@ from genmod.annotate_models.models import check_X_dominant from genmod.vcf_tools import Genotype - from ped_parser import FamilyParser FAMILY_FILE = "tests/fixtures/recessive_trio.ped" -def get_family(family_file = None, family_lines = None): - """Return a family object - - """ + +def get_family(family_file=None, family_lines=None): + """Return a family object""" family = None if family_file: - family = FamilyParser(open(family_file, 'r')) + family = FamilyParser(open(family_file, "r")) elif family_lines: family = FamilyParser(family_lines) - - return family + return family ################# Test affected ############### def test_x_affected_recessive_male(): - """Test a sick male - """ + """Test a sick male""" family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t2\n" + "1\tproband\t0\t0\t1\t2\n", ] - + family = get_family(family_lines=family_lines) - - recessive_variant = {'genotypes': {}} - recessive_variant['genotypes']['proband'] = Genotype(**{'GT':'0/1'}) - - assert check_X_dominant( - variant = recessive_variant, - family = family - ) == True + + recessive_variant = {"genotypes": {}} + recessive_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/1"}) + + assert check_X_dominant(variant=recessive_variant, family=family) == True + def test_x_affected_recessive_female(): """Test a sick heterozygote female - + Females needs to bo hom alt to follow pattern """ family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t2\t2\n" + "1\tproband\t0\t0\t2\t2\n", ] - + family = get_family(family_lines=family_lines) - - recessive_variant = {'genotypes': {}} - recessive_variant['genotypes']['proband'] = Genotype(**{'GT':'0/1'}) - - assert check_X_dominant( - variant = recessive_variant, - family = family - ) == True + + recessive_variant = {"genotypes": {}} + recessive_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/1"}) + + assert check_X_dominant(variant=recessive_variant, family=family) == True + def test_x_affected_homozygote_male(): """Test an affected homozygote male""" - + family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t2\n" + "1\tproband\t0\t0\t1\t2\n", ] - + family = get_family(family_lines=family_lines) - - homozygote_variant = {'genotypes': {}} - homozygote_variant['genotypes']['proband'] = Genotype(**{'GT':'1/1'}) - - assert check_X_dominant( - variant = homozygote_variant, - family = family - ) == True + + homozygote_variant = {"genotypes": {}} + homozygote_variant["genotypes"]["proband"] = Genotype(**{"GT": "1/1"}) + + assert check_X_dominant(variant=homozygote_variant, family=family) == True + def test_x_affected_homozygote_female(): """Test an affected homozygote male""" - + family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t2\t2\n" + "1\tproband\t0\t0\t2\t2\n", ] - + family = get_family(family_lines=family_lines) - - homozygote_variant = {'genotypes': {}} - homozygote_variant['genotypes']['proband'] = Genotype(**{'GT':'1/1'}) - - assert check_X_dominant( - variant = homozygote_variant, - family = family - ) == True + + homozygote_variant = {"genotypes": {}} + homozygote_variant["genotypes"]["proband"] = Genotype(**{"GT": "1/1"}) + + assert check_X_dominant(variant=homozygote_variant, family=family) == True + def test_x_affected_male_ref_call(): """Test an affected ref call male""" - + family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t2\n" + "1\tproband\t0\t0\t1\t2\n", ] - + family = get_family(family_lines=family_lines) - - homozygote_variant = {'genotypes': {}} - homozygote_variant['genotypes']['proband'] = Genotype(**{'GT':'0/0'}) - - assert check_X_dominant( - variant = homozygote_variant, - family = family - ) == False + + homozygote_variant = {"genotypes": {}} + homozygote_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/0"}) + + assert check_X_dominant(variant=homozygote_variant, family=family) == False + def test_x_affected_female_ref_call(): """Test an affected ref call male""" - + family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t2\t2\n" + "1\tproband\t0\t0\t2\t2\n", ] - + family = get_family(family_lines=family_lines) - - homozygote_variant = {'genotypes': {}} - homozygote_variant['genotypes']['proband'] = Genotype(**{'GT':'0/0'}) - - assert check_X_dominant( - variant = homozygote_variant, - family = family - ) == False - + + homozygote_variant = {"genotypes": {}} + homozygote_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/0"}) + + assert check_X_dominant(variant=homozygote_variant, family=family) == False + def test_x_affected_no_call_male(): """Test a sick male with no gt call - + This should be true since there is no information that contradicts the model """ family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t2\n" + "1\tproband\t0\t0\t1\t2\n", ] - + family = get_family(family_lines=family_lines) - - no_call_variant = {'genotypes': {}} - no_call_variant['genotypes']['proband'] = Genotype(**{'GT':'./.'}) - - assert check_X_dominant( - variant = no_call_variant, - family = family - ) == True + + no_call_variant = {"genotypes": {}} + no_call_variant["genotypes"]["proband"] = Genotype(**{"GT": "./."}) + + assert check_X_dominant(variant=no_call_variant, family=family) == True + def test_x_affected_no_call_male_strict(): """Test a sick male with no gt call - + This should not be true since we allways need 'proof' for an inheritance pattern if strict mode. """ family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t2\n" + "1\tproband\t0\t0\t1\t2\n", ] - + family = get_family(family_lines=family_lines) - - no_call_variant = {'genotypes': {}} - no_call_variant['genotypes']['proband'] = Genotype(**{'GT':'./.'}) - - assert check_X_dominant( - variant = no_call_variant, - family = family, - strict = True - ) == False + + no_call_variant = {"genotypes": {}} + no_call_variant["genotypes"]["proband"] = Genotype(**{"GT": "./."}) + + assert check_X_dominant(variant=no_call_variant, family=family, strict=True) == False + ############### Test healthy ############## + def test_x_healthy_recessive_male(): - """Test a healthy recessive male - """ + """Test a healthy recessive male""" family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t1\n" + "1\tproband\t0\t0\t1\t1\n", ] - + family = get_family(family_lines=family_lines) - - recessive_variant = {'genotypes': {}} - recessive_variant['genotypes']['proband'] = Genotype(**{'GT':'0/1'}) - - assert check_X_dominant( - variant = recessive_variant, - family = family - ) == False + + recessive_variant = {"genotypes": {}} + recessive_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/1"}) + + assert check_X_dominant(variant=recessive_variant, family=family) == False + def test_x_healthy_recessive_female(): """Test a healthy heterozygote female - + Females needs to bo hom alt to follow pattern """ family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t2\t1\n" + "1\tproband\t0\t0\t2\t1\n", ] - + family = get_family(family_lines=family_lines) - - recessive_variant = {'genotypes': {}} - recessive_variant['genotypes']['proband'] = Genotype(**{'GT':'0/1'}) - - assert check_X_dominant( - variant = recessive_variant, - family = family - ) == True + + recessive_variant = {"genotypes": {}} + recessive_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/1"}) + + assert check_X_dominant(variant=recessive_variant, family=family) == True + def test_x_healthy_homozygote_male(): """Test an healthy homozygote male""" - + family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t1\n" + "1\tproband\t0\t0\t1\t1\n", ] - + family = get_family(family_lines=family_lines) - - homozygote_variant = {'genotypes': {}} - homozygote_variant['genotypes']['proband'] = Genotype(**{'GT':'1/1'}) - - assert check_X_dominant( - variant = homozygote_variant, - family = family - ) == False + + homozygote_variant = {"genotypes": {}} + homozygote_variant["genotypes"]["proband"] = Genotype(**{"GT": "1/1"}) + + assert check_X_dominant(variant=homozygote_variant, family=family) == False + def test_x_healthy_homozygote_female(): """Test an healthy homozygote female""" - + family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t2\t1\n" + "1\tproband\t0\t0\t2\t1\n", ] - + family = get_family(family_lines=family_lines) - - homozygote_variant = {'genotypes': {}} - homozygote_variant['genotypes']['proband'] = Genotype(**{'GT':'1/1'}) - - assert check_X_dominant( - variant = homozygote_variant, - family = family - ) == False + + homozygote_variant = {"genotypes": {}} + homozygote_variant["genotypes"]["proband"] = Genotype(**{"GT": "1/1"}) + + assert check_X_dominant(variant=homozygote_variant, family=family) == False + def test_x_healthy_male_ref_call(): """Test an healthy ref call male""" - + family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t1\n" + "1\tproband\t0\t0\t1\t1\n", ] - + family = get_family(family_lines=family_lines) - - homozygote_variant = {'genotypes': {}} - homozygote_variant['genotypes']['proband'] = Genotype(**{'GT':'0/0'}) - - assert check_X_dominant( - variant = homozygote_variant, - family = family - ) == True + + homozygote_variant = {"genotypes": {}} + homozygote_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/0"}) + + assert check_X_dominant(variant=homozygote_variant, family=family) == True + def test_x_healthy_female_ref_call(): """Test an healthy female ref call""" - + family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t2\t1\n" + "1\tproband\t0\t0\t2\t1\n", ] - + family = get_family(family_lines=family_lines) - - homozygote_variant = {'genotypes': {}} - homozygote_variant['genotypes']['proband'] = Genotype(**{'GT':'0/0'}) - - assert check_X_dominant( - variant = homozygote_variant, - family = family - ) == True + + homozygote_variant = {"genotypes": {}} + homozygote_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/0"}) + + assert check_X_dominant(variant=homozygote_variant, family=family) == True diff --git a/tests/genetic_models/test_x_recessive.py b/tests/genetic_models/test_x_recessive.py index e5a5b0b..e3228b4 100644 --- a/tests/genetic_models/test_x_recessive.py +++ b/tests/genetic_models/test_x_recessive.py @@ -1,214 +1,187 @@ from genmod.annotate_models.models import check_X_recessive from genmod.vcf_tools import Genotype - from ped_parser import FamilyParser FAMILY_FILE = "tests/fixtures/recessive_trio.ped" -def get_family(family_file = None, family_lines = None): - """Return a family object - - """ + +def get_family(family_file=None, family_lines=None): + """Return a family object""" family = None if family_file: - family = FamilyParser(open(family_file, 'r')) + family = FamilyParser(open(family_file, "r")) elif family_lines: family = FamilyParser(family_lines) - - return family + return family ################# Test affected ############### def test_x_affected_recessive_male(): - """Test a sick male - """ + """Test a sick male""" family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t2\n" + "1\tproband\t0\t0\t1\t2\n", ] - + family = get_family(family_lines=family_lines) - - recessive_variant = {'genotypes': {}} - recessive_variant['genotypes']['proband'] = Genotype(**{'GT':'0/1'}) - - assert check_X_recessive( - variant = recessive_variant, - family = family - ) == True + + recessive_variant = {"genotypes": {}} + recessive_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/1"}) + + assert check_X_recessive(variant=recessive_variant, family=family) == True + def test_x_affected_recessive_female(): """Test a sick heterozygote female - + Females needs to bo hom alt to follow pattern """ family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t2\t2\n" + "1\tproband\t0\t0\t2\t2\n", ] - + family = get_family(family_lines=family_lines) - - recessive_variant = {'genotypes': {}} - recessive_variant['genotypes']['proband'] = Genotype(**{'GT':'0/1'}) - - assert check_X_recessive( - variant = recessive_variant, - family = family - ) == False + + recessive_variant = {"genotypes": {}} + recessive_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/1"}) + + assert check_X_recessive(variant=recessive_variant, family=family) == False + def test_x_affected_homozygote_male(): """Test an affected homozygote male""" - + family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t2\n" + "1\tproband\t0\t0\t1\t2\n", ] - + family = get_family(family_lines=family_lines) - - homozygote_variant = {'genotypes': {}} - homozygote_variant['genotypes']['proband'] = Genotype(**{'GT':'1/1'}) - - assert check_X_recessive( - variant = homozygote_variant, - family = family - ) == True + + homozygote_variant = {"genotypes": {}} + homozygote_variant["genotypes"]["proband"] = Genotype(**{"GT": "1/1"}) + + assert check_X_recessive(variant=homozygote_variant, family=family) == True + def test_x_affected_homozygote_female(): """Test an affected homozygote male""" - + family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t2\t2\n" + "1\tproband\t0\t0\t2\t2\n", ] - + family = get_family(family_lines=family_lines) - - homozygote_variant = {'genotypes': {}} - homozygote_variant['genotypes']['proband'] = Genotype(**{'GT':'1/1'}) - - assert check_X_recessive( - variant = homozygote_variant, - family = family - ) == True + + homozygote_variant = {"genotypes": {}} + homozygote_variant["genotypes"]["proband"] = Genotype(**{"GT": "1/1"}) + + assert check_X_recessive(variant=homozygote_variant, family=family) == True + def test_x_affected_male_ref_call(): """Test an affected ref call male""" - + family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t2\n" + "1\tproband\t0\t0\t1\t2\n", ] - + family = get_family(family_lines=family_lines) - - homozygote_variant = {'genotypes': {}} - homozygote_variant['genotypes']['proband'] = Genotype(**{'GT':'0/0'}) - - assert check_X_recessive( - variant = homozygote_variant, - family = family - ) == False + + homozygote_variant = {"genotypes": {}} + homozygote_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/0"}) + + assert check_X_recessive(variant=homozygote_variant, family=family) == False + def test_x_affected_female_ref_call(): """Test an affected ref call male""" - + family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t2\t2\n" + "1\tproband\t0\t0\t2\t2\n", ] - + family = get_family(family_lines=family_lines) - - homozygote_variant = {'genotypes': {}} - homozygote_variant['genotypes']['proband'] = Genotype(**{'GT':'0/0'}) - - assert check_X_recessive( - variant = homozygote_variant, - family = family - ) == False - + + homozygote_variant = {"genotypes": {}} + homozygote_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/0"}) + + assert check_X_recessive(variant=homozygote_variant, family=family) == False + def test_x_affected_no_call_male(): """Test a sick male with no gt call - + This should be true since there is no information that contradicts the model """ family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t2\n" + "1\tproband\t0\t0\t1\t2\n", ] - + family = get_family(family_lines=family_lines) - - no_call_variant = {'genotypes': {}} - no_call_variant['genotypes']['proband'] = Genotype(**{'GT':'./.'}) - - assert check_X_recessive( - variant = no_call_variant, - family = family - ) == True + + no_call_variant = {"genotypes": {}} + no_call_variant["genotypes"]["proband"] = Genotype(**{"GT": "./."}) + + assert check_X_recessive(variant=no_call_variant, family=family) == True + def test_x_affected_no_call_male_strict(): """Test a sick male with no gt call - + This should not be true since we allways need 'proof' for an inheritance pattern if strict mode. """ family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t2\n" + "1\tproband\t0\t0\t1\t2\n", ] - + family = get_family(family_lines=family_lines) - - no_call_variant = {'genotypes': {}} - no_call_variant['genotypes']['proband'] = Genotype(**{'GT':'./.'}) - - assert check_X_recessive( - variant = no_call_variant, - family = family, - strict = True - ) == False + + no_call_variant = {"genotypes": {}} + no_call_variant["genotypes"]["proband"] = Genotype(**{"GT": "./."}) + + assert check_X_recessive(variant=no_call_variant, family=family, strict=True) == False + ############### Test healthy ############## + def test_x_healthy_recessive_male(): - """Test a healthy recessive male - """ + """Test a healthy recessive male""" family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t1\t1\n" + "1\tproband\t0\t0\t1\t1\n", ] - + family = get_family(family_lines=family_lines) - - recessive_variant = {'genotypes': {}} - recessive_variant['genotypes']['proband'] = Genotype(**{'GT':'0/1'}) - - assert check_X_recessive( - variant = recessive_variant, - family = family - ) == False + + recessive_variant = {"genotypes": {}} + recessive_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/1"}) + + assert check_X_recessive(variant=recessive_variant, family=family) == False + def test_x_healthy_recessive_female(): """Test a healthy heterozygote female - + Females needs to bo hom alt to follow pattern """ family_lines = [ "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", - "1\tproband\t0\t0\t2\t1\n" + "1\tproband\t0\t0\t2\t1\n", ] - + family = get_family(family_lines=family_lines) - - recessive_variant = {'genotypes': {}} - recessive_variant['genotypes']['proband'] = Genotype(**{'GT':'0/1'}) - - assert check_X_recessive( - variant = recessive_variant, - family = family - ) == True + + recessive_variant = {"genotypes": {}} + recessive_variant["genotypes"]["proband"] = Genotype(**{"GT": "0/1"}) + + assert check_X_recessive(variant=recessive_variant, family=family) == True diff --git a/tests/score_variants/test_category_score.py b/tests/score_variants/test_category_score.py index 85fb411..c321bca 100644 --- a/tests/score_variants/test_category_score.py +++ b/tests/score_variants/test_category_score.py @@ -1,17 +1,18 @@ -import pytest -from typing import Union, Dict, Any from tempfile import NamedTemporaryFile -from configobj import ConfigObj +from typing import Any, Dict, Union +import pytest +from configobj import ConfigObj from genmod.score_variants import ConfigParser -from genmod.vcf_tools import get_info_dict from genmod.score_variants.score_variant import get_category_score +from genmod.vcf_tools import get_info_dict class ConfigObjWithNamedTemporaryFile(ConfigObj): """ Class that wraps a NamedTemporaryFile inside a ConfigObject """ + def __init__(self, named_temporary_file: NamedTemporaryFile, *args, **kwargs): """ Args: @@ -21,9 +22,7 @@ def __init__(self, named_temporary_file: NamedTemporaryFile, *args, **kwargs): self._file_pointer: NamedTemporaryFile = named_temporary_file self.filename: str = named_temporary_file.name - def add_category(self, - name: str, - aggregation_mode: str): + def add_category(self, name: str, aggregation_mode: str): """ Add a category to config file Args: @@ -32,22 +31,23 @@ def add_category(self, Returns: self for chaining """ - self['Categories'][name]: Dict = {} - self['Categories'][name]['category_aggregation'] = aggregation_mode + self["Categories"][name]: Dict = {} + self["Categories"][name]["category_aggregation"] = aggregation_mode self.write() return self - def add_plugin(self, - category: str, - plugin_name: str, - score_categories: Dict[str, Dict[str, float]], - info_key: str, - not_reported_score: float = 0.0, - field: str = 'INFO', - data_type: str = 'float', - record_rule: str = 'max', - separators: str = ',', - ): + def add_plugin( + self, + category: str, + plugin_name: str, + score_categories: Dict[str, Dict[str, float]], + info_key: str, + not_reported_score: float = 0.0, + field: str = "INFO", + data_type: str = "float", + record_rule: str = "max", + separators: str = ",", + ): """ Add plugin definition to scoring config. Arguments is 1:1 mapped to scoring config file. @@ -69,15 +69,15 @@ def add_plugin(self, self for chaining """ self[plugin_name]: Dict = {} - self[plugin_name]['field'] = field - self[plugin_name]['data_type'] = data_type - self[plugin_name]['category'] = category - self[plugin_name]['record_rule'] = record_rule - self[plugin_name]['separators'] = separators - self[plugin_name]['info_key'] = info_key + self[plugin_name]["field"] = field + self[plugin_name]["data_type"] = data_type + self[plugin_name]["category"] = category + self[plugin_name]["record_rule"] = record_rule + self[plugin_name]["separators"] = separators + self[plugin_name]["info_key"] = info_key - self[plugin_name]['not_reported']: Dict = {} - self[plugin_name]['not_reported']['score'] = not_reported_score + self[plugin_name]["not_reported"]: Dict = {} + self[plugin_name]["not_reported"]["score"] = not_reported_score for score_category in score_categories.keys(): self[plugin_name][score_category]: Dict = {} @@ -101,7 +101,7 @@ def set_info_dict_in_variant(variant: Dict[str, str]) -> Dict[str, Union[str, An Returns: variant with ['info_dict'] attribute """ - variant['info_dict']: Dict[str, str] = get_info_dict(variant['INFO']) + variant["info_dict"]: Dict[str, str] = get_info_dict(variant["INFO"]) return variant @@ -112,186 +112,181 @@ def config_file() -> ConfigObjWithNamedTemporaryFile: Returns: A minimal scoring config file as ConfigObjWithNamedTemporaryFile """ - named_temporary_file: NamedTemporaryFile = NamedTemporaryFile(dir='/tmp') - config: ConfigObjWithNamedTemporaryFile = \ - ConfigObjWithNamedTemporaryFile(named_temporary_file=named_temporary_file) - config['Version'] = {} - config['Version']['version'] = 0.1 - config['Version']['name'] = 'genmod example' - config['Categories'] = {} + named_temporary_file: NamedTemporaryFile = NamedTemporaryFile(dir="/tmp") + config: ConfigObjWithNamedTemporaryFile = ConfigObjWithNamedTemporaryFile( + named_temporary_file=named_temporary_file + ) + config["Version"] = {} + config["Version"]["version"] = 0.1 + config["Version"]["name"] = "genmod example" + config["Categories"] = {} return config @pytest.fixture def variant_0(): - variant = { - 'CHROM': '1', - 'POS': '1', - 'INFO': 'plugin_a=0.01;plugin_b=5.0' - } + variant = {"CHROM": "1", "POS": "1", "INFO": "plugin_a=0.01;plugin_b=5.0"} variant = set_info_dict_in_variant(variant) return variant -@pytest.mark.parametrize('aggregation_mode,expected', - [('sum', (2, 0, 2)), - ('max', (2, 0, 2)), - ('min', (2, 0, 2))]) +@pytest.mark.parametrize( + "aggregation_mode,expected", [("sum", (2, 0, 2)), ("max", (2, 0, 2)), ("min", (2, 0, 2))] +) def test_single_category_single_plugin(config_file, variant_0, aggregation_mode, expected): """ Test with a single category. """ # GIVEN a scoring config in mode aggregation_mode - config_file.add_category('CategoryA', aggregation_mode) - config_file.add_plugin('CategoryA', - 'PluginA', - {'rare': {'score': 2.0, 'lower': 0.01, 'upper': 0.05}}, - 'plugin_a') + config_file.add_category("CategoryA", aggregation_mode) + config_file.add_plugin( + "CategoryA", "PluginA", {"rare": {"score": 2.0, "lower": 0.01, "upper": 0.05}}, "plugin_a" + ) config_parser: ConfigParser = ConfigParser(config_file.filename) # WHEN scoring a variant - score, min, max = get_category_score(variant=variant_0, - category='CategoryA', - config_parser=config_parser) + score, min, max = get_category_score( + variant=variant_0, category="CategoryA", config_parser=config_parser + ) # THEN expect proper score and min max bounds assert score == expected[0] assert min == expected[1] assert max == expected[2] -@pytest.mark.parametrize('aggregation_mode,expected', - [('sum', (2, -5, 2)), - ('max', (2, -5, 2)), - ('min', (2, -5, 2))]) -def test_single_category_single_plugin_notreported(config_file, variant_0, aggregation_mode, expected): +@pytest.mark.parametrize( + "aggregation_mode,expected", [("sum", (2, -5, 2)), ("max", (2, -5, 2)), ("min", (2, -5, 2))] +) +def test_single_category_single_plugin_notreported( + config_file, variant_0, aggregation_mode, expected +): """ Test with a single category, with a custom not reported score. """ # GIVEN a scoring config in mode aggregation_mode - config_file.add_category('CategoryA', aggregation_mode) - config_file.add_plugin('CategoryA', - 'PluginA', - {'rare': {'score': 2.0, 'lower': 0.01, 'upper': 0.05}}, - 'plugin_a', - not_reported_score=-5) + config_file.add_category("CategoryA", aggregation_mode) + config_file.add_plugin( + "CategoryA", + "PluginA", + {"rare": {"score": 2.0, "lower": 0.01, "upper": 0.05}}, + "plugin_a", + not_reported_score=-5, + ) config_parser: ConfigParser = ConfigParser(config_file.filename) # WHEN scoring a variant - score, min, max = get_category_score(variant=variant_0, - category='CategoryA', - config_parser=config_parser) + score, min, max = get_category_score( + variant=variant_0, category="CategoryA", config_parser=config_parser + ) # THEN expect proper score and min max bounds assert score == expected[0] assert min == expected[1] assert max == expected[2] -@pytest.mark.parametrize('aggregation_mode,expected', - [('sum', (3, 0, 3)), - ('max', (2, 0, 2)), - ('min', (1, 0, 1))]) +@pytest.mark.parametrize( + "aggregation_mode,expected", [("sum", (3, 0, 3)), ("max", (2, 0, 2)), ("min", (1, 0, 1))] +) def test_single_category_two_plugin(config_file, variant_0, aggregation_mode, expected): """ Test two categories. """ # GIVEN a scoring config in mode aggregation_mode - config_file.add_category('CategoryA', aggregation_mode) - config_file.add_plugin('CategoryA', - 'PluginA', - {'rare': {'score': 2.0, 'lower': 0.01, 'upper': 0.05}}, - 'plugin_a') - config_file.add_plugin('CategoryA', - 'PluginB', - {'rare': {'score': 1.0, 'lower': 4.0, 'upper': 6.0}}, - 'plugin_b') + config_file.add_category("CategoryA", aggregation_mode) + config_file.add_plugin( + "CategoryA", "PluginA", {"rare": {"score": 2.0, "lower": 0.01, "upper": 0.05}}, "plugin_a" + ) + config_file.add_plugin( + "CategoryA", "PluginB", {"rare": {"score": 1.0, "lower": 4.0, "upper": 6.0}}, "plugin_b" + ) config_parser: ConfigParser = ConfigParser(config_file.filename) # WHEN scoring a variant - score, min, max = get_category_score(variant=variant_0, - category='CategoryA', - config_parser=config_parser) + score, min, max = get_category_score( + variant=variant_0, category="CategoryA", config_parser=config_parser + ) # THEN expect proper score and min max bounds assert score == expected[0] assert min == expected[1] assert max == expected[2] -@pytest.mark.parametrize('aggregation_mode,expected', - [('sum', (-3, -5, 2)), - ('max', (2, 0, 2)), - ('min', (-5, -5, 0))]) +@pytest.mark.parametrize( + "aggregation_mode,expected", [("sum", (-3, -5, 2)), ("max", (2, 0, 2)), ("min", (-5, -5, 0))] +) def test_single_category_two_plugin(config_file, variant_0, aggregation_mode, expected): """ Test two categories, negative score range. """ # GIVEN a scoring config in mode aggregation_mode - config_file.add_category('CategoryA', aggregation_mode) - config_file.add_plugin('CategoryA', - 'PluginA', - {'rare': {'score': 2, 'lower': 0.01, 'upper': 0.05}}, - 'plugin_a') - config_file.add_plugin('CategoryA', - 'PluginB', - {'rare': {'score': -5, 'lower': 4.0, 'upper': 6.0}}, - 'plugin_b') + config_file.add_category("CategoryA", aggregation_mode) + config_file.add_plugin( + "CategoryA", "PluginA", {"rare": {"score": 2, "lower": 0.01, "upper": 0.05}}, "plugin_a" + ) + config_file.add_plugin( + "CategoryA", "PluginB", {"rare": {"score": -5, "lower": 4.0, "upper": 6.0}}, "plugin_b" + ) config_parser: ConfigParser = ConfigParser(config_file.filename) # WHEN scoring a variant - score, min, max = get_category_score(variant=variant_0, - category='CategoryA', - config_parser=config_parser) + score, min, max = get_category_score( + variant=variant_0, category="CategoryA", config_parser=config_parser + ) # THEN expect proper score and min max bounds assert score == expected[0] assert min == expected[1] assert max == expected[2] -@pytest.mark.parametrize('aggregation_mode,expected', - [('sum', (2, 0, 2)), - ('max', (2, 0, 2)), - ('min', (2, 0, 2))]) +@pytest.mark.parametrize( + "aggregation_mode,expected", [("sum", (2, 0, 2)), ("max", (2, 0, 2)), ("min", (2, 0, 2))] +) def test_multi_category_two_plugins(config_file, variant_0, aggregation_mode, expected): """ Test with a single category, shall ignore other category """ # GIVEN a scoring config in mode aggregation_mode - config_file.add_category('CategoryA', aggregation_mode) - config_file.add_category('CategoryB', aggregation_mode) - config_file.add_plugin('CategoryA', - 'PluginA', - {'rare': {'score': 2.0, 'lower': 0.01, 'upper': 0.05}}, - 'plugin_a') - config_file.add_plugin('CategoryB', - 'PluginB', - {'rare': {'score': 1.0, 'lower': 4.0, 'upper': 6.0}}, - 'plugin_b') + config_file.add_category("CategoryA", aggregation_mode) + config_file.add_category("CategoryB", aggregation_mode) + config_file.add_plugin( + "CategoryA", "PluginA", {"rare": {"score": 2.0, "lower": 0.01, "upper": 0.05}}, "plugin_a" + ) + config_file.add_plugin( + "CategoryB", "PluginB", {"rare": {"score": 1.0, "lower": 4.0, "upper": 6.0}}, "plugin_b" + ) config_parser: ConfigParser = ConfigParser(config_file.filename) # WHEN scoring a variant, expect only from category A - score, min, max = get_category_score(variant=variant_0, - category='CategoryA', - config_parser=config_parser) + score, min, max = get_category_score( + variant=variant_0, category="CategoryA", config_parser=config_parser + ) # THEN expect proper score and min max bounds assert score == expected[0] assert min == expected[1] assert max == expected[2] -@pytest.mark.parametrize('aggregation_mode,expected', - [('sum', (10, 0, 10)), - ('max', (10, 0, 10)), - ('min', (10, 0, 10))]) -def test_single_category_single_plugin_multirange(config_file, variant_0, aggregation_mode, expected): + +@pytest.mark.parametrize( + "aggregation_mode,expected", [("sum", (10, 0, 10)), ("max", (10, 0, 10)), ("min", (10, 0, 10))] +) +def test_single_category_single_plugin_multirange( + config_file, variant_0, aggregation_mode, expected +): """ Test with a single category, having multiple score ranges. """ # GIVEN a scoring config in mode aggregation_mode - config_file.add_category('CategoryA', aggregation_mode) - config_file.add_plugin('CategoryA', - 'PluginA', - {'rare': {'score': 10.0, 'lower': 0.01, 'upper': 0.05}, - 'common': {'score': 1.0, 'lower': 0.06, 'upper': 1.0}}, - 'plugin_a') + config_file.add_category("CategoryA", aggregation_mode) + config_file.add_plugin( + "CategoryA", + "PluginA", + { + "rare": {"score": 10.0, "lower": 0.01, "upper": 0.05}, + "common": {"score": 1.0, "lower": 0.06, "upper": 1.0}, + }, + "plugin_a", + ) config_parser: ConfigParser = ConfigParser(config_file.filename) # WHEN scoring a variant - score, min, max = get_category_score(variant=variant_0, - category='CategoryA', - config_parser=config_parser) + score, min, max = get_category_score( + variant=variant_0, category="CategoryA", config_parser=config_parser + ) # THEN expect proper score and min max bounds assert score == expected[0] assert min == expected[1] - assert max == expected[2] \ No newline at end of file + assert max == expected[2] diff --git a/tests/score_variants/test_config_parser.py b/tests/score_variants/test_config_parser.py index 2506bdf..0050154 100644 --- a/tests/score_variants/test_config_parser.py +++ b/tests/score_variants/test_config_parser.py @@ -10,72 +10,82 @@ def test_config_parser(): """Test the config parser""" config_reader = ConfigParser(CONFIG) - - assert set(config_reader.plugins) == set(['1000G', 'CADD', 'GeneticModels','CLNSIG']) - assert set(config_reader.categories.keys()) == set(['allele_frequency', - 'deleteriousness', 'inheritance', 'clinical_significance']) - assert set(config_reader.categories['allele_frequency'].keys()) == set(['category_aggregation', 'plugins']) - assert set(config_reader.categories['allele_frequency']['plugins']) == set(['1000G']) + + assert set(config_reader.plugins) == set(["1000G", "CADD", "GeneticModels", "CLNSIG"]) + assert set(config_reader.categories.keys()) == set( + ["allele_frequency", "deleteriousness", "inheritance", "clinical_significance"] + ) + assert set(config_reader.categories["allele_frequency"].keys()) == set( + ["category_aggregation", "plugins"] + ) + assert set(config_reader.categories["allele_frequency"]["plugins"]) == set(["1000G"]) + def test_get_score(): """Test the config parser""" config_reader = ConfigParser(CONFIG) - - variant = {'info_dict':{ - '1000GAF': '0.1', - 'CADD': '12' - }} - - assert config_reader.plugins['1000G'].get_value(variant_dict=variant) == 0.1 - assert config_reader.plugins['CADD'].get_value(variant_dict=variant) == 12 - - assert config_reader.score_functions['1000G'].get_score(0.01) == 1.0 - assert config_reader.score_functions['1000G'].get_score(0.001) == 2.0 - assert config_reader.score_functions['1000G'].get_score(None) == 3.0 + + variant = {"info_dict": {"1000GAF": "0.1", "CADD": "12"}} + + assert config_reader.plugins["1000G"].get_value(variant_dict=variant) == 0.1 + assert config_reader.plugins["CADD"].get_value(variant_dict=variant) == 12 + + assert config_reader.score_functions["1000G"].get_score(0.01) == 1.0 + assert config_reader.score_functions["1000G"].get_score(0.001) == 2.0 + assert config_reader.score_functions["1000G"].get_score(None) == 3.0 + def test_get_score_string(): """Test the config parser""" config_reader = ConfigParser(CONFIG) - - variant = {'info_dict':{ - '1000GAF': '0.1', - 'CADD': '12', - 'GeneticModels': '1:AD|AD_dn', - }} - - assert config_reader.plugins['GeneticModels'].get_value(variant_dict=variant) == "AD" - - assert config_reader.score_functions['GeneticModels'].get_score("AD") == 3.0 - assert config_reader.score_functions['GeneticModels'].get_score("AD_dn") == 2.0 - assert config_reader.score_functions['GeneticModels'].get_score(None) == -12 + + variant = { + "info_dict": { + "1000GAF": "0.1", + "CADD": "12", + "GeneticModels": "1:AD|AD_dn", + } + } + + assert config_reader.plugins["GeneticModels"].get_value(variant_dict=variant) == "AD" + + assert config_reader.score_functions["GeneticModels"].get_score("AD") == 3.0 + assert config_reader.score_functions["GeneticModels"].get_score("AD_dn") == 2.0 + assert config_reader.score_functions["GeneticModels"].get_score(None) == -12 + def test_get_score_value(): """Test the config parser""" config_reader = ConfigParser(CONFIG) - - variant = {'info_dict':{ - '1000GAF': '0.1', - 'CADD': '12', - 'CLNSIG': '2', - }} - - assert config_reader.plugins['CLNSIG'].get_value(variant_dict=variant) == 2 - assert config_reader.score_functions['CLNSIG'].get_score(2) == -1.0 + variant = { + "info_dict": { + "1000GAF": "0.1", + "CADD": "12", + "CLNSIG": "2", + } + } + + assert config_reader.plugins["CLNSIG"].get_value(variant_dict=variant) == 2 + + assert config_reader.score_functions["CLNSIG"].get_score(2) == -1.0 + def test_get_score_value_multiple_values(): """Test the config parser""" config_reader = ConfigParser(CONFIG) - - variant = {'info_dict':{ - '1000GAF': '0.1', - 'CADD': '12', - 'CLNSIG': '2|5', - }} - - assert config_reader.plugins['CLNSIG'].get_raw_entry(variant_dict=variant) == '2|5' - assert config_reader.plugins['CLNSIG'].get_entry(variant_dict=variant) == ['2','5'] - - assert config_reader.plugins['CLNSIG'].get_value(variant_dict=variant) == 5 - - assert config_reader.score_functions['CLNSIG'].get_score(5) == 2 + + variant = { + "info_dict": { + "1000GAF": "0.1", + "CADD": "12", + "CLNSIG": "2|5", + } + } + + assert config_reader.plugins["CLNSIG"].get_raw_entry(variant_dict=variant) == "2|5" + assert config_reader.plugins["CLNSIG"].get_entry(variant_dict=variant) == ["2", "5"] + + assert config_reader.plugins["CLNSIG"].get_value(variant_dict=variant) == 5 + + assert config_reader.score_functions["CLNSIG"].get_score(5) == 2 diff --git a/tests/score_variants/test_rankscore_capping.py b/tests/score_variants/test_rankscore_capping.py index 90637a1..cfb55ef 100644 --- a/tests/score_variants/test_rankscore_capping.py +++ b/tests/score_variants/test_rankscore_capping.py @@ -1,5 +1,7 @@ -from genmod.score_variants.cap_rank_score_to_min_bound import cap_rank_score_to_min_bound, MIN_SCORE_NORMALIZED - +from genmod.score_variants.cap_rank_score_to_min_bound import ( + MIN_SCORE_NORMALIZED, + cap_rank_score_to_min_bound, +) MIN_SCORE: float = -5.0 @@ -12,9 +14,14 @@ def test_rankscore_normalized_capping(): # WHEN running cap method # THEN expect rank score to be larger than min bound for rank_score_normalized in range(-10, 10): - assert cap_rank_score_to_min_bound(rank_score_type='RankScoreNormalized', - rank_score=float(rank_score_normalized), - min_rank_score_value=MIN_SCORE_NORMALIZED) >= MIN_SCORE_NORMALIZED + assert ( + cap_rank_score_to_min_bound( + rank_score_type="RankScoreNormalized", + rank_score=float(rank_score_normalized), + min_rank_score_value=MIN_SCORE_NORMALIZED, + ) + >= MIN_SCORE_NORMALIZED + ) def test_rankscore_capping(): @@ -26,6 +33,9 @@ def test_rankscore_capping(): # WHEN running cap method # THEN expect rank score to be larger than min bound for rank_score in range(-10, 10): - assert cap_rank_score_to_min_bound(rank_score_type='RankScore', - rank_score=rank_score, - min_rank_score_value=MIN_SCORE) >= MIN_SCORE + assert ( + cap_rank_score_to_min_bound( + rank_score_type="RankScore", rank_score=rank_score, min_rank_score_value=MIN_SCORE + ) + >= MIN_SCORE + ) diff --git a/tests/score_variants/test_score_function.py b/tests/score_variants/test_score_function.py index 4a701c8..0a6da42 100644 --- a/tests/score_variants/test_score_function.py +++ b/tests/score_variants/test_score_function.py @@ -1,40 +1,38 @@ import pytest from genmod.score_variants import ScoreFunction + def test_string_score(): """Test the score function with a string function""" - + not_reported_score = 4 - string_dict = { - 'hello': 1, - 'world': 2 - } + string_dict = {"hello": 1, "world": 2} + + score_function = ScoreFunction(match_type="string") - score_function = ScoreFunction(match_type = 'string') - for key in string_dict: score_function.add_string_rule(key, string_dict[key]) score_function.set_not_reported(not_reported_score) - - assert score_function.get_score('hello') == 1 - assert score_function.get_score('world') == 2 + + assert score_function.get_score("hello") == 1 + assert score_function.get_score("world") == 2 assert score_function.get_score(None) == not_reported_score - assert score_function.get_score('non_existing') == 0 + assert score_function.get_score("non_existing") == 0 def test_int_score(): """Test the score function with a integer""" - + not_reported_score = 1 - - score_function = ScoreFunction(match_type = 'integer') + + score_function = ScoreFunction(match_type="integer") score_function.add_interval(lower=0, upper=10, score=1) score_function.add_interval(lower=10, upper=15, score=2) score_function.add_interval(lower=15, upper=20, score=3) score_function.set_not_reported(not_reported_score) - + assert score_function.get_score(3) == 1 assert score_function.get_score(12) == 2 assert score_function.get_score(15) == 3 @@ -42,45 +40,47 @@ def test_int_score(): assert score_function.get_score(None) == not_reported_score assert score_function.get_score(-3) == 0 + def test_eq_score(): """Test the score function when the score found should be returned""" - + not_reported_score = 1 - - score_function = ScoreFunction(match_type='integer', equal=True) - + + score_function = ScoreFunction(match_type="integer", equal=True) + score_function.set_not_reported(not_reported_score) - + assert score_function.get_score(3) == 3 assert score_function.get_score(12) == 12 assert score_function.get_score(None) == not_reported_score + def test_score_mode_user_defined_range(): - """ Test score mode with user defined override. """ + """Test score mode with user defined override.""" # GIVEN a score function with user defined rank score override (missing plugin defined min-max scores) - score_function = ScoreFunction(match_type='integer', equal=True) + score_function = ScoreFunction(match_type="integer", equal=True) with pytest.raises(ValueError) as error: # WHEN trying to get the score range # THEN expect this to trigger an ValueError _ = score_function.score_range - assert 'User supplied score values does not have a known score range' in str(error.value) + assert "User supplied score values does not have a known score range" in str(error.value) # GIVEN a score function with user defined rank score override (missing plugin defined min-max scores) - score_function = ScoreFunction(match_type='integer') + score_function = ScoreFunction(match_type="integer") score_function.set_equal() with pytest.raises(ValueError) as error: # WHEN trying to get the score range # THEN expect this to trigger a ValueError _ = score_function.score_range - assert 'User supplied score values does not have a known score range' in str(error.value) + assert "User supplied score values does not have a known score range" in str(error.value) def test_score_mode_invalid_double_lookup(): - """ Test ScoreFunction sanity check when using multiple scoring maps. """ + """Test ScoreFunction sanity check when using multiple scoring maps.""" # GIVEN a score function with multiple maps - score_function = ScoreFunction(match_type='integer') + score_function = ScoreFunction(match_type="integer") score_function.set_not_reported(-100) score_function.add_interval(-10, -1, 0.1) score_function.add_value(0, 0) @@ -88,13 +88,15 @@ def test_score_mode_invalid_double_lookup(): # WHEN trying to get a plugin min or max bounds # THEN expect this to trigger a ValueError _ = score_function.score_min - assert 'Unable to accurately determine what mapping to use for determining score range' in str(error.value) + assert "Unable to accurately determine what mapping to use for determining score range" in str( + error.value + ) def test_score_mode_tree_lookup(): - """ Test ScoreFunctions min max bounds property. """ + """Test ScoreFunctions min max bounds property.""" # GIVEN a score function with a tree (range) map - score_function = ScoreFunction(match_type='integer') + score_function = ScoreFunction(match_type="integer") score_function.set_not_reported(-100) score_function.add_interval(-10, -1, 0.1) score_function.add_interval(0, 10, 0.5) @@ -106,9 +108,9 @@ def test_score_mode_tree_lookup(): def test_score_mode_value_lookup(): - """ Test ScoreFunctions min max bounds property. """ + """Test ScoreFunctions min max bounds property.""" # GIVEN a score function with value dict map - score_function = ScoreFunction(match_type='value') + score_function = ScoreFunction(match_type="value") score_function.set_not_reported(-100.0) score_function.add_value(0, 0) score_function.add_value(1, 1) @@ -121,14 +123,14 @@ def test_score_mode_value_lookup(): def test_score_mode_string_lookup(): - """ Test ScoreFunctions min max bounds property. """ + """Test ScoreFunctions min max bounds property.""" # GIVEN a score function with string dict map - score_function = ScoreFunction(match_type='string') + score_function = ScoreFunction(match_type="string") score_function.set_not_reported(-100) - score_function.add_string_rule('foo', 0) - score_function.add_string_rule('bar', 1) - score_function.add_string_rule('0xdead', 2) + score_function.add_string_rule("foo", 0) + score_function.add_string_rule("bar", 1) + score_function.add_string_rule("0xdead", 2) # WHEN accessing min max plugin score # THEN expect the proper min max values assert score_function.score_min == -100 - assert score_function.score_max == 2 \ No newline at end of file + assert score_function.score_max == 2 diff --git a/tests/utils/test_check_individuals.py b/tests/utils/test_check_individuals.py index e722e5f..373848d 100644 --- a/tests/utils/test_check_individuals.py +++ b/tests/utils/test_check_individuals.py @@ -1,14 +1,14 @@ +import pytest from genmod.utils import check_individuals -import pytest def test_correct(): """ Test if return True when the individuals exist """ - ped_individuals = ['1','2','3'] - vcf_individuals = ['1','2','3'] - + ped_individuals = ["1", "2", "3"] + vcf_individuals = ["1", "2", "3"] + assert check_individuals(ped_individuals, vcf_individuals) @@ -16,8 +16,8 @@ def test_wrong(): """ Test if raise error when the individuals not exist """ - ped_individuals = ['1','2','3', '4'] - vcf_individuals = ['1','2','3'] - + ped_individuals = ["1", "2", "3", "4"] + vcf_individuals = ["1", "2", "3"] + with pytest.raises(IOError): - check_individuals(ped_individuals, vcf_individuals) \ No newline at end of file + check_individuals(ped_individuals, vcf_individuals) diff --git a/tests/utils/test_check_vep_annotation.py b/tests/utils/test_check_vep_annotation.py index 5d9f92c..9a5f07e 100644 --- a/tests/utils/test_check_vep_annotation.py +++ b/tests/utils/test_check_vep_annotation.py @@ -1,37 +1,27 @@ from genmod.utils import check_vep_annotation + def test_get_none(): """ Test to get wrong annotation terms """ - - vep_variant = {'vep_info':{ - 'A':[ - { - 'Consequence': 'Two&Different', - 'Gene': 'ADK' - } - ] - } - } - + + vep_variant = {"vep_info": {"A": [{"Consequence": "Two&Different", "Gene": "ADK"}]}} + # The result should be empty since the terms do not exist assert check_vep_annotation(vep_variant) == set() + def test_get_annotation(): """ Test to get true annotation terms """ - - vep_variant = {'vep_info':{ - 'A':[ - { - 'Consequence': 'transcript_ablation&splice_donor_variant', - 'Gene': 'ADK' - } - ] + + vep_variant = { + "vep_info": { + "A": [{"Consequence": "transcript_ablation&splice_donor_variant", "Gene": "ADK"}] } } - + # The result should be empty since the terms do not exist - assert check_vep_annotation(vep_variant) == set(['ADK']) \ No newline at end of file + assert check_vep_annotation(vep_variant) == set(["ADK"]) diff --git a/tests/utils/test_generate_pairs.py b/tests/utils/test_generate_pairs.py index e8ac3db..60cb0b9 100644 --- a/tests/utils/test_generate_pairs.py +++ b/tests/utils/test_generate_pairs.py @@ -1,15 +1,16 @@ +import pytest from genmod.utils import generate_pairs -import pytest def test_generate_pairs(): """Test if generate pairs behave as suspected""" - objects = [1,2] + objects = [1, 2] pairs = [] for pair in generate_pairs(objects): pairs.append(pair) - - assert pairs == [(1,2)] + + assert pairs == [(1, 2)] + def test_non_iterator(): """Test if generate pairs behave as suspected""" @@ -19,6 +20,7 @@ def test_non_iterator(): for pair in generate_pairs(objects): pairs.append(pair) + def test_one_object(): """Test if generate pairs behave as suspected""" objects = [1] @@ -27,14 +29,15 @@ def test_one_object(): for pair in generate_pairs(objects): pairs.append(pair) + def test_generate_multiple_pairs(): """Test if generate pairs behave as suspected""" - objects = [1,2,3,4] + objects = [1, 2, 3, 4] pairs = [] for pair in generate_pairs(objects): pairs.append(pair) - + assert len(pairs) == 6 - assert pairs[0] == (1,2) - assert pairs[1] == (1,3) - assert pairs[-1] == (3,4) + assert pairs[0] == (1, 2) + assert pairs[1] == (1, 3) + assert pairs[-1] == (3, 4) diff --git a/tests/utils/test_get_annotation.py b/tests/utils/test_get_annotation.py index 9ed24eb..1cf95dd 100644 --- a/tests/utils/test_get_annotation.py +++ b/tests/utils/test_get_annotation.py @@ -1,66 +1,60 @@ -from genmod.utils import (get_annotation) - import pytest +from genmod.utils import get_annotation + -def get_variant(chrom='1', pos='1', ref='A', alt='G', annotation=["ADK"]): +def get_variant(chrom="1", pos="1", ref="A", alt="G", annotation=["ADK"]): """ Return a variant dictionary """ - variant_id = '_'.join([chrom, pos, ref, alt]) + variant_id = "_".join([chrom, pos, ref, alt]) variant = { - "CHROM":chrom, - "POS":pos, - "INFO":"Annotation={0}".format(annotation), - 'info_dict':{ - "Annotation":','.join(annotation), + "CHROM": chrom, + "POS": pos, + "INFO": "Annotation={0}".format(annotation), + "info_dict": { + "Annotation": ",".join(annotation), }, - "variant_id": variant_id + "variant_id": variant_id, } return variant -def get_vep_variant(chrom='1', pos='1', ref='A', alt='G', annotation="ADK"): + +def get_vep_variant(chrom="1", pos="1", ref="A", alt="G", annotation="ADK"): """ Return a variant dictionary """ - variant_id = '_'.join([chrom, pos, ref, alt]) + variant_id = "_".join([chrom, pos, ref, alt]) variant = { - "CHROM":chrom, - "POS":pos, - "INFO":"Annotation={0}".format(annotation), - 'vep_info':{ - 'A': [{ - "Gene": annotation, - "Consequence": 'transcript_ablation' - }] - }, - "variant_id": variant_id + "CHROM": chrom, + "POS": pos, + "INFO": "Annotation={0}".format(annotation), + "vep_info": {"A": [{"Gene": annotation, "Consequence": "transcript_ablation"}]}, + "variant_id": variant_id, } return variant def test_empty(): """Test if get_features behave as suspected""" - variant = { - 'CHROM': '1', - 'POS': '12', - 'ALT': 'A' - } - + variant = {"CHROM": "1", "POS": "12", "ALT": "A"} + assert get_annotation(variant) == set() + def test_simple(): """Test if get_annotation behave as suspected""" variant = get_variant() - assert get_annotation(variant) == set(['ADK']) + assert get_annotation(variant) == set(["ADK"]) + def test_double_region(): """Test if get_annotation behave as suspected""" variant = get_variant(annotation=["ADK", "DDD"]) assert get_annotation(variant) == set(["ADK", "DDD"]) + def test_get_vep_region(): """docstring for test_get_vep_region""" variant = get_vep_variant() assert get_annotation(variant, vep=True) == set(["ADK"]) - diff --git a/tests/utils/test_get_batches.py b/tests/utils/test_get_batches.py index 1216de6..fdfc377 100644 --- a/tests/utils/test_get_batches.py +++ b/tests/utils/test_get_batches.py @@ -1,5 +1,6 @@ from genmod.utils import get_batches from genmod.vcf_tools import HeaderParser + try: from Queue import Queue except ImportError: @@ -7,17 +8,27 @@ HEADER = "CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO" -def get_variant_line(chrom='1', pos='1', db_id='.', ref='A', alt='G', -qual='100', filt='PASS', info="Annotation=ADK;Exonic"): + +def get_variant_line( + chrom="1", + pos="1", + db_id=".", + ref="A", + alt="G", + qual="100", + filt="PASS", + info="Annotation=ADK;Exonic", +): """ Return a variant dictionary """ variant_line = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t".format( chrom, pos, db_id, ref, alt, qual, filt, info ) - + return variant_line + def test_get_batches_one(): """ Test to get a batch @@ -27,16 +38,16 @@ def test_get_batches_one(): first_variant = get_variant_line() header = HeaderParser() header.parse_header_line("#{0}".format(HEADER)) - + variants.append(first_variant) - - chromosomes = get_batches(variants=variants, batch_queue=batch_queue, - header=header) + + chromosomes = get_batches(variants=variants, batch_queue=batch_queue, header=header) batch = batch_queue.get() - - assert chromosomes == ['1'] + + assert chromosomes == ["1"] assert len(batch) == 1 + def test_get_batches_two(): """ Test to get a batch @@ -49,12 +60,11 @@ def test_get_batches_two(): variants.append(second_variant) header = HeaderParser() header.parse_header_line("#{0}".format(HEADER)) - - chromosomes = get_batches(variants=variants, batch_queue=batch_queue, - header=header) + + chromosomes = get_batches(variants=variants, batch_queue=batch_queue, header=header) batch = batch_queue.get() - - assert chromosomes == ['1'] + + assert chromosomes == ["1"] assert len(batch) == 2 @@ -68,140 +78,138 @@ def test_get_batches_two_regions(): second_variant = get_variant_line(pos="2", info="Annotation=DDD;Exonic") variants.append(first_variant) variants.append(second_variant) - + header = HeaderParser() header.parse_header_line("#{0}".format(HEADER)) - - chromosomes = get_batches(variants=variants, batch_queue=batch_queue, - header=header) + + chromosomes = get_batches(variants=variants, batch_queue=batch_queue, header=header) batch_1 = batch_queue.get() batch_queue.task_done() - + batch_2 = batch_queue.get() batch_queue.task_done() - - assert chromosomes == ['1'] + + assert chromosomes == ["1"] assert len(batch_1) == 1 assert len(batch_2) == 1 + def test_get_batches_vep(): """ Test to get a batch """ batch_queue = Queue() variants = [] - + first_variant = get_variant_line(info="MQ;CSQ=G|ADK") - + second_variant = get_variant_line(pos="2", info="MQ;CSQ=G|ADK") - + variants.append(first_variant) variants.append(second_variant) header = HeaderParser() header.parse_header_line("#{0}".format(HEADER)) - header.vep_columns = ['Allele', 'SYMBOL'] - - chromosomes = get_batches(variants=variants, batch_queue=batch_queue, - header=header) - + header.vep_columns = ["Allele", "SYMBOL"] + + chromosomes = get_batches(variants=variants, batch_queue=batch_queue, header=header) + batch_1 = batch_queue.get() batch_queue.task_done() - + batch_2 = batch_queue.get() batch_queue.task_done() - - assert chromosomes == ['1'] + + assert chromosomes == ["1"] assert len(batch_1) == 1 assert len(batch_2) == 1 + def test_get_batches_vep_no_allele(): """ Test to get a batch """ batch_queue = Queue() variants = [] - + first_variant = get_variant_line(info="MQ;CSQ=ADK") - + second_variant = get_variant_line(pos="2", info="MQ;CSQ=ADK") - + variants.append(first_variant) variants.append(second_variant) header = HeaderParser() header.parse_header_line("#{0}".format(HEADER)) - header.vep_columns = ['SYMBOL'] - - chromosomes = get_batches(variants=variants, batch_queue=batch_queue, - header=header) - + header.vep_columns = ["SYMBOL"] + + chromosomes = get_batches(variants=variants, batch_queue=batch_queue, header=header) + batch_1 = batch_queue.get() batch_queue.task_done() - + batch_2 = batch_queue.get() batch_queue.task_done() - - assert chromosomes == ['1'] + + assert chromosomes == ["1"] assert len(batch_1) == 1 assert len(batch_2) == 1 + def test_get_batches_no_regions(): """ Test to get a batch """ batch_queue = Queue() variants = [] - + first_variant = get_variant_line(info="MQ") - + second_variant = get_variant_line(pos="2") - + variants.append(first_variant) variants.append(second_variant) header = HeaderParser() header.parse_header_line("#{0}".format(HEADER)) - - chromosomes = get_batches(variants=variants, batch_queue=batch_queue, - header=header) - + + chromosomes = get_batches(variants=variants, batch_queue=batch_queue, header=header) + batch_1 = batch_queue.get() batch_queue.task_done() - + batch_2 = batch_queue.get() batch_queue.task_done() - - assert chromosomes == ['1'] + + assert chromosomes == ["1"] assert len(batch_1) == 1 assert len(batch_2) == 1 + def test_get_batches_new_chromosome(): """ Test to get a batch """ batch_queue = Queue() variants = [] - + first_variant = get_variant_line() second_variant = get_variant_line(chrom="2") - + variants.append(first_variant) variants.append(second_variant) - + header = HeaderParser() header.parse_header_line("#{0}".format(HEADER)) - - chromosomes = get_batches(variants=variants, batch_queue=batch_queue, - header=header) - + + chromosomes = get_batches(variants=variants, batch_queue=batch_queue, header=header) + batch_1 = batch_queue.get() batch_queue.task_done() - + batch_2 = batch_queue.get() batch_queue.task_done() - - assert chromosomes == ['1', '2'] + + assert chromosomes == ["1", "2"] assert len(batch_1) == 1 assert len(batch_2) == 1 - diff --git a/tests/utils/test_get_priority.py b/tests/utils/test_get_priority.py index 4608e0d..83ee95a 100644 --- a/tests/utils/test_get_priority.py +++ b/tests/utils/test_get_priority.py @@ -3,29 +3,34 @@ def test_get_chromosome_priority(): """docstring for test_get_chromosome_priority""" - assert get_chromosome_priority(chrom='1', chrom_dict={}) == '1' + assert get_chromosome_priority(chrom="1", chrom_dict={}) == "1" + def test_get_X_priority(): """docstring for test_get_X_priority""" - assert get_chromosome_priority(chrom='X', chrom_dict={}) == '23' + assert get_chromosome_priority(chrom="X", chrom_dict={}) == "23" + def test_get_Y_priority(): """docstring for test_get_Y_priority""" - assert get_chromosome_priority(chrom='Y', chrom_dict={}) == '24' + assert get_chromosome_priority(chrom="Y", chrom_dict={}) == "24" + def test_get_MT_priority(): """docstring for test_get_MT_priority""" - assert get_chromosome_priority(chrom='MT', chrom_dict={}) == '25' + assert get_chromosome_priority(chrom="MT", chrom_dict={}) == "25" + def test_get_OTHER_priority(): """docstring for test_get_MT_priority""" - assert get_chromosome_priority(chrom='GL37', chrom_dict={}) == '26' + assert get_chromosome_priority(chrom="GL37", chrom_dict={}) == "26" + def test_get_chr_prority(): """docstring for test_get_chr_prority""" - assert get_chromosome_priority(chrom='chr1', chrom_dict={}) == '1' + assert get_chromosome_priority(chrom="chr1", chrom_dict={}) == "1" + def test_get_custom_prority(): """docstring for test_get_chr_prority""" - assert get_chromosome_priority(chrom='AHA_1', chrom_dict={'AHA_1':2, 'AHA_2':3}) == 2 - + assert get_chromosome_priority(chrom="AHA_1", chrom_dict={"AHA_1": 2, "AHA_2": 3}) == 2 diff --git a/tests/utils/test_get_rank_score.py b/tests/utils/test_get_rank_score.py index c1daf64..89c831a 100644 --- a/tests/utils/test_get_rank_score.py +++ b/tests/utils/test_get_rank_score.py @@ -1,58 +1,75 @@ -from genmod.utils import get_rank_score -from genmod.vcf_tools import get_variant_dict, get_info_dict from genmod.score_variants.rank_score_variant_definitions import RANK_SCORE_TYPE_NAMES +from genmod.utils import get_rank_score +from genmod.vcf_tools import get_info_dict, get_variant_dict + def get_variant_line(): """Return a vcf formatted variant line.""" pass + def test_get_rank_score(): """docstring for test_get_rank_score""" - variant_line = "1\t879537\t.\tT\tC\t100\tPASS\tMQ=1;GeneticModels=1:AR_hom;"\ - "ModelScore=1:55;Annotation=SAMD11;CADD=1.248;Exonic;RankScore=1:23;RankScoreNormalized=1:0.2\t"\ - "GT:AD:GQ\t0/1:10,10:60" - - assert float(get_rank_score(variant_line = variant_line)) == float('23') + variant_line = ( + "1\t879537\t.\tT\tC\t100\tPASS\tMQ=1;GeneticModels=1:AR_hom;" + "ModelScore=1:55;Annotation=SAMD11;CADD=1.248;Exonic;RankScore=1:23;RankScoreNormalized=1:0.2\t" + "GT:AD:GQ\t0/1:10,10:60" + ) + + assert float(get_rank_score(variant_line=variant_line)) == float("23") + def test_get_rank_score_no_score(): """docstring for test_get_rank_score""" - variant_line = "1\t879537\t.\tT\tC\t100\tPASS\tMQ=1;GeneticModels=1:AR_hom;"\ - "ModelScore=1:55;Annotation=SAMD11;CADD=1.248;Exonic\t"\ - "GT:AD:GQ\t0/1:10,10:60" + variant_line = ( + "1\t879537\t.\tT\tC\t100\tPASS\tMQ=1;GeneticModels=1:AR_hom;" + "ModelScore=1:55;Annotation=SAMD11;CADD=1.248;Exonic\t" + "GT:AD:GQ\t0/1:10,10:60" + ) for rank_score_type in RANK_SCORE_TYPE_NAMES: - assert float(get_rank_score(variant_line=variant_line, rank_score_type=rank_score_type)) == float('-100') + assert float( + get_rank_score(variant_line=variant_line, rank_score_type=rank_score_type) + ) == float("-100") + def test_get_rank_score_multiple_families(): """docstring for test_get_rank_score""" - variant_line = "1\t879537\t.\tT\tC\t100\tPASS\tMQ=1;GeneticModels=1:AR_hom;"\ - "ModelScore=1:55;Annotation=SAMD11;CADD=1.248;Exonic;RankScore=1:23,2:12;RankScoreNormalized=1:0.2\t"\ - "GT:AD:GQ\t0/1:10,10:60" - - assert float(get_rank_score(variant_line = variant_line, family_id='2')) == float('12') + variant_line = ( + "1\t879537\t.\tT\tC\t100\tPASS\tMQ=1;GeneticModels=1:AR_hom;" + "ModelScore=1:55;Annotation=SAMD11;CADD=1.248;Exonic;RankScore=1:23,2:12;RankScoreNormalized=1:0.2\t" + "GT:AD:GQ\t0/1:10,10:60" + ) + + assert float(get_rank_score(variant_line=variant_line, family_id="2")) == float("12") + def test_get_rank_score_dict(): """docstring for test_get_rank_score""" header_line = "CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t1" - variant_info = "1\t879537\t.\tT\tC\t100\tPASS\tMQ=1;GeneticModels=1:AR_hom;"\ - "ModelScore=1:55;Annotation=SAMD11;CADD=1.248;Exonic;RankScore=1:23;RankScoreNormalized=1:0.2\t"\ - "GT:AD:GQ\t0/1:10,10:60" - variant_dict = get_variant_dict( - variant_line=variant_info, - header_line = header_line.split() + variant_info = ( + "1\t879537\t.\tT\tC\t100\tPASS\tMQ=1;GeneticModels=1:AR_hom;" + "ModelScore=1:55;Annotation=SAMD11;CADD=1.248;Exonic;RankScore=1:23;RankScoreNormalized=1:0.2\t" + "GT:AD:GQ\t0/1:10,10:60" ) - - variant_dict['info_dict'] = get_info_dict(variant_dict['INFO']) - - assert float(get_rank_score(variant_dict = variant_dict)) == float('23') + variant_dict = get_variant_dict(variant_line=variant_info, header_line=header_line.split()) + + variant_dict["info_dict"] = get_info_dict(variant_dict["INFO"]) + + assert float(get_rank_score(variant_dict=variant_dict)) == float("23") def test_get_rank_score_normalized(): - """ Test fetching normalized rank score form VCF INFO field """ + """Test fetching normalized rank score form VCF INFO field""" # GIVEN an INFO field string with both raw and normalized rank score - variant_line = "1\t879537\t.\tT\tC\t100\tPASS\tMQ=1;GeneticModels=1:AR_hom;" \ - "ModelScore=1:55;Annotation=SAMD11;CADD=1.248;Exonic;RankScore=1:23;RankScoreNormalized=1:0.2\t" \ - "GT:AD:GQ\t0/1:10,10:60" + variant_line = ( + "1\t879537\t.\tT\tC\t100\tPASS\tMQ=1;GeneticModels=1:AR_hom;" + "ModelScore=1:55;Annotation=SAMD11;CADD=1.248;Exonic;RankScore=1:23;RankScoreNormalized=1:0.2\t" + "GT:AD:GQ\t0/1:10,10:60" + ) # WHEN fetching normalized rank score # THEN expect correct value fetched - assert float(get_rank_score(variant_line=variant_line, rank_score_type='RankScoreNormalized')) == 0.2 + assert ( + float(get_rank_score(variant_line=variant_line, rank_score_type="RankScoreNormalized")) + == 0.2 + ) diff --git a/tests/utils/test_is_number.py b/tests/utils/test_is_number.py index 885ef4a..5495c5a 100644 --- a/tests/utils/test_is_number.py +++ b/tests/utils/test_is_number.py @@ -1,29 +1,32 @@ +import pytest from genmod.utils import is_number -import pytest def test_int(): """Test if is_number behave as suspected""" obj = 2 assert is_number(obj) == True + def test_float(): """Test if is_number behave as suspected""" obj = 2.5 assert is_number(obj) == True + def test_non_number(): """Test if is_number behave as suspected""" - obj = 'a' + obj = "a" assert is_number(obj) == False + def test_str_int(): """Test if is_number behave as suspected""" - obj = '1' + obj = "1" assert is_number(obj) == True + def test_str_float(): """Test if is_number behave as suspected""" - obj = '1.3' + obj = "1.3" assert is_number(obj) == True - diff --git a/tests/utils/test_variant_printer.py b/tests/utils/test_variant_printer.py index 384eaa2..15785ad 100644 --- a/tests/utils/test_variant_printer.py +++ b/tests/utils/test_variant_printer.py @@ -1,88 +1,84 @@ from codecs import open -from tempfile import NamedTemporaryFile -from multiprocessing import Manager, util from collections import OrderedDict +from multiprocessing import Manager, util +from tempfile import NamedTemporaryFile from genmod.utils import VariantPrinter -from genmod.vcf_tools import (get_variant_dict, get_info_dict, -get_variant_id, HeaderParser) +from genmod.vcf_tools import HeaderParser, get_info_dict, get_variant_dict, get_variant_id util.abstract_sockets_supported = False + def setup_vcf_file(): """ Print some variants to a vcf file and return the filename """ vcf_lines = [ - '##fileformat=VCFv4.1\n', + "##fileformat=VCFv4.1\n", '##INFO=\n', - '##contig=\n', - '##reference=file:///humgen/gsa-hpprojects/GATK/bundle'\ - '/current/b37/human_g1k_v37.fasta\n', - '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband\n', - '1\t11900\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t0/1:60\t1/1:60\n', - '1\t879585\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t0/0:60\t0/1:60\n', - '1\t879586\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/1:60\t0/1:60\n', - '1\t947378\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n', - '1\t973348\t.\tG\tA\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n', - '3\t879585\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t0/0:60\t0/1:60\n', - '3\t879586\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/1:60\t0/1:60\n', - '3\t947378\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n', - '3\t973348\t.\tG\tA\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n' - ] - vcf_file = NamedTemporaryFile(mode='w+t', delete=False, suffix='.vcf') + "##contig=\n", + "##reference=file:///humgen/gsa-hpprojects/GATK/bundle" + "/current/b37/human_g1k_v37.fasta\n", + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband\n", + "1\t11900\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t0/1:60\t1/1:60\n", + "1\t879585\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t0/0:60\t0/1:60\n", + "1\t879586\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/1:60\t0/1:60\n", + "1\t947378\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n", + "1\t973348\t.\tG\tA\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n", + "3\t879585\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t0/0:60\t0/1:60\n", + "3\t879586\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/1:60\t0/1:60\n", + "3\t947378\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n", + "3\t973348\t.\tG\tA\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n", + ] + vcf_file = NamedTemporaryFile(mode="w+t", delete=False, suffix=".vcf") vcf_file.writelines(vcf_lines) vcf_file.seek(0) vcf_file.close() - + return vcf_file.name + def test_variant_printer(): """Test the variant printer""" vcf_file = setup_vcf_file() variant_queue = Manager().Queue() head = HeaderParser() - - outfile = NamedTemporaryFile(mode='w+t', delete=False, suffix='.vcf') + + outfile = NamedTemporaryFile(mode="w+t", delete=False, suffix=".vcf") outfile.close() - - + variant_printer = VariantPrinter( - task_queue=variant_queue, - head=head, - mode='chromosome', - outfile = outfile.name + task_queue=variant_queue, head=head, mode="chromosome", outfile=outfile.name ) - + variant_printer.start() - + batch = OrderedDict() - + for line in open(vcf_file): line = line.rstrip() - if line.startswith('#'): - if line.startswith('##'): + if line.startswith("#"): + if line.startswith("##"): head.parse_meta_data(line) else: head.parse_header_line(line) else: variant_dict = get_variant_dict(line, head.header) variant_id = get_variant_id(variant_dict) - variant_dict['variant_id'] = variant_id - variant_dict['info_dict'] = get_info_dict(variant_dict['INFO']) - + variant_dict["variant_id"] = variant_id + variant_dict["info_dict"] = get_info_dict(variant_dict["INFO"]) + variant_queue.put(variant_dict) - - + variant_queue.put(None) - + variant_printer.join() - + variants = [] - with open(outfile.name, 'r', 'utf-8-sig') as f: + with open(outfile.name, "r", "utf-8-sig") as f: for line in f: - variants.append(line.rstrip().split('\t')) - - assert variants[0][0] == '1' - assert variants[0][2] == '11900' \ No newline at end of file + variants.append(line.rstrip().split("\t")) + + assert variants[0][0] == "1" + assert variants[0][2] == "11900" diff --git a/tests/variant_annotation/test_get_frequencies.py b/tests/variant_annotation/test_get_frequencies.py index 6678b1c..79c288d 100644 --- a/tests/variant_annotation/test_get_frequencies.py +++ b/tests/variant_annotation/test_get_frequencies.py @@ -1,27 +1,29 @@ -from genmod.annotate_variants import (get_frequencies) +from genmod.annotate_variants import get_frequencies + def test_get_frequencies(thousand_g_handle): - chrom = '1' - start = '879537' - alt = 'C' + chrom = "1" + start = "879537" + alt = "C" frequencies = get_frequencies(thousand_g_handle, chrom, start, alt) assert frequencies - assert frequencies['AF'] == '0.000199681' - assert frequencies['MAX_AF'] == '0.001' + assert frequencies["AF"] == "0.000199681" + assert frequencies["MAX_AF"] == "0.001" + def test_get_frequencies_wrong_alt(thousand_g_handle): - chrom = '1' - start = '879537' - alt = 'T' + chrom = "1" + start = "879537" + alt = "T" frequencies = get_frequencies(thousand_g_handle, chrom, start, alt) assert frequencies == {} - + def test_get_non_existing_tabix_record(thousand_g_handle): - chrom = '1' - start = '10' - alt = 'C' - + chrom = "1" + start = "10" + alt = "C" + frequencies = get_frequencies(thousand_g_handle, chrom, start, alt) - - assert frequencies == {} \ No newline at end of file + + assert frequencies == {} diff --git a/tests/variant_annotation/test_get_haploblocks.py b/tests/variant_annotation/test_get_haploblocks.py index 52117a7..88d67bc 100644 --- a/tests/variant_annotation/test_get_haploblocks.py +++ b/tests/variant_annotation/test_get_haploblocks.py @@ -1,91 +1,104 @@ from genmod.annotate_models import get_haploblocks + def get_variant(**kwargs): """ Construct a variant and return it """ variant = { - 'CHROM': kwargs.get('CHROM', '1'), - 'POS': kwargs.get('POS', '12'), - 'REF': kwargs.get('REF', 'A'), - 'ALT': kwargs.get('REF', 'C'), - 'FILTER': 'PASS' + "CHROM": kwargs.get("CHROM", "1"), + "POS": kwargs.get("POS", "12"), + "REF": kwargs.get("REF", "A"), + "ALT": kwargs.get("REF", "C"), + "FILTER": "PASS", } - for indivivdual_id in kwargs.get('genotype_calls',{'1':'0|1'}): - variant[indivivdual_id] = kwargs.get( - 'genotype_calls',{'1':'0|1'})[indivivdual_id] + for indivivdual_id in kwargs.get("genotype_calls", {"1": "0|1"}): + variant[indivivdual_id] = kwargs.get("genotype_calls", {"1": "0|1"})[indivivdual_id] return variant + def get_variant_batch(): """Return a small variant batch""" from collections import OrderedDict + variant_batch = OrderedDict() - variant_1 = get_variant(**{ - 'POS': '12', - 'genotype_calls':{ - '1': '0|1', - '2': '0/1', + variant_1 = get_variant( + **{ + "POS": "12", + "genotype_calls": { + "1": "0|1", + "2": "0/1", + }, } - }) - variant_2 = get_variant(**{ - 'POS': '13', - 'genotype_calls':{ - '1': '0|1', - '2': '0/1', + ) + variant_2 = get_variant( + **{ + "POS": "13", + "genotype_calls": { + "1": "0|1", + "2": "0/1", + }, } - }) - variant_3 = get_variant(**{ - 'POS': '14', - 'genotype_calls':{ - '1': '0|1', - '2': '0|1', + ) + variant_3 = get_variant( + **{ + "POS": "14", + "genotype_calls": { + "1": "0|1", + "2": "0|1", + }, } - }) - variant_4 = get_variant(**{ - 'POS': '15', - 'genotype_calls':{ - '1': '0/1', - '2': '0|1', + ) + variant_4 = get_variant( + **{ + "POS": "15", + "genotype_calls": { + "1": "0/1", + "2": "0|1", + }, } - }) - variant_5 = get_variant(**{ - 'POS': '16', - 'genotype_calls':{ - '1': '0|1', - '2': '0|1', + ) + variant_5 = get_variant( + **{ + "POS": "16", + "genotype_calls": { + "1": "0|1", + "2": "0|1", + }, } - }) - variant_6 = get_variant(**{ - 'POS': '17', - 'genotype_calls':{ - '1': '0|1', - '2': '0/1', + ) + variant_6 = get_variant( + **{ + "POS": "17", + "genotype_calls": { + "1": "0|1", + "2": "0/1", + }, } - }) - variant_batch['1_12_A_C'] = variant_1 - variant_batch['1_13_A_C'] = variant_2 - variant_batch['1_14_A_C'] = variant_3 - variant_batch['1_15_A_C'] = variant_4 - variant_batch['1_16_A_C'] = variant_5 - variant_batch['1_17_A_C'] = variant_6 - + ) + variant_batch["1_12_A_C"] = variant_1 + variant_batch["1_13_A_C"] = variant_2 + variant_batch["1_14_A_C"] = variant_3 + variant_batch["1_15_A_C"] = variant_4 + variant_batch["1_16_A_C"] = variant_5 + variant_batch["1_17_A_C"] = variant_6 + return variant_batch + def test_simple(): """Test if get_haploblocks behave as suspected""" variant_batch = get_variant_batch() - - haploblocks = get_haploblocks(variant_batch, ['1', '2']) - - assert set(haploblocks.keys()) == set(['1','2']) - - assert haploblocks['1'].find_range([12,12]) == ['1'] - assert haploblocks['2'].find_range([12,12]) == [] - - assert haploblocks['1'].find_range([15,15]) == [] - assert haploblocks['2'].find_range([15,15]) == ['2'] - assert haploblocks['1'].find_range([17,17]) == ['3'] - assert haploblocks['2'].find_range([17,17]) == [] - + haploblocks = get_haploblocks(variant_batch, ["1", "2"]) + + assert set(haploblocks.keys()) == set(["1", "2"]) + + assert haploblocks["1"].find_range([12, 12]) == ["1"] + assert haploblocks["2"].find_range([12, 12]) == [] + + assert haploblocks["1"].find_range([15, 15]) == [] + assert haploblocks["2"].find_range([15, 15]) == ["2"] + assert haploblocks["1"].find_range([17, 17]) == ["3"] + assert haploblocks["2"].find_range([17, 17]) == [] diff --git a/tests/variant_annotation/test_get_tabix_records.py b/tests/variant_annotation/test_get_tabix_records.py index 4aa7937..743e22d 100644 --- a/tests/variant_annotation/test_get_tabix_records.py +++ b/tests/variant_annotation/test_get_tabix_records.py @@ -1,29 +1,31 @@ -from genmod.annotate_variants.read_tabix_files import (get_tabix_records) +from genmod.annotate_variants.read_tabix_files import get_tabix_records + def test_get_tabix_record(thousand_g_handle): - chrom = '1' - start = '879537' + chrom = "1" + start = "879537" i = None for i, row in enumerate(get_tabix_records(thousand_g_handle, chrom, start)): print(row) - #Should find one row + # Should find one row assert i == 0 + def test_get_tabix_record_chr(thousand_g_chr_handle): - chrom = '1' - start = '879537' + chrom = "1" + start = "879537" i = None for i, row in enumerate(get_tabix_records(thousand_g_chr_handle, chrom, start)): print(row) - #Should find one row + # Should find one row assert i == 0 def test_get_non_existing_tabix_record(thousand_g_handle): - chrom = '1' - start = '10' + chrom = "1" + start = "10" i = None for i, row in enumerate(get_tabix_records(thousand_g_handle, chrom, start)): print(row) - #Should find one row - assert i == None \ No newline at end of file + # Should find one row + assert i == None diff --git a/tests/vcf_tools/test_genotype.py b/tests/vcf_tools/test_genotype.py index 4501ee9..8616bce 100755 --- a/tests/vcf_tools/test_genotype.py +++ b/tests/vcf_tools/test_genotype.py @@ -27,38 +27,43 @@ from genmod.vcf_tools import Genotype + def test_nocall(): """ - A nocall is when no informations is found on this position for the - individual. It should be False on all questions except nocall. + A nocall is when no informations is found on this position for the + individual. It should be False on all questions except nocall. Also in the case of haploidity the result should be the same. """ - my_nocall = Genotype(**{'GT':'./.'}) - assert my_nocall.genotype == './.' #We never need to look at the alleles since genotype is defined by 'allele_1/allele_2' + my_nocall = Genotype(**{"GT": "./."}) + assert ( + my_nocall.genotype == "./." + ) # We never need to look at the alleles since genotype is defined by 'allele_1/allele_2' assert not my_nocall.heterozygote assert not my_nocall.homo_ref assert not my_nocall.homo_alt assert not my_nocall.has_variant assert not my_nocall.genotyped + def test_haploid_genotype(): """ Test how genotype behaves with haploid call """ - haploid_call = Genotype(**{'GT':'1'}) - assert haploid_call.genotype == '1/.' + haploid_call = Genotype(**{"GT": "1"}) + assert haploid_call.genotype == "1/." # assert not haploid_call.heterozygote # assert not haploid_call.homo_ref # assert haploid_call.homo_alt # assert haploid_call.has_variant # assert haploid_call.genotyped + def test_haploid_no_call(): """ Test how genotype behaves with haploid call """ - haploid_call = Genotype(**{'GT':'0/.'}) - assert haploid_call.genotype == '0/.' + haploid_call = Genotype(**{"GT": "0/."}) + assert haploid_call.genotype == "0/." assert not haploid_call.heterozygote assert haploid_call.homo_ref assert not haploid_call.homo_alt @@ -70,104 +75,116 @@ def test_genotype_0_1(): """ A normal heterozygote call, has_variant and heterozygote is true. """ - my_genotype = Genotype(**{'GT':'0/1'}) - assert my_genotype.genotype == '0/1' + my_genotype = Genotype(**{"GT": "0/1"}) + assert my_genotype.genotype == "0/1" assert my_genotype.heterozygote assert not my_genotype.homo_ref assert not my_genotype.homo_alt assert my_genotype.has_variant assert my_genotype.genotyped + def test_bad_dp(): """ Test what happends when DP is not a float """ - my_genotype = Genotype(**{'GT':'0/1', 'DP':'A'}) - assert my_genotype.genotype == '0/1' - #If dp is wrong we set it to 0 + my_genotype = Genotype(**{"GT": "0/1", "DP": "A"}) + assert my_genotype.genotype == "0/1" + # If dp is wrong we set it to 0 assert my_genotype.depth_of_coverage == 0 + def test_bad_gq(): """ Test what happends when GQ is not a float. """ - my_genotype = Genotype(**{'GT':'0/1', 'GQ':'A'}) - assert my_genotype.genotype == '0/1' - #If dp is wrong we set it to 0 + my_genotype = Genotype(**{"GT": "0/1", "GQ": "A"}) + assert my_genotype.genotype == "0/1" + # If dp is wrong we set it to 0 assert my_genotype.genotype_quality == 0 + def test_phred_likelihoods(): """ A normal heterozygote call, has_variant and heterozygote is true. """ - my_genotype = Genotype(**{'GT':'0/1', 'PL':'60,70,80'}) - assert my_genotype.phred_likelihoods == [60,70,80] + my_genotype = Genotype(**{"GT": "0/1", "PL": "60,70,80"}) + assert my_genotype.phred_likelihoods == [60, 70, 80] + def test_genotype_1_2(): """ A normal heterozygote call, has_variant and heterozygote is true. """ - my_genotype = Genotype(**{'GT':'1/2'}) - assert my_genotype.genotype == '1/2' + my_genotype = Genotype(**{"GT": "1/2"}) + assert my_genotype.genotype == "1/2" assert my_genotype.heterozygote assert not my_genotype.homo_ref assert not my_genotype.homo_alt assert my_genotype.has_variant assert my_genotype.genotyped + def test_homo_ref(): """ - A homozygote reference call. + A homozygote reference call. has_variant and nocall is False and homo_ref is true. """ - my_homo_ref_genotype = Genotype(**{'GT':'0/0'}) - assert my_homo_ref_genotype.genotype == '0/0' + my_homo_ref_genotype = Genotype(**{"GT": "0/0"}) + assert my_homo_ref_genotype.genotype == "0/0" assert not my_homo_ref_genotype.heterozygote assert my_homo_ref_genotype.homo_ref assert not my_homo_ref_genotype.homo_alt assert not my_homo_ref_genotype.has_variant assert my_homo_ref_genotype.genotyped + def test_homo_alt(): """ - A homozygote alternative call. + A homozygote alternative call. has_variant and homo_alt is true. """ - my_genotype = Genotype(**{'GT':'1/1'}) - assert my_genotype.genotype == '1/1' + my_genotype = Genotype(**{"GT": "1/1"}) + assert my_genotype.genotype == "1/1" assert not my_genotype.heterozygote assert not my_genotype.homo_ref assert my_genotype.homo_alt assert my_genotype.has_variant assert my_genotype.genotyped + def test_homo_alt_2(): """ - A homozygote alternative call. + A homozygote alternative call. has_variant and homo_alt is true. """ - my_genotype = Genotype(**{'GT':'3/3'}) - assert my_genotype.genotype == '3/3' + my_genotype = Genotype(**{"GT": "3/3"}) + assert my_genotype.genotype == "3/3" assert not my_genotype.heterozygote assert not my_genotype.homo_ref assert my_genotype.homo_alt assert my_genotype.has_variant assert my_genotype.genotyped + def test_phased_data(): """ - Try if the class van handle phased data. + Try if the class van handle phased data. In this case a heterozygote. """ - my_genotype = Genotype(**{'GT':'1|0'}) - assert my_genotype.genotype == '1/0'# If asked about the genotype, it should still be on the same form. + my_genotype = Genotype(**{"GT": "1|0"}) + assert ( + my_genotype.genotype == "1/0" + ) # If asked about the genotype, it should still be on the same form. assert my_genotype.heterozygote assert not my_genotype.homo_ref assert not my_genotype.homo_alt assert my_genotype.has_variant - assert my_genotype.allele_1 == '1'# If asked about the genotype, it should still be on the same form. - assert my_genotype.allele_2 == '0'# If asked about the genotype, it should still be on the same form. + assert ( + my_genotype.allele_1 == "1" + ) # If asked about the genotype, it should still be on the same form. + assert ( + my_genotype.allele_2 == "0" + ) # If asked about the genotype, it should still be on the same form. assert my_genotype.genotyped assert my_genotype.phased - - \ No newline at end of file diff --git a/tests/vcf_tools/test_header_parser.py b/tests/vcf_tools/test_header_parser.py index 70ab02d..bfacb91 100644 --- a/tests/vcf_tools/test_header_parser.py +++ b/tests/vcf_tools/test_header_parser.py @@ -1,85 +1,94 @@ from genmod.vcf_tools.header_parser import HeaderParser + def test_parse_info(): ## GIVEN a header object head = HeaderParser() - assert 'MQ' not in head.info_dict + assert "MQ" not in head.info_dict info_line = '##INFO=' - + ## WHEN parsing a correct info line head.parse_meta_data(info_line) - + ## THEN assert it is added to the parser - assert 'MQ' in head.info_dict + assert "MQ" in head.info_dict + def test_parse_info_with_source(): ## GIVEN a header object head = HeaderParser() - assert 'MQ' not in head.info_dict + assert "MQ" not in head.info_dict info_line = '##INFO=' - + ## WHEN parsing a correct info line head.parse_meta_data(info_line) - + ## THEN assert it is added to the parser - assert 'MQ' in head.info_dict + assert "MQ" in head.info_dict + def test_parse_info_with_version(): ## GIVEN a header object head = HeaderParser() - assert 'MQ' not in head.info_dict - info_line = '##INFO=' - + assert "MQ" not in head.info_dict + info_line = ( + '##INFO=' + ) + ## WHEN parsing a correct info line head.parse_meta_data(info_line) - + ## THEN assert it is added to the parser - assert 'MQ' in head.info_dict + assert "MQ" in head.info_dict + def test_parse_info_with_source_and_version(): ## GIVEN a header object head = HeaderParser() - assert 'MQ' not in head.info_dict + assert "MQ" not in head.info_dict info_line = '##INFO=' - + ## WHEN parsing a correct info line head.parse_meta_data(info_line) - + ## THEN assert it is added to the parser - assert 'MQ' in head.info_dict + assert "MQ" in head.info_dict + def test_parse_contig(): ## GIVEN a header object head = HeaderParser() - assert '1' not in head.contig_dict - contig_line = '##contig=' - + assert "1" not in head.contig_dict + contig_line = "##contig=" + ## WHEN parsing a correct info line head.parse_meta_data(contig_line) - + ## THEN assert it is added to the parser - assert '1' in head.contig_dict + assert "1" in head.contig_dict + def test_parse_contig_no_length(): ## GIVEN a header object head = HeaderParser() - assert '1' not in head.contig_dict - contig_line = '##contig=' - + assert "1" not in head.contig_dict + contig_line = "##contig=" + ## WHEN parsing a correct info line head.parse_meta_data(contig_line) - + ## THEN assert it is added to the parser - assert '1' in head.contig_dict + assert "1" in head.contig_dict + def test_parse_minimal_contig(): ## GIVEN a header object head = HeaderParser() - assert '1' not in head.contig_dict - contig_line = '##contig=' - + assert "1" not in head.contig_dict + contig_line = "##contig=" + ## WHEN parsing a correct info line head.parse_meta_data(contig_line) - + ## THEN assert it is added to the parser - assert '1' in head.contig_dict \ No newline at end of file + assert "1" in head.contig_dict diff --git a/tests/vcf_tools/test_parse_variant.py b/tests/vcf_tools/test_parse_variant.py index f025b99..1be5d4e 100644 --- a/tests/vcf_tools/test_parse_variant.py +++ b/tests/vcf_tools/test_parse_variant.py @@ -1,39 +1,19 @@ from genmod.vcf_tools import get_variant_id + class TestGetVariantId: - def test_get_variant_id(self): - variant = { - 'CHROM': '1', - 'POS': '10', - 'REF': 'A', - 'ALT': 'G' - } + variant = {"CHROM": "1", "POS": "10", "REF": "A", "ALT": "G"} assert get_variant_id(variant) == "1_10_A_G" - + def test_get_variant_id_sv_ins(self): - variant = { - 'CHROM': '1', - 'POS': '10', - 'REF': 'N', - 'ALT': '' - } + variant = {"CHROM": "1", "POS": "10", "REF": "N", "ALT": ""} assert get_variant_id(variant) == "1_10_N_INS" - + def test_get_variant_id_sv_dup_tandem(self): - variant = { - 'CHROM': '1', - 'POS': '10', - 'REF': 'N', - 'ALT': '' - } + variant = {"CHROM": "1", "POS": "10", "REF": "N", "ALT": ""} assert get_variant_id(variant) == "1_10_N_DUPTANDEM" - + def test_get_variant_id_sv_bdn(self): - variant = { - 'CHROM': '1', - 'POS': '10', - 'REF': 'A', - 'ALT': 'T[6:134717462[' - } - assert get_variant_id(variant) == "1_10_A_T6134717462" \ No newline at end of file + variant = {"CHROM": "1", "POS": "10", "REF": "A", "ALT": "T[6:134717462["} + assert get_variant_id(variant) == "1_10_A_T6134717462" diff --git a/tests/vcf_tools/test_sorting.py b/tests/vcf_tools/test_sorting.py index b37600c..2e51184 100644 --- a/tests/vcf_tools/test_sorting.py +++ b/tests/vcf_tools/test_sorting.py @@ -2,53 +2,54 @@ from genmod.vcf_tools import sort_variants + def setup_csv_file(): """ Print some variants to a vcf file and return the filename """ variant_lines = [ - '1\t1\t11900\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t0/1:60\t1/1:60\n', - '3\t3\t947378\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n', - '1\t1\t879586\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/1:60\t0/1:60\n', - '23\tX\t879586\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/1:60\t0/1:60\n', - '1\t1\t879585\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t0/0:60\t0/1:60\n', - '1\t1\t947378\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n', - ] - csv_file = NamedTemporaryFile(mode='w+t', delete=False, suffix='.vcf') + "1\t1\t11900\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t0/1:60\t1/1:60\n", + "3\t3\t947378\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n", + "1\t1\t879586\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/1:60\t0/1:60\n", + "23\tX\t879586\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/1:60\t0/1:60\n", + "1\t1\t879585\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t0/0:60\t0/1:60\n", + "1\t1\t947378\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n", + ] + csv_file = NamedTemporaryFile(mode="w+t", delete=False, suffix=".vcf") csv_file.writelines(variant_lines) csv_file.seek(0) csv_file.close() - + return csv_file.name + def test_sort_variants(): """ Test to sort an unsorted file with variants """ csv_file = setup_csv_file() - - sort_variants(infile=csv_file, mode='chromosome') - + + sort_variants(infile=csv_file, mode="chromosome") + variants = [] - with open(csv_file, 'r') as f: + with open(csv_file, "r") as f: for line in f: - variants.append(line.rstrip().split('\t')) - - assert variants[0][1] == '1' - assert variants[0][2] == '11900' - - assert variants[1][1] == '1' - assert variants[1][2] == '879585' - - assert variants[2][1] == '1' - assert variants[2][2] == '879586' - - assert variants[3][1] == '1' - assert variants[3][2] == '947378' - - assert variants[4][1] == '3' - assert variants[4][2] == '947378' - - assert variants[5][1] == 'X' - assert variants[5][2] == '879586' + variants.append(line.rstrip().split("\t")) + + assert variants[0][1] == "1" + assert variants[0][2] == "11900" + + assert variants[1][1] == "1" + assert variants[1][2] == "879585" + + assert variants[2][1] == "1" + assert variants[2][2] == "879586" + + assert variants[3][1] == "1" + assert variants[3][2] == "947378" + + assert variants[4][1] == "3" + assert variants[4][2] == "947378" + assert variants[5][1] == "X" + assert variants[5][2] == "879586"