Skip to content

Commit

Permalink
Merge pull request #326 from apriltuesday/issue-321
Browse files Browse the repository at this point in the history
Update consequence severity filtering
  • Loading branch information
apriltuesday authored May 13, 2022
2 parents ccd41a6 + 9475ae3 commit a11b9cf
Show file tree
Hide file tree
Showing 10 changed files with 2,283 additions and 2,225 deletions.
9 changes: 3 additions & 6 deletions consequence_prediction/repeat_expansion_variants/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,12 +196,9 @@ def extract_consequences(variants):
# Get rid of sets
consequences['RepeatType'] = consequences['RepeatType'].apply(list)
consequences = consequences.explode('RepeatType')
# Form a six-column file compatible with the consequence mapping pipeline, for example:
# RCV000005966 1 ENSG00000156475 PPP2R2B trinucleotide_repeat_expansion 0
consequences['PlaceholderOnes'] = 1
consequences['PlaceholderZeroes'] = 0
consequences = consequences[['RCVaccession', 'PlaceholderOnes', 'EnsemblGeneID', 'EnsemblGeneName', 'RepeatType',
'PlaceholderZeroes']]
# Form a four-column file compatible with the consequence mapping pipeline, for example:
# RCV000005966 ENSG00000156475 PPP2R2B trinucleotide_repeat_expansion
consequences = consequences[['RCVaccession', 'EnsemblGeneID', 'EnsemblGeneName', 'RepeatType']]
consequences.sort_values(by=['RepeatType', 'RCVaccession', 'EnsemblGeneID'], inplace=True)
# Check that there are no empty cells in the final consequences table
assert consequences.isnull().to_numpy().sum() == 0
Expand Down
20 changes: 8 additions & 12 deletions consequence_prediction/structural_variants/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

import pandas as pd

from consequence_prediction.vep_mapping_pipeline.consequence_mapping import query_vep, VEP_SHORT_QUERY_DISTANCE, \
extract_consequences, deduplicate_list
from consequence_prediction.vep_mapping_pipeline.consequence_mapping import query_vep, extract_consequences, \
deduplicate_list
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io import ClinVarDataset
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.hgvs_variant import HgvsVariant, VariantType, SequenceType

Expand Down Expand Up @@ -61,7 +61,7 @@ def get_vep_results(clinvar_xml):
i = 0
vep_results = []
for group in grouper(variants, n=200):
vep_results.extend(query_vep(variants=group, search_distance=VEP_SHORT_QUERY_DISTANCE))
vep_results.extend(query_vep(variants=group))
i += 1
logger.info(f'Done with batch {i}')

Expand All @@ -70,20 +70,16 @@ def get_vep_results(clinvar_xml):

def main(clinvar_xml):
vep_results = get_vep_results(clinvar_xml)
results_by_variant = {}
results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'},
only_closest=False, results_by_variant=results_by_variant,
report_distance=False)
results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'})
variant_data = []
for variant_id, variant_consequences in results_by_variant.items():
for consequence_to_yield in deduplicate_list(variant_consequences):
variant_id, gene_id, gene_symbol, consequence_term, distance = consequence_to_yield
variant_id, gene_id, gene_symbol, consequence_term = consequence_to_yield
# variant_id here is the entire input to VEP, we only want the HGVS that we passed as an identifier
hgvs_id = variant_id.split()[-1]
# The second column, set statically to 1, is not used, and is maintained for compatibility purposes
variant_data.append((hgvs_id, '1', gene_id, gene_symbol, consequence_term, distance))
variant_data.append((hgvs_id, gene_id, gene_symbol, consequence_term))

# Return as a dataframe to be compatible with repeat expansion pipeline
consequences = pd.DataFrame(variant_data, columns=('VariantID', 'PlaceholderOnes', 'EnsemblGeneID',
'EnsemblGeneName', 'ConsequenceTerm', 'Distance'))
consequences = pd.DataFrame(variant_data, columns=('VariantID', 'EnsemblGeneID',
'EnsemblGeneName', 'ConsequenceTerm'))
return consequences
132 changes: 52 additions & 80 deletions consequence_prediction/vep_mapping_pipeline/consequence_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,13 @@
from retry import retry

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--enable-distant-querying', action='store_true',
help='Enables a second iteration of querying VEP for distant gene variants, which is disabled by default'
)
parser.add_argument(
'--report-distance', action='store_true',
help='Report a distance to the gene for upstream/downstream gene variants. Disabled by default'
)

logging.basicConfig()
logger = logging.getLogger('consequence_mapping')
logger.setLevel(logging.INFO)

# The "distance to the nearest gene" parameters, used to query VEP in first and second iterations, respectively.
# The "distance to the nearest gene" parameters, used to query VEP.
VEP_SHORT_QUERY_DISTANCE = 5000
VEP_LONG_QUERY_DISTANCE = 500000


def deduplicate_list(lst):
Expand All @@ -54,7 +45,7 @@ def vep_id_to_colon_id(vep_id):


@retry(tries=10, delay=5, backoff=1.2, jitter=(1, 3), logger=logger)
def query_vep(variants, search_distance):
def query_vep(variants, search_distance=VEP_SHORT_QUERY_DISTANCE):
"""Query VEP and return results in JSON format. Upstream/downstream genes are searched up to a given distance."""
ensembl_request_url = 'https://rest.ensembl.org/vep/human/region'
headers = {'Content-Type': 'application/json', 'Accept': 'application/json'}
Expand Down Expand Up @@ -97,54 +88,67 @@ def load_consequence_severity_rank():
return {term: index for index, term in enumerate(get_severity_ranking())}


def extract_consequences(vep_results, acceptable_biotypes, only_closest, results_by_variant, report_distance=False):
def most_severe_consequence(consequence_terms, consequence_term_severity_rank):
return min(consequence_terms, key=lambda term: consequence_term_severity_rank[term])


def most_severe_consequence_per_gene(variant_identifier, consequences):
results = []
consequence_term_severity_rank = load_consequence_severity_rank()
consequences_per_gene = defaultdict(list)
for c in consequences:
key = (c['gene_id'], c.get('gene_symbol', ''))
consequences_per_gene[key].extend(term for term in c['consequence_terms'])
for (gene_id, gene_symbol), terms in consequences_per_gene.items():
most_severe_consequence_term = most_severe_consequence(terms, consequence_term_severity_rank)
results.append((variant_identifier, gene_id, gene_symbol, most_severe_consequence_term))
return results


def overall_most_severe_consequence(variant_identifier, consequences):
results = []
consequence_term_severity_rank = load_consequence_severity_rank()
# Flatten the list of consequence terms and find the most severe one
all_consequence_terms = [term for c in consequences for term in c['consequence_terms']]
most_severe_consequence_term = most_severe_consequence(all_consequence_terms, consequence_term_severity_rank)

# Keep only consequences which include the most severe consequence term.
for c in consequences:
if most_severe_consequence_term in c['consequence_terms']:
results.append((variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term))
return results


def extract_consequences(vep_results, acceptable_biotypes):
"""Given VEP results, return a list of consequences matching certain criteria.
Args:
vep_results: results obtained from VEP for a list of variants, in JSON format.
acceptable_biotypes: a list of transcript biotypes to consider (as defined in Ensembl documentation, see
https://www.ensembl.org/info/genome/genebuild/biotypes.html). Consequences for other transcript biotypes
will be ignored.
only_closest: if this flag is specified, then at most one consequence per variant will be returned. The
consequences are sorted by distance from the gene and the closest one is chosen. In case of a tie,
consequence is selected at random. If this flag is not specified, all consequences for each variant will
be returned.
results_by_variant: a dict to write the results into.
report_distance: if set to True, a distance from the variant to the gene it affects will be reported
(applicable to upstream and downstream gene variants). Otherwise, the distance parameter will always be 0.
"""
consequence_term_severity_rank = load_consequence_severity_rank()
results_by_variant = defaultdict(list)
for result in vep_results:
variant_identifier = result['input']
results_by_variant.setdefault(variant_identifier, [])
consequences = result.get('transcript_consequences', [])

# Keep only consequences with the allowed biotypes; if there are none, skip this variant
consequences = [c for c in consequences if c['biotype'] in acceptable_biotypes]
if not consequences:
continue

# Flatten the list of consequence terms and find the most severe one
all_consequence_terms = [term for c in consequences for term in c['consequence_terms']]
all_consequence_terms.sort(key=lambda term: consequence_term_severity_rank[term])
most_severe_consequence_term = all_consequence_terms[0]

# Keep only consequences which include the most severe consequence term; sort by increasing order of distance.
# If there is no 'distance' attribute in VEP results, it means that it is not applicable as the variant resides
# *inside* the gene; hence, in this case the distance is set to 0.
consequences = [c for c in consequences if most_severe_consequence_term in c['consequence_terms']]
consequences.sort(key=lambda consequence: abs(consequence.get('distance', 0)))
# If there is no 'distance' attribute in VEP results, it means the variant overlaps the gene.
overlapping_consequences = [c for c in consequences if 'distance' not in c]

# If mandated by a flag, keep only one (least distant) consequence
if only_closest:
consequences = [consequences[0]]

# Return a subset of fields (required for output) of filtered consequences
results_by_variant[variant_identifier].extend([
(variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term,
c.get('distance', 0) if report_distance else 0)
for c in consequences
])
# For genes overlapping the variant, we report the most severe consequence per gene.
if overlapping_consequences:
consequences_for_variant = most_severe_consequence_per_gene(variant_identifier, overlapping_consequences)
# If there are no consequences on overlapping genes, we take the overall most severe consequence and all genes
# associated with that consequence
else:
consequences_for_variant = overall_most_severe_consequence(variant_identifier, consequences)
results_by_variant[variant_identifier].extend(consequences_for_variant)

return results_by_variant

Expand All @@ -158,50 +162,21 @@ def get_variants_without_consequences(results_by_variant):
})


def process_variants(variants, enable_distant_querying=False, report_distance=False):
def process_variants(variants):
"""Given a list of variant IDs, return a list of consequence types (each including Ensembl gene name & ID and a
functional consequence code) for a given variant.
Args:
enable_distant_querying: If set to True, an additional VEP query will be performed for variants for which no
consequences were found during the first iteration, in an attempt to find distant variant consequences.
report_distance: Whether to report distance to the nearest gene for upstream and downstream gene variants.
See extract_consequences() for details.
"""

# First, we query VEP with default parameters, looking for variants affecting protein coding and miRNA transcripts
# Query VEP with default parameters, looking for variants affecting protein coding and miRNA transcripts
# up to a standard distance (5000 nucleotides either way, which is default for VEP) from the variant.
results_by_variant = {}
vep_results = query_vep(variants=variants, search_distance=VEP_SHORT_QUERY_DISTANCE)
results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'},
only_closest=False, results_by_variant=results_by_variant,
report_distance=report_distance)
vep_results = query_vep(variants=variants)
results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'})

# See if there are variants with no consequences up to the default distance
variants_without_consequences = get_variants_without_consequences(results_by_variant)
if variants_without_consequences:
logger.info('Found {} variant(s) without standard consequences: {}'.format(
len(variants_without_consequences), '|'.join(variants_without_consequences)))

if enable_distant_querying:
logger.info('Attempting to find distant consequences for the remaining variants')

# If there are, we will now do a second round of querying, this time looking only at protein coding biotypes
# (vs. miRNA *and* protein coding during the first round) up to a distance of 500,000 bases each way.
if variants_without_consequences:
distant_vep_results = query_vep(variants=variants_without_consequences,
search_distance=VEP_LONG_QUERY_DISTANCE)
extract_consequences(vep_results=distant_vep_results, acceptable_biotypes={'protein_coding'},
only_closest=True, results_by_variant=results_by_variant,
report_distance=report_distance)

# See if there are still variants with no consequences, even up to a wide search window
variants_without_consequences = get_variants_without_consequences(results_by_variant)
if variants_without_consequences:
logger.info('After distant querying, still remaining {} variant(s) without consequences: {}'.format(
len(variants_without_consequences), '|'.join(variants_without_consequences)
))

# Yield all consequences for all variants. Note they are not grouped by variant, all consequences are yielded in a
# common sequence.
for variant_id, variant_consequences in results_by_variant.items():
Expand All @@ -217,12 +192,9 @@ def main():
variants_to_query = [colon_based_id_to_vep_id(v) for v in sys.stdin.read().splitlines()]

# Query VEP with all variants at once (for the purpose of efficiency), print out the consequences to STDOUT.
consequences = process_variants(variants_to_query,
enable_distant_querying=args.enable_distant_querying,
report_distance=args.report_distance)
for variant_id, gene_id, gene_symbol, consequence_term, distance in consequences:
# The second column, set statically to 1, is not used, and is maintained for compatibility purposes
print('\t'.join([vep_id_to_colon_id(variant_id), '1', gene_id, gene_symbol, consequence_term, str(distance)]))
consequences = process_variants(variants_to_query)
for variant_id, gene_id, gene_symbol, consequence_term in consequences:
print('\t'.join([vep_id_to_colon_id(variant_id), gene_id, gene_symbol, consequence_term]))

logger.info('Successfully processed {} variants'.format(len(variants_to_query)))

Expand Down
10 changes: 5 additions & 5 deletions eva_cttv_pipeline/evidence_string_generation/consequence_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def process_consequence_type_dataframes(*dataframes):
for consequences_dataframe in dataframes:
for row in consequences_dataframe.itertuples():
variant_id = row[1]
ensembl_gene_id = row[3]
so_term = row[5]
ensembl_gene_id = row[2]
so_term = row[4]

process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term)

Expand All @@ -44,13 +44,13 @@ def process_consequence_type_file(snp_2_gene_file, consequence_type_dict=None):
line = line.rstrip()
line_list = line.split("\t")

if len(line_list) < 6:
if len(line_list) < 4:
logger.warning('Skip invalid line in snp_2_gene file: {}'.format(line))
continue

variant_id = line_list[0]
ensembl_gene_id = line_list[2]
so_term = line_list[4]
ensembl_gene_id = line_list[1]
so_term = line_list[3]

if ensembl_gene_id == 'NA':
logger.warning('Skip line with missing gene ID: {}'.format(line))
Expand Down
Loading

0 comments on commit a11b9cf

Please sign in to comment.