Merge pull request #326 from apriltuesday/issue-321

Update consequence severity filtering
EBIvariation · May 13, 2022 · a11b9cf · a11b9cf
2 parents ccd41a6 + 9475ae3
commit a11b9cf
Show file tree

Hide file tree

Showing 10 changed files with 2,283 additions and 2,225 deletions.
diff --git a/consequence_prediction/repeat_expansion_variants/pipeline.py b/consequence_prediction/repeat_expansion_variants/pipeline.py
@@ -196,12 +196,9 @@ def extract_consequences(variants):
     # Get rid of sets
     consequences['RepeatType'] = consequences['RepeatType'].apply(list)
     consequences = consequences.explode('RepeatType')
-    # Form a six-column file compatible with the consequence mapping pipeline, for example:
-    # RCV000005966    1    ENSG00000156475    PPP2R2B    trinucleotide_repeat_expansion    0
-    consequences['PlaceholderOnes'] = 1
-    consequences['PlaceholderZeroes'] = 0
-    consequences = consequences[['RCVaccession', 'PlaceholderOnes', 'EnsemblGeneID', 'EnsemblGeneName', 'RepeatType',
-                                 'PlaceholderZeroes']]
+    # Form a four-column file compatible with the consequence mapping pipeline, for example:
+    # RCV000005966    ENSG00000156475    PPP2R2B    trinucleotide_repeat_expansion
+    consequences = consequences[['RCVaccession', 'EnsemblGeneID', 'EnsemblGeneName', 'RepeatType']]
     consequences.sort_values(by=['RepeatType', 'RCVaccession', 'EnsemblGeneID'], inplace=True)
     # Check that there are no empty cells in the final consequences table
     assert consequences.isnull().to_numpy().sum() == 0

diff --git a/consequence_prediction/structural_variants/pipeline.py b/consequence_prediction/structural_variants/pipeline.py
@@ -3,8 +3,8 @@
 
 import pandas as pd
 
-from consequence_prediction.vep_mapping_pipeline.consequence_mapping import query_vep, VEP_SHORT_QUERY_DISTANCE, \
-    extract_consequences, deduplicate_list
+from consequence_prediction.vep_mapping_pipeline.consequence_mapping import query_vep, extract_consequences, \
+    deduplicate_list
 from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io import ClinVarDataset
 from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.hgvs_variant import HgvsVariant, VariantType, SequenceType
 
@@ -61,7 +61,7 @@ def get_vep_results(clinvar_xml):
     i = 0
     vep_results = []
     for group in grouper(variants, n=200):
-        vep_results.extend(query_vep(variants=group, search_distance=VEP_SHORT_QUERY_DISTANCE))
+        vep_results.extend(query_vep(variants=group))
         i += 1
         logger.info(f'Done with batch {i}')
 
@@ -70,20 +70,16 @@ def get_vep_results(clinvar_xml):
 
 def main(clinvar_xml):
     vep_results = get_vep_results(clinvar_xml)
-    results_by_variant = {}
-    results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'},
-                                              only_closest=False, results_by_variant=results_by_variant,
-                                              report_distance=False)
+    results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'})
     variant_data = []
     for variant_id, variant_consequences in results_by_variant.items():
         for consequence_to_yield in deduplicate_list(variant_consequences):
-            variant_id, gene_id, gene_symbol, consequence_term, distance = consequence_to_yield
+            variant_id, gene_id, gene_symbol, consequence_term = consequence_to_yield
             # variant_id here is the entire input to VEP, we only want the HGVS that we passed as an identifier
             hgvs_id = variant_id.split()[-1]
-            # The second column, set statically to 1, is not used, and is maintained for compatibility purposes
-            variant_data.append((hgvs_id, '1', gene_id, gene_symbol, consequence_term, distance))
+            variant_data.append((hgvs_id, gene_id, gene_symbol, consequence_term))
 
     # Return as a dataframe to be compatible with repeat expansion pipeline
-    consequences = pd.DataFrame(variant_data, columns=('VariantID', 'PlaceholderOnes', 'EnsemblGeneID',
-                                                       'EnsemblGeneName', 'ConsequenceTerm', 'Distance'))
+    consequences = pd.DataFrame(variant_data, columns=('VariantID', 'EnsemblGeneID',
+                                                       'EnsemblGeneName', 'ConsequenceTerm'))
     return consequences
diff --git a/consequence_prediction/vep_mapping_pipeline/consequence_mapping.py b/consequence_prediction/vep_mapping_pipeline/consequence_mapping.py
@@ -15,22 +15,13 @@
 from retry import retry
 
 parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    '--enable-distant-querying', action='store_true',
-    help='Enables a second iteration of querying VEP for distant gene variants, which is disabled by default'
-)
-parser.add_argument(
-    '--report-distance', action='store_true',
-    help='Report a distance to the gene for upstream/downstream gene variants. Disabled by default'
-)
 
 logging.basicConfig()
 logger = logging.getLogger('consequence_mapping')
 logger.setLevel(logging.INFO)
 
-# The "distance to the nearest gene" parameters, used to query VEP in first and second iterations, respectively.
+# The "distance to the nearest gene" parameters, used to query VEP.
 VEP_SHORT_QUERY_DISTANCE = 5000
-VEP_LONG_QUERY_DISTANCE = 500000
 
 
 def deduplicate_list(lst):
@@ -54,7 +45,7 @@ def vep_id_to_colon_id(vep_id):
 
 
 @retry(tries=10, delay=5, backoff=1.2, jitter=(1, 3), logger=logger)
-def query_vep(variants, search_distance):
+def query_vep(variants, search_distance=VEP_SHORT_QUERY_DISTANCE):
     """Query VEP and return results in JSON format. Upstream/downstream genes are searched up to a given distance."""
     ensembl_request_url = 'https://rest.ensembl.org/vep/human/region'
     headers = {'Content-Type': 'application/json', 'Accept': 'application/json'}
@@ -97,54 +88,67 @@ def load_consequence_severity_rank():
     return {term: index for index, term in enumerate(get_severity_ranking())}
 
 
-def extract_consequences(vep_results, acceptable_biotypes, only_closest, results_by_variant, report_distance=False):
+def most_severe_consequence(consequence_terms, consequence_term_severity_rank):
+    return min(consequence_terms, key=lambda term: consequence_term_severity_rank[term])
+
+
+def most_severe_consequence_per_gene(variant_identifier, consequences):
+    results = []
+    consequence_term_severity_rank = load_consequence_severity_rank()
+    consequences_per_gene = defaultdict(list)
+    for c in consequences:
+        key = (c['gene_id'], c.get('gene_symbol', ''))
+        consequences_per_gene[key].extend(term for term in c['consequence_terms'])
+    for (gene_id, gene_symbol), terms in consequences_per_gene.items():
+        most_severe_consequence_term = most_severe_consequence(terms, consequence_term_severity_rank)
+        results.append((variant_identifier, gene_id, gene_symbol, most_severe_consequence_term))
+    return results
+
+
+def overall_most_severe_consequence(variant_identifier, consequences):
+    results = []
+    consequence_term_severity_rank = load_consequence_severity_rank()
+    # Flatten the list of consequence terms and find the most severe one
+    all_consequence_terms = [term for c in consequences for term in c['consequence_terms']]
+    most_severe_consequence_term = most_severe_consequence(all_consequence_terms, consequence_term_severity_rank)
+
+    # Keep only consequences which include the most severe consequence term.
+    for c in consequences:
+        if most_severe_consequence_term in c['consequence_terms']:
+            results.append((variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term))
+    return results
+
+
+def extract_consequences(vep_results, acceptable_biotypes):
     """Given VEP results, return a list of consequences matching certain criteria.
 
     Args:
         vep_results: results obtained from VEP for a list of variants, in JSON format.
         acceptable_biotypes: a list of transcript biotypes to consider (as defined in Ensembl documentation, see
             https://www.ensembl.org/info/genome/genebuild/biotypes.html). Consequences for other transcript biotypes
             will be ignored.
-        only_closest: if this flag is specified, then at most one consequence per variant will be returned. The
-            consequences are sorted by distance from the gene and the closest one is chosen. In case of a tie,
-            consequence is selected at random. If this flag is not specified, all consequences for each variant will
-            be returned.
-        results_by_variant: a dict to write the results into.
-        report_distance: if set to True, a distance from the variant to the gene it affects will be reported
-            (applicable to upstream and downstream gene variants). Otherwise, the distance parameter will always be 0.
     """
-    consequence_term_severity_rank = load_consequence_severity_rank()
+    results_by_variant = defaultdict(list)
     for result in vep_results:
         variant_identifier = result['input']
-        results_by_variant.setdefault(variant_identifier, [])
         consequences = result.get('transcript_consequences', [])
 
         # Keep only consequences with the allowed biotypes; if there are none, skip this variant
         consequences = [c for c in consequences if c['biotype'] in acceptable_biotypes]
         if not consequences:
             continue
 
-        # Flatten the list of consequence terms and find the most severe one
-        all_consequence_terms = [term for c in consequences for term in c['consequence_terms']]
-        all_consequence_terms.sort(key=lambda term: consequence_term_severity_rank[term])
-        most_severe_consequence_term = all_consequence_terms[0]
-
-        # Keep only consequences which include the most severe consequence term; sort by increasing order of distance.
-        # If there is no 'distance' attribute in VEP results, it means that it is not applicable as the variant resides
-        # *inside* the gene; hence, in this case the distance is set to 0.
-        consequences = [c for c in consequences if most_severe_consequence_term in c['consequence_terms']]
-        consequences.sort(key=lambda consequence: abs(consequence.get('distance', 0)))
+        # If there is no 'distance' attribute in VEP results, it means the variant overlaps the gene.
+        overlapping_consequences = [c for c in consequences if 'distance' not in c]
 
-        # If mandated by a flag, keep only one (least distant) consequence
-        if only_closest:
-            consequences = [consequences[0]]
-
-        # Return a subset of fields (required for output) of filtered consequences
-        results_by_variant[variant_identifier].extend([
-            (variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term,
-             c.get('distance', 0) if report_distance else 0)
-            for c in consequences
-        ])
+        # For genes overlapping the variant, we report the most severe consequence per gene.
+        if overlapping_consequences:
+            consequences_for_variant = most_severe_consequence_per_gene(variant_identifier, overlapping_consequences)
+        # If there are no consequences on overlapping genes, we take the overall most severe consequence and all genes
+        # associated with that consequence
+        else:
+            consequences_for_variant = overall_most_severe_consequence(variant_identifier, consequences)
+        results_by_variant[variant_identifier].extend(consequences_for_variant)
 
     return results_by_variant
 
@@ -158,50 +162,21 @@ def get_variants_without_consequences(results_by_variant):
     })
 
 
-def process_variants(variants, enable_distant_querying=False, report_distance=False):
+def process_variants(variants):
     """Given a list of variant IDs, return a list of consequence types (each including Ensembl gene name & ID and a
     functional consequence code) for a given variant.
-
-    Args:
-        enable_distant_querying: If set to True, an additional VEP query will be performed for variants for which no
-            consequences were found during the first iteration, in an attempt to find distant variant consequences.
-        report_distance: Whether to report distance to the nearest gene for upstream and downstream gene variants.
-            See extract_consequences() for details.
     """
-
-    # First, we query VEP with default parameters, looking for variants affecting protein coding and miRNA transcripts
+    # Query VEP with default parameters, looking for variants affecting protein coding and miRNA transcripts
     # up to a standard distance (5000 nucleotides either way, which is default for VEP) from the variant.
-    results_by_variant = {}
-    vep_results = query_vep(variants=variants, search_distance=VEP_SHORT_QUERY_DISTANCE)
-    results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'},
-                                              only_closest=False, results_by_variant=results_by_variant,
-                                              report_distance=report_distance)
+    vep_results = query_vep(variants=variants)
+    results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'})
 
     # See if there are variants with no consequences up to the default distance
     variants_without_consequences = get_variants_without_consequences(results_by_variant)
     if variants_without_consequences:
         logger.info('Found {} variant(s) without standard consequences: {}'.format(
             len(variants_without_consequences), '|'.join(variants_without_consequences)))
 
-        if enable_distant_querying:
-            logger.info('Attempting to find distant consequences for the remaining variants')
-
-            # If there are, we will now do a second round of querying, this time looking only at protein coding biotypes
-            # (vs. miRNA *and* protein coding during the first round) up to a distance of 500,000 bases each way.
-            if variants_without_consequences:
-                distant_vep_results = query_vep(variants=variants_without_consequences,
-                                                search_distance=VEP_LONG_QUERY_DISTANCE)
-                extract_consequences(vep_results=distant_vep_results, acceptable_biotypes={'protein_coding'},
-                                     only_closest=True, results_by_variant=results_by_variant,
-                                     report_distance=report_distance)
-
-            # See if there are still variants with no consequences, even up to a wide search window
-            variants_without_consequences = get_variants_without_consequences(results_by_variant)
-            if variants_without_consequences:
-                logger.info('After distant querying, still remaining {} variant(s) without consequences: {}'.format(
-                    len(variants_without_consequences), '|'.join(variants_without_consequences)
-                ))
-
     # Yield all consequences for all variants. Note they are not grouped by variant, all consequences are yielded in a
     # common sequence.
     for variant_id, variant_consequences in results_by_variant.items():
@@ -217,12 +192,9 @@ def main():
     variants_to_query = [colon_based_id_to_vep_id(v) for v in sys.stdin.read().splitlines()]
 
     # Query VEP with all variants at once (for the purpose of efficiency), print out the consequences to STDOUT.
-    consequences = process_variants(variants_to_query,
-                                    enable_distant_querying=args.enable_distant_querying,
-                                    report_distance=args.report_distance)
-    for variant_id, gene_id, gene_symbol, consequence_term, distance in consequences:
-        # The second column, set statically to 1, is not used, and is maintained for compatibility purposes
-        print('\t'.join([vep_id_to_colon_id(variant_id), '1', gene_id, gene_symbol, consequence_term, str(distance)]))
+    consequences = process_variants(variants_to_query)
+    for variant_id, gene_id, gene_symbol, consequence_term in consequences:
+        print('\t'.join([vep_id_to_colon_id(variant_id), gene_id, gene_symbol, consequence_term]))
 
     logger.info('Successfully processed {} variants'.format(len(variants_to_query)))
 

diff --git a/eva_cttv_pipeline/evidence_string_generation/consequence_type.py b/eva_cttv_pipeline/evidence_string_generation/consequence_type.py
@@ -22,8 +22,8 @@ def process_consequence_type_dataframes(*dataframes):
     for consequences_dataframe in dataframes:
         for row in consequences_dataframe.itertuples():
             variant_id = row[1]
-            ensembl_gene_id = row[3]
-            so_term = row[5]
+            ensembl_gene_id = row[2]
+            so_term = row[4]
 
             process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term)
 
@@ -44,13 +44,13 @@ def process_consequence_type_file(snp_2_gene_file, consequence_type_dict=None):
             line = line.rstrip()
             line_list = line.split("\t")
 
-            if len(line_list) < 6:
+            if len(line_list) < 4:
                 logger.warning('Skip invalid line in snp_2_gene file: {}'.format(line))
                 continue
 
             variant_id = line_list[0]
-            ensembl_gene_id = line_list[2]
-            so_term = line_list[4]
+            ensembl_gene_id = line_list[1]
+            so_term = line_list[3]
 
             if ensembl_gene_id == 'NA':
                 logger.warning('Skip line with missing gene ID: {}'.format(line))