From 8766821ee4e28419b878b38d1e9661ef6e03d927 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <50678786+jarbesfeld@users.noreply.github.com> Date: Thu, 24 Oct 2024 09:21:11 -0400 Subject: [PATCH] Correct alignment logic, ensure proper variant selection for post-mapped genomic variants (#70) --- src/dcd_mapping/align.py | 40 +++++++++++++++++++++++++------------- src/dcd_mapping/lookup.py | 10 +++++++--- src/dcd_mapping/schemas.py | 2 +- src/dcd_mapping/vrs_map.py | 14 ++++++------- 4 files changed, 41 insertions(+), 25 deletions(-) diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index 9925089..cdadb2f 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -4,6 +4,9 @@ import os import subprocess import tempfile +from collections import namedtuple +from itertools import groupby +from operator import attrgetter from pathlib import Path from urllib.parse import urlparse @@ -246,19 +249,30 @@ def _get_best_hsp( :raise AlignmentError: if hit object appears to be empty (should be impossible) """ best_hsp = None - hsp_list = ( - sorted(hit, key=lambda hsp: abs(hsp.hit_start - gene_location.start)) - if gene_location - else hit - ) - hsp_list = sorted( - hsp_list, - key=lambda hsp: (hsp.query_end - hsp.query_start) / output.seq_len, - reverse=True, - ) - best_hsp = hsp_list[ - 0 - ] # Select hit with lowest distance from gene and highest score + hsp_list = [] + if gene_location is None: # If gene data is not present, sort by coverage + hsp_list = sorted( + hit, key=lambda hsp: (hsp.query_end - hsp.query_start) / output.seq_len + ) + best_hsp = hsp_list[0] + else: # Sort by distance from gene start, than by coverage + for hsp in hit: + BlatOutput = namedtuple("BlatOutput", ["hsp", "distance", "coverage"]) + hsp_list.append( + BlatOutput( + hsp, + abs(hsp.hit_start - gene_location.start), + (hsp.query_end - hsp.query_start) / output.seq_len, + ) + ) + hsp_list = sorted(hsp_list, key=attrgetter("distance")) + hsp_list_sorted = [] + for _, group in groupby(hsp_list, key=attrgetter("distance")): + sorted_by_coverage = sorted(group, key=attrgetter("coverage"), reverse=True) + hsp_list_sorted.append(list(sorted_by_coverage)) + hsp_list = [item for sublist in hsp_list_sorted for item in sublist] + + best_hsp = hsp_list[0].hsp if best_hsp is None: _logger.error( "Unable to get best HSP from hit -- this should be impossible? urn: %s, hit: %s", diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py index ad1e0c6..4905f25 100644 --- a/src/dcd_mapping/lookup.py +++ b/src/dcd_mapping/lookup.py @@ -288,7 +288,7 @@ def get_normalized_gene_response( return gene_descriptor # try taking the first word in the target name - if metadata.target_gene_name: + if metadata.target_gene_name and "Minigene" not in metadata.target_gene_name: parsed_name = "" if "_" in metadata.target_gene_name: parsed_name = metadata.target_gene_name.split("_") @@ -489,7 +489,7 @@ def get_sequence( # -------------------------------- VRS-Python -------------------------------- # -def translate_hgvs_to_vrs(hgvs: str) -> Allele: +def translate_hgvs_to_vrs(hgvs: str) -> Allele | None: """Convert HGVS variation description to VRS object. :param hgvs: MAVE-HGVS variation string @@ -500,7 +500,11 @@ def translate_hgvs_to_vrs(hgvs: str) -> Allele: hgvs = hgvs.replace(":c.", ":g.") tr = TranslatorBuilder(get_seqrepo()) - allele: Allele = tr.translate_from(hgvs, "hgvs", do_normalize=False) + try: + allele: Allele = tr.translate_from(hgvs, "hgvs", do_normalize=False) + except ValueError as warn: + _logger.warning(warn) + return None if ( not isinstance(allele.location, SequenceLocation) diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index 1a339fd..1730f76 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -145,7 +145,7 @@ class MappedScore(BaseModel): annotation_layer: AnnotationLayer score: str | None pre_mapped: Allele | CisPhasedBlock - post_mapped: Allele | CisPhasedBlock + post_mapped: Allele | CisPhasedBlock | None class ScoreAnnotation(BaseModel): diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index 67b9f94..5b9d884 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -267,7 +267,7 @@ def _map_protein_coding( align_result, False, ) - if pre_mapped_genomic is None or post_mapped_genomic is None: + if pre_mapped_genomic is None and post_mapped_genomic is None: _logger.warning( "Encountered apparently invalid genomic variants in %s: %s", row.accession, @@ -330,7 +330,7 @@ def _map_regulatory_noncoding( False, offset=0, ) - if not pre_map_allele or not post_map_allele: + if not pre_map_allele and not post_map_allele: msg = "Genomic variations missing" raise VrsMapError(msg) variations.append( @@ -393,6 +393,8 @@ def _get_variation( for hgvs_string in hgvs_strings: # Generate VRS Allele structure. Set VA digests and SL digests to None allele = translate_hgvs_to_vrs(hgvs_string) + if allele is None: + break allele.id = None allele.digest = None allele.location.id = None @@ -462,14 +464,10 @@ def _get_variation( allele.id = ga4gh_identify(allele) # Check if the start of an allele is covered by the alignment block for - # genomic variants + # post-mapped genomic variants if layer == AnnotationLayer.GENOMIC: if pre_map: - if ( - allele.location.start >= alignment.query_range.start - and allele.location.start < alignment.query_range.end - ): - alleles.append(allele) + alleles.append(allele) else: if ( allele.location.start >= alignment.hit_range.start