Skip to content

Commit

Permalink
Correct alignment logic, ensure proper variant selection for post-map…
Browse files Browse the repository at this point in the history
…ped genomic variants (#70)
  • Loading branch information
jarbesfeld authored Oct 24, 2024
1 parent bd08532 commit 8766821
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 25 deletions.
40 changes: 27 additions & 13 deletions src/dcd_mapping/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
import os
import subprocess
import tempfile
from collections import namedtuple
from itertools import groupby
from operator import attrgetter
from pathlib import Path
from urllib.parse import urlparse

Expand Down Expand Up @@ -246,19 +249,30 @@ def _get_best_hsp(
:raise AlignmentError: if hit object appears to be empty (should be impossible)
"""
best_hsp = None
hsp_list = (
sorted(hit, key=lambda hsp: abs(hsp.hit_start - gene_location.start))
if gene_location
else hit
)
hsp_list = sorted(
hsp_list,
key=lambda hsp: (hsp.query_end - hsp.query_start) / output.seq_len,
reverse=True,
)
best_hsp = hsp_list[
0
] # Select hit with lowest distance from gene and highest score
hsp_list = []
if gene_location is None: # If gene data is not present, sort by coverage
hsp_list = sorted(
hit, key=lambda hsp: (hsp.query_end - hsp.query_start) / output.seq_len
)
best_hsp = hsp_list[0]
else: # Sort by distance from gene start, than by coverage
for hsp in hit:
BlatOutput = namedtuple("BlatOutput", ["hsp", "distance", "coverage"])
hsp_list.append(
BlatOutput(
hsp,
abs(hsp.hit_start - gene_location.start),
(hsp.query_end - hsp.query_start) / output.seq_len,
)
)
hsp_list = sorted(hsp_list, key=attrgetter("distance"))
hsp_list_sorted = []
for _, group in groupby(hsp_list, key=attrgetter("distance")):
sorted_by_coverage = sorted(group, key=attrgetter("coverage"), reverse=True)
hsp_list_sorted.append(list(sorted_by_coverage))
hsp_list = [item for sublist in hsp_list_sorted for item in sublist]

best_hsp = hsp_list[0].hsp
if best_hsp is None:
_logger.error(
"Unable to get best HSP from hit -- this should be impossible? urn: %s, hit: %s",
Expand Down
10 changes: 7 additions & 3 deletions src/dcd_mapping/lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def get_normalized_gene_response(
return gene_descriptor

# try taking the first word in the target name
if metadata.target_gene_name:
if metadata.target_gene_name and "Minigene" not in metadata.target_gene_name:
parsed_name = ""
if "_" in metadata.target_gene_name:
parsed_name = metadata.target_gene_name.split("_")
Expand Down Expand Up @@ -489,7 +489,7 @@ def get_sequence(
# -------------------------------- VRS-Python -------------------------------- #


def translate_hgvs_to_vrs(hgvs: str) -> Allele:
def translate_hgvs_to_vrs(hgvs: str) -> Allele | None:
"""Convert HGVS variation description to VRS object.
:param hgvs: MAVE-HGVS variation string
Expand All @@ -500,7 +500,11 @@ def translate_hgvs_to_vrs(hgvs: str) -> Allele:
hgvs = hgvs.replace(":c.", ":g.")

tr = TranslatorBuilder(get_seqrepo())
allele: Allele = tr.translate_from(hgvs, "hgvs", do_normalize=False)
try:
allele: Allele = tr.translate_from(hgvs, "hgvs", do_normalize=False)
except ValueError as warn:
_logger.warning(warn)
return None

if (
not isinstance(allele.location, SequenceLocation)
Expand Down
2 changes: 1 addition & 1 deletion src/dcd_mapping/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ class MappedScore(BaseModel):
annotation_layer: AnnotationLayer
score: str | None
pre_mapped: Allele | CisPhasedBlock
post_mapped: Allele | CisPhasedBlock
post_mapped: Allele | CisPhasedBlock | None


class ScoreAnnotation(BaseModel):
Expand Down
14 changes: 6 additions & 8 deletions src/dcd_mapping/vrs_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def _map_protein_coding(
align_result,
False,
)
if pre_mapped_genomic is None or post_mapped_genomic is None:
if pre_mapped_genomic is None and post_mapped_genomic is None:
_logger.warning(
"Encountered apparently invalid genomic variants in %s: %s",
row.accession,
Expand Down Expand Up @@ -330,7 +330,7 @@ def _map_regulatory_noncoding(
False,
offset=0,
)
if not pre_map_allele or not post_map_allele:
if not pre_map_allele and not post_map_allele:
msg = "Genomic variations missing"
raise VrsMapError(msg)
variations.append(
Expand Down Expand Up @@ -393,6 +393,8 @@ def _get_variation(
for hgvs_string in hgvs_strings:
# Generate VRS Allele structure. Set VA digests and SL digests to None
allele = translate_hgvs_to_vrs(hgvs_string)
if allele is None:
break
allele.id = None
allele.digest = None
allele.location.id = None
Expand Down Expand Up @@ -462,14 +464,10 @@ def _get_variation(
allele.id = ga4gh_identify(allele)

# Check if the start of an allele is covered by the alignment block for
# genomic variants
# post-mapped genomic variants
if layer == AnnotationLayer.GENOMIC:
if pre_map:
if (
allele.location.start >= alignment.query_range.start
and allele.location.start < alignment.query_range.end
):
alleles.append(allele)
alleles.append(allele)
else:
if (
allele.location.start >= alignment.hit_range.start
Expand Down

0 comments on commit 8766821

Please sign in to comment.