Skip to content

Commit

Permalink
refactor: major edits to VRS mapping and supporting methods
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson committed Jan 21, 2024
1 parent 8195ae1 commit d978501
Show file tree
Hide file tree
Showing 6 changed files with 246 additions and 116 deletions.
6 changes: 3 additions & 3 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Transcript selection:
* Tests will need some extensive mocking (or cassettes?) for reliance on UTA and other external dependencies

VRS mapping:
* In general, this stuff is still pretty rough
* Finish the SeqRepo storage workaround
* In general, this stuff is still pretty rough. Tests aren't passing yet.
* Finish double-checking the SeqRepo storage workaround
* A fair number of small questions about conditions written to handle specific scoresets/edge cases
* More testing. Can be ready for CI by mocking the SequenceStore class (or using SeqRepoRESTDataProxy and cassettes).
* More testing. Can be ready for CI by manually patching the SequenceStore class (or using SeqRepoRESTDataProxy and cassettes).
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,9 @@ src = ["src"]
# pydocstyle (D)
# pep8-naming (N)
# isort (I)
select = ["E", "W", "F", "ANN", "D", "N", "I", "T201"]
fixable = ["I", "F401", "T201"]
select = ["E", "W", "F", "ANN", "D", "N", "I", "T201", "T100"]
fixable = ["I", "F401", "T201", "T100"]
include = ["pyproject.toml", "tests/**/*.py", "src/**/*.py"]

# ANN101 - missing-type-self
# ANN003 - missing-type-kwargs
Expand Down
42 changes: 24 additions & 18 deletions src/dcd_mapping/lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from cool_seq_tool.schemas import TranscriptPriority
from ga4gh.core._internal.models import Extension, Gene
from ga4gh.vrs._internal.models import Allele, SequenceLocation
from ga4gh.vrs.dataproxy import SeqRepoDataProxy
from ga4gh.vrs.extras.translator import AlleleTranslator
from gene.database import create_db
from gene.query import QueryHandler
Expand All @@ -23,7 +24,6 @@
"CoolSeqToolBuilder",
"get_seqrepo",
"GeneNormalizerBuilder",
"VrsTranslatorBuilder",
"get_protein_accession",
"get_transcripts",
"get_gene_symbol",
Expand All @@ -33,7 +33,7 @@
"get_chromosome_identifier_from_vrs_id",
"get_sequence",
"store_sequence",
"hgvs_to_vrs",
"translate_hgvs_to_vrs",
"get_mane_transcripts",
"get_uniprot_sequence",
]
Expand Down Expand Up @@ -76,17 +76,20 @@ def __new__(cls) -> QueryHandler:
return cls.instance


class VrsTranslatorBuilder:
"""Singleton constructor for VRS-Python translator instance."""
class TranslatorBuilder:
"""Singleton constructor for VRS Translator instance."""

def __new__(cls) -> AlleleTranslator:
"""Provide VRS-Python translator. Construct if unavailable.
def __new__(cls, data_proxy: SeqRepoDataProxy) -> AlleleTranslator:
"""Provide translator instance. Constructs it if unavailable. Use a new
``data_proxy`` instance that contains a given score row's sequence/ID.
:return: singleton instances of Translator
:return: singleton instance of ``AlleleTranslator``
"""
if not hasattr(cls, "instance"):
cst = CoolSeqToolBuilder()
cls.instance = AlleleTranslator(cst.seqrepo_access, normalize=False)
tr = AlleleTranslator(data_proxy, normalize=False)
cls.instance = tr
else:
cls.instance.data_proxy = data_proxy
return cls.instance


Expand Down Expand Up @@ -280,15 +283,17 @@ def get_gene_location(metadata: ScoresetMetadata) -> Optional[GeneLocation]:
def get_chromosome_identifier(chromosome: str) -> str:
"""Get latest NC_ accession identifier given a chromosome name.
:param chromosome: prefix-free chromosome name, e.g. ``"8"``, ``"X"``
:param chromosome: chromosome name, e.g. ``"8"``, ``"X"``
:return: latest ID if available
:raise KeyError: if unable to retrieve identifier
"""
if not chromosome.startswith("chr"):
chromosome = f"chr{chromosome}"
sr = CoolSeqToolBuilder().seqrepo_access
acs = []
for assembly in ["GRCh38", "GRCh37"]:
tmp_acs, _ = sr.translate_identifier(
f"{assembly}:chr{chromosome}", target_namespaces="refseq"
f"{assembly}:{chromosome}", target_namespaces="refseq"
)
for ac in tmp_acs:
acs.append(ac.split("refseq:")[-1])
Expand Down Expand Up @@ -378,18 +383,15 @@ def store_sequence(sequence: str, names: List[Dict]) -> None:
# -------------------------------- VRS-Python -------------------------------- #


def hgvs_to_vrs(hgvs: str, alias_map: Dict) -> Allele:
def translate_hgvs_to_vrs(hgvs: str, data_proxy: SeqRepoDataProxy) -> Allele:
"""Convert HGVS variation description to VRS object.
# TODO incorporate alias map
:param hgvs: MAVE-HGVS variation string
:param alias_map: lookup for custom sequence IDs
:param data_proxy: data proxy instance handed to ``TranslatorBuilder`` to provide the translator
:return: Corresponding VRS allele as a Pydantic class
"""
tr = VrsTranslatorBuilder()
vrs_allele = tr.translate_from(hgvs, "hgvs")
allele = Allele(**vrs_allele)
tr = TranslatorBuilder(data_proxy)
allele = tr.translate_from(hgvs, "hgvs")

if (
not isinstance(allele.location, SequenceLocation)
Expand All @@ -398,6 +400,10 @@ def hgvs_to_vrs(hgvs: str, alias_map: Dict) -> Allele:
):
raise ValueError

# TODO temporary, remove
if not isinstance(allele, Allele):
raise NotImplementedError

return allele


Expand Down
2 changes: 1 addition & 1 deletion src/dcd_mapping/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def _save_results(
"""
outfile = LOCAL_STORE_PATH / f"{metadata.urn}_mapping_results.json"
with open(outfile, "w") as f:
json.dump(mapping_results.model_dump(), f, indent=2)
json.dump(mapping_results.model_dump(exclude_none=True), f, indent=2)
return outfile


Expand Down
17 changes: 9 additions & 8 deletions src/dcd_mapping/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from cool_seq_tool.schemas import Strand, TranscriptPriority
from ga4gh.vrs._internal.models import Allele, Haplotype
from pydantic import BaseModel, StrictBool, StrictInt
from pydantic import BaseModel, StrictBool, StrictFloat, StrictInt, StrictStr


class TargetSequenceType(StrEnum):
Expand Down Expand Up @@ -125,13 +125,14 @@ class TxSelectResult(BaseModel):


class VrsMapping(BaseModel):
"""Define pre-post mapping pair structure for VRS-structured variations.
Probably need to add score and accession to make json writing easier
"""

pre_mapping: Union[Allele, Haplotype]
mapped: Union[Allele, Haplotype]
"""Define pre-post mapping pair structure for VRS-structured variations."""

mavedb_id: StrictStr
pre_mapped: Union[Allele, Haplotype]
post_mapped: Union[Allele, Haplotype]
mapped_transcript: Optional[TranscriptDescription] = None
score: StrictFloat
# relation: Literal["SO:is_homologous_to"] = "SO:is_homologous_to"


class VrsMappingResult(BaseModel):
Expand Down
Loading

0 comments on commit d978501

Please sign in to comment.