refactor: major edits to VRS mapping and supporting methods

GenomicMedLab · Jan 21, 2024 · d978501 · d978501
1 parent 8195ae1
commit d978501
Show file tree

Hide file tree

Showing 6 changed files with 246 additions and 116 deletions.
diff --git a/TODO.md b/TODO.md
@@ -18,7 +18,7 @@ Transcript selection:
 * Tests will need some extensive mocking (or cassettes?) for reliance on UTA and other external dependencies
 
 VRS mapping:
-* In general, this stuff is still pretty rough
-* Finish the SeqRepo storage workaround
+* In general, this stuff is still pretty rough. Tests aren't passing yet.
+* Finish double-checking the SeqRepo storage workaround
 * A fair amount of small questions about conditions written to handle specific scoresets/edge cases
-* More testing. Can be ready for CI by mocking the SequenceStore class (or using SeqRepoRESTDataProxy and cassettes).
+* More testing. Can be ready for CI by manually patching the SequenceStore class (or using SeqRepoRESTDataProxy and cassettes).
diff --git a/pyproject.toml b/pyproject.toml
@@ -80,8 +80,9 @@ src = ["src"]
 # pydocstyle (D)
 # pep8-naming (N)
 # isort (I)
-select = ["E", "W", "F", "ANN", "D", "N", "I", "T201"]
-fixable = ["I", "F401", "T201"]
+select = ["E", "W", "F", "ANN", "D", "N", "I", "T201", "T100"]
+fixable = ["I", "F401", "T201", "T100"]
+include = ["pyproject.toml", "tests/**/*.py", "src/**/*.py"]
 
 # ANN101 - missing-type-self
 # ANN003 - missing-type-kwargs

diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py
@@ -12,6 +12,7 @@
 from cool_seq_tool.schemas import TranscriptPriority
 from ga4gh.core._internal.models import Extension, Gene
 from ga4gh.vrs._internal.models import Allele, SequenceLocation
+from ga4gh.vrs.dataproxy import SeqRepoDataProxy
 from ga4gh.vrs.extras.translator import AlleleTranslator
 from gene.database import create_db
 from gene.query import QueryHandler
@@ -23,7 +24,6 @@
     "CoolSeqToolBuilder",
     "get_seqrepo",
     "GeneNormalizerBuilder",
-    "VrsTranslatorBuilder",
     "get_protein_accession",
     "get_transcripts",
     "get_gene_symbol",
@@ -33,7 +33,7 @@
     "get_chromosome_identifier_from_vrs_id",
     "get_sequence",
     "store_sequence",
-    "hgvs_to_vrs",
+    "translate_hgvs_to_vrs",
     "get_mane_transcripts",
     "get_uniprot_sequence",
 ]
@@ -76,17 +76,20 @@ def __new__(cls) -> QueryHandler:
         return cls.instance
 
 
-class VrsTranslatorBuilder:
-    """Singleton constructor for VRS-Python translator instance."""
+class TranslatorBuilder:
+    """Singleton constructor for VRS Translator instance."""
 
-    def __new__(cls) -> AlleleTranslator:
-        """Provide VRS-Python translator. Construct if unavailable.
+    def __new__(cls, data_proxy: SeqRepoDataProxy) -> AlleleTranslator:
+        """Provide translator instance. Constructs it if unavailable. Use a new
+        ``data_proxy`` instance that contains a given score row's sequence/ID.
 
-        :return: singleton instances of Translator
+        :return: singleton instance of ``AlleleTranslator``
         """
         if not hasattr(cls, "instance"):
-            cst = CoolSeqToolBuilder()
-            cls.instance = AlleleTranslator(cst.seqrepo_access, normalize=False)
+            tr = AlleleTranslator(data_proxy, normalize=False)
+            cls.instance = tr
+        else:
+            cls.instance.data_proxy = data_proxy
         return cls.instance
 
 
@@ -280,15 +283,17 @@ def get_gene_location(metadata: ScoresetMetadata) -> Optional[GeneLocation]:
 def get_chromosome_identifier(chromosome: str) -> str:
     """Get latest NC_ accession identifier given a chromosome name.
 
-    :param chromosome: prefix-free chromosome name, e.g. ``"8"``, ``"X"``
+    :param chromosome: chromosome name, e.g. ``"8"``, ``"X"``
     :return: latest ID if available
     :raise KeyError: if unable to retrieve identifier
     """
+    if not chromosome.startswith("chr"):
+        chromosome = f"chr{chromosome}"
     sr = CoolSeqToolBuilder().seqrepo_access
     acs = []
     for assembly in ["GRCh38", "GRCh37"]:
         tmp_acs, _ = sr.translate_identifier(
-            f"{assembly}:chr{chromosome}", target_namespaces="refseq"
+            f"{assembly}:{chromosome}", target_namespaces="refseq"
         )
         for ac in tmp_acs:
             acs.append(ac.split("refseq:")[-1])
@@ -378,18 +383,15 @@ def store_sequence(sequence: str, names: List[Dict]) -> None:
 # -------------------------------- VRS-Python -------------------------------- #
 
 
-def hgvs_to_vrs(hgvs: str, alias_map: Dict) -> Allele:
+def translate_hgvs_to_vrs(hgvs: str, data_proxy: SeqRepoDataProxy) -> Allele:
     """Convert HGVS variation description to VRS object.
 
-    # TODO incorporate alias map
-
     :param hgvs: MAVE-HGVS variation string
-    :param alias_map: lookup for custom sequence IDs
+    :param data_proxy: SeqRepo data proxy instance passed to the translator
     :return: Corresponding VRS allele as a Pydantic class
     """
-    tr = VrsTranslatorBuilder()
-    vrs_allele = tr.translate_from(hgvs, "hgvs")
-    allele = Allele(**vrs_allele)
+    tr = TranslatorBuilder(data_proxy)
+    allele = tr.translate_from(hgvs, "hgvs")
 
     if (
         not isinstance(allele.location, SequenceLocation)
@@ -398,6 +400,10 @@ def hgvs_to_vrs(hgvs: str, alias_map: Dict) -> Allele:
     ):
         raise ValueError
 
+    # TODO temporary, remove
+    if not isinstance(allele, Allele):
+        raise NotImplementedError
+
     return allele
 
 

diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py
@@ -37,7 +37,7 @@ def _save_results(
     """
     outfile = LOCAL_STORE_PATH / f"{metadata.urn}_mapping_results.json"
     with open(outfile, "w") as f:
-        json.dump(mapping_results.model_dump(), f, indent=2)
+        json.dump(mapping_results.model_dump(exclude_none=True), f, indent=2)
     return outfile
 
 

diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py
@@ -4,7 +4,7 @@
 
 from cool_seq_tool.schemas import Strand, TranscriptPriority
 from ga4gh.vrs._internal.models import Allele, Haplotype
-from pydantic import BaseModel, StrictBool, StrictInt
+from pydantic import BaseModel, StrictBool, StrictFloat, StrictInt, StrictStr
 
 
 class TargetSequenceType(StrEnum):
@@ -125,13 +125,14 @@ class TxSelectResult(BaseModel):
 
 
 class VrsMapping(BaseModel):
-    """Define pre-post mapping pair structure for VRS-structured variations.
-
-    Probably need to add score and accession to make json writing easier
-    """
-
-    pre_mapping: Union[Allele, Haplotype]
-    mapped: Union[Allele, Haplotype]
+    """Define pre-post mapping pair structure for VRS-structured variations."""
+
+    mavedb_id: StrictStr
+    pre_mapped: Union[Allele, Haplotype]
+    post_mapped: Union[Allele, Haplotype]
+    mapped_transcript: Optional[TranscriptDescription] = None
+    score: StrictFloat
+    # relation: Literal["SO:is_homologous_to"] = "SO:is_homologous_to"
 
 
 class VrsMappingResult(BaseModel):