refactor: major edits to VRS mapping and supporting methods (#7)
jsstevenson authored Jan 21, 2024
1 parent d39681e commit 8c728f5
Showing 13 changed files with 6,571 additions and 134 deletions.
20 changes: 16 additions & 4 deletions README.md
@@ -1,6 +1,10 @@
# dcd-mapping

## Usage
## Results

Mapped MaveDB Scoresets can be downloaded here: https://mavedb-mapping.s3.us-east-2.amazonaws.com/mappings.tar.gz

## Tool usage

Use the `dcd-map` command with a scoreset URN, e.g.

@@ -10,6 +14,16 @@ $ dcd-map urn:mavedb:00000083-c-1

Output is saved in the format `<URN>_mapping_results.json` in the directory specified by the environment variable `MAVEDB_STORAGE_DIR`, or `~/.local/share/dcd-mapping` by default.

## Setup

Following the installation instructions for [CoolSeqTool](https://coolseqtool.readthedocs.io/en/0.4.0-dev1/install.html) and [Gene Normalizer](https://gene-normalizer.readthedocs.io/en/latest/install.html) should take care of the external data dependencies.

Note that Gene Normalizer's `pg` dependency group must be installed to make use of the PostgreSQL-based backend:

```shell
python3 -m pip install 'gene-normalizer[pg]'
```

## Development

Clone the repo
@@ -38,6 +52,4 @@ Add pre-commit hooks
pre-commit install
```

## Data

Mapped MaveDB Scoresets can be downloaded here: https://mavedb-mapping.s3.us-east-2.amazonaws.com/mappings.tar.gz
When debugging, use the `-c` and `-d` flags to enable caching of BLAT alignment output and debug logging, respectively.
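
For context on the output described above, a minimal sketch of loading a saved result file (the storage-path logic mirrors the defaults above; the payload structure is whatever the mapping pipeline serialized, so no specific keys are assumed):

```python
import json
import os
from pathlib import Path

# Default location matches the README: $MAVEDB_STORAGE_DIR, falling back to
# ~/.local/share/dcd-mapping
storage_dir = Path(
    os.environ.get(
        "MAVEDB_STORAGE_DIR", str(Path.home() / ".local" / "share" / "dcd-mapping")
    )
)
results_path = storage_dir / "urn:mavedb:00000083-c-1_mapping_results.json"

with open(results_path) as f:
    results = json.load(f)

# Inspect the top-level structure rather than assuming particular keys
print(type(results))
print(list(results) if isinstance(results, dict) else len(results))
```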
24 changes: 24 additions & 0 deletions TODO.md
@@ -0,0 +1,24 @@
# Remaining TODO

General:
* notes scattered in comments and docstrings (sorry!)
* Docker-ification. More or less needs to start from scratch (there should be existing images for SeqRepo and UTA, not sure how up-to-date they are).
* Add extra stuff that appears in mapping JSON objects.
* Currently using VRS 2.0a-based libraries. For lifting back to VRS 1.3, some basic post-processing should be fine (annoying but shouldn't be too difficult)
* Without access to a production DynamoDB instance, Gene Normalizer will be quickest and easiest to set up via a PostgreSQL data backend. That, however, requires an extra dependency group (noted in README). We might want to make a `pg` dependency group here or just include it in core dependencies.
* On that note, I've only done minimal testing of whether the gene normalizer dependency could be dropped entirely, but it'd be nice to get there.
* Some of the singleton/factory stuff might be cleaner as `global`s

Alignment:
* Pretty sure this is mostly done. Haven't tested exhaustively, though.
* Need to sufficiently mock/patch things in tests

Transcript selection:
* An IndexError is raised when calculating the offset for many newer (2023) scoresets.
* Tests will need some extensive mocking (or cassettes?) for reliance on UTA and other external dependencies

VRS mapping:
* In general, this stuff is still pretty rough. Tests aren't passing yet.
* Finish double-checking the SeqRepo storage workaround
* A fair number of small questions about conditions written to handle specific scoresets/edge cases
* More testing. Tests can be made CI-ready by manually patching the SequenceStore class (or using SeqRepoRESTDataProxy and cassettes); see the sketch below.
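
A rough illustration of the patching approach mentioned in the last bullet, assuming `pytest` and `unittest.mock`; the patch target and the canned return value are placeholders rather than the project's actual fixtures:

```python
from unittest.mock import MagicMock, patch

import pytest


@pytest.fixture()
def mocked_seqrepo():
    """Stand in for SeqRepo-backed sequence access during CI runs.

    The patch target ("dcd_mapping.lookup.CoolSeqToolBuilder") and the canned
    translate_identifier() result below are illustrative placeholders.
    """
    fake = MagicMock()
    fake.seqrepo_access.translate_identifier.return_value = (
        ["refseq:NC_000001.11"],
        None,
    )
    with patch("dcd_mapping.lookup.CoolSeqToolBuilder", return_value=fake):
        yield fake


def test_chromosome_lookup(mocked_seqrepo):
    # Hypothetical test: any code path that builds a CoolSeqTool instance now
    # receives the stub above instead of hitting SeqRepo on disk.
    ...
```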
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -80,8 +80,9 @@ src = ["src"]
# pydocstyle (D)
# pep8-naming (N)
# isort (I)
select = ["E", "W", "F", "ANN", "D", "N", "I"]
fixable = ["I", "F401"]
select = ["E", "W", "F", "ANN", "D", "N", "I", "T201", "T100"]
fixable = ["I", "F401", "T201", "T100"]
include = ["pyproject.toml", "tests/**/*.py", "src/**/*.py"]

# ANN101 - missing-type-self
# ANN003 - missing-type-kwargs
2 changes: 1 addition & 1 deletion src/dcd_mapping/cli.py
@@ -41,7 +41,7 @@ def cli(urn: str, debug: bool, cache_align: bool) -> None:
""" # noqa: D301
logging.basicConfig(
filename="dcd-mapping.log",
format="%(asctime)s %(levelname)s:%(message)s",
format="%(asctime)s %(levelname)s:%(name)s:%(message)s",
level=logging.INFO,
force=True,
)
42 changes: 24 additions & 18 deletions src/dcd_mapping/lookup.py
@@ -12,6 +12,7 @@
from cool_seq_tool.schemas import TranscriptPriority
from ga4gh.core._internal.models import Extension, Gene
from ga4gh.vrs._internal.models import Allele, SequenceLocation
from ga4gh.vrs.dataproxy import SeqRepoDataProxy
from ga4gh.vrs.extras.translator import AlleleTranslator
from gene.database import create_db
from gene.query import QueryHandler
@@ -23,7 +24,6 @@
"CoolSeqToolBuilder",
"get_seqrepo",
"GeneNormalizerBuilder",
"VrsTranslatorBuilder",
"get_protein_accession",
"get_transcripts",
"get_gene_symbol",
@@ -33,7 +33,7 @@
"get_chromosome_identifier_from_vrs_id",
"get_sequence",
"store_sequence",
"hgvs_to_vrs",
"translate_hgvs_to_vrs",
"get_mane_transcripts",
"get_uniprot_sequence",
]
@@ -76,17 +76,20 @@ def __new__(cls) -> QueryHandler:
return cls.instance


class VrsTranslatorBuilder:
"""Singleton constructor for VRS-Python translator instance."""
class TranslatorBuilder:
"""Singleton constructor for VRS Translator instance."""

def __new__(cls) -> AlleleTranslator:
"""Provide VRS-Python translator. Construct if unavailable.
def __new__(cls, data_proxy: SeqRepoDataProxy) -> AlleleTranslator:
"""Provide translator instance. Constructs it if unavailable. Use a new
``data_proxy`` instance that contains a given score row's sequence/ID.
:return: singleton instances of Translator
:return: singleton instance of ``AlleleTranslator``
"""
if not hasattr(cls, "instance"):
cst = CoolSeqToolBuilder()
cls.instance = AlleleTranslator(cst.seqrepo_access, normalize=False)
tr = AlleleTranslator(data_proxy, normalize=False)
cls.instance = tr
else:
cls.instance.data_proxy = data_proxy
return cls.instance


@@ -280,15 +283,17 @@ def get_gene_location(metadata: ScoresetMetadata) -> Optional[GeneLocation]:
def get_chromosome_identifier(chromosome: str) -> str:
"""Get latest NC_ accession identifier given a chromosome name.
:param chromosome: prefix-free chromosome name, e.g. ``"8"``, ``"X"``
:param chromosome: chromosome name, e.g. ``"8"``, ``"X"``
:return: latest ID if available
:raise KeyError: if unable to retrieve identifier
"""
if not chromosome.startswith("chr"):
chromosome = f"chr{chromosome}"
sr = CoolSeqToolBuilder().seqrepo_access
acs = []
for assembly in ["GRCh38", "GRCh37"]:
tmp_acs, _ = sr.translate_identifier(
f"{assembly}:chr{chromosome}", target_namespaces="refseq"
f"{assembly}:{chromosome}", target_namespaces="refseq"
)
for ac in tmp_acs:
acs.append(ac.split("refseq:")[-1])
@@ -378,18 +383,15 @@ def store_sequence(sequence: str, names: List[Dict]) -> None:
# -------------------------------- VRS-Python -------------------------------- #


def hgvs_to_vrs(hgvs: str, alias_map: Dict) -> Allele:
def translate_hgvs_to_vrs(hgvs: str, data_proxy: SeqRepoDataProxy) -> Allele:
"""Convert HGVS variation description to VRS object.
# TODO incorporate alias map
:param hgvs: MAVE-HGVS variation string
:param alias_map: lookup for custom sequence IDs
:param data_proxy: SeqRepo data proxy containing the given score row's sequence/ID
:return: Corresponding VRS allele as a Pydantic class
"""
tr = VrsTranslatorBuilder()
vrs_allele = tr.translate_from(hgvs, "hgvs")
allele = Allele(**vrs_allele)
tr = TranslatorBuilder(data_proxy)
allele = tr.translate_from(hgvs, "hgvs")

if (
not isinstance(allele.location, SequenceLocation)
@@ -398,6 +400,10 @@ def hgvs_to_vrs(hgvs: str, alias_map: Dict) -> Allele:
):
raise ValueError

# TODO temporary, remove
if not isinstance(allele, Allele):
raise NotImplementedError

return allele


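
To illustrate the reuse pattern behind the new `TranslatorBuilder`, a sketch under the assumption that each score row gets its own `SeqRepoDataProxy` (the SeqRepo path and the HGVS string below are placeholders):

```python
from biocommons.seqrepo import SeqRepo
from ga4gh.vrs.dataproxy import SeqRepoDataProxy

from dcd_mapping.lookup import TranslatorBuilder, translate_hgvs_to_vrs

# Placeholder path: point at a local SeqRepo snapshot
sr = SeqRepo("/usr/local/share/seqrepo/latest")
proxy_a = SeqRepoDataProxy(sr)
proxy_b = SeqRepoDataProxy(sr)

# The first call constructs the AlleleTranslator (normalize=False); later calls
# return the same object with its data_proxy swapped to the newest instance.
tr_a = TranslatorBuilder(proxy_a)
tr_b = TranslatorBuilder(proxy_b)
assert tr_a is tr_b
assert tr_b.data_proxy is proxy_b

# translate_hgvs_to_vrs wires this together for a single variation string
# (the accession and variant here are illustrative)
allele = translate_hgvs_to_vrs("NP_009225.1:p.Tyr37Cys", proxy_b)
```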
12 changes: 8 additions & 4 deletions src/dcd_mapping/main.py
@@ -1,6 +1,7 @@
"""Provide core MaveDB mapping methods."""
import json
import logging
from pathlib import Path
from typing import List

import click
@@ -21,20 +22,23 @@

def _save_results(
metadata: ScoresetMetadata, mapping_results: VrsMappingResult
) -> None:
) -> Path:
"""Save results to file.
Todo:
----
* Embed in original metadata JSON
* add ``mapped_reference_sequence`` and ``computed_reference_sequence`` properties
* Embed within original metadata JSON
* Option to save as VRS 1.x
:param metadata: scoreset metadata
:param mapping_results: mapped objects
:return: path to saved file
"""
outfile = LOCAL_STORE_PATH / f"{metadata.urn}_mapping_results.json"
with open(outfile, "w") as f:
json.dump(mapping_results.model_dump_json(indent=2), f)
json.dump(mapping_results.model_dump(exclude_none=True), f, indent=2)
return outfile


async def map_scoreset(
@@ -65,7 +69,7 @@ async def map_scoreset(
return None

try:
vrs_results = vrs_map(metadata, alignment_result, transcript, records)
vrs_results = vrs_map(metadata, alignment_result, transcript, records, silent)
except VrsMapError:
_logger.error(f"VRS mapping failed for scoreset {metadata.urn}")
return None
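
One note on the serialization change above: `model_dump_json()` already returns a string, so passing it to `json.dump` double-encodes the payload, whereas dumping `model_dump(exclude_none=True)` writes a plain JSON object. A minimal Pydantic v2 illustration with a made-up model:

```python
import json
from typing import Optional

from pydantic import BaseModel


class Example(BaseModel):
    urn: str
    score: Optional[float] = None


m = Example(urn="urn:mavedb:00000083-c-1")

# Old pattern: json.dumps receives a string, so the output is a quoted,
# escaped JSON string rather than an object
old_output = json.dumps(m.model_dump_json(indent=2))

# New pattern: a real JSON object, with None fields omitted
new_output = json.dumps(m.model_dump(exclude_none=True), indent=2)

print(old_output)
print(new_output)
```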
17 changes: 9 additions & 8 deletions src/dcd_mapping/schemas.py
@@ -4,7 +4,7 @@

from cool_seq_tool.schemas import Strand, TranscriptPriority
from ga4gh.vrs._internal.models import Allele, Haplotype
from pydantic import BaseModel, StrictBool, StrictInt
from pydantic import BaseModel, StrictBool, StrictFloat, StrictInt, StrictStr


class TargetSequenceType(StrEnum):
@@ -125,13 +125,14 @@ class TxSelectResult(BaseModel):


class VrsMapping(BaseModel):
"""Define pre-post mapping pair structure for VRS-structured variations.
Probably need to add score and accession to make json writing easier
"""

pre_mapping: Union[Allele, Haplotype]
mapped: Union[Allele, Haplotype]
"""Define pre-post mapping pair structure for VRS-structured variations."""

mavedb_id: StrictStr
pre_mapped: Union[Allele, Haplotype]
post_mapped: Union[Allele, Haplotype]
mapped_transcript: Optional[TranscriptDescription] = None
score: StrictFloat
# relation: Literal["SO:is_homologous_to"] = "SO:is_homologous_to"


class VrsMappingResult(BaseModel):