Define VRS method output structure
jsstevenson committed Jan 11, 2024
1 parent dc09054 commit 46676cb
Showing 4 changed files with 74 additions and 17 deletions.
28 changes: 26 additions & 2 deletions src/dcd_mapping/main.py
@@ -1,22 +1,42 @@
"""Provide core MaveDB mapping methods."""
import json
import logging
from typing import List

import click

from dcd_mapping.align import AlignmentError, align
from dcd_mapping.resources import (
    LOCAL_STORE_PATH,
    ResourceAcquisitionError,
    get_scoreset_metadata,
    get_scoreset_records,
)
from dcd_mapping.schemas import ScoreRow, ScoresetMetadata
from dcd_mapping.schemas import ScoreRow, ScoresetMetadata, VrsMappingResult
from dcd_mapping.transcripts import TxSelectError, select_transcript
from dcd_mapping.vrs_map import VrsMapError, vrs_map

_logger = logging.getLogger(__name__)


def _save_results(
    metadata: ScoresetMetadata, mapping_results: VrsMappingResult
) -> None:
"""Save results to file.
Todo:
----
* Embed in original metadata JSON
* Option to save as VRS 1.x
:param metadata: scoreset metadata
:param mapping results: mapped objects
"""
outfile = LOCAL_STORE_PATH / f"{metadata.urn}_mapping_results.json"
with open(outfile, "w") as f:
json.dump(mapping_results.model_dump_json(indent=2), f)


async def map_scoreset(
    metadata: ScoresetMetadata, records: List[ScoreRow], silent: bool = True
) -> None:
@@ -42,9 +62,13 @@ async def map_scoreset(
        return None

    try:
        _ = vrs_map(metadata, transcript, records)
        vrs_results = vrs_map(metadata, alignment_result, transcript, records)
    except VrsMapError:
        _logger.error(f"VRS mapping failed for scoreset {metadata.urn}")
        return None

    if vrs_results:
        _save_results(metadata, vrs_results)


async def map_scoreset_urn(urn: str, silent: bool = True) -> None:
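For orientation, a minimal usage sketch of the mapping entry point touched by this change, using only the signatures visible in this diff; the URN below is a placeholder, not a real scoreset. When mapping succeeds, _save_results writes the serialized VrsMappingResult under LOCAL_STORE_PATH.

import asyncio

from dcd_mapping.main import map_scoreset_urn

# Placeholder URN: substitute a real MaveDB scoreset URN. On success, mapped
# variations are written to LOCAL_STORE_PATH / "<urn>_mapping_results.json".
asyncio.run(map_scoreset_urn("urn:mavedb:00000000-a-1", silent=False))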
23 changes: 19 additions & 4 deletions src/dcd_mapping/resources.py
@@ -157,16 +157,16 @@ def _get_uniprot_ref(scoreset_json: Dict[str, Any]) -> Optional[UniProtRef]:
)


def get_scoreset_metadata(scoreset_urn: str) -> ScoresetMetadata:
    """Acquire metadata for scoreset.
def get_raw_scoreset_metadata(scoreset_urn: str) -> Dict:
    """Get raw (original JSON) metadata for scoreset.

    Only hit the MaveDB API if unavailable locally. That means data must be refreshed
    manually (i.e. you'll need to delete a scoreset file yourself for this method to
    fetch a new one). This could be improved in future versions.

    :param scoreset_urn: URN for scoreset
    :return: Object containing salient metadata
    :raise ResourceAcquisitionError: if unable to acquire metadata
    :return: Complete JSON response for object
    :raise ResourceAcquisitionError: if HTTP request fails
    """
    metadata_file = LOCAL_STORE_PATH / f"{scoreset_urn}_metadata.json"
    if not metadata_file.exists():
@@ -183,6 +183,21 @@ def get_scoreset_metadata(scoreset_urn: str) -> ScoresetMetadata:
    else:
        with open(metadata_file, "r") as f:
            metadata = json.load(f)
    return metadata


def get_scoreset_metadata(scoreset_urn: str) -> ScoresetMetadata:
    """Acquire metadata for scoreset.

    Only hit the MaveDB API if unavailable locally. That means data must be refreshed
    manually (i.e. you'll need to delete a scoreset file yourself for this method to
    fetch a new one). This could be improved in future versions.

    :param scoreset_urn: URN for scoreset
    :return: Object containing salient metadata
    :raise ResourceAcquisitionError: if unable to acquire metadata
    """
    metadata = get_raw_scoreset_metadata(scoreset_urn)
    if not _metadata_response_is_human(metadata):
        raise ResourceAcquisitionError(
            f"Experiment for {scoreset_urn} contains no human targets"
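A short sketch of how the new raw/validated metadata split might be used downstream, assuming the two functions behave as defined above; the URN is again a placeholder.

from dcd_mapping.resources import (
    ResourceAcquisitionError,
    get_raw_scoreset_metadata,
    get_scoreset_metadata,
)

urn = "urn:mavedb:00000000-a-1"  # placeholder
try:
    raw = get_raw_scoreset_metadata(urn)  # complete JSON response, cached locally
    meta = get_scoreset_metadata(urn)  # validated ScoresetMetadata; raises if no human targets
except ResourceAcquisitionError as err:
    print(f"Unable to acquire metadata for {urn}: {err}")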
16 changes: 15 additions & 1 deletion src/dcd_mapping/schemas.py
@@ -1,8 +1,9 @@
"""Provide class definitions for commonly-used information objects."""
from enum import StrEnum
from typing import List, Optional
from typing import List, Optional, Union

from cool_seq_tool.schemas import Strand, TranscriptPriority
from ga4gh.vrs._internal.models import Allele, Haplotype
from pydantic import BaseModel, StrictBool, StrictInt


@@ -121,3 +122,16 @@ class TxSelectResult(BaseModel):
    is_full_match: StrictBool
    transcript_mode: Optional[TranscriptPriority] = None
    sequence: str


class VrsMapping(BaseModel):
    """Define pre-post mapping pair structure for VRS-structured variations."""

    pre_mapping: Union[Allele, Haplotype]
    mapped: Union[Allele, Haplotype]


class VrsMappingResult(BaseModel):
    """Define response object from VRS mappings method."""

    variations: List[VrsMapping]
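To illustrate how the new containers nest, a hedged construction sketch: pre_allele and post_allele stand in for ga4gh.vrs Allele objects produced elsewhere in the pipeline and are not defined here.

from dcd_mapping.schemas import VrsMapping, VrsMappingResult

# pre_allele / post_allele: placeholder Allele objects from the VRS mapping step
pairing = VrsMapping(pre_mapping=pre_allele, mapped=post_allele)
result = VrsMappingResult(variations=[pairing])
print(result.model_dump_json(indent=2))  # pydantic v2 serialization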
24 changes: 14 additions & 10 deletions src/dcd_mapping/vrs_map.py
@@ -1,6 +1,6 @@
"""Map transcripts to VRS objects."""
import logging
from typing import List, Optional, Tuple, Union
from typing import List, Optional, Union

import click
from Bio.Seq import Seq
@@ -21,6 +21,8 @@
    ScoresetMetadata,
    TargetType,
    TxSelectResult,
    VrsMapping,
    VrsMappingResult,
)

_logger = logging.getLogger(__name__)
@@ -269,19 +271,19 @@ def _map_regulatory_noncoding(
    metadata: ScoresetMetadata,
    records: List[ScoreRow],
    align_result: AlignmentResult,
) -> List[Tuple[Allele, Allele]]:
) -> VrsMappingResult:
    """Return VRS alleles representing pre- and post-mapped variation objects (?)

    :param metadata: metadata for URN
    :param records: list of MAVE experiment result rows
    :return: TODO
    """
    var_ids = []
    variations = VrsMappingResult(variations=[])
    sequence_id = _get_sequence_id(metadata.target_sequence)

    for row in records:
        if row.hgvs_nt == "_wt" or row.hgvs_nt == "_sy":
            raise ValueError  # TODO unclear what's up here
            raise ValueError  # old MAVE-HGVS syntax
        pre_map_allele = _get_haplotype_allele(
            row.hgvs_nt[2:], align_result.chrom, 0, sequence_id, AnnotationLayer.GENOMIC
        )  # TODO need query/hit ranges and strand for something
@@ -299,9 +301,11 @@
        if isinstance(post_map_allele, Haplotype):
            breakpoint()  # TODO investigate
            raise NotImplementedError
        var_ids.append((pre_map_allele, post_map_allele))
        variations.variations.append(
            VrsMapping(pre_mapping=pre_map_allele, mapped=post_map_allele)
        )

    return var_ids
    return variations


def vrs_map(
@@ -310,13 +314,13 @@
    transcript: Optional[TxSelectResult],
    records: List[ScoreRow],
    silent: bool = True,
) -> List[Tuple[Allele, Allele]]:
) -> Optional[VrsMappingResult]:
    """Given a description of a MAVE scoreset and an aligned transcript, generate
    the corresponding VRS objects.

    Todo:
    ----
    * Store objects in SeqRepo
    * Workaround for storing objects in SeqRepo

    :param metadata: salient MAVE scoreset metadata
    :param transcript: output of transcript selection process
@@ -329,8 +333,8 @@
        click.echo(msg)
        _logger.info(msg)
    if metadata.target_gene_category == TargetType.PROTEIN_CODING and transcript:
        result = _map_protein_coding(metadata, transcript, records)
        _ = _map_protein_coding(metadata, transcript, records)
        result = None
    else:
        result = _map_regulatory_noncoding(metadata, records, align_result)
    breakpoint()  # TODO tmp
    return result
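Finally, a sketch of how a caller might consume the new Optional[VrsMappingResult] return type; metadata, alignment_result, transcript, and records are placeholders assumed to come from the earlier metadata, alignment, and transcript-selection stages.

from dcd_mapping.vrs_map import vrs_map

# Inputs are placeholders produced by earlier pipeline steps (metadata acquisition,
# alignment, transcript selection, score retrieval).
mappings = vrs_map(metadata, alignment_result, transcript, records, silent=False)
if mappings is not None:
    for pair in mappings.variations:
        print(pair.pre_mapping, pair.mapped)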
