From 46676cb5e3cccb242b4cf6e7ef44a6bdc767ebd9 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Thu, 11 Jan 2024 09:50:24 -0500 Subject: [PATCH] Define VRS method output structure --- src/dcd_mapping/main.py | 28 ++++++++++++++++++++++++++-- src/dcd_mapping/resources.py | 23 +++++++++++++++++++---- src/dcd_mapping/schemas.py | 16 +++++++++++++++- src/dcd_mapping/vrs_map.py | 24 ++++++++++++++---------- 4 files changed, 74 insertions(+), 17 deletions(-) diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index a0065c7..b1063b1 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -1,4 +1,5 @@ """Provide core MaveDB mapping methods.""" +import json import logging from typing import List @@ -6,17 +7,36 @@ from dcd_mapping.align import AlignmentError, align from dcd_mapping.resources import ( + LOCAL_STORE_PATH, ResourceAcquisitionError, get_scoreset_metadata, get_scoreset_records, ) -from dcd_mapping.schemas import ScoreRow, ScoresetMetadata +from dcd_mapping.schemas import ScoreRow, ScoresetMetadata, VrsMappingResult from dcd_mapping.transcripts import TxSelectError, select_transcript from dcd_mapping.vrs_map import VrsMapError, vrs_map _logger = logging.getLogger(__name__) +def _save_results( + metadata: ScoresetMetadata, mapping_results: VrsMappingResult +) -> None: + """Save results to file. + + Todo: + ---- + * Embed in original metadata JSON + * Option to save as VRS 1.x + + :param metadata: scoreset metadata + :param mapping results: mapped objects + """ + outfile = LOCAL_STORE_PATH / f"{metadata.urn}_mapping_results.json" + with open(outfile, "w") as f: + json.dump(mapping_results.model_dump_json(indent=2), f) + + async def map_scoreset( metadata: ScoresetMetadata, records: List[ScoreRow], silent: bool = True ) -> None: @@ -42,9 +62,13 @@ async def map_scoreset( return None try: - _ = vrs_map(metadata, transcript, records) + vrs_results = vrs_map(metadata, alignment_result, transcript, records) except VrsMapError: _logger.error(f"VRS mapping failed for scoreset {metadata.urn}") + return None + + if vrs_results: + _save_results(metadata, vrs_results) async def map_scoreset_urn(urn: str, silent: bool = True) -> None: diff --git a/src/dcd_mapping/resources.py b/src/dcd_mapping/resources.py index a5f3678..a171b2f 100644 --- a/src/dcd_mapping/resources.py +++ b/src/dcd_mapping/resources.py @@ -157,16 +157,16 @@ def _get_uniprot_ref(scoreset_json: Dict[str, Any]) -> Optional[UniProtRef]: ) -def get_scoreset_metadata(scoreset_urn: str) -> ScoresetMetadata: - """Acquire metadata for scoreset. +def get_raw_scoreset_metadata(scoreset_urn: str) -> Dict: + """Get raw (original JSON) metadata for scoreset. Only hit the MaveDB API if unavailable locally. That means data must be refreshed manually (i.e. you'll need to delete a scoreset file yourself for this method to fetch a new one). This could be improved in future versions. :param scoreset_urn: URN for scoreset - :return: Object containing salient metadata - :raise ResourceAcquisitionError: if unable to acquire metadata + :return: Complete JSON response for object + :raise ResourceAcquisitionError: if HTTP request fails """ metadata_file = LOCAL_STORE_PATH / f"{scoreset_urn}_metadata.json" if not metadata_file.exists(): @@ -183,6 +183,21 @@ def get_scoreset_metadata(scoreset_urn: str) -> ScoresetMetadata: else: with open(metadata_file, "r") as f: metadata = json.load(f) + return metadata + + +def get_scoreset_metadata(scoreset_urn: str) -> ScoresetMetadata: + """Acquire metadata for scoreset. + + Only hit the MaveDB API if unavailable locally. That means data must be refreshed + manually (i.e. you'll need to delete a scoreset file yourself for this method to + fetch a new one). This could be improved in future versions. + + :param scoreset_urn: URN for scoreset + :return: Object containing salient metadata + :raise ResourceAcquisitionError: if unable to acquire metadata + """ + metadata = get_raw_scoreset_metadata(scoreset_urn) if not _metadata_response_is_human(metadata): raise ResourceAcquisitionError( f"Experiment for {scoreset_urn} contains no human targets" diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index 821d5b7..400aee9 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -1,8 +1,9 @@ """Provide class definitions for commonly-used information objects.""" from enum import StrEnum -from typing import List, Optional +from typing import List, Optional, Union from cool_seq_tool.schemas import Strand, TranscriptPriority +from ga4gh.vrs._internal.models import Allele, Haplotype from pydantic import BaseModel, StrictBool, StrictInt @@ -121,3 +122,16 @@ class TxSelectResult(BaseModel): is_full_match: StrictBool transcript_mode: Optional[TranscriptPriority] = None sequence: str + + +class VrsMapping(BaseModel): + """Define pre-post mapping pair structure for VRS-structured variations.""" + + pre_mapping: Union[Allele, Haplotype] + mapped: Union[Allele, Haplotype] + + +class VrsMappingResult(BaseModel): + """Define response object from VRS mappings method.""" + + variations: List[VrsMapping] diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index bde002e..9075047 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -1,6 +1,6 @@ """Map transcripts to VRS objects.""" import logging -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Union import click from Bio.Seq import Seq @@ -21,6 +21,8 @@ ScoresetMetadata, TargetType, TxSelectResult, + VrsMapping, + VrsMappingResult, ) _logger = logging.getLogger(__name__) @@ -269,19 +271,19 @@ def _map_regulatory_noncoding( metadata: ScoresetMetadata, records: List[ScoreRow], align_result: AlignmentResult, -) -> List[Tuple[Allele, Allele]]: +) -> VrsMappingResult: """Return VRS alleles representing pre- and post-mapped variation objects (?) :param metadata: metadata for URN :param records: list of MAVE experiment result rows :return: TODO """ - var_ids = [] + variations = VrsMappingResult(variations=[]) sequence_id = _get_sequence_id(metadata.target_sequence) for row in records: if row.hgvs_nt == "_wt" or row.hgvs_nt == "_sy": - raise ValueError # TODO unclear what's up here + raise ValueError # old MAVE-HGVS syntax pre_map_allele = _get_haplotype_allele( row.hgvs_nt[2:], align_result.chrom, 0, sequence_id, AnnotationLayer.GENOMIC ) # TODO need query/hit ranges and strand for something @@ -299,9 +301,11 @@ def _map_regulatory_noncoding( if isinstance(post_map_allele, Haplotype): breakpoint() # TODO investigate raise NotImplementedError - var_ids.append((pre_map_allele, post_map_allele)) + variations.variations.append( + VrsMapping(pre_mapping=pre_map_allele, mapped=post_map_allele) + ) - return var_ids + return variations def vrs_map( @@ -310,13 +314,13 @@ def vrs_map( transcript: Optional[TxSelectResult], records: List[ScoreRow], silent: bool = True, -) -> List[Tuple[Allele, Allele]]: +) -> Optional[VrsMappingResult]: """Given a description of a MAVE scoreset and an aligned transcript, generate the corresponding VRS objects. Todo: ---- - * Store objects in SeqRepo + * Workaround for storing objects in SeqRepo :param metadata: salient MAVE scoreset metadata :param transcript: output of transcript selection process @@ -329,8 +333,8 @@ def vrs_map( click.echo(msg) _logger.info(msg) if metadata.target_gene_category == TargetType.PROTEIN_CODING and transcript: - result = _map_protein_coding(metadata, transcript, records) + _ = _map_protein_coding(metadata, transcript, records) + result = None else: result = _map_regulatory_noncoding(metadata, records, align_result) - breakpoint() # TODO tmp return result