diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py index eccdba0..26df3bc 100644 --- a/src/dcd_mapping/lookup.py +++ b/src/dcd_mapping/lookup.py @@ -1,9 +1,14 @@ """Handle API lookups to external (non-MaveDB) services. -This module should contain methods that we don't want to think about caching. +Data sources/handlers include: + +* `CoolSeqTool `_ +* `Gene Normalizer `_ +* the `VRS-Python Translator tool `_ +* the UniProt web API """ import logging -from typing import Dict, List, Optional +from typing import List, Optional import polars as pl import requests @@ -32,7 +37,6 @@ "get_ucsc_chromosome_name", "get_chromosome_identifier_from_vrs_id", "get_sequence", - "store_sequence", "translate_hgvs_to_vrs", "get_mane_transcripts", "get_uniprot_sequence", @@ -118,6 +122,8 @@ async def get_transcripts( """Get transcript accessions matching given parameters (excluding non-coding RNA). TODO: may be able to successfully query with only one of gene symbol/chromosome ac. + In initial testing, gene symbol doesn't seem to be a meaningful filter, but should + get further confirmation. :param gene_symbol: HGNC-given gene symbol (usually, but not always, equivalent to symbols available in other nomenclatures.) @@ -274,10 +280,6 @@ def get_gene_location(metadata: ScoresetMetadata) -> Optional[GeneLocation]: # --------------------------------- SeqRepo --------------------------------- # -# TODO -# * some of these could be refactored into a single method -# * not clear if all of them are necessary -# * either way, they should all be renamed once we have a final idea of what's needed def get_chromosome_identifier(chromosome: str) -> str: @@ -362,24 +364,6 @@ def get_sequence( return sequence -def store_sequence(sequence: str, names: List[Dict]) -> None: - """Store sequnce in SeqRepo. - - I'm a little queasy about this part -- it seems potentially dangerous to be - modifying state outside of the mapper library itself, particularly if there - are any needs for those changes to endure (and if there aren't, why are we - modifying outside state in the first place?). - - Currently unused unless we really really need this functionality. - - :param sequence: raw sequence - :param names: list of namespace/alias pairs, - e.g. ``{"namespace": "GA4GH", "alias": "SQ.XXXXXX"} - """ - sr = CoolSeqToolBuilder().seqrepo_access - sr.sr.store(sequence, nsaliases=names) - - # -------------------------------- VRS-Python -------------------------------- # diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py index 9bd9e6f..1f244f0 100644 --- a/src/dcd_mapping/transcripts.py +++ b/src/dcd_mapping/transcripts.py @@ -138,7 +138,8 @@ def _get_protein_sequence(target_sequence: str) -> str: """Get protein sequence if necessary. It'd be nice if there was a more elegant way to check if the sequence was already a - protein sequence. + protein sequence (it should be possible for protein sequences to contain <5 unique + bases, albeit unlikely with a large enough length). :param target_sequence: sequence set as baseline in MAVE experiment (might already be set to protein)