diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py
index eccdba0..26df3bc 100644
--- a/src/dcd_mapping/lookup.py
+++ b/src/dcd_mapping/lookup.py
@@ -1,9 +1,14 @@
"""Handle API lookups to external (non-MaveDB) services.
-This module should contain methods that we don't want to think about caching.
+Data sources/handlers include:
+
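+* `CoolSeqTool <https://coolseqtool.readthedocs.io/>`_
+* `Gene Normalizer <https://gene-normalizer.readthedocs.io/>`_
+* the `VRS-Python Translator tool <https://github.com/ga4gh/vrs-python>`_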
+* the UniProt web API
"""
import logging
-from typing import Dict, List, Optional
+from typing import List, Optional
import polars as pl
import requests
@@ -32,7 +37,6 @@
"get_ucsc_chromosome_name",
"get_chromosome_identifier_from_vrs_id",
"get_sequence",
- "store_sequence",
"translate_hgvs_to_vrs",
"get_mane_transcripts",
"get_uniprot_sequence",
@@ -118,6 +122,8 @@ async def get_transcripts(
"""Get transcript accessions matching given parameters (excluding non-coding RNA).
TODO: may be able to successfully query with only one of gene symbol/chromosome ac.
+ In initial testing, the gene symbol did not seem to be a meaningful filter, but
+ this needs further confirmation.
:param gene_symbol: HGNC-given gene symbol (usually, but not always, equivalent to
symbols available in other nomenclatures.)
@@ -274,10 +280,6 @@ def get_gene_location(metadata: ScoresetMetadata) -> Optional[GeneLocation]:
# --------------------------------- SeqRepo --------------------------------- #
-# TODO
-# * some of these could be refactored into a single method
-# * not clear if all of them are necessary
-# * either way, they should all be renamed once we have a final idea of what's needed
def get_chromosome_identifier(chromosome: str) -> str:
@@ -362,24 +364,6 @@ def get_sequence(
return sequence
-def store_sequence(sequence: str, names: List[Dict]) -> None:
- """Store sequnce in SeqRepo.
-
- I'm a little queasy about this part -- it seems potentially dangerous to be
- modifying state outside of the mapper library itself, particularly if there
- are any needs for those changes to endure (and if there aren't, why are we
- modifying outside state in the first place?).
-
- Currently unused unless we really really need this functionality.
-
- :param sequence: raw sequence
- :param names: list of namespace/alias pairs,
- e.g. ``{"namespace": "GA4GH", "alias": "SQ.XXXXXX"}
- """
- sr = CoolSeqToolBuilder().seqrepo_access
- sr.sr.store(sequence, nsaliases=names)
-
-
# -------------------------------- VRS-Python -------------------------------- #
diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py
index 9bd9e6f..1f244f0 100644
--- a/src/dcd_mapping/transcripts.py
+++ b/src/dcd_mapping/transcripts.py
@@ -138,7 +138,8 @@ def _get_protein_sequence(target_sequence: str) -> str:
"""Get protein sequence if necessary.
It'd be nice if there was a more elegant way to check if the sequence was already a
- protein sequence.
+ protein sequence (a protein sequence could, in principle, contain <5 unique
+ residues, though that is unlikely for a sufficiently long sequence).
:param target_sequence: sequence set as baseline in MAVE experiment (might already
be set to protein)