Skip to content

Commit

Permalink
fix: don't use dnax query for dna target sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson committed Jan 25, 2024
1 parent e685c27 commit d1b5f34
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 101 deletions.
22 changes: 19 additions & 3 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,31 @@

General:
* notes scattered in comments and docstrings (sorry!)
* Docker-ification. More or less needs to start from scratch (there should be existing images for SeqRepo and UTA, not sure how up-to-date they are).
* Add extra stuff that appears in mapping JSON objects.
* Docker-ification. (there should be existing images for SeqRepo and UTA, not sure how up-to-date they are).
* Add extra stuff that appears in mapping JSON objects (``vrs_ref_allele_seq``).
* Currently using VRS 2.0a-based libraries. For lifting back to VRS 1.3, some basic post-processing should be fine (annoying but shouldn't be too difficult)
* Without access to a production DynamoDB instance, Gene Normalizer will be quickest and easiest to set up via a PostgreSQL data backend. That, however, requires an extra dependency group (noted in README). We might want to make a `pg` dependency group here or just include it in core dependencies.
* On that note, I've only done minimal testing of how possible it would be to drop the gene normalizer dependency entirely, but it'd be nice to get there.
* Some of the singleton/factory stuff might be cleaner as `global`s
* make `silent` a global

Alignment:
* Pretty sure this is mostly done. Haven't tested exhaustively, though.
* Pretty sure this is mostly done.
* Getting disagreements vs Jeremy's work on strand for the following:
* urn:mavedb:00000032-a-1
* urn:mavedb:00000083-i-1
* urn:mavedb:00000034-a-1
* urn:mavedb:00000043-a-1
* urn:mavedb:00000083-c-1
* urn:mavedb:00000083-g-1
* urn:mavedb:00000068-a-1
* urn:mavedb:00000043-a-2
* urn:mavedb:00000001-c-1
* urn:mavedb:00000083-f-1
* urn:mavedb:00000083-b-1
* urn:mavedb:00000068-c-1
* urn:mavedb:00000001-c-2
* urn:mavedb:00000034-b-1
* Need to sufficiently mock/patch things in tests

Transcript selection:
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ description = "Map MaveDB scoresets to VRS objects"
authors = [
{name = "Alex Handler Wagner", email = "[email protected]"},
{name = "Jeremy Arbesfeld", email = "[email protected]"},
{name = "Samriddhi Singh", email = ""},
{name = "Samriddhi Singh", email = "[email protected]"},
{name = "James Stevenson", email = "[email protected]"},
]
readme = "README.md"
Expand Down Expand Up @@ -48,7 +48,7 @@ tests = [
"pytest-asyncio"
]
dev = [
"ruff>=0.1.9",
"ruff>=0.1.14",
"pre-commit"
]

Expand Down
169 changes: 94 additions & 75 deletions src/dcd_mapping/align.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""Align MaveDB target sequences to a human reference genome."""
import logging
import subprocess
import tempfile
import uuid
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional
from typing import Any, Generator, List, Optional

from Bio.SearchIO import HSP
from Bio.SearchIO import read as read_blat
Expand All @@ -14,7 +15,6 @@
from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location
from dcd_mapping.resources import (
LOCAL_STORE_PATH,
get_cached_blat_output,
get_mapping_tmp_dir,
get_ref_genome_file,
)
Expand Down Expand Up @@ -56,105 +56,122 @@ def _build_query_file(scoreset_metadata: ScoresetMetadata) -> Generator[Path, An
query_file = (
get_mapping_tmp_dir() / f"blat_query_{scoreset_metadata.urn}_{uuid.uuid1()}.fa"
)
_logger.debug("Writing BLAT query to %", query_file)
_logger.debug("Writing BLAT query to %s", query_file)
lines = [">query", scoreset_metadata.target_sequence]
_write_query_file(query_file, lines)
yield query_file
query_file.unlink()


def _run_blat_command(command: str, args: Dict) -> subprocess.CompletedProcess:
def _run_blat(
target_args: str, query_file: Path, out_file: str, silent: bool
) -> subprocess.CompletedProcess:
"""Execute BLAT binary with relevant params.
This function is broken out to enable mocking while testing.
Currently, we rely on a system-installed BLAT binary accessible in the containing
environment's PATH. This is sort of awkward and it'd be nice to make use of some
direct bindings or better packaging if that's possible.
* Perhaps `gget`? https://pachterlab.github.io/gget/en/blat.html
* ``PxBlat``? https://github.com/ylab-hi/pxblat
:param command: shell command to execute
:param args: ``subprocess.run`` extra args (eg redirecting output for silent mode)
:param target_args: target params eg ``"-q=prot -t=dnax"`` (can be empty)
:param query_file: path to query FASTA file
:param out_file: path-like string to output file (could be "/dev/stdout")
:param silent: if True, suppress all console output
:return: process result
"""
_logger.debug("Running BLAT command: %", command)
return subprocess.run(command, shell=True, **args)
reference_genome_file = get_ref_genome_file(silent=silent)
command = f"blat {reference_genome_file} {target_args} -minScore=20 {query_file} {out_file}"
_logger.debug("Running BLAT command: %s", command)
result = subprocess.run(
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
_logger.debug("BLAT command finished with result %s", result.returncode)
if result.returncode != 0:
raise AlignmentError(
f"BLAT process returned error code {result.returncode}: {target_args} {query_file} {out_file}"
)
return result


def _get_blat_output(
scoreset_metadata: ScoresetMetadata,
query_file: Path,
silent: bool,
use_cached: bool,
) -> QueryResult:
"""Run a BLAT query and returns a path to the output object.
def _get_cached_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult:
"""Get a BLAT output object for the given scoreset -- either reusing a previously-
run query output file, or making a new one if unavailable.
This method is broken out from ``_get_blat_output`` because it was getting pretty
messy to handle differing file management logic between the two of them. However,
query/command generation logic should be shared.
:param metadata:
:param silent:
:return:
"""
out_file = LOCAL_STORE_PATH / f"{metadata.urn}_blat_output.psl"
if out_file.exists():
return read_blat(out_file.absolute(), "blat-psl")
query_file = next(_build_query_file(metadata))
if metadata.target_sequence_type == TargetSequenceType.PROTEIN:
target_args = "-q=prot -t=dnax"
else:
target_args = ""
_run_blat(target_args, query_file, str(out_file.absolute()), silent)
try:
output = read_blat(out_file.absolute(), "blat-psl")
except ValueError:
target_args = "-q=dnax -t=dnax"
_run_blat(target_args, query_file, str(out_file.absolute()), silent)
try:
output = read_blat(out_file.absolute(), "blat-psl")
except ValueError:
raise AlignmentError(
f"Unable to get valid BLAT response for {metadata.urn}"
)
return output

We create query and output files in the application's "temporary" folder, which
should be deleted by the process once complete. This happens manually, but we could
probably add a decorator or a context manager for a bit more elegance.

Ideally, we should see if we could pipe query output to STDOUT and then grab/process
it that way instead of using a temporary intermediary file.
def _write_blat_output_tempfile(result: subprocess.CompletedProcess) -> str:
"""Create temp BLAT output file. Not immediately deleted, but should eventually
be cleared by the OS.
:param result: BLAT process result object
:return: path-like string representing file location
"""
raw_output = result.stdout.split(b"Loaded")[0]
tmp = tempfile.NamedTemporaryFile(delete=False)
tmp.write(raw_output)
return tmp.name


def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult:
"""Run a BLAT query and returns a path to the output object.
If unable to produce a valid query the first time, then try a query using ``dnax``
bases.
:param scoreset_metadata: object containing scoreset attributes
:param query_file: Path to BLAT query file
:param silent: suppress BLAT command output
:param use_cached: if True, don't rerun BLAT if output file already exists, and don't
save it to a temporary location. This is probably only useful during development.
:return: BLAT query result
:raise AlignmentError: if BLAT subprocess returns error code
"""
if use_cached:
out_file = get_cached_blat_output(scoreset_metadata.urn)
query_file = next(_build_query_file(metadata))
if metadata.target_sequence_type == TargetSequenceType.PROTEIN:
target_args = "-q=prot -t=dnax"
else:
out_file = None
if not use_cached or not out_file:
reference_genome_file = get_ref_genome_file(
silent=silent
) # TODO hg38 by default--what about earlier builds?
if use_cached:
out_file = LOCAL_STORE_PATH / f"{scoreset_metadata.urn}_blat_output.psl"
else:
out_file = (
get_mapping_tmp_dir()
/ f"blat_out_{scoreset_metadata.urn}_{uuid.uuid1()}.psl"
)

if scoreset_metadata.target_sequence_type == TargetSequenceType.PROTEIN:
target_commands = "-q=prot -t=dnax"
elif scoreset_metadata.target_sequence_type == TargetSequenceType.DNA:
target_commands = "-q=dnax -t=dnax"
else:
query_file.unlink()
out_file.unlink()
raise AlignmentError(
f"Unknown target sequence type: {scoreset_metadata.target_sequence_type} for scoreset {scoreset_metadata.urn}"
)
command = f"blat {reference_genome_file} {target_commands} -minScore=20 {query_file} {out_file}"
if silent:
kwargs = {"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT}
else:
kwargs = {}
process = _run_blat_command(command, kwargs)
if process.returncode != 0:
query_file.unlink()
out_file.unlink()
raise AlignmentError(
f"BLAT process returned error code {process.returncode}: {command}"
)
# TODO
# the notebooks handle errors here by trying different BLAT arg configurations --
# investigate, refer to older code if it comes up
# ideally we should be forming correct queries up front instead of running
# failed alignment attempts
output = read_blat(out_file.absolute(), "blat-psl")

# clean up
query_file.unlink()
if not use_cached:
out_file.unlink()
target_args = ""
process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
out_file = _write_blat_output_tempfile(process_result)

try:
output = read_blat(out_file, "blat-psl")
except ValueError:
target_args = "-q=dnax -t=dnax"
process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
out_file = _write_blat_output_tempfile(process_result)
try:
output = read_blat(out_file, "blat-psl")
except ValueError:
raise AlignmentError(f"Unable to run successful BLAT on {metadata.urn}")

return output

Expand Down Expand Up @@ -292,8 +309,10 @@ def align(
click.echo(msg)
_logger.info(msg)

query_file = next(_build_query_file(scoreset_metadata))
blat_output = _get_blat_output(scoreset_metadata, query_file, silent, use_cached)
if use_cached:
blat_output = _get_cached_blat_output(scoreset_metadata, silent)
else:
blat_output = _get_blat_output(scoreset_metadata, silent)
match = _get_best_match(blat_output, scoreset_metadata)

msg = "Alignment complete."
Expand Down
10 changes: 5 additions & 5 deletions src/dcd_mapping/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,16 @@ def cli(urn: str, debug: bool, cache_align: bool) -> None:
:param debug: if True, enable debug logging
:param cache_align: if True, save alignment output and reuse when available
""" # noqa: D301
if debug:
log_level = logging.DEBUG
else:
log_level = logging.INFO
logging.basicConfig(
filename="dcd-mapping.log",
format="%(asctime)s %(levelname)s:%(name)s:%(message)s",
level=logging.INFO,
level=log_level,
force=True,
)
if debug:
_logger.setLevel(logging.DEBUG)
else:
_logger.setLevel(logging.INFO)
_logger.debug("debug logging enabled")
asyncio.run(map_scoreset_urn(urn, silent=False, cache_align=cache_align))

Expand Down
4 changes: 2 additions & 2 deletions src/dcd_mapping/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ async def map_scoreset(
"""
try:
alignment_result = align(metadata, silent, cache_align)
except AlignmentError:
_logger.error(f"Alignment failed for scoreset {metadata.urn}")
except AlignmentError as e:
_logger.error(f"Alignment failed for scoreset {metadata.urn}: {e}")
return None

try:
Expand Down
14 changes: 0 additions & 14 deletions src/dcd_mapping/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
from dcd_mapping.schemas import ReferenceGenome, ScoreRow, ScoresetMetadata, UniProtRef

__all__ = [
"get_cached_blat_output",
"get_scoreset_urns",
"get_human_urns",
"get_raw_scoreset_metadata",
Expand All @@ -47,19 +46,6 @@
LOCAL_STORE_PATH.mkdir(exist_ok=True, parents=True)


def get_cached_blat_output(urn: str) -> Optional[Path]:
"""Return cached BLAT output if available. Mostly useful for development/testing.
:param urn: identifier for scoreset
:return: path to BLAT output if exists
"""
out_file = LOCAL_STORE_PATH / f"{urn}_blat_output.psl"
if out_file.exists():
return out_file
else:
return None


class ResourceAcquisitionError(Exception):
"""Raise when resource acquisition fails."""

Expand Down

0 comments on commit d1b5f34

Please sign in to comment.