Skip to content

Commit

Permalink
fix: don't use dnax query for dna target sequences (#11)
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson authored Jan 25, 2024
1 parent 0bc6969 commit cd9573a
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 101 deletions.
22 changes: 19 additions & 3 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,31 @@

General:
* notes scattered in comments and docstrings (sorry!)
* Docker-ification. More or less needs to start from scratch (there should be existing images for SeqRepo and UTA, not sure how up-to-date they are).
* Add extra stuff that appears in mapping JSON objects.
* Docker-ification. (there should be existing images for SeqRepo and UTA, not sure how up-to-date they are).
* Add extra stuff that appears in mapping JSON objects (``vrs_ref_allele_seq``).
* Currently using VRS 2.0a-based libraries. For lifting back to VRS 1.3, some basic post-processing should be fine (annoying but shouldn't be too trivial)
* Without access to a production DynamoDB instance, Gene Normalizer will be quickest and easiest to set up via a PostgreSQL data backend. That, however, requires an extra dependency group (noted in README). We might want to make a `pg` dependency group here or just include it in core dependencies.
* On that note, I've only done minimal testing of how possible it would be to drop the gene normalizer dependency entirely, but it'd be nice to get there.
* Some of the singleton/factory stuff might be cleaner as `global`s
* make `silent` a global

Alignment:
* Pretty sure this is mostly done. Haven't tested exhaustively, though.
* Pretty sure this is mostly done.
* Getting disagreements vs Jeremy's work on strand for the following:
* urn:mavedb:00000032-a-1
* urn:mavedb:00000083-i-1
* urn:mavedb:00000034-a-1
* urn:mavedb:00000043-a-1
* urn:mavedb:00000083-c-1
* urn:mavedb:00000083-g-1
* urn:mavedb:00000068-a-1
* urn:mavedb:00000043-a-2
* urn:mavedb:00000001-c-1
* urn:mavedb:00000083-f-1
* urn:mavedb:00000083-b-1
* urn:mavedb:00000068-c-1
* urn:mavedb:00000001-c-2
* urn:mavedb:00000034-b-1
* Need to sufficiently mock/patch things in tests

Transcript selection:
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ description = "Map MaveDB scoresets to VRS objects"
authors = [
{name = "Alex Handler Wagner", email = "[email protected]"},
{name = "Jeremy Arbesfeld", email = "[email protected]"},
{name = "Samriddhi Singh", email = ""},
{name = "Samriddhi Singh", email = "[email protected]"},
{name = "James Stevenson", email = "[email protected]"},
]
readme = "README.md"
Expand Down Expand Up @@ -48,7 +48,7 @@ tests = [
"pytest-asyncio"
]
dev = [
"ruff>=0.1.9",
"ruff>=0.1.14",
"pre-commit"
]

Expand Down
169 changes: 94 additions & 75 deletions src/dcd_mapping/align.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""Align MaveDB target sequences to a human reference genome."""
import logging
import subprocess
import tempfile
import uuid
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional
from typing import Any, Generator, List, Optional

from Bio.SearchIO import HSP
from Bio.SearchIO import read as read_blat
Expand All @@ -14,7 +15,6 @@
from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location
from dcd_mapping.resources import (
LOCAL_STORE_PATH,
get_cached_blat_output,
get_mapping_tmp_dir,
get_ref_genome_file,
)
Expand Down Expand Up @@ -56,105 +56,122 @@ def _build_query_file(scoreset_metadata: ScoresetMetadata) -> Generator[Path, An
query_file = (
get_mapping_tmp_dir() / f"blat_query_{scoreset_metadata.urn}_{uuid.uuid1()}.fa"
)
_logger.debug("Writing BLAT query to %", query_file)
_logger.debug("Writing BLAT query to %s", query_file)
lines = [">query", scoreset_metadata.target_sequence]
_write_query_file(query_file, lines)
yield query_file
query_file.unlink()


def _run_blat_command(command: str, args: Dict) -> subprocess.CompletedProcess:
def _run_blat(
target_args: str, query_file: Path, out_file: str, silent: bool
) -> subprocess.CompletedProcess:
"""Execute BLAT binary with relevant params.
This function is broken out to enable mocking while testing.
Currently, we rely on a system-installed BLAT binary accessible in the containing
environment's PATH. This is sort of awkward and it'd be nice to make use of some
direct bindings or better packaging if that's possible.
* Perhaps `gget`? https://pachterlab.github.io/gget/en/blat.html
* ``PxBlat``? https://github.com/ylab-hi/pxblat
:param command: shell command to execute
:param args: ``subprocess.run`` extra args (eg redirecting output for silent mode)
:param target_args: target params eg ``"-q=prot -t=dnax"`` (can be empty)
:param query_file: path to query FASTA file
:param out_file: path-like string to output fill (could be "/dev/stdout")
:param silent: if True, suppress all console output
:return: process result
"""
_logger.debug("Running BLAT command: %", command)
return subprocess.run(command, shell=True, **args)
reference_genome_file = get_ref_genome_file(silent=silent)
command = f"blat {reference_genome_file} {target_args} -minScore=20 {query_file} {out_file}"
_logger.debug("Running BLAT command: %s", command)
result = subprocess.run(
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
_logger.debug("BLAT command finished with result %s", result.returncode)
if result.returncode != 0:
raise AlignmentError(
f"BLAT process returned error code {result.returncode}: {target_args} {query_file} {out_file}"
)
return result


def _get_blat_output(
scoreset_metadata: ScoresetMetadata,
query_file: Path,
silent: bool,
use_cached: bool,
) -> QueryResult:
"""Run a BLAT query and returns a path to the output object.
def _get_cached_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult:
"""Get a BLAT output object for the given scoreset -- either reusing a previously-
run query output file, or making a new one if unavailable.
This method is broken out from ``_get_blat_output`` because it was getting pretty
messy to handle differing file management logic between the two of them. However,
query/command generation logic should be shared.
:param metadata:
:param silent:
:return:
"""
out_file = LOCAL_STORE_PATH / f"{metadata.urn}_blat_output.psl"
if out_file.exists():
return read_blat(out_file.absolute(), "blat-psl")
query_file = next(_build_query_file(metadata))
if metadata.target_sequence_type == TargetSequenceType.PROTEIN:
target_args = "-q=prot -t=dnax"
else:
target_args = ""
_run_blat(target_args, query_file, str(out_file.absolute()), silent)
try:
output = read_blat(out_file.absolute(), "blat-psl")
except ValueError:
target_args = "-q=dnax -t=dnax"
_run_blat(target_args, query_file, str(out_file.absolute()), silent)
try:
output = read_blat(out_file.absolute(), "blat-psl")
except ValueError:
raise AlignmentError(
f"Unable to get valid BLAT response for {metadata.urn}"
)
return output

We create query and output files in the application's "temporary" folder, which
should be deleted by the process once complete. This happens manually, but we could
probably add a decorator or a context manager for a bit more elegance.

Ideally, we should see if we could pipe query output to STDOUT and then grab/process
it that way instead of using a temporary intermediary file.
def _write_blat_output_tempfile(result: subprocess.CompletedProcess) -> str:
"""Create temp BLAT output file. Not immediately deleted, but should eventually
be cleared by the OS.
:param result: BLAT process result object
:return: path-like string representing file location
"""
raw_output = result.stdout.split(b"Loaded")[0]
tmp = tempfile.NamedTemporaryFile(delete=False)
tmp.write(raw_output)
return tmp.name


def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult:
"""Run a BLAT query and returns a path to the output object.
If unable to produce a valid query the first time, then try a query using ``dnax``
bases.
:param scoreset_metadata: object containing scoreset attributes
:param query_file: Path to BLAT query file
:param silent: suppress BLAT command output
:param use_cached: if True, don't rerun BLAT if output file already exists, and don't
save it to a temporary location. This is probably only useful during development.
:return: BLAT query result
:raise AlignmentError: if BLAT subprocess returns error code
"""
if use_cached:
out_file = get_cached_blat_output(scoreset_metadata.urn)
query_file = next(_build_query_file(metadata))
if metadata.target_sequence_type == TargetSequenceType.PROTEIN:
target_args = "-q=prot -t=dnax"
else:
out_file = None
if not use_cached or not out_file:
reference_genome_file = get_ref_genome_file(
silent=silent
) # TODO hg38 by default--what about earlier builds?
if use_cached:
out_file = LOCAL_STORE_PATH / f"{scoreset_metadata.urn}_blat_output.psl"
else:
out_file = (
get_mapping_tmp_dir()
/ f"blat_out_{scoreset_metadata.urn}_{uuid.uuid1()}.psl"
)

if scoreset_metadata.target_sequence_type == TargetSequenceType.PROTEIN:
target_commands = "-q=prot -t=dnax"
elif scoreset_metadata.target_sequence_type == TargetSequenceType.DNA:
target_commands = "-q=dnax -t=dnax"
else:
query_file.unlink()
out_file.unlink()
raise AlignmentError(
f"Unknown target sequence type: {scoreset_metadata.target_sequence_type} for scoreset {scoreset_metadata.urn}"
)
command = f"blat {reference_genome_file} {target_commands} -minScore=20 {query_file} {out_file}"
if silent:
kwargs = {"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT}
else:
kwargs = {}
process = _run_blat_command(command, kwargs)
if process.returncode != 0:
query_file.unlink()
out_file.unlink()
raise AlignmentError(
f"BLAT process returned error code {process.returncode}: {command}"
)
# TODO
# the notebooks handle errors here by trying different BLAT arg configurations --
# investigate, refer to older code if it comes up
# ideally we should be forming correct queries up front instead of running
# failed alignment attempts
output = read_blat(out_file.absolute(), "blat-psl")

# clean up
query_file.unlink()
if not use_cached:
out_file.unlink()
target_args = ""
process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
out_file = _write_blat_output_tempfile(process_result)

try:
output = read_blat(out_file, "blat-psl")
except ValueError:
target_args = "-q=dnax -t=dnax"
process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
out_file = _write_blat_output_tempfile(process_result)
try:
output = read_blat(out_file, "blat-psl")
except ValueError:
raise AlignmentError(f"Unable to run successful BLAT on {metadata.urn}")

return output

Expand Down Expand Up @@ -292,8 +309,10 @@ def align(
click.echo(msg)
_logger.info(msg)

query_file = next(_build_query_file(scoreset_metadata))
blat_output = _get_blat_output(scoreset_metadata, query_file, silent, use_cached)
if use_cached:
blat_output = _get_cached_blat_output(scoreset_metadata, silent)
else:
blat_output = _get_blat_output(scoreset_metadata, silent)
match = _get_best_match(blat_output, scoreset_metadata)

msg = "Alignment complete."
Expand Down
10 changes: 5 additions & 5 deletions src/dcd_mapping/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,16 @@ def cli(urn: str, debug: bool, cache_align: bool) -> None:
:param debug: if True, enable debug logging
:param cache_align: if True, save alignment output and reuse when available
""" # noqa: D301
if debug:
log_level = logging.DEBUG
else:
log_level = logging.INFO
logging.basicConfig(
filename="dcd-mapping.log",
format="%(asctime)s %(levelname)s:%(name)s:%(message)s",
level=logging.INFO,
level=log_level,
force=True,
)
if debug:
_logger.setLevel(logging.DEBUG)
else:
_logger.setLevel(logging.INFO)
_logger.debug("debug logging enabled")
asyncio.run(map_scoreset_urn(urn, silent=False, cache_align=cache_align))

Expand Down
4 changes: 2 additions & 2 deletions src/dcd_mapping/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ async def map_scoreset(
"""
try:
alignment_result = align(metadata, silent, cache_align)
except AlignmentError:
_logger.error(f"Alignment failed for scoreset {metadata.urn}")
except AlignmentError as e:
_logger.error(f"Alignment failed for scoreset {metadata.urn}: {e}")
return None

try:
Expand Down
14 changes: 0 additions & 14 deletions src/dcd_mapping/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
from dcd_mapping.schemas import ReferenceGenome, ScoreRow, ScoresetMetadata, UniProtRef

__all__ = [
"get_cached_blat_output",
"get_scoreset_urns",
"get_human_urns",
"get_raw_scoreset_metadata",
Expand All @@ -47,19 +46,6 @@
LOCAL_STORE_PATH.mkdir(exist_ok=True, parents=True)


def get_cached_blat_output(urn: str) -> Optional[Path]:
"""Return cached BLAT output if available. Mostly useful for development/testing.
:param urn: identifier for scoreset
:return: path to BLAT output if exists
"""
out_file = LOCAL_STORE_PATH / f"{urn}_blat_output.psl"
if out_file.exists():
return out_file
else:
return None


class ResourceAcquisitionError(Exception):
"""Raise when resource acquisition fails."""

Expand Down

0 comments on commit cd9573a

Please sign in to comment.