diff --git a/TODO.md b/TODO.md index 8f18152..beadc69 100644 --- a/TODO.md +++ b/TODO.md @@ -2,15 +2,31 @@ General: * notes scattered in comments and docstrings (sorry!) -* Docker-ification. More or less needs to start from scratch (there should be existing images for SeqRepo and UTA, not sure how up-to-date they are). -* Add extra stuff that appears in mapping JSON objects. +* Docker-ification. (there should be existing images for SeqRepo and UTA, not sure how up-to-date they are). +* Add extra stuff that appears in mapping JSON objects (``vrs_ref_allele_seq``). * Currently using VRS 2.0a-based libraries. For lifting back to VRS 1.3, some basic post-processing should be fine (annoying but shouldn't be too trivial) * Without access to a production DynamoDB instance, Gene Normalizer will be quickest and easiest to set up via a PostgreSQL data backend. That, however, requires an extra dependency group (noted in README). We might want to make a `pg` dependency group here or just include it in core dependencies. * On that note, I've only done minimal testing of how possible it would be to drop the gene normalizer dependency entirely, but it'd be nice to get there. * Some of the singleton/factory stuff might be cleaner as `global`s +* make `silent` a global Alignment: -* Pretty sure this is mostly done. Haven't tested exhaustively, though. +* Pretty sure this is mostly done. 
+* Getting disagreements vs Jeremy's work on strand for the following: + * urn:mavedb:00000032-a-1 + * urn:mavedb:00000083-i-1 + * urn:mavedb:00000034-a-1 + * urn:mavedb:00000043-a-1 + * urn:mavedb:00000083-c-1 + * urn:mavedb:00000083-g-1 + * urn:mavedb:00000068-a-1 + * urn:mavedb:00000043-a-2 + * urn:mavedb:00000001-c-1 + * urn:mavedb:00000083-f-1 + * urn:mavedb:00000083-b-1 + * urn:mavedb:00000068-c-1 + * urn:mavedb:00000001-c-2 + * urn:mavedb:00000034-b-1 * Need to sufficiently mock/patch things in tests Transcript selection: diff --git a/pyproject.toml b/pyproject.toml index e62d95f..0d5fd0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ description = "Map MaveDB scoresets to VRS objects" authors = [ {name = "Alex Handler Wagner", email = "Alex.Wagner@nationwidechildrens.org"}, {name = "Jeremy Arbesfeld", email = "Jeremy.Arbesfeld@nationwidechildrens.org"}, - {name = "Samriddhi Singh", email = ""}, + {name = "Samriddhi Singh", email = "todo@todo.org"}, {name = "James Stevenson", email = "James.Stevenson@nationwidechildrens.org"}, ] readme = "README.md" @@ -48,7 +48,7 @@ tests = [ "pytest-asyncio" ] dev = [ - "ruff>=0.1.9", + "ruff>=0.1.14", "pre-commit" ] diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index 963321f..3405398 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -1,9 +1,10 @@ """Align MaveDB target sequences to a human reference genome.""" import logging import subprocess +import tempfile import uuid from pathlib import Path -from typing import Any, Dict, Generator, List, Optional +from typing import Any, Generator, List, Optional from Bio.SearchIO import HSP from Bio.SearchIO import read as read_blat @@ -14,7 +15,6 @@ from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location from dcd_mapping.resources import ( LOCAL_STORE_PATH, - get_cached_blat_output, get_mapping_tmp_dir, get_ref_genome_file, ) @@ -56,18 +56,18 @@ def _build_query_file(scoreset_metadata: ScoresetMetadata) 
-> Generator[Path, An query_file = ( get_mapping_tmp_dir() / f"blat_query_{scoreset_metadata.urn}_{uuid.uuid1()}.fa" ) - _logger.debug("Writing BLAT query to %", query_file) + _logger.debug("Writing BLAT query to %s", query_file) lines = [">query", scoreset_metadata.target_sequence] _write_query_file(query_file, lines) yield query_file query_file.unlink() -def _run_blat_command(command: str, args: Dict) -> subprocess.CompletedProcess: +def _run_blat( + target_args: str, query_file: Path, out_file: str, silent: bool +) -> subprocess.CompletedProcess: """Execute BLAT binary with relevant params. - This function is broken out to enable mocking while testing. - Currently, we rely on a system-installed BLAT binary accessible in the containing environment's PATH. This is sort of awkward and it'd be nice to make use of some direct bindings or better packaging if that's possible. @@ -75,86 +75,103 @@ def _run_blat_command(command: str, args: Dict) -> subprocess.CompletedProcess: * Perhaps `gget`? https://pachterlab.github.io/gget/en/blat.html * ``PxBlat``? 
https://github.com/ylab-hi/pxblat - :param command: shell command to execute - :param args: ``subprocess.run`` extra args (eg redirecting output for silent mode) + :param target_args: target params eg ``"-q=prot -t=dnax"`` (can be empty) + :param query_file: path to query FASTA file + :param out_file: path-like string to output file (could be "/dev/stdout") + :param silent: if True, suppress all console output :return: process result """ - _logger.debug("Running BLAT command: %", command) - return subprocess.run(command, shell=True, **args) + reference_genome_file = get_ref_genome_file(silent=silent) + command = f"blat {reference_genome_file} {target_args} -minScore=20 {query_file} {out_file}" + _logger.debug("Running BLAT command: %s", command) + result = subprocess.run( + command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + _logger.debug("BLAT command finished with result %s", result.returncode) + if result.returncode != 0: + raise AlignmentError( + f"BLAT process returned error code {result.returncode}: {target_args} {query_file} {out_file}" + ) + return result -def _get_blat_output( - scoreset_metadata: ScoresetMetadata, - query_file: Path, - silent: bool, - use_cached: bool, -) -> QueryResult: - """Run a BLAT query and returns a path to the output object. +def _get_cached_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult: + """Get a BLAT output object for the given scoreset -- either reusing a previously- + run query output file, or making a new one if unavailable. + + This method is broken out from ``_get_blat_output`` because it was getting pretty + messy to handle differing file management logic between the two of them. However, + query/command generation logic should be shared. 
+ + :param metadata: + :param silent: + :return: + """ + out_file = LOCAL_STORE_PATH / f"{metadata.urn}_blat_output.psl" + if out_file.exists(): + return read_blat(out_file.absolute(), "blat-psl") + query_file = next(_build_query_file(metadata)) + if metadata.target_sequence_type == TargetSequenceType.PROTEIN: + target_args = "-q=prot -t=dnax" + else: + target_args = "" + _run_blat(target_args, query_file, str(out_file.absolute()), silent) + try: + output = read_blat(out_file.absolute(), "blat-psl") + except ValueError: + target_args = "-q=dnax -t=dnax" + _run_blat(target_args, query_file, str(out_file.absolute()), silent) + try: + output = read_blat(out_file.absolute(), "blat-psl") + except ValueError: + raise AlignmentError( + f"Unable to get valid BLAT response for {metadata.urn}" + ) + return output - We create query and output files in the application's "temporary" folder, which - should be deleted by the process once complete. This happens manually, but we could - probably add a decorator or a context manager for a bit more elegance. - Ideally, we should see if we could pipe query output to STDOUT and then grab/process - it that way instead of using a temporary intermediary file. +def _write_blat_output_tempfile(result: subprocess.CompletedProcess) -> str: + """Create temp BLAT output file. Not immediately deleted, but should eventually + be cleared by the OS. + + :param result: BLAT process result object + :return: path-like string representing file location + """ + raw_output = result.stdout.split(b"Loaded")[0] + tmp = tempfile.NamedTemporaryFile(delete=False) + tmp.write(raw_output) + return tmp.name + + +def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult: + """Run a BLAT query and returns a path to the output object. + + If unable to produce a valid query the first time, then try a query using ``dnax`` + bases. 
:param scoreset_metadata: object containing scoreset attributes - :param query_file: Path to BLAT query file :param silent: suppress BLAT command output - :param use_cached: if True, don't rerun BLAT if output file already exists, and don't - save it to a temporary location. This is probably only useful during development. :return: BLAT query result :raise AlignmentError: if BLAT subprocess returns error code """ - if use_cached: - out_file = get_cached_blat_output(scoreset_metadata.urn) + query_file = next(_build_query_file(metadata)) + if metadata.target_sequence_type == TargetSequenceType.PROTEIN: + target_args = "-q=prot -t=dnax" else: - out_file = None - if not use_cached or not out_file: - reference_genome_file = get_ref_genome_file( - silent=silent - ) # TODO hg38 by default--what about earlier builds? - if use_cached: - out_file = LOCAL_STORE_PATH / f"{scoreset_metadata.urn}_blat_output.psl" - else: - out_file = ( - get_mapping_tmp_dir() - / f"blat_out_{scoreset_metadata.urn}_{uuid.uuid1()}.psl" - ) - - if scoreset_metadata.target_sequence_type == TargetSequenceType.PROTEIN: - target_commands = "-q=prot -t=dnax" - elif scoreset_metadata.target_sequence_type == TargetSequenceType.DNA: - target_commands = "-q=dnax -t=dnax" - else: - query_file.unlink() - out_file.unlink() - raise AlignmentError( - f"Unknown target sequence type: {scoreset_metadata.target_sequence_type} for scoreset {scoreset_metadata.urn}" - ) - command = f"blat {reference_genome_file} {target_commands} -minScore=20 {query_file} {out_file}" - if silent: - kwargs = {"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT} - else: - kwargs = {} - process = _run_blat_command(command, kwargs) - if process.returncode != 0: - query_file.unlink() - out_file.unlink() - raise AlignmentError( - f"BLAT process returned error code {process.returncode}: {command}" - ) - # TODO - # the notebooks handle errors here by trying different BLAT arg configurations -- - # investigate, refer to older code if it 
comes up - # ideally we should be forming correct queries up front instead of running - # failed alignment attempts - output = read_blat(out_file.absolute(), "blat-psl") - - # clean up - query_file.unlink() - if not use_cached: - out_file.unlink() + target_args = "" + process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) + out_file = _write_blat_output_tempfile(process_result) + + try: + output = read_blat(out_file, "blat-psl") + except ValueError: + target_args = "-q=dnax -t=dnax" + process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) + out_file = _write_blat_output_tempfile(process_result) + try: + output = read_blat(out_file, "blat-psl") + except ValueError: + raise AlignmentError(f"Unable to run successful BLAT on {metadata.urn}") return output @@ -292,8 +309,10 @@ def align( click.echo(msg) _logger.info(msg) - query_file = next(_build_query_file(scoreset_metadata)) - blat_output = _get_blat_output(scoreset_metadata, query_file, silent, use_cached) + if use_cached: + blat_output = _get_cached_blat_output(scoreset_metadata, silent) + else: + blat_output = _get_blat_output(scoreset_metadata, silent) match = _get_best_match(blat_output, scoreset_metadata) msg = "Alignment complete." 
diff --git a/src/dcd_mapping/cli.py b/src/dcd_mapping/cli.py index e65c535..fa9b9b6 100644 --- a/src/dcd_mapping/cli.py +++ b/src/dcd_mapping/cli.py @@ -39,16 +39,16 @@ def cli(urn: str, debug: bool, cache_align: bool) -> None: :param debug: if True, enable debug logging :param cache_align: if True, save alignment output and reuse when available """ # noqa: D301 + if debug: + log_level = logging.DEBUG + else: + log_level = logging.INFO logging.basicConfig( filename="dcd-mapping.log", format="%(asctime)s %(levelname)s:%(name)s:%(message)s", - level=logging.INFO, + level=log_level, force=True, ) - if debug: - _logger.setLevel(logging.DEBUG) - else: - _logger.setLevel(logging.INFO) _logger.debug("debug logging enabled") asyncio.run(map_scoreset_urn(urn, silent=False, cache_align=cache_align)) diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 85d73f1..d7e9985 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -56,8 +56,8 @@ async def map_scoreset( """ try: alignment_result = align(metadata, silent, cache_align) - except AlignmentError: - _logger.error(f"Alignment failed for scoreset {metadata.urn}") + except AlignmentError as e: + _logger.error(f"Alignment failed for scoreset {metadata.urn}: {e}") return None try: diff --git a/src/dcd_mapping/resources.py b/src/dcd_mapping/resources.py index dde7cf7..389031e 100644 --- a/src/dcd_mapping/resources.py +++ b/src/dcd_mapping/resources.py @@ -24,7 +24,6 @@ from dcd_mapping.schemas import ReferenceGenome, ScoreRow, ScoresetMetadata, UniProtRef __all__ = [ - "get_cached_blat_output", "get_scoreset_urns", "get_human_urns", "get_raw_scoreset_metadata", @@ -47,19 +46,6 @@ LOCAL_STORE_PATH.mkdir(exist_ok=True, parents=True) -def get_cached_blat_output(urn: str) -> Optional[Path]: - """Return cached BLAT output if available. Mostly useful for development/testing. 
- - :param urn: identifier for scoreset - :return: path to BLAT output if exists - """ - out_file = LOCAL_STORE_PATH / f"{urn}_blat_output.psl" - if out_file.exists(): - return out_file - else: - return None - - class ResourceAcquisitionError(Exception): """Raise when resource acquisition fails."""