Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add initial alignment draft #3

Merged
merged 1 commit into from
Sep 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -160,4 +160,7 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

*.pickle
*.pickle

# BLAT query file directory
mavedb_mapping/tmp/*
115 changes: 114 additions & 1 deletion mavedb_mapping/align.py
Original file line number Diff line number Diff line change
@@ -1 +1,114 @@
"""Align MaveDB target sequences to the human genome."""
"""Align MaveDB target sequences to a human reference genome."""
import subprocess
from pathlib import Path
from typing import List, Optional

from Bio.SearchIO import read as read_blat
from Bio.SearchIO._model import QueryResult
from Bio.SearchIO._model.hsp import HSP

from mavedb_mapping.resources import get_mapping_tmp_dir, get_ref_genome_file
from mavedb_mapping.schemas import (
ScoresetMetadata,
TargetSequenceType,
)


class AlignmentError(Exception):
"""Raise when errors encountered during alignment."""


def _build_query_file(scoreset_metadata: ScoresetMetadata) -> Path:
"""Construct BLAT query file.

:param scoreset_metadata: MaveDB scoreset metadata object
:return: Path to constructed file
"""
query_file = get_mapping_tmp_dir() / "blat_query.fa"
with open(query_file, "w") as f:
f.write(">" + "query" + "\n")
f.write(scoreset_metadata.target_sequence + "\n")
f.close()
return query_file


def _run_blat(scoreset_metadata: ScoresetMetadata) -> QueryResult:
"""Run a BLAT query and returns the output.

How to handle the BLAT binary is a big TODO. There are some newish Python packages
that wrap it/provide bindings -- might be best to look there.

:param scoreset_metadata: object containing scoreset attributes
:param ref_genome_path: location of reference genome file
:return: BLAT query result
:raise AlignmentError: if BLAT subprocess returns error code
"""
query_file = _build_query_file(scoreset_metadata)
reference_genome_file = get_ref_genome_file()
min_score = len(scoreset_metadata.target_sequence) // 2 # minimum match 50%
out_file = get_mapping_tmp_dir() / "blat_out.psl"

if scoreset_metadata.target_sequence_type == TargetSequenceType.PROTEIN:
command = f"blat {reference_genome_file} -q=prot -t=dnax -minScore={min_score} {query_file.absolute()} {out_file.absolute()}"
else:
# missing `-t=dnax`?
command = f"blat {reference_genome_file} -minScore={min_score} {query_file.absolute()} {out_file.absolute()}"
process = subprocess.run(
command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT
)
if process.returncode != 0:
raise AlignmentError(
f"BLAT process returned error code {process.returncode}: {command}"
)

# seems like maybe this hits an error resolved by adding -q=dnax sometimes?
# investigate, refer to older code if it comes up
output = read_blat("blat_out.psl", "blat-psl")
return output


def _get_hit_starts(output: QueryResult, hit_index: int) -> List[int]:
"""Get hit starts from HSP object.

:param output: BLAT result object
:return: The starts of hit sequence.
"""
hit_starts: List[int] = []
for n in range(len(output[hit_index])):
hit_starts.append(output[hit_index][n].hit_start)
return hit_starts


def _get_hsp(output: QueryResult) -> HSP:
"""Obtain high-scoring pairs (HSP) for query sequence.

(I am pretty sure this should be refactored)

:param output: BLAT result object
"""
hit_dict = {}
for c in range(len(output)):
max_hit = output[c][0].score
for e in range(len(output[c])):
if output[c][e].score > max_hit:
max_hit = output[c][e].score
hit_dict[c] = max_hit

# Using top scoring hit
hit = max(hit_dict, key=hit_dict.get)
hit_starts = _get_hit_starts(output, hit)
hsp = output[hit][hit_starts.index(max(hit_starts))]
return hsp


def align(scoreset_metadata: ScoresetMetadata) -> Optional[HSP]:
"""Align target sequence to a reference genome.

:param scoreset_metadata: object containing scoreset metadata
:return: high-scoring pairs (HSP) object
"""
try:
blat_output = _run_blat(scoreset_metadata)
except AlignmentError:
return None
return _get_hsp(blat_output)
4 changes: 3 additions & 1 deletion mavedb_mapping/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import os
from pathlib import Path

LOCAL_STORE_PATH = Path(os.environ.get("MAVEDB_STORAGE_DIR", "~/.mave_db/storage"))
LOCAL_STORE_PATH = Path(
os.environ.get("MAVEDB_STORAGE_DIR", Path.home() / ".mave_db/storage")
)
if not LOCAL_STORE_PATH.exists():
LOCAL_STORE_PATH.mkdir(exist_ok=True, parents=True)
127 changes: 0 additions & 127 deletions mavedb_mapping/fetch.py

This file was deleted.

Loading