Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Input-output format #25

Merged
merged 15 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 30 additions & 58 deletions mavedb_mapping/blat_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,17 @@

from mavedb_mapping import path_to_hg38_file

"""
Module for performing BLAT Alignment using target sequence.
Returns Query sequence locations with corresponding locations in the
hit sequence obtained by BLAT.

"""


def extract_blat_output(dat: dict):
"""
Runs a BLAT Query and returns the output
Runs a BLAT Query and returns the output.

Parameters
----------
Expand All @@ -19,32 +26,37 @@ def extract_blat_output(dat: dict):
BLAT Output
"""
blat_file = open("blat_query.fa", "w")
blat_file.write(">" + dat["urn"] + "\n")
blat_file.write(">" + "query" + "\n")
blat_file.write(dat["target_sequence"] + "\n")
blat_file.close()
# minimum match 50%
min_score = len(dat["target_sequence"]) // 2

if dat["target_sequence_type"] == "protein":
command = f"blat {path_to_hg38_file} -q=prot -t=dnax -minScore=20 blat_query.fa blat_out.psl"
command = f"blat {path_to_hg38_file} -q=prot -t=dnax -minScore={min_score} blat_query.fa blat_out.psl"
process = subprocess.run(command, shell=True)
else:
command = f"blat {path_to_hg38_file} -minScore=20 blat_query.fa blat_out.psl"
command = (
f"blat {path_to_hg38_file} -minScore={min_score} blat_query.fa blat_out.psl"
)
process = subprocess.run(command, shell=True)
try:
output = SearchIO.read("blat_out.psl", "blat-psl")
except:
except ValueError:
try:
command = f"blat {path_to_hg38_file} -q=dnax -t=dnax -minScore=20 blat_query.fa blat_out.psl"
command = f"blat {path_to_hg38_file} -q=dnax -t=dnax -minScore={min_score} blat_query.fa blat_out.psl"
process = subprocess.run(command, shell=True)
output = SearchIO.read("blat_out.psl", "blat-psl")
except:
except ValueError:
return None
return output


def obtain_hit_starts(output, hit):
def obtain_hit_starts(output, hit: int):
# a hit is a portion of similarity between query seq and matched seq
"""
Helper function to obtain HSP
Returns the start of hit
Helper function to obtain HSP.
Returns the starts of hit sequence.
"""
hit_starts = list()
for n in range(len(output[hit])):
Expand All @@ -54,7 +66,7 @@ def obtain_hit_starts(output, hit):

def obtain_hsp(output):
"""
Obtains high-scoring pairs (HSP) for query sequence
Obtains high-scoring pairs (HSP) for query sequence.

Parameters
----------
Expand All @@ -80,7 +92,7 @@ def obtain_hsp(output):
return hsp


def from_hsp_output(hsp, output, gsymb):
def get_query_and_hit_ranges(hsp, output) -> tuple:
"""

Parameters
Expand All @@ -94,14 +106,13 @@ def from_hsp_output(hsp, output, gsymb):

Returns
-------
Tuple with data used for mapping
Tuple containing the chromosome, strand, coverage, query ranges, and hit ranges.

"""
query_ranges = hit_ranges = []
strand = hsp[0].query_strand
coverage = 100 * (hsp.query_end - hsp.query_start) / output.seq_len
identity = hsp.ident_pct

query_ranges = []
hit_ranges = []
for j in range(len(hsp)):
test_file = open("blat_output_test.txt", "w")
test_file.write(str(hsp[j]))
Expand All @@ -123,36 +134,10 @@ def from_hsp_output(hsp, output, gsymb):
chrom = chrom.split(" ")[9].strip("chr")
query_string = query_string.split(" ")
hit_string = hit_string.split(" ")

# Range in query sequence that aligned with hit sequence
query_ranges.append(query_string[2])

# Range in hit sequence that aligned with hquery sequence
hit_ranges.append(hit_string[4])

return chrom, strand, coverage, identity, query_ranges, hit_ranges, gsymb, None


def get_query_and_hit_ranges(output, dat: dict) -> tuple:
"""
Extracts query and hit ranges from the BLAT output.

Parameters
----------
output:
Output from the BLAT query.
dat: dict
Dictionary containing data required for mapping.

Returns
-------
Tuple containing the chromosome, strand, coverage, identity, query ranges, hit ranges, and gene symbol.
"""

hsp = obtain_hsp(output)

data_tuple = from_hsp_output(hsp, output, None)
return data_tuple
return chrom, strand, coverage, query_ranges, hit_ranges


def mave_to_blat(dat: dict) -> dict:
Expand All @@ -173,46 +158,33 @@ def mave_to_blat(dat: dict) -> dict:
"""
output = extract_blat_output(dat)
if output is not None:
hsp = obtain_hsp(output)
(
chrom,
strand,
coverage,
identity,
query_ranges,
hit_ranges,
gsymb,
accession,
) = get_query_and_hit_ranges(output, dat)
) = get_query_and_hit_ranges(hsp, output)
qh_dat = {"query_ranges": query_ranges, "hit_ranges": hit_ranges}
qh_dat = pd.DataFrame(data=qh_dat)
mave_blat_dict = {
"urn": dat["urn"],
"chrom": chrom,
"strand": strand,
"target_type": dat["target_type"],
"uniprot": dat["uniprot_id"],
"coverage": coverage,
"identity": identity,
"hits": qh_dat,
"gsymb": gsymb,
"accession": accession,
}

else:
qh_dat = {"query_ranges": ["NA"], "hit_ranges": ["NA"]}
qh_dat = pd.DataFrame(data=qh_dat)
mave_blat_dict = {
"urn": dat["urn"],
"chrom": "NA",
"strand": "NA",
"target": "NA",
"target_type": "NA",
"uniprot": "NA",
"coverage": "NA",
"identity": "NA",
"hits": qh_dat,
"gsymb": "NA",
"accession": "NA",
}

return mave_blat_dict
53 changes: 17 additions & 36 deletions mavedb_mapping/find_start_pos.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,34 @@
# Find start location in provided target sequence when start position is not first position of sequence
import requests
import Bio
import re
from Bio.Seq import Seq
import pandas as pd
import io
from Bio.SeqUtils import seq1
from Bio.Seq import Seq

offset_within_ts = {}


# maybe put in get haplotype
def validation_helper(protstring):
protstring = protstring[1:][:-1]
vs = protstring.split(";")
return vs


def if_start_not_first(dat):
def is_true(i, locs, ts, aa_dict):
for j in range(len(locs)):
if ts[i + locs[j] - locs[0]] != aa_dict[locs[j]]:
return False
return True


def if_start_not_first(dat: dict, vardat):
if dat["target_type"] == "Protein coding" and dat["target_sequence_type"] == "dna":
urn = dat["urn"]
if (
urn == "urn:mavedb:00000053-a-1" or urn == "urn:mavedb:00000053-a-2"
): # target sequence missing codon
return None
oseq = Seq(dat["target_sequence"])
ts = str(oseq.translate(table=1))

string = (
"https://api.mavedb.org/api/v1/score-sets/urn%3Amavedb%3A"
+ dat["urn"][11::]
+ "/scores"
)
# TODO: remove api call
origdat = requests.get(string).content
score_dat = pd.read_csv(io.StringIO(origdat.decode("utf-8")))
protlist = score_dat["hgvs_pro"].to_list()
if type(score_dat.at[0, "hgvs_pro"]) != str or score_dat.at[
0, "hgvs_pro"
].startswith("NP"):
protlist = vardat["hgvs_pro"]

if type(protlist[0]) != str or protlist[0].startswith("NP"):
return None

protlist = [x.lstrip("p.") for x in protlist]

aa_dict = {}
Expand Down Expand Up @@ -97,24 +86,16 @@ def if_start_not_first(dat):
aa_dict = sorted(aa_dict.items())
aa_dict = dict(aa_dict)
locs = list(aa_dict.keys())[0:5]
p0, p1, p2, p3, p4 = locs[0], locs[1], locs[2], locs[3], locs[4]
offset = locs[0]

seq = ""
for key in aa_dict:
seq = seq + aa_dict[key]

for i in range(len(ts)):
if (
ts[i] == aa_dict[p0]
and ts[i + p1 - p0] == aa_dict[p1]
and ts[i + p2 - p0] == aa_dict[p2]
and ts[i + p3 - p0] == aa_dict[p3]
and ts[i + p4 - p0] == aa_dict[p4]
):
if is_true(i, locs, ts, aa_dict):
if i + 1 == min(aa_dict.keys()) or i + 2 == min(aa_dict.keys()):
offset_within_ts[urn] = 0
offset = 0
else:
offset_within_ts[urn] = i
offset = i
break
return offset_within_ts
return offset
Loading