Skip to content

Commit

Permalink
Merge pull request #25 from ave-dcd/Input-outputs-formats
Browse files Browse the repository at this point in the history
Input-output format
  • Loading branch information
afrubin authored Dec 13, 2023
2 parents 422f097 + ebbcab8 commit 2deaa80
Show file tree
Hide file tree
Showing 21 changed files with 147,977 additions and 464 deletions.
88 changes: 30 additions & 58 deletions mavedb_mapping/blat_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,17 @@

from mavedb_mapping import path_to_hg38_file

"""
Module for performing BLAT Alignment using target sequence.
Returns Query sequence locations with corresponding locations in the
hit sequence obtained by BLAT.
"""


def extract_blat_output(dat: dict):
"""
Runs a BLAT Query and returns the output
Runs a BLAT Query and returns the output.
Parameters
----------
Expand All @@ -19,32 +26,37 @@ def extract_blat_output(dat: dict):
BLAT Output
"""
blat_file = open("blat_query.fa", "w")
blat_file.write(">" + dat["urn"] + "\n")
blat_file.write(">" + "query" + "\n")
blat_file.write(dat["target_sequence"] + "\n")
blat_file.close()
# minimum match 50%
min_score = len(dat["target_sequence"]) // 2

if dat["target_sequence_type"] == "protein":
command = f"blat {path_to_hg38_file} -q=prot -t=dnax -minScore=20 blat_query.fa blat_out.psl"
command = f"blat {path_to_hg38_file} -q=prot -t=dnax -minScore={min_score} blat_query.fa blat_out.psl"
process = subprocess.run(command, shell=True)
else:
command = f"blat {path_to_hg38_file} -minScore=20 blat_query.fa blat_out.psl"
command = (
f"blat {path_to_hg38_file} -minScore={min_score} blat_query.fa blat_out.psl"
)
process = subprocess.run(command, shell=True)
try:
output = SearchIO.read("blat_out.psl", "blat-psl")
except:
except ValueError:
try:
command = f"blat {path_to_hg38_file} -q=dnax -t=dnax -minScore=20 blat_query.fa blat_out.psl"
command = f"blat {path_to_hg38_file} -q=dnax -t=dnax -minScore={min_score} blat_query.fa blat_out.psl"
process = subprocess.run(command, shell=True)
output = SearchIO.read("blat_out.psl", "blat-psl")
except:
except ValueError:
return None
return output


def obtain_hit_starts(output, hit):
def obtain_hit_starts(output, hit: int):
# a hit is a portion of similarity between query seq and matched seq
"""
Helper function to obtain HSP
Returns the start of hit
Helper function to obtain HSP.
Returns the starts of hit sequence.
"""
hit_starts = list()
for n in range(len(output[hit])):
Expand All @@ -54,7 +66,7 @@ def obtain_hit_starts(output, hit):

def obtain_hsp(output):
"""
Obtains high-scoring pairs (HSP) for query sequence
Obtains high-scoring pairs (HSP) for query sequence.
Parameters
----------
Expand All @@ -80,7 +92,7 @@ def obtain_hsp(output):
return hsp


def from_hsp_output(hsp, output, gsymb):
def get_query_and_hit_ranges(hsp, output) -> tuple:
"""
Parameters
Expand All @@ -94,14 +106,13 @@ def from_hsp_output(hsp, output, gsymb):
Returns
-------
Tuple with data used for mapping
Tuple containing the chromosome, strand, coverage, query ranges, and hit ranges.
"""
query_ranges = hit_ranges = []
strand = hsp[0].query_strand
coverage = 100 * (hsp.query_end - hsp.query_start) / output.seq_len
identity = hsp.ident_pct

query_ranges = []
hit_ranges = []
for j in range(len(hsp)):
test_file = open("blat_output_test.txt", "w")
test_file.write(str(hsp[j]))
Expand All @@ -123,36 +134,10 @@ def from_hsp_output(hsp, output, gsymb):
chrom = chrom.split(" ")[9].strip("chr")
query_string = query_string.split(" ")
hit_string = hit_string.split(" ")

# Range in query sequence that aligned with hit sequence
query_ranges.append(query_string[2])

# Range in hit sequence that aligned with hquery sequence
hit_ranges.append(hit_string[4])

return chrom, strand, coverage, identity, query_ranges, hit_ranges, gsymb, None


def get_query_and_hit_ranges(output, dat: dict) -> tuple:
"""
Extracts query and hit ranges from the BLAT output.
Parameters
----------
output:
Output from the BLAT query.
dat: dict
Dictionary containing data required for mapping.
Returns
-------
Tuple containing the chromosome, strand, coverage, identity, query ranges, hit ranges, and gene symbol.
"""

hsp = obtain_hsp(output)

data_tuple = from_hsp_output(hsp, output, None)
return data_tuple
return chrom, strand, coverage, query_ranges, hit_ranges


def mave_to_blat(dat: dict) -> dict:
Expand All @@ -173,46 +158,33 @@ def mave_to_blat(dat: dict) -> dict:
"""
output = extract_blat_output(dat)
if output is not None:
hsp = obtain_hsp(output)
(
chrom,
strand,
coverage,
identity,
query_ranges,
hit_ranges,
gsymb,
accession,
) = get_query_and_hit_ranges(output, dat)
) = get_query_and_hit_ranges(hsp, output)
qh_dat = {"query_ranges": query_ranges, "hit_ranges": hit_ranges}
qh_dat = pd.DataFrame(data=qh_dat)
mave_blat_dict = {
"urn": dat["urn"],
"chrom": chrom,
"strand": strand,
"target_type": dat["target_type"],
"uniprot": dat["uniprot_id"],
"coverage": coverage,
"identity": identity,
"hits": qh_dat,
"gsymb": gsymb,
"accession": accession,
}

else:
qh_dat = {"query_ranges": ["NA"], "hit_ranges": ["NA"]}
qh_dat = pd.DataFrame(data=qh_dat)
mave_blat_dict = {
"urn": dat["urn"],
"chrom": "NA",
"strand": "NA",
"target": "NA",
"target_type": "NA",
"uniprot": "NA",
"coverage": "NA",
"identity": "NA",
"hits": qh_dat,
"gsymb": "NA",
"accession": "NA",
}

return mave_blat_dict
53 changes: 17 additions & 36 deletions mavedb_mapping/find_start_pos.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,34 @@
# Find start location in provided target sequence when start position is not first position of sequence
import requests
import Bio
import re
from Bio.Seq import Seq
import pandas as pd
import io
from Bio.SeqUtils import seq1
from Bio.Seq import Seq

offset_within_ts = {}


# maybe put in get haplotype
def validation_helper(protstring):
protstring = protstring[1:][:-1]
vs = protstring.split(";")
return vs


def if_start_not_first(dat):
def is_true(i, locs, ts, aa_dict):
for j in range(len(locs)):
if ts[i + locs[j] - locs[0]] != aa_dict[locs[j]]:
return False
return True


def if_start_not_first(dat: dict, vardat):
if dat["target_type"] == "Protein coding" and dat["target_sequence_type"] == "dna":
urn = dat["urn"]
if (
urn == "urn:mavedb:00000053-a-1" or urn == "urn:mavedb:00000053-a-2"
): # target sequence missing codon
return None
oseq = Seq(dat["target_sequence"])
ts = str(oseq.translate(table=1))

string = (
"https://api.mavedb.org/api/v1/score-sets/urn%3Amavedb%3A"
+ dat["urn"][11::]
+ "/scores"
)
# TODO: remove api call
origdat = requests.get(string).content
score_dat = pd.read_csv(io.StringIO(origdat.decode("utf-8")))
protlist = score_dat["hgvs_pro"].to_list()
if type(score_dat.at[0, "hgvs_pro"]) != str or score_dat.at[
0, "hgvs_pro"
].startswith("NP"):
protlist = vardat["hgvs_pro"]

if type(protlist[0]) != str or protlist[0].startswith("NP"):
return None

protlist = [x.lstrip("p.") for x in protlist]

aa_dict = {}
Expand Down Expand Up @@ -97,24 +86,16 @@ def if_start_not_first(dat):
aa_dict = sorted(aa_dict.items())
aa_dict = dict(aa_dict)
locs = list(aa_dict.keys())[0:5]
p0, p1, p2, p3, p4 = locs[0], locs[1], locs[2], locs[3], locs[4]
offset = locs[0]

seq = ""
for key in aa_dict:
seq = seq + aa_dict[key]

for i in range(len(ts)):
if (
ts[i] == aa_dict[p0]
and ts[i + p1 - p0] == aa_dict[p1]
and ts[i + p2 - p0] == aa_dict[p2]
and ts[i + p3 - p0] == aa_dict[p3]
and ts[i + p4 - p0] == aa_dict[p4]
):
if is_true(i, locs, ts, aa_dict):
if i + 1 == min(aa_dict.keys()) or i + 2 == min(aa_dict.keys()):
offset_within_ts[urn] = 0
offset = 0
else:
offset_within_ts[urn] = i
offset = i
break
return offset_within_ts
return offset
Loading

0 comments on commit 2deaa80

Please sign in to comment.