-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
BLAT Alignment without using gene symb/uniprot and updated transcript…
… selection
- Loading branch information
1 parent
a191d26
commit d6390fe
Showing
31 changed files
with
50,146 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
name: Python package | ||
|
||
on: [push] | ||
|
||
jobs: | ||
build: | ||
|
||
runs-on: ubuntu-latest | ||
strategy: | ||
matrix: | ||
python-version: ["3.9", "3.10", "3.11"] | ||
|
||
steps: | ||
- uses: actions/checkout@v3 | ||
- name: Set up Python ${{ matrix.python-version }} | ||
uses: actions/setup-python@v4 | ||
with: | ||
python-version: ${{ matrix.python-version }} | ||
- name: Install dependencies | ||
run: | | ||
python -m pip install --upgrade pip | ||
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi | ||
- name: Test with pytest | ||
env: | ||
PATH_TO_SEQREPO: ./tests/data/seqrepo/latest | ||
HG38_FILE: ./tests/data/hg38.2bit | ||
GENE_NORM_DB_URL: postgres://postgres@localhost:5432/gene_normalizer | ||
|
||
run: | | ||
export UTA_PASSWORD=uta | ||
export LRG_REFSEQGENE_PATH=./tests/data/LRG_RefSeqGene | ||
export MANE_SUMMARY_PATH=./tests/data/MANE.GRCh38.v1.1.summary.txt.gz | ||
export PYTHONPATH="${PYTHONPATH}:$(pwd)" | ||
pytest |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
repos: | ||
- repo: https://github.com/psf/black | ||
rev: stable | ||
hooks: | ||
- id: black |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
import os | ||
from biocommons.seqrepo import SeqRepo | ||
|
||
from cool_seq_tool.data_sources.uta_database import UTADatabase | ||
|
||
from ga4gh.vrs.dataproxy import SeqRepoDataProxy | ||
|
||
path_to_seqrepo = os.getenv("PATH_TO_SEQREPO", "/usr/local/share/seqrepo/latest") | ||
path_to_hg38_file = os.getenv("HG38_FILE", "hg38.2bit") # default- in current directory | ||
data_file_path = "tests/data/" | ||
|
||
# utadb = UTADatabase(db_pwd="uta") | ||
sr = SeqRepo(path_to_seqrepo, writeable=True) | ||
dp = SeqRepoDataProxy(sr=sr) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,218 @@ | ||
from Bio import SearchIO | ||
import pandas as pd | ||
import subprocess | ||
|
||
from mavedb_mapping import path_to_hg38_file | ||
|
||
|
||
def extract_blat_output(dat: dict): | ||
""" | ||
Runs a BLAT Query and returns the output | ||
Parameters | ||
---------- | ||
dat: dict | ||
Dictionary containing data required for mapping. | ||
Returns: | ||
-------- | ||
BLAT Output | ||
""" | ||
blat_file = open("blat_query.fa", "w") | ||
blat_file.write(">" + dat["urn"] + "\n") | ||
blat_file.write(dat["target_sequence"] + "\n") | ||
blat_file.close() | ||
if dat["target_sequence_type"] == "protein": | ||
command = f"blat {path_to_hg38_file} -q=prot -t=dnax -minScore=20 blat_query.fa blat_out.psl" | ||
process = subprocess.run(command, shell=True) | ||
else: | ||
command = f"blat {path_to_hg38_file} -minScore=20 blat_query.fa blat_out.psl" | ||
process = subprocess.run(command, shell=True) | ||
try: | ||
output = SearchIO.read("blat_out.psl", "blat-psl") | ||
except: | ||
try: | ||
command = f"blat {path_to_hg38_file} -q=dnax -t=dnax -minScore=20 blat_query.fa blat_out.psl" | ||
process = subprocess.run(command, shell=True) | ||
output = SearchIO.read("blat_out.psl", "blat-psl") | ||
except: | ||
return None | ||
return output | ||
|
||
|
||
def obtain_hit_starts(output, hit): | ||
# a hit is a portion of similarity between query seq and matched seq | ||
""" | ||
Helper function to obtain HSP | ||
Returns the start of hit | ||
""" | ||
hit_starts = list() | ||
for n in range(len(output[hit])): | ||
hit_starts.append(output[hit][n].hit_start) | ||
return hit_starts | ||
|
||
|
||
def obtain_hsp(output): | ||
""" | ||
Obtains high-scoring pairs (HSP) for query sequence | ||
Parameters | ||
---------- | ||
output: BLAT output | ||
Returns | ||
------- | ||
HSP | ||
""" | ||
hit_dict = {} | ||
for c in range(len(output)): | ||
max_hit = output[c][0].score | ||
for e in range(len(output[c])): | ||
if output[c][e].score > max_hit: | ||
max_hit = output[c][e].score | ||
hit_dict[c] = max_hit | ||
|
||
# Using top scoring hit | ||
hit = max(hit_dict, key=hit_dict.get) | ||
hit_starts = obtain_hit_starts(output, hit) | ||
hsp = output[hit][hit_starts.index(max(hit_starts))] | ||
return hsp | ||
|
||
|
||
def from_hsp_output(hsp, output, gsymb): | ||
""" | ||
Parameters | ||
---------- | ||
hsp: | ||
High scoring pairs obtained by using top scoring hit | ||
output: | ||
BLAT output | ||
gsymb: str | ||
Gene symbol | ||
Returns | ||
------- | ||
Tuple with data used for mapping | ||
""" | ||
query_ranges = hit_ranges = [] | ||
strand = hsp[0].query_strand | ||
coverage = 100 * (hsp.query_end - hsp.query_start) / output.seq_len | ||
identity = hsp.ident_pct | ||
|
||
for j in range(len(hsp)): | ||
test_file = open("blat_output_test.txt", "w") | ||
test_file.write(str(hsp[j])) | ||
test_file.close() | ||
|
||
query_string = "" | ||
hit_string = "" | ||
|
||
test_file = open("blat_output_test.txt", "r") | ||
for k, line in enumerate(test_file): | ||
if k == 1: | ||
chrom = line.strip("\n") | ||
if k == 2: | ||
query_string = line.strip("\n") | ||
if k == 3: | ||
hit_string = line.strip("\n") | ||
test_file.close() | ||
|
||
chrom = chrom.split(" ")[9].strip("chr") | ||
query_string = query_string.split(" ") | ||
hit_string = hit_string.split(" ") | ||
|
||
# Range in query sequence that aligned with hit sequence | ||
query_ranges.append(query_string[2]) | ||
|
||
# Range in hit sequence that aligned with hquery sequence | ||
hit_ranges.append(hit_string[4]) | ||
|
||
return chrom, strand, coverage, identity, query_ranges, hit_ranges, gsymb, None | ||
|
||
|
||
def get_query_and_hit_ranges(output, dat: dict) -> tuple: | ||
""" | ||
Extracts query and hit ranges from the BLAT output. | ||
Parameters | ||
---------- | ||
output: | ||
Output from the BLAT query. | ||
dat: dict | ||
Dictionary containing data required for mapping. | ||
Returns | ||
------- | ||
Tuple containing the chromosome, strand, coverage, identity, query ranges, hit ranges, and gene symbol. | ||
""" | ||
|
||
hsp = obtain_hsp(output) | ||
|
||
data_tuple = from_hsp_output(hsp, output, None) | ||
return data_tuple | ||
|
||
|
||
def mave_to_blat(dat: dict) -> dict: | ||
""" | ||
Performs BLAT Alignment on MaveDB scoreset data. | ||
Parameters | ||
---------- | ||
dat: dict | ||
Dictionary containing data from MaveDB scoresets. | ||
Returns | ||
------- | ||
mave_blat_dict: dict | ||
Dicitionary containing data after doing BLAT Alignment | ||
""" | ||
output = extract_blat_output(dat) | ||
if output is not None: | ||
( | ||
chrom, | ||
strand, | ||
coverage, | ||
identity, | ||
query_ranges, | ||
hit_ranges, | ||
gsymb, | ||
accession, | ||
) = get_query_and_hit_ranges(output, dat) | ||
qh_dat = {"query_ranges": query_ranges, "hit_ranges": hit_ranges} | ||
qh_dat = pd.DataFrame(data=qh_dat) | ||
mave_blat_dict = { | ||
"urn": dat["urn"], | ||
"chrom": chrom, | ||
"strand": strand, | ||
"target_type": dat["target_type"], | ||
"uniprot": dat["uniprot_id"], | ||
"coverage": coverage, | ||
"identity": identity, | ||
"hits": qh_dat, | ||
"gsymb": gsymb, | ||
"accession": accession, | ||
} | ||
|
||
else: | ||
qh_dat = {"query_ranges": ["NA"], "hit_ranges": ["NA"]} | ||
qh_dat = pd.DataFrame(data=qh_dat) | ||
mave_blat_dict = { | ||
"urn": dat["urn"], | ||
"chrom": "NA", | ||
"strand": "NA", | ||
"target": "NA", | ||
"target_type": "NA", | ||
"uniprot": "NA", | ||
"coverage": "NA", | ||
"identity": "NA", | ||
"hits": qh_dat, | ||
"gsymb": "NA", | ||
"accession": "NA", | ||
} | ||
|
||
return mave_blat_dict |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
# Find start location in provided target sequence when start position is not first position of sequence | ||
import requests | ||
import Bio | ||
import re | ||
from Bio.Seq import Seq | ||
import pandas as pd | ||
import io | ||
from Bio.SeqUtils import seq1 | ||
from Bio.Seq import Seq | ||
|
||
offset_within_ts = {} | ||
|
||
|
||
def validation_helper(protstring): | ||
protstring = protstring[1:][:-1] | ||
vs = protstring.split(";") | ||
return vs | ||
|
||
|
||
def if_start_not_first(dat): | ||
if dat["target_type"] == "Protein coding" and dat["target_sequence_type"] == "dna": | ||
urn = dat["urn"] | ||
if ( | ||
urn == "urn:mavedb:00000053-a-1" or urn == "urn:mavedb:00000053-a-2" | ||
): # target sequence missing codon | ||
return None | ||
oseq = Seq(dat["target_sequence"]) | ||
ts = str(oseq.translate(table=1)) | ||
|
||
string = ( | ||
"https://api.mavedb.org/api/v1/score-sets/urn%3Amavedb%3A" | ||
+ dat["urn"][11::] | ||
+ "/scores" | ||
) | ||
# TODO: remove api call | ||
origdat = requests.get(string).content | ||
score_dat = pd.read_csv(io.StringIO(origdat.decode("utf-8"))) | ||
protlist = score_dat["hgvs_pro"].to_list() | ||
if type(score_dat.at[0, "hgvs_pro"]) != str or score_dat.at[ | ||
0, "hgvs_pro" | ||
].startswith("NP"): | ||
return None | ||
protlist = [x.lstrip("p.") for x in protlist] | ||
|
||
aa_dict = {} | ||
|
||
for k in range(len(protlist)): | ||
if protlist[k] == "_sy" or protlist[k] == "_wt": | ||
continue | ||
else: | ||
if ";" in protlist[k]: | ||
vs = validation_helper(protlist[k]) | ||
for l in range(len(vs)): | ||
aa = vs[l][:3] | ||
if ( | ||
aa == "=" | ||
or vs[l][-3:] | ||
not in Bio.SeqUtils.IUPACData.protein_letters_3to1.keys() | ||
): | ||
continue | ||
if "=" in vs[l]: | ||
loc = vs[l][3:][:-1] | ||
else: | ||
loc = vs[l][3:][:-3] | ||
if loc not in aa_dict: | ||
loc = re.sub("[^0-9]", "", loc) | ||
aa_dict[loc] = seq1(aa) | ||
|
||
else: | ||
if "_" in protlist[k]: | ||
continue | ||
aa = protlist[k][:3] | ||
if ( | ||
aa == "=" | ||
or protlist[k][-3:] | ||
not in Bio.SeqUtils.IUPACData.protein_letters_3to1.keys() | ||
): | ||
continue | ||
if "=" in protlist[k]: | ||
loc = protlist[k][3:][:-1] | ||
else: | ||
loc = protlist[k][3:][:-3] | ||
if loc not in aa_dict: | ||
loc = re.sub("[^0-9]", "", loc) | ||
aa_dict[loc] = seq1(aa) | ||
|
||
aa_dict.pop("", None) | ||
|
||
err_locs = [] | ||
for m in range(len(ts)): | ||
if str(m) in list(aa_dict.keys()): | ||
if aa_dict[str(m)] != ts[int(m) - 1]: # Str vs dict offset | ||
err_locs.append(m) | ||
|
||
if len(err_locs) > 1: | ||
aa_dict = {int(k): v for k, v in aa_dict.items()} | ||
aa_dict = sorted(aa_dict.items()) | ||
aa_dict = dict(aa_dict) | ||
locs = list(aa_dict.keys())[0:5] | ||
p0, p1, p2, p3, p4 = locs[0], locs[1], locs[2], locs[3], locs[4] | ||
offset = locs[0] | ||
|
||
seq = "" | ||
for key in aa_dict: | ||
seq = seq + aa_dict[key] | ||
|
||
for i in range(len(ts)): | ||
if ( | ||
ts[i] == aa_dict[p0] | ||
and ts[i + p1 - p0] == aa_dict[p1] | ||
and ts[i + p2 - p0] == aa_dict[p2] | ||
and ts[i + p3 - p0] == aa_dict[p3] | ||
and ts[i + p4 - p0] == aa_dict[p4] | ||
): | ||
if i + 1 == min(aa_dict.keys()) or i + 2 == min(aa_dict.keys()): | ||
offset_within_ts[urn] = 0 | ||
else: | ||
offset_within_ts[urn] = i | ||
break | ||
return offset_within_ts |
Oops, something went wrong.