Skip to content

Commit

Permalink
BLAT Alignment without using gene symb/uniprot and updated transcript…
Browse files Browse the repository at this point in the history
… selection
  • Loading branch information
samriddhi99 committed Aug 8, 2023
1 parent a191d26 commit d6390fe
Show file tree
Hide file tree
Showing 31 changed files with 50,146 additions and 0 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
name: Python package

on: [push]

jobs:
build:

runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest
env:
PATH_TO_SEQREPO: ./tests/data/seqrepo/latest
HG38_FILE: ./tests/data/hg38.2bit
GENE_NORM_DB_URL: postgres://postgres@localhost:5432/gene_normalizer

run: |
export UTA_PASSWORD=uta
export LRG_REFSEQGENE_PATH=./tests/data/LRG_RefSeqGene
export MANE_SUMMARY_PATH=./tests/data/MANE.GRCh38.v1.1.summary.txt.gz
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
pytest
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
repos:
- repo: https://github.com/psf/black
rev: stable
hooks:
- id: black
14 changes: 14 additions & 0 deletions mavedb_mapping/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import os
from biocommons.seqrepo import SeqRepo

from cool_seq_tool.data_sources.uta_database import UTADatabase

from ga4gh.vrs.dataproxy import SeqRepoDataProxy

path_to_seqrepo = os.getenv("PATH_TO_SEQREPO", "/usr/local/share/seqrepo/latest")
path_to_hg38_file = os.getenv("HG38_FILE", "hg38.2bit") # default- in current directory
data_file_path = "tests/data/"

# utadb = UTADatabase(db_pwd="uta")
sr = SeqRepo(path_to_seqrepo, writeable=True)
dp = SeqRepoDataProxy(sr=sr)
218 changes: 218 additions & 0 deletions mavedb_mapping/blat_alignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
from Bio import SearchIO
import pandas as pd
import subprocess

from mavedb_mapping import path_to_hg38_file


def extract_blat_output(dat: dict):
"""
Runs a BLAT Query and returns the output
Parameters
----------
dat: dict
Dictionary containing data required for mapping.
Returns:
--------
BLAT Output
"""
blat_file = open("blat_query.fa", "w")
blat_file.write(">" + dat["urn"] + "\n")
blat_file.write(dat["target_sequence"] + "\n")
blat_file.close()
if dat["target_sequence_type"] == "protein":
command = f"blat {path_to_hg38_file} -q=prot -t=dnax -minScore=20 blat_query.fa blat_out.psl"
process = subprocess.run(command, shell=True)
else:
command = f"blat {path_to_hg38_file} -minScore=20 blat_query.fa blat_out.psl"
process = subprocess.run(command, shell=True)
try:
output = SearchIO.read("blat_out.psl", "blat-psl")
except:
try:
command = f"blat {path_to_hg38_file} -q=dnax -t=dnax -minScore=20 blat_query.fa blat_out.psl"
process = subprocess.run(command, shell=True)
output = SearchIO.read("blat_out.psl", "blat-psl")
except:
return None
return output


def obtain_hit_starts(output, hit):
# a hit is a portion of similarity between query seq and matched seq
"""
Helper function to obtain HSP
Returns the start of hit
"""
hit_starts = list()
for n in range(len(output[hit])):
hit_starts.append(output[hit][n].hit_start)
return hit_starts


def obtain_hsp(output):
"""
Obtains high-scoring pairs (HSP) for query sequence
Parameters
----------
output: BLAT output
Returns
-------
HSP
"""
hit_dict = {}
for c in range(len(output)):
max_hit = output[c][0].score
for e in range(len(output[c])):
if output[c][e].score > max_hit:
max_hit = output[c][e].score
hit_dict[c] = max_hit

# Using top scoring hit
hit = max(hit_dict, key=hit_dict.get)
hit_starts = obtain_hit_starts(output, hit)
hsp = output[hit][hit_starts.index(max(hit_starts))]
return hsp


def from_hsp_output(hsp, output, gsymb):
"""
Parameters
----------
hsp:
High scoring pairs obtained by using top scoring hit
output:
BLAT output
gsymb: str
Gene symbol
Returns
-------
Tuple with data used for mapping
"""
query_ranges = hit_ranges = []
strand = hsp[0].query_strand
coverage = 100 * (hsp.query_end - hsp.query_start) / output.seq_len
identity = hsp.ident_pct

for j in range(len(hsp)):
test_file = open("blat_output_test.txt", "w")
test_file.write(str(hsp[j]))
test_file.close()

query_string = ""
hit_string = ""

test_file = open("blat_output_test.txt", "r")
for k, line in enumerate(test_file):
if k == 1:
chrom = line.strip("\n")
if k == 2:
query_string = line.strip("\n")
if k == 3:
hit_string = line.strip("\n")
test_file.close()

chrom = chrom.split(" ")[9].strip("chr")
query_string = query_string.split(" ")
hit_string = hit_string.split(" ")

# Range in query sequence that aligned with hit sequence
query_ranges.append(query_string[2])

# Range in hit sequence that aligned with hquery sequence
hit_ranges.append(hit_string[4])

return chrom, strand, coverage, identity, query_ranges, hit_ranges, gsymb, None


def get_query_and_hit_ranges(output, dat: dict) -> tuple:
"""
Extracts query and hit ranges from the BLAT output.
Parameters
----------
output:
Output from the BLAT query.
dat: dict
Dictionary containing data required for mapping.
Returns
-------
Tuple containing the chromosome, strand, coverage, identity, query ranges, hit ranges, and gene symbol.
"""

hsp = obtain_hsp(output)

data_tuple = from_hsp_output(hsp, output, None)
return data_tuple


def mave_to_blat(dat: dict) -> dict:
"""
Performs BLAT Alignment on MaveDB scoreset data.
Parameters
----------
dat: dict
Dictionary containing data from MaveDB scoresets.
Returns
-------
mave_blat_dict: dict
Dicitionary containing data after doing BLAT Alignment
"""
output = extract_blat_output(dat)
if output is not None:
(
chrom,
strand,
coverage,
identity,
query_ranges,
hit_ranges,
gsymb,
accession,
) = get_query_and_hit_ranges(output, dat)
qh_dat = {"query_ranges": query_ranges, "hit_ranges": hit_ranges}
qh_dat = pd.DataFrame(data=qh_dat)
mave_blat_dict = {
"urn": dat["urn"],
"chrom": chrom,
"strand": strand,
"target_type": dat["target_type"],
"uniprot": dat["uniprot_id"],
"coverage": coverage,
"identity": identity,
"hits": qh_dat,
"gsymb": gsymb,
"accession": accession,
}

else:
qh_dat = {"query_ranges": ["NA"], "hit_ranges": ["NA"]}
qh_dat = pd.DataFrame(data=qh_dat)
mave_blat_dict = {
"urn": dat["urn"],
"chrom": "NA",
"strand": "NA",
"target": "NA",
"target_type": "NA",
"uniprot": "NA",
"coverage": "NA",
"identity": "NA",
"hits": qh_dat,
"gsymb": "NA",
"accession": "NA",
}

return mave_blat_dict
120 changes: 120 additions & 0 deletions mavedb_mapping/find_start_pos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Find start location in provided target sequence when start position is not first position of sequence
import requests
import Bio
import re
from Bio.Seq import Seq
import pandas as pd
import io
from Bio.SeqUtils import seq1
from Bio.Seq import Seq

offset_within_ts = {}


def validation_helper(protstring):
protstring = protstring[1:][:-1]
vs = protstring.split(";")
return vs


def if_start_not_first(dat):
if dat["target_type"] == "Protein coding" and dat["target_sequence_type"] == "dna":
urn = dat["urn"]
if (
urn == "urn:mavedb:00000053-a-1" or urn == "urn:mavedb:00000053-a-2"
): # target sequence missing codon
return None
oseq = Seq(dat["target_sequence"])
ts = str(oseq.translate(table=1))

string = (
"https://api.mavedb.org/api/v1/score-sets/urn%3Amavedb%3A"
+ dat["urn"][11::]
+ "/scores"
)
# TODO: remove api call
origdat = requests.get(string).content
score_dat = pd.read_csv(io.StringIO(origdat.decode("utf-8")))
protlist = score_dat["hgvs_pro"].to_list()
if type(score_dat.at[0, "hgvs_pro"]) != str or score_dat.at[
0, "hgvs_pro"
].startswith("NP"):
return None
protlist = [x.lstrip("p.") for x in protlist]

aa_dict = {}

for k in range(len(protlist)):
if protlist[k] == "_sy" or protlist[k] == "_wt":
continue
else:
if ";" in protlist[k]:
vs = validation_helper(protlist[k])
for l in range(len(vs)):
aa = vs[l][:3]
if (
aa == "="
or vs[l][-3:]
not in Bio.SeqUtils.IUPACData.protein_letters_3to1.keys()
):
continue
if "=" in vs[l]:
loc = vs[l][3:][:-1]
else:
loc = vs[l][3:][:-3]
if loc not in aa_dict:
loc = re.sub("[^0-9]", "", loc)
aa_dict[loc] = seq1(aa)

else:
if "_" in protlist[k]:
continue
aa = protlist[k][:3]
if (
aa == "="
or protlist[k][-3:]
not in Bio.SeqUtils.IUPACData.protein_letters_3to1.keys()
):
continue
if "=" in protlist[k]:
loc = protlist[k][3:][:-1]
else:
loc = protlist[k][3:][:-3]
if loc not in aa_dict:
loc = re.sub("[^0-9]", "", loc)
aa_dict[loc] = seq1(aa)

aa_dict.pop("", None)

err_locs = []
for m in range(len(ts)):
if str(m) in list(aa_dict.keys()):
if aa_dict[str(m)] != ts[int(m) - 1]: # Str vs dict offset
err_locs.append(m)

if len(err_locs) > 1:
aa_dict = {int(k): v for k, v in aa_dict.items()}
aa_dict = sorted(aa_dict.items())
aa_dict = dict(aa_dict)
locs = list(aa_dict.keys())[0:5]
p0, p1, p2, p3, p4 = locs[0], locs[1], locs[2], locs[3], locs[4]
offset = locs[0]

seq = ""
for key in aa_dict:
seq = seq + aa_dict[key]

for i in range(len(ts)):
if (
ts[i] == aa_dict[p0]
and ts[i + p1 - p0] == aa_dict[p1]
and ts[i + p2 - p0] == aa_dict[p2]
and ts[i + p3 - p0] == aa_dict[p3]
and ts[i + p4 - p0] == aa_dict[p4]
):
if i + 1 == min(aa_dict.keys()) or i + 2 == min(aa_dict.keys()):
offset_within_ts[urn] = 0
else:
offset_within_ts[urn] = i
break
return offset_within_ts
Loading

0 comments on commit d6390fe

Please sign in to comment.