BLAT Alignment without using gene symb/uniprot and updated transcript…

… selection
ave-dcd · Aug 8, 2023 · d6390fe · d6390fe
1 parent a191d26
commit d6390fe
Show file tree

Hide file tree

Showing 31 changed files with 50,146 additions and 0 deletions.
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -0,0 +1,35 @@
+name: Python package
+
+on: [push]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11"]
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+
+      - name: Test with pytest
+        env:
+          PATH_TO_SEQREPO: ./tests/data/seqrepo/latest
+          HG38_FILE: ./tests/data/hg38.2bit
+          GENE_NORM_DB_URL: postgres://postgres@localhost:5432/gene_normalizer
+
+        run: |
+          export UTA_PASSWORD=uta
+          export LRG_REFSEQGENE_PATH=./tests/data/LRG_RefSeqGene
+          export MANE_SUMMARY_PATH=./tests/data/MANE.GRCh38.v1.1.summary.txt.gz 
+          export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+          pytest
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,5 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: stable
+    hooks:
+      - id: black
diff --git a/mavedb_mapping/__init__.py b/mavedb_mapping/__init__.py
@@ -0,0 +1,14 @@
+import os
+from biocommons.seqrepo import SeqRepo
+
+from cool_seq_tool.data_sources.uta_database import UTADatabase
+
+from ga4gh.vrs.dataproxy import SeqRepoDataProxy
+
+path_to_seqrepo = os.getenv("PATH_TO_SEQREPO", "/usr/local/share/seqrepo/latest")
+path_to_hg38_file = os.getenv("HG38_FILE", "hg38.2bit")  # default- in current directory
+data_file_path = "tests/data/"
+
+# utadb = UTADatabase(db_pwd="uta")
+sr = SeqRepo(path_to_seqrepo, writeable=True)
+dp = SeqRepoDataProxy(sr=sr)
diff --git a/mavedb_mapping/blat_alignment.py b/mavedb_mapping/blat_alignment.py
@@ -0,0 +1,218 @@
+from Bio import SearchIO
+import pandas as pd
+import subprocess
+
+from mavedb_mapping import path_to_hg38_file
+
+
+def extract_blat_output(dat: dict):
+    """
+    Runs a BLAT Query and returns the output
+
+    Parameters
+    ----------
+        dat: dict
+            Dictionary containing data required for mapping.
+
+    Returns:
+    --------
+        BLAT Output
+    """
+    blat_file = open("blat_query.fa", "w")
+    blat_file.write(">" + dat["urn"] + "\n")
+    blat_file.write(dat["target_sequence"] + "\n")
+    blat_file.close()
+    if dat["target_sequence_type"] == "protein":
+        command = f"blat {path_to_hg38_file} -q=prot -t=dnax -minScore=20 blat_query.fa blat_out.psl"
+        process = subprocess.run(command, shell=True)
+    else:
+        command = f"blat {path_to_hg38_file} -minScore=20 blat_query.fa blat_out.psl"
+        process = subprocess.run(command, shell=True)
+    try:
+        output = SearchIO.read("blat_out.psl", "blat-psl")
+    except:
+        try:
+            command = f"blat {path_to_hg38_file} -q=dnax -t=dnax -minScore=20 blat_query.fa blat_out.psl"
+            process = subprocess.run(command, shell=True)
+            output = SearchIO.read("blat_out.psl", "blat-psl")
+        except:
+            return None
+    return output
+
+
+def obtain_hit_starts(output, hit):
+    # a hit is a portion of similarity between query seq and matched seq
+    """
+    Helper function to obtain HSP
+    Returns the start of hit
+    """
+    hit_starts = list()
+    for n in range(len(output[hit])):
+        hit_starts.append(output[hit][n].hit_start)
+    return hit_starts
+
+
+def obtain_hsp(output):
+    """
+    Obtains high-scoring pairs (HSP) for query sequence
+
+    Parameters
+    ----------
+        output: BLAT output
+
+    Returns
+    -------
+        HSP
+
+    """
+    hit_dict = {}
+    for c in range(len(output)):
+        max_hit = output[c][0].score
+        for e in range(len(output[c])):
+            if output[c][e].score > max_hit:
+                max_hit = output[c][e].score
+        hit_dict[c] = max_hit
+
+    # Using top scoring hit
+    hit = max(hit_dict, key=hit_dict.get)
+    hit_starts = obtain_hit_starts(output, hit)
+    hsp = output[hit][hit_starts.index(max(hit_starts))]
+    return hsp
+
+
+def from_hsp_output(hsp, output, gsymb):
+    """
+
+    Parameters
+    ----------
+        hsp:
+            High scoring pairs obtained by using top scoring hit
+        output:
+            BLAT output
+        gsymb: str
+            Gene symbol
+
+    Returns
+    -------
+        Tuple with data used for mapping
+
+    """
+    query_ranges = hit_ranges = []
+    strand = hsp[0].query_strand
+    coverage = 100 * (hsp.query_end - hsp.query_start) / output.seq_len
+    identity = hsp.ident_pct
+
+    for j in range(len(hsp)):
+        test_file = open("blat_output_test.txt", "w")
+        test_file.write(str(hsp[j]))
+        test_file.close()
+
+        query_string = ""
+        hit_string = ""
+
+        test_file = open("blat_output_test.txt", "r")
+        for k, line in enumerate(test_file):
+            if k == 1:
+                chrom = line.strip("\n")
+            if k == 2:
+                query_string = line.strip("\n")
+            if k == 3:
+                hit_string = line.strip("\n")
+        test_file.close()
+
+        chrom = chrom.split(" ")[9].strip("chr")
+        query_string = query_string.split(" ")
+        hit_string = hit_string.split(" ")
+
+        # Range in query sequence that aligned with hit sequence
+        query_ranges.append(query_string[2])
+
+        # Range in hit sequence that aligned with hquery sequence
+        hit_ranges.append(hit_string[4])
+
+    return chrom, strand, coverage, identity, query_ranges, hit_ranges, gsymb, None
+
+
+def get_query_and_hit_ranges(output, dat: dict) -> tuple:
+    """
+    Extracts query and hit ranges from the BLAT output.
+
+    Parameters
+    ----------
+        output:
+            Output from the BLAT query.
+        dat: dict
+            Dictionary containing data required for mapping.
+
+    Returns
+    -------
+        Tuple containing the chromosome, strand, coverage, identity, query ranges, hit ranges, and gene symbol.
+    """
+
+    hsp = obtain_hsp(output)
+
+    data_tuple = from_hsp_output(hsp, output, None)
+    return data_tuple
+
+
+def mave_to_blat(dat: dict) -> dict:
+    """
+
+    Performs BLAT Alignment on MaveDB scoreset data.
+
+    Parameters
+    ----------
+        dat: dict
+            Dictionary containing data from MaveDB scoresets.
+
+    Returns
+    -------
+        mave_blat_dict: dict
+            Dicitionary containing data after doing BLAT Alignment
+
+    """
+    output = extract_blat_output(dat)
+    if output is not None:
+        (
+            chrom,
+            strand,
+            coverage,
+            identity,
+            query_ranges,
+            hit_ranges,
+            gsymb,
+            accession,
+        ) = get_query_and_hit_ranges(output, dat)
+        qh_dat = {"query_ranges": query_ranges, "hit_ranges": hit_ranges}
+        qh_dat = pd.DataFrame(data=qh_dat)
+        mave_blat_dict = {
+            "urn": dat["urn"],
+            "chrom": chrom,
+            "strand": strand,
+            "target_type": dat["target_type"],
+            "uniprot": dat["uniprot_id"],
+            "coverage": coverage,
+            "identity": identity,
+            "hits": qh_dat,
+            "gsymb": gsymb,
+            "accession": accession,
+        }
+
+    else:
+        qh_dat = {"query_ranges": ["NA"], "hit_ranges": ["NA"]}
+        qh_dat = pd.DataFrame(data=qh_dat)
+        mave_blat_dict = {
+            "urn": dat["urn"],
+            "chrom": "NA",
+            "strand": "NA",
+            "target": "NA",
+            "target_type": "NA",
+            "uniprot": "NA",
+            "coverage": "NA",
+            "identity": "NA",
+            "hits": qh_dat,
+            "gsymb": "NA",
+            "accession": "NA",
+        }
+
+    return mave_blat_dict
diff --git a/mavedb_mapping/find_start_pos.py b/mavedb_mapping/find_start_pos.py
@@ -0,0 +1,120 @@
+# Find start location in provided target sequence when start position is not first position of sequence
+import requests
+import Bio
+import re
+from Bio.Seq import Seq
+import pandas as pd
+import io
+from Bio.SeqUtils import seq1
+from Bio.Seq import Seq
+
+offset_within_ts = {}
+
+
+def validation_helper(protstring):
+    protstring = protstring[1:][:-1]
+    vs = protstring.split(";")
+    return vs
+
+
+def if_start_not_first(dat):
+    if dat["target_type"] == "Protein coding" and dat["target_sequence_type"] == "dna":
+        urn = dat["urn"]
+        if (
+            urn == "urn:mavedb:00000053-a-1" or urn == "urn:mavedb:00000053-a-2"
+        ):  # target sequence missing codon
+            return None
+        oseq = Seq(dat["target_sequence"])
+        ts = str(oseq.translate(table=1))
+
+        string = (
+            "https://api.mavedb.org/api/v1/score-sets/urn%3Amavedb%3A"
+            + dat["urn"][11::]
+            + "/scores"
+        )
+        # TODO: remove api call
+        origdat = requests.get(string).content
+        score_dat = pd.read_csv(io.StringIO(origdat.decode("utf-8")))
+        protlist = score_dat["hgvs_pro"].to_list()
+        if type(score_dat.at[0, "hgvs_pro"]) != str or score_dat.at[
+            0, "hgvs_pro"
+        ].startswith("NP"):
+            return None
+        protlist = [x.lstrip("p.") for x in protlist]
+
+        aa_dict = {}
+
+        for k in range(len(protlist)):
+            if protlist[k] == "_sy" or protlist[k] == "_wt":
+                continue
+            else:
+                if ";" in protlist[k]:
+                    vs = validation_helper(protlist[k])
+                    for l in range(len(vs)):
+                        aa = vs[l][:3]
+                        if (
+                            aa == "="
+                            or vs[l][-3:]
+                            not in Bio.SeqUtils.IUPACData.protein_letters_3to1.keys()
+                        ):
+                            continue
+                        if "=" in vs[l]:
+                            loc = vs[l][3:][:-1]
+                        else:
+                            loc = vs[l][3:][:-3]
+                        if loc not in aa_dict:
+                            loc = re.sub("[^0-9]", "", loc)
+                            aa_dict[loc] = seq1(aa)
+
+                else:
+                    if "_" in protlist[k]:
+                        continue
+                    aa = protlist[k][:3]
+                    if (
+                        aa == "="
+                        or protlist[k][-3:]
+                        not in Bio.SeqUtils.IUPACData.protein_letters_3to1.keys()
+                    ):
+                        continue
+                    if "=" in protlist[k]:
+                        loc = protlist[k][3:][:-1]
+                    else:
+                        loc = protlist[k][3:][:-3]
+                    if loc not in aa_dict:
+                        loc = re.sub("[^0-9]", "", loc)
+                        aa_dict[loc] = seq1(aa)
+
+        aa_dict.pop("", None)
+
+        err_locs = []
+        for m in range(len(ts)):
+            if str(m) in list(aa_dict.keys()):
+                if aa_dict[str(m)] != ts[int(m) - 1]:  # Str vs dict offset
+                    err_locs.append(m)
+
+        if len(err_locs) > 1:
+            aa_dict = {int(k): v for k, v in aa_dict.items()}
+            aa_dict = sorted(aa_dict.items())
+            aa_dict = dict(aa_dict)
+            locs = list(aa_dict.keys())[0:5]
+            p0, p1, p2, p3, p4 = locs[0], locs[1], locs[2], locs[3], locs[4]
+            offset = locs[0]
+
+            seq = ""
+            for key in aa_dict:
+                seq = seq + aa_dict[key]
+
+            for i in range(len(ts)):
+                if (
+                    ts[i] == aa_dict[p0]
+                    and ts[i + p1 - p0] == aa_dict[p1]
+                    and ts[i + p2 - p0] == aa_dict[p2]
+                    and ts[i + p3 - p0] == aa_dict[p3]
+                    and ts[i + p4 - p0] == aa_dict[p4]
+                ):
+                    if i + 1 == min(aa_dict.keys()) or i + 2 == min(aa_dict.keys()):
+                        offset_within_ts[urn] = 0
+                    else:
+                        offset_within_ts[urn] = i
+                    break
+        return offset_within_ts