From c27f1eb12ea2dc031b516285ae2da2b4c9faae9c Mon Sep 17 00:00:00 2001
From: Samriddhi Singh <samriddhisingh6079@gmail.com>
Date: Tue, 18 Jul 2023 12:19:04 +0530
Subject: [PATCH 01/12] Obtained HGNC accession IDs

---
 mavedb_mapping/blat_alignment.py | 66 +++++++++++++-------------------
 1 file changed, 27 insertions(+), 39 deletions(-)

diff --git a/mavedb_mapping/blat_alignment.py b/mavedb_mapping/blat_alignment.py
index 60b6dae..09041c9 100644
--- a/mavedb_mapping/blat_alignment.py
+++ b/mavedb_mapping/blat_alignment.py
@@ -21,7 +21,22 @@ def get_gene_symb(dat):
     return gsymb
 
 
-def get_gene_data(return_chr: bool, dat: dict, gsymb: str):
+def get_gene_data(gsymb: str):
+    if gsymb == "NA":
+        return "NA"
+    temp = qh.search(gsymb).source_matches
+    source_dict = {}
+    for i in range(len(temp)):
+        source_dict[temp[i].source] = i
+    return temp, source_dict
+
+
+def get_hgnc_accession(temp, source_dict):
+    accession = temp[source_dict["HGNC"]].records[0].concept_id
+    return accession
+
+
+def return_gene_data(return_chr: bool, temp, source_dict):
     """
     Parameters
     ----------
@@ -45,12 +60,8 @@ def get_gene_data(return_chr: bool, dat: dict, gsymb: str):
             If gene symbol can be extracted and return_chr is False
 
     """
-    if gsymb == "NA":
+    if temp == "NA":
         return "NA"
-    temp = qh.search(gsymb).source_matches
-    source_dict = {}
-    for i in range(len(temp)):
-        source_dict[temp[i].source] = i
 
     if "HGNC" in source_dict and return_chr == True:
         chrom = temp[source_dict["HGNC"]].records[0].locations[0].chr
@@ -180,9 +191,13 @@ def get_query_and_hit_ranges(output, dat):
     chrom = strand = ""
     coverage = identity = None
     query_ranges = hit_ranges = list()
+
     gsymb = get_gene_symb(dat)
+    temp, source_dict = get_gene_data(gsymb)
+    accession = get_hgnc_accession(temp, source_dict)
+
     for c in range(len(output)):
-        correct_chr = get_gene_data(dat=dat, return_chr=True, gsymb=gsymb)
+        correct_chr = return_gene_data(True, temp, source_dict)
         if correct_chr == output[c].id.strip("chr"):
             use_chr = True
             break
@@ -200,7 +215,7 @@ def get_query_and_hit_ranges(output, dat):
     else:
         hit = c
 
-    loc_dict = get_gene_data(False, dat, gsymb)
+    loc_dict = return_gene_data(False, temp, source_dict)
 
     hit_starts = list()
     for n in range(len(output[hit])):
@@ -247,37 +262,7 @@ def get_query_and_hit_ranges(output, dat):
         query_ranges.append(query_string[2])
         hit_ranges.append(hit_string[4])
 
-    return chrom, strand, coverage, identity, query_ranges, hit_ranges, gsymb
-
-
-def check_non_human(mave_blat_dict, min_percentage=80):
-    # for dna min % = 95
-    # for prot min % = 80
-    # as per BLAT website: "BLAT on DNA is designed to quickly find sequences of 95% and grent or shorter sequence alignments. BLAT on proteins finds sequences of 80% and greater similarity of length 20 amino acids or more.
-    """
-    Checks if a sample is human or non-human based on the Mave-BLAT dictionary.
-
-    Parameters
-    ----------
-
-        mave_blat_dict: dict
-            Dicitionary containing data after doing BLAT Alignment
-
-        min_percent: int
-            Minimum percentage coverage to consider a sample as human.
-
-    Returns
-    -------
-        str: "human" if the sample is human, "Non human" otherwise.
-    """
-    cov = mave_blat_dict["coverage"]
-    if cov == "NA":
-        return "Non human"
-    else:
-        if cov < min_percentage:
-            return "Non human"
-        else:
-            return "human"
+    return chrom, strand, coverage, identity, query_ranges, hit_ranges, gsymb, accession
 
 
 def mave_to_blat(dat):
@@ -306,6 +291,7 @@ def mave_to_blat(dat):
             query_ranges,
             hit_ranges,
             gsymb,
+            accession,
         ) = get_query_and_hit_ranges(output, dat)
         qh_dat = {"query_ranges": query_ranges, "hit_ranges": hit_ranges}
         qh_dat = pd.DataFrame(data=qh_dat)
@@ -320,6 +306,7 @@ def mave_to_blat(dat):
             "identity": identity,
             "hits": qh_dat,
             "gsymb": gsymb,
+            "accession": accession,
         }
 
     else:
@@ -336,6 +323,7 @@ def mave_to_blat(dat):
             "identity": "NA",
             "hits": qh_dat,
             "gsymb": "NA",
+            "accession": "NA",
         }
 
     return mave_blat_dict

From 7f2c1a0fc2c9d6bbbd4517734da5ccbe3243ac03 Mon Sep 17 00:00:00 2001
From: Samriddhi Singh <samriddhisingh6079@gmail.com>
Date: Tue, 18 Jul 2023 12:26:26 +0530
Subject: [PATCH 02/12] Updated transcript_selection.py

---
 mavedb_mapping/blat_alignment.py       | 2 +-
 mavedb_mapping/transcript_selection.py | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/mavedb_mapping/blat_alignment.py b/mavedb_mapping/blat_alignment.py
index 09041c9..da8bca4 100644
--- a/mavedb_mapping/blat_alignment.py
+++ b/mavedb_mapping/blat_alignment.py
@@ -5,7 +5,7 @@
 import subprocess
 from gene.database import create_db
 
-qh = QueryHandler(create_db("postgres://postgres@localhost:5432/gene_normalizer"))
+qh = QueryHandler(create_db())
 
 
 def get_gene_symb(dat):
diff --git a/mavedb_mapping/transcript_selection.py b/mavedb_mapping/transcript_selection.py
index cdfb8c2..996457b 100644
--- a/mavedb_mapping/transcript_selection.py
+++ b/mavedb_mapping/transcript_selection.py
@@ -283,6 +283,9 @@ def main(mave_blat_dict: dict, dat: dict) -> dict:
     if dat["target_type"] == "Protein coding" or dat["target_type"] == "protein_coding":
         if mave_blat_dict["chrom"] == "NA":
             raise Exception("No BLAT output")
+        if check_non_human(mave_blat_dict) == "Non human":
+            raise ValueError("Non Human Scoreset")
+
         locs = get_locs_list(mave_blat_dict["hits"])
         chrom = get_chr(dp, mave_blat_dict["chrom"])
         gsymb = get_gsymb(dat)

From ac1fad10226f9fa291a09554237c43bce44cb623 Mon Sep 17 00:00:00 2001
From: Samriddhi Singh <samriddhisingh6079@gmail.com>
Date: Thu, 27 Jul 2023 03:28:42 +0530
Subject: [PATCH 03/12] Updated BLAT Alignement code for case where gene symbol
 cannot be obtained

---
 mavedb_mapping/blat_alignment.py | 128 ++++++++++++++++---------------
 1 file changed, 66 insertions(+), 62 deletions(-)

diff --git a/mavedb_mapping/blat_alignment.py b/mavedb_mapping/blat_alignment.py
index 0b17f98..d535dc7 100644
--- a/mavedb_mapping/blat_alignment.py
+++ b/mavedb_mapping/blat_alignment.py
@@ -6,7 +6,6 @@
 from mavedb_mapping import path_to_hg38_file
 
 
-
 # TODO: edit docstrings
 def get_gene_symb(dat):
     try:
@@ -39,7 +38,6 @@ def get_hgnc_accession(temp, source_dict):
     return accession
 
 
-
 def get_sequence_interval(records):
     for record in records:
         for location in record.locations:
@@ -144,69 +142,49 @@ def extract_blat_output(dat: dict):
     return output
 
 
-def get_query_and_hit_ranges(output, dat):
-    """
-    Extracts query and hit ranges from the BLAT output.
-
-    Parameters
-    ----------
-        output:
-            Output from the BLAT query.
-        dat: dict
-            Dictionary containing data required for mapping.
-
-    Returns
-    -------
-        Tuple containing the chromosome, strand, coverage, identity, query ranges, hit ranges, and gene symbol.
-    """
-    hit_scores = list()
-    hit_dict = {}
-    use_chr = False
-    chrom = strand = ""
-    coverage = identity = None
-    query_ranges = hit_ranges = list()
+def obtain_hit_starts(output, hit):
+    hit_starts = list()
+    for n in range(len(output[hit])):
+        hit_starts.append(output[hit][n].hit_start)
+    return hit_starts
 
-    gsymb = get_gene_symb(dat)
-    temp, source_dict = get_gene_data(gsymb)
-    accession = get_hgnc_accession(temp, source_dict)
 
-    correct_chr = return_gene_data(True, temp, source_dict)
+def using_gene_normalizer(output, correct_chr, temp, source_dict):
     for c in range(len(output)):
         if correct_chr == output[c].id.strip("chr"):
-            use_chr = True
+            hit = c
             break
-        if (
-            correct_chr == "NA"
-        ):  # Take top scoring hit if target not found using gene normalizer
-            hit_scores = list()
-            for e in range(len(output[c])):
-                hit_scores.append(output[c][e].score)
-            hit_dict[c] = hit_scores
-    if use_chr == False:
-        for key in hit_dict:
-            hit_dict[key] = max(hit_dict[key])
-        hit = max(hit_dict, key=hit_dict.get)
-    else:
-        hit = c
-
     loc_dict = return_gene_data(False, temp, source_dict)
+    hit_starts = obtain_hit_starts(output, hit)
+    hsp = output[hit][
+        hit_starts.index(min(hit_starts, key=lambda x: abs(x - loc_dict["start"])))
+    ]
+    return hsp
 
-    hit_starts = list()
-    for n in range(len(output[hit])):
-        hit_starts.append(output[hit][n].hit_start)
-
-    sub_scores = list()
-    for n in range(len(output[hit])):
-        sub_scores.append(output[hit][n].score)
 
-    if loc_dict == "NA":
-        hsp = output[hit][
-            sub_scores.index(max(sub_scores))
-        ]  # Take top score if no match found
-    else:
-        hsp = output[hit][
-            hit_starts.index(min(hit_starts, key=lambda x: abs(x - loc_dict["start"])))
-        ]
+def obtain_hsp(output):
+    hit_dict = {}
+    for c in range(
+        len(output)
+    ):  # Take top scoring hit if target not found using gene normalizer
+        hit_scores = list()
+        for e in range(len(output[c])):
+            hit_scores.append(output[c][e].score)
+        hit_dict[c] = hit_scores
+    # maybe can include in above for loop
+    for key in hit_dict:
+        hit_dict[key] = max(hit_dict[key])
+    hit = max(hit_dict, key=hit_dict.get)
+    hit_starts = obtain_hit_starts(output, hit)
+    hsp = output[hit][hit_starts.index(max(hit_starts))]
+    return hsp
+
+
+def from_hsp_output(hsp, output, gsymb):
+    query_ranges = hit_ranges = []
+    strand = hsp[0].query_strand
+    coverage = 100 * (hsp.query_end - hsp.query_start) / output.seq_len
+    identity = hsp.ident_pct
 
     for j in range(len(hsp)):
         test_file = open("blat_output_test.txt", "w")
@@ -215,10 +193,6 @@ def get_query_and_hit_ranges(output, dat):
 
         query_string = ""
         hit_string = ""
-        strand = hsp[0].query_strand
-        coverage = 100 * (hsp.query_end - hsp.query_start) / output.seq_len
-        # coverage = f"{hsp.query_end - hsp.query_start} / {output.seq_len}, {coverage}"
-        identity = hsp.ident_pct
 
         test_file = open("blat_output_test.txt", "r")
         for k, line in enumerate(test_file):
@@ -235,8 +209,38 @@ def get_query_and_hit_ranges(output, dat):
         hit_string = hit_string.split(" ")
         query_ranges.append(query_string[2])
         hit_ranges.append(hit_string[4])
+    return chrom, strand, coverage, identity, query_ranges, hit_ranges, gsymb, None
+
+
+def get_query_and_hit_ranges(output, dat):
+    """
+    Extracts query and hit ranges from the BLAT output.
+
+    Parameters
+    ----------
+        output:
+            Output from the BLAT query.
+        dat: dict
+            Dictionary containing data required for mapping.
+
+    Returns
+    -------
+        Tuple containing the chromosome, strand, coverage, identity, query ranges, hit ranges, and gene symbol.
+    """
+    gsymb = get_gene_symb(dat)
+    temp, source_dict = get_gene_data(gsymb)
+
+    if "HGNC" in source_dict and (
+        source_dict.get("Ensembl") is not None or source_dict.get("NCBI") is not None
+    ):
+        # TODO: remove these conditions from return_gene_data
+        correct_chr = return_gene_data(True, temp, source_dict)
+        hsp = using_gene_normalizer(output, correct_chr, temp, source_dict)
+    else:
+        hsp = obtain_hsp(output)
 
-    return chrom, strand, coverage, identity, query_ranges, hit_ranges, gsymb, accession
+    data_tuple = from_hsp_output(hsp, output, gsymb)
+    return data_tuple
 
 
 def mave_to_blat(dat):

From f2a6799a7a5a42b05296caae1508fe3c4d15c91b Mon Sep 17 00:00:00 2001
From: Samriddhi Singh <samriddhisingh6079@gmail.com>
Date: Sat, 29 Jul 2023 01:55:52 +0530
Subject: [PATCH 04/12] Added some docstrings

---
 mavedb_mapping/blat_alignment.py | 41 ++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/mavedb_mapping/blat_alignment.py b/mavedb_mapping/blat_alignment.py
index d535dc7..c0a8790 100644
--- a/mavedb_mapping/blat_alignment.py
+++ b/mavedb_mapping/blat_alignment.py
@@ -8,6 +8,23 @@
 
 # TODO: edit docstrings
 def get_gene_symb(dat):
+    """
+    Obtains gene symbol using gene normalizer
+
+    Parameters
+    ----------
+        dat: dict
+            Dictionary containing data required for mapping from MAVEDB scoreset.
+
+
+    Returns:
+    --------
+        str:
+            Gene symbol
+
+        If gene symbol cannot be extracted, returns None.
+
+    """
     try:
         uniprot = dat["uniprot_id"]
         gsymb = qh.normalize(str(f"uniprot:{uniprot}")).gene_descriptor.label
@@ -16,7 +33,7 @@ def get_gene_symb(dat):
             target = dat["target"].split(" ")[0]
             gsymb = qh.normalize(target).gene_descriptor.label
         except:
-            return "NA"
+            return None
     return gsymb
 
 
@@ -39,6 +56,10 @@ def get_hgnc_accession(temp, source_dict):
 
 
 def get_sequence_interval(records):
+    """
+    Helper function to obtain gene data
+
+    """
     for record in records:
         for location in record.locations:
             if location.interval.type == "SequenceInterval":
@@ -56,11 +77,10 @@ def return_gene_data(return_chr: bool, temp, source_dict):
         return_chr :bool
            If True, returns chromosome information.
 
-        dat: dict
-            Dictionary containing data required for mapping.
+        temp: data obtained from gene normalizer
 
-        gsymb: str
-            Gene symbol.
+        source_dict: dict
+            Dictionary of sources (NCBI, HGNC, Ensembl)
 
 
     Returns:
@@ -102,10 +122,10 @@ def return_gene_data(return_chr: bool, temp, source_dict):
 
 def extract_blat_output(dat: dict):
     """
+    Runs a BLAT Query and returns the output
+
     Parameters
     ----------
-        return_chr :bool
-           If True, returns chromosome information.
 
         dat: dict
             Dictionary containing data required for mapping.
@@ -113,12 +133,7 @@ def extract_blat_output(dat: dict):
 
     Returns:
     --------
-        str:
-            If return_chr is True, returns the chromosome value as a string.
-            If gene symbol cannot be extracted, returns 'NA'.
-        OR
-        dict:
-            If gene symbol can be extracted and return_chr is False
+        BLAT Output
     """
     blat_file = open("blat_query.fa", "w")
     blat_file.write(">" + dat["target"] + "\n")

From 930acd80ca659038bc9d84600ed9757726925c9d Mon Sep 17 00:00:00 2001
From: Samriddhi Singh <samriddhisingh6079@gmail.com>
Date: Tue, 1 Aug 2023 11:12:15 +0530
Subject: [PATCH 05/12] Added tests directory

---
 mavedb_mapping/__init__.py                                    | 2 +-
 {mavedb_mapping/tests => tests}/conftest.py                   | 0
 {mavedb_mapping/tests => tests}/data/00000001-a-4.json        | 0
 {mavedb_mapping/tests => tests}/data/00000041-a-1.json        | 0
 {mavedb_mapping/tests => tests}/data/scores-00000001-a-4      | 0
 {mavedb_mapping/tests => tests}/data/scores-00000041-a-1      | 0
 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000001-a-4  | 0
 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000004-a-1  | 0
 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000010-a-1  | 0
 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000018-a-1  | 0
 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000041-a-1  | 0
 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000041-b-1  | 0
 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000060-a-1  | 0
 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000083-e-1  | 0
 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000097-c-1  | 0
 {mavedb_mapping/tests => tests}/test_for_blat.py              | 4 ++--
 {mavedb_mapping/tests => tests}/test_for_invalid_inputs.py    | 0
 {mavedb_mapping/tests => tests}/test_for_mapped_variants.py   | 0
 .../tests => tests}/test_for_transcript_selection.py          | 0
 19 files changed, 3 insertions(+), 3 deletions(-)
 rename {mavedb_mapping/tests => tests}/conftest.py (100%)
 rename {mavedb_mapping/tests => tests}/data/00000001-a-4.json (100%)
 rename {mavedb_mapping/tests => tests}/data/00000041-a-1.json (100%)
 rename {mavedb_mapping/tests => tests}/data/scores-00000001-a-4 (100%)
 rename {mavedb_mapping/tests => tests}/data/scores-00000041-a-1 (100%)
 rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000001-a-4 (100%)
 rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000004-a-1 (100%)
 rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000010-a-1 (100%)
 rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000018-a-1 (100%)
 rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000041-a-1 (100%)
 rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000041-b-1 (100%)
 rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000060-a-1 (100%)
 rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000083-e-1 (100%)
 rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000097-c-1 (100%)
 rename {mavedb_mapping/tests => tests}/test_for_blat.py (96%)
 rename {mavedb_mapping/tests => tests}/test_for_invalid_inputs.py (100%)
 rename {mavedb_mapping/tests => tests}/test_for_mapped_variants.py (100%)
 rename {mavedb_mapping/tests => tests}/test_for_transcript_selection.py (100%)

diff --git a/mavedb_mapping/__init__.py b/mavedb_mapping/__init__.py
index e6a02f3..feaa027 100644
--- a/mavedb_mapping/__init__.py
+++ b/mavedb_mapping/__init__.py
@@ -7,7 +7,7 @@
 
 path_to_seqrepo = os.getenv("PATH_TO_SEQREPO", "/usr/local/share/seqrepo/latest")
 path_to_hg38_file = os.getenv("HG38_FILE", "hg38.2bit")  # default- in current directory
-data_file_path = "mavedb_mapping/tests/data/"
+data_file_path = "tests/data/"
 
 # utadb = UTADatabase(db_pwd="uta")
 qh = QueryHandler(create_db())
diff --git a/mavedb_mapping/tests/conftest.py b/tests/conftest.py
similarity index 100%
rename from mavedb_mapping/tests/conftest.py
rename to tests/conftest.py
diff --git a/mavedb_mapping/tests/data/00000001-a-4.json b/tests/data/00000001-a-4.json
similarity index 100%
rename from mavedb_mapping/tests/data/00000001-a-4.json
rename to tests/data/00000001-a-4.json
diff --git a/mavedb_mapping/tests/data/00000041-a-1.json b/tests/data/00000041-a-1.json
similarity index 100%
rename from mavedb_mapping/tests/data/00000041-a-1.json
rename to tests/data/00000041-a-1.json
diff --git a/mavedb_mapping/tests/data/scores-00000001-a-4 b/tests/data/scores-00000001-a-4
similarity index 100%
rename from mavedb_mapping/tests/data/scores-00000001-a-4
rename to tests/data/scores-00000001-a-4
diff --git a/mavedb_mapping/tests/data/scores-00000041-a-1 b/tests/data/scores-00000041-a-1
similarity index 100%
rename from mavedb_mapping/tests/data/scores-00000041-a-1
rename to tests/data/scores-00000041-a-1
diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000001-a-4 b/tests/data/urn:mavedb:00000001-a-4
similarity index 100%
rename from mavedb_mapping/tests/data/urn:mavedb:00000001-a-4
rename to tests/data/urn:mavedb:00000001-a-4
diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000004-a-1 b/tests/data/urn:mavedb:00000004-a-1
similarity index 100%
rename from mavedb_mapping/tests/data/urn:mavedb:00000004-a-1
rename to tests/data/urn:mavedb:00000004-a-1
diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000010-a-1 b/tests/data/urn:mavedb:00000010-a-1
similarity index 100%
rename from mavedb_mapping/tests/data/urn:mavedb:00000010-a-1
rename to tests/data/urn:mavedb:00000010-a-1
diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000018-a-1 b/tests/data/urn:mavedb:00000018-a-1
similarity index 100%
rename from mavedb_mapping/tests/data/urn:mavedb:00000018-a-1
rename to tests/data/urn:mavedb:00000018-a-1
diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000041-a-1 b/tests/data/urn:mavedb:00000041-a-1
similarity index 100%
rename from mavedb_mapping/tests/data/urn:mavedb:00000041-a-1
rename to tests/data/urn:mavedb:00000041-a-1
diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000041-b-1 b/tests/data/urn:mavedb:00000041-b-1
similarity index 100%
rename from mavedb_mapping/tests/data/urn:mavedb:00000041-b-1
rename to tests/data/urn:mavedb:00000041-b-1
diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000060-a-1 b/tests/data/urn:mavedb:00000060-a-1
similarity index 100%
rename from mavedb_mapping/tests/data/urn:mavedb:00000060-a-1
rename to tests/data/urn:mavedb:00000060-a-1
diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000083-e-1 b/tests/data/urn:mavedb:00000083-e-1
similarity index 100%
rename from mavedb_mapping/tests/data/urn:mavedb:00000083-e-1
rename to tests/data/urn:mavedb:00000083-e-1
diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000097-c-1 b/tests/data/urn:mavedb:00000097-c-1
similarity index 100%
rename from mavedb_mapping/tests/data/urn:mavedb:00000097-c-1
rename to tests/data/urn:mavedb:00000097-c-1
diff --git a/mavedb_mapping/tests/test_for_blat.py b/tests/test_for_blat.py
similarity index 96%
rename from mavedb_mapping/tests/test_for_blat.py
rename to tests/test_for_blat.py
index e37731d..8e37613 100644
--- a/mavedb_mapping/tests/test_for_blat.py
+++ b/tests/test_for_blat.py
@@ -2,10 +2,10 @@
 from mavedb_mapping.blat_alignment import mave_to_blat
 from mavedb_mapping.metadata_process import metadata_obtain
 from mavedb_mapping.transcript_selection_helper import HelperFunctionsForBLATOutput
-
+from mavedb_mapping import data_file_path
 """Tests that run check_non_human function after BLAT Alignment to determine if scoreset is Human"""
 
-data_file_path = "mavedb_mapping/tests/data/"
+
 
 
 @pytest.fixture
diff --git a/mavedb_mapping/tests/test_for_invalid_inputs.py b/tests/test_for_invalid_inputs.py
similarity index 100%
rename from mavedb_mapping/tests/test_for_invalid_inputs.py
rename to tests/test_for_invalid_inputs.py
diff --git a/mavedb_mapping/tests/test_for_mapped_variants.py b/tests/test_for_mapped_variants.py
similarity index 100%
rename from mavedb_mapping/tests/test_for_mapped_variants.py
rename to tests/test_for_mapped_variants.py
diff --git a/mavedb_mapping/tests/test_for_transcript_selection.py b/tests/test_for_transcript_selection.py
similarity index 100%
rename from mavedb_mapping/tests/test_for_transcript_selection.py
rename to tests/test_for_transcript_selection.py

From 6c7f2cd3366c117eec08521e6a3b095ce1b6018c Mon Sep 17 00:00:00 2001
From: Samriddhi Singh <samriddhisingh6079@gmail.com>
Date: Tue, 1 Aug 2023 11:41:28 +0530
Subject: [PATCH 06/12] Added workflow

---
 .github/workflows/python-package.yml | 29 ++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 .github/workflows/python-package.yml

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
new file mode 100644
index 0000000..ff4e7c5
--- /dev/null
+++ b/.github/workflows/python-package.yml
@@ -0,0 +1,29 @@
+name: Python package
+
+on: [push]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11"]
+    env:
+        PATH_TO_SEQREPO: ./tests/data/seqrepo/latest
+        HG38_FILE: ./tests/data/hg38.2bit
+        
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r .requirements.txt  
+      - name: Test with pytest
+        run: |
+          export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+          pytest

From a99afb254eb38c3a397af6a991db6853b6018cb3 Mon Sep 17 00:00:00 2001
From: Samriddhi Singh <samriddhisingh6079@gmail.com>
Date: Tue, 1 Aug 2023 11:44:31 +0530
Subject: [PATCH 07/12] added requirements.txt

---
 requirements.txt | 149 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f587c4d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,149 @@
+-i https://pypi.org/simple
+aiofiles>=22.1.0 ; python_version >= '3.7' and python_version < '4.0'
+anyio>=3.6.2 ; python_full_version >= '3.6.2'
+appdirs>=1.4.4
+appnope>=0.1.3 ; platform_system >= 'Darwin'
+argcomplete>=2.0.0 ; python_version >= '3.6'
+argh>=0.26.2
+argon2-cffi>=21.3.0 ; python_version >= '3.6'
+argon2-cffi-bindings>=21.2.0 ; python_version >= '3.6'
+asttokens>=2.1.0
+asyncio>=3.4.3
+asyncpg>=0.27.0 ; python_full_version >= '3.7.0'
+attrs>=22.1.0 ; python_version >= '3.5'
+babel>=2.11.0 ; python_version >= '3.6'
+backcall>=0.2.0
+beautifulsoup4>=4.11.1 ; python_full_version >= '3.6.0'
+biocommons.seqrepo>=0.6.5
+biopython>=1.80
+bioutils>=0.5.7 ; python_version >= '3.6'
+bleach>=5.0.1 ; python_version >= '3.7'
+boto3>=1.26.16 ; python_version >= '3.7'
+botocore>=1.29.16 ; python_version >= '3.7'
+bs4>=0.0.1
+canonicaljson>=1.6.4 ; python_version >= '3.7'
+certifi>=2022.9.24 ; python_version >= '3.6'
+cffi>=1.15.1
+charset-normalizer>=2.1.1 ; python_full_version >= '3.6.0'
+click>=8.1.3 ; python_version >= '3.7'
+coloredlogs>=15.0.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
+configparser>=5.3.0 ; python_version >= '3.7'
+cool-seq-tool>=0.1.4
+cssselect>=1.2.0 ; python_version >= '3.7'
+debugpy>=1.6.3 ; python_version >= '3.7'
+decorator>=5.1.1 ; python_version >= '3.5'
+defusedxml>=0.7.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
+entrypoints>=0.4 ; python_version >= '3.6'
+executing>=1.2.0
+fake-useragent>=1.0.1
+fastapi>=0.87.0 ; python_version >= '3.7'
+fastjsonschema>=2.16.2
+ga4gh.vrs[extras]>=0.8.7.dev0 ; python_version >= '3.6'
+ga4gh.vrsatile.pydantic>=0.0.13
+gene-normalizer>=0.1.37
+gffutils>=0.11.1
+h11>=0.14.0 ; python_version >= '3.7'
+hgvs>=1.5.4 ; python_version >= '3.6'
+humanfriendly>=10.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
+idna>=3.4 ; python_version >= '3.5'
+importlib-metadata>=5.0.0 ; python_version >= '3.7'
+inflection>=0.5.1 ; python_version >= '3.5'
+ipykernel>=6.17.1 ; python_version >= '3.8'
+ipython>=8.6.0 ; python_version >= '3.8'
+ipython-genutils>=0.2.0
+ipywidgets>=8.0.2 ; python_version >= '3.7'
+jedi>=0.18.2 ; python_version >= '3.6'
+jinja2>=3.1.2 ; python_version >= '3.7'
+jmespath>=1.0.1 ; python_version >= '3.7'
+json5>=0.9.10
+jsonschema>=3.2.0
+jupyter>=1.0.0
+jupyter-client>=7.4.7 ; python_version >= '3.7'
+jupyter-console>=6.4.4 ; python_version >= '3.7'
+jupyter-core>=5.0.0 ; python_version >= '3.8'
+jupyter-server>=1.23.3 ; python_version >= '3.7'
+jupyterlab>=3.5.0
+jupyterlab-pygments>=0.2.2 ; python_version >= '3.7'
+jupyterlab-server>=2.16.3 ; python_version >= '3.7'
+jupyterlab-widgets>=3.0.3 ; python_version >= '3.7'
+lxml>=4.9.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
+markdown>=3.4.1 ; python_version >= '3.7'
+markupsafe>=2.1.1 ; python_version >= '3.7'
+matplotlib-inline>=0.1.6 ; python_version >= '3.5'
+mistune>=2.0.4
+nbclassic>=0.4.8 ; python_version >= '3.7'
+nbclient>=0.7.0 ; python_full_version >= '3.7.0'
+nbconvert>=7.2.5 ; python_version >= '3.7'
+nbformat>=5.7.0 ; python_version >= '3.7'
+nest-asyncio>=1.5.6
+notebook>=6.5.2 ; python_version >= '3.7'
+notebook-shim>=0.2.2 ; python_version >= '3.7'
+numpy>=1.23.5 ; python_version >= '3.8'
+packaging>=21.3 ; python_version >= '3.6'
+pandas>=1.5.2
+pandocfilters>=1.5.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
+parse>=1.19.0
+parsley>=1.3
+parso>=0.8.3 ; python_version >= '3.6'
+pexpect>=4.8.0 ; sys_platform != 'win32'
+pickleshare>=0.7.5
+platformdirs>=2.5.4 ; python_version >= '3.7'
+plotly>=5.11.0
+prometheus-client>=0.15.0 ; python_version >= '3.6'
+prompt-toolkit>=3.0.33 ; python_full_version >= '3.6.2'
+psutil>=5.9.4 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
+psycopg>=3.1.9
+psycopg2>=2.9.5 ; python_version >= '3.6'
+psycopg2-binary>=2.9.5
+ptyprocess>=0.7.0 ; os_name != 'nt'
+pure-eval>=0.2.2
+pycparser>=2.21
+pydantic>=1.10.2 ; python_version >= '3.7'
+pyee>=8.2.2
+pyfaidx>=0.7.1
+pygments>=2.13.0 ; python_version >= '3.6'
+pyliftover>=0.4
+pyparsing>=3.0.9 ; python_full_version >= '3.6.8'
+pyppeteer>=1.0.2 ; python_version >= '3.7' and python_version < '4.0'
+pyquery>=1.4.3
+pyrsistent>=0.19.2 ; python_version >= '3.7'
+pysam>=0.18.0
+pytest>=6.0
+python-dateutil>=2.8.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
+python-jsonschema-objects>=0.4.1
+pytz>=2022.6
+pyyaml>=6.0 ; python_version >= '3.6'
+pyzmq>=24.0.1 ; python_version >= '3.6'
+qtconsole>=5.4.0 ; python_version >= '3.7'
+qtpy>=2.3.0 ; python_version >= '3.7'
+requests>=2.28.1
+requests-html>=0.10.0 ; python_full_version >= '3.6.0'
+s3transfer>=0.6.0 ; python_version >= '3.7'
+send2trash>=1.8.0
+setuptools>=65.6.2 ; python_version >= '3.7'
+simplejson>=3.18.0 ; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2, 3.3'
+six>=1.16.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
+sniffio>=1.3.0 ; python_version >= '3.7'
+soupsieve>=2.3.2.post1 ; python_version >= '3.6'
+sqlparse>=0.4.3 ; python_version >= '3.5'
+stack-data>=0.6.1
+starlette>=0.21.0 ; python_version >= '3.7'
+tabulate>=0.9.0 ; python_version >= '3.7'
+tenacity>=8.1.0 ; python_version >= '3.6'
+terminado>=0.17.0 ; python_version >= '3.7'
+tinycss2>=1.2.1 ; python_version >= '3.7'
+tomli>=2.0.1 ; python_version >= '3.7'
+tornado>=6.2 ; python_version >= '3.7'
+tqdm>=4.64.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
+traitlets>=5.5.0 ; python_version >= '3.7'
+typing-extensions>=4.4.0 ; python_version >= '3.7'
+urllib3>=1.26.12 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' and python_version < '4'
+uvicorn>=0.20.0 ; python_version >= '3.7'
+w3lib>=2.0.1 ; python_version >= '3.6'
+wcwidth>=0.2.5
+webencodings>=0.5.1
+websocket-client>=1.4.2 ; python_version >= '3.7'
+websockets>=10.4 ; python_version >= '3.7'
+widgetsnbextension>=4.0.3 ; python_version >= '3.7'
+yoyo-migrations>=8.1.0
+zipp>=3.10.0 ; python_version >= '3.7'
\ No newline at end of file

From a220360f32e2b1e7b77e9567dd7f0b39ce5b7900 Mon Sep 17 00:00:00 2001
From: Samriddhi Singh <samriddhisingh6079@gmail.com>
Date: Tue, 1 Aug 2023 11:52:22 +0530
Subject: [PATCH 08/12] Edited workflow file

---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index ff4e7c5..c07a728 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -22,7 +22,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -r .requirements.txt  
+          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
       - name: Test with pytest
         run: |
           export PYTHONPATH="${PYTHONPATH}:$(pwd)"

From 6ac7fccea0b6aa896afeee1b147166404584335a Mon Sep 17 00:00:00 2001
From: Samriddhi Singh <samriddhisingh6079@gmail.com>
Date: Tue, 1 Aug 2023 11:57:21 +0530
Subject: [PATCH 09/12] Edited worflow file

---
 .github/workflows/python-package.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index c07a728..ab40f9f 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -12,6 +12,7 @@ jobs:
     env:
         PATH_TO_SEQREPO: ./tests/data/seqrepo/latest
         HG38_FILE: ./tests/data/hg38.2bit
+        GENE_NORM_DB_URL: postgres://postgres@localhost:5432/gene_normalizer
         
     steps:
       - uses: actions/checkout@v3

From 808e01202537fc7117f7597e40b34eb558fd767e Mon Sep 17 00:00:00 2001
From: Samriddhi Singh <samriddhisingh6079@gmail.com>
Date: Wed, 2 Aug 2023 03:22:15 +0530
Subject: [PATCH 10/12] Docstrings

---
 .github/workflows/python-package.yml |   7 +-
 blat_out.psl                         |   4 +-
 blat_output_test.txt                 |   8 +-
 blat_query.fa                        |   4 +-
 mavedb_mapping/__init__.py           |  10 ++-
 mavedb_mapping/blat_alignment.py     | 107 +++++++++++++++++++--------
 6 files changed, 97 insertions(+), 43 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index ab40f9f..592ad8e 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -12,7 +12,6 @@ jobs:
     env:
         PATH_TO_SEQREPO: ./tests/data/seqrepo/latest
         HG38_FILE: ./tests/data/hg38.2bit
-        GENE_NORM_DB_URL: postgres://postgres@localhost:5432/gene_normalizer
         
     steps:
       - uses: actions/checkout@v3
@@ -24,7 +23,13 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+
       - name: Test with pytest
+        env:
+          PATH_TO_SEQREPO: ./tests/data/seqrepo/latest
+          HG38_FILE: ./tests/data/hg38.2bit
+          GENE_NORM_DB_URL: postgres://postgres@localhost:5432/gene_normalizer
+      
         run: |
           export PYTHONPATH="${PYTHONPATH}:$(pwd)"
           pytest
diff --git a/blat_out.psl b/blat_out.psl
index ee794c5..a5fb47c 100644
--- a/blat_out.psl
+++ b/blat_out.psl
@@ -3,6 +3,4 @@ psLayout version 3
 match	mis- 	rep. 	N's	Q gap	Q gap	T gap	T gap	strand	Q        	Q   	Q    	Q  	T        	T   	T    	T  	block	blockSizes 	qStarts	 tStarts
      	match	match	   	count	bases	count	bases	      	name     	size	start	end	name     	size	start	end	count
 ---------------------------------------------------------------------------------------------------------------------------------------------------------------
-52	2	0	0	1	34	1	33	+	UBE2I	477	337	425	chr2	242193529	30078144	30078231	2	37,17,	337,408,	30078144,30078214,
-477	0	0	0	0	0	5	10286	+	UBE2I	477	0	477	chr16	90338345	1314030	1324793	6	66,84,73,110,80,64,	0,66,150,223,333,413,	1314030,1314292,1315653,1320173,1320437,1324729,
-23	0	0	0	0	0	0	0	-	UBE2I	477	326	349	chr1	248956422	187003058	187003081	1	23,	128,	187003058,
+179	18	0	0	0	0	1	332	+	E4B	309	98	295	chr1	248956422	10179413	10179942	2	149,48,	98,247,	10179413,10179894,
diff --git a/blat_output_test.txt b/blat_output_test.txt
index 1beee5d..0fb8f46 100644
--- a/blat_output_test.txt
+++ b/blat_output_test.txt
@@ -1,5 +1,5 @@
-      Query: UBE2I <unknown description>
-        Hit: chr16 <unknown description>
-Query range: [413:477] (1)
-  Hit range: [1324729:1324793] (1)
+      Query: E4B <unknown description>
+        Hit: chr1 <unknown description>
+Query range: [247:295] (1)
+  Hit range: [10179894:10179942] (1)
   Fragments: 1 (? columns)
\ No newline at end of file
diff --git a/blat_query.fa b/blat_query.fa
index 14dab3f..f75c11f 100644
--- a/blat_query.fa
+++ b/blat_query.fa
@@ -1,2 +1,2 @@
->UBE2I
-ATGTCGGGGATCGCCCTCAGCAGACTCGCCCAGGAGAGGAAAGCATGGAGGAAAGACCACCCATTTGGTTTCGTGGCTGTCCCAACAAAAAATCCCGATGGCACGATGAACCTCATGAACTGGGAGTGCGCCATTCCAGGAAAGAAAGGGACTCCGTGGGAAGGAGGCTTGTTTAAACTACGGATGCTTTTCAAAGATGATTATCCATCTTCGCCACCAAAATGTAAATTCGAACCACCATTATTTCACCCGAATGTGTACCCTTCGGGGACAGTGTGCCTGTCCATCTTAGAGGAGGACAAGGACTGGAGGCCAGCCATCACAATCAAACAGATCCTATTAGGAATACAGGAACTTCTAAATGAACCAAATATCCAAGACCCAGCTCAAGCAGAGGCCTACACGATTTACTGCCAAAACAGAGTGGAGTACGAGAAAAGGGTCCGAGCACAAGCCAAGAAGTTTGCGCCCTCATAA
+>E4B
+ATAGAGAAGTTTAAACTTCTTGCAGAGAAAGTGGAGGAAATCGTGGCAAAGAATGCGCGGGCAGAAATAGACTACAGCGATGCCCCGGACGAGTTCAGAGACCCTCTGATGGACACCCTGATGACCGATCCCGTGAGACTGCCCTCTGGCACCGTCATGGACCGTTCTATCATCCTGCGGCATCTGCTCAACTCCCCCACCGACCCCTTCAACCGCCAGATGCTGACTGAGAGCATGCTGGAGCCAGTGCCAGAGCTAAAGGAGCAGATTCAGGCCTGGATGAGAGAGAAACAGAGCAGTGACCACTGA
diff --git a/mavedb_mapping/__init__.py b/mavedb_mapping/__init__.py
index feaa027..d23f556 100644
--- a/mavedb_mapping/__init__.py
+++ b/mavedb_mapping/__init__.py
@@ -1,10 +1,14 @@
+
 import os
-from gene.query import QueryHandler
-from gene.database import create_db
 from biocommons.seqrepo import SeqRepo
-from ga4gh.vrs.dataproxy import SeqRepoDataProxy
+
 from cool_seq_tool.data_sources.uta_database import UTADatabase
 
+from ga4gh.vrs.dataproxy import SeqRepoDataProxy
+
+from gene.query import QueryHandler
+from gene.database import create_db
+
 path_to_seqrepo = os.getenv("PATH_TO_SEQREPO", "/usr/local/share/seqrepo/latest")
 path_to_hg38_file = os.getenv("HG38_FILE", "hg38.2bit")  # default- in current directory
 data_file_path = "tests/data/"
diff --git a/mavedb_mapping/blat_alignment.py b/mavedb_mapping/blat_alignment.py
index c0a8790..6340036 100644
--- a/mavedb_mapping/blat_alignment.py
+++ b/mavedb_mapping/blat_alignment.py
@@ -6,10 +6,9 @@
 from mavedb_mapping import path_to_hg38_file
 
 
-# TODO: edit docstrings
-def get_gene_symb(dat):
+def get_gene_symb(dat:dict):
     """
-    Obtains gene symbol using gene normalizer
+    Obtains gene symbol using gene normalizer.
 
     Parameters
     ----------
@@ -37,10 +36,24 @@ def get_gene_symb(dat):
     return gsymb
 
 
-def get_gene_data(gsymb: str):
-    if gsymb == "NA":
-        return "NA", "NA"
+def get_gene_data(gsymb: str)->tuple:
+    """
+    Searches through QueryHandler using obtained gene symbol.
+
+    Parameters
+    ----------
+        gsymb: str
+            Gene symbol
 
+    Returns:
+    --------
+        tuple:
+            source_dict: dictionary of sources with their index in temp
+            temp: QueryHandler search output
+    
+    """
+    if not gsymb:
+        return (None, None)
     temp = qh.search(gsymb).source_matches
     source_dict = {}
     for i in range(len(temp)):
@@ -48,7 +61,8 @@ def get_gene_data(gsymb: str):
     return temp, source_dict
 
 
-def get_hgnc_accession(temp, source_dict):
+def get_hgnc_accession(temp, source_dict:dict):
+    """Function that obtains HGNC accession id (like hgnc:12345)"""
     if temp == "NA":
         return "NA"
     accession = temp[source_dict["HGNC"]].records[0].concept_id
@@ -70,17 +84,17 @@ def get_sequence_interval(records):
     return None
 
 
-def return_gene_data(return_chr: bool, temp, source_dict):
+def return_gene_data(return_chr: bool, temp, source_dict:dict):
     """
     Parameters
     ----------
         return_chr :bool
            If True, returns chromosome information.
 
-        temp: data obtained from gene normalizer
-
         source_dict: dict
-            Dictionary of sources (NCBI, HGNC, Ensembl)
+            dictionary of sources with their index in temp
+
+        temp: QueryHandler search output
 
 
     Returns:
@@ -93,7 +107,7 @@ def return_gene_data(return_chr: bool, temp, source_dict):
             If gene symbol can be extracted and return_chr is False
 
     """
-    if temp == "NA":
+    if not temp:
         return "NA"
 
     if "HGNC" in source_dict and return_chr == True:
@@ -126,11 +140,9 @@ def extract_blat_output(dat: dict):
 
     Parameters
     ----------
-
         dat: dict
             Dictionary containing data required for mapping.
 
-
     Returns:
     --------
         BLAT Output
@@ -158,13 +170,18 @@ def extract_blat_output(dat: dict):
 
 
 def obtain_hit_starts(output, hit):
+    #a hit is a portion of similarity between query seq and matched seq
+    """
+        Helper function to obtain HSP
+        Returns the start of hit  
+    """
     hit_starts = list()
     for n in range(len(output[hit])):
         hit_starts.append(output[hit][n].hit_start)
     return hit_starts
 
 
-def using_gene_normalizer(output, correct_chr, temp, source_dict):
+def obtain_hsp_using_gene_normalizer(output, correct_chr, temp, source_dict):
     for c in range(len(output)):
         if correct_chr == output[c].id.strip("chr"):
             hit = c
@@ -178,17 +195,27 @@ def using_gene_normalizer(output, correct_chr, temp, source_dict):
 
 
 def obtain_hsp(output):
+    """
+    Obtains high-scoring pairs (HSP) for query sequence
+     
+    Parameters
+    ----------
+        output: BLAT output
+
+    Returns
+    -------
+        HSP
+    
+    """
     hit_dict = {}
-    for c in range(
-        len(output)
-    ):  # Take top scoring hit if target not found using gene normalizer
-        hit_scores = list()
+    for c in range(len(output)):  
+        max_hit = output[c][0].score
         for e in range(len(output[c])):
-            hit_scores.append(output[c][e].score)
-        hit_dict[c] = hit_scores
-    # maybe can include in above for loop
-    for key in hit_dict:
-        hit_dict[key] = max(hit_dict[key])
+            if output[c][e].score>max_hit:
+                max_hit = output[c][e].score
+        hit_dict[c] = max_hit
+    
+    #Using top scoring hit
     hit = max(hit_dict, key=hit_dict.get)
     hit_starts = obtain_hit_starts(output, hit)
     hsp = output[hit][hit_starts.index(max(hit_starts))]
@@ -196,6 +223,22 @@ def obtain_hsp(output):
 
 
 def from_hsp_output(hsp, output, gsymb):
+    """
+    
+    Parameters
+    ----------
+        hsp: 
+            High scoring pairs obtained by using top scoring hit
+        output:
+            BLAT output
+        gsymb: str
+            Gene symbol
+    
+    Returns
+    -------
+        Tuple with data used for mapping
+
+    """
     query_ranges = hit_ranges = []
     strand = hsp[0].query_strand
     coverage = 100 * (hsp.query_end - hsp.query_start) / output.seq_len
@@ -222,12 +265,17 @@ def from_hsp_output(hsp, output, gsymb):
         chrom = chrom.split(" ")[9].strip("chr")
         query_string = query_string.split(" ")
         hit_string = hit_string.split(" ")
+
+        #Range in query sequence that aligned with hit sequence
         query_ranges.append(query_string[2])
+
+        #Range in hit sequence that aligned with hquery sequence
         hit_ranges.append(hit_string[4])
+
     return chrom, strand, coverage, identity, query_ranges, hit_ranges, gsymb, None
 
 
-def get_query_and_hit_ranges(output, dat):
+def get_query_and_hit_ranges(output, dat:dict)->tuple:
     """
     Extracts query and hit ranges from the BLAT output.
 
@@ -245,12 +293,11 @@ def get_query_and_hit_ranges(output, dat):
     gsymb = get_gene_symb(dat)
     temp, source_dict = get_gene_data(gsymb)
 
-    if "HGNC" in source_dict and (
-        source_dict.get("Ensembl") is not None or source_dict.get("NCBI") is not None
-    ):
+    #if "HGNC" in source_dict and (source_dict.get("Ensembl") is not None or source_dict.get("NCBI") is not None):
+    if False: #temporary, will remove using_gene_normalizer
         # TODO: remove these conditions from return_gene_data
         correct_chr = return_gene_data(True, temp, source_dict)
-        hsp = using_gene_normalizer(output, correct_chr, temp, source_dict)
+        hsp = obtain_hsp_using_gene_normalizer(output, correct_chr, temp, source_dict)
     else:
         hsp = obtain_hsp(output)
 
@@ -258,7 +305,7 @@ def get_query_and_hit_ranges(output, dat):
     return data_tuple
 
 
-def mave_to_blat(dat):
+def mave_to_blat(dat:dict)->dict:
     """
 
     Performs BLAT Alignment on MaveDB scoreset data.

From 90aba2b0429d4936b1bca3899de5756e371d19ec Mon Sep 17 00:00:00 2001
From: Samriddhi Singh <samriddhisingh6079@gmail.com>
Date: Thu, 3 Aug 2023 01:33:04 +0530
Subject: [PATCH 11/12] Transcript selection without usage of gene symbol, and
 retrieved gene symbol from MANE transcript

---
 blat_out.psl                           |  6 ----
 blat_output_test.txt                   |  5 ---
 blat_query.fa                          |  2 --
 mavedb_mapping/transcript_selection.py | 42 ++++++++++++--------------
 4 files changed, 19 insertions(+), 36 deletions(-)
 delete mode 100644 blat_out.psl
 delete mode 100644 blat_output_test.txt
 delete mode 100644 blat_query.fa

diff --git a/blat_out.psl b/blat_out.psl
deleted file mode 100644
index a5fb47c..0000000
--- a/blat_out.psl
+++ /dev/null
@@ -1,6 +0,0 @@
-psLayout version 3
-
-match	mis- 	rep. 	N's	Q gap	Q gap	T gap	T gap	strand	Q        	Q   	Q    	Q  	T        	T   	T    	T  	block	blockSizes 	qStarts	 tStarts
-     	match	match	   	count	bases	count	bases	      	name     	size	start	end	name     	size	start	end	count
----------------------------------------------------------------------------------------------------------------------------------------------------------------
-179	18	0	0	0	0	1	332	+	E4B	309	98	295	chr1	248956422	10179413	10179942	2	149,48,	98,247,	10179413,10179894,
diff --git a/blat_output_test.txt b/blat_output_test.txt
deleted file mode 100644
index 0fb8f46..0000000
--- a/blat_output_test.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-      Query: E4B <unknown description>
-        Hit: chr1 <unknown description>
-Query range: [247:295] (1)
-  Hit range: [10179894:10179942] (1)
-  Fragments: 1 (? columns)
\ No newline at end of file
diff --git a/blat_query.fa b/blat_query.fa
deleted file mode 100644
index f75c11f..0000000
--- a/blat_query.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->E4B
-ATAGAGAAGTTTAAACTTCTTGCAGAGAAAGTGGAGGAAATCGTGGCAAAGAATGCGCGGGCAGAAATAGACTACAGCGATGCCCCGGACGAGTTCAGAGACCCTCTGATGGACACCCTGATGACCGATCCCGTGAGACTGCCCTCTGGCACCGTCATGGACCGTTCTATCATCCTGCGGCATCTGCTCAACTCCCCCACCGACCCCTTCAACCGCCAGATGCTGACTGAGAGCATGCTGGAGCCAGTGCCAGAGCTAAAGGAGCAGATTCAGGCCTGGATGAGAGAGAAACAGAGCAGTGACCACTGA
diff --git a/mavedb_mapping/transcript_selection.py b/mavedb_mapping/transcript_selection.py
index ef2d733..9b2ed66 100644
--- a/mavedb_mapping/transcript_selection.py
+++ b/mavedb_mapping/transcript_selection.py
@@ -6,7 +6,7 @@
 from Bio.Seq import Seq
 from bs4 import BeautifulSoup
 from mavedb_mapping.transcript_selection_helper import HelperFunctionsForBLATOutput
-from mavedb_mapping import sr, qh, dp
+from mavedb_mapping import sr, dp
 
 utadb = UTADatabase()
 mane = MANETranscriptMappings()
@@ -15,7 +15,7 @@
 nest_asyncio.apply()
 
 
-async def mapq(locs: list, chrom: str, gsymb: str):
+async def mapq(locs: list, chrom: str):
     """
     Runs a query on UTADB to obtain transcripts
     Parameters
@@ -40,8 +40,7 @@ async def mapq(locs: list, chrom: str, gsymb: str):
     for i in range(len(locs)):
         testquery = f"""select *
                             from uta_20210129.tx_exon_aln_v
-                            where hgnc = '{gsymb}'
-                            and {locs[i][0]} between alt_start_i and alt_end_i
+                            where {locs[i][0]} between alt_start_i and alt_end_i
                             or {locs[i][1]} between alt_start_i and alt_end_i
                             and alt_ac = '{chrom}'"""
 
@@ -54,19 +53,6 @@ async def mapq(locs: list, chrom: str, gsymb: str):
             transcript_lists.append(tl)
     return transcript_lists
 
-
-def get_gsymb(dat):
-    try:
-        uniprot = dat["uniprot_id"]
-        gsymb = qh.normalize(str(f"uniprot:{uniprot}")).gene_descriptor.label
-    except:
-        temp = dat["target"].split(" ")
-        if temp[0] == "JAK":
-            temp[0] = "JAK1"
-        gsymb = qh.normalize(temp[0]).gene_descriptor.label
-    return gsymb
-
-
 def using_uniprot(uniprot: str, ts: str):
     """
     Looks for transcripts using Uniprot ID
@@ -101,7 +87,7 @@ def using_uniprot(uniprot: str, ts: str):
         start = up.find(stri[:10])
         return start, full_match
     except:
-        # print(dat['urn'], 'no transcripts found')
+        #no transcripts foundś
         return "NA", "NA"
 
 
@@ -282,11 +268,14 @@ def main(mave_blat_dict: dict, dat: dict) -> dict:
 
         locs = helper.get_locs_list()
         chrom = helper.get_chr(dp)
-        gsymb = get_gsymb(dat)
-        ts = asyncio.run(mapq(locs, chrom, gsymb))
+        ts = asyncio.run(mapq(locs, chrom))
 
         try:
-            isect = list(set.intersection(*map(set, ts)))
+            for i in ts:
+                i = set(i)
+            intersection_set = set.intersection(set(i))
+            isect = list(intersection_set)
+
         except:
             start, full_match = using_uniprot(dat["uniprot_id"], dat["target_sequence"])
             np = nm = status = "NA"
@@ -298,13 +287,20 @@ def main(mave_blat_dict: dict, dat: dict) -> dict:
                 "RefSeq_prot": "NA",
                 "RefSeq_nuc": "NA",
                 "status": "NA",
-                "gsymb": gsymb,
+                "gsymb": "NA",
             }
             return mappings_dict
 
         mane_trans = mane.get_mane_from_transcripts(isect)
-
+        
+        
         if mane_trans != []:
+            #selecting correct transcript using chrom from transcripts obtained
+            for i in mane_trans:
+                if i['GRCh38_chr'] == chrom:
+                    mane_trans = [i]
+            gsymb = mane_trans[0]["symbol"]
+            
             np, start, full_match, nm, status = from_mane_trans(dat, mane_trans)
         else:
             np, start, full_match, nm, status = no_mane_trans(isect, dat)

From 8d5fba77e091d834bb76e309744902fa0bce5a70 Mon Sep 17 00:00:00 2001
From: samriddhi99 <115928307+samriddhi99@users.noreply.github.com>
Date: Sat, 12 Aug 2023 11:30:53 +0530
Subject: [PATCH 12/12] File name as optional parameter

Co-authored-by: James Stevenson <james.sharpsteen@gmail.com>
---
 mavedb_mapping/blat_alignment.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mavedb_mapping/blat_alignment.py b/mavedb_mapping/blat_alignment.py
index 6340036..e6bb133 100644
--- a/mavedb_mapping/blat_alignment.py
+++ b/mavedb_mapping/blat_alignment.py
@@ -134,7 +134,7 @@ def return_gene_data(return_chr: bool, temp, source_dict:dict):
     return "NA"
 
 
-def extract_blat_output(dat: dict):
+def extract_blat_output(dat: dict, query_filename: str = "blat_query.fa"):
     """
     Runs a BLAT Query and returns the output
 
@@ -142,12 +142,14 @@ def extract_blat_output(dat: dict):
     ----------
         dat: dict
             Dictionary containing data required for mapping.
+        blat_query: str
+            Filename for BLAT query file to save
 
     Returns:
     --------
         BLAT Output
     """
-    blat_file = open("blat_query.fa", "w")
+    blat_file = open(query_filename, "w")
     blat_file.write(">" + dat["target"] + "\n")
     blat_file.write(dat["target_sequence"] + "\n")
     blat_file.close()