From c27f1eb12ea2dc031b516285ae2da2b4c9faae9c Mon Sep 17 00:00:00 2001 From: Samriddhi Singh Date: Tue, 18 Jul 2023 12:19:04 +0530 Subject: [PATCH 01/12] Obtained HGNC accession IDs --- mavedb_mapping/blat_alignment.py | 66 +++++++++++++------------------- 1 file changed, 27 insertions(+), 39 deletions(-) diff --git a/mavedb_mapping/blat_alignment.py b/mavedb_mapping/blat_alignment.py index 60b6dae..09041c9 100644 --- a/mavedb_mapping/blat_alignment.py +++ b/mavedb_mapping/blat_alignment.py @@ -21,7 +21,22 @@ def get_gene_symb(dat): return gsymb -def get_gene_data(return_chr: bool, dat: dict, gsymb: str): +def get_gene_data(gsymb: str): + if gsymb == "NA": + return "NA" + temp = qh.search(gsymb).source_matches + source_dict = {} + for i in range(len(temp)): + source_dict[temp[i].source] = i + return temp, source_dict + + +def get_hgnc_accession(temp, source_dict): + accession = temp[source_dict["HGNC"]].records[0].concept_id + return accession + + +def return_gene_data(return_chr: bool, temp, source_dict): """ Parameters ---------- @@ -45,12 +60,8 @@ def get_gene_data(return_chr: bool, dat: dict, gsymb: str): If gene symbol can be extracted and return_chr is False """ - if gsymb == "NA": + if temp == "NA": return "NA" - temp = qh.search(gsymb).source_matches - source_dict = {} - for i in range(len(temp)): - source_dict[temp[i].source] = i if "HGNC" in source_dict and return_chr == True: chrom = temp[source_dict["HGNC"]].records[0].locations[0].chr @@ -180,9 +191,13 @@ def get_query_and_hit_ranges(output, dat): chrom = strand = "" coverage = identity = None query_ranges = hit_ranges = list() + gsymb = get_gene_symb(dat) + temp, source_dict = get_gene_data(gsymb) + accession = get_hgnc_accession(temp, source_dict) + for c in range(len(output)): - correct_chr = get_gene_data(dat=dat, return_chr=True, gsymb=gsymb) + correct_chr = return_gene_data(True, temp, source_dict) if correct_chr == output[c].id.strip("chr"): use_chr = True break @@ -200,7 +215,7 @@ def get_query_and_hit_ranges(output, dat): else: hit = c - loc_dict = get_gene_data(False, dat, gsymb) + loc_dict = return_gene_data(False, temp, source_dict) hit_starts = list() for n in range(len(output[hit])): @@ -247,37 +262,7 @@ def get_query_and_hit_ranges(output, dat): query_ranges.append(query_string[2]) hit_ranges.append(hit_string[4]) - return chrom, strand, coverage, identity, query_ranges, hit_ranges, gsymb - - -def check_non_human(mave_blat_dict, min_percentage=80): - # for dna min % = 95 - # for prot min % = 80 - # as per BLAT website: "BLAT on DNA is designed to quickly find sequences of 95% and grent or shorter sequence alignments. BLAT on proteins finds sequences of 80% and greater similarity of length 20 amino acids or more. - """ - Checks if a sample is human or non-human based on the Mave-BLAT dictionary. - - Parameters - ---------- - - mave_blat_dict: dict - Dicitionary containing data after doing BLAT Alignment - - min_percent: int - Minimum percentage coverage to consider a sample as human. - - Returns - ------- - str: "human" if the sample is human, "Non human" otherwise. - """ - cov = mave_blat_dict["coverage"] - if cov == "NA": - return "Non human" - else: - if cov < min_percentage: - return "Non human" - else: - return "human" + return chrom, strand, coverage, identity, query_ranges, hit_ranges, gsymb, accession def mave_to_blat(dat): @@ -306,6 +291,7 @@ def mave_to_blat(dat): query_ranges, hit_ranges, gsymb, + accession, ) = get_query_and_hit_ranges(output, dat) qh_dat = {"query_ranges": query_ranges, "hit_ranges": hit_ranges} qh_dat = pd.DataFrame(data=qh_dat) @@ -320,6 +306,7 @@ def mave_to_blat(dat): "identity": identity, "hits": qh_dat, "gsymb": gsymb, + "accession": accession, } else: @@ -336,6 +323,7 @@ def mave_to_blat(dat): "identity": "NA", "hits": qh_dat, "gsymb": "NA", + "accession": "NA", } return mave_blat_dict From 7f2c1a0fc2c9d6bbbd4517734da5ccbe3243ac03 Mon Sep 17 00:00:00 2001 From: Samriddhi Singh Date: Tue, 18 Jul 2023 12:26:26 +0530 Subject: [PATCH 02/12] Updated transcript_selection.py --- mavedb_mapping/blat_alignment.py | 2 +- mavedb_mapping/transcript_selection.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/mavedb_mapping/blat_alignment.py b/mavedb_mapping/blat_alignment.py index 09041c9..da8bca4 100644 --- a/mavedb_mapping/blat_alignment.py +++ b/mavedb_mapping/blat_alignment.py @@ -5,7 +5,7 @@ import subprocess from gene.database import create_db -qh = QueryHandler(create_db("postgres://postgres@localhost:5432/gene_normalizer")) +qh = QueryHandler(create_db()) def get_gene_symb(dat): diff --git a/mavedb_mapping/transcript_selection.py b/mavedb_mapping/transcript_selection.py index cdfb8c2..996457b 100644 --- a/mavedb_mapping/transcript_selection.py +++ b/mavedb_mapping/transcript_selection.py @@ -283,6 +283,9 @@ def main(mave_blat_dict: dict, dat: dict) -> dict: if dat["target_type"] == "Protein coding" or dat["target_type"] == "protein_coding": if mave_blat_dict["chrom"] == "NA": raise Exception("No BLAT output") + if check_non_human(mave_blat_dict) == "Non human": + raise ValueError("Non Human Scoreset") + locs = get_locs_list(mave_blat_dict["hits"]) chrom = get_chr(dp, mave_blat_dict["chrom"]) gsymb = get_gsymb(dat) From ac1fad10226f9fa291a09554237c43bce44cb623 Mon Sep 17 00:00:00 2001 From: Samriddhi Singh Date: Thu, 27 Jul 2023 03:28:42 +0530 Subject: [PATCH 03/12] Updated BLAT Alignement code for case where gene symbol cannot be obtained --- mavedb_mapping/blat_alignment.py | 128 ++++++++++++++++--------------- 1 file changed, 66 insertions(+), 62 deletions(-) diff --git a/mavedb_mapping/blat_alignment.py b/mavedb_mapping/blat_alignment.py index 0b17f98..d535dc7 100644 --- a/mavedb_mapping/blat_alignment.py +++ b/mavedb_mapping/blat_alignment.py @@ -6,7 +6,6 @@ from mavedb_mapping import path_to_hg38_file - # TODO: edit docstrings def get_gene_symb(dat): try: @@ -39,7 +38,6 @@ def get_hgnc_accession(temp, source_dict): return accession - def get_sequence_interval(records): for record in records: for location in record.locations: @@ -144,69 +142,49 @@ def extract_blat_output(dat: dict): return output -def get_query_and_hit_ranges(output, dat): - """ - Extracts query and hit ranges from the BLAT output. - - Parameters - ---------- - output: - Output from the BLAT query. - dat: dict - Dictionary containing data required for mapping. - - Returns - ------- - Tuple containing the chromosome, strand, coverage, identity, query ranges, hit ranges, and gene symbol. - """ - hit_scores = list() - hit_dict = {} - use_chr = False - chrom = strand = "" - coverage = identity = None - query_ranges = hit_ranges = list() +def obtain_hit_starts(output, hit): + hit_starts = list() + for n in range(len(output[hit])): + hit_starts.append(output[hit][n].hit_start) + return hit_starts - gsymb = get_gene_symb(dat) - temp, source_dict = get_gene_data(gsymb) - accession = get_hgnc_accession(temp, source_dict) - correct_chr = return_gene_data(True, temp, source_dict) +def using_gene_normalizer(output, correct_chr, temp, source_dict): for c in range(len(output)): if correct_chr == output[c].id.strip("chr"): - use_chr = True + hit = c break - if ( - correct_chr == "NA" - ): # Take top scoring hit if target not found using gene normalizer - hit_scores = list() - for e in range(len(output[c])): - hit_scores.append(output[c][e].score) - hit_dict[c] = hit_scores - if use_chr == False: - for key in hit_dict: - hit_dict[key] = max(hit_dict[key]) - hit = max(hit_dict, key=hit_dict.get) - else: - hit = c - loc_dict = return_gene_data(False, temp, source_dict) + hit_starts = obtain_hit_starts(output, hit) + hsp = output[hit][ + hit_starts.index(min(hit_starts, key=lambda x: abs(x - loc_dict["start"]))) + ] + return hsp - hit_starts = list() - for n in range(len(output[hit])): - hit_starts.append(output[hit][n].hit_start) - - sub_scores = list() - for n in range(len(output[hit])): - sub_scores.append(output[hit][n].score) - if loc_dict == "NA": - hsp = output[hit][ - sub_scores.index(max(sub_scores)) - ] # Take top score if no match found - else: - hsp = output[hit][ - hit_starts.index(min(hit_starts, key=lambda x: abs(x - loc_dict["start"]))) - ] +def obtain_hsp(output): + hit_dict = {} + for c in range( + len(output) + ): # Take top scoring hit if target not found using gene normalizer + hit_scores = list() + for e in range(len(output[c])): + hit_scores.append(output[c][e].score) + hit_dict[c] = hit_scores + # maybe can include in above for loop + for key in hit_dict: + hit_dict[key] = max(hit_dict[key]) + hit = max(hit_dict, key=hit_dict.get) + hit_starts = obtain_hit_starts(output, hit) + hsp = output[hit][hit_starts.index(max(hit_starts))] + return hsp + + +def from_hsp_output(hsp, output, gsymb): + query_ranges = hit_ranges = [] + strand = hsp[0].query_strand + coverage = 100 * (hsp.query_end - hsp.query_start) / output.seq_len + identity = hsp.ident_pct for j in range(len(hsp)): test_file = open("blat_output_test.txt", "w") @@ -215,10 +193,6 @@ def get_query_and_hit_ranges(output, dat): query_string = "" hit_string = "" - strand = hsp[0].query_strand - coverage = 100 * (hsp.query_end - hsp.query_start) / output.seq_len - # coverage = f"{hsp.query_end - hsp.query_start} / {output.seq_len}, {coverage}" - identity = hsp.ident_pct test_file = open("blat_output_test.txt", "r") for k, line in enumerate(test_file): @@ -235,8 +209,38 @@ def get_query_and_hit_ranges(output, dat): hit_string = hit_string.split(" ") query_ranges.append(query_string[2]) hit_ranges.append(hit_string[4]) + return chrom, strand, coverage, identity, query_ranges, hit_ranges, gsymb, None + + +def get_query_and_hit_ranges(output, dat): + """ + Extracts query and hit ranges from the BLAT output. + + Parameters + ---------- + output: + Output from the BLAT query. + dat: dict + Dictionary containing data required for mapping. + + Returns + ------- + Tuple containing the chromosome, strand, coverage, identity, query ranges, hit ranges, and gene symbol. + """ + gsymb = get_gene_symb(dat) + temp, source_dict = get_gene_data(gsymb) + + if "HGNC" in source_dict and ( + source_dict.get("Ensembl") is not None or source_dict.get("NCBI") is not None + ): + # TODO: remove these conditions from return_gene_data + correct_chr = return_gene_data(True, temp, source_dict) + hsp = using_gene_normalizer(output, correct_chr, temp, source_dict) + else: + hsp = obtain_hsp(output) - return chrom, strand, coverage, identity, query_ranges, hit_ranges, gsymb, accession + data_tuple = from_hsp_output(hsp, output, gsymb) + return data_tuple def mave_to_blat(dat): From f2a6799a7a5a42b05296caae1508fe3c4d15c91b Mon Sep 17 00:00:00 2001 From: Samriddhi Singh Date: Sat, 29 Jul 2023 01:55:52 +0530 Subject: [PATCH 04/12] Added some docstrings --- mavedb_mapping/blat_alignment.py | 41 ++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/mavedb_mapping/blat_alignment.py b/mavedb_mapping/blat_alignment.py index d535dc7..c0a8790 100644 --- a/mavedb_mapping/blat_alignment.py +++ b/mavedb_mapping/blat_alignment.py @@ -8,6 +8,23 @@ # TODO: edit docstrings def get_gene_symb(dat): + """ + Obtains gene symbol using gene normalizer + + Parameters + ---------- + dat: dict + Dictionary containing data required for mapping from MAVEDB scoreset. + + + Returns: + -------- + str: + Gene symbol + + If gene symbol cannot be extracted, returns None. + + """ try: uniprot = dat["uniprot_id"] gsymb = qh.normalize(str(f"uniprot:{uniprot}")).gene_descriptor.label @@ -16,7 +33,7 @@ def get_gene_symb(dat): target = dat["target"].split(" ")[0] gsymb = qh.normalize(target).gene_descriptor.label except: - return "NA" + return None return gsymb @@ -39,6 +56,10 @@ def get_hgnc_accession(temp, source_dict): def get_sequence_interval(records): + """ + Helper function to obtain gene data + + """ for record in records: for location in record.locations: if location.interval.type == "SequenceInterval": @@ -56,11 +77,10 @@ def return_gene_data(return_chr: bool, temp, source_dict): return_chr :bool If True, returns chromosome information. - dat: dict - Dictionary containing data required for mapping. + temp: data obtained from gene normalizer - gsymb: str - Gene symbol. + source_dict: dict + Dictionary of sources (NCBI, HGNC, Ensembl) Returns: @@ -102,10 +122,10 @@ def return_gene_data(return_chr: bool, temp, source_dict): def extract_blat_output(dat: dict): """ + Runs a BLAT Query and returns the output + Parameters ---------- - return_chr :bool - If True, returns chromosome information. dat: dict Dictionary containing data required for mapping. @@ -113,12 +133,7 @@ def extract_blat_output(dat: dict): Returns: -------- - str: - If return_chr is True, returns the chromosome value as a string. - If gene symbol cannot be extracted, returns 'NA'. - OR - dict: - If gene symbol can be extracted and return_chr is False + BLAT Output """ blat_file = open("blat_query.fa", "w") blat_file.write(">" + dat["target"] + "\n") From 930acd80ca659038bc9d84600ed9757726925c9d Mon Sep 17 00:00:00 2001 From: Samriddhi Singh Date: Tue, 1 Aug 2023 11:12:15 +0530 Subject: [PATCH 05/12] Added tests directory --- mavedb_mapping/__init__.py | 2 +- {mavedb_mapping/tests => tests}/conftest.py | 0 {mavedb_mapping/tests => tests}/data/00000001-a-4.json | 0 {mavedb_mapping/tests => tests}/data/00000041-a-1.json | 0 {mavedb_mapping/tests => tests}/data/scores-00000001-a-4 | 0 {mavedb_mapping/tests => tests}/data/scores-00000041-a-1 | 0 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000001-a-4 | 0 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000004-a-1 | 0 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000010-a-1 | 0 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000018-a-1 | 0 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000041-a-1 | 0 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000041-b-1 | 0 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000060-a-1 | 0 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000083-e-1 | 0 {mavedb_mapping/tests => tests}/data/urn:mavedb:00000097-c-1 | 0 {mavedb_mapping/tests => tests}/test_for_blat.py | 4 ++-- {mavedb_mapping/tests => tests}/test_for_invalid_inputs.py | 0 {mavedb_mapping/tests => tests}/test_for_mapped_variants.py | 0 .../tests => tests}/test_for_transcript_selection.py | 0 19 files changed, 3 insertions(+), 3 deletions(-) rename {mavedb_mapping/tests => tests}/conftest.py (100%) rename {mavedb_mapping/tests => tests}/data/00000001-a-4.json (100%) rename {mavedb_mapping/tests => tests}/data/00000041-a-1.json (100%) rename {mavedb_mapping/tests => tests}/data/scores-00000001-a-4 (100%) rename {mavedb_mapping/tests => tests}/data/scores-00000041-a-1 (100%) rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000001-a-4 (100%) rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000004-a-1 (100%) rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000010-a-1 (100%) rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000018-a-1 (100%) rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000041-a-1 (100%) rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000041-b-1 (100%) rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000060-a-1 (100%) rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000083-e-1 (100%) rename {mavedb_mapping/tests => tests}/data/urn:mavedb:00000097-c-1 (100%) rename {mavedb_mapping/tests => tests}/test_for_blat.py (96%) rename {mavedb_mapping/tests => tests}/test_for_invalid_inputs.py (100%) rename {mavedb_mapping/tests => tests}/test_for_mapped_variants.py (100%) rename {mavedb_mapping/tests => tests}/test_for_transcript_selection.py (100%) diff --git a/mavedb_mapping/__init__.py b/mavedb_mapping/__init__.py index e6a02f3..feaa027 100644 --- a/mavedb_mapping/__init__.py +++ b/mavedb_mapping/__init__.py @@ -7,7 +7,7 @@ path_to_seqrepo = os.getenv("PATH_TO_SEQREPO", "/usr/local/share/seqrepo/latest") path_to_hg38_file = os.getenv("HG38_FILE", "hg38.2bit") # default- in current directory -data_file_path = "mavedb_mapping/tests/data/" +data_file_path = "tests/data/" # utadb = UTADatabase(db_pwd="uta") qh = QueryHandler(create_db()) diff --git a/mavedb_mapping/tests/conftest.py b/tests/conftest.py similarity index 100% rename from mavedb_mapping/tests/conftest.py rename to tests/conftest.py diff --git a/mavedb_mapping/tests/data/00000001-a-4.json b/tests/data/00000001-a-4.json similarity index 100% rename from mavedb_mapping/tests/data/00000001-a-4.json rename to tests/data/00000001-a-4.json diff --git a/mavedb_mapping/tests/data/00000041-a-1.json b/tests/data/00000041-a-1.json similarity index 100% rename from mavedb_mapping/tests/data/00000041-a-1.json rename to tests/data/00000041-a-1.json diff --git a/mavedb_mapping/tests/data/scores-00000001-a-4 b/tests/data/scores-00000001-a-4 similarity index 100% rename from mavedb_mapping/tests/data/scores-00000001-a-4 rename to tests/data/scores-00000001-a-4 diff --git a/mavedb_mapping/tests/data/scores-00000041-a-1 b/tests/data/scores-00000041-a-1 similarity index 100% rename from mavedb_mapping/tests/data/scores-00000041-a-1 rename to tests/data/scores-00000041-a-1 diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000001-a-4 b/tests/data/urn:mavedb:00000001-a-4 similarity index 100% rename from mavedb_mapping/tests/data/urn:mavedb:00000001-a-4 rename to tests/data/urn:mavedb:00000001-a-4 diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000004-a-1 b/tests/data/urn:mavedb:00000004-a-1 similarity index 100% rename from mavedb_mapping/tests/data/urn:mavedb:00000004-a-1 rename to tests/data/urn:mavedb:00000004-a-1 diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000010-a-1 b/tests/data/urn:mavedb:00000010-a-1 similarity index 100% rename from mavedb_mapping/tests/data/urn:mavedb:00000010-a-1 rename to tests/data/urn:mavedb:00000010-a-1 diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000018-a-1 b/tests/data/urn:mavedb:00000018-a-1 similarity index 100% rename from mavedb_mapping/tests/data/urn:mavedb:00000018-a-1 rename to tests/data/urn:mavedb:00000018-a-1 diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000041-a-1 b/tests/data/urn:mavedb:00000041-a-1 similarity index 100% rename from mavedb_mapping/tests/data/urn:mavedb:00000041-a-1 rename to tests/data/urn:mavedb:00000041-a-1 diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000041-b-1 b/tests/data/urn:mavedb:00000041-b-1 similarity index 100% rename from mavedb_mapping/tests/data/urn:mavedb:00000041-b-1 rename to tests/data/urn:mavedb:00000041-b-1 diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000060-a-1 b/tests/data/urn:mavedb:00000060-a-1 similarity index 100% rename from mavedb_mapping/tests/data/urn:mavedb:00000060-a-1 rename to tests/data/urn:mavedb:00000060-a-1 diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000083-e-1 b/tests/data/urn:mavedb:00000083-e-1 similarity index 100% rename from mavedb_mapping/tests/data/urn:mavedb:00000083-e-1 rename to tests/data/urn:mavedb:00000083-e-1 diff --git a/mavedb_mapping/tests/data/urn:mavedb:00000097-c-1 b/tests/data/urn:mavedb:00000097-c-1 similarity index 100% rename from mavedb_mapping/tests/data/urn:mavedb:00000097-c-1 rename to tests/data/urn:mavedb:00000097-c-1 diff --git a/mavedb_mapping/tests/test_for_blat.py b/tests/test_for_blat.py similarity index 96% rename from mavedb_mapping/tests/test_for_blat.py rename to tests/test_for_blat.py index e37731d..8e37613 100644 --- a/mavedb_mapping/tests/test_for_blat.py +++ b/tests/test_for_blat.py @@ -2,10 +2,10 @@ from mavedb_mapping.blat_alignment import mave_to_blat from mavedb_mapping.metadata_process import metadata_obtain from mavedb_mapping.transcript_selection_helper import HelperFunctionsForBLATOutput - +from mavedb_mapping import data_file_path """Tests that run check_non_human function after BLAT Alignment to determine if scoreset is Human""" -data_file_path = "mavedb_mapping/tests/data/" + @pytest.fixture diff --git a/mavedb_mapping/tests/test_for_invalid_inputs.py b/tests/test_for_invalid_inputs.py similarity index 100% rename from mavedb_mapping/tests/test_for_invalid_inputs.py rename to tests/test_for_invalid_inputs.py diff --git a/mavedb_mapping/tests/test_for_mapped_variants.py b/tests/test_for_mapped_variants.py similarity index 100% rename from mavedb_mapping/tests/test_for_mapped_variants.py rename to tests/test_for_mapped_variants.py diff --git a/mavedb_mapping/tests/test_for_transcript_selection.py b/tests/test_for_transcript_selection.py similarity index 100% rename from mavedb_mapping/tests/test_for_transcript_selection.py rename to tests/test_for_transcript_selection.py From 6c7f2cd3366c117eec08521e6a3b095ce1b6018c Mon Sep 17 00:00:00 2001 From: Samriddhi Singh Date: Tue, 1 Aug 2023 11:41:28 +0530 Subject: [PATCH 06/12] Added workflow --- .github/workflows/python-package.yml | 29 ++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .github/workflows/python-package.yml diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..ff4e7c5 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,29 @@ +name: Python package + +on: [push] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11"] + env: + PATH_TO_SEQREPO: ./tests/data/seqrepo/latest + HG38_FILE: ./tests/data/hg38.2bit + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r .requirements.txt + - name: Test with pytest + run: | + export PYTHONPATH="${PYTHONPATH}:$(pwd)" + pytest From a99afb254eb38c3a397af6a991db6853b6018cb3 Mon Sep 17 00:00:00 2001 From: Samriddhi Singh Date: Tue, 1 Aug 2023 11:44:31 +0530 Subject: [PATCH 07/12] added requirements.txt --- requirements.txt | 149 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f587c4d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,149 @@ +-i https://pypi.org/simple +aiofiles>=22.1.0 ; python_version >= '3.7' and python_version < '4.0' +anyio>=3.6.2 ; python_full_version >= '3.6.2' +appdirs>=1.4.4 +appnope>=0.1.3 ; platform_system >= 'Darwin' +argcomplete>=2.0.0 ; python_version >= '3.6' +argh>=0.26.2 +argon2-cffi>=21.3.0 ; python_version >= '3.6' +argon2-cffi-bindings>=21.2.0 ; python_version >= '3.6' +asttokens>=2.1.0 +asyncio>=3.4.3 +asyncpg>=0.27.0 ; python_full_version >= '3.7.0' +attrs>=22.1.0 ; python_version >= '3.5' +babel>=2.11.0 ; python_version >= '3.6' +backcall>=0.2.0 +beautifulsoup4>=4.11.1 ; python_full_version >= '3.6.0' +biocommons.seqrepo>=0.6.5 +biopython>=1.80 +bioutils>=0.5.7 ; python_version >= '3.6' +bleach>=5.0.1 ; python_version >= '3.7' +boto3>=1.26.16 ; python_version >= '3.7' +botocore>=1.29.16 ; python_version >= '3.7' +bs4>=0.0.1 +canonicaljson>=1.6.4 ; python_version >= '3.7' +certifi>=2022.9.24 ; python_version >= '3.6' +cffi>=1.15.1 +charset-normalizer>=2.1.1 ; python_full_version >= '3.6.0' +click>=8.1.3 ; python_version >= '3.7' +coloredlogs>=15.0.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +configparser>=5.3.0 ; python_version >= '3.7' +cool-seq-tool>=0.1.4 +cssselect>=1.2.0 ; python_version >= '3.7' +debugpy>=1.6.3 ; python_version >= '3.7' +decorator>=5.1.1 ; python_version >= '3.5' +defusedxml>=0.7.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +entrypoints>=0.4 ; python_version >= '3.6' +executing>=1.2.0 +fake-useragent>=1.0.1 +fastapi>=0.87.0 ; python_version >= '3.7' +fastjsonschema>=2.16.2 +ga4gh.vrs[extras]>=0.8.7.dev0 ; python_version >= '3.6' +ga4gh.vrsatile.pydantic>=0.0.13 +gene-normalizer>=0.1.37 +gffutils>=0.11.1 +h11>=0.14.0 ; python_version >= '3.7' +hgvs>=1.5.4 ; python_version >= '3.6' +humanfriendly>=10.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +idna>=3.4 ; python_version >= '3.5' +importlib-metadata>=5.0.0 ; python_version >= '3.7' +inflection>=0.5.1 ; python_version >= '3.5' +ipykernel>=6.17.1 ; python_version >= '3.8' +ipython>=8.6.0 ; python_version >= '3.8' +ipython-genutils>=0.2.0 +ipywidgets>=8.0.2 ; python_version >= '3.7' +jedi>=0.18.2 ; python_version >= '3.6' +jinja2>=3.1.2 ; python_version >= '3.7' +jmespath>=1.0.1 ; python_version >= '3.7' +json5>=0.9.10 +jsonschema>=3.2.0 +jupyter>=1.0.0 +jupyter-client>=7.4.7 ; python_version >= '3.7' +jupyter-console>=6.4.4 ; python_version >= '3.7' +jupyter-core>=5.0.0 ; python_version >= '3.8' +jupyter-server>=1.23.3 ; python_version >= '3.7' +jupyterlab>=3.5.0 +jupyterlab-pygments>=0.2.2 ; python_version >= '3.7' +jupyterlab-server>=2.16.3 ; python_version >= '3.7' +jupyterlab-widgets>=3.0.3 ; python_version >= '3.7' +lxml>=4.9.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +markdown>=3.4.1 ; python_version >= '3.7' +markupsafe>=2.1.1 ; python_version >= '3.7' +matplotlib-inline>=0.1.6 ; python_version >= '3.5' +mistune>=2.0.4 +nbclassic>=0.4.8 ; python_version >= '3.7' +nbclient>=0.7.0 ; python_full_version >= '3.7.0' +nbconvert>=7.2.5 ; python_version >= '3.7' +nbformat>=5.7.0 ; python_version >= '3.7' +nest-asyncio>=1.5.6 +notebook>=6.5.2 ; python_version >= '3.7' +notebook-shim>=0.2.2 ; python_version >= '3.7' +numpy>=1.23.5 ; python_version >= '3.8' +packaging>=21.3 ; python_version >= '3.6' +pandas>=1.5.2 +pandocfilters>=1.5.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +parse>=1.19.0 +parsley>=1.3 +parso>=0.8.3 ; python_version >= '3.6' +pexpect>=4.8.0 ; sys_platform != 'win32' +pickleshare>=0.7.5 +platformdirs>=2.5.4 ; python_version >= '3.7' +plotly>=5.11.0 +prometheus-client>=0.15.0 ; python_version >= '3.6' +prompt-toolkit>=3.0.33 ; python_full_version >= '3.6.2' +psutil>=5.9.4 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +psycopg>=3.1.9 +psycopg2>=2.9.5 ; python_version >= '3.6' +psycopg2-binary>=2.9.5 +ptyprocess>=0.7.0 ; os_name != 'nt' +pure-eval>=0.2.2 +pycparser>=2.21 +pydantic>=1.10.2 ; python_version >= '3.7' +pyee>=8.2.2 +pyfaidx>=0.7.1 +pygments>=2.13.0 ; python_version >= '3.6' +pyliftover>=0.4 +pyparsing>=3.0.9 ; python_full_version >= '3.6.8' +pyppeteer>=1.0.2 ; python_version >= '3.7' and python_version < '4.0' +pyquery>=1.4.3 +pyrsistent>=0.19.2 ; python_version >= '3.7' +pysam>=0.18.0 +pytest>=6.0 +python-dateutil>=2.8.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +python-jsonschema-objects>=0.4.1 +pytz>=2022.6 +pyyaml>=6.0 ; python_version >= '3.6' +pyzmq>=24.0.1 ; python_version >= '3.6' +qtconsole>=5.4.0 ; python_version >= '3.7' +qtpy>=2.3.0 ; python_version >= '3.7' +requests>=2.28.1 +requests-html>=0.10.0 ; python_full_version >= '3.6.0' +s3transfer>=0.6.0 ; python_version >= '3.7' +send2trash>=1.8.0 +setuptools>=65.6.2 ; python_version >= '3.7' +simplejson>=3.18.0 ; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2, 3.3' +six>=1.16.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +sniffio>=1.3.0 ; python_version >= '3.7' +soupsieve>=2.3.2.post1 ; python_version >= '3.6' +sqlparse>=0.4.3 ; python_version >= '3.5' +stack-data>=0.6.1 +starlette>=0.21.0 ; python_version >= '3.7' +tabulate>=0.9.0 ; python_version >= '3.7' +tenacity>=8.1.0 ; python_version >= '3.6' +terminado>=0.17.0 ; python_version >= '3.7' +tinycss2>=1.2.1 ; python_version >= '3.7' +tomli>=2.0.1 ; python_version >= '3.7' +tornado>=6.2 ; python_version >= '3.7' +tqdm>=4.64.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +traitlets>=5.5.0 ; python_version >= '3.7' +typing-extensions>=4.4.0 ; python_version >= '3.7' +urllib3>=1.26.12 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' and python_version < '4' +uvicorn>=0.20.0 ; python_version >= '3.7' +w3lib>=2.0.1 ; python_version >= '3.6' +wcwidth>=0.2.5 +webencodings>=0.5.1 +websocket-client>=1.4.2 ; python_version >= '3.7' +websockets>=10.4 ; python_version >= '3.7' +widgetsnbextension>=4.0.3 ; python_version >= '3.7' +yoyo-migrations>=8.1.0 +zipp>=3.10.0 ; python_version >= '3.7' \ No newline at end of file From a220360f32e2b1e7b77e9567dd7f0b39ce5b7900 Mon Sep 17 00:00:00 2001 From: Samriddhi Singh Date: Tue, 1 Aug 2023 11:52:22 +0530 Subject: [PATCH 08/12] Edited workflow file --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index ff4e7c5..c07a728 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -22,7 +22,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -r .requirements.txt + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Test with pytest run: | export PYTHONPATH="${PYTHONPATH}:$(pwd)" From 6ac7fccea0b6aa896afeee1b147166404584335a Mon Sep 17 00:00:00 2001 From: Samriddhi Singh Date: Tue, 1 Aug 2023 11:57:21 +0530 Subject: [PATCH 09/12] Edited worflow file --- .github/workflows/python-package.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index c07a728..ab40f9f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,6 +12,7 @@ jobs: env: PATH_TO_SEQREPO: ./tests/data/seqrepo/latest HG38_FILE: ./tests/data/hg38.2bit + GENE_NORM_DB_URL: postgres://postgres@localhost:5432/gene_normalizer steps: - uses: actions/checkout@v3 From 808e01202537fc7117f7597e40b34eb558fd767e Mon Sep 17 00:00:00 2001 From: Samriddhi Singh Date: Wed, 2 Aug 2023 03:22:15 +0530 Subject: [PATCH 10/12] Docstrings --- .github/workflows/python-package.yml | 7 +- blat_out.psl | 4 +- blat_output_test.txt | 8 +- blat_query.fa | 4 +- mavedb_mapping/__init__.py | 10 ++- mavedb_mapping/blat_alignment.py | 107 +++++++++++++++++++-------- 6 files changed, 97 insertions(+), 43 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index ab40f9f..592ad8e 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,7 +12,6 @@ jobs: env: PATH_TO_SEQREPO: ./tests/data/seqrepo/latest HG38_FILE: ./tests/data/hg38.2bit - GENE_NORM_DB_URL: postgres://postgres@localhost:5432/gene_normalizer steps: - uses: actions/checkout@v3 @@ -24,7 +23,13 @@ jobs: run: | python -m pip install --upgrade pip if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Test with pytest + env: + PATH_TO_SEQREPO: ./tests/data/seqrepo/latest + HG38_FILE: ./tests/data/hg38.2bit + GENE_NORM_DB_URL: postgres://postgres@localhost:5432/gene_normalizer + run: | export PYTHONPATH="${PYTHONPATH}:$(pwd)" pytest diff --git a/blat_out.psl b/blat_out.psl index ee794c5..a5fb47c 100644 --- a/blat_out.psl +++ b/blat_out.psl @@ -3,6 +3,4 @@ psLayout version 3 match mis- rep. N's Q gap Q gap T gap T gap strand Q Q Q Q T T T T block blockSizes qStarts tStarts match match count bases count bases name size start end name size start end count --------------------------------------------------------------------------------------------------------------------------------------------------------------- -52 2 0 0 1 34 1 33 + UBE2I 477 337 425 chr2 242193529 30078144 30078231 2 37,17, 337,408, 30078144,30078214, -477 0 0 0 0 0 5 10286 + UBE2I 477 0 477 chr16 90338345 1314030 1324793 6 66,84,73,110,80,64, 0,66,150,223,333,413, 1314030,1314292,1315653,1320173,1320437,1324729, -23 0 0 0 0 0 0 0 - UBE2I 477 326 349 chr1 248956422 187003058 187003081 1 23, 128, 187003058, +179 18 0 0 0 0 1 332 + E4B 309 98 295 chr1 248956422 10179413 10179942 2 149,48, 98,247, 10179413,10179894, diff --git a/blat_output_test.txt b/blat_output_test.txt index 1beee5d..0fb8f46 100644 --- a/blat_output_test.txt +++ b/blat_output_test.txt @@ -1,5 +1,5 @@ - Query: UBE2I - Hit: chr16 -Query range: [413:477] (1) - Hit range: [1324729:1324793] (1) + Query: E4B + Hit: chr1 +Query range: [247:295] (1) + Hit range: [10179894:10179942] (1) Fragments: 1 (? columns) \ No newline at end of file diff --git a/blat_query.fa b/blat_query.fa index 14dab3f..f75c11f 100644 --- a/blat_query.fa +++ b/blat_query.fa @@ -1,2 +1,2 @@ ->UBE2I -ATGTCGGGGATCGCCCTCAGCAGACTCGCCCAGGAGAGGAAAGCATGGAGGAAAGACCACCCATTTGGTTTCGTGGCTGTCCCAACAAAAAATCCCGATGGCACGATGAACCTCATGAACTGGGAGTGCGCCATTCCAGGAAAGAAAGGGACTCCGTGGGAAGGAGGCTTGTTTAAACTACGGATGCTTTTCAAAGATGATTATCCATCTTCGCCACCAAAATGTAAATTCGAACCACCATTATTTCACCCGAATGTGTACCCTTCGGGGACAGTGTGCCTGTCCATCTTAGAGGAGGACAAGGACTGGAGGCCAGCCATCACAATCAAACAGATCCTATTAGGAATACAGGAACTTCTAAATGAACCAAATATCCAAGACCCAGCTCAAGCAGAGGCCTACACGATTTACTGCCAAAACAGAGTGGAGTACGAGAAAAGGGTCCGAGCACAAGCCAAGAAGTTTGCGCCCTCATAA +>E4B +ATAGAGAAGTTTAAACTTCTTGCAGAGAAAGTGGAGGAAATCGTGGCAAAGAATGCGCGGGCAGAAATAGACTACAGCGATGCCCCGGACGAGTTCAGAGACCCTCTGATGGACACCCTGATGACCGATCCCGTGAGACTGCCCTCTGGCACCGTCATGGACCGTTCTATCATCCTGCGGCATCTGCTCAACTCCCCCACCGACCCCTTCAACCGCCAGATGCTGACTGAGAGCATGCTGGAGCCAGTGCCAGAGCTAAAGGAGCAGATTCAGGCCTGGATGAGAGAGAAACAGAGCAGTGACCACTGA diff --git a/mavedb_mapping/__init__.py b/mavedb_mapping/__init__.py index feaa027..d23f556 100644 --- a/mavedb_mapping/__init__.py +++ b/mavedb_mapping/__init__.py @@ -1,10 +1,14 @@ + import os -from gene.query import QueryHandler -from gene.database import create_db from biocommons.seqrepo import SeqRepo -from ga4gh.vrs.dataproxy import SeqRepoDataProxy + from cool_seq_tool.data_sources.uta_database import UTADatabase +from ga4gh.vrs.dataproxy import SeqRepoDataProxy + +from gene.query import QueryHandler +from gene.database import create_db + path_to_seqrepo = os.getenv("PATH_TO_SEQREPO", "/usr/local/share/seqrepo/latest") path_to_hg38_file = os.getenv("HG38_FILE", "hg38.2bit") # default- in current directory data_file_path = "tests/data/" diff --git a/mavedb_mapping/blat_alignment.py b/mavedb_mapping/blat_alignment.py index c0a8790..6340036 100644 --- a/mavedb_mapping/blat_alignment.py +++ b/mavedb_mapping/blat_alignment.py @@ -6,10 +6,9 @@ from mavedb_mapping import path_to_hg38_file -# TODO: edit docstrings -def get_gene_symb(dat): +def get_gene_symb(dat:dict): """ - Obtains gene symbol using gene normalizer + Obtains gene symbol using gene normalizer. Parameters ---------- @@ -37,10 +36,24 @@ def get_gene_symb(dat): return gsymb -def get_gene_data(gsymb: str): - if gsymb == "NA": - return "NA", "NA" +def get_gene_data(gsymb: str)->tuple: + """ + Searches through QueryHandler using obtained gene symbol. + + Parameters + ---------- + gsymb: str + Gene symbol + Returns: + -------- + tuple: + source_dict: dictionary of sources with their index in temp + temp: QueryHandler search output + + """ + if not gsymb: + return (None, None) temp = qh.search(gsymb).source_matches source_dict = {} for i in range(len(temp)): @@ -48,7 +61,8 @@ def get_gene_data(gsymb: str): return temp, source_dict -def get_hgnc_accession(temp, source_dict): +def get_hgnc_accession(temp, source_dict:dict): + """Function that obtains HGNC accession id (like hgnc:12345)""" if temp == "NA": return "NA" accession = temp[source_dict["HGNC"]].records[0].concept_id @@ -70,17 +84,17 @@ def get_sequence_interval(records): return None -def return_gene_data(return_chr: bool, temp, source_dict): +def return_gene_data(return_chr: bool, temp, source_dict:dict): """ Parameters ---------- return_chr :bool If True, returns chromosome information. - temp: data obtained from gene normalizer - source_dict: dict - Dictionary of sources (NCBI, HGNC, Ensembl) + dictionary of sources with their index in temp + + temp: QueryHandler search output Returns: @@ -93,7 +107,7 @@ def return_gene_data(return_chr: bool, temp, source_dict): If gene symbol can be extracted and return_chr is False """ - if temp == "NA": + if not temp: return "NA" if "HGNC" in source_dict and return_chr == True: @@ -126,11 +140,9 @@ def extract_blat_output(dat: dict): Parameters ---------- - dat: dict Dictionary containing data required for mapping. - Returns: -------- BLAT Output @@ -158,13 +170,18 @@ def extract_blat_output(dat: dict): def obtain_hit_starts(output, hit): + #a hit is a portion of similarity between query seq and matched seq + """ + Helper function to obtain HSP + Returns the start of hit + """ hit_starts = list() for n in range(len(output[hit])): hit_starts.append(output[hit][n].hit_start) return hit_starts -def using_gene_normalizer(output, correct_chr, temp, source_dict): +def obtain_hsp_using_gene_normalizer(output, correct_chr, temp, source_dict): for c in range(len(output)): if correct_chr == output[c].id.strip("chr"): hit = c @@ -178,17 +195,27 @@ def using_gene_normalizer(output, correct_chr, temp, source_dict): def obtain_hsp(output): + """ + Obtains high-scoring pairs (HSP) for query sequence + + Parameters + ---------- + output: BLAT output + + Returns + ------- + HSP + + """ hit_dict = {} - for c in range( - len(output) - ): # Take top scoring hit if target not found using gene normalizer - hit_scores = list() + for c in range(len(output)): + max_hit = output[c][0].score for e in range(len(output[c])): - hit_scores.append(output[c][e].score) - hit_dict[c] = hit_scores - # maybe can include in above for loop - for key in hit_dict: - hit_dict[key] = max(hit_dict[key]) + if output[c][e].score>max_hit: + max_hit = output[c][e].score + hit_dict[c] = max_hit + + #Using top scoring hit hit = max(hit_dict, key=hit_dict.get) hit_starts = obtain_hit_starts(output, hit) hsp = output[hit][hit_starts.index(max(hit_starts))] @@ -196,6 +223,22 @@ def obtain_hsp(output): def from_hsp_output(hsp, output, gsymb): + """ + + Parameters + ---------- + hsp: + High scoring pairs obtained by using top scoring hit + output: + BLAT output + gsymb: str + Gene symbol + + Returns + ------- + Tuple with data used for mapping + + """ query_ranges = hit_ranges = [] strand = hsp[0].query_strand coverage = 100 * (hsp.query_end - hsp.query_start) / output.seq_len @@ -222,12 +265,17 @@ def from_hsp_output(hsp, output, gsymb): chrom = chrom.split(" ")[9].strip("chr") query_string = query_string.split(" ") hit_string = hit_string.split(" ") + + #Range in query sequence that aligned with hit sequence query_ranges.append(query_string[2]) + + #Range in hit sequence that aligned with hquery sequence hit_ranges.append(hit_string[4]) + return chrom, strand, coverage, identity, query_ranges, hit_ranges, gsymb, None -def get_query_and_hit_ranges(output, dat): +def get_query_and_hit_ranges(output, dat:dict)->tuple: """ Extracts query and hit ranges from the BLAT output. @@ -245,12 +293,11 @@ def get_query_and_hit_ranges(output, dat): gsymb = get_gene_symb(dat) temp, source_dict = get_gene_data(gsymb) - if "HGNC" in source_dict and ( - source_dict.get("Ensembl") is not None or source_dict.get("NCBI") is not None - ): + #if "HGNC" in source_dict and (source_dict.get("Ensembl") is not None or source_dict.get("NCBI") is not None): + if False: #temporary, will remove using_gene_normalizer # TODO: remove these conditions from return_gene_data correct_chr = return_gene_data(True, temp, source_dict) - hsp = using_gene_normalizer(output, correct_chr, temp, source_dict) + hsp = obtain_hsp_using_gene_normalizer(output, correct_chr, temp, source_dict) else: hsp = obtain_hsp(output) @@ -258,7 +305,7 @@ def get_query_and_hit_ranges(output, dat): return data_tuple -def mave_to_blat(dat): +def mave_to_blat(dat:dict)->dict: """ Performs BLAT Alignment on MaveDB scoreset data. From 90aba2b0429d4936b1bca3899de5756e371d19ec Mon Sep 17 00:00:00 2001 From: Samriddhi Singh Date: Thu, 3 Aug 2023 01:33:04 +0530 Subject: [PATCH 11/12] Transcript selection without usage of gene symbol, and retrieved gene symbol from MANE transcript --- blat_out.psl | 6 ---- blat_output_test.txt | 5 --- blat_query.fa | 2 -- mavedb_mapping/transcript_selection.py | 42 ++++++++++++-------------- 4 files changed, 19 insertions(+), 36 deletions(-) delete mode 100644 blat_out.psl delete mode 100644 blat_output_test.txt delete mode 100644 blat_query.fa diff --git a/blat_out.psl b/blat_out.psl deleted file mode 100644 index a5fb47c..0000000 --- a/blat_out.psl +++ /dev/null @@ -1,6 +0,0 @@ -psLayout version 3 - -match mis- rep. N's Q gap Q gap T gap T gap strand Q Q Q Q T T T T block blockSizes qStarts tStarts - match match count bases count bases name size start end name size start end count ---------------------------------------------------------------------------------------------------------------------------------------------------------------- -179 18 0 0 0 0 1 332 + E4B 309 98 295 chr1 248956422 10179413 10179942 2 149,48, 98,247, 10179413,10179894, diff --git a/blat_output_test.txt b/blat_output_test.txt deleted file mode 100644 index 0fb8f46..0000000 --- a/blat_output_test.txt +++ /dev/null @@ -1,5 +0,0 @@ - Query: E4B - Hit: chr1 -Query range: [247:295] (1) - Hit range: [10179894:10179942] (1) - Fragments: 1 (? columns) \ No newline at end of file diff --git a/blat_query.fa b/blat_query.fa deleted file mode 100644 index f75c11f..0000000 --- a/blat_query.fa +++ /dev/null @@ -1,2 +0,0 @@ ->E4B -ATAGAGAAGTTTAAACTTCTTGCAGAGAAAGTGGAGGAAATCGTGGCAAAGAATGCGCGGGCAGAAATAGACTACAGCGATGCCCCGGACGAGTTCAGAGACCCTCTGATGGACACCCTGATGACCGATCCCGTGAGACTGCCCTCTGGCACCGTCATGGACCGTTCTATCATCCTGCGGCATCTGCTCAACTCCCCCACCGACCCCTTCAACCGCCAGATGCTGACTGAGAGCATGCTGGAGCCAGTGCCAGAGCTAAAGGAGCAGATTCAGGCCTGGATGAGAGAGAAACAGAGCAGTGACCACTGA diff --git a/mavedb_mapping/transcript_selection.py b/mavedb_mapping/transcript_selection.py index ef2d733..9b2ed66 100644 --- a/mavedb_mapping/transcript_selection.py +++ b/mavedb_mapping/transcript_selection.py @@ -6,7 +6,7 @@ from Bio.Seq import Seq from bs4 import BeautifulSoup from mavedb_mapping.transcript_selection_helper import HelperFunctionsForBLATOutput -from mavedb_mapping import sr, qh, dp +from mavedb_mapping import sr, dp utadb = UTADatabase() mane = MANETranscriptMappings() @@ -15,7 +15,7 @@ nest_asyncio.apply() -async def mapq(locs: list, chrom: str, gsymb: str): +async def mapq(locs: list, chrom: str): """ Runs a query on UTADB to obtain transcripts Parameters @@ -40,8 +40,7 @@ async def mapq(locs: list, chrom: str, gsymb: str): for i in range(len(locs)): testquery = f"""select * from uta_20210129.tx_exon_aln_v - where hgnc = '{gsymb}' - and {locs[i][0]} between alt_start_i and alt_end_i + where {locs[i][0]} between alt_start_i and alt_end_i or {locs[i][1]} between alt_start_i and alt_end_i and alt_ac = '{chrom}'""" @@ -54,19 +53,6 @@ async def mapq(locs: list, chrom: str, gsymb: str): transcript_lists.append(tl) return transcript_lists - -def get_gsymb(dat): - try: - uniprot = dat["uniprot_id"] - gsymb = qh.normalize(str(f"uniprot:{uniprot}")).gene_descriptor.label - except: - temp = dat["target"].split(" ") - if temp[0] == "JAK": - temp[0] = "JAK1" - gsymb = qh.normalize(temp[0]).gene_descriptor.label - return gsymb - - def using_uniprot(uniprot: str, ts: str): """ Looks for transcripts using Uniprot ID @@ -101,7 +87,7 @@ def using_uniprot(uniprot: str, ts: str): start = up.find(stri[:10]) return start, full_match except: - # print(dat['urn'], 'no transcripts found') + #no transcripts foundÅ› return "NA", "NA" @@ -282,11 +268,14 @@ def main(mave_blat_dict: dict, dat: dict) -> dict: locs = helper.get_locs_list() chrom = helper.get_chr(dp) - gsymb = get_gsymb(dat) - ts = asyncio.run(mapq(locs, chrom, gsymb)) + ts = asyncio.run(mapq(locs, chrom)) try: - isect = list(set.intersection(*map(set, ts))) + for i in ts: + i = set(i) + intersection_set = set.intersection(set(i)) + isect = list(intersection_set) + except: start, full_match = using_uniprot(dat["uniprot_id"], dat["target_sequence"]) np = nm = status = "NA" @@ -298,13 +287,20 @@ def main(mave_blat_dict: dict, dat: dict) -> dict: "RefSeq_prot": "NA", "RefSeq_nuc": "NA", "status": "NA", - "gsymb": gsymb, + "gsymb": "NA", } return mappings_dict mane_trans = mane.get_mane_from_transcripts(isect) - + + if mane_trans != []: + #selecting correct transcript using chrom from transcripts obtained + for i in mane_trans: + if i['GRCh38_chr'] == chrom: + mane_trans = [i] + gsymb = mane_trans[0]["symbol"] + np, start, full_match, nm, status = from_mane_trans(dat, mane_trans) else: np, start, full_match, nm, status = no_mane_trans(isect, dat) From 8d5fba77e091d834bb76e309744902fa0bce5a70 Mon Sep 17 00:00:00 2001 From: samriddhi99 <115928307+samriddhi99@users.noreply.github.com> Date: Sat, 12 Aug 2023 11:30:53 +0530 Subject: [PATCH 12/12] File name as optional parameter Co-authored-by: James Stevenson --- mavedb_mapping/blat_alignment.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mavedb_mapping/blat_alignment.py b/mavedb_mapping/blat_alignment.py index 6340036..e6bb133 100644 --- a/mavedb_mapping/blat_alignment.py +++ b/mavedb_mapping/blat_alignment.py @@ -134,7 +134,7 @@ def return_gene_data(return_chr: bool, temp, source_dict:dict): return "NA" -def extract_blat_output(dat: dict): +def extract_blat_output(dat: dict, query_filename: str = "blat_query.fa"): """ Runs a BLAT Query and returns the output @@ -142,12 +142,14 @@ def extract_blat_output(dat: dict): ---------- dat: dict Dictionary containing data required for mapping. + blat_query: str + Filename for BLAT query file to save Returns: -------- BLAT Output """ - blat_file = open("blat_query.fa", "w") + blat_file = open(query_filename, "w") blat_file.write(">" + dat["target"] + "\n") blat_file.write(dat["target_sequence"] + "\n") blat_file.close()