From 422f097304d804dfefb1b538c68983447839b9f1 Mon Sep 17 00:00:00 2001 From: Samriddhi Singh Date: Fri, 11 Aug 2023 01:15:17 +0530 Subject: [PATCH] Added tests for transcript selection --- .github/workflows/python-package.yml | 1 - tests/conftest.py | 67 +++++++++++++++++++++ tests/data/urn:mavedb:00000047-c-1 | 1 + tests/data/urn:mavedb:00000091-a-1 | 1 + tests/test_for_mapped_variants.py | 1 - tests/test_for_transcript_selection.py | 82 +++++++++++++++++--------- 6 files changed, 122 insertions(+), 31 deletions(-) create mode 100644 tests/data/urn:mavedb:00000047-c-1 create mode 100644 tests/data/urn:mavedb:00000091-a-1 diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 508aac8..db845b4 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -25,7 +25,6 @@ jobs: env: PATH_TO_SEQREPO: ./tests/data/seqrepo/latest HG38_FILE: ./tests/data/hg38.2bit - GENE_NORM_DB_URL: postgres://postgres@localhost:5432/gene_normalizer run: | export UTA_PASSWORD=uta diff --git a/tests/conftest.py b/tests/conftest.py index 0a3e195..313a394 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,63 @@ from mavedb_mapping.vrs_mapping import vrs_mapping from mavedb_mapping import data_file_path +import requests +import pandas as pd + + +def blat_for_tests(dat): + """ + Function that mocks the BLAT Alignment outputs + """ + seq = dat["target_sequence"] + type_ = dat["target_sequence_type"] + database = "hg38" + blat_url = f"https://genome.ucsc.edu/cgi-bin/hgBlat?userSeq={seq}&type={type_}&db={database}&output=json" + response = requests.get(blat_url) + blat_output = response.json() + print(blat_output) + hsp = blat_output["blat"][0] + if "fix" in hsp[13]: + hsp = blat_output["blat"][1] + diff_list = hsp[-3].split(",") + hit_starts = hsp[-1].split(",") + query_starts = hsp[-2].split(",") + + cov = [int(x) for x in diff_list] + coverage = (sum(cov) / len(seq)) * 100 + + query_list = [] + for i in range(1, len(query_starts)): + temp = f"[{query_starts[i-1]}:{query_starts[i]}]" + query_list.append(temp) + temp = f"[{query_starts[-1]}:{int(query_starts[-1])+int(diff_list[-1])}]" + query_list.append(temp) + hit_list = [] + for i in range(len(query_starts)): + temp = f"[{hit_starts[i]}:{int(hit_starts[i])+int(diff_list[i])}]" + hit_list.append(temp) + data = {"query_ranges": query_list, "hit_ranges": hit_list} + result_df = pd.DataFrame(data) + if hsp[8] == "+": + strand = 1 + else: + strand = -1 + chr = hsp[13].replace("chr", "") + + mave_blat_dict = { + "urn": dat["urn"], + "chrom": chr, + "strand": strand, + "hits": result_df, + "coverage": coverage, + } + return mave_blat_dict + + +@pytest.fixture +def mock_fun1(monkeypatch): + monkeypatch.setattr("mavedb_mapping.blat_alignment.mave_to_blat", blat_for_tests) + @pytest.fixture( scope="package", params=["urn:mavedb:00000041-a-1", "urn:mavedb:00000001-a-4"] @@ -19,3 +76,13 @@ def full_mapping(request): scores_csv = open(scores_path) vrs_mapped = vrs_mapping(mave_dat, mappings_dict, mave_blat, scores_csv) return vrs_mapped, mave_dat["urn"] + + +@pytest.fixture() +def obtain_transcripts(request): + scoreset_path = f"{data_file_path}{request.param}" + with open(scoreset_path) as scoreset: + mave_dat = metadata_obtain(scoreset) + mave_blat = mave_to_blat(mave_dat) + mappings_dict = main(mave_blat, mave_dat) + return mappings_dict diff --git a/tests/data/urn:mavedb:00000047-c-1 b/tests/data/urn:mavedb:00000047-c-1 new file mode 100644 index 0000000..a79d0a6 --- /dev/null +++ b/tests/data/urn:mavedb:00000047-c-1 @@ -0,0 +1 @@ +{"title":"CCR5 HIV binding","methodText":"Data obtained from sorting cells for both surface expression and HIV-1~BaL~ gp120-CD4 interaction was analysed using Enrich (version unspecified). Log~2~ enrichment ratios were calculated and normalised by subtracting the frequency of the WT sequence. Log~2~ enrichment ratios for two replicates were averaged to obtain variant scores. Note that the scores here were not reported in the manuscript tables, but were calculated from the replicate enrichment ratios that were reported.","abstractText":"This experiment utilised site-saturation mutagenesis (SSM) to measure the functional consequences of mutations in the human chemokine receptor, CCR5 and to map ligand interaction sites. Cells were selected for binding to HIV-1~BaL~ gp120-CD4.","shortDescription":"Deep mutational scan selecting for CCR5 binding to HIV-1(BaL) gp120-CD4 in Expi293F cells.","extraMetadata":{},"urn":"urn:mavedb:00000047-c-1","numVariants":7020,"experiment":{"title":"CCR5 HIV binding","shortDescription":"Deep mutational scan selecting for CCR5 binding to HIV-1(BaL) gp120-CD4 in Expi293F cells.","abstractText":"This experiment utilised site-saturation mutagenesis (SSM) to measure the functional consequences of mutations in the human chemokine receptor, CCR5 and to map ligand interaction sites. Cells were selected for binding to HIV-1~BaL~ gp120-CD4.","methodText":"The CCR5 variant library was generated by overlapping PCR using primers with an NNK codon, as described by Procko et al, 2013 and transfected into human Expi293F cells (a cell line lacking CCR5) using pCEP4.\r\nCCR5 interactions with chemokines (CCL3, CCL4, CCL5) were unable to be characterised by deep mutational scanning. Instead, cells were selected for interaction with an important protein ligand, the HIV-1 Env subunit gp120 following incubation with HIV-1~BaL~ gp120 fused to CD4 domains D1-D2.\r\nSurface expression was measured using anti-myc-Alexa 647.\r\nCell sorting was performed using a BD FACSAria II.\r\nTotal RNA was extracted and PCR-amplified before sequencing by Illumina MiSeq v3 or HiSeq 2500 to quantify variants. \r\nTwo replicate selections were performed.\r\n\r\nRaw data available from GEO under accession [GSE100368](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100368).","extraMetadata":{},"urn":"urn:mavedb:00000047-c","createdBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"modifiedBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"creationDate":"2020-11-11","modificationDate":"2020-11-11","publishedDate":"2020-11-11","experimentSetUrn":"urn:mavedb:00000047","scoreSetUrns":["urn:mavedb:00000047-c-1"],"doiIdentifiers":[],"primaryPublicationIdentifiers":[],"secondaryPublicationIdentifiers":[{"identifier":"29678950","id":34,"url":"http://www.ncbi.nlm.nih.gov/pubmed/29678950","referenceHtml":"Heredia JD, et al. Mapping Interaction Sites on Human Chemokine Receptors by Deep Mutational Scanning. J Immunol. 2018; 200:3825-3839."},{"identifier":"23827138","id":35,"url":"http://www.ncbi.nlm.nih.gov/pubmed/23827138","referenceHtml":"Procko E, et al. Computational design of a protein-based enzyme inhibitor. J Mol Biol. 2013; 425:3563-75."}],"rawReadIdentifiers":[],"keywords":[]},"license":{"longName":"CC0 (Public domain)","shortName":"CC0","link":"https://creativecommons.org/publicdomain/zero/1.0/","version":"1.0","id":1},"metaAnalyzesScoreSetUrns":[],"metaAnalyzedByScoreSetUrns":[],"doiIdentifiers":[],"primaryPublicationIdentifiers":[{"identifier":"29678950","id":34,"url":"http://www.ncbi.nlm.nih.gov/pubmed/29678950","referenceHtml":"Heredia JD, et al. Mapping Interaction Sites on Human Chemokine Receptors by Deep Mutational Scanning. J Immunol. 2018; 200:3825-3839."}],"secondaryPublicationIdentifiers":[],"publishedDate":"2020-11-11","creationDate":"2020-11-11","modificationDate":"2020-11-11","createdBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"modifiedBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"targetGene":{"name":"CCR5","category":"Protein coding","externalIdentifiers":[{"identifier":{"dbName":"UniProt","identifier":"P51681","url":"http://purl.uniprot.org/uniprot/P51681"},"offset":1}],"referenceMaps":[{"id":156,"genomeId":5,"targetId":156,"isPrimary":false,"genome":{"shortName":"hg38","organismName":"Homo sapiens","creationDate":"2018-06-22","modificationDate":"2018-06-22","id":5},"creationDate":"2020-11-11","modificationDate":"2020-11-11"}],"wtSequence":{"sequenceType":"dna","sequence":"GATTATCAAGTGTCAAGTCCAATCTATGACATCAATTATTATACATCGGAGCCCTGCCAAAAAATCAATGTGAAGCAAATCGCAGCCCGCCTCCTGCCTCCGCTCTACTCACTGGTGTTCATCTTTGGTTTTGTGGGCAACATGCTGGTCATCCTCATCCTGATAAACTGCAAAAGGCTGAAGAGCATGACTGACATCTACCTGCTCAACCTGGCCATCTCTGACCTGTTTTTCCTTCTTACTGTCCCCTTCTGGGCTCACTATGCTGCCGCCCAGTGGGACTTTGGAAATACAATGTGTCAACTCTTGACAGGGCTCTATTTTATAGGCTTCTTCTCTGGAATCTTCTTCATCATCCTCCTGACAATCGATAGGTACCTGGCTGTCGTCCATGCTGTGTTTGCTTTAAAAGCCAGGACGGTCACCTTTGGGGTGGTGACAAGTGTGATCACTTGGGTGGTGGCTGTGTTTGCGTCTCTCCCAGGAATCATCTTTACCAGATCTCAAAAAGAAGGTCTTCATTACACCTGCAGCTCTCATTTTCCATACAGTCAGTATCAATTCTGGAAGAATTTCCAGACATTAAAGATAGTCATCTTGGGGCTGGTCCTGCCGCTGCTTGTCATGGTCATCTGCTACTCGGGAATCCTAAAAACTCTGCTTCGGTGTCGAAATGAGAAGAAGAGGCACAGGGCTGTGAGGCTTATCTTCACCATCATGATTGTTTATTTTCTCTTCTGGGCTCCCTACAACATTGTCCTTCTCCTGAACACCTTCCAGGAATTCTTTGGCCTGAATAATTGCAGTAGCTCTAACAGGTTGGACCAAGCTATGCAGGTGACAGAGACTCTTGGGATGACGCACTGCTGCATCAACCCCATCATCTATGCCTTTGTCGGGGAGAAGTTCAGAAACTACCTCTTAGTCTTCTTCCAAAAGCACATTGCCAAACGCTTCTGCAAATGCTGTTCTATTTTCCAGCAAGAGGCTCCCGAGCGAGCAAGCTCAGTTTACACCCGATCCACTGGGGAGCAGGAAATATCTGTGGGCTTG"}},"datasetColumns":{"countColumns":[],"scoreColumns":["score","rep1","rep2"]},"keywords":[],"private":false} \ No newline at end of file diff --git a/tests/data/urn:mavedb:00000091-a-1 b/tests/data/urn:mavedb:00000091-a-1 new file mode 100644 index 0000000..224eddd --- /dev/null +++ b/tests/data/urn:mavedb:00000091-a-1 @@ -0,0 +1 @@ +{"title":"BRAF resistance to vemurafenib","methodText":"Log enrichments for the two replicates were calculated and the median enrichment for each amino acid (across all synonymous codons) are reported here.\r\n\r\nOnly a small subset of variants were reported.","abstractText":"To search for possible secondary mutations that that can confer drug resistance in the context of BRAF V600E, the authors performed a deep mutation scan of human BRAF, selecting for enrichment in B3/F cells surviving vemurafenib treatment.","shortDescription":"Deep mutational scan of the BRAF V600E allele for resistance to the cancer drug vemurafenib.","extraMetadata":{},"urn":"urn:mavedb:00000091-a-1","numVariants":73,"experiment":{"title":"BRAF resistance to vemurafenib","shortDescription":"Deep mutational scan of the BRAF V600E allele for resistance to the cancer drug vemurafenib.","abstractText":"To search for possible secondary mutations that that can confer drug resistance in the context of BRAF V600E, the authors performed a deep mutation scan of human BRAF, selecting for enrichment in B3/F cells surviving vemurafenib treatment.","methodText":"Saturation mutagenesis of 77 amino acids surrounding the PLX4720‐binding site. Eight mutant pools were generated—corresponding to amino acids 458–466, 467–476, 477–486, 501–510, 511–520, 527–536, 579–587, and 588–596—in which each amino acid was mutated to all possible 64 codons. The mutant pools were transferred into a retroviral vector containing BRAFV600E and then stably transduced into the human melanoma cell line A375.\r\n\r\nAfter transfection with mutant BRAF, cells were treated with vemurafenib. Cells surviving vemurafenib treatment were then sequenced to determine the enrichment of resistance mutations.","extraMetadata":{},"urn":"urn:mavedb:00000091-a","createdBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"modifiedBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"creationDate":"2021-11-13","modificationDate":"2021-11-13","publishedDate":"2021-11-13","experimentSetUrn":"urn:mavedb:00000091","scoreSetUrns":["urn:mavedb:00000091-a-1"],"doiIdentifiers":[],"primaryPublicationIdentifiers":[{"identifier":"24112705","id":71,"url":"http://www.ncbi.nlm.nih.gov/pubmed/24112705","referenceHtml":"Wagenaar TR, et al. Resistance to vemurafenib resulting from a novel mutation in the BRAFV600E kinase domain. Resistance to vemurafenib resulting from a novel mutation in the BRAFV600E kinase domain. 2014; 27:124-33. doi: 10.1111/pcmr.12171"}],"secondaryPublicationIdentifiers":[],"rawReadIdentifiers":[],"keywords":[]},"license":{"longName":"CC0 (Public domain)","shortName":"CC0","link":"https://creativecommons.org/publicdomain/zero/1.0/","version":"1.0","id":1},"metaAnalyzesScoreSetUrns":[],"metaAnalyzedByScoreSetUrns":[],"doiIdentifiers":[],"primaryPublicationIdentifiers":[{"identifier":"24112705","id":71,"url":"http://www.ncbi.nlm.nih.gov/pubmed/24112705","referenceHtml":"Wagenaar TR, et al. Resistance to vemurafenib resulting from a novel mutation in the BRAFV600E kinase domain. Resistance to vemurafenib resulting from a novel mutation in the BRAFV600E kinase domain. 2014; 27:124-33. doi: 10.1111/pcmr.12171"}],"secondaryPublicationIdentifiers":[],"publishedDate":"2021-11-13","creationDate":"2021-11-13","modificationDate":"2021-11-13","createdBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"modifiedBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"targetGene":{"name":"BRAF","category":"Protein coding","externalIdentifiers":[{"identifier":{"dbName":"UniProt","identifier":"P15056","url":"http://purl.uniprot.org/uniprot/P15056"},"offset":476}],"referenceMaps":[{"id":283,"genomeId":4,"targetId":283,"isPrimary":false,"genome":{"shortName":"hg19","organismName":"Homo sapiens","creationDate":"2018-06-22","modificationDate":"2018-06-22","id":4},"creationDate":"2021-11-13","modificationDate":"2021-11-13"}],"wtSequence":{"sequenceType":"protein","sequence":"HGDVAVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHHLHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDF"}},"datasetColumns":{"countColumns":[],"scoreColumns":["score"]},"keywords":[],"private":false} \ No newline at end of file diff --git a/tests/test_for_mapped_variants.py b/tests/test_for_mapped_variants.py index 5dae6c4..a3b4531 100644 --- a/tests/test_for_mapped_variants.py +++ b/tests/test_for_mapped_variants.py @@ -10,7 +10,6 @@ def get_sample_mapping_data(urn): # Sample Mapping file mapped_file = open(f"{data_file_path}{urn[11:]}.json") mapped_example = json.load(mapped_file) - print(mapped_example.keys()) # Sequence id from sample mapping file sample_sequence_id = mapped_example["mapped_scores"][0]["pre_mapped"]["location"][ diff --git a/tests/test_for_transcript_selection.py b/tests/test_for_transcript_selection.py index d4341e7..1ea3221 100644 --- a/tests/test_for_transcript_selection.py +++ b/tests/test_for_transcript_selection.py @@ -2,39 +2,63 @@ from mavedb_mapping.metadata_process import metadata_obtain from mavedb_mapping.blat_alignment import mave_to_blat from mavedb_mapping.transcript_selection import main -from mavedb_mapping import data_file_path -# Data generated by notebooks -transcripts_dict = { - "urn:mavedb:00000041-a-1": ["NP_938033.1", "NM_198291.3"], - "urn:mavedb:00000041-b-1": ["NP_938033.1", "NM_198291.3"], -} +@pytest.mark.parametrize( + "obtain_transcripts", ["urn:mavedb:00000041-a-1"], indirect=True +) +def test_for_mane_select(obtain_transcripts): + """ + Tests for transcript selections. + Compares computed values of RefSeq Protein and Nucleotide ID with those obtained by the notebooks + Also checks for status of Transcripts found + """ + # Checking for RefSeq protien ID + computed_prot = obtain_transcripts["RefSeq_prot"] + assert computed_prot == "NP_938033.1" + + # Checking for RefSeq Nucleotide ID + computed_nuc = obtain_transcripts["RefSeq_nuc"] + assert computed_nuc == "NM_198291.3" + + assert obtain_transcripts["status"] == "MANE Select" + + +@pytest.mark.parametrize( + "obtain_transcripts", ["urn:mavedb:00000091-a-1"], indirect=True +) +def test_for_mane_plus_clinical(obtain_transcripts): + """ + Tests for transcript selections. + Compares computed values of RefSeq Protein and Nucleotide ID with those obtained by the notebooks + Also checks for status of Transcripts found + """ + # Checking for RefSeq protien ID + computed_prot = obtain_transcripts["RefSeq_prot"] + assert computed_prot == "NP_004324.2" + + # Checking for RefSeq Nucleotide ID + computed_nuc = obtain_transcripts["RefSeq_nuc"] + assert computed_nuc == "NM_004333.6" -@pytest.fixture( - params=[ - "urn:mavedb:00000041-a-1", - "urn:mavedb:00000041-b-1", - ] + assert obtain_transcripts["status"] == "MANE Plus Clincial" + + +@pytest.mark.parametrize( + "obtain_transcripts", ["urn:mavedb:00000047-c-1"], indirect=True ) -def transcript_selection_dict(request): - """Fixture to return dictionary after transcript selection""" - urn = request.param - file_path = f"{data_file_path}{urn}" - with open(file_path) as scoreset: - mave_dat = metadata_obtain(scoreset) - mave = mave_to_blat(mave_dat) - tr = main(mave, mave_dat) - return tr - - -def test_for_refseq_id(transcript_selection_dict): - """Test to compare computed values of RefSeq Protein and Nucleotide ID with those obtained by the notebooks""" - urn = transcript_selection_dict["urn"] +def test_for_longest_compatible(obtain_transcripts): + """ + Tests for transcript selections. + Compares computed values of RefSeq Protein and Nucleotide ID with those obtained by the notebooks + Also checks for status of Transcripts found + """ # Checking for RefSeq protien ID - computed_prot = transcript_selection_dict["RefSeq_prot"] - assert computed_prot == transcripts_dict[urn][0] + computed_prot = obtain_transcripts["RefSeq_prot"] + assert computed_prot == "NP_000570.1" # Checking for RefSeq Nucleotide ID - computed_nuc = transcript_selection_dict["RefSeq_nuc"] - assert computed_nuc == transcripts_dict[urn][1] + computed_nuc = obtain_transcripts["RefSeq_nuc"] + assert computed_nuc == "NM_000579.3" + + assert obtain_transcripts["status"] == "Longest Compatible"