Skip to content

Commit

Permalink
Added tests for transcript selection
Browse files Browse the repository at this point in the history
  • Loading branch information
samriddhi99 committed Aug 10, 2023
1 parent d6390fe commit 422f097
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 31 deletions.
1 change: 0 additions & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ jobs:
env:
PATH_TO_SEQREPO: ./tests/data/seqrepo/latest
HG38_FILE: ./tests/data/hg38.2bit
GENE_NORM_DB_URL: postgres://postgres@localhost:5432/gene_normalizer

run: |
export UTA_PASSWORD=uta
Expand Down
67 changes: 67 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,63 @@
from mavedb_mapping.vrs_mapping import vrs_mapping
from mavedb_mapping import data_file_path

import requests
import pandas as pd


def blat_for_tests(dat):
"""
Function that mocks the BLAT Alignment outputs
"""
seq = dat["target_sequence"]
type_ = dat["target_sequence_type"]
database = "hg38"
blat_url = f"https://genome.ucsc.edu/cgi-bin/hgBlat?userSeq={seq}&type={type_}&db={database}&output=json"
response = requests.get(blat_url)
blat_output = response.json()
print(blat_output)
hsp = blat_output["blat"][0]
if "fix" in hsp[13]:
hsp = blat_output["blat"][1]
diff_list = hsp[-3].split(",")
hit_starts = hsp[-1].split(",")
query_starts = hsp[-2].split(",")

cov = [int(x) for x in diff_list]
coverage = (sum(cov) / len(seq)) * 100

query_list = []
for i in range(1, len(query_starts)):
temp = f"[{query_starts[i-1]}:{query_starts[i]}]"
query_list.append(temp)
temp = f"[{query_starts[-1]}:{int(query_starts[-1])+int(diff_list[-1])}]"
query_list.append(temp)
hit_list = []
for i in range(len(query_starts)):
temp = f"[{hit_starts[i]}:{int(hit_starts[i])+int(diff_list[i])}]"
hit_list.append(temp)
data = {"query_ranges": query_list, "hit_ranges": hit_list}
result_df = pd.DataFrame(data)
if hsp[8] == "+":
strand = 1
else:
strand = -1
chr = hsp[13].replace("chr", "")

mave_blat_dict = {
"urn": dat["urn"],
"chrom": chr,
"strand": strand,
"hits": result_df,
"coverage": coverage,
}
return mave_blat_dict


@pytest.fixture
def mock_fun1(monkeypatch):
monkeypatch.setattr("mavedb_mapping.blat_alignment.mave_to_blat", blat_for_tests)


@pytest.fixture(
scope="package", params=["urn:mavedb:00000041-a-1", "urn:mavedb:00000001-a-4"]
Expand All @@ -19,3 +76,13 @@ def full_mapping(request):
scores_csv = open(scores_path)
vrs_mapped = vrs_mapping(mave_dat, mappings_dict, mave_blat, scores_csv)
return vrs_mapped, mave_dat["urn"]


@pytest.fixture()
def obtain_transcripts(request):
scoreset_path = f"{data_file_path}{request.param}"
with open(scoreset_path) as scoreset:
mave_dat = metadata_obtain(scoreset)
mave_blat = mave_to_blat(mave_dat)
mappings_dict = main(mave_blat, mave_dat)
return mappings_dict
1 change: 1 addition & 0 deletions tests/data/urn:mavedb:00000047-c-1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"title":"CCR5 HIV binding","methodText":"Data obtained from sorting cells for both surface expression and HIV-1~BaL~ gp120-CD4 interaction was analysed using Enrich (version unspecified). Log~2~ enrichment ratios were calculated and normalised by subtracting the frequency of the WT sequence. Log~2~ enrichment ratios for two replicates were averaged to obtain variant scores. Note that the scores here were not reported in the manuscript tables, but were calculated from the replicate enrichment ratios that were reported.","abstractText":"This experiment utilised site-saturation mutagenesis (SSM) to measure the functional consequences of mutations in the human chemokine receptor, CCR5 and to map ligand interaction sites. Cells were selected for binding to HIV-1~BaL~ gp120-CD4.","shortDescription":"Deep mutational scan selecting for CCR5 binding to HIV-1(BaL) gp120-CD4 in Expi293F cells.","extraMetadata":{},"urn":"urn:mavedb:00000047-c-1","numVariants":7020,"experiment":{"title":"CCR5 HIV binding","shortDescription":"Deep mutational scan selecting for CCR5 binding to HIV-1(BaL) gp120-CD4 in Expi293F cells.","abstractText":"This experiment utilised site-saturation mutagenesis (SSM) to measure the functional consequences of mutations in the human chemokine receptor, CCR5 and to map ligand interaction sites. Cells were selected for binding to HIV-1~BaL~ gp120-CD4.","methodText":"The CCR5 variant library was generated by overlapping PCR using primers with an NNK codon, as described by Procko et al, 2013 and transfected into human Expi293F cells (a cell line lacking CCR5) using pCEP4.\r\nCCR5 interactions with chemokines (CCL3, CCL4, CCL5) were unable to be characterised by deep mutational scanning. Instead, cells were selected for interaction with an important protein ligand, the HIV-1 Env subunit gp120 following incubation with HIV-1~BaL~ gp120 fused to CD4 domains D1-D2.\r\nSurface expression was measured using anti-myc-Alexa 647.\r\nCell sorting was performed using a BD FACSAria II.\r\nTotal RNA was extracted and PCR-amplified before sequencing by Illumina MiSeq v3 or HiSeq 2500 to quantify variants. \r\nTwo replicate selections were performed.\r\n\r\nRaw data available from GEO under accession [GSE100368](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100368).","extraMetadata":{},"urn":"urn:mavedb:00000047-c","createdBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"modifiedBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"creationDate":"2020-11-11","modificationDate":"2020-11-11","publishedDate":"2020-11-11","experimentSetUrn":"urn:mavedb:00000047","scoreSetUrns":["urn:mavedb:00000047-c-1"],"doiIdentifiers":[],"primaryPublicationIdentifiers":[],"secondaryPublicationIdentifiers":[{"identifier":"29678950","id":34,"url":"http://www.ncbi.nlm.nih.gov/pubmed/29678950","referenceHtml":"Heredia JD, <i>et al</i>. Mapping Interaction Sites on Human Chemokine Receptors by Deep Mutational Scanning. <i>J Immunol</i>. 2018; <b>200</b>:3825-3839."},{"identifier":"23827138","id":35,"url":"http://www.ncbi.nlm.nih.gov/pubmed/23827138","referenceHtml":"Procko E, <i>et al</i>. Computational design of a protein-based enzyme inhibitor. <i>J Mol Biol</i>. 2013; <b>425</b>:3563-75."}],"rawReadIdentifiers":[],"keywords":[]},"license":{"longName":"CC0 (Public domain)","shortName":"CC0","link":"https://creativecommons.org/publicdomain/zero/1.0/","version":"1.0","id":1},"metaAnalyzesScoreSetUrns":[],"metaAnalyzedByScoreSetUrns":[],"doiIdentifiers":[],"primaryPublicationIdentifiers":[{"identifier":"29678950","id":34,"url":"http://www.ncbi.nlm.nih.gov/pubmed/29678950","referenceHtml":"Heredia JD, <i>et al</i>. Mapping Interaction Sites on Human Chemokine Receptors by Deep Mutational Scanning. <i>J Immunol</i>. 2018; <b>200</b>:3825-3839."}],"secondaryPublicationIdentifiers":[],"publishedDate":"2020-11-11","creationDate":"2020-11-11","modificationDate":"2020-11-11","createdBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"modifiedBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"targetGene":{"name":"CCR5","category":"Protein coding","externalIdentifiers":[{"identifier":{"dbName":"UniProt","identifier":"P51681","url":"http://purl.uniprot.org/uniprot/P51681"},"offset":1}],"referenceMaps":[{"id":156,"genomeId":5,"targetId":156,"isPrimary":false,"genome":{"shortName":"hg38","organismName":"Homo sapiens","creationDate":"2018-06-22","modificationDate":"2018-06-22","id":5},"creationDate":"2020-11-11","modificationDate":"2020-11-11"}],"wtSequence":{"sequenceType":"dna","sequence":"GATTATCAAGTGTCAAGTCCAATCTATGACATCAATTATTATACATCGGAGCCCTGCCAAAAAATCAATGTGAAGCAAATCGCAGCCCGCCTCCTGCCTCCGCTCTACTCACTGGTGTTCATCTTTGGTTTTGTGGGCAACATGCTGGTCATCCTCATCCTGATAAACTGCAAAAGGCTGAAGAGCATGACTGACATCTACCTGCTCAACCTGGCCATCTCTGACCTGTTTTTCCTTCTTACTGTCCCCTTCTGGGCTCACTATGCTGCCGCCCAGTGGGACTTTGGAAATACAATGTGTCAACTCTTGACAGGGCTCTATTTTATAGGCTTCTTCTCTGGAATCTTCTTCATCATCCTCCTGACAATCGATAGGTACCTGGCTGTCGTCCATGCTGTGTTTGCTTTAAAAGCCAGGACGGTCACCTTTGGGGTGGTGACAAGTGTGATCACTTGGGTGGTGGCTGTGTTTGCGTCTCTCCCAGGAATCATCTTTACCAGATCTCAAAAAGAAGGTCTTCATTACACCTGCAGCTCTCATTTTCCATACAGTCAGTATCAATTCTGGAAGAATTTCCAGACATTAAAGATAGTCATCTTGGGGCTGGTCCTGCCGCTGCTTGTCATGGTCATCTGCTACTCGGGAATCCTAAAAACTCTGCTTCGGTGTCGAAATGAGAAGAAGAGGCACAGGGCTGTGAGGCTTATCTTCACCATCATGATTGTTTATTTTCTCTTCTGGGCTCCCTACAACATTGTCCTTCTCCTGAACACCTTCCAGGAATTCTTTGGCCTGAATAATTGCAGTAGCTCTAACAGGTTGGACCAAGCTATGCAGGTGACAGAGACTCTTGGGATGACGCACTGCTGCATCAACCCCATCATCTATGCCTTTGTCGGGGAGAAGTTCAGAAACTACCTCTTAGTCTTCTTCCAAAAGCACATTGCCAAACGCTTCTGCAAATGCTGTTCTATTTTCCAGCAAGAGGCTCCCGAGCGAGCAAGCTCAGTTTACACCCGATCCACTGGGGAGCAGGAAATATCTGTGGGCTTG"}},"datasetColumns":{"countColumns":[],"scoreColumns":["score","rep1","rep2"]},"keywords":[],"private":false}
1 change: 1 addition & 0 deletions tests/data/urn:mavedb:00000091-a-1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"title":"BRAF resistance to vemurafenib","methodText":"Log enrichments for the two replicates were calculated and the median enrichment for each amino acid (across all synonymous codons) are reported here.\r\n\r\nOnly a small subset of variants were reported.","abstractText":"To search for possible secondary mutations that that can confer drug resistance in the context of BRAF V600E, the authors performed a deep mutation scan of human BRAF, selecting for enrichment in B3/F cells surviving vemurafenib treatment.","shortDescription":"Deep mutational scan of the BRAF V600E allele for resistance to the cancer drug vemurafenib.","extraMetadata":{},"urn":"urn:mavedb:00000091-a-1","numVariants":73,"experiment":{"title":"BRAF resistance to vemurafenib","shortDescription":"Deep mutational scan of the BRAF V600E allele for resistance to the cancer drug vemurafenib.","abstractText":"To search for possible secondary mutations that that can confer drug resistance in the context of BRAF V600E, the authors performed a deep mutation scan of human BRAF, selecting for enrichment in B3/F cells surviving vemurafenib treatment.","methodText":"Saturation mutagenesis of 77 amino acids surrounding the PLX4720‐binding site. Eight mutant pools were generated—corresponding to amino acids 458–466, 467–476, 477–486, 501–510, 511–520, 527–536, 579–587, and 588–596—in which each amino acid was mutated to all possible 64 codons. The mutant pools were transferred into a retroviral vector containing BRAFV600E and then stably transduced into the human melanoma cell line A375.\r\n\r\nAfter transfection with mutant BRAF, cells were treated with vemurafenib. Cells surviving vemurafenib treatment were then sequenced to determine the enrichment of resistance mutations.","extraMetadata":{},"urn":"urn:mavedb:00000091-a","createdBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"modifiedBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"creationDate":"2021-11-13","modificationDate":"2021-11-13","publishedDate":"2021-11-13","experimentSetUrn":"urn:mavedb:00000091","scoreSetUrns":["urn:mavedb:00000091-a-1"],"doiIdentifiers":[],"primaryPublicationIdentifiers":[{"identifier":"24112705","id":71,"url":"http://www.ncbi.nlm.nih.gov/pubmed/24112705","referenceHtml":"Wagenaar TR, <i>et al</i>. Resistance to vemurafenib resulting from a novel mutation in the BRAFV600E kinase domain. Resistance to vemurafenib resulting from a novel mutation in the BRAFV600E kinase domain. 2014; 27:124-33. doi: 10.1111/pcmr.12171"}],"secondaryPublicationIdentifiers":[],"rawReadIdentifiers":[],"keywords":[]},"license":{"longName":"CC0 (Public domain)","shortName":"CC0","link":"https://creativecommons.org/publicdomain/zero/1.0/","version":"1.0","id":1},"metaAnalyzesScoreSetUrns":[],"metaAnalyzedByScoreSetUrns":[],"doiIdentifiers":[],"primaryPublicationIdentifiers":[{"identifier":"24112705","id":71,"url":"http://www.ncbi.nlm.nih.gov/pubmed/24112705","referenceHtml":"Wagenaar TR, <i>et al</i>. Resistance to vemurafenib resulting from a novel mutation in the BRAFV600E kinase domain. Resistance to vemurafenib resulting from a novel mutation in the BRAFV600E kinase domain. 2014; 27:124-33. doi: 10.1111/pcmr.12171"}],"secondaryPublicationIdentifiers":[],"publishedDate":"2021-11-13","creationDate":"2021-11-13","modificationDate":"2021-11-13","createdBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"modifiedBy":{"orcidId":"0000-0003-1474-605X","firstName":"Alan","lastName":"Rubin"},"targetGene":{"name":"BRAF","category":"Protein coding","externalIdentifiers":[{"identifier":{"dbName":"UniProt","identifier":"P15056","url":"http://purl.uniprot.org/uniprot/P15056"},"offset":476}],"referenceMaps":[{"id":283,"genomeId":4,"targetId":283,"isPrimary":false,"genome":{"shortName":"hg19","organismName":"Homo sapiens","creationDate":"2018-06-22","modificationDate":"2018-06-22","id":4},"creationDate":"2021-11-13","modificationDate":"2021-11-13"}],"wtSequence":{"sequenceType":"protein","sequence":"HGDVAVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHHLHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDF"}},"datasetColumns":{"countColumns":[],"scoreColumns":["score"]},"keywords":[],"private":false}
1 change: 0 additions & 1 deletion tests/test_for_mapped_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ def get_sample_mapping_data(urn):
# Sample Mapping file
mapped_file = open(f"{data_file_path}{urn[11:]}.json")
mapped_example = json.load(mapped_file)
print(mapped_example.keys())

# Sequence id from sample mapping file
sample_sequence_id = mapped_example["mapped_scores"][0]["pre_mapped"]["location"][
Expand Down
82 changes: 53 additions & 29 deletions tests/test_for_transcript_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,63 @@
from mavedb_mapping.metadata_process import metadata_obtain
from mavedb_mapping.blat_alignment import mave_to_blat
from mavedb_mapping.transcript_selection import main
from mavedb_mapping import data_file_path

# Data generated by notebooks
transcripts_dict = {
"urn:mavedb:00000041-a-1": ["NP_938033.1", "NM_198291.3"],
"urn:mavedb:00000041-b-1": ["NP_938033.1", "NM_198291.3"],
}

@pytest.mark.parametrize(
"obtain_transcripts", ["urn:mavedb:00000041-a-1"], indirect=True
)
def test_for_mane_select(obtain_transcripts):
"""
Tests for transcript selections.
Compares computed values of RefSeq Protein and Nucleotide ID with those obtained by the notebooks
Also checks for status of Transcripts found
"""
# Checking for RefSeq protien ID
computed_prot = obtain_transcripts["RefSeq_prot"]
assert computed_prot == "NP_938033.1"

# Checking for RefSeq Nucleotide ID
computed_nuc = obtain_transcripts["RefSeq_nuc"]
assert computed_nuc == "NM_198291.3"

assert obtain_transcripts["status"] == "MANE Select"


@pytest.mark.parametrize(
"obtain_transcripts", ["urn:mavedb:00000091-a-1"], indirect=True
)
def test_for_mane_plus_clinical(obtain_transcripts):
"""
Tests for transcript selections.
Compares computed values of RefSeq Protein and Nucleotide ID with those obtained by the notebooks
Also checks for status of Transcripts found
"""
# Checking for RefSeq protien ID
computed_prot = obtain_transcripts["RefSeq_prot"]
assert computed_prot == "NP_004324.2"

# Checking for RefSeq Nucleotide ID
computed_nuc = obtain_transcripts["RefSeq_nuc"]
assert computed_nuc == "NM_004333.6"

@pytest.fixture(
params=[
"urn:mavedb:00000041-a-1",
"urn:mavedb:00000041-b-1",
]
assert obtain_transcripts["status"] == "MANE Plus Clincial"


@pytest.mark.parametrize(
"obtain_transcripts", ["urn:mavedb:00000047-c-1"], indirect=True
)
def transcript_selection_dict(request):
"""Fixture to return dictionary after transcript selection"""
urn = request.param
file_path = f"{data_file_path}{urn}"
with open(file_path) as scoreset:
mave_dat = metadata_obtain(scoreset)
mave = mave_to_blat(mave_dat)
tr = main(mave, mave_dat)
return tr


def test_for_refseq_id(transcript_selection_dict):
"""Test to compare computed values of RefSeq Protein and Nucleotide ID with those obtained by the notebooks"""
urn = transcript_selection_dict["urn"]
def test_for_longest_compatible(obtain_transcripts):
"""
Tests for transcript selections.
Compares computed values of RefSeq Protein and Nucleotide ID with those obtained by the notebooks
Also checks for status of Transcripts found
"""
# Checking for RefSeq protien ID
computed_prot = transcript_selection_dict["RefSeq_prot"]
assert computed_prot == transcripts_dict[urn][0]
computed_prot = obtain_transcripts["RefSeq_prot"]
assert computed_prot == "NP_000570.1"

# Checking for RefSeq Nucleotide ID
computed_nuc = transcript_selection_dict["RefSeq_nuc"]
assert computed_nuc == transcripts_dict[urn][1]
computed_nuc = obtain_transcripts["RefSeq_nuc"]
assert computed_nuc == "NM_000579.3"

assert obtain_transcripts["status"] == "Longest Compatible"

0 comments on commit 422f097

Please sign in to comment.