Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update notebooks, alignment, and mapping code #64

Merged
merged 18 commits into from
Oct 10, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 59 additions & 18 deletions notebooks/analysis/mapping_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@
"outputs": [],
"source": [
"import json\n",
"\n",
"import pandas as pd\n",
"from Bio.Seq import Seq\n",
"import pandas as pd"
"from cool_seq_tool.schemas import Strand"
]
},
{
Expand All @@ -49,6 +51,26 @@
"score_sets = mave_metadata_dat[\"urn\"].to_list()[:-5]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "a69e4d2c",
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"from pathlib import Path\n",
"\n",
"from dcd_mapping.schemas import AlignmentResult\n",
"\n",
"with Path.open(\"analysis_files/mave_blat_output.pickle\", \"rb\") as fn:\n",
" mave_blat_temp = pickle.load(fn)\n",
"align_results = {}\n",
"for scoreset in score_sets:\n",
" if scoreset != \"urn:mavedb:00000105-a-1\":\n",
" align_results[scoreset] = AlignmentResult(**mave_blat_temp[scoreset])"
]
},
{
"cell_type": "markdown",
"id": "4d037805",
Expand All @@ -60,7 +82,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"id": "3f5e44d2",
"metadata": {},
"outputs": [
Expand All @@ -70,23 +92,26 @@
"'The number of examined variant pairs is: 2499044'"
]
},
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diff_vars_dict = {}\n",
"var_count = 0\n",
"counter = 0\n",
"var_count_dict_new = {}\n",
"\n",
"for key in score_sets:\n",
" if key != \"urn:mavedb:00000072-a-1\" and key != \"urn:mavedb:00000105-a-1\": # No mapping for these score sets\n",
" f = open(f\"analysis_files/mappings/{key[11::]}.json\")\n",
" dat = json.load(f)\n",
" seq_type = dat[\"computed_reference_sequence\"][\"sequence_type\"]\n",
" dat = dat[\"mapped_scores\"]\n",
"\n",
" diff_vars = []\n",
" strand = align_results[key].strand\n",
"\n",
" for j in range(len(dat)):\n",
" if \"members\" not in dat[j][\"pre_mapped\"]:\n",
Expand All @@ -95,8 +120,16 @@
" seq_post = dat[j][\"post_mapped\"][\"vrs_ref_allele_seq\"]\n",
" seq_pre_rv = str(Seq(seq_pre).reverse_complement())\n",
"\n",
" if seq_pre != seq_post and seq_post != seq_pre_rv:\n",
" diff_vars.append(j)\n",
" if seq_type == \"protein\":\n",
" if seq_pre != seq_post:\n",
" diff_vars.append(j)\n",
" else:\n",
" if strand == Strand.POSITIVE:\n",
" if seq_pre != seq_post:\n",
" diff_vars.append(j)\n",
" else:\n",
" if seq_post != seq_pre_rv:\n",
" diff_vars.append(j)\n",
"\n",
" else:\n",
" for k in range(len(dat[j][\"pre_mapped\"][\"members\"])):\n",
Expand All @@ -105,8 +138,16 @@
" seq_post = dat[j][\"post_mapped\"][\"members\"][k][\"vrs_ref_allele_seq\"]\n",
" seq_pre_rv = str(Seq(seq_pre).reverse_complement())\n",
"\n",
" if seq_pre != seq_post and seq_post != seq_pre_rv:\n",
" diff_vars.append(j)\n",
" if seq_type == \"protein\":\n",
" if seq_pre != seq_post:\n",
" diff_vars.append(j)\n",
" else:\n",
" if strand == Strand.POSITIVE:\n",
" if seq_pre != seq_post:\n",
" diff_vars.append(j)\n",
" else:\n",
" if seq_post != seq_pre_rv:\n",
" diff_vars.append(j)\n",
"\n",
" diff_vars_dict[key] = diff_vars\n",
" var_count_dict_new[key] = var_count\n",
Expand All @@ -123,7 +164,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"id": "240e02be",
"metadata": {},
"outputs": [],
Expand All @@ -142,7 +183,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"id": "1c5c6167",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -182,7 +223,7 @@
" 'score': 0.0836691871353702}"
]
},
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -205,17 +246,17 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "de7a5de0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'There are 24878 instances of reference mismatch. This corresponds to a percentage of 0.996 (24878/2499044)'"
"'There are 34832 instances of reference mismatch. This corresponds to a percentage of 1.394 (34832/2499044)'"
]
},
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -239,7 +280,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"id": "6a22205e",
"metadata": {},
"outputs": [
Expand All @@ -249,7 +290,7 @@
"'The number of unique pre-mapped MAVE variants is 363294. The number of unique post-mapped MAVE variants is 349972.'"
]
},
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -293,7 +334,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"id": "3eb301ed",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -340,7 +381,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"id": "fc87cbbe",
"metadata": {},
"outputs": [
Expand All @@ -350,7 +391,7 @@
"'There are 349972 pre-mapped MAVE variants in the dictionary. 9553 post-mapped MAVE variants have 2 or more corresponding pre-mapped MAVE variants.'"
]
},
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
Expand Down
99 changes: 95 additions & 4 deletions notebooks/analysis/mavedb_scoreset_breakdown.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@
{
"data": {
"text/plain": [
"'The number of protein coding score sets with gene symbols/aliases is: 125'"
"'The number of protein coding score sets with gene symbols/aliases is: 126'"
]
},
"execution_count": 13,
Expand All @@ -495,7 +495,7 @@
],
"source": [
"td = list(coding_ss[\"target\"])\n",
"td = [x for x in td if \" \" not in x]\n",
"td = [x for x in td if \" \" not in x or x == \"Glycophorin A\"]\n",
"f\"The number of protein coding score sets with gene symbols/aliases is: {len(td)}\""
]
},
Expand All @@ -516,7 +516,7 @@
{
"data": {
"text/plain": [
"'The number of protein coding score sets with descriptive targets is: 43'"
"'The number of protein coding score sets with descriptive targets is: 42'"
]
},
"execution_count": 14,
Expand All @@ -526,7 +526,7 @@
],
"source": [
"td = list(coding_ss[\"target\"])\n",
"td = [x for x in td if \" \" in x]\n",
"td = [x for x in td if \" \" in x and x != \"Glycophorin A\"]\n",
"f\"The number of protein coding score sets with descriptive targets is: {len(td)}\""
]
},
Expand Down Expand Up @@ -606,6 +606,97 @@
" unique_genes.append(comp[0])\n",
"f\"The number of unique gene symbols across examined score set targets is: {len(set(unique_genes))}\""
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "4ef61ec4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'ACE2',\n",
" 'AICDA',\n",
" 'ALDOB',\n",
" 'Aβ42',\n",
" 'BCL11A',\n",
" 'BRAF',\n",
" 'BRCA1',\n",
" 'CALM1',\n",
" 'CBS',\n",
" 'CCR5',\n",
" 'CD86',\n",
" 'CXCR4',\n",
" 'CYP2C19',\n",
" 'CYP2C9',\n",
" 'DLG4',\n",
" 'ECR11',\n",
" 'ERBB2',\n",
" 'F9',\n",
" 'FOXE1',\n",
" 'GABBR1',\n",
" 'GCK',\n",
" 'GDI1',\n",
" 'GP1BB',\n",
" 'Glycophorin',\n",
" 'HBB',\n",
" 'HBG1',\n",
" 'HMBS',\n",
" 'HMGCR',\n",
" 'HNF4A',\n",
" 'IGHG1',\n",
" 'IRF4',\n",
" 'IRF6',\n",
" 'KCNQ1',\n",
" 'KCNQ4',\n",
" 'LDLR',\n",
" 'LDLRAP1',\n",
" 'MAPK1',\n",
" 'MPL',\n",
" 'MSH2',\n",
" 'MSMB',\n",
" 'MTHFR',\n",
" 'MYC',\n",
" 'NCS1',\n",
" 'NUDT15',\n",
" 'PARP1',\n",
" 'PKLR',\n",
" 'PSAT1',\n",
" 'PTEN',\n",
" 'RET',\n",
" 'RHO',\n",
" 'Ras',\n",
" 'SCN5A',\n",
" 'SORT1',\n",
" 'SRC',\n",
" 'SUMO1',\n",
" 'TARDBP',\n",
" 'TCF7L2',\n",
" 'TECR',\n",
" 'TERT',\n",
" 'TP53',\n",
" 'TPK1',\n",
" 'TPMT',\n",
" 'UBE2I',\n",
" 'UC88',\n",
" 'VKORC1',\n",
" 'WT1',\n",
" 'YAP1',\n",
" 'ZFAND3',\n",
" 'ZHX2',\n",
" 'ZRS',\n",
" 'alpha-synuclein'}"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"set(unique_genes)"
]
}
],
"metadata": {
Expand Down
Loading
Loading