Skip to content

Commit

Permalink
Update notebooks, alignment, and mapping code (#64)
Browse files Browse the repository at this point in the history
  • Loading branch information
jarbesfeld authored Oct 10, 2024
1 parent e491d7c commit 88e3078
Show file tree
Hide file tree
Showing 10 changed files with 370 additions and 141 deletions.
77 changes: 59 additions & 18 deletions notebooks/analysis/mapping_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@
"outputs": [],
"source": [
"import json\n",
"\n",
"import pandas as pd\n",
"from Bio.Seq import Seq\n",
"import pandas as pd"
"from cool_seq_tool.schemas import Strand"
]
},
{
Expand All @@ -49,6 +51,26 @@
"score_sets = mave_metadata_dat[\"urn\"].to_list()[:-5]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "a69e4d2c",
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"from pathlib import Path\n",
"\n",
"from dcd_mapping.schemas import AlignmentResult\n",
"\n",
"with Path.open(\"analysis_files/mave_blat_output.pickle\", \"rb\") as fn:\n",
" mave_blat_temp = pickle.load(fn)\n",
"align_results = {}\n",
"for scoreset in score_sets:\n",
" if scoreset != \"urn:mavedb:00000105-a-1\":\n",
" align_results[scoreset] = AlignmentResult(**mave_blat_temp[scoreset])"
]
},
{
"cell_type": "markdown",
"id": "4d037805",
Expand All @@ -60,7 +82,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"id": "3f5e44d2",
"metadata": {},
"outputs": [
Expand All @@ -70,23 +92,26 @@
"'The number of examined variant pairs is: 2499044'"
]
},
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diff_vars_dict = {}\n",
"var_count = 0\n",
"counter = 0\n",
"var_count_dict_new = {}\n",
"\n",
"for key in score_sets:\n",
" if key != \"urn:mavedb:00000072-a-1\" and key != \"urn:mavedb:00000105-a-1\": # No mapping for these score sets\n",
" f = open(f\"analysis_files/mappings/{key[11::]}.json\")\n",
" dat = json.load(f)\n",
" seq_type = dat[\"computed_reference_sequence\"][\"sequence_type\"]\n",
" dat = dat[\"mapped_scores\"]\n",
"\n",
" diff_vars = []\n",
" strand = align_results[key].strand\n",
"\n",
" for j in range(len(dat)):\n",
" if \"members\" not in dat[j][\"pre_mapped\"]:\n",
Expand All @@ -95,8 +120,16 @@
" seq_post = dat[j][\"post_mapped\"][\"vrs_ref_allele_seq\"]\n",
" seq_pre_rv = str(Seq(seq_pre).reverse_complement())\n",
"\n",
" if seq_pre != seq_post and seq_post != seq_pre_rv:\n",
" diff_vars.append(j)\n",
" if seq_type == \"protein\":\n",
" if seq_pre != seq_post:\n",
" diff_vars.append(j)\n",
" else:\n",
" if strand == Strand.POSITIVE:\n",
" if seq_pre != seq_post:\n",
" diff_vars.append(j)\n",
" else:\n",
" if seq_post != seq_pre_rv:\n",
" diff_vars.append(j)\n",
"\n",
" else:\n",
" for k in range(len(dat[j][\"pre_mapped\"][\"members\"])):\n",
Expand All @@ -105,8 +138,16 @@
" seq_post = dat[j][\"post_mapped\"][\"members\"][k][\"vrs_ref_allele_seq\"]\n",
" seq_pre_rv = str(Seq(seq_pre).reverse_complement())\n",
"\n",
" if seq_pre != seq_post and seq_post != seq_pre_rv:\n",
" diff_vars.append(j)\n",
" if seq_type == \"protein\":\n",
" if seq_pre != seq_post:\n",
" diff_vars.append(j)\n",
" else:\n",
" if strand == Strand.POSITIVE:\n",
" if seq_pre != seq_post:\n",
" diff_vars.append(j)\n",
" else:\n",
" if seq_post != seq_pre_rv:\n",
" diff_vars.append(j)\n",
"\n",
" diff_vars_dict[key] = diff_vars\n",
" var_count_dict_new[key] = var_count\n",
Expand All @@ -123,7 +164,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"id": "240e02be",
"metadata": {},
"outputs": [],
Expand All @@ -142,7 +183,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"id": "1c5c6167",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -182,7 +223,7 @@
" 'score': 0.0836691871353702}"
]
},
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -205,17 +246,17 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "de7a5de0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'There are 24878 instances of reference mismatch. This corresponds to a percentage of 0.996 (24878/2499044)'"
"'There are 34832 instances of reference mismatch. This corresponds to a percentage of 1.394 (34832/2499044)'"
]
},
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -239,7 +280,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"id": "6a22205e",
"metadata": {},
"outputs": [
Expand All @@ -249,7 +290,7 @@
"'The number of unique pre-mapped MAVE variants is 363294. The number of unique post-mapped MAVE variants is 349972.'"
]
},
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -293,7 +334,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"id": "3eb301ed",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -340,7 +381,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"id": "fc87cbbe",
"metadata": {},
"outputs": [
Expand All @@ -350,7 +391,7 @@
"'There are 349972 pre-mapped MAVE variants in the dictionary. 9553 post-mapped MAVE variants have 2 or more corresponding pre-mapped MAVE variants.'"
]
},
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
Expand Down
99 changes: 95 additions & 4 deletions notebooks/analysis/mavedb_scoreset_breakdown.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@
{
"data": {
"text/plain": [
"'The number of protein coding score sets with gene symbols/aliases is: 125'"
"'The number of protein coding score sets with gene symbols/aliases is: 126'"
]
},
"execution_count": 13,
Expand All @@ -495,7 +495,7 @@
],
"source": [
"td = list(coding_ss[\"target\"])\n",
"td = [x for x in td if \" \" not in x]\n",
"td = [x for x in td if \" \" not in x or x == \"Glycophorin A\"]\n",
"f\"The number of protein coding score sets with gene symbols/aliases is: {len(td)}\""
]
},
Expand All @@ -516,7 +516,7 @@
{
"data": {
"text/plain": [
"'The number of protein coding score sets with descriptive targets is: 43'"
"'The number of protein coding score sets with descriptive targets is: 42'"
]
},
"execution_count": 14,
Expand All @@ -526,7 +526,7 @@
],
"source": [
"td = list(coding_ss[\"target\"])\n",
"td = [x for x in td if \" \" in x]\n",
"td = [x for x in td if \" \" in x and x != \"Glycophorin A\"]\n",
"f\"The number of protein coding score sets with descriptive targets is: {len(td)}\""
]
},
Expand Down Expand Up @@ -606,6 +606,97 @@
" unique_genes.append(comp[0])\n",
"f\"The number of unique gene symbols across examined score set targets is: {len(set(unique_genes))}\""
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "4ef61ec4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'ACE2',\n",
" 'AICDA',\n",
" 'ALDOB',\n",
" 'Aβ42',\n",
" 'BCL11A',\n",
" 'BRAF',\n",
" 'BRCA1',\n",
" 'CALM1',\n",
" 'CBS',\n",
" 'CCR5',\n",
" 'CD86',\n",
" 'CXCR4',\n",
" 'CYP2C19',\n",
" 'CYP2C9',\n",
" 'DLG4',\n",
" 'ECR11',\n",
" 'ERBB2',\n",
" 'F9',\n",
" 'FOXE1',\n",
" 'GABBR1',\n",
" 'GCK',\n",
" 'GDI1',\n",
" 'GP1BB',\n",
" 'Glycophorin',\n",
" 'HBB',\n",
" 'HBG1',\n",
" 'HMBS',\n",
" 'HMGCR',\n",
" 'HNF4A',\n",
" 'IGHG1',\n",
" 'IRF4',\n",
" 'IRF6',\n",
" 'KCNQ1',\n",
" 'KCNQ4',\n",
" 'LDLR',\n",
" 'LDLRAP1',\n",
" 'MAPK1',\n",
" 'MPL',\n",
" 'MSH2',\n",
" 'MSMB',\n",
" 'MTHFR',\n",
" 'MYC',\n",
" 'NCS1',\n",
" 'NUDT15',\n",
" 'PARP1',\n",
" 'PKLR',\n",
" 'PSAT1',\n",
" 'PTEN',\n",
" 'RET',\n",
" 'RHO',\n",
" 'Ras',\n",
" 'SCN5A',\n",
" 'SORT1',\n",
" 'SRC',\n",
" 'SUMO1',\n",
" 'TARDBP',\n",
" 'TCF7L2',\n",
" 'TECR',\n",
" 'TERT',\n",
" 'TP53',\n",
" 'TPK1',\n",
" 'TPMT',\n",
" 'UBE2I',\n",
" 'UC88',\n",
" 'VKORC1',\n",
" 'WT1',\n",
" 'YAP1',\n",
" 'ZFAND3',\n",
" 'ZHX2',\n",
" 'ZRS',\n",
" 'alpha-synuclein'}"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"set(unique_genes)"
]
}
],
"metadata": {
Expand Down
Loading

0 comments on commit 88e3078

Please sign in to comment.