ave-dcd · jarbesfeld · Oct 10, 2024 · Oct 1, 2024 · Oct 2, 2024 · Oct 3, 2024
diff --git a/notebooks/analysis/mapping_analysis.ipynb b/notebooks/analysis/mapping_analysis.ipynb
@@ -26,8 +26,10 @@
    "outputs": [],
    "source": [
     "import json\n",
+    "\n",
+    "import pandas as pd\n",
     "from Bio.Seq import Seq\n",
-    "import pandas as pd"
+    "from cool_seq_tool.schemas import Strand"
    ]
   },
   {
@@ -49,6 +51,26 @@
     "score_sets = mave_metadata_dat[\"urn\"].to_list()[:-5]"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "a69e4d2c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from dcd_mapping.schemas import AlignmentResult\n",
+    "\n",
+    "with Path.open(\"analysis_files/mave_blat_output.pickle\", \"rb\") as fn:\n",
+    "    mave_blat_temp = pickle.load(fn)\n",
+    "align_results = {}\n",
+    "for scoreset in score_sets:\n",
+    "    if scoreset != \"urn:mavedb:00000105-a-1\":\n",
+    "        align_results[scoreset] = AlignmentResult(**mave_blat_temp[scoreset])"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "4d037805",
@@ -60,7 +82,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "3f5e44d2",
    "metadata": {},
    "outputs": [
@@ -70,23 +92,26 @@
        "'The number of examined variant pairs is: 2499044'"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "diff_vars_dict = {}\n",
     "var_count = 0\n",
+    "counter = 0\n",
     "var_count_dict_new = {}\n",
     "\n",
     "for key in score_sets:\n",
     "    if key != \"urn:mavedb:00000072-a-1\" and key != \"urn:mavedb:00000105-a-1\": # No mapping for these score sets\n",
     "        f = open(f\"analysis_files/mappings/{key[11::]}.json\")\n",
     "        dat = json.load(f)\n",
+    "        seq_type = dat[\"computed_reference_sequence\"][\"sequence_type\"]\n",
     "        dat = dat[\"mapped_scores\"]\n",
     "\n",
     "        diff_vars = []\n",
+    "        strand = align_results[key].strand\n",
     "\n",
     "        for j in range(len(dat)):\n",
     "            if \"members\" not in dat[j][\"pre_mapped\"]:\n",
@@ -95,8 +120,16 @@
     "                seq_post = dat[j][\"post_mapped\"][\"vrs_ref_allele_seq\"]\n",
     "                seq_pre_rv = str(Seq(seq_pre).reverse_complement())\n",
     "\n",
-    "                if seq_pre != seq_post and seq_post != seq_pre_rv:\n",
-    "                    diff_vars.append(j)\n",
+    "                if seq_type == \"protein\":\n",
+    "                    if seq_pre != seq_post:\n",
+    "                        diff_vars.append(j)\n",
+    "                else:\n",
+    "                    if strand == Strand.POSITIVE:\n",
+    "                        if seq_pre != seq_post:\n",
+    "                            diff_vars.append(j)\n",
+    "                    else:\n",
+    "                        if seq_post != seq_pre_rv:\n",
+    "                            diff_vars.append(j)\n",
     "\n",
     "            else:\n",
     "                for k in range(len(dat[j][\"pre_mapped\"][\"members\"])):\n",
@@ -105,8 +138,16 @@
     "                    seq_post = dat[j][\"post_mapped\"][\"members\"][k][\"vrs_ref_allele_seq\"]\n",
     "                    seq_pre_rv = str(Seq(seq_pre).reverse_complement())\n",
     "\n",
-    "                    if seq_pre != seq_post and seq_post != seq_pre_rv:\n",
-    "                        diff_vars.append(j)\n",
+    "                    if seq_type == \"protein\":\n",
+    "                        if seq_pre != seq_post:\n",
+    "                            diff_vars.append(j)\n",
+    "                    else:\n",
+    "                        if strand == Strand.POSITIVE:\n",
+    "                            if seq_pre != seq_post:\n",
+    "                                diff_vars.append(j)\n",
+    "                        else:\n",
+    "                            if seq_post != seq_pre_rv:\n",
+    "                                diff_vars.append(j)\n",
     "\n",
     "        diff_vars_dict[key] = diff_vars\n",
     "        var_count_dict_new[key] = var_count\n",
@@ -123,7 +164,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "id": "240e02be",
    "metadata": {},
    "outputs": [],
@@ -142,7 +183,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "id": "1c5c6167",
    "metadata": {},
    "outputs": [
@@ -182,7 +223,7 @@
        " 'score': 0.0836691871353702}"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -205,17 +246,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "de7a5de0",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'There are 24878 instances of reference mismatch. This corresponds to a percentage of 0.996 (24878/2499044)'"
+       "'There are 34832 instances of reference mismatch. This corresponds to a percentage of 1.394 (34832/2499044)'"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -239,7 +280,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "id": "6a22205e",
    "metadata": {},
    "outputs": [
@@ -249,7 +290,7 @@
        "'The number of unique pre-mapped MAVE variants is 363294. The number of unique post-mapped MAVE variants is 349972.'"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -293,7 +334,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "id": "3eb301ed",
    "metadata": {},
    "outputs": [],
@@ -340,7 +381,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "id": "fc87cbbe",
    "metadata": {},
    "outputs": [
@@ -350,7 +391,7 @@
        "'There are 349972 pre-mapped MAVE variants in the dictionary. 9553 post-mapped MAVE variants have 2 or more corresponding pre-mapped MAVE variants.'"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }

diff --git a/notebooks/analysis/mavedb_scoreset_breakdown.ipynb b/notebooks/analysis/mavedb_scoreset_breakdown.ipynb
@@ -485,7 +485,7 @@
     {
      "data": {
       "text/plain": [
-       "'The number of protein coding score sets with gene symbols/aliases is: 125'"
+       "'The number of protein coding score sets with gene symbols/aliases is: 126'"
       ]
      },
      "execution_count": 13,
@@ -495,7 +495,7 @@
    ],
    "source": [
     "td = list(coding_ss[\"target\"])\n",
-    "td = [x for x in td if \" \" not in x]\n",
+    "td = [x for x in td if \" \" not in x or x == \"Glycophorin A\"]\n",
     "f\"The number of protein coding score sets with gene symbols/aliases is: {len(td)}\""
    ]
   },
@@ -516,7 +516,7 @@
     {
      "data": {
       "text/plain": [
-       "'The number of protein coding score sets with descriptive targets is: 43'"
+       "'The number of protein coding score sets with descriptive targets is: 42'"
       ]
      },
      "execution_count": 14,
@@ -526,7 +526,7 @@
    ],
    "source": [
     "td = list(coding_ss[\"target\"])\n",
-    "td = [x for x in td if \" \" in x]\n",
+    "td = [x for x in td if \" \" in x and x != \"Glycophorin A\"]\n",
     "f\"The number of protein coding score sets with descriptive targets is: {len(td)}\""
    ]
   },
@@ -606,6 +606,97 @@
     "            unique_genes.append(comp[0])\n",
     "f\"The number of unique gene symbols across examined score set targets is: {len(set(unique_genes))}\""
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "4ef61ec4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'ACE2',\n",
+       " 'AICDA',\n",
+       " 'ALDOB',\n",
+       " 'Aβ42',\n",
+       " 'BCL11A',\n",
+       " 'BRAF',\n",
+       " 'BRCA1',\n",
+       " 'CALM1',\n",
+       " 'CBS',\n",
+       " 'CCR5',\n",
+       " 'CD86',\n",
+       " 'CXCR4',\n",
+       " 'CYP2C19',\n",
+       " 'CYP2C9',\n",
+       " 'DLG4',\n",
+       " 'ECR11',\n",
+       " 'ERBB2',\n",
+       " 'F9',\n",
+       " 'FOXE1',\n",
+       " 'GABBR1',\n",
+       " 'GCK',\n",
+       " 'GDI1',\n",
+       " 'GP1BB',\n",
+       " 'Glycophorin',\n",
+       " 'HBB',\n",
+       " 'HBG1',\n",
+       " 'HMBS',\n",
+       " 'HMGCR',\n",
+       " 'HNF4A',\n",
+       " 'IGHG1',\n",
+       " 'IRF4',\n",
+       " 'IRF6',\n",
+       " 'KCNQ1',\n",
+       " 'KCNQ4',\n",
+       " 'LDLR',\n",
+       " 'LDLRAP1',\n",
+       " 'MAPK1',\n",
+       " 'MPL',\n",
+       " 'MSH2',\n",
+       " 'MSMB',\n",
+       " 'MTHFR',\n",
+       " 'MYC',\n",
+       " 'NCS1',\n",
+       " 'NUDT15',\n",
+       " 'PARP1',\n",
+       " 'PKLR',\n",
+       " 'PSAT1',\n",
+       " 'PTEN',\n",
+       " 'RET',\n",
+       " 'RHO',\n",
+       " 'Ras',\n",
+       " 'SCN5A',\n",
+       " 'SORT1',\n",
+       " 'SRC',\n",
+       " 'SUMO1',\n",
+       " 'TARDBP',\n",
+       " 'TCF7L2',\n",
+       " 'TECR',\n",
+       " 'TERT',\n",
+       " 'TP53',\n",
+       " 'TPK1',\n",
+       " 'TPMT',\n",
+       " 'UBE2I',\n",
+       " 'UC88',\n",
+       " 'VKORC1',\n",
+       " 'WT1',\n",
+       " 'YAP1',\n",
+       " 'ZFAND3',\n",
+       " 'ZHX2',\n",
+       " 'ZRS',\n",
+       " 'alpha-synuclein'}"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "set(unique_genes)"
+   ]
   }
  ],
  "metadata": {