diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4a36d46..f031e81 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,8 +22,6 @@ jobs: cfg: - os: ubuntu-latest python-version: "3.7" - - os: ubuntu-latest - python-version: "3.9" - os: macos-latest python-version: "3.7" - os: windows-latest diff --git a/devtools/test_env.yaml b/devtools/test_env.yaml index f0e5637..f7e4019 100644 --- a/devtools/test_env.yaml +++ b/devtools/test_env.yaml @@ -4,13 +4,14 @@ channels: - defaults dependencies: # Base depends - - python>=3.7 + - python=3.7 - pip - openpyxl - tabulate - scikit-learn - rdkit<=2021.09.2 - kissim + - matplotlib-venn # Testing # Workaround for https://github.com/computationalmodelling/nbval/issues/153 - pytest 5.* diff --git a/devtools/user_env.yaml b/devtools/user_env.yaml index 91d686c..1ad2719 100644 --- a/devtools/user_env.yaml +++ b/devtools/user_env.yaml @@ -4,12 +4,13 @@ channels: - defaults dependencies: # Base depends - - python>=3.7 + - python=3.7 - pip - openpyxl - tabulate - scikit-learn - kissim + - matplotlib-venn # Testing # Workaround for https://github.com/computationalmodelling/nbval/issues/153 - pytest 5.* diff --git a/notebooks/004_fingerprints/006_literature_pocket_subsets.ipynb b/notebooks/004_fingerprints/006_literature_pocket_subsets.ipynb new file mode 100644 index 0000000..707388f --- /dev/null +++ b/notebooks/004_fingerprints/006_literature_pocket_subsets.ipynb @@ -0,0 +1,1119 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7fac40e2-3a90-45e0-8de6-eb32c4946452", + "metadata": {}, + "source": [ + "# Pocket subsets\n", + "\n", + "- Martin et al. 2011 \n", + " - https://doi.org/10.1021/ci200314j\n", + " - 16 residues\n", + " - Residue numbering based on PKA\n", + "- Bosc et al. 
2015\n", + " - https://doi.org/10.1021/acschembio.5b00555\n", + " - 29 residues\n", + " - Residue numbering based on ABL1" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "c375d9d0-6dfa-4da4-9936-68e810cc38ba", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from matplotlib_venn import venn2\n", + "from opencadd.databases.klifs import setup_remote" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "88793384-a983-4d89-8ac2-32836c255f91", + "metadata": {}, + "outputs": [], + "source": [ + "klifs = setup_remote()" + ] + }, + { + "cell_type": "markdown", + "id": "22fee186-d7cd-4182-9563-ff830b4b9954", + "metadata": {}, + "source": [ + "## Martin residues" + ] + }, + { + "cell_type": "markdown", + "id": "33112594-d549-4b31-9594-cc34af142a82", + "metadata": {}, + "source": [ + "### Residue definition in paper (UniProt numbering)\n", + "\n", + "Residues from paper --- UniProt numbering:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4366a5b9-fd4f-4838-bef5-e93074189ec5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "51 54 95 103 106 119 120 121 123 126 127 162 163 173 183 187\n" + ] + } + ], + "source": [ + "residues_letter_number_martin = [\n", + " \"M120\",\n", + " \"E121\",\n", + " \"V123\",\n", + " \"D127\",\n", + " \"L173\",\n", + " \"T183\",\n", + " \"T51\",\n", + " \"L103\",\n", + " \"V119\",\n", + " \"G126\",\n", + " \"I163\",\n", + " \"F54\",\n", + " \"L95\",\n", + " \"L106\",\n", + " \"F187\",\n", + " \"L162\",\n", + "]\n", + "residues_number_martin = [int(i[1:]) for i in residues_letter_number_martin]\n", + "residues_number_martin = sorted(residues_number_martin)\n", + "print(*residues_number_martin)" + ] + }, + { + "cell_type": "markdown", + "id": "2f688da3-1bd5-4647-b175-8dd7f5e0839f", + "metadata": {}, + "source": [ + "### Mapping to KLIFS residue IDs" + ] + }, + { + "cell_type": "markdown", + "id": 
"1edc00d6-a8a4-4c8f-8a2f-a364bbf18ff6", + "metadata": {}, + "source": [ + "Map Martin's residue numbers (PKA) to KLIFS residue IDs:\n", + "- Example structure: 1RE8\n", + "- https://klifs.net/details.php?structure_id=5923 " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "031bd184-761b-450c-8293-1756ec48065c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexresidue.klifs_idresidue.idresidue.klifs_region_idresidue.klifs_regionresidue.klifs_color
04551g.l.5g.lgreen
17854g.l.8g.lgreen
2272895αC.28αCred
33435103b.l.35b.lgreen
43738106IV.38IVkhaki
54344119V.44Vkhaki
64445120GK.45GKorange
74546121hinge.46hingemagenta
84748123hinge.48hingemagenta
95051126linker.51linkercyan
105152127linker.52linkercyan
116566162VI.66VIkhaki
126667163VI.67VIkhaki
137677173VII.77VIIkhaki
147980183xDFG.80xDFGcornflowerblue
158384187a.l.84a.lcornflowerblue
\n", + "
" + ], + "text/plain": [ + " index residue.klifs_id residue.id residue.klifs_region_id \\\n", + "0 4 5 51 g.l.5 \n", + "1 7 8 54 g.l.8 \n", + "2 27 28 95 αC.28 \n", + "3 34 35 103 b.l.35 \n", + "4 37 38 106 IV.38 \n", + "5 43 44 119 V.44 \n", + "6 44 45 120 GK.45 \n", + "7 45 46 121 hinge.46 \n", + "8 47 48 123 hinge.48 \n", + "9 50 51 126 linker.51 \n", + "10 51 52 127 linker.52 \n", + "11 65 66 162 VI.66 \n", + "12 66 67 163 VI.67 \n", + "13 76 77 173 VII.77 \n", + "14 79 80 183 xDFG.80 \n", + "15 83 84 187 a.l.84 \n", + "\n", + " residue.klifs_region residue.klifs_color \n", + "0 g.l green \n", + "1 g.l green \n", + "2 αC red \n", + "3 b.l green \n", + "4 IV khaki \n", + "5 V khaki \n", + "6 GK orange \n", + "7 hinge magenta \n", + "8 hinge magenta \n", + "9 linker cyan \n", + "10 linker cyan \n", + "11 VI khaki \n", + "12 VI khaki \n", + "13 VII khaki \n", + "14 xDFG cornflowerblue \n", + "15 a.l cornflowerblue " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pocket = klifs.pockets.by_structure_klifs_id(5923)\n", + "# Select paper residues\n", + "pocket_martin = pocket[pocket[\"residue.id\"].isin([str(i) for i in residues_number_martin])]\n", + "pocket_martin = pocket_martin.reset_index(drop=False)\n", + "pocket_martin" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "eeed6fd2-d1f9-4436-8a5e-8e1ce42e3346", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16\n", + "5 8 28 35 38 44 45 46 48 51 52 66 67 77 80 84\n" + ] + } + ], + "source": [ + "pocket_martin_klifs_ids = pocket_martin[\"residue.klifs_id\"].to_list()\n", + "print(len(pocket_martin_klifs_ids))\n", + "print(*pocket_martin_klifs_ids)" + ] + }, + { + "cell_type": "markdown", + "id": "c83b009c-7b37-4c88-b5b9-9b8e2c1335f5", + "metadata": {}, + "source": [ + "## Bosc" + ] + }, + { + "cell_type": "markdown", + "id": "7d459c6d-7555-4245-8285-d9ec7fc56908", + "metadata": {}, + 
"source": [ + "### Residue definition in paper (alignment numbering)\n", + "\n", + "Residues from paper --- NOT UniProt numbering but refers to numbering based on author's sequence alignment in Figure 4 of the paper's SI:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "32ee70a0-bdc5-4c5d-8ecd-3200b5904c43", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "29\n", + "114 118 119 187 192 226 239 295 296 325 329 331 365 402 409 414 426 450 492 493 577 606 622 665 675 778 785 799 802\n" + ] + } + ], + "source": [ + "residues_number_bosc = [\n", + " 409,\n", + " 325,\n", + " 414,\n", + " 187,\n", + " 426,\n", + " 296,\n", + " 492,\n", + " 226,\n", + " 778,\n", + " 192,\n", + " 295,\n", + " 785,\n", + " 118,\n", + " 675,\n", + " 329,\n", + " 802,\n", + " 577,\n", + " 119,\n", + " 622,\n", + " 402,\n", + " 493,\n", + " 114,\n", + " 239,\n", + " 799,\n", + " 365,\n", + " 331,\n", + " 450,\n", + " 665,\n", + " 606,\n", + "]\n", + "residues_number_bosc = sorted(residues_number_bosc)\n", + "print(len(residues_number_bosc))\n", + "print(*residues_number_bosc)" + ] + }, + { + "cell_type": "markdown", + "id": "48b2b8db-e433-44fe-89e3-e0835aff76e0", + "metadata": {}, + "source": [ + "### Mapping to UniProt numbering" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fd32398b-7916-4e71-8717-3cb62ffb9d7a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bosc_numberingresidue_nameuniprot_numbering
9393I242
9494T243
9595M244
9696K245
9797H246
............
800800I489
801801H490
802802Q491
803803A492
804804F493
\n", + "

252 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " bosc_numbering residue_name uniprot_numbering\n", + "93 93 I 242\n", + "94 94 T 243\n", + "95 95 M 244\n", + "96 96 K 245\n", + "97 97 H 246\n", + ".. ... ... ...\n", + "800 800 I 489\n", + "801 801 H 490\n", + "802 802 Q 491\n", + "803 803 A 492\n", + "804 804 F 493\n", + "\n", + "[252 rows x 3 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# flake8-noqa-cell\n", + "# Alignment taken from paper's SI (Figure 4)\n", + "bosc_numbering = \"\"\"- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n", + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - I T M K H - K L G G G Q Y G E - V Y E G V W K K Y S -\n", + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - L\n", + "T V A V K T L - K E D T M E - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - V E E F L K E A A V M K E I K -\n", + "- - - - - - - - - H P N L V - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Q L L G V C T R E P -\n", + "- - - - - - - - - - - - - - - - - - P F Y I - I T E F - M T Y G - - - - - - - - - - - - - - - - - - - - - - - - - N L L\n", + "D Y L R E - - - - - - - - - - - - - - - - - - - - - - C N R Q E V N A V V L L Y M A T Q I S S A M E Y L - E K - - - - -\n", + "- - - - K N F I H - - - - - - - - - - - - R D L A A R N C L V - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n", + "- - - - - - G E N H L V K V - - - - A D F G - - - - - - - - - - - - - - - - - - - - L S R L M T - - - - - - - - - - - -\n", + "- - - - - - - - - - - - - - - - - - - - - - - - - - G D T Y T A H A G - A K F P I K W T - - - - - - - - - - - - - - - -\n", + "- - - - A P E - - - - - - - - S L A Y N K F - S I - - - - - K - S D V W A F G V L L W E I - - - - - - - - - - - - - - -\n", + "- A T Y G M S - - - - - P Y - - - - - - - - - - - - - 
- P G I - - - - - - - - - - - - - - - - - - - - - - - - - D L S Q\n", + "V - - - - Y E L L E K - D Y R M E R P E G - C P E K V Y E L - - - - - - - - - - - - - - - - - - - - - - - - - - M R A C\n", + "W Q W N P S D - - - - - R - P S - F A E I H Q A F - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n", + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n", + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n", + "- - - - - - - - - - - - - - - - -\"\"\"\n", + "bosc_numbering = bosc_numbering.replace(\" \", \"\").replace(\"\\n\", \"\")\n", + "bosc_numbering = list(bosc_numbering)\n", + "\n", + "# Cast to DataFrame\n", + "residue_number_mapping = pd.DataFrame(bosc_numbering, columns=[\"residue_name\"])\n", + "residue_number_mapping.index.name = \"bosc_numbering\"\n", + "# Reset index to keep Bosc numbering\n", + "residue_number_mapping = residue_number_mapping.reset_index()\n", + "# Drop gaps\n", + "residue_number_mapping = residue_number_mapping[residue_number_mapping[\"residue_name\"] != \"-\"]\n", + "# NOTE: Manual step: Bosc's sequence starts in UniProt at residue 242\n", + "residue_number_mapping[\"uniprot_numbering\"] = list(range(242, 242 + len(residue_number_mapping)))\n", + "residue_number_mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "29c6104f-2f4f-48df-ae8c-b99542842daa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bosc_numberingresidue_nameuniprot_numbering
192192M278
\n", + "
" + ], + "text/plain": [ + " bosc_numbering residue_name uniprot_numbering\n", + "192 192 M 278" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "residue_number_mapping[residue_number_mapping[\"bosc_numbering\"] == 192]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d6e946c5-d0ca-4e3f-8b8e-27d8568bae61", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "23\n", + "261 265 278 282 306 307 316 319 321 345 352 356 359 371 378 379 400 409 437 474 481 488 491\n" + ] + } + ], + "source": [ + "# Keep only Bosc's selected pocket residues (see Table 2)\n", + "residue_number_mapping_selected = residue_number_mapping[\n", + " residue_number_mapping[\"bosc_numbering\"].isin(residues_number_bosc)\n", + "]\n", + "residue_number_mapping_selected = residue_number_mapping_selected.reset_index(drop=True)\n", + "# Get list of UniProt numbers\n", + "residues_number_bosc = residue_number_mapping_selected[\"uniprot_numbering\"].to_list()\n", + "print(len(residues_number_bosc))\n", + "print(*residues_number_bosc)" + ] + }, + { + "cell_type": "markdown", + "id": "e1c1cbc1-db7e-411c-81df-4201a2a2788e", + "metadata": {}, + "source": [ + "**Not all 29 residues can be found, since 6 of them are gaps in the ABL1 alignment --- no assignment to UniProt IDs and therefore no mapping to KLIFS residue IDs possible.**" + ] + }, + { + "cell_type": "markdown", + "id": "1bc93d88-4df2-41c1-9f5e-647dbeee5f2e", + "metadata": {}, + "source": [ + "### Mapping to KLIFS residue IDs" + ] + }, + { + "cell_type": "markdown", + "id": "a40a8821-aba7-4e46-919a-0add87bd2094", + "metadata": {}, + "source": [ + "Map Bosc's residue numbers (ABL1) to KLIFS IDs:\n", + "- Example structure: 2G2I\n", + "- https://klifs.net/details.php?structure_id=1111" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "fe7970f6-fb6f-45fa-a7f1-b97f6a4feeba", + "metadata": {}, + "outputs": [ 
+ { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
residue.klifs_idresidue.idresidue.klifs_region_idresidue.klifs_regionresidue.klifs_color
020282αC.20αCred
146316hinge.46hingemagenta
249319linker.49linkercyan
351321linker.51linkercyan
463356αE.63αEred
566359VI.66VIkhaki
678371VII.78VIIkhaki
779379VIII.79VIIIkhaki
\n", + "
" + ], + "text/plain": [ + " residue.klifs_id residue.id residue.klifs_region_id residue.klifs_region \\\n", + "0 20 282 αC.20 αC \n", + "1 46 316 hinge.46 hinge \n", + "2 49 319 linker.49 linker \n", + "3 51 321 linker.51 linker \n", + "4 63 356 αE.63 αE \n", + "5 66 359 VI.66 VI \n", + "6 78 371 VII.78 VII \n", + "7 79 379 VIII.79 VIII \n", + "\n", + " residue.klifs_color \n", + "0 red \n", + "1 magenta \n", + "2 cyan \n", + "3 cyan \n", + "4 red \n", + "5 khaki \n", + "6 khaki \n", + "7 khaki " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pocket = klifs.pockets.by_structure_klifs_id(1111)\n", + "pocket_bosc = pocket[pocket[\"residue.id\"].isin([str(i) for i in residues_number_bosc])]\n", + "pocket_bosc = pocket_bosc.reset_index(drop=True)\n", + "pocket_bosc" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c9c8883f-e83a-48e0-8840-61135b10aa0a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
residue.klifs_idresidue.idresidue.klifs_region_idresidue.klifs_regionresidue.klifs_colorbosc_numberingresidue_nameuniprot_numbering
020282αC.20αCred226E282
146316hinge.46hingemagenta325E316
249319linker.49linkercyan329T319
351321linker.51linkercyan331G321
463356αE.63αEred414K356
566359VI.66VIkhaki426F359
678371VII.78VIIkhaki450V371
779379VIII.79VIIIkhaki493V379
\n", + "
" + ], + "text/plain": [ + " residue.klifs_id residue.id residue.klifs_region_id residue.klifs_region \\\n", + "0 20 282 αC.20 αC \n", + "1 46 316 hinge.46 hinge \n", + "2 49 319 linker.49 linker \n", + "3 51 321 linker.51 linker \n", + "4 63 356 αE.63 αE \n", + "5 66 359 VI.66 VI \n", + "6 78 371 VII.78 VII \n", + "7 79 379 VIII.79 VIII \n", + "\n", + " residue.klifs_color bosc_numbering residue_name uniprot_numbering \n", + "0 red 226 E 282 \n", + "1 magenta 325 E 316 \n", + "2 cyan 329 T 319 \n", + "3 cyan 331 G 321 \n", + "4 red 414 K 356 \n", + "5 khaki 426 F 359 \n", + "6 khaki 450 V 371 \n", + "7 khaki 493 V 379 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.merge(\n", + " pocket_bosc.astype({\"residue.id\": int}),\n", + " residue_number_mapping,\n", + " left_on=\"residue.id\",\n", + " right_on=\"uniprot_numbering\",\n", + " how=\"left\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6960b67d-c679-4393-9da6-88728e195862", + "metadata": {}, + "source": [ + "Comments on mapping:\n", + "- Residue 325 is described in the paper as the gatekeeper (\"Interestingly, the residue at position 325 corresponds to the gatekeeper.\"), whereas the KLIFS mapping above assigns it to the hinge region (hinge.46)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "bf2c7c6c-0ea4-47d8-94f6-d74117f37efe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8\n", + "20 46 49 51 63 66 78 79\n" + ] + } + ], + "source": [ + "pocket_bosc_klifs_ids = pocket_bosc[\"residue.klifs_id\"].to_list()\n", + "print(len(pocket_bosc_klifs_ids))\n", + "print(*pocket_bosc_klifs_ids)" + ] + }, + { + "cell_type": "markdown", + "id": "aa3cfb59-244a-4ea2-b8dd-a6285d507d5b", + "metadata": {}, + "source": [ + "## Residue overlap between Martin and Bosc (KLIFS numbering)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "3bc2d98a-9b0e-4968-bfe0-7e7b5662759f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAS0AAADqCAYAAAD3eRNWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAb5UlEQVR4nO3deXBc13Xn8e/pxk6AAPdFBAWSojaKomRSCyXHkm3ZkneNpyYZjzNjJ5nETkopp1KlxFlqYCRll1OV+I9sk6rJMuPYyXjicRJFlseyJWulKMkiKZEyKVEU950ESSwNoLczf9yG1IZAEkv3u/e9dz5VXSJBAO9QAH687757zxVVxRhj4iLjuwBjjJkOCy1jTKxYaBljYsVCyxgTKxZaxphYsdAyxsSKhZYxJlYstIwxsWKhZYyJFQstY0ysWGgZY2LFQssYEysWWsaYWLHQMsbEioWWMSZWLLSMMbFioWWMiRULLWNMrFhoGWNixULLGBMrFlrGmFix0DLGxIqFljEmViy0jDGx0uC7ADMFIhlgDtBeeTUDTUBj5ZWtvDJAqepVrPp1HhgGcsAQMIKd1GtiyEIrJCLtwMLKay4uoDqANkBqfLUyIjlckA0AZ996qY7W+FrG1IzYP7aeiLQCS4BFvB1UrV5retswLsDOAKeA46gW/JZkjGOhFRV3i7cE6AZW4EIqLsq4ADsKHAZOoVr2W5JJKwutenKjqR5cUF2Bm39KgjxwBHgTOIhqyXM9JkUstGpNpBFYBVyFC6paz0WFpgDsB94Ajtrkvqk3C61aEVkOXIMLrLQ+4BjBjb72oHrWdzEmmSy0ZkMkC1wN3Ah0eq4mNMeBnbjbR/smMzVjoTUTIi3AusqrxXM1oRsAdgGv2RNIUwsWWtMhMhc3qrqa9N4CzlQe2A3sQHXMdzEmviy0psI9BdwIXIttfZqtPLAD2IVq0XMtJoYstC5FpAE3stpAcpYrhCIHbMNN2tuaLzNlFlqTERHck8BNuC00pn4GgBdQfdN3ISYeLLQmElkE3AXM911KyhwBnkF1wHchJmwWWuPcreAmYD3JXxAaqhLulvFlu2U0F2OhBS
CyDHgPttYqFGeBJ1E947sQE550h5bbcnMbcL3vUsw7lIHtwDZbnGqqpTe0RBYC9+D6VplwHQMeRzXnuxAThnSGlsg64HZct08TvhFccB31XYjxL12h5Sbb7wLW+C7FTJvibhdfstvFdEtPaIl0Ah/AljLE3THgh9YSOr3SEVqubcwHcYdBmPi7AHzP1nSlU/JDS2Q18F5s/ippRoDvo3rKdyEmWskOLZEbgM3YYtGkKgKPoXrQdyEmOskNLZFbgZt8l2HqToEtqL7quxATjeSFltvsfBeu55VJjxdQ3eG7CFN/yWpk5wLrvbhDJUy63IpIGdVXoryo9EkG94Bn/MTvDO6wjwJQ0F7NR1lPGiRrpCXyHlyjPpNeW1DdVatPJn3SASzALZWZj9ufOh5QTVy+KWQZGMU9OMgBg7gzJE8D57TXNoZPV3JCS+QO4AbfZZggPIPqT6b7QdInc3BnVC7k7ZCq5zKZEtCPC7DxIOvX3qT8UNZHMkJL5BbgZt9lmKA8jeruy72T9Ml83IG6PYRx6vcI7hzJN4HjFmDvFP/QErkJuNV3GSY4ils5v7/6jdInAizh7aAKecP8CHAA2IcF2FviHVpu4eg9vsswwSoCD6N6SvqkGTffeT3Q4besGRk/CHen9qZ7J0B8Q8u1lvk4SXsCampqcC4XNn6eQ3vbuI5kfK8obuS1Q3u133cxPsQztNyRXv8OaPddignTQCf53TdRPL2M1hNN5B/ooXEsk7jj3w4C27U3XVuZ4hdaIhngo8BS36WY8Ax1kN+5idLZpbRWv317G7n/1p3Yk5WO4sLrmO9CohDH0LoLd7yXMW8pC/r6enL7rqNVLzKi+vZ8hv/XIuZEXVuE9gNbtFeHfRdST/EKLZFrcQdQGPOW8/MZ23Yn5NppvtT7lUG/2E1+d9ul3y/mCsBLwK6kLlyNT2iJdAGfJBmTqaYGilnKr25k5PBq2pCpdfI4l6Xwy6vJJnB+a6IzwI+0V8/5LqTW4hFabh7rfsJY/GcCcGoZoy/fRnaslcbpfuyz7eS+ekVi57eqlXCjrpeTtMYrLqFlbWYMAKUM5e2bGT2xcnah89XljDzb8dOT9Ql2AnhUe5PRojr80BJZgluPZY38Um6smdLW91EY7KJltp8rJ5Q+txo935Ca6YZB4PtJWNsV9n29Oz3nbiywUm+4ncJT91GqRWABtCnZ3zlKoRafKyY6gE9In6z0XchshR1a7pbQjqpPuTOLGX3qPjJjbbXtuHD9KK33nidNh8A2AvdKn9zou5DZCPf2UKQD+FnsQIpUO9JD7uXbaLnY2qvZGspQ/OwaMil4mjjRHuCZOC6LCPkLtRkLrFTbcyPDOzbTVq/AAmgv0/DZ04zU6/MH7FrgQ9InsfsZCzO0RLpxbUNMSm3fTO6NddGsXr/vPK1L8qma3xp3BXBPpWV0bIRXrFuTdYfvMow/uzeQO9oT3TqqBsg8cJJiVNcLzJW4g2BiI7zQgvXY5HtqHVpNbt/10S/8vClH683DJGId0wyslT6503cRUxVWaIk0YotIU+vMYkZ33lKbJQ0z8WsnEVECfTJVd+ukTzb5LmIqwgotWAeJ3sxqLmKog/yLd9FYz0n3y1laoPl9A6mclB/3LumT9b6LuJxwQsstJI31+hEzM2PNlJ57P1Jq8P+0+D+e9V+DZ5tDX4AaTmi5UZa3WwPjRzFL+bn3U5zJxud6WFqg+ZahVI+2AO6WPgl2Q3kYoWWjrNR66d2MDnWGNSXw6TOp3zbWgjupPUhhhBZcB6nZcW8qjvSQO708vBYxa8ZouS7HmO86PLtC+mSD7yImE0po2cnQKTPWQnHnprBGWNU+fZbYbW+pg1ukT4LrYec/tERWEM9z6MwsvHQHhVJjuJPe63O0dI+R912HZxng/dInQcw3jvMfWu7W0KTIkR5y/UvCng7IgPz7/tSukq/WSWA7VPyGlkgbbhuBSYliA+VdG2vbYqZeNg/SnE3vYtNq10ifLP
ZdxDjfI61rAqjBRGjXRkaKTfHoFtqmZG8dSu3WnomCGW35CwwRwbXHMClxfh5jR1aF97TwUj503kZaFYulT9b6LgL8jnKWYxPwqbJrE+WpHvUVivU5WtpKlHzXEYhbQui/5TO0Vnu8tonYhS7y5xeGPfk+mQbIvH8g9Wu2xrUTwIMzP6Hlbg17vFzbePH6+vg+ibvngs27VrlZ+sTrnKSvL8YybAV8aoy0Ujy5PL5f79VjtCwqpLKz6WRacfuEvfEVWrbMIUX23sAYmXjNZU10+5CFVpXrfV7cQsvUVaGR0pGe+Hfv2DTku4KgdEifrPB18ehDS6QLmBv5dY0Xb17DaDmAPlmzde1IuPskPfG2XMnHSOsKD9c0HpQFPXB1Mn7Y25Ts2hF7ililR/rEywjaR2gt8XBN48GxKxkpNMdj9ftU3Doc3yegdZABrvZ14agt9XBN48GxoJv2Tt/Gofjf5taYl1vEaENLpB23QM0knIKeXZyMW8Nxq8Zoaixbn60qXdInkQ9Coh5p2SgrJfoXMRZyv6yZaIDMtaOp77E1UeQ7Wyy0TF0cX5nM/XpXjdpIa4LIH6xFHVrB9OQx9XVqeXIm4KutsUY1E82TPol0t0N0oeX2G86L7HrGm5FWirn2ZM1njbtyzPYhTmJZlBeL8gvQDsma4zCTO74yufM+ywphnM8YmEhvEaMMra4Ir2U8OrEi3vsML6VZydrm6XdYHuXFLLRMzV2YH48e8DN11agtMp2gM8oTqaMMrc4Ir2U8GWumVErAXsNLsSeIk4pstGUjLVNTwx3JH4UsKST39ncW5kd1oShDyzo7pMBgZzLXZ1WbV7TQmkRkO12iDK3Ydq40UzfYmfzTa+aWLLQmEdkhNdGElkgDttwhFYY7kv8D3VGy7+VJJCy0iH/nSjM1ufbk/0C3l22B6STaojpezELL1NRIWzK371RrVrLW7WFSkYy2LLRMzYw1U0pCa+Wp6LIDXCcTyWR8VP8qJjq0NsJndsP6Nhg8A30A98LHt8FNAtoOA9+E/7kZLviutZ5GWykR4dzlwBgNDzzCgyWlQZXs2gW81Hc3/xbFtecXKZ8OYUPPV/gKDYwCilDiQb7isZpIRlpRhVaiV0h/GrbMgx89CL8w/ra/gkdXwUMAn4L3/QZ89Hn4prciI1DORvvksL2J4tfu5WsL2xgbKZD93MM8+Mhedn14Lfvrfe0mDegp6Wf4GksI4bygSGI8qtvDRD9R+k3Y2w3D1W9bBW81MclBkxDQN3mdlCK+McwILGxzh02MFsmWlWxU32gNif9qzkgkeRLVSCvRoXUxH4D7t8LtzTDyGPyJ73rqTTPRB3O+hPziv/L7Q3kWrVvEEx+KYJQFkA0ltAT4Ol8AYC1PcT9Pe6wmktCKaqSVykfEP4B/GYQv3gnP/za813c99Vb2EFpNWfQbn+QP//Ij/PbxIXqeOhjNHrhsKLeHP88f8SBf5uf5U17nbp5jrcdqEhVaYXyBPfkCvPBjeJfvOupN1N+IenkHIys7ef3ZQ6yL4nrlUO4duisPd5YxyHJ2cIgej9VE8nNuoVUnD1e1lv5r2LAYTvisJwqZcrShtf8c7ccG3fawC6M0HjjPdSs6o/n/XAphymOQJs5XOsQO0sRJrmcJxzxWFMnatajmtBK9EG89/Nc34epRaG+HP/o5eGgLrP8sLBHQeXD27xL+5BAgE/HKpYMX6PzrbfyCQkYVuWYBP/7PN7IzimuX/EcWnGQu3+FXAVCydPM8d/Oqx4oi+TkX1QgGQSLXAHfV/0LGp/PzyT9zb7KXt4z7Yjejr7Yle/3hDGzRXt1V74tEdXtoZ5ikQGM+gFumiAxk0/lw6TJyUVwkqv/xYxFdx3jUOkwD5XTMX55LyXalaRqM4iI20jI1k1GkeSz5nUtLoENZC61JWGiZ+GkdTn5o5TK2WXoSRe3VSH7Oo7w9TMVtQ9q1Dyb/6zyYtdCaRCSjLIgqtNwjSpvXSoH2RPexcA
ayyV7CM0MJCy1n+PLvYuKufSD5T9XONyR/NDkDiQyt8xFey3jScSH5E9T9FlqTSWRopeDGwbQNJX/Zw4Gm9KxHm4bItqnZSMvUlIC05cj7rqOe3mhJ/mhymvLA6aguZqFlam7R8eQueyiDHmiOpkNnjJzQ3ij2AzoWWqbmlh1K7kikv4FCIZP8hw3TFGlniej+56sWsSeIqbDgFM3ZYjLXMh1uSu4ochYSGlpOZPe9xh8BmX8qmevy9jcn+yHDDIwBZ6O8YNShlfhGeMZZdth3BfWx1ybhJzoe5XwWWGiZOll6hGZC6aNeQ6+1WmhNEPk/T1GH1hmwOYE0aMqT7biQrFvEs1nypxvtyWGVIrAv6otGG1qqZWxeKzWWHE3WZPwrc+wf3Ane1F6NfE2ej0e3douYEivfoClJq+NfnGMr4SfY7eOiPkLruIdrGg/acjQuOsGI7zpqoQT60px09L+fonPaqyd9XNhXaNkwOyWu2ZmMieujTYzlrFtptT2+Lhx9aKmWgCORX9d40dVPc2d//DvXbp9jPbSqlIG9vi7uazvCQU/XNR6s3RX/ea3n223rTpX9UbVWnozP0Ir9N7KZmqVHaW0dim/nh6EMxV2tlZOcDcB2nxf3E1qqo9iEfKqs2R3fecwtHYyp2JPDin3aq/0+C/A55H3T47VNxFbuo7UxpseLPdJFg+8aAqHAj30X4Tu0bHIzJTKKrNkdv1vEkw3k97XYrWHFa9qr3jsQ+wstd4u439v1TeTW7Ka1bSheW3se74zn6LAOCsCLvosAvyMt8LSi1vghIDc957uKqSuD/r8uW1BasU17NYiFwn5DS/UYduBFqsw/Q/PSQ+R81zEVb7Qw2t9g81nAALDTdxHjfI+0wEZbqXPjizQ35MO/7Xqky5bl4Cbff6S9Gsz8cwih9TokqxuAubSmPNkbX6Dgu45LOZel8PhcWn3XEYCXfO0xvBj/oWUT8qm0/DCtS46Ee5v4z/Mp2NosTuB5Ielk/IeWE9z/GFN/N22lOcS1W8MZig930eK7Ds/ywONRt1KeijBCS/Ucttg0dRoLZDc9TVHKYa3X+14XY3ZMGE9prw75LmIyIX1htvkuwERvwWlaNjwfTheIvFD+p/mpH2Xt0V4NdhARTmip9gMHfJdhorfiAG1rd4Uxv/XEXEZS3jfrNLDFdxGXEk5oOTbaSqlrdtK2/IDf4CpA+ZsLU71l5xzwPe3V4OYZq4UVWqpnsF5bqXXzc7TOO+2vPfN35zGS4sWkg8AjPvtkTVVYoeU8j22kTiUBue0JmtsGo9+feCFL4RsLU7suKwd8V3t12HchUxFeaKmeJ6AtAyZaDUUydzxGtmk02sWnf7eIwlg6nxjmcSOsAd+FTFWoX6RtEMbErIleywgNdz4KLbloWtkcaGL0sU7aorhWYIq4OSyvTf2mK8zQUi0AW32XYfyZM0zjXY+QnVvnQzHKoH+6NJUr30dxI6ygtuhMRZihBaD6Bnawa6o1Fsi++1Ga67ndZ2s7o3vT1/+9H/hn7dVY/nyFG1rOs9ikfKplFLnladpW7aHmk8Q5ofTfl9BY688buEPAv2qvDvouZKbCDi3Vs8AO32UY/9ZtZ84NL5KjXLt2MX++lPz5dC1x2Al8X3s16A4blxOHL9g2YCWw0Hchxq+eN2ibM8Toj3+GxlLD7FatP9tO7um5qZl8LwPPaK96OxW6lkTD28T9TiLzgE9CqrdXmIqRVorb76DQv3hm66rOZSn8ymqyo+lY4jCAa+IXuwn3i4lHaAGIrAPu9F2GCceRHnK7NtJUbJr6HUMZ9Ivd5He3pWLyfRfwQujbcqYrPqEFIHIf7lbRGADyTZRevo38yRVTG3U91MXw/1jCnHrX5dkF4Gnt1WO+C6mHuIVWC+42sd13KSYsp5Yx+vJtZMdaL/408HATY7/eQ1MpuR1JS7iGmi9rrya2hXm8QgtAZAHwCeLxEMFEqJil/OpGRg6voo3MTwfTcIbiAz
1wpjGx3zcHgK1x2o4zU/ELLQCR1cA9vsswYRqeQ+G1DRSOd9OiGTIl0N/rZuzVtsQ191NgH7Bde/Wc72KiEs/QAhDZBLzLdxkmXCOtFF/bQL7vFkr/soAO3/XUUBl4DXcbmPiR1UTxDS0AkQ8CPb7LMEF7Rb7EduBaYB3xng8t4s4JfSUubWTqIe6h1Qh8HFjguxQTpP2o/mD8N9InAlwJXAV0Qyy28JSAI7jbwEPaq5F0vghZvEMLQKQV+BjQ5bkSE5YTwCPo5GuUpE+ywBW4kfqVEFQDwBJwGHdClQXVBPEPLQCRObgRV5LmLczMnQK+W2lxdFmVEdhiXIB1A/Mg0mURiuvPfho4igXVJSUjtABEOnDBlfSFg+bSTuMCa8Y/9JVR2DxgPm7qYUHl17V4+lgGzuPqPFP5b3/SVq3XU3JCC0CkC3erGNJQ30TnDC6w6tJjXvqkDZgLNFVejZP8NwMUql5jwAiuE+8IMJTkhZ9RSFZoAYjMBz5Kbf5VNPHRDzyMhn+ajJmd5IUWgEgn8GFsjistzuAm3S2wUiCZoQUg0gbch/XhSrrDwA+nOulu4i+5oQXj67g+iHu0bZJnD/AMqtaSO0WSHVoAIhngbtyCQpMcL6K63XcRJnrJD61xtlcxKcrAk6ju9V2I8SM9oQUgciXwXtzjaRM/g8BjqJ7yXYjxJ12hBeNPFj+AWyxo4uMA8MRsFo2aZEhfaAGIZHH95q/1XYq5rDLwPKo7fRdiwpDO0BonsgZ4N6TikIM4GsQtZzjtu5DpEJEDwBLcxucCsAX4vKoe9llXUqThCKWLU90H/B/cbnoTlj3A/41bYFX5mKq2A8uAk8Cfea4nMdIdWgCqI6j+EPgBbn+Y8WsQt3/wqSTMX6lbpf9t4HoAEekUka+LyGkROSgivy9uWQ4icpWIPCkiF0TkjIh8a/zziMg6EfmBiPSLyEkR+V0/fyP/ktrkf/pU9yNyDNgMXO27nBQq445tf+liPbDiSNzOjJ8Dtlbe9GdAJ7Aa1z3iUeA48DfAH1Z+P/6Ee1Plc3QAPwT+GNcQoJFKCKZRuue0LkbkCuB2rCNqVE7gVrb3+y6kFipzWgtx7ZHbcf297gV+ghvN36yqP6m87+eAT6nq3SLydWAU+ANVPVL1+T4F/Jaq3hzpXyRQdns4GdWjwHeAJ4Ahv8Uk2jng+6g+lJTAqnK/qnbhHvI8ADwJrMCNoA5Wvd9B3t5m9lu45oMviMirIvKLlbd349otGyy0Lk5VUX0d+BbwAhD7+ZWADOH+Qfg2qgcv876xpqolVf0O7kni7biniVdWvctKXLdSVPWEqv6yqi4HPgf8pYhchdsUvibaysNloXU5qiVUdwD/G3gF901nZmYUeA74Fqqvk4K5CXE+geuEugv3tPrLItIhbofGbwLfqLzvfxCRFZUPPYdrw1wCHgaWishviEhz5WNvi/wvEwgLralSHUV1K/APuJGXPWmcugHcWqV/RHUnmorOnf8mIkO4v/uXgc+o6qvArwPDuGU2z+C+n/628jG3AM9XPu4h4Auqul9VB3G7OD6Gm//bi5usTyWbiJ8p95h6LXAj7l9R807HcE8ED6VhVGWiYaFVCyLdwHW4+Ym0j16LuEnjnQmcXDcBsNCqJZEW3OhrLenqmFrGHSj6BnDQuoiaerLQqhd3MtBVwCqSe/t4AhdUb1p/dhMVC60oiLTj1uh049bkxLWfVx43ojoCHEZ12HM9JoUstKLmJvAX4wJsCe42MtQQy+NWcx+rvE7bhLrxzUIrBK4x4UJgUeW/PoJsCDj7Uy/VgYhrMOayLLRC5Sb12yuvjqpft+O2hoyfcnypp5Wlqlcetz4ohwuo4arXQL1OZTam1iy04s7dbmYrrwzjIZWgTgnGVLPQMsbEStoXQhpjYsZCyxgTKxZaxphYsdAyxsSKhZYxJlYstIwxsWKhFQMi8mkRedR3HcaEwNZpzULl1JXlwHJVPVP19h3ABmCVqh6Y5u
fsAfYDjWoLRI15Bxtpzd5+4FPjvxGR9UDrTD6RiNg5lMZchoXW7P098F+qfv8Z4OvjvxGRj4jIdhEZEJHDIvKlqj/rEREVkV8SkUPA48BTlT8+LyJDIrJZRD4rIs9UfZyKyOdFZK+InBORvxARqevf0phAWGjN3lZgrohcJyJZ3GnC36j682FcqHUBHwF+VUTun/A57sK1a74XeE/lbV2q2q6qz13kuh/FHYSwAfjZyscak3gWWrUxPtr6ALCHyjl2AKr6hKruVNWyqr4C/CMupKp9SVWHVXVkGtf8qqqeV9VDwI+Am2b1NzAmJmwOpTb+Hndbt4qqW0OAyvl0XwVuwLWSaQb+acLHH57BNU9U/TqHa1ljTOLZSKsG1J2SvB/4MPCdCX/8D7gz7LpVtRP4K9zR5z/1KS7ya2PMBBZatfNLwPv0nX3TO4B+VR0VkVuB/3SZz3Mad7rN6jrUaEzs2e1hjajqvov80a8BfyIifw48iTsWvesSnycnIl8GnhWRRuC+WtdqTJzZ4lJjTKzY7aExJlYstIwxsWKhZYyJFQstY0ysWGgZY2LFQssYEysWWsaYWLHQMsbEioWWMSZW/j+k+l+lyZPcOQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "martin_only = set(pocket_martin_klifs_ids) - set(pocket_bosc_klifs_ids)\n", + "bosc_only = set(pocket_bosc_klifs_ids) - set(pocket_martin_klifs_ids)\n", + "both = set(pocket_bosc_klifs_ids) & set(pocket_martin_klifs_ids)\n", + "venn2(subsets=(len(martin_only), len(bosc_only), len(both)), set_labels=(\"Martin\", \"Bosc\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/004_fingerprints/README.md b/notebooks/004_fingerprints/README.md index da07f20..6b631fd 100644 --- a/notebooks/004_fingerprints/README.md +++ b/notebooks/004_fingerprints/README.md @@ -53,6 +53,20 @@ We check the coverage and variability of fingerprint bit positions across all fi - Get top X bit positions with no/high standard deviation +## `006_literature_pocket_subsets.ipynb` + +Pocket subsets + +- Martin et al. 2011 + - https://doi.org/10.1021/ci200314j + - 16 residues + - Residue numbering based on PKA +- Bosc et al. 
2015 + - https://doi.org/10.1021/acschembio.5b00555 + - 29 residues + - Residue numbering based on ABL1 + + ## `999_fetch_sitealign_features.ipynb` SiteAlign features diff --git a/notebooks/008_application_grk/001_grk_structures_in_klifs.ipynb b/notebooks/008_application_grk/001_grk_structures_in_klifs.ipynb new file mode 100644 index 0000000..ffe7f25 --- /dev/null +++ b/notebooks/008_application_grk/001_grk_structures_in_klifs.ipynb @@ -0,0 +1,689 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "de774775-ab50-4ac6-8fd1-9edbf6bdb0a2", + "metadata": {}, + "source": [ + "# GRK structures in KLIFS" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "45f44548-22d2-45a1-8541-7200f2918477", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from opencadd.databases.klifs import setup_remote" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "58dfff58-5843-4d23-96f1-20a9ca6b7a4d", + "metadata": {}, + "outputs": [], + "source": [ + "klifs_session = setup_remote()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b0d78c63-0cdd-4bdf-b0cf-1b6042b2ce4b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
kinase.klifs_idkinase.klifs_namekinase.full_namekinase.gene_namekinase.uniprotspecies.klifs
016RHOKG protein-coupled receptor kinase 1GRK1Q15835Human
111BARK1adrenergic, beta, receptor kinase 1GRK2P25098Human
212BARK2adrenergic, beta, receptor kinase 2GRK3P35626Human
313GPRK4G protein-coupled receptor kinase 4GRK4P32298Human
414GPRK5G protein-coupled receptor kinase 5GRK5P34947Human
515GPRK6G protein-coupled receptor kinase 6GRK6P43250Human
617GPRK7G protein-coupled receptor kinase 7GRK7Q8WTQ7Human
\n", + "
" + ], + "text/plain": [ + " kinase.klifs_id kinase.klifs_name kinase.full_name \\\n", + "0 16 RHOK G protein-coupled receptor kinase 1 \n", + "1 11 BARK1 adrenergic, beta, receptor kinase 1 \n", + "2 12 BARK2 adrenergic, beta, receptor kinase 2 \n", + "3 13 GPRK4 G protein-coupled receptor kinase 4 \n", + "4 14 GPRK5 G protein-coupled receptor kinase 5 \n", + "5 15 GPRK6 G protein-coupled receptor kinase 6 \n", + "6 17 GPRK7 G protein-coupled receptor kinase 7 \n", + "\n", + " kinase.gene_name kinase.uniprot species.klifs \n", + "0 GRK1 Q15835 Human \n", + "1 GRK2 P25098 Human \n", + "2 GRK3 P35626 Human \n", + "3 GRK4 P32298 Human \n", + "4 GRK5 P34947 Human \n", + "5 GRK6 P43250 Human \n", + "6 GRK7 Q8WTQ7 Human " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kinases = klifs_session.kinases.all_kinases(families=\"GRK\", species=\"Human\")\n", + "kinases" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "189c2c0c-baeb-4ac7-8de2-1d4e3c940065", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16 11 12 13 14 15 17\n" + ] + } + ], + "source": [ + "kinase_klifs_ids = kinases[\"kinase.klifs_id\"].to_list()\n", + "print(*kinase_klifs_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0111493c-95ce-4ddc-8caa-b35d4c08ad19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
kinase.klifs_idkinase.klifs_namekinase.full_namekinase.gene_namekinase.familykinase.groupkinase.subfamilyspecies.klifskinase.uniprotkinase.iupharkinase.pocket
011BARK1adrenergic, beta, receptor kinase 1GRK2GRKAGCBARKHumanP250981466RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD...
112BARK2adrenergic, beta, receptor kinase 2GRK3GRKAGCBARKHumanP356261467RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMTYACFILD...
213GPRK4G protein-coupled receptor kinase 4GRK4GRKAGCGRKHumanP322981468RVLGKGGFGEVCAYACKKLMALNEKRILEKVQRFVVSLAYACLVLT...
314GPRK5G protein-coupled receptor kinase 5GRK5GRKAGCGRKHumanP349471469RVLGKGGFGEVCAYACKRLMALNEKQILEKVNQFVVNLAYACLVLT...
415GPRK6G protein-coupled receptor kinase 6GRK6GRKAGCGRKHumanP432501470RVLGKGGFGEVCAYACKKLMALNEKQILEKVNRFVVSLAYACLVLT...
516RHOKG protein-coupled receptor kinase 1GRK1GRKAGCGRKHumanQ158351465RVLGKGGFGEVSAYACKKLGAMVEKKILMKVHRFIVSLAYACLVMT...
617GPRK7G protein-coupled receptor kinase 7GRK7GRKAGCGRKHumanQ8WTQ71471RVLGKGGFGEVCAYACKKLMALLEKEILEKVSPFIVSLAYACLVMS...
\n", + "
" + ], + "text/plain": [ + " kinase.klifs_id kinase.klifs_name kinase.full_name \\\n", + "0 11 BARK1 adrenergic, beta, receptor kinase 1 \n", + "1 12 BARK2 adrenergic, beta, receptor kinase 2 \n", + "2 13 GPRK4 G protein-coupled receptor kinase 4 \n", + "3 14 GPRK5 G protein-coupled receptor kinase 5 \n", + "4 15 GPRK6 G protein-coupled receptor kinase 6 \n", + "5 16 RHOK G protein-coupled receptor kinase 1 \n", + "6 17 GPRK7 G protein-coupled receptor kinase 7 \n", + "\n", + " kinase.gene_name kinase.family kinase.group kinase.subfamily species.klifs \\\n", + "0 GRK2 GRK AGC BARK Human \n", + "1 GRK3 GRK AGC BARK Human \n", + "2 GRK4 GRK AGC GRK Human \n", + "3 GRK5 GRK AGC GRK Human \n", + "4 GRK6 GRK AGC GRK Human \n", + "5 GRK1 GRK AGC GRK Human \n", + "6 GRK7 GRK AGC GRK Human \n", + "\n", + " kinase.uniprot kinase.iuphar \\\n", + "0 P25098 1466 \n", + "1 P35626 1467 \n", + "2 P32298 1468 \n", + "3 P34947 1469 \n", + "4 P43250 1470 \n", + "5 Q15835 1465 \n", + "6 Q8WTQ7 1471 \n", + "\n", + " kinase.pocket \n", + "0 RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD... \n", + "1 RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMTYACFILD... \n", + "2 RVLGKGGFGEVCAYACKKLMALNEKRILEKVQRFVVSLAYACLVLT... \n", + "3 RVLGKGGFGEVCAYACKRLMALNEKQILEKVNQFVVNLAYACLVLT... \n", + "4 RVLGKGGFGEVCAYACKKLMALNEKQILEKVNRFVVSLAYACLVLT... \n", + "5 RVLGKGGFGEVSAYACKKLGAMVEKKILMKVHRFIVSLAYACLVMT... \n", + "6 RVLGKGGFGEVCAYACKKLMALLEKEILEKVSPFIVSLAYACLVMS... " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kinases = klifs_session.kinases.by_kinase_klifs_id(kinase_klifs_ids)\n", + "kinases" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e0e49df2-3d5c-43ce-b384-7a527569c4f2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of GRK structures: 41\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
structure.klifs_idstructure.pdb_idstructure.alternate_modelstructure.chainspecies.klifs_xkinase.klifs_idkinase.klifs_name_xkinase.namesstructure.pocketligand.expo_id...kinase.klifs_name_ykinase.full_namekinase.gene_namekinase.familykinase.groupkinase.subfamilyspecies.klifs_ykinase.uniprotkinase.iupharkinase.pocket
094405wg5BAHuman11BARK1<NA>RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD...ZSO...BARK1adrenergic, beta, receptor kinase 1GRK2GRKAGCBARKHumanP250981466RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD...
182565uvc-AHuman11BARK1<NA>RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD...8PV...BARK1adrenergic, beta, receptor kinase 1GRK2GRKAGCBARKHumanP250981466RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD...
294375wg5AAHuman11BARK1<NA>RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD...ZSO...BARK1adrenergic, beta, receptor kinase 1GRK2GRKAGCBARKHumanP250981466RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD...
394385wg4BAHuman11BARK1<NA>RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD...AFV...BARK1adrenergic, beta, receptor kinase 1GRK2GRKAGCBARKHumanP250981466RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD...
494395wg3-AHuman11BARK1<NA>RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD...AFM...BARK1adrenergic, beta, receptor kinase 1GRK2GRKAGCBARKHumanP250981466RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD...
\n", + "

5 rows × 54 columns

\n", + "
" + ], + "text/plain": [ + " structure.klifs_id structure.pdb_id structure.alternate_model \\\n", + "0 9440 5wg5 B \n", + "1 8256 5uvc - \n", + "2 9437 5wg5 A \n", + "3 9438 5wg4 B \n", + "4 9439 5wg3 - \n", + "\n", + " structure.chain species.klifs_x kinase.klifs_id kinase.klifs_name_x \\\n", + "0 A Human 11 BARK1 \n", + "1 A Human 11 BARK1 \n", + "2 A Human 11 BARK1 \n", + "3 A Human 11 BARK1 \n", + "4 A Human 11 BARK1 \n", + "\n", + " kinase.names structure.pocket \\\n", + "0 RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD... \n", + "1 RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD... \n", + "2 RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD... \n", + "3 RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD... \n", + "4 RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD... \n", + "\n", + " ligand.expo_id ... kinase.klifs_name_y \\\n", + "0 ZSO ... BARK1 \n", + "1 8PV ... BARK1 \n", + "2 ZSO ... BARK1 \n", + "3 AFV ... BARK1 \n", + "4 AFM ... BARK1 \n", + "\n", + " kinase.full_name kinase.gene_name kinase.family \\\n", + "0 adrenergic, beta, receptor kinase 1 GRK2 GRK \n", + "1 adrenergic, beta, receptor kinase 1 GRK2 GRK \n", + "2 adrenergic, beta, receptor kinase 1 GRK2 GRK \n", + "3 adrenergic, beta, receptor kinase 1 GRK2 GRK \n", + "4 adrenergic, beta, receptor kinase 1 GRK2 GRK \n", + "\n", + " kinase.group kinase.subfamily species.klifs_y kinase.uniprot \\\n", + "0 AGC BARK Human P25098 \n", + "1 AGC BARK Human P25098 \n", + "2 AGC BARK Human P25098 \n", + "3 AGC BARK Human P25098 \n", + "4 AGC BARK Human P25098 \n", + "\n", + " kinase.iuphar kinase.pocket \n", + "0 1466 RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD... \n", + "1 1466 RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD... \n", + "2 1466 RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD... \n", + "3 1466 RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD... \n", + "4 1466 RIIGRGGFGEVYGYAMKCLLALNERIMLSLVSPFIVCMSYASFILD... 
\n", + "\n", + "[5 rows x 54 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "structures = klifs_session.structures.by_kinase_klifs_id(kinase_klifs_ids)\n", + "structures = pd.merge(\n", + " structures.drop([\"kinase.family\", \"kinase.group\"], axis=1), kinases, on=[\"kinase.klifs_id\"]\n", + ")\n", + "print(f\"Number of GRK structures: {len(structures)}\")\n", + "structures.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c9ba7827-1796-4a15-835f-3dcfb3f7f4e0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of KLIFS structures per kinase\n" + ] + }, + { + "data": { + "text/plain": [ + "kinase.gene_name\n", + "GRK2 29\n", + "GRK4 2\n", + "GRK5 4\n", + "GRK6 6\n", + "dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Number of KLIFS structures per kinase\")\n", + "structures.groupby(\"kinase.gene_name\").size()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96309789-cd99-4e5a-ad39-628e2f5bd753", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/008_application_grk/README.md b/notebooks/008_application_grk/README.md new file mode 100644 index 0000000..0c0c958 --- /dev/null +++ b/notebooks/008_application_grk/README.md @@ -0,0 +1,3 @@ +## `001_grk_structures_in_klifs.ipynb` + +GRK structures in KLIFS