From beaa56d1a08009fbccf2470f0d3debc792b75e7d Mon Sep 17 00:00:00 2001 From: cassiawag Date: Fri, 15 Mar 2024 05:15:21 -0700 Subject: [PATCH] get total numbers --- notebooks/KO_cutoff.ipynb | 175 +++++++++++++++++++++++++++-- notebooks/intrahost_analysis.ipynb | 26 +++-- 2 files changed, 186 insertions(+), 15 deletions(-) diff --git a/notebooks/KO_cutoff.ipynb b/notebooks/KO_cutoff.ipynb index 58d8f2d..a04f5b6 100644 --- a/notebooks/KO_cutoff.ipynb +++ b/notebooks/KO_cutoff.ipynb @@ -59,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "9c114cd3-6225-4e6b-bc56-89648ac691dd", "metadata": {}, "outputs": [ @@ -67,25 +67,46 @@ "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/b5/2grxct1x69395r8j6vkk07bc0000gp/T/ipykernel_16216/4184021818.py:2: DtypeWarning: Columns (50,60,152,178) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/var/folders/b5/2grxct1x69395r8j6vkk07bc0000gp/T/ipykernel_48061/2367602586.py:2: DtypeWarning: Columns (50,60,152,178) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = pd.read_csv(f,sep='\\t')\n" ] } ], "source": [ - "with open('results/gisaid.washington_ko_meta.tsv','r') as f:\n", + "with open('wa_results/gisaid.washington_ko_meta.tsv','r') as f:\n", " data = pd.read_csv(f,sep='\\t')" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, + "id": "bdabee2f-ac40-43ee-8624-44fa4c93f9ac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(149535, 179)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "id": "2be48910-ba6a-416f-a6eb-0c05a7aadca4", "metadata": {}, "outputs": [], "source": [ "## Get reference protien lengths\n", - "ref = SeqIO.read('sars2_ref.gb','gb')\n", + "ref = SeqIO.read('params/sars2_ref.gb','gb')\n", "\n", "proteins = {}\n", "for feature in ref.features:\n", @@ -101,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "84b6a554-43e5-4836-aef1-115089f7521b", "metadata": {}, "outputs": [], @@ -125,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "id": "4342a513-26cd-47f8-92fa-9571cd257134", "metadata": {}, "outputs": [], @@ -133,6 +154,146 @@ "cutoffs = generate_cutoffs(data,proteins)" ] }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5d941aff-61e8-443a-b103-fd1d2c8dece6", + "metadata": {}, + "outputs": [], + "source": [ + "cutoffs.to_csv('figs/supplemental/S1_SourceData.tsv',sep='\\t',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "54e0b1bd-c3da-478b-aade-85ebee60a55f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ko_cutoffn_kogene
0185392ORF1a
1134128ORF1b
21111202S
311963ORF3a
411225E
............
298324956ORF7a
298424980ORF7b
298524968ORF8
298624985N
298724935ORF9b
\n", + "

2988 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " ko_cutoff n_ko gene\n", + "0 1 85392 ORF1a\n", + "1 1 34128 ORF1b\n", + "2 1 111202 S\n", + "3 1 1963 ORF3a\n", + "4 1 1225 E\n", + "... ... ... ...\n", + "2983 249 56 ORF7a\n", + "2984 249 80 ORF7b\n", + "2985 249 68 ORF8\n", + "2986 249 85 N\n", + "2987 249 35 ORF9b\n", + "\n", + "[2988 rows x 3 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cutoffs" + ] + }, { "cell_type": "code", "execution_count": 8, diff --git a/notebooks/intrahost_analysis.ipynb b/notebooks/intrahost_analysis.ipynb index 4645a56..0743133 100644 --- a/notebooks/intrahost_analysis.ipynb +++ b/notebooks/intrahost_analysis.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 56, "id": "959c0ddf-adf7-444a-ac39-49cc36a3e881", "metadata": {}, "outputs": [], @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 57, "id": "0b3a9712-bbda-485c-b648-88a8f5433ca0", "metadata": {}, "outputs": [], @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 58, "id": "530cd307-8aed-4e64-a008-caac81c16783", "metadata": {}, "outputs": [], @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 59, "id": "d1746b2b-7491-4ec5-8e2b-1a4cbd78f9b0", "metadata": {}, "outputs": [], @@ -149,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 60, "id": "990c049a-b9e4-41d1-90d9-6f3c1b47b0b3", "metadata": {}, "outputs": [], @@ -159,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 61, "id": "95fa91c0-a0ed-4233-8ccc-3e086399e620", "metadata": {}, "outputs": [ @@ -178,7 +178,17 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 66, + "id": "9cc76353-c78b-4ece-8298-646512e1a982", + "metadata": {}, + "outputs": [], + "source": [ + "qc[['gisaid_id']].to_csv('data/gisaid_intrahost.csv',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, "id": "be126dfe-5f22-4bbf-aa75-f7555bc65c81", "metadata": {}, "outputs": [], @@ -188,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 63, "id": "742af837-dff8-4809-be0f-3cd98c11c02c", "metadata": {}, "outputs": [],