diff --git a/notebooks/KO_cutoff.ipynb b/notebooks/KO_cutoff.ipynb index 58d8f2d..a04f5b6 100644 --- a/notebooks/KO_cutoff.ipynb +++ b/notebooks/KO_cutoff.ipynb @@ -59,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "9c114cd3-6225-4e6b-bc56-89648ac691dd", "metadata": {}, "outputs": [ @@ -67,25 +67,46 @@ "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/b5/2grxct1x69395r8j6vkk07bc0000gp/T/ipykernel_16216/4184021818.py:2: DtypeWarning: Columns (50,60,152,178) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/var/folders/b5/2grxct1x69395r8j6vkk07bc0000gp/T/ipykernel_48061/2367602586.py:2: DtypeWarning: Columns (50,60,152,178) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = pd.read_csv(f,sep='\\t')\n" ] } ], "source": [ - "with open('results/gisaid.washington_ko_meta.tsv','r') as f:\n", + "with open('wa_results/gisaid.washington_ko_meta.tsv','r') as f:\n", " data = pd.read_csv(f,sep='\\t')" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, + "id": "bdabee2f-ac40-43ee-8624-44fa4c93f9ac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(149535, 179)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "id": "2be48910-ba6a-416f-a6eb-0c05a7aadca4", "metadata": {}, "outputs": [], "source": [ "## Get reference protien lengths\n", - "ref = SeqIO.read('sars2_ref.gb','gb')\n", + "ref = SeqIO.read('params/sars2_ref.gb','gb')\n", "\n", "proteins = {}\n", "for feature in ref.features:\n", @@ -101,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "84b6a554-43e5-4836-aef1-115089f7521b", "metadata": {}, "outputs": [], @@ -125,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "id": "4342a513-26cd-47f8-92fa-9571cd257134", "metadata": {}, "outputs": [], @@ -133,6 +154,146 @@ "cutoffs = generate_cutoffs(data,proteins)" ] }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5d941aff-61e8-443a-b103-fd1d2c8dece6", + "metadata": {}, + "outputs": [], + "source": [ + "cutoffs.to_csv('figs/supplemental/S1_SourceData.tsv',sep='\\t',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "54e0b1bd-c3da-478b-aade-85ebee60a55f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | ko_cutoff | \n", + "n_ko | \n", + "gene | \n", + "
---|---|---|---|
0 | \n", + "1 | \n", + "85392 | \n", + "ORF1a | \n", + "
1 | \n", + "1 | \n", + "34128 | \n", + "ORF1b | \n", + "
2 | \n", + "1 | \n", + "111202 | \n", + "S | \n", + "
3 | \n", + "1 | \n", + "1963 | \n", + "ORF3a | \n", + "
4 | \n", + "1 | \n", + "1225 | \n", + "E | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "
2983 | \n", + "249 | \n", + "56 | \n", + "ORF7a | \n", + "
2984 | \n", + "249 | \n", + "80 | \n", + "ORF7b | \n", + "
2985 | \n", + "249 | \n", + "68 | \n", + "ORF8 | \n", + "
2986 | \n", + "249 | \n", + "85 | \n", + "N | \n", + "
2987 | \n", + "249 | \n", + "35 | \n", + "ORF9b | \n", + "
2988 rows × 3 columns
\n", + "