From 42569cb2955a30a98b9e68515598da5f51f0dd05 Mon Sep 17 00:00:00 2001 From: Anastasia Rizzo <55031920+arizzogithub@users.noreply.github.com> Date: Sat, 4 Apr 2020 01:56:53 +0200 Subject: [PATCH] #3 Traversal of the space of train test splits, eeg.csv --- ... space of train test splits, eeg.csv.ipynb | 1224 +++++++++++++++++ 1 file changed, 1224 insertions(+) create mode 100644 dev/AnastasiaRizzo/#3 Traversal of the space of train test splits, eeg.csv.ipynb diff --git a/dev/AnastasiaRizzo/#3 Traversal of the space of train test splits, eeg.csv.ipynb b/dev/AnastasiaRizzo/#3 Traversal of the space of train test splits, eeg.csv.ipynb new file mode 100644 index 000000000..5329bce54 --- /dev/null +++ b/dev/AnastasiaRizzo/#3 Traversal of the space of train test splits, eeg.csv.ipynb @@ -0,0 +1,1224 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Traversal of the space of train/test splits #3 / eeg.csv dataset\n", + "### Please note:\n", + " The K-Nearest Neighbours model (in its default state and with the best hyperparameter {'n_neighbors': 1}) will be applied for this issue #3. This decision was made because this model was the leader in issue #2." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### * Import, Read data from 'eeg.csv' file\n", + "#### * Info, Head, Missing Data \n", + "#### * Observation of target variable 'Class' (visualisation: countplot)\n", + "#### * Create datasets for ML \n", + "#### * Scaler\n", + "#### * Applying 'test_size' attribute splitting from 10-90 % for Train and Test ('random_state' = 0)\n", + "#### * Finding the best 'random_state' attribute number from 10-90 for Train and Test ('test_size'=0.3)\n", + "#### * 'Train/Test' splitting method with new attributes\n", + "#### * Scalers\n", + "#### * Conclusion\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import " + ] + }, + { + "cell_type": "code", + "execution_count": 458, + "metadata": {}, + "outputs": [], + "source": [ + "# import standard libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "%matplotlib inline\n", + "\n", + "import sklearn.metrics as metrics\n", + "from sklearn.metrics import (accuracy_score, \n", + " f1_score,\n", + " precision_score, \n", + " average_precision_score, \n", + " recall_score\n", + " )\n", + "from sklearn.model_selection import (train_test_split, \n", + " GridSearchCV\n", + " )\n", + "\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "%run train_test_split.py\n", + "%run scalers.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read data from 'eeg.csv' file + Info" + ] + }, + { + "cell_type": "code", + "execution_count": 459, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 14976 entries, 0 to 14975\n", + "Data columns (total 15 columns):\n", + "V1 14976 non-null float64\n", + "V2 14976 non-null float64\n", + "V3 14976 non-null float64\n", + "V4 14976 non-null float64\n", + "V5 14976 non-null float64\n", + "V6 14976 non-null float64\n", + "V7 14976 non-null float64\n", + "V8 14976 non-null float64\n", + "V9 14976 non-null float64\n", + "V10 14976 non-null float64\n", + "V11 14976 non-null float64\n", + "V12 14976 non-null float64\n", + "V13 14976 non-null float64\n", + "V14 
14976 non-null float64\n", + "Class 14976 non-null int64\n", + "dtypes: float64(14), int64(1)\n", + "memory usage: 1.7 MB\n" + ] + } + ], + "source": [ + "# read data from 'eeg.csv' file\n", + "dataset = pd.read_csv('eeg.csv') \n", + "dataset.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset consists of 14976 rows and 15 columns.\n", + "\n", + "It has 2 datatypes: float64 (14) and int64 (1)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Head" + ] + }, + { + "cell_type": "code", + "execution_count": 460, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
V1V2V3V4V5V6V7V8V9V10V11V12V13V14Class
04329.234009.234289.234148.214350.264586.154096.924641.034222.054238.464211.284280.514635.904393.851
14324.624004.624293.854148.724342.054586.674097.444638.974210.774226.674207.694279.494632.824384.101
24327.694006.674295.384156.414336.924583.594096.924630.264207.694222.054206.674282.054628.724389.231
34328.724011.794296.414155.904343.594582.564097.444630.774217.444235.384210.774287.694632.314396.411
44326.154011.794292.314151.284347.694586.674095.904627.694210.774244.104212.824288.214632.824398.461
\n", + "
" + ], + "text/plain": [ + " V1 V2 V3 V4 V5 V6 V7 V8 \\\n", + "0 4329.23 4009.23 4289.23 4148.21 4350.26 4586.15 4096.92 4641.03 \n", + "1 4324.62 4004.62 4293.85 4148.72 4342.05 4586.67 4097.44 4638.97 \n", + "2 4327.69 4006.67 4295.38 4156.41 4336.92 4583.59 4096.92 4630.26 \n", + "3 4328.72 4011.79 4296.41 4155.90 4343.59 4582.56 4097.44 4630.77 \n", + "4 4326.15 4011.79 4292.31 4151.28 4347.69 4586.67 4095.90 4627.69 \n", + "\n", + " V9 V10 V11 V12 V13 V14 Class \n", + "0 4222.05 4238.46 4211.28 4280.51 4635.90 4393.85 1 \n", + "1 4210.77 4226.67 4207.69 4279.49 4632.82 4384.10 1 \n", + "2 4207.69 4222.05 4206.67 4282.05 4628.72 4389.23 1 \n", + "3 4217.44 4235.38 4210.77 4287.69 4632.31 4396.41 1 \n", + "4 4210.77 4244.10 4212.82 4288.21 4632.82 4398.46 1 " + ] + }, + "execution_count": 460, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# preview of the first 5 lines of the loaded data \n", + "dataset.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Missing Data" + ] + }, + { + "cell_type": "code", + "execution_count": 461, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "V1 0\n", + "V2 0\n", + "V3 0\n", + "V4 0\n", + "V5 0\n", + "V6 0\n", + "V7 0\n", + "V8 0\n", + "V9 0\n", + "V10 0\n", + "V11 0\n", + "V12 0\n", + "V13 0\n", + "V14 0\n", + "Class 0\n", + "dtype: int64" + ] + }, + "execution_count": 461, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check for missing values\n", + "dataset.isnull().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Dataset has no missing values." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Observation of target variable 'Class' (visualisation: countplot)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Note:__ In 'Class' column, __'1'__ indicates the __eye-open__, __'2'__ the __eye-closed state__ (based on OpenML dataset description at https://www.openml.org/d/1471)." + ] + }, + { + "cell_type": "code", + "execution_count": 462, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 8254\n", + "2 6722\n", + "Name: Class, dtype: int64" + ] + }, + "execution_count": 462, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check for number of '1' and '2' in 'Class' column\n", + "dataset['Class'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the __output__, we can observe that there are __8254 eye-open state__ and __6722 eye-closed state__ cases in the dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 463, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1(eye-open state): 55.11 % of the dataset\n", + "2(eye-closed state): 44.89 % of the dataset\n" + ] + } + ], + "source": [ + "# present '1' and '2' states in %\n", + "print('1(eye-open state):', round(dataset['Class'].value_counts()[1] / len(dataset) * 100, 2), '% of the dataset')\n", + "print('2(eye-closed state):', round(dataset['Class'].value_counts()[2] / len(dataset) * 100, 2), '% of the dataset')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 464, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Class Distributions (1 || 2)')" + ] + }, + "execution_count": 464, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "[... base64-encoded PNG of the 'Class Distributions (1 || 2)' countplot omitted ...]\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# visualisation of '1' and '2' states.\n", + "colors = [\"mediumseagreen\", \"royalblue\"]\n", + "sns.countplot('Class', data = dataset, palette = colors)\n", + "plt.title('Class Distributions (1 || 2)', fontsize = 14)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create datasets for ML " + ] + }, + { + "cell_type": "code", + "execution_count": 465, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
V1V2V3V4V5V6V7V8V9V10V11V12V13V14
04329.234009.234289.234148.214350.264586.154096.924641.034222.054238.464211.284280.514635.904393.85
14324.624004.624293.854148.724342.054586.674097.444638.974210.774226.674207.694279.494632.824384.10
24327.694006.674295.384156.414336.924583.594096.924630.264207.694222.054206.674282.054628.724389.23
34328.724011.794296.414155.904343.594582.564097.444630.774217.444235.384210.774287.694632.314396.41
44326.154011.794292.314151.284347.694586.674095.904627.694210.774244.104212.824288.214632.824398.46
\n", + "
" + ], + "text/plain": [ + " V1 V2 V3 V4 V5 V6 V7 V8 \\\n", + "0 4329.23 4009.23 4289.23 4148.21 4350.26 4586.15 4096.92 4641.03 \n", + "1 4324.62 4004.62 4293.85 4148.72 4342.05 4586.67 4097.44 4638.97 \n", + "2 4327.69 4006.67 4295.38 4156.41 4336.92 4583.59 4096.92 4630.26 \n", + "3 4328.72 4011.79 4296.41 4155.90 4343.59 4582.56 4097.44 4630.77 \n", + "4 4326.15 4011.79 4292.31 4151.28 4347.69 4586.67 4095.90 4627.69 \n", + "\n", + " V9 V10 V11 V12 V13 V14 \n", + "0 4222.05 4238.46 4211.28 4280.51 4635.90 4393.85 \n", + "1 4210.77 4226.67 4207.69 4279.49 4632.82 4384.10 \n", + "2 4207.69 4222.05 4206.67 4282.05 4628.72 4389.23 \n", + "3 4217.44 4235.38 4210.77 4287.69 4632.31 4396.41 \n", + "4 4210.77 4244.10 4212.82 4288.21 4632.82 4398.46 " + ] + }, + "execution_count": 465, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# set 'X' for features and 'y' for the target ('Class').\n", + "X = dataset.drop('Class', axis=1)\n", + "y = dataset['Class']\n", + "X.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scaler" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Note:__ Usually, at this stage I am applying __scaler__ for data. This important process needs to be done before building and training ML models since many ML algorithms don't perform well if the features are not on relatively similar scales. \n", + "However, I always asked myself 'At this stage, how am I supposed to know which scaler fits data in its best way?'. Therefore, I will apply scaler later.\n", + "\n", + "For experiment reason, __K-Nearest Neighbours model (in its default state and with hyper parameter {'n_neighbors': 1})__ will be applied. The following steps will be done:\n", + "\n", + " Step 1. The best 'test_size' attribute split percentage in 'Train\\Test splitting method' will be found. \n", + " Step 2. The best 'random_state' attribute number in 'Train\\Test splitting method' will be found. \n", + " Step 3. The best Scaler for KNN accuracy (with the best hyper parameters) will be found and applyed.\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Applying 'test_size' attribute splitting from 10-90 % for Train and Test ('random_state' = 0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This method will show two kinds of measurment: Accuracy and F1 score. F1 score was choosen instead of Precision and Recall metrics since it is the harmonic mean of them and gives a better measure of the incorrectly classified cases than the accuracy metric." + ] + }, + { + "cell_type": "code", + "execution_count": 466, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
X_trainX_testAccuracyAccuracy with hyper parametersF1_scoreF1_score with hyper parameters
190.010.00.9739650.9833110.9759410.984596
280.020.00.9709610.9799730.9731900.981504
370.030.00.9637210.9728470.9666870.974887
460.040.00.9562680.9681190.9601340.970800
550.050.00.9479170.9592680.9526580.962882
640.060.00.9405740.9537060.9461690.957963
730.070.00.9282720.9404810.9353950.946262
820.080.00.9108590.9265500.9201080.933795
910.090.00.8678690.8901250.8819980.900437
\n", + "
" + ], + "text/plain": [ + " X_train X_test Accuracy Accuracy with hyper parameters F1_score \\\n", + "1 90.0 10.0 0.973965 0.983311 0.975941 \n", + "2 80.0 20.0 0.970961 0.979973 0.973190 \n", + "3 70.0 30.0 0.963721 0.972847 0.966687 \n", + "4 60.0 40.0 0.956268 0.968119 0.960134 \n", + "5 50.0 50.0 0.947917 0.959268 0.952658 \n", + "6 40.0 60.0 0.940574 0.953706 0.946169 \n", + "7 30.0 70.0 0.928272 0.940481 0.935395 \n", + "8 20.0 80.0 0.910859 0.926550 0.920108 \n", + "9 10.0 90.0 0.867869 0.890125 0.881998 \n", + "\n", + " F1_score with hyper parameters \n", + "1 0.984596 \n", + "2 0.981504 \n", + "3 0.974887 \n", + "4 0.970800 \n", + "5 0.962882 \n", + "6 0.957963 \n", + "7 0.946262 \n", + "8 0.933795 \n", + "9 0.900437 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "split_train_test (X, y, split_size, columns1, table, index)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results above show, that the best splitting percentage is '90/10'. However, I always remember one rule:\n", + "'The more training data you have, the better your model will be. The more testing data you have, the less variance you can expect in your results (ie. accuracy, false positive rate, etc.)'.\n", + "\n", + "Therefore, __'70/30'__ splitting percentage with:\n", + "\n", + " KNN Accuracy: 0.963721\n", + " KNN Accuracy with hyper parameters: 0.972847\n", + " KNN F1_score: 0.966687\n", + " KNN F1_score with hyper parameters: 0.974887\n", + "\n", + "will be choosen for future testing.\n", + "\n", + "Next step will be applying 'random_state' attribute number from 10-90 for Train and Test with 'test_size'= 0.3." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Finding the best number of 'random_state' attribute from 10-90 for Train and Test ('test_size'=0.3)" + ] + }, + { + "cell_type": "code", + "execution_count": 467, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Random StateAccuracyAccuracy with hyper parametersF1_scoreF1_score with hyper parameters
10.010.00.9574890.9710660.9609490.973306
20.020.00.9672820.9730690.9702970.975441
30.030.00.9612730.9735140.9646200.975788
40.042.00.9655020.9737370.9682310.975780
50.050.00.9601600.9706210.9642500.973547
60.060.00.9683950.9795240.9714170.981429
70.070.00.9646120.9768530.9682440.979183
80.080.00.9601600.9719560.9632970.974159
90.090.00.9632760.9739590.9664970.976108
\n", + "
" + ], + "text/plain": [ + " Random State Accuracy Accuracy with hyper parameters F1_score \\\n", + "1 0.0 10.0 0.957489 0.971066 0.960949 \n", + "2 0.0 20.0 0.967282 0.973069 0.970297 \n", + "3 0.0 30.0 0.961273 0.973514 0.964620 \n", + "4 0.0 42.0 0.965502 0.973737 0.968231 \n", + "5 0.0 50.0 0.960160 0.970621 0.964250 \n", + "6 0.0 60.0 0.968395 0.979524 0.971417 \n", + "7 0.0 70.0 0.964612 0.976853 0.968244 \n", + "8 0.0 80.0 0.960160 0.971956 0.963297 \n", + "9 0.0 90.0 0.963276 0.973959 0.966497 \n", + "\n", + " F1_score with hyper parameters \n", + "1 0.973306 \n", + "2 0.975441 \n", + "3 0.975788 \n", + "4 0.975780 \n", + "5 0.973547 \n", + "6 0.981429 \n", + "7 0.979183 \n", + "8 0.974159 \n", + "9 0.976108 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "random_state (X, y, random1, columns2, table1, index)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results above depict, that the best __'random_state' number = 60__ with:\n", + "\n", + " KNN Accuracy: 0.968395\n", + " KNN Accuracy with hyper parameters: 0.979524\n", + " KNN F1_score: 0.971417\n", + " KNN F1_score with hyper parameters: 0.981429" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At this stage, our Train\\Test splitting parameters already found:\n", + " __X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=60)__\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 'Train\\Test' splitting method with new attributes" + ] + }, + { + "cell_type": "code", + "execution_count": 468, + "metadata": {}, + "outputs": [], + "source": [ + "# apply 'Train\\Test' splitting method\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=60)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scalers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Tables above show, that the best __Accuracy__ and __F1 score__ always belong to __KNN with hyper parameter {'n_neighbors': 1}__. Therefore, in order to find the best scaler to improve measurements, Scalers will be applied to it." + ] + }, + { + "cell_type": "code", + "execution_count": 469, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelScalerAccuracy
0K-Nearest Neighbour:StandardScaler(copy=True, with_mean=True, with...0.968395
1K-Nearest Neighbour:MinMaxScaler(copy=True, feature_range=(0, 1))0.969953
2K-Nearest Neighbour:MaxAbsScaler(copy=True)0.980191
3K-Nearest Neighbour:RobustScaler(copy=True, quantile_range=(25.0, ...0.971956
4K-Nearest Neighbour:QuantileTransformer(copy=True, ignore_implicit...0.971289
5K-Nearest Neighbour:QuantileTransformer(copy=True, ignore_implicit...0.963944
6K-Nearest Neighbour:PowerTransformer(copy=True, method='yeo-johnso...0.445805
7K-Nearest Neighbour:Normalizer(copy=True, norm='l2')0.980414
\n", + "
" + ], + "text/plain": [ + " Model Scaler \\\n", + "0 K-Nearest Neighbour: StandardScaler(copy=True, with_mean=True, with... \n", + "1 K-Nearest Neighbour: MinMaxScaler(copy=True, feature_range=(0, 1)) \n", + "2 K-Nearest Neighbour: MaxAbsScaler(copy=True) \n", + "3 K-Nearest Neighbour: RobustScaler(copy=True, quantile_range=(25.0, ... \n", + "4 K-Nearest Neighbour: QuantileTransformer(copy=True, ignore_implicit... \n", + "5 K-Nearest Neighbour: QuantileTransformer(copy=True, ignore_implicit... \n", + "6 K-Nearest Neighbour: PowerTransformer(copy=True, method='yeo-johnso... \n", + "7 K-Nearest Neighbour: Normalizer(copy=True, norm='l2') \n", + "\n", + " Accuracy \n", + "0 0.968395 \n", + "1 0.969953 \n", + "2 0.980191 \n", + "3 0.971956 \n", + "4 0.971289 \n", + "5 0.963944 \n", + "6 0.445805 \n", + "7 0.980414 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "scalers (X_train, y_train, X_test, y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results above depict, that the best scaler is __Normalizer__ with __Accuracy = 0.980414__." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " __No scaling, after 'test_size' 70/30:__\n", + " \n", + " KNN Accuracy: 0.963721\n", + " \n", + " KNN Accuracy with hyper parameter {'n_neighbors': 1}: __0.972847__\n", + " \n", + " __No scaling, after 'random_state' 60:__\n", + " \n", + " KNN Accuracy: 0.968395\n", + " \n", + " KNN Accuracy with hyper parameter {'n_neighbors': 1}: __0.979524__\n", + " \n", + " __Scaler 'Normalizer' applied:__\n", + " \n", + " KNN Accuracy with hyper parameter {'n_neighbors': 1}: __0.980414__\n", + " \n", + "Dataset was tuned and improved for future calculations __from 0.963721 to 0.980414.__" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Anastasia Rizzo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}