Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Train test ratio #43

Merged
merged 7 commits into from
Mar 20, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ venv/
ENV/
env.bak/
venv.bak/
.vs
.vscode

# Spyder project settings
.spyderproject
Expand Down
56 changes: 56 additions & 0 deletions dev/tab1tha/k_nn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import numpy as np

def k_nearest(data_train, target_train, data_test, target_test):
    '''Train a k-NN classifier (k=5) and evaluate it on the test split.

    Parameters: data_train/target_train are the features and labels used to
    fit the model; data_test/target_test are the held-out features and labels.

    Returns a tuple (acc, target_pred): the mean accuracy on the test set and
    the array of predicted labels for data_test.
    '''
    # Note: the scikit-learn spelling is 'neighbors' (no 'u').
    # Instantiate a k-NN classifier: knn
    knn = KNeighborsClassifier(n_neighbors=5)

    # Fit the classifier to the training data
    knn.fit(data_train, target_train)

    # Predict labels of the test data. Pass data_test as-is (not
    # data_test.values) so fit/predict/score all see the same feature
    # names and sklearn does not emit feature-name warnings.
    target_pred = knn.predict(data_test)

    # Compute mean accuracy on the test set and return it with the predictions
    acc = knn.score(data_test, target_test)
    return (acc, target_pred)

def visual_compare(data_train, target_train, data_test, target_test):
    '''Plot train vs. test accuracy of k-NN for n_neighbors = 1..8.

    The gap between the two curves makes the variation between predicted
    and actual behaviour visible, so the user can pick the n_neighbors
    value of KNeighborsClassifier that yields maximum prediction accuracy.
    '''
    # Candidate neighbor counts and one accuracy slot per candidate.
    neighbor_counts = np.arange(1, 9)
    train_scores = np.empty(len(neighbor_counts))
    test_scores = np.empty(len(neighbor_counts))

    # Train and score one classifier per candidate k.
    for idx, n_neigh in enumerate(neighbor_counts):
        model = KNeighborsClassifier(n_neighbors=n_neigh)
        model.fit(data_train, target_train)
        # Accuracy on the data the model was fitted on...
        train_scores[idx] = model.score(data_train, target_train)
        # ...and on the held-out test data.
        test_scores[idx] = model.score(data_test, target_test)

    # Render both accuracy curves on a single figure.
    plt.title('k-NN: Varying Number of Neighbors')
    plt.plot(neighbor_counts, test_scores, label = 'Testing Accuracy')
    plt.plot(neighbor_counts, train_scores, label = 'Training Accuracy')
    plt.legend()
    plt.xlabel('Number of Neighbors')
    plt.ylabel('Accuracy')
    plt.show()
10 changes: 10 additions & 0 deletions dev/tab1tha/load_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import pandas as pd
from pathlib import Path

def load_dataset(filename, datasets_dir='../../datasets/'):
    '''Read a CSV dataset file into a dataframe irrespective of
    the operating system. It returns a pandas dataframe.

    Parameters: filename is the name of the CSV file; datasets_dir is the
    directory containing it, defaulting to the repository-relative
    '../../datasets/' location this project uses (backward compatible).
    '''
    # pathlib handles path-separator differences between operating systems.
    datasets = Path(datasets_dir)
    fil = datasets / filename
    df = pd.read_csv(fil)
    return df
31 changes: 31 additions & 0 deletions dev/tab1tha/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pandas as pd
import sklearn
import seaborn as sns
import k_nn
from sklearn.model_selection import train_test_split

# Import data into a pandas dataframe. Forward slashes work on every OS;
# the previous '..\..\datasets\generated.csv' was Windows-only and contained
# invalid string escape sequences ('\d', '\g').
df = pd.read_csv('../../datasets/generated.csv')

#EDA exploratory analysis of the data
#Exploration
print('*'*50+'HEAD'+'*'*50)
print(df.head())
print('*'*50+'DESCRIBE'+'*'*50)
print(df.describe())
print('*'*50+'INFO'+'*'*50)
print(df.info())
print('*'*100)

#todo visualisation

# Separate features from the 'label' target column.
data = df.drop('label', axis = 1)
target = df['label']
#split data set into train and test sets
data_train, data_test, target_train, target_test = train_test_split(data,target, test_size = 0.30, random_state = 10, stratify = target)

# Evaluation of the performance of the K-nearest neighbors prediction model.
# k_nearest returns a (accuracy, predictions) tuple, so unpack it instead of
# printing the whole tuple as the "accuracy".
kn_accuracy, target_pred = k_nn.k_nearest(data_train, target_train, data_test, target_test)
print('The accuracy of the k nearest neighbours algorithm on this dataset is {}'.format(kn_accuracy))
print('The behaviour of the predicted values with respect to the actual values is as shown below:')
k_nn.visual_compare(data_train, target_train, data_test, target_test)
32 changes: 32 additions & 0 deletions dev/tab1tha/test_size_vary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

# Import necessary modules
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import k_nn

def vary_size(X,y):
    '''Evaluate k-NN performance as the train-to-test size ratio changes.

    Returns three values: test_ratio and accuracy, parallel lists holding
    each test_size fraction tried and the accuracy obtained with it, and
    performance, a string of per-split classification reports.
    '''
    accuracy = []
    test_ratio = []
    performance = "\n PERFORMANCE \n "
    #starting the range iterator from zero or ending at 101 raises a ValueError:
    # The test_size = 0 (1.0) should be greater or equal to the number of classes = 4
    # Separator string demarcates results of one iteration from the other;
    # it is loop-invariant, so build it once.
    separator = ("+"*100) + " \n"
    for pct in range(5, 100, 5):
        size = pct / 100
        # Split the data set into train and test sets at this ratio.
        split = train_test_split(X, y, test_size=size, random_state=10, stratify=y)
        data_train, data_test, target_train, target_test = split
        # Evaluate the K-nearest neighbors model on this split.
        kn_accuracy, target_pred = k_nn.k_nearest(data_train, target_train, data_test, target_test)
        # Classification report gives precision, recall, f1_score and support.
        report = classification_report(target_test, target_pred)
        performance = performance + separator + report + " \n"
        # Record this iteration's results.
        accuracy.append(kn_accuracy)
        test_ratio.append(size)

    return (test_ratio,accuracy,performance)
270 changes: 270 additions & 0 deletions dev/tab1tha/train_test.ipynb

Large diffs are not rendered by default.

Loading