Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Train test ratio #43

Merged
merged 7 commits into from
Mar 20, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ venv/
ENV/
env.bak/
venv.bak/
.vs
.vscode

# Spyder project settings
.spyderproject
Expand Down
56 changes: 56 additions & 0 deletions dev/tab1tha/k_nn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import numpy as np

def k_nearest(data_train, target_train, data_test, target_test):
    '''Train a k-NN classifier (k=5) and evaluate it on the test split.

    Parameters: data_train/target_train are the features and labels used to
    fit the model; data_test/target_test are the held-out features and labels.

    Returns a tuple (acc, target_pred): the mean accuracy on the test set and
    the array of predicted labels for data_test.
    '''
    # Note: the scikit-learn spelling is 'neighbors' (no 'u').
    # Instantiate a k-NN classifier: knn
    knn = KNeighborsClassifier(n_neighbors=5)

    # Fit the classifier to the training data
    knn.fit(data_train, target_train)

    # Predict labels of the test data. Pass data_test as-is (not
    # data_test.values) so fit/predict/score all see the same feature
    # names and sklearn does not emit feature-name warnings.
    target_pred = knn.predict(data_test)

    # Compute mean accuracy on the test set and return it with the predictions
    acc = knn.score(data_test, target_test)
    return (acc, target_pred)

def visual_compare(data_train, target_train, data_test, target_test):
    '''Plot train vs. test accuracy of k-NN for n_neighbors = 1..8.

    The gap between the two curves makes the variation between predicted
    and actual behaviour visible, so the user can pick the n_neighbors
    value of KNeighborsClassifier that yields maximum prediction accuracy.
    '''
    # Candidate neighbor counts and one accuracy slot per candidate.
    neighbor_counts = np.arange(1, 9)
    train_scores = np.empty(len(neighbor_counts))
    test_scores = np.empty(len(neighbor_counts))

    # Train and score one classifier per candidate k.
    for idx, n_neigh in enumerate(neighbor_counts):
        model = KNeighborsClassifier(n_neighbors=n_neigh)
        model.fit(data_train, target_train)
        # Accuracy on the data the model was fitted on...
        train_scores[idx] = model.score(data_train, target_train)
        # ...and on the held-out test data.
        test_scores[idx] = model.score(data_test, target_test)

    # Render both accuracy curves on a single figure.
    plt.title('k-NN: Varying Number of Neighbors')
    plt.plot(neighbor_counts, test_scores, label = 'Testing Accuracy')
    plt.plot(neighbor_counts, train_scores, label = 'Training Accuracy')
    plt.legend()
    plt.xlabel('Number of Neighbors')
    plt.ylabel('Accuracy')
    plt.show()
10 changes: 10 additions & 0 deletions dev/tab1tha/load_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import pandas as pd
from pathlib import Path

def load_dataset(filename, datasets_dir='../../datasets/'):
    '''Read a CSV dataset file into a dataframe irrespective of
    the operating system. It returns a pandas dataframe.

    Parameters: filename is the name of the CSV file; datasets_dir is the
    directory containing it, defaulting to the repository-relative
    '../../datasets/' location this project uses (backward compatible).
    '''
    # pathlib handles path-separator differences between operating systems.
    datasets = Path(datasets_dir)
    fil = datasets / filename
    df = pd.read_csv(fil)
    return df
31 changes: 31 additions & 0 deletions dev/tab1tha/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pandas as pd
import sklearn
import seaborn as sns
import k_nn
from sklearn.model_selection import train_test_split

# Import data into a pandas dataframe. Forward slashes work on every OS;
# the previous '..\..\datasets\generated.csv' was Windows-only and contained
# invalid string escape sequences ('\d', '\g').
df = pd.read_csv('../../datasets/generated.csv')

#EDA exploratory analysis of the data
#Exploration
print('*'*50+'HEAD'+'*'*50)
print(df.head())
print('*'*50+'DESCRIBE'+'*'*50)
print(df.describe())
print('*'*50+'INFO'+'*'*50)
print(df.info())
print('*'*100)

#todo visualisation

# Separate features from the 'label' target column.
data = df.drop('label', axis = 1)
target = df['label']
#split data set into train and test sets
data_train, data_test, target_train, target_test = train_test_split(data,target, test_size = 0.30, random_state = 10, stratify = target)

# Evaluation of the performance of the K-nearest neighbors prediction model.
# k_nearest returns a (accuracy, predictions) tuple, so unpack it instead of
# printing the whole tuple as the "accuracy".
kn_accuracy, target_pred = k_nn.k_nearest(data_train, target_train, data_test, target_test)
print('The accuracy of the k nearest neighbours algorithm on this dataset is {}'.format(kn_accuracy))
print('The behaviour of the predicted values with respect to the actual values is as shown below:')
k_nn.visual_compare(data_train, target_train, data_test, target_test)
32 changes: 32 additions & 0 deletions dev/tab1tha/test_size_vary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

# Import necessary modules
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import k_nn

def vary_size(X,y):
    '''Evaluate k-NN performance as the train-to-test size ratio changes.

    Returns three values: test_ratio and accuracy, parallel lists holding
    each test_size fraction tried and the accuracy obtained with it, and
    performance, a string of per-split classification reports.
    '''
    accuracy = []
    test_ratio = []
    performance = "\n PERFORMANCE \n "
    #starting the range iterator from zero or ending at 101 raises a ValueError:
    # The test_size = 0 (1.0) should be greater or equal to the number of classes = 4
    # Separator string demarcates results of one iteration from the other;
    # it is loop-invariant, so build it once.
    separator = ("+"*100) + " \n"
    for pct in range(5, 100, 5):
        size = pct / 100
        # Split the data set into train and test sets at this ratio.
        split = train_test_split(X, y, test_size=size, random_state=10, stratify=y)
        data_train, data_test, target_train, target_test = split
        # Evaluate the K-nearest neighbors model on this split.
        kn_accuracy, target_pred = k_nn.k_nearest(data_train, target_train, data_test, target_test)
        # Classification report gives precision, recall, f1_score and support.
        report = classification_report(target_test, target_pred)
        performance = performance + separator + report + " \n"
        # Record this iteration's results.
        accuracy.append(kn_accuracy)
        test_ratio.append(size)

    return (test_ratio,accuracy,performance)
270 changes: 270 additions & 0 deletions dev/tab1tha/train_test.ipynb

Large diffs are not rendered by default.

Loading