Skip to content

Commit

Permalink
Train test ratio (#43)
Browse files Browse the repository at this point in the history
* KNN model trained and tested on generated.csv dataset

* Effect of split ratio on performance

* Hyperparameter tuning and cross validation implemented

* Black formatting applied

Co-authored-by: mlopatka <[email protected]>
  • Loading branch information
tab1tha and mlopatka authored Mar 20, 2020
1 parent 089104f commit dde72c1
Show file tree
Hide file tree
Showing 4 changed files with 479 additions and 1 deletion.
5 changes: 4 additions & 1 deletion dev/tab1tha/k_nn.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,12 @@ def k_nearest(data_train, target_train, data_test, target_test):
for use as a parameter of the classifier."""
knn = KNeighborsClassifier(n_neighbors=n)
knn.fit(data_train, target_train)

# predict labels of the test data
target_pred = knn.predict(data_test.values)
# The held out test set is used to evaluate the accuracy of the model
acc = knn.score(data_test, target_test)
return acc
return (acc, target_pred)


def visual_compare(data_train, target_train, data_test, target_test):
Expand Down
36 changes: 36 additions & 0 deletions dev/tab1tha/test_size_vary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Import necessary modules
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import k_nn


def vary_size(X, y):
    """Evaluate the K-nearest-neighbours model over a range of test ratios.

    The data set is re-split once for every test-size fraction from 0.05 to
    0.95 (in steps of 0.05) using a stratified ``train_test_split`` with a
    fixed ``random_state`` of 10. Each split is handed to ``k_nn.k_nearest``
    and a classification report is generated from its predictions.

    Args:
        X: Feature data, forwarded to ``train_test_split``.
        y: Target labels, forwarded to ``train_test_split`` and also used
            as the ``stratify`` argument.

    Returns:
        A tuple ``(test_ratio, accuracy, performance)``:

        * ``test_ratio`` -- list of the test-size fractions that were used.
        * ``accuracy`` -- list of the accuracy values returned by
          ``k_nn.k_nearest``, in the same order as ``test_ratio``.
        * ``performance`` -- one string that starts with a "PERFORMANCE"
          header and contains the classification report of every
          iteration, each preceded by a separator line of ``+`` characters.
    """
    accuracy = []
    test_ratio = []
    # Report fragments are collected in a list and joined once at the end
    # instead of growing a string with repeated concatenation.
    report_parts = ["\n PERFORMANCE \n "]
    # The separator string demarcates the results of one iteration from the
    # next; it never changes, so it is built once outside the loop.
    separator = ("+" * 100) + " \n"

    # NOTE (from the original author): starting the range at zero or ending
    # it at 101 raises a ValueError:
    #   "The test_size = 0 (1.0) should be greater or equal to the number of
    #   classes = 4"
    # so the percentages run from 5 to 95 inclusive.
    for i in range(5, 100, 5):
        size = i / 100
        # Split the data set into train and test sets.
        data_train, data_test, target_train, target_test = train_test_split(
            X, y, test_size=size, random_state=10, stratify=y
        )
        # Evaluate the performance of the K-nearest neighbors model.
        kn_accuracy, target_pred = k_nn.k_nearest(
            data_train, target_train, data_test, target_test
        )
        # The classification report exposes precision, recall, f1-score and
        # support for this split.
        class_report = classification_report(target_test, target_pred)
        report_parts.append(separator + class_report + " \n")
        # Update the result lists.
        accuracy.append(kn_accuracy)
        test_ratio.append(size)

    performance = "".join(report_parts)
    return (test_ratio, accuracy, performance)
Loading

0 comments on commit dde72c1

Please sign in to comment.