Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First attempt on vehicle dataset with a random forest classifier #13

Merged
merged 9 commits into from
Mar 16, 2020
Merged
81 changes: 81 additions & 0 deletions dev/Sidrah-Madiha/allcustommodules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from math import sqrt
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
from sklearn.metrics import confusion_matrix, classification_report



def data_stats(dataset):
    """Print a quick summary of the dataset.

    Shows the dataset's shape, its column names, and the distinct
    categories found in the 'Class' column. Returns nothing; output
    goes to stdout.
    """
    categories = dataset.Class.unique()
    print("=========== SOME STATS of Dataset ===========")
    print('Shape of the dataset: ' + str(dataset.shape) + "\n")
    print('List of attribute columns', list(dataset.columns))
    print("\n")
    print('List of Categories ', categories, "\n")


def tokenize_target_column(dataset):
    """Replace the 'Class' column in-place with integer codes.

    Uses pandas.factorize, so codes are assigned in order of first
    appearance (first distinct label becomes 0, the next 1, ...).

    Returns the Index of original labels, where position i is the
    label that was encoded as integer i.
    """
    codes, definitions = pd.factorize(dataset['Class'])
    dataset.Class = codes
    print("Updated tokenize 'Class' column - first 5 values")
    print(dataset.Class.head())
    print("Distinct Tokens used for converting Class column to integers")
    print(definitions)
    return definitions
def train_data_test_data_split(dataset):
    """Split the dataset into train/test feature matrices and target vectors.

    Assumes the last column is the (already tokenized) target and all
    preceding columns are features.

    Returns:
        X_train, X_test, y_train, y_test — numpy arrays from
        sklearn's train_test_split.
    """
    X = dataset.iloc[:, 0:-1].values
    y = dataset.iloc[:, -1].values
    # Hold out 20% of rows for testing; the fixed random_state makes
    # the shuffle (and thus the split) reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=21
    )
    return X_train, X_test, y_train, y_test

def train(X_train, y_train):
    """Fit a random-forest classifier on the training data.

    The forest uses 10 entropy-split trees and a fixed random_state
    so training is reproducible. Returns the fitted classifier.
    """
    model = RandomForestClassifier(
        n_estimators=10, criterion='entropy', random_state=42
    )
    model.fit(X_train, y_train)
    return model

def test(classifier, X_test):
    """Run the fitted classifier on the test features.

    Returns the predicted class codes for X_test.
    """
    predictions = classifier.predict(X_test)
    return predictions

def untokenizing_testdata_prediction(y_test, y_pred, definitions):
    """Map integer class codes back to their original labels.

    Builds the inverse of the code->label mapping produced by
    tokenize_target_column and applies it element-wise.

    Args:
        y_test: array of true class codes.
        y_pred: array of predicted class codes.
        definitions: sequence where position i holds the label for code i.

    Returns:
        (y_test, y_pred) as arrays with codes replaced by labels.
    """
    # Use len(definitions) rather than a hard-coded 4 so this works for
    # any number of classes, not just the 4-class vehicle dataset.
    reversefactor = dict(zip(range(len(definitions)), definitions))
    y_test = np.vectorize(reversefactor.get)(y_test)
    y_pred = np.vectorize(reversefactor.get)(y_pred)
    return y_test, y_pred


def create_confusion_matrix_class_report(y_test, y_pred):
    """Plot a labelled confusion-matrix heatmap and print a classification report.

    Expects y_test and y_pred to already be un-tokenized string labels
    (see untokenizing_testdata_prediction). Rows of the matrix are the
    true categories and columns the predicted ones, per sklearn's
    confusion_matrix convention.
    """
    labels = ["van", "saab", "bus", "opel"]
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)

    sn.heatmap(df_cm, annot=True, fmt='d')
    # confusion_matrix puts true labels on rows (heatmap y-axis) and
    # predictions on columns (x-axis); the original code had these two
    # axis labels swapped.
    plt.xlabel('Predicted Vehicle Category')
    plt.ylabel('Real Vehicle Category')
    print("============== Summary of all evaluation metrics ===============")
    print(classification_report(y_test, y_pred))
    print("====================== Confusion Matrix=====================")


Loading