Skip to content

Commit

Permalink
feature implementation for fix # 4 (#38)
Browse files Browse the repository at this point in the history
* First attempt on vehicle data with a random forest classifier

* minor changes

* Comparative model evaluation for vehicle dataset

* first attempt for implementing task 7

* fixes #8

* fixes #4, attempt 1

* updated misclassification graph and broke down functions

* fixed code formatting issues and removed extra file
  • Loading branch information
Sidrah-Madiha authored Mar 20, 2020
1 parent 1218953 commit 3a27b6a
Show file tree
Hide file tree
Showing 3 changed files with 1,100 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from math import sqrt
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from matplotlib.pyplot import figure
import seaborn as sns
from sklearn.svm import SVC


def data_stats(dataset):
    """Print a short overview of the dataset.

    Reports the frame's shape, the list of its column names and the
    distinct values found in its ``Class`` column. Nothing is returned.
    """
    print("=========== SOME STATS of Dataset ===========")
    print(f"Shape of the dataset: {dataset.shape}\n")
    print("List of attribute columns", list(dataset.columns))
    print("\n")
    categories = dataset.Class.unique()
    print("List of Categories ", categories, "\n")


def tokenize_target_column(dataset):
    """Replace the values of the ``Class`` column with integer codes, in place.

    The codes come from ``pd.factorize``. The unique original values that
    ``pd.factorize`` returns alongside the codes are printed and returned,
    so the codes can be mapped back to their labels later.
    """
    codes, definitions = pd.factorize(dataset["Class"])
    dataset.Class = codes
    print("Updated tokenize 'Class' column - first 5 values")
    print(dataset.Class.head())
    print("Distinct Tokens used for converting Class column to integers")
    print(definitions)
    return definitions


def train_data_test_data_split(dataset):
    """Split the dataset into training and test parts (80% / 20%).

    Features and labels are obtained from
    ``training_data_and_target_Label_split``; the split uses a fixed
    ``random_state`` of 21.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    features, labels = training_data_and_target_Label_split(dataset)
    return tuple(
        train_test_split(features, labels, test_size=0.2, random_state=21)
    )


def training_data_and_target_Label_split(dataset):
    """Separate the feature columns from the label column.

    The last column of ``dataset`` is taken as the target and every column
    before it as a feature. Both are returned through ``.values`` as
    ``(X, y)``.
    """
    feature_columns = dataset.iloc[:, :-1]
    target_column = dataset.iloc[:, -1]
    return feature_columns.values, target_column.values


def test(classifier, X_test):
""" testing model on test data"""
y_pred = classifier.predict(X_test)
return y_pred


def untokenizing_testdata_prediction(y_test, y_pred, definitions):
    """Convert integer-coded true and predicted labels back to label names.

    Args:
        y_test: array-like of integer codes for the true labels.
        y_pred: array-like of integer codes for the predicted labels.
        definitions: sequence of original labels in which position ``i``
            holds the label that was encoded as ``i`` (for example the
            value returned by ``tokenize_target_column``).

    Returns:
        ``(y_test, y_pred)`` with every code replaced by its label.
    """
    # Map every entry of `definitions`, not just the first four, so the
    # function works for any number of classes.
    code_to_label = dict(enumerate(definitions))
    decode = np.vectorize(code_to_label.get)
    return decode(y_test), decode(y_pred)


def create_confusion_matrix_class_report(y_test, y_pred, labels=None):
    """Draw the confusion matrix as a heatmap and print a classification report.

    Args:
        y_test: true class labels.
        y_pred: predicted class labels.
        labels: optional class names fixing the row/column order of the
            confusion matrix. Defaults to the four vehicle categories
            ``["van", "saab", "bus", "opel"]``.

    The heatmap is drawn with seaborn but ``plt.show()`` is not called
    here; displaying the figure is left to the caller.
    """
    labels = ["van", "saab", "bus", "opel"] if labels is None else list(labels)
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)

    sn.heatmap(df_cm, annot=True, fmt="d")
    # confusion_matrix puts the true classes on rows and the predictions on
    # columns; the heatmap draws rows along the y-axis and columns along the
    # x-axis, so "Real" belongs on y and "Predicted" on x.
    plt.xlabel("Predicted Vehicle Category")
    plt.ylabel("Real Vehicle Category")
    print("============== Summary of all evaluation metics ===============")
    print(classification_report(y_test, y_pred))
    print("====================== Confusion Matrix=====================")


def model_evaluation(X_train, y_train):
    """Cross-validate several classifiers, plot and return their accuracy.

    Each model is scored with 5-fold cross-validation (``scoring="accuracy"``).
    The per-fold accuracies are passed to ``model_evaluation_plot`` for a
    visual comparison.

    Args:
        X_train: training feature data.
        y_train: training target labels.

    Returns:
        The mean accuracy per model, indexed by the model's class name.
        (Previously this value was computed and then discarded.)
    """
    print(
        "Evaluating performance of various classifier:\n ==================================== \n Random Forest Classifier, K Neighbor Classifier, RBF SVM, Naive Bayes, Logistic Regression, Decision Tree\n "
    )
    figure(num=None, figsize=(12, 12), dpi=80, facecolor="w", edgecolor="k")
    models = [
        RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=42),
        KNeighborsClassifier(n_neighbors=7),
        SVC(kernel="rbf", C=1000, gamma=0.0001),
        GaussianNB(),
        LogisticRegression(solver="lbfgs", multi_class="auto"),
        DecisionTreeClassifier(),
    ]
    n_folds = 5
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        accuracies = cross_val_score(
            model, X_train, y_train, scoring="accuracy", cv=n_folds
        )
        # One row per (model, fold) so the plot can show the spread per model.
        entries.extend(
            (model_name, fold_idx, accuracy)
            for fold_idx, accuracy in enumerate(accuracies)
        )
    cv_df = pd.DataFrame(entries, columns=["model_name", "fold_idx", "accuracy"])
    model_evaluation_plot(cv_df)
    return cv_df.groupby("model_name").accuracy.mean()


def model_evaluation_plot(cv_df):
    """Plot the accuracy of each model for comparison.

    Draws a box plot of ``accuracy`` grouped by ``model_name`` from
    ``cv_df``, overlays the individual values as a strip plot, and shows
    the figure.
    """
    axes_spec = {"x": "model_name", "y": "accuracy", "data": cv_df}
    marker_style = {
        "size": 8,
        "jitter": True,
        "edgecolor": "gray",
        "linewidth": 2,
    }
    sns.boxplot(**axes_spec)
    sns.stripplot(**axes_spec, **marker_style)
    plt.show()
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import numpy as np
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import pandas as pd


def performance_evaluater_over_folds(classifier, no_of_cv, X, y):
    """Tabulate the mean cross-validation score for each number of folds.

    For every fold count from 2 up to and including ``no_of_cv`` the
    classifier is cross-validated and the scores are averaged.

    inputs:
    classifier : classifier/model/estimator
    no_of_cv : largest number of folds to include in the table
    X: training data
    y: target label

    returns:
    DataFrame indexed by 'No. of folds' with a single column
    'Average metric Score'
    """
    fold_counts = list(range(2, no_of_cv + 1))
    mean_scores = [
        np.mean(cross_val_score(classifier, X, y, cv=n_folds))
        for n_folds in fold_counts
    ]
    summary = pd.DataFrame(
        {"No. of folds": fold_counts, "Average metric Score": mean_scores}
    )
    return summary.set_index("No. of folds")


def visualising_performance_evaluater_over_folds(table):
    """Show a bar plot of the average score for each number of folds.

    A dashed horizontal line marks the mean of all plotted scores, and the
    y-axis is limited to a narrow band around the smallest and largest
    score.

    inputs:
    table: dataframe indexed by 'No. of folds' with a column
    'Average metric Score'
    """
    fold_counts = table.index.values.tolist()
    scores = table["Average metric Score"].values.tolist()
    overall_mean = np.mean(scores)

    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    ax.bar(fold_counts, scores)
    ax.set_xticks(fold_counts)
    ax.set_title("Scores by No. of Folds")
    ax.set_ylabel("Average Metric Score")
    ax.set_xlabel("No. of Folds")
    ax.axhline(
        overall_mean,
        label="Mean score = {:0.3f}".format(overall_mean),
        linestyle="--",
        linewidth=0.3,
    )
    ax.legend(loc="upper right")

    # Keep the same 0.1% margin arithmetic around the extreme scores.
    lowest = min(scores)
    highest = max(scores)
    ax.set_ylim([lowest - (lowest * 0.001), highest + (highest * 0.001)])
    plt.show()
Loading

0 comments on commit 3a27b6a

Please sign in to comment.