Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First attempt on vehicle dataset with a random forest classifier #13

Merged
merged 9 commits into from
Mar 16, 2020
Merged
30 changes: 30 additions & 0 deletions dev/Sidrah-Madiha/Visualization_for_misclassifications.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import pandas as pd
import allcustommodules as sidra
import matplotlib.pyplot as plt


def missclassified_data_category_frquency(y_test, y_pred):
    """Plot, per class, how often points were predicted as each wrong class.

    parameters:
    y_test: reverse factorized test values
    y_pred: reverse factorized predicted values
    """
    confusion = sidra.create_confusion_matrix(y_test, y_pred)
    # Zero the diagonal so only misclassified counts remain in the plot.
    for category in confusion.columns:
        confusion.at[category, category] = 0

    axes = confusion.plot(kind="bar", title="Predicted Class Error", stacked=True)
    axes.locator_params(axis="y", integer=True)
    axes.set_xlabel("Classes/Categories")
    axes.set_ylabel("Number of Incorrectly Predicted Class")
    plt.show()


def untokenize_test_predict_data(definition, y_test, y_pred):
    """Reverse-factorize test and predicted values back to category labels.

    Use before 'missclassified_data_category_frquency', which expects
    label (not integer-token) arrays.

    parameters:
    definition: categories for reverse factorizing (position i -> label for token i)
    y_test: factorized test values
    y_pred: factorized predicted values

    returns:
    (y_test, y_pred) as arrays of original category labels
    """
    # numpy is not imported at the top of this file; import it locally.
    import numpy as np

    # Map each integer token back to its category label.
    # (Original body referenced undefined name 'definitions' and returned nothing.)
    reversefactor = dict(zip(range(len(definition)), definition))
    y_test = np.vectorize(reversefactor.get)(y_test)
    y_pred = np.vectorize(reversefactor.get)(y_pred)
    return y_test, y_pred
156 changes: 156 additions & 0 deletions dev/Sidrah-Madiha/allcustommodules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from math import sqrt
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from matplotlib.pyplot import figure
import seaborn as sns
from sklearn.svm import SVC


def data_stats(dataset):
    """Print basic stats of the dataset: shape, columns and target categories."""
    print("=========== SOME STATS of Dataset ===========")
    print(f"Shape of the dataset: {dataset.shape}\n")
    print("List of attribute columns", list(dataset.columns))
    print("\n")
    categories = dataset.Class.unique()
    print("List of Categories ", categories, "\n")


def tokenize_target_column(dataset):
    """Replace the 'Class' column with integer tokens, in place.

    returns the distinct original labels, indexed by their token value.
    """
    codes, definitions = pd.factorize(dataset["Class"])
    dataset.Class = codes
    print("Updated tokenize 'Class' column - first 5 values")
    print(dataset.Class.head())
    print("Distinct Tokens used for converting Class column to integers")
    print(definitions)
    return definitions


def train_data_test_data_split(X, y, test_size=0.2):
    """Split features and target into train/test sets (default 80/20).

    parameters:
    X: feature matrix
    y: target vector
    test_size: fraction of samples held out for testing

    returns:
    X_train, X_test, y_train, y_test
    """
    # random_state is fixed so the split is reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=21
    )
    return X_train, X_test, y_train, y_test


def training_data_and_target_Label_split(dataset):
    """Return features X (every column but the last) and target y (the last column) as arrays."""
    features = dataset.iloc[:, :-1].values
    target = dataset.iloc[:, -1].values
    return features, target


# def train_randomforest(classifier, X_train, y_train):
# """ training model on train data"""
# classifier.fit(X_train, y_train)
# return classifier


def test(classifier, X_test):
""" testing model on test data"""
y_pred = classifier.predict(X_test)
return y_pred


def untokenizing_testdata(y_test, definitions):
    """Map integer-coded test labels back to their original category names."""
    # Token i corresponds to definitions[i].
    mapping = {index: label for index, label in enumerate(definitions)}
    return np.vectorize(mapping.get)(y_test)


def untokenizing_prediction(y_pred, definitions):
    """Map integer-coded predicted labels back to their original category names."""
    lookup = dict(enumerate(definitions))
    return np.vectorize(lookup.get)(y_pred)


def create_confusion_matrix(y_test, y_pred, labels=None):
    """Create a labelled confusion matrix as a DataFrame.

    parameters:
    y_test: true category labels
    y_pred: predicted category labels
    labels: row/column order for the matrix; defaults to the vehicle
        categories used throughout this project

    returns:
    DataFrame with true labels as rows and predicted labels as columns
    """
    if labels is None:
        labels = ["van", "saab", "bus", "opel"]
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)
    return df_cm


def display_confusion_matrix(df_cm):
    """Display the confusion-matrix DataFrame as an annotated heatmap.

    parameters:
    df_cm: confusion matrix with true labels as rows, predictions as columns
    """
    sn.heatmap(df_cm, annot=True, fmt="d")
    # sklearn's confusion_matrix puts true labels on rows (heatmap y-axis)
    # and predictions on columns (x-axis); the original labels were swapped.
    plt.xlabel("Predicted Vehicle Category")
    plt.ylabel("Real Vehicle Category")
    print("====================== Confusion Matrix=====================")


def display_classification_report(y_test, y_pred):
    """Print sklearn's precision/recall/f1 summary for the predictions.

    parameters:
    y_test: true category labels
    y_pred: predicted category labels
    """
    # Fixed typo in the printed header ("metics" -> "metrics").
    print("============== Summary of all evaluation metrics ===============")
    print(classification_report(y_test, y_pred))


def model_evaluation(X_train, y_train):
    """Compare several classifiers via 5-fold cross-validated accuracy.

    Scores each model with cross_val_score, plots the per-fold accuracies
    as a box/strip plot, and returns the mean accuracy per model.

    parameters:
    X_train: training feature matrix
    y_train: training target vector

    returns:
    pandas Series of mean accuracy, indexed by model class name
    """
    print(
        "Evaluating performance of various classifier:\n ==================================== \n Random Forest Classifier, K Neighbor Classifier, RBF SVM, Naive Bayes, Logistic Regression, Decision Tree\n "
    )
    figure(num=None, figsize=(12, 12), dpi=80, facecolor="w", edgecolor="k")
    models = [
        RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=42),
        KNeighborsClassifier(n_neighbors=7),
        SVC(kernel="rbf", C=1000, gamma=0.0001),
        GaussianNB(),
        LogisticRegression(solver="lbfgs", multi_class="auto"),
        DecisionTreeClassifier(),
    ]
    CV = 5
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        accuracies = cross_val_score(model, X_train, y_train, scoring="accuracy", cv=CV)
        for fold_idx, accuracy in enumerate(accuracies):
            entries.append((model_name, fold_idx, accuracy))
    cv_df = pd.DataFrame(entries, columns=["model_name", "fold_idx", "accuracy"])
    model_evaluation_plot(cv_df)
    # Return the per-model means (previously computed but discarded).
    # The dead pre-allocation of cv_df that was immediately overwritten
    # has also been removed.
    return cv_df.groupby("model_name").accuracy.mean()


def model_evaluation_plot(cv_df):
    """Draw box and strip plots of per-fold accuracy for each model."""
    sns.boxplot(x="model_name", y="accuracy", data=cv_df)
    # Overlay individual fold scores on top of the box plot.
    sns.stripplot(
        data=cv_df,
        x="model_name",
        y="accuracy",
        jitter=True,
        size=8,
        linewidth=2,
        edgecolor="gray",
    )
    plt.show()
57 changes: 57 additions & 0 deletions dev/Sidrah-Madiha/train_test_split_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from sklearn.model_selection import learning_curve
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# inputs: estimator, X, y, cv, scoring
def train_test_split_table_scores(estimator, X, y, cv):
    """Tabulate learning-curve scores against train/test split percentages.

    parameters:
    estimator: unfitted sklearn-style estimator
    X: feature matrix
    y: target vector
    cv: number of cross-validation folds

    returns:
    DataFrame with train/test percentages, the per-split train scores,
    and the mean train/validation score for each training-set size
    """
    # NOTE(review): shuffle=True without a random_state makes results
    # non-reproducible run to run — consider seeding, as other modules do.
    train_sizes, train_scores, validation_scores = learning_curve(
        estimator=estimator, X=X, y=y, cv=cv, shuffle=True
    )

    # Convert absolute training-set sizes to percentages of the dataset.
    train_percent = np.round(train_sizes * 100 / len(X), 2)
    test_percent = (len(X) - train_sizes) * 100 / len(X)
    train_scores_mean = train_scores.mean(axis=1)
    validation_scores_mean = validation_scores.mean(axis=1)
    column_names = [("Split" + str(i)) for i in range(1, cv + 1)]

    table_of_train_test_split = pd.concat(
        [
            pd.DataFrame(
                {"Train Percent": train_percent, "Test Percent": test_percent}
            ),
            pd.DataFrame(train_scores, columns=column_names),
            pd.DataFrame(
                {
                    "Training Scores Mean": train_scores_mean,
                    "Testing Scores Mean": validation_scores_mean,
                }
            ),
        ],
        axis=1,
    )

    return table_of_train_test_split


def visual_tain_test_split_score(table):
    """Plot mean train and validation scores against training-set percentage (learning curves)."""
    plt.style.use("seaborn")
    # One curve for training score, one for validation score.
    for column, curve_label in (
        ("Training Scores Mean", "Training Score"),
        ("Testing Scores Mean", "Validation Score"),
    ):
        plt.plot(table["Train Percent"], table[column], label=curve_label)
    plt.ylabel("Score", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.title("Learning curves", fontsize=18, y=1.03)
    plt.legend()
    plt.ylim(0.5, 1)
Loading