-
Notifications
You must be signed in to change notification settings - Fork 0
/
good_bad_classficiation.py
178 lines (165 loc) · 6.55 KB
/
good_bad_classficiation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
"""
基于data_class_raw.csv数据,根据高斯密度概率函数,寻找异常点并剔除。
基于data_class_processed.csv数据,进行pca处理,确定重要数据维度及成分
完成数据分离,参数为random_state=4,test_size=0.4
建立knn模型完成分类,邻居选择10,计算准确率可视化边界
计算测试数据集对应的混淆矩阵,计算准确率召回率特异度精确率f1分数
尝试改变邻居参数,计算其在训练集测试集上的准确率并画出
"""
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# covariance module: provides the EllipticEnvelope outlier detector
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# confusion matrix helper
from sklearn.metrics import confusion_matrix
import os
# Resolve data files relative to this script so it can be run from any
# working directory.
current_directory = os.path.dirname(os.path.abspath(__file__))

# Task 1: load the raw data set, split it into the feature columns and the
# label column 'y', and plot the two classes.
data = pd.read_csv(os.path.join(current_directory, 'data_class_raw.csv'))
X = data.drop(['y'], axis=1)
y = data.loc[:, 'y']

fig0 = plt.figure(figsize=(5, 5))
# Label 0 is drawn as 'bad', label 1 as 'good'.
bad = plt.scatter(X.loc[y == 0, 'x1'], X.loc[y == 0, 'x2'])
good = plt.scatter(X.loc[y == 1, 'x1'], X.loc[y == 1, 'x2'])
plt.legend((bad, good), ('bad', 'good'))
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('raw data')
# plt.show()  # uncomment to display the figure interactively
# Anomaly detection: fit an elliptic envelope on the 'bad' samples (y == 0)
# only, with 2% of them assumed to be outliers (contamination=0.02).
bad_samples = X[y == 0]
good_samples = X[y == 1]
AD = EllipticEnvelope(contamination=0.02)
AD.fit(bad_samples)
bad_AD_predict = AD.predict(bad_samples)
# The points predicted as -1 are the ones highlighted as anomalies below.
is_anomaly = bad_AD_predict == -1

# Plot both classes and mark the detected anomalies with large crosses.
fig1 = plt.figure(figsize=(10, 10))
bad = plt.scatter(bad_samples['x1'], bad_samples['x2'])
good = plt.scatter(good_samples['x1'], good_samples['x2'])
anomay_bad = plt.scatter(
    bad_samples.loc[is_anomaly, 'x1'],
    bad_samples.loc[is_anomaly, 'x2'],
    marker='x',
    s=150,
)
plt.legend((bad, good, anomay_bad), ('bad', 'good', 'anomay_bad'))
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('data with anomay_bad')
# plt.show()  # uncomment to display the figure interactively
# End of task 1. Task 2: load the processed data set and split it into the
# feature columns and the label column 'y'.
data = pd.read_csv(os.path.join(current_directory, 'data_class_processed.csv'))
X = data.drop(['y'], axis=1)
y = data.loc[:, 'y']

# PCA on the standardized features, used to inspect how much variance each
# principal component explains.
X_norm = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
X_reduction = pca.fit_transform(X_norm)
# Despite the variable name, this is the explained *variance* ratio of each
# component; it is used to decide how many components are worth keeping.
covariance_ratio = pca.explained_variance_ratio_
# Author's recorded result: [0.5369408 0.4630592] -- both components are
# treated as principal components.

# Bar chart of the explained variance ratio per component.
fig2 = plt.figure(figsize=(5, 5))
plt.bar([1, 2], covariance_ratio)
# plt.show()  # uncomment to display the figure interactively
# Split the data into train and test sets: 40% held out for testing, with
# random_state=4 as required by the task statement.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=4)
# Author's recorded shapes: train (21, 2), test (14, 2), full set (35, 2);
# a 6:4 split of the rows with the column count unchanged.

# Build a KNN classifier with 10 neighbours and measure its accuracy on both
# the training and the test split.
KNN_10 = KNeighborsClassifier(n_neighbors=10)
KNN_10.fit(X_train, y_train)
y_train_predict, y_test_predict = (
    KNN_10.predict(features) for features in (X_train, X_test)
)
score_train = accuracy_score(y_train, y_train_predict)
score_test = accuracy_score(y_test, y_test_predict)
# Author's recorded scores: train 0.9047619047619048, test 0.6428571428571429.
# The test score is poor, so the decision boundary is inspected next.
# Visualize the KNN_10 decision regions by classifying every point of a
# dense grid and colouring the grid by predicted class.
#
# Notes on the NumPy helpers used here:
#   * np.meshgrid returns two 2-D ndarrays. Both are 200x200 here: every row
#     of xx runs 0..9.95 and every column of yy runs 0..9.95.
#   * ndarray.ravel() flattens a multi-dimensional array into a 1-D array.
#   * np.c_[a, b] stacks equal-length 1-D arrays as columns, e.g.
#       np.c_[[1, 1, 1], [2, 2, 2]] -> [[1, 2], [1, 2], [1, 2]]
xx, yy = np.meshgrid(np.arange(0, 10, 0.05), np.arange(0, 10, 0.05))
x_range = np.c_[xx.ravel(), yy.ravel()]
# KNN_10 was fitted on a DataFrame, so the grid is passed with the same
# column names; predicting on a bare ndarray makes scikit-learn emit a
# "does not have valid feature names" warning.
y_range_predict = KNN_10.predict(
    pd.DataFrame(x_range, columns=X_train.columns))

fig4 = plt.figure(figsize=(10, 10))
grid_is_bad = y_range_predict == 0
grid_is_good = y_range_predict == 1
bad_knn10 = plt.scatter(x_range[grid_is_bad, 0], x_range[grid_is_bad, 1])
good_knn10 = plt.scatter(x_range[grid_is_good, 0], x_range[grid_is_good, 1])
# Draw the original samples last so they sit on top of the grid points;
# drawn first, they would be covered by the grid.
bad = plt.scatter(X.loc[y == 0, 'x1'], X.loc[y == 0, 'x2'])
good = plt.scatter(X.loc[y == 1, 'x1'], X.loc[y == 1, 'x2'])
plt.legend((bad, good, bad_knn10, good_knn10),
           ('bad', 'good', 'bad_knn10', 'good_knn10'))
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('knn predict data')
# plt.show()  # uncomment to display the figure interactively
# Confusion matrix of the KNN_10 predictions on the test split.
# labels=[0, 1] pins the matrix to 2x2 (rows = true class, columns =
# predicted class) even when one class is missing from the test split, so
# unpacking the four cells below cannot fail on a smaller matrix.
cm = confusion_matrix(y_test, y_test_predict, labels=[0, 1])
print('cm', cm)
# Row-major order of a 2x2 matrix: cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1].
TN, FP, FN, TP = cm.ravel()

# accuracy    = (TP + TN) / (TP + FP + TN + FN)
# recall      = TP / (TP + FN)  (sensitivity): share of actual positives
#               that were predicted positive
# specificity = TN / (TN + FP): share of actual negatives that were
#               predicted negative
# precision   = TP / (TP + FP): share of predicted positives that are
#               actually positive
# f1          = harmonic mean of precision and recall
# NOTE(review): a ratio is undefined when its denominator is zero (for
# example when no sample is predicted positive); that case is not handled.
accurary = (TP + TN) / (TP + FP + TN + FN)
recall = TP / (TP + FN)
specificity = TN / (TN + FP)
precision = TP / (TP + FP)
f1 = 2 * precision * recall / (precision + recall)
# Try k = 1..20 neighbours and record the train and test accuracy for each.
n = list(range(1, 21))
accurary_train = []
accurary_test = []
for k in n:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    # Score inline rather than through y_*_predict / score_* so that the
    # KNN_10 predictions and scores computed earlier are not overwritten.
    accurary_train.append(accuracy_score(y_train, knn.predict(X_train)))
    accurary_test.append(accuracy_score(y_test, knn.predict(X_test)))
print('accurary_train', accurary_train, 'accurary_test', accurary_test)

# Plot both accuracy curves against k (train: circles, test: crosses).
fig5 = plt.figure(figsize=(10, 5))
plt.plot(n, accurary_train, marker='o')
plt.plot(n, accurary_test, marker='x')
plt.xlabel('n')
plt.ylabel('train or test accurary')
# plt.show()  # uncomment to display the figure interactively