-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_data.py
122 lines (93 loc) · 3.89 KB
/
preprocess_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# coding: utf-8
# Load pickled data
import pickle
# Locations of the pickled dataset splits. Adjust these to wherever the
# training, validation and testing data were saved.
training_file = "../data/traffic-signs-data/train.p"
validation_file = "../data/traffic-signs-data/valid.p"
testing_file = "../data/traffic-signs-data/test.p"

# Destination of the preprocessed (augmented) training set.
training_pp_file = "../data/traffic-signs-data/train_pp.p"
# Destination of the preprocessed, class-balanced training set.
training_pp_bal_file = "../data/traffic-signs-data/train_pp_bal.p"

# NOTE: unpickling can execute arbitrary code, so these files must come from
# a trusted source.
# Each split is a mapping holding the images under 'features' and the class
# ids under 'labels'.
with open(training_file, 'rb') as f:
    train = pickle.load(f)
X_train = train['features']
y_train = train['labels']

with open(validation_file, 'rb') as f:
    valid = pickle.load(f)
X_valid = valid['features']
y_valid = valid['labels']

with open(testing_file, 'rb') as f:
    test = pickle.load(f)
X_test = test['features']
y_test = test['labels']
import cv2
import numpy as np
from collections import Counter
from sklearn.utils import shuffle
from tqdm import tqdm
def scale(img, x):
    """Return ``img`` scaled by factor ``x`` on a canvas of the original size.

    The affine matrix has no translation term, so the scaling is anchored at
    the image origin (pixel (0, 0), the top-left corner) rather than at the
    centre: ``x < 1`` shrinks the content toward that corner, ``x > 1``
    enlarges it and crops what falls beyond the right/bottom edges. Pixels
    with no source are filled by ``cv2.warpAffine``'s default border handling.

    Args:
        img: Image array whose first two axes are (rows, cols); a trailing
            channel axis is optional.
        x: Scale factor applied to both axes.

    Returns:
        The warped image, with the same width and height as ``img``.
    """
    # Only the spatial extent is needed. Slicing instead of unpacking three
    # values also lets single-channel 2-D images through.
    rows, cols = img.shape[:2]
    M = np.float32([[x, 0, 0], [0, x, 0]])
    # warpAffine expects the output size as (width, height).
    return cv2.warpAffine(img, M, (cols, rows))
def translate(img, x, y):
    """Return ``img`` shifted by ``x`` columns and ``y`` rows.

    Positive ``x`` moves the content to the right and positive ``y`` moves it
    down. The output keeps the original width and height, so content pushed
    past an edge is lost and the vacated strip is filled by
    ``cv2.warpAffine``'s default border handling.

    Args:
        img: Image array whose first two axes are (rows, cols); a trailing
            channel axis is optional.
        x: Horizontal shift in pixels.
        y: Vertical shift in pixels.

    Returns:
        The shifted image, with the same width and height as ``img``.
    """
    # Only the spatial extent is needed. Slicing instead of unpacking three
    # values also lets single-channel 2-D images through.
    rows, cols = img.shape[:2]
    M = np.float32([[1, 0, x], [0, 1, y]])
    # warpAffine expects the output size as (width, height).
    return cv2.warpAffine(img, M, (cols, rows))
def rotate(img, theta):
    """Return ``img`` rotated by ``theta`` degrees about its centre.

    The rotation matrix comes from ``cv2.getRotationMatrix2D`` with a scale of
    1, for which positive angles rotate counter-clockwise. The output keeps
    the original width and height, so corners rotated out of the frame are
    cut off and uncovered areas are filled by ``cv2.warpAffine``'s default
    border handling.

    Args:
        img: Image array whose first two axes are (rows, cols); a trailing
            channel axis is optional.
        theta: Rotation angle in degrees.

    Returns:
        The rotated image, with the same width and height as ``img``.
    """
    # Only the spatial extent is needed. Slicing instead of unpacking three
    # values also lets single-channel 2-D images through.
    rows, cols = img.shape[:2]
    # The centre is given as (x, y), i.e. (column, row).
    M = cv2.getRotationMatrix2D((cols / 2, rows / 2), theta, 1)
    # warpAffine expects the output size as (width, height).
    return cv2.warpAffine(img, M, (cols, rows))
### Create a larger data set by jittering inputs (scaling, translating, rotating)
# Every training image yields one jittered copy per entry of this table, in
# this order; each copy keeps the label of the image it was derived from.
JITTER_OPS = (
    (scale, (0.9,)),
    (scale, (1.1,)),
    (translate, (2, 2)),
    (translate, (-2, 2)),
    (translate, (2, -2)),
    (translate, (-2, -2)),
    (rotate, (15,)),
    (rotate, (7.5,)),
    (rotate, (5.0,)),
    (rotate, (-5.0,)),
    (rotate, (-7.5,)),
    (rotate, (-15,)),
)

X_train_new = []
y_train_new = []
# The bar wraps the zipped stream itself. Passing a bar as the last argument
# of zip() leaves it one step short of exhaustion, because zip stops as soon
# as its first iterable runs out, so the bar is never finalised by the loop.
images_pbar = tqdm(zip(X_train, y_train), total=len(X_train), desc='X_train', unit='images')
for X, y in images_pbar:
    X_train_new.extend(op(X, *args) for op, args in JITTER_OPS)
    y_train_new.extend([y] * len(JITTER_OPS))

# Originals come first, followed by all jittered copies.
out_X = np.concatenate((X_train, X_train_new), axis=0)
out_y = np.concatenate((y_train, y_train_new), axis=0)

with open(training_pp_file, 'wb') as pfile:
    pickle.dump({'features': out_X, 'labels': out_y}, pfile, protocol=pickle.HIGHEST_PROTOCOL)
### Create a larger data set by jittering inputs, but keep the classes balanced
# Every training image yields one jittered copy per entry of this table, in
# this order; each copy keeps the label of the image it was derived from.
JITTER_OPS = (
    (scale, (0.9,)),
    (scale, (1.1,)),
    (translate, (2, 2)),
    (translate, (-2, 2)),
    (translate, (2, -2)),
    (translate, (-2, -2)),
    (rotate, (15,)),
    (rotate, (7.5,)),
    (rotate, (5.0,)),
    (rotate, (-5.0,)),
    (rotate, (-7.5,)),
    (rotate, (-15,)),
)
# Samples one original image can contribute: itself plus its jittered copies.
SAMPLES_PER_IMAGE = len(JITTER_OPS) + 1

X_train_new = []
y_train_new = []
# The bar wraps the zipped stream itself. Passing a bar as the last argument
# of zip() leaves it one step short of exhaustion, because zip stops as soon
# as its first iterable runs out, so the bar is never finalised by the loop.
images_pbar = tqdm(zip(X_train, y_train), total=len(X_train), desc='X_train', unit='images')
for X, y in images_pbar:
    X_train_new.extend(op(X, *args) for op, args in JITTER_OPS)
    y_train_new.extend([y] * len(JITTER_OPS))

class_dict = Counter(y_train)
# Every class is sized to what the rarest class can supply once all of its
# originals and jittered copies are used.
min_n = min(class_dict.values(), default=0) * SAMPLES_PER_IMAGE

orig_labels = np.asarray(y_train)
new_labels = np.asarray(y_train_new)

# Per-class pieces are collected and joined once at the end, instead of
# re-copying the growing output array on every iteration.
x_parts = []
y_parts = []
for c in class_dict:
    orig_idx = np.flatnonzero(orig_labels == c)
    # Jittered copies are drawn in random order; only their indices are
    # shuffled so the images themselves are not copied.
    new_idx = shuffle(np.flatnonzero(new_labels == c))

    if len(orig_idx) > min_n:
        # A class with more originals than the target is randomly subsampled.
        # A negative slice bound on the jittered copies would instead drop
        # items from the end, leaving features and labels out of step.
        orig_idx = shuffle(orig_idx)[:min_n]
    n_new = min_n - len(orig_idx)

    # All kept originals first, then jittered copies up to min_n.
    x_parts.append(X_train[orig_idx])
    if n_new > 0:
        x_parts.append(np.stack([X_train_new[i] for i in new_idx[:n_new]]))
    y_parts.append(np.full(min_n, c, dtype=y_train.dtype))

if x_parts:
    out_X = np.concatenate(x_parts, axis=0)
    out_y = np.concatenate(y_parts)
else:
    # No classes at all: emit empty arrays of the right shape and dtype.
    out_X = np.empty((0,) + X_train.shape[1:], dtype=X_train.dtype)
    out_y = np.array([], dtype=y_train.dtype)

with open(training_pp_bal_file, 'wb') as pfile:
    pickle.dump({'features': out_X, 'labels': out_y}, pfile, protocol=pickle.HIGHEST_PROTOCOL)