-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProcessDataset.py
50 lines (38 loc) · 1.1 KB
/
ProcessDataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import os
import sys

import numpy as np
import pandas as pd
def load_data(fname):
    """Preprocess a headerless numeric CSV and write the result beside it.

    The last column is treated as the label and passed through
    ``binarize_labels``; all other columns are passed through
    ``standardize_features`` and then ``normalize_features``. Features and
    labels are re-joined and written, without header or index, to
    ``<fname minus its extension>_processed.csv``.

    Args:
        fname: Path of the input CSV. Every cell is parsed as float64.
    """
    df = pd.read_csv(fname, dtype=np.float64, engine='python', header=None)

    # Take an explicit copy of the feature columns: the scaling helpers write
    # into the array they receive, and that must never alias the frame's own
    # (possibly read-only) buffer.
    X = df.iloc[:, :-1].to_numpy(copy=True)
    y = binarize_labels(df.iloc[:, -1].to_numpy())
    X = normalize_features(standardize_features(X))

    # splitext instead of fname[:-4]: the slice silently mangled any name
    # whose extension is not exactly three characters long.
    stem, _ = os.path.splitext(fname)
    processed = pd.DataFrame(np.column_stack((X, y)))
    processed.to_csv(stem + '_processed.csv', header=None, index=None)
def binarize_labels(Y):
    """Map labels to 0/1 relative to the first label in the sequence.

    Args:
        Y: Sequence (list or 1-D array) of labels.

    Returns:
        A list holding 0 wherever the label equals ``Y[0]`` and 1 everywhere
        else. An empty input yields an empty list.
    """
    # len() rather than truthiness: a NumPy array with more than one element
    # has no unambiguous truth value.
    if len(Y) == 0:
        return []
    reference = Y[0]
    return [0 if label == reference else 1 for label in Y]
def standardize_features(X):
    """Standardize each column of ``X`` to zero mean and unit deviation.

    The result is written back into ``X`` (so it keeps ``X``'s dtype; pass a
    float array) and ``X`` itself is returned.

    Columns with zero standard deviation are only centered, which leaves them
    at 0 instead of producing NaN from a 0/0 division.

    Args:
        X: NumPy array; statistics are taken along axis 0.

    Returns:
        The same array object ``X``, now holding the standardized values.
    """
    if X.size == 0:
        return X
    column_average = X.mean(0)
    std_deviation = X.std(0)
    # A constant column has deviation 0; divide by 1 there to avoid NaN/inf.
    safe_deviation = np.where(std_deviation == 0, 1.0, std_deviation)
    # Slice assignment keeps the in-place contract of the row-by-row version.
    X[...] = (X - column_average) / safe_deviation
    return X
def normalize_features(X, axis=None):
    """Min-max scale ``X`` into the range [0, 1].

    The result is written back into ``X`` (so it keeps ``X``'s dtype; pass a
    float array) and ``X`` itself is returned.

    When the minimum equals the maximum the affected values become 0 instead
    of NaN from a 0/0 division. An empty array is returned unchanged.

    Args:
        X: NumPy array to scale.
        axis: Axis along which the minimum and maximum are taken. The default
            ``None`` uses one global minimum/maximum for the whole array;
            ``axis=0`` scales every column independently.

    Returns:
        The same array object ``X``, now holding the scaled values.
    """
    if X.size == 0:
        return X
    # keepdims makes the statistics broadcast against X for any axis choice.
    lowest = X.min(axis=axis, keepdims=True)
    span = X.max(axis=axis, keepdims=True) - lowest
    # A zero span means all values are equal; divide by 1 to avoid NaN.
    span = np.where(span == 0, 1.0, span)
    # Slice assignment keeps the in-place contract of the row-by-row version.
    X[...] = (X - lowest) / span
    return X
def main():
    """Command-line entry point: preprocess the CSV named by the first argument.

    Exits with a usage message (non-zero status) when no path is supplied,
    instead of failing with a bare IndexError. Extra arguments are ignored.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: {} <dataset.csv>".format(sys.argv[0]))
    load_data(sys.argv[1])
# Run the preprocessing only when executed as a script, not when imported.
if __name__ == "__main__":
    main()