-
Notifications
You must be signed in to change notification settings - Fork 6
/
feature_selector.py
88 lines (65 loc) · 1.78 KB
/
feature_selector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import numpy as np
import pandas as pd
# Build a binary "which API calls does this sample use" feature matrix.
#
# Inputs:
#   mixed_dataset/all_api_calls.txt  - one API call name per line
#   training_file_index              - one sample per line, "name-i1,i2,...,"
# Output:
#   features.csv                     - one row per sample, one column per API call


def _read_api_calls(path):
    """Return the API call names stored one per line in *path*.

    Only the line terminator is stripped, so a final line without a trailing
    newline keeps its last character. Blank lines are preserved on purpose:
    dropping them would shift the position of every later name.
    """
    with open(path) as api_file:
        return [line.rstrip('\n') for line in api_file]


def _parse_indexed_row(row):
    """Split one ``name-i1,i2,...,`` row into ``(name, [i1, i2, ...])``.

    The text after the last comma (normally just the newline) is discarded.

    Raises:
        IndexError: if the row contains no ``-`` separator.
        ValueError: if an index field is not an integer.
    """
    # NOTE(review): a sample name that itself contains '-' would be split in
    # the wrong place here - confirm the index file never contains one.
    components = row.split('-')
    name = components[0]
    indexes = [int(field) for field in components[1].split(',')[:-1]]
    return name, indexes


def _multi_hot(indexes, width):
    """Return a 0/1 list of length *width* with a 1 for each index present.

    Index 0 is skipped. Every other index is treated as 1-based, matching the
    numbering of ``api_index``, so index ``k`` sets position ``k - 1``.

    Raises:
        IndexError: if an index is larger than *width*.
    """
    vector = [0] * width
    for index in indexes:
        if index == 0:
            continue
        # NOTE(review): assumes training_file_index was produced with the same
        # 1-based numbering as api_index. Writing to vector[index] instead
        # would label every feature with the following API's name and
        # overflow on the last API.
        vector[index - 1] = 1
    return vector


all_api_calls = _read_api_calls('mixed_dataset/all_api_calls.txt')

# 1-based position of every API call name (later duplicates win).
api_index = {name: position
             for position, name in enumerate(all_api_calls, start=1)}

processed_and_indexed_training_examples = []
outputs = []
with open('training_file_index', 'r') as indexed_file:
    for row in indexed_file:
        if not row.strip():
            # Tolerate blank lines (e.g. a trailing one) instead of crashing.
            continue
        name, indexes = _parse_indexed_row(row)
        # [0, 1] when the sample name ends in 'e', otherwise [1, 0];
        # presumably a one-hot class label - confirm which class 'e' denotes.
        outputs.append([0, 1] if name.endswith('e') else [1, 0])
        processed_and_indexed_training_examples.append(indexes)

# One column per listed API call, so the matrix always lines up with the
# column names passed to the DataFrame below.
feature_count = len(all_api_calls)
total = len(processed_and_indexed_training_examples)

list_for_all = []
for c, indexes in enumerate(processed_and_indexed_training_examples):
    print(str(c) + '/' + str(total))  # progress indicator
    list_for_all.append(_multi_hot(indexes, feature_count))

# The explicit reshape keeps the array 2-D even when there are no samples.
feature_matrix = np.array(list_for_all, dtype=int).reshape(total, feature_count)
df = pd.DataFrame(feature_matrix, columns=all_api_calls)
df.to_csv('features.csv')

# Disabled experiment, kept as an inert string literal: variance-threshold
# feature selection over df, printing the names of the selected API calls.
'''
#VarianceThreshold
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
x = pd.DataFrame(sel.fit_transform(df))
print (sel.get_support(indices=True))
selected_features = sel.get_support(indices=True)
for a in selected_features:
    print(all_api_calls[a])
'''