-
Notifications
You must be signed in to change notification settings - Fork 44
/
Copy pathcombine_features.py
executable file
·89 lines (82 loc) · 4.39 KB
/
combine_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/python3
"""
This script reads per-feature JSON files (permissions, system calls, etc.)
describing a set of apps, reads each feature's weights from a trained model, and
builds a combined dataset from the most heavily weighted features of each type.
The output data format is as follows:
{"features": ["ANDROID.PERMISSION.READ_PHONE_STATE", "java/security/Signature",...],
"apps": {"999eca2457729e371355aea5faa38e14.apk": {"vector": [0,0,0,1], "malicious": [0,1]}, ...}}
"""
from configparser import ConfigParser
import json
__author__='mwleeds'
def _load_json(path):
    """Return the parsed contents of the JSON file at *path*."""
    with open(path, encoding='utf-8') as infile:
        return json.load(infile)


def _first_index(names):
    """Map each name to the index of its first occurrence in *names*."""
    positions = {}
    for position, name in enumerate(names):
        # setdefault keeps the first position, mirroring list.index()
        positions.setdefault(name, position)
    return positions


def _extreme_features(feature_weights, feature_names, top_n):
    """Return names of the highest- and lowest-weighted features.

    *feature_weights* is a sequence of per-feature weight sequences; only the
    first entry of each is ranked.  ``top_n // 2`` names are taken from each
    end of the ranking (never more than half of the available features, so
    the two ends cannot overlap).  Highest-weighted names come first, in
    descending order, followed by the lowest-weighted names starting from
    the very lowest.
    """
    # no need to look at benign weights; they're complementary
    malicious_weights = [weight[0] for weight in feature_weights]
    ranked = sorted(range(len(malicious_weights)),
                    key=lambda k: malicious_weights[k], reverse=True)
    half = max(0, min(len(ranked) // 2, top_n // 2))
    highest = ranked[:half]
    # Walk the ranking from its end.  Indexing with [-i] here would hit
    # ranked[0] for i == 0, duplicating the top feature and dropping the
    # lowest-weighted one.
    lowest = ranked[::-1][:half]
    return [feature_names[index] for index in highest + lowest]


def main():
    """Build the combined feature dataset described in the module docstring.

    Reads the ``AMA`` section of ``config.ini`` (``FEATURES``,
    ``TOP_N_FEATURES``, ``INCLUDE_DATES``).  For every feature type listed in
    ``FEATURES`` it reads ``<feature>_weights.json`` and
    ``app_<feature>_vectors.json`` from the current directory, selects the
    most extreme-weighted feature names, and writes the combined bit vectors
    to ``app_feature_vectors.json``.  When ``INCLUDE_DATES`` is true the
    buckets from ``app_date_vectors.json`` are appended as features and only
    apps present in that file are kept.
    """
    config = ConfigParser()
    config.read('config.ini')
    feature_types = config.get('AMA', 'FEATURES').split(',')
    top_n_features = config.getint('AMA', 'TOP_N_FEATURES')
    include_dates = config.getboolean('AMA', 'INCLUDE_DATES')

    all_features = []  # list of strings naming each feature used in the combined dataset
    app_feature_map = {}  # mapping from android app names to sets of features present
    app_malicious_map = {}  # mapping from android app names to their 'malicious' label

    for feature in feature_types:
        feature_weights = _load_json(feature + '_weights.json')
        print('Found ' + str(len(feature_weights)) + ' sets of weights for ' + feature)
        feature_names = _load_json('app_' + feature + '_vectors.json')['features']
        print('Selecting ' + str(top_n_features) + ' top features of ' + str(len(feature_names)))
        all_features.extend(
            _extreme_features(feature_weights, feature_names, top_n_features))

    # The date feature has equal numbers of apps in each range to avoid it
    # being used as a feature directly, so only use those apps
    date_apps = None
    if include_dates:
        date_data = _load_json('app_date_vectors.json')
        date_buckets = date_data['features']
        all_features += date_buckets
        date_apps = date_data['apps']
        bucket_index = _first_index(date_buckets)
        for app, record in date_apps.items():
            if app not in app_malicious_map:
                app_malicious_map[app] = record['malicious']
            present = app_feature_map.setdefault(app, set())
            present.update(bucket for bucket, index in bucket_index.items()
                           if record['vector'][index] == 1)

    selected_names = set(all_features)
    for feature in feature_types:
        feature_data = _load_json('app_' + feature + '_vectors.json')
        feature_apps = feature_data['apps']
        print('Found ' + str(len(feature_apps)) + ' apps for ' + feature)
        # Columns of this feature type that made it into the combined dataset,
        # resolved once instead of searching the name list for every app.
        selected = [(name, index)
                    for name, index in _first_index(feature_data['features']).items()
                    if name in selected_names]
        for app, record in feature_apps.items():
            if include_dates and app not in date_apps:
                continue
            if app not in app_malicious_map:
                app_malicious_map[app] = record['malicious']
            present = app_feature_map.setdefault(app, set())
            present.update(name for name, index in selected
                           if record['vector'][index] == 1)

    # mapping combining app_feature_map and app_malicious_map using bits
    all_apps = {
        app_name: {
            'vector': [1 if name in present else 0 for name in all_features],
            'malicious': app_malicious_map[app_name],
        }
        for app_name, present in app_feature_map.items()
    }
    with open('app_feature_vectors.json', 'w', encoding='utf-8') as outfile:
        json.dump({'features': all_features, 'apps': all_apps}, outfile)
    print('Wrote data on ' + str(len(all_features)) + ' features and ' + str(len(all_apps)) + ' apps to a file.')
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()