-
Notifications
You must be signed in to change notification settings - Fork 5
/
AA-Smurf.py
198 lines (169 loc) · 6.46 KB
/
AA-Smurf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
####################################
# Author: Jeremy (Meng-Chieh) Lee #
# Email : [email protected] #
####################################
import numpy as np
from scipy import sparse
from math import ceil
from collections import OrderedDict
import pickle
import copy
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import argparse
def log_star(x):
"""
Compute universal code length used for encode real number
INPUTS
Real number
OUTPUTS
Approximate universal code length of the real number
"""
return 2 * np.log2(x) + 1
def edgelist_to_matrix(edgelist):
"""
Transform edgelist to adjacency matrix
INPUTS
Edge List
OUTPUTS
Adjacency matrix and dictionary of nodes
"""
node_dict = {e: idx for idx, e in enumerate(np.unique(list(edgelist)))}
ajm = np.zeros((len(node_dict), len(node_dict)))
for e in edgelist:
ajm[node_dict[e[0]]][node_dict[e[1]]] = 1
return ajm, node_dict
def compute_mdl(ajm, order, start, count):
"""
Encode matrix by given order and the information of smurf patterns
INPUTS
ajm: Binary adjacency matrix
order: Order used to reorder the matrix
start: Start positions of each smurf pattern
count: number of detected patterns and intermediaries
OUTPUTS
[Encoding description length, Purity]
"""
purity, mdl, n = [], 0, len(ajm)
order.extend([i for i in range(n) if i not in order])
ajm = np.array(ajm)[np.ix_(order, order)]
### Encode sub-matrix A, B and C
for idx in range(1, len(start)):
s, e = start[idx-1], start[idx] - 1
k = e - s + 1
e1 = np.sum(ajm[s+1:e, s:e-1]) * (2 * ceil(np.log2(k - 1)))
e2 = np.sum(ajm[e+1:-1, s:e]) * (ceil(np.log2(n)) + ceil(np.log2(n - k)))
e3 = np.sum(ajm[s:e, e+1:-1]) * (ceil(np.log2(n)) + ceil(np.log2(n - k)))
et = e1 + e2 + e3
mdl += et
sum_abc = np.sum(ajm[s:e, s:e]) + np.sum(ajm[e+1:-1, s:e]) + np.sum(ajm[s:e, e+1:-1])
purity.append((k - 2) * 2 / sum_abc)
### Encode sub-matrix D
ajm = 1 - ajm
mdl += np.sum(ajm[start[-1]:-1, start[-1]:-1]) * (2 * ceil(np.log2(n)))
### Encode the real number of found patterns and intermediaries
mdl += ceil(log_star(count[0])) + ceil(log_star(count[1]))
### Encode the indexes of senders, receivers and intermediaries
mdl += np.sum(count) * ceil(np.log2(n))
### Encode for the start point of each pattern
mdl += ceil(log_star(len(start) - 1))
return mdl, np.mean(purity)
def AA_Smurf(ajm, max_iter, visualize):
"""
Identify the best order for spotting smurf pattern
INPUTS
ajm: Binary adjacency matrix
max_iter: Maximum iteration to run the algorithm
visualize: Path of visualization result
OUTPUTS
[Reordered matrix, Best order for reordering]
"""
### In case using 'cfd_injected.pkl' file
# ajm, node_dict = edgelist_to_matrix(edgelist)
### Get edge-pairs which have the number of intermediaries hgiher than the threshold c
print('Get Edge-Pairs...')
row, col = ajm, ajm.T
edis = OrderedDict()
dis_mtr = (sparse.csr_matrix(ajm) * sparse.csr_matrix(ajm)).todense()
for idx1, idx2 in zip(*dis_mtr.nonzero()):
val = dis_mtr[idx1, idx2]
if val >= 3:
edis[(idx1, idx2)] = [val, np.arange(len(row))[(row[idx1] + col[idx2]) == 2]]
edis = OrderedDict(sorted(edis.items(), key=lambda t: t[1][0])[::-1])
print('Done!\n')
### Heuristically identify the best order by MDL and purity
print('Identify Best Order...')
def func(ajm, key, value, order, count, start, prev_mdl):
if key[0] not in order and key[1] not in order:
order.append(key[0])
tmp_mid = [a for a in value[1] if a not in order]
if len(tmp_mid) == 0:
return -1, -1, -1, -1, -1
order.extend(tmp_mid)
order.append(key[1])
start.append(len(order))
mdl, purity = compute_mdl(ajm, copy.copy(order), copy.copy(start),
[count[0] + 1, count[1] + len(tmp_mid), count[2] + 1])
score = ((prev_mdl - mdl) / prev_mdl) * purity
if mdl < prev_mdl:
count = [count[0] + 1, count[1] + len(tmp_mid), count[2] + 1]
return mdl, score, order, start, count
return -1, -1, -1, -1, -1
old_mdl = np.ceil(np.sum(1 - ajm)) * (2 * ceil(np.log2(len(ajm))))
count_arr, order_arr, start_arr, mdl_arr = [[0, 0, 0]], [[]], [[0]], [old_mdl]
iter = 0
while True:
prev_mdl = mdl_arr[-1]
results = Parallel(n_jobs=4)(
[delayed(func)(ajm, key, value, \
copy.copy(order_arr[-1]), \
copy.copy(count_arr[-1]), \
copy.copy(start_arr[-1]), prev_mdl)
for idx, (key, value) in enumerate(edis.items())])
tmp_mdl = [r[0] for r in results]
tmp_score = [r[1] for r in results]
tmp_order = [r[2] for r in results]
tmp_start = [r[3] for r in results]
tmp_count = [r[4] for r in results]
### No more smurf-like pattern found
if np.max(tmp_score) == -1 or (max_iter != None and iter > max_iter):
break
max_idx = np.argmax(tmp_score)
count_arr.append(tmp_count[max_idx])
order_arr.append(tmp_order[max_idx])
start_arr.append(tmp_start[max_idx])
mdl_arr.append(tmp_mdl[max_idx])
iter += 1
print('Done!\n')
### Get the result with MDL 10% higher than the minimum bits
max_idx = next(idx - 1 for idx, m in enumerate(mdl_arr) if m < mdl_arr[-1] * 1.1)
count, order, start, ori_mdl = count_arr[max_idx], order_arr[max_idx], start_arr[max_idx], mdl_arr[max_idx]
order.extend([i for i in range(len(ajm)) if i not in order])
ro_ajm = ajm[np.ix_(order, order)]
if visualize != None:
print('Start Visualize Result...')
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.matshow(ajm, fignum=False, cmap='binary')
plt.title('Before Reordering')
plt.subplot(1, 2, 2)
plt.matshow(ro_ajm, fignum=False, cmap='binary')
plt.title('After Reordering')
# plt.savefig(visualize)
plt.show()
print('Done!\n')
return ro_ajm, order
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Parameters for AA-Smurf of AutoAudit')
parser.add_argument('--f', default='data/sample_matrix.txt', type=str, help='Input Path')
parser.add_argument('--o', default='results/AA-Smurf_result.png', type=str, help='Output Path')
parser.add_argument('--i', default=None, type=int, help='Maximum Iteration')
args = parser.parse_args()
ajm = np.loadtxt(args.f)
ro_ajm, order = AA_Smurf(ajm, args.i, args.o)
### In case using 'cfd_injected.pkl' file
# with open(args.f, 'rb') as handle:
# data = pickle.load(handle)
# for k, v in data.items():
# for idx, vv in enumerate(v):
# ro_ajm, order = AA_Smurf(vv['Edgelist'], args.i, args.o)