import csv
import os
import re
import sys

import numpy as np

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # valid levels are 0-3; 3 silences all TensorFlow C++ logging
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow import keras
from tensorflow.keras import backend as K  # use the backend bundled with tensorflow.keras
from tabulate import tabulate
## variables
dataset_path = '/mnt/1tb/unishare/complete/mixed/'
checkpoint_path = 'verysmall7.cpk'
#checkpoint_path = os.getenv('CPATH',checkpoint_path)
include_lines = 2000 # how many log rows to take from each file
test_split = 0.35 # fraction of the data held out for testing
LSTM_UNITS_PATH = 128
LSTM_UNITS_DETAILS = 128
LSTM_UNITS_DURATION = 8
LSTM_UNITS_OPERATION = 16
LSTM_UNITS_RESULTS = 16
DENSE_UNITS = 32
fit_epochs = 8
fit_batches = 10
##
# read labels.csv from the dataset directory and use it to construct an array of every collected process it lists
''' in_dataset format:
      time ------------->
[       \/ one operation            \/ one operation
  [[feature1,feature2,feature3],[feature1,feature2,feature3],..],   <- program 1
  [[feature1,feature2,feature3],[feature1,feature2,feature3],..],   <- program 2
  [[feature1,feature2,feature3],[feature1,feature2,feature3],..],..
]
the model reads from multiple inputs; each input comes from one of the features
// should the input be just one instance of a sequence, or the whole sequence, since it is just a time sequence of one feature?
// does each input need to be labelled the same, or can all of the inputs share one label?
[vectorized_path] -> input(path)
[vectorized_details] -> input(details)
'''
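# Illustrative example of the expected on-disk layout (an assumption inferred from the columns
# read below; filenames and values here are made up):
#
#   labels.csv                  filename,label
#                               sample_one.csv,benign
#                               sample_two.csv,malicious
#
#   <per-process csv>           Operation,Path,Result,Detail,Duration
#                               CreateFile,C:\Windows\System32\kernel32.dll,SUCCESS,Desired Access: Read,0.0000071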
def construct_dataset_fromdir(directory):
    constructed_data = []
    labels = []
    with open(directory + 'labels.csv', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for i, process in enumerate(reader):
            timeline = retrieve_operation_data(directory + process['filename'])
            constructed_data.append(timeline)
            timeline = None
            labels.append(process['label'])
            '''
            if(i % 30 == 0):
                print("batching...")
                if('batched_data' in locals()):
                    batched_data = np.concatenate([batched_data, np.array(constructed_data)])
                else:
                    batched_data = np.array(constructed_data)
                constructed_data = []
                print(str(i), sys.getsizeof(batched_data))
            '''
    print("Dataset constructed.")
    return constructed_data, labels
# convert a single application's csv file into a usable list of the process's operations, taking only the first `include_lines` rows
def retrieve_operation_data(csv_path):
    process_operations = []
    with open(csv_path, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        #child_threads = [] LEGACY
        for i, row in enumerate(reader):
            # we only take the first `include_lines` rows
            if(i >= include_lines):
                break
            ''' THIS IS LEGACY CODE AND IS NO LONGER NEEDED, OUR DATASET IS PRE-FILTERED
            # if this is the monitored process or a child of this process
            if(row['Process Name'] == process_name or row['TID'] in child_threads):
                # if this thread creates a child we should follow that thread as well
                if(row["Operation"] == "Thread Create"):
                    # skip the prefix 'Thread ID: ' (len=11) and just append the thread number
                    child_threads.append(row['Detail'][11:])
            '''
            # pull the needed columns into variables
            proc_operation = row['Operation']
            proc_path = row['Path']
            proc_result = row['Result']
            proc_detail = row['Detail']
            proc_duration = row['Duration']
            # append one operation per row
            process_operations.append([proc_path, proc_operation, proc_result, proc_detail, proc_duration])
    return process_operations
# takes standardized feature lists and sends each set of features to the appropriate vectorizer/normalizer
def vectorize_feature_datasets(normalized_datalists, labels):
    print("Running Vectorization...")
    print("Vectorizing paths...")
    v_path_train, v_path_test, labels_train, labels_test, path_vocab = vectorize_epaths(normalized_datalists[0], labels)
    print("Vectorizing operations...")
    v_op_train, v_op_test = vectorize_operations(normalized_datalists[1])
    print("Vectorizing results...")
    v_res_train, v_res_test = vectorize_results(normalized_datalists[2])
    print("Vectorizing details...")
    v_details_train, v_details_test, _, _, details_vocab = vectorize_epaths(normalized_datalists[3], labels)
    print("Normalising durations...")
    n_dur_train, n_dur_test = normalize_durations(normalized_datalists[4])
    train_inputs = [v_path_train, v_details_train, v_res_train, v_op_train, n_dur_train]
    test_inputs = [v_path_test, v_details_test, v_res_test, v_op_test, n_dur_test]
    vocabs = [path_vocab, details_vocab]
    return train_inputs, test_inputs, labels_train, labels_test, vocabs
def vectorize_epaths(epath_dataSet, labels):
    # split our input data; this must be done for each input with the same random_state so every input lines up
    epath_train, epath_test, labels_train, labels_test = train_test_split(epath_dataSet, labels, test_size=test_split, random_state=55)
    # build ragged tensors: the outer dimensions match, but each entry can hold a different amount of text
    epath_vectorTrain = tf.ragged.constant(epath_train)
    epath_train = tf.ragged.constant(epath_train)
    epath_test = tf.ragged.constant(epath_test)
    # just the flattened values of the training data, used for adapting
    epath_vectorTrain = epath_vectorTrain.flat_values
    epath_vectorizer = keras.layers.TextVectorization(standardize=None, split=None, output_mode="tf_idf")
    epath_vectorizer.adapt(epath_vectorTrain)
    epath_vocab = epath_vectorizer.get_vocabulary()
    print("Vocab includes: ", epath_vocab)
    # vectorize, then reshape the output into a 3D tensor where each top-level entry is the next process
    vect_epath = epath_vectorizer(epath_train.values)
    vect_epath_train = tf.reshape(vect_epath, (epath_train.shape[0], include_lines, len(epath_vocab)))
    vect_epath = epath_vectorizer(epath_test.values)
    vect_epath_test = tf.reshape(vect_epath, (epath_test.shape[0], include_lines, len(epath_vocab)))
    # returns vectorized train/test data, split labels, and the vocabulary (so we can remember the widths of the vectorized input)
    return (vect_epath_train, vect_epath_test, labels_train, labels_test, epath_vocab)
def vectorize_operations(operation_dataSet):
    # split our input data with the same random_state so every input lines up
    op_train, op_test = train_test_split(operation_dataSet, test_size=test_split, random_state=55)
    op_trainVocab = tf.ragged.constant(op_train).flat_values
    op_train = tf.convert_to_tensor(op_train)
    op_test = tf.convert_to_tensor(op_test)
    operation_vectorizer = keras.layers.TextVectorization(standardize=None, split=None, output_mode="int")
    operation_vectorizer.adapt(op_trainVocab)
    operation_vocab = operation_vectorizer.get_vocabulary()
    print("Vocab includes: ", operation_vocab)
    v_op_train = operation_vectorizer(op_train)
    v_op_train = tf.reshape(v_op_train, (len(v_op_train), include_lines, 1))
    v_op_test = operation_vectorizer(op_test)
    v_op_test = tf.reshape(v_op_test, (len(v_op_test), include_lines, 1))
    return (v_op_train, v_op_test)
def vectorize_results(result_dataSet):
    res_train, res_test = train_test_split(result_dataSet, test_size=test_split, random_state=55)
    res_trainVocab = tf.ragged.constant(res_train).flat_values
    res_train = tf.convert_to_tensor(res_train)
    res_test = tf.convert_to_tensor(res_test)
    result_vectorizer = keras.layers.TextVectorization(standardize=None, split=None, output_mode="int")
    result_vectorizer.adapt(res_trainVocab)
    result_vocab = result_vectorizer.get_vocabulary()
    print("Vocab includes: ", result_vocab)
    v_res_train = result_vectorizer(res_train)
    v_res_train = tf.reshape(v_res_train, (len(v_res_train), include_lines, 1))
    v_res_test = result_vectorizer(res_test)
    v_res_test = tf.reshape(v_res_test, (len(v_res_test), include_lines, 1))
    return (v_res_train, v_res_test)
def normalize_durations(duration_dataSet):
    dur_train, dur_test = train_test_split(duration_dataSet, test_size=test_split, random_state=55)
    #dur_trainVocab = tf.ragged.constant(dur_train).flat_values
    dur_train = tf.convert_to_tensor(dur_train)
    dur_test = tf.convert_to_tensor(dur_test)
    duration_normalizer = keras.layers.Normalization(axis=None)
    duration_normalizer.adapt(dur_train)
    n_dur_train = duration_normalizer(dur_train)
    n_dur_train = tf.reshape(n_dur_train, (len(n_dur_train), include_lines, 1))
    n_dur_test = duration_normalizer(dur_test)
    n_dur_test = tf.reshape(n_dur_test, (len(n_dur_test), include_lines, 1))
    return (n_dur_train, n_dur_test)
# standardizes + splits text data ready for vectorization <- vectorize features separately (aside from paths) + only use training data for adapting
def standardize_data(pre_standardized_dataset):
    standardized_array = []
    standardized_array.append(path_lists_to_token_list(pre_standardized_dataset[:, :, 0]))   # <- standardized paths [0]
    standardized_array.append(multi_to_single_word_list(pre_standardized_dataset[:, :, 1]))  # <- standardized operations [1]
    standardized_array.append(multi_to_single_word_list(pre_standardized_dataset[:, :, 2]))  # <- standardized results [2]
    standardized_array.append(details_splitting(pre_standardized_dataset[:, :, 3]))          # <- standardized details [3]
    standardized_array.append(string_to_float_lists(pre_standardized_dataset[:, :, 4]))      # <- standardized durations [4]
    return standardized_array
def details_splitting(details_lists):
    standardized_details_list = []
    for x in range(0, len(details_lists)):  # <- x = 1 entire timeline
        standardized_details_list.append([])  # create timeline entry
        for y in range(0, len(details_lists[x])):
            tokenized_list = re.split(r', |: ', details_lists[x][y])
            curated_list = []
            for token in tokenized_list:
                detail = token.lower()
                detail = detail.replace(" ", "")
                detail = detail.replace(",", "")
                detail = detail.replace("0x", "")
                # hex values become decimal digit strings, so the digit filter below drops them
                try:
                    detail = str(int(detail, 16))
                except ValueError:
                    pass
                # drop overly long tokens (e.g. random UUIDs) and purely numeric tokens
                if(len(detail) > 15 or detail.isdigit()):
                    continue
                curated_list.append(detail)
            standardized_details_list[x].append(curated_list)
    return standardized_details_list
def string_to_float_lists(string_lists):
    standardized_float_list = []
    for x in range(0, len(string_lists)):  # <- x = 1 entire timeline
        standardized_float_list.append([])  # create timeline entry
        for y in range(0, len(string_lists[x])):  # <- y = 1 instance of activity
            duration = float(string_lists[x][y])
            standardized_float_list[x].append(duration)
    return standardized_float_list
def multi_to_single_word_list(word_list):
    standardized_word_list = []
    for x in range(0, len(word_list)):  # <- x = 1 entire timeline
        standardized_word_list.append([])  # create timeline entry
        for y in range(0, len(word_list[x])):  # <- y = 1 instance of activity
            word = word_list[x][y].replace(" ", "")
            word = word.lower()
            standardized_word_list[x].append(word)
    return standardized_word_list
def path_lists_to_token_list(path_list):  # in: a list of all paths, out: a list of lists containing each path split into tokens
    standardized_path_list = []
    for x in range(0, len(path_list)):  # <- x = 1 entire timeline
        standardized_path_list.append([])  # create timeline entry
        for y in range(0, len(path_list[x])):  # <- y = 1 instance of activity
            standardized_path_list[x].append(split_path_data_to_tokens(path_list[x][y]))
    return standardized_path_list
def split_path_data_to_tokens(path_data):
    # drop the first 3 chars (the Windows drive prefix), then split the path into tokens on '\' and '.'
    if(path_data[:3] == 'C:\\'):
        path_data = path_data[3:]
    path_data = re.split(r'\\|\.', path_data)
    curated_paths = []
    # keep token lengths reasonable (<= 10 chars) and drop purely numeric tokens, to stop random UUIDs being included
    for token in path_data:
        if(len(token) > 10 or token.isdigit()):
            continue
        curated_paths.append(token.lower())
    return curated_paths
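# Illustrative example of the tokenization above:
#   split_path_data_to_tokens('C:\\Windows\\System32\\drivers\\etc\\hosts')
#   -> ['windows', 'system32', 'drivers', 'etc', 'hosts']
# (drive prefix dropped, split on '\' and '.', long/numeric tokens filtered, everything lowercased)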
## testing
base_dataset, base_labels = construct_dataset_fromdir(dataset_path)
base_dataset = np.asarray(base_dataset)
input_lists = standardize_data(base_dataset)
base_dataset = None
train_dataset, test_dataset,labels_train,labels_test, vocabs = vectorize_feature_datasets(input_lists, base_labels)
label_encoder = LabelEncoder()
enc_labels_train = label_encoder.fit_transform(labels_train)
enc_labels_test = label_encoder.transform(labels_test)
print(enc_labels_train)
# HERE BEGINS THE BRAIN # PATH INPUT IS BEING CONSIDERED FOR REMOVAL, AS THE LAUNCH WILL ALWAYS BE FROM AN SMB SHARE.
path_in = keras.layers.Input(shape=(include_lines,len(vocabs[0])))
detail_in = keras.layers.Input(shape=(include_lines,len(vocabs[1])))
result_in = keras.layers.Input(shape=(include_lines,1))
operation_in = keras.layers.Input(shape=(include_lines,1))
duration_in = keras.layers.Input(shape=(include_lines,1))
lstm_path = tf.keras.layers.LSTM(units=LSTM_UNITS_PATH)(path_in)
lstm_details = tf.keras.layers.LSTM(units=LSTM_UNITS_DETAILS)(detail_in)
lstm_result = tf.keras.layers.LSTM(units=LSTM_UNITS_RESULTS)(result_in)
lstm_operation = tf.keras.layers.LSTM(units=LSTM_UNITS_OPERATION)(operation_in)
lstm_duration = tf.keras.layers.LSTM(units=LSTM_UNITS_DURATION)(duration_in)
dropped_path = keras.layers.Dropout(0.8)(lstm_path)
dropped_details = keras.layers.Dropout(0.8)(lstm_details)
dropped_op = keras.layers.Dropout(0.8)(lstm_operation)
merged_lstm = keras.layers.concatenate([dropped_path,dropped_details,lstm_result,dropped_op,lstm_duration])
dropped = keras.layers.Dropout(0.5)(merged_lstm)
dense = keras.layers.Dense(DENSE_UNITS, activation='relu')(dropped)
output = keras.layers.Dense(1, activation='sigmoid')(dense)  # sigmoid classification head on top of the dense layer
tm = tf.keras.Model(inputs=[path_in, detail_in, result_in, operation_in, duration_in], outputs=output)
# END OF BRAIN #
print("Saving...")
tf.keras.saving.save_model(tm,'/home/elliot/Documents/Work/uni/final_proj/LSTMalware/models/demo.keras', overwrite=True, save_format='keras')
print(tm.summary())
# Borrowed code #
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))
# End borrowed code #
callback = tf.keras.callbacks.ModelCheckpoint(filepath='/mnt/1tb/unishare/checkpoints/'+checkpoint_path,save_best_only=True,verbose=1)
tm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_m,precision_m, recall_m])
model_history = tm.fit(train_dataset,enc_labels_train,validation_data=(test_dataset,enc_labels_test),epochs=fit_epochs,batch_size=fit_batches,callbacks=[callback])
tf.keras.saving.save_model(tm,'/mnt/1tb/unishare/very_small.keras', overwrite=True, save_format='keras')
print("Preparing results data...")
fin_table = []
for x in range(0, fit_epochs):
    fin_table.append(['Epoch ' + str(x + 1),
                      model_history.history['val_f1_m'][x],
                      model_history.history['val_accuracy'][x],
                      model_history.history['val_loss'][x],
                      model_history.history['val_precision_m'][x],
                      model_history.history['val_recall_m'][x]])
print(tabulate(fin_table, headers=['Epoch', 'F1', 'Accuracy', 'Loss', 'Precision', 'Recall']))
exit()
# --- need to run standardization on each feature while keeping the dataset the same shape (rewrite the current standardization functions to accommodate this) - use dataset.map
# -- then split the datasets here, post-standardization, to produce training/test datasets for adapting the vectorization layers
# -- map the training features to the appropriate vectorizations
# -- remap the vectorized text back to the original shape
'''
outline:
data = dataset_fromdir
x_train, x_test = split(data)
// the following standardized lists are [[ALL FEATURE1 TENSOR],[ALL FEATURE2 TENSOR],..]
// these must be reshaped into the original dataset's shape once vectorization/normalization has taken place.
standard_train = standardize(x_train)
standard_test = standardize(x_test)
// the vectorization process should take both train and test data simultaneously but only adapt based
// on the training data; this prevents test data from leaking into the adapted vocabularies.
vectorized_train, vectorized_test = vectorize(standard_train, standard_test)
// once vectorization is finished, both datasets should be converted back to the original tensor shape
rebuilt_train, rebuilt_test = rebuild_dataset(vectorized_train, vectorized_test)
// after this there should be 2 datasets
rebuilt_train = [
    [[feature1_vectorized,feature2_vectorized],[feature1_vectorized,feature2_vectorized]],
    [[feature1_vectorized,feature2_vectorized],[feature1_vectorized,feature2_vectorized]]
]
rebuilt_test = [
    [[feature1_vectorized,feature2_vectorized],[feature1_vectorized,feature2_vectorized]],
    [[feature1_vectorized,feature2_vectorized],[feature1_vectorized,feature2_vectorized]]
]
'''
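# A minimal sketch of the `vectorize(standard_train, standard_test)` step from the outline above,
# for one single-token-per-timestep text feature (a hypothetical helper, not the final
# implementation): the layer adapts on the training split only, and the same fitted layer then
# transforms both splits, so no test tokens ever leak into the vocabulary.
def vectorize_split_feature(feature_train, feature_test):
    vectorizer = keras.layers.TextVectorization(standardize=None, split=None, output_mode="int")
    vectorizer.adapt(tf.ragged.constant(feature_train).flat_values)  # vocabulary from training data only
    v_train = vectorizer(tf.convert_to_tensor(feature_train))
    v_test = vectorizer(tf.convert_to_tensor(feature_test))          # reuses the train-fitted vocabulary
    return v_train, v_test, vectorizer.get_vocabulary()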