import csv
import os
import re
import sys

import numpy as np

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # valid levels are 0-3; 3 silences all TensorFlow C++ logging
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow import keras
from tensorflow.keras import backend as K  # use the backend bundled with tensorflow.keras
from tabulate import tabulate
## variables
dataset_path = '/mnt/1tb/unishare/complete/mixed/'
checkpoint_path = 'verysmall7.cpk'
#checkpoint_path = os.getenv('CPATH',checkpoint_path)
include_lines = 2000 # how many log rows to take from each file
test_split = 0.35 # fraction of the data held out for testing
LSTM_UNITS_PATH = 128
LSTM_UNITS_DETAILS = 128
LSTM_UNITS_DURATION = 8
LSTM_UNITS_OPERATION = 16
LSTM_UNITS_RESULTS = 16
DENSE_UNITS = 32
fit_epochs = 8
fit_batches = 10
##
# read labels.csv from the dataset directory and use it to construct an array of every collected process it lists
''' in_dataset format:
      time ------------->
[       \/ one operation            \/ one operation
  [[feature1,feature2,feature3],[feature1,feature2,feature3],..],   <- program 1
  [[feature1,feature2,feature3],[feature1,feature2,feature3],..],   <- program 2
  [[feature1,feature2,feature3],[feature1,feature2,feature3],..],..
]
the model reads from multiple inputs; each input comes from one of the features
// should the input be just one instance of a sequence, or the whole sequence, since it is just a time sequence of one feature?
// does each input need to be labelled the same, or can all of the inputs share one label?
[vectorized_path] -> input(path)
[vectorized_details] -> input(details)
'''
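# Illustrative example of the expected on-disk layout (an assumption inferred from the columns
# read below; filenames and values here are made up):
#
#   labels.csv                  filename,label
#                               sample_one.csv,benign
#                               sample_two.csv,malicious
#
#   <per-process csv>           Operation,Path,Result,Detail,Duration
#                               CreateFile,C:\Windows\System32\kernel32.dll,SUCCESS,Desired Access: Read,0.0000071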
def construct_dataset_fromdir(directory):
    constructed_data = []
    labels = []
    with open(directory + 'labels.csv', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for i, process in enumerate(reader):
            timeline = retrieve_operation_data(directory + process['filename'])
            constructed_data.append(timeline)
            timeline = None
            labels.append(process['label'])
            '''
            if(i % 30 == 0):
                print("batching...")
                if('batched_data' in locals()):
                    batched_data = np.concatenate([batched_data, np.array(constructed_data)])
                else:
                    batched_data = np.array(constructed_data)
                constructed_data = []
                print(str(i), sys.getsizeof(batched_data))
            '''
    print("Dataset constructed.")
    return constructed_data, labels
# convert a single application's csv file into a usable list of the process's operations, taking only the first `include_lines` rows
def retrieve_operation_data(csv_path):
    process_operations = []
    with open(csv_path, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        #child_threads = [] LEGACY
        for i, row in enumerate(reader):
            # we only take the first `include_lines` rows
            if(i >= include_lines):
                break
            ''' THIS IS LEGACY CODE AND IS NO LONGER NEEDED, OUR DATASET IS PRE-FILTERED
            # if this is the monitored process or a child of this process
            if(row['Process Name'] == process_name or row['TID'] in child_threads):
                # if this thread creates a child we should follow that thread as well
                if(row["Operation"] == "Thread Create"):
                    # skip the prefix 'Thread ID: ' (len=11) and just append the thread number
                    child_threads.append(row['Detail'][11:])
            '''
            # pull the needed columns into variables
            proc_operation = row['Operation']
            proc_path = row['Path']
            proc_result = row['Result']
            proc_detail = row['Detail']
            proc_duration = row['Duration']
            # append one operation per row
            process_operations.append([proc_path, proc_operation, proc_result, proc_detail, proc_duration])
    return process_operations
# takes standardized feature lists and sends each set of features to the appropriate vectorizer/normalizer
def vectorize_feature_datasets(normalized_datalists, labels):
    print("Running Vectorization...")
    print("Vectorizing paths...")
    v_path_train, v_path_test, labels_train, labels_test, path_vocab = vectorize_epaths(normalized_datalists[0], labels)
    print("Vectorizing operations...")
    v_op_train, v_op_test = vectorize_operations(normalized_datalists[1])
    print("Vectorizing results...")
    v_res_train, v_res_test = vectorize_results(normalized_datalists[2])
    print("Vectorizing details...")
    v_details_train, v_details_test, _, _, details_vocab = vectorize_epaths(normalized_datalists[3], labels)
    print("Normalising durations...")
    n_dur_train, n_dur_test = normalize_durations(normalized_datalists[4])
    train_inputs = [v_path_train, v_details_train, v_res_train, v_op_train, n_dur_train]
    test_inputs = [v_path_test, v_details_test, v_res_test, v_op_test, n_dur_test]
    vocabs = [path_vocab, details_vocab]
    return train_inputs, test_inputs, labels_train, labels_test, vocabs
def vectorize_epaths(epath_dataSet, labels):
    # split our input data; this must be done for each input with the same random_state so every input lines up
    epath_train, epath_test, labels_train, labels_test = train_test_split(epath_dataSet, labels, test_size=test_split, random_state=55)
    # build ragged tensors: the outer dimensions match, but each entry can hold a different amount of text
    epath_vectorTrain = tf.ragged.constant(epath_train)
    epath_train = tf.ragged.constant(epath_train)
    epath_test = tf.ragged.constant(epath_test)
    # just the flattened values of the training data, used for adapting
    epath_vectorTrain = epath_vectorTrain.flat_values
    epath_vectorizer = keras.layers.TextVectorization(standardize=None, split=None, output_mode="tf_idf")
    epath_vectorizer.adapt(epath_vectorTrain)
    epath_vocab = epath_vectorizer.get_vocabulary()
    print("Vocab includes: ", epath_vocab)
    # vectorize, then reshape the output into a 3D tensor where each top-level entry is the next process
    vect_epath = epath_vectorizer(epath_train.values)
    vect_epath_train = tf.reshape(vect_epath, (epath_train.shape[0], include_lines, len(epath_vocab)))
    vect_epath = epath_vectorizer(epath_test.values)
    vect_epath_test = tf.reshape(vect_epath, (epath_test.shape[0], include_lines, len(epath_vocab)))
    # returns vectorized train/test data, split labels, and the vocabulary (so we can remember the widths of the vectorized input)
    return (vect_epath_train, vect_epath_test, labels_train, labels_test, epath_vocab)
def vectorize_operations(operation_dataSet):
    # split our input data with the same random_state so every input lines up
    op_train, op_test = train_test_split(operation_dataSet, test_size=test_split, random_state=55)
    op_trainVocab = tf.ragged.constant(op_train).flat_values
    op_train = tf.convert_to_tensor(op_train)
    op_test = tf.convert_to_tensor(op_test)
    operation_vectorizer = keras.layers.TextVectorization(standardize=None, split=None, output_mode="int")
    operation_vectorizer.adapt(op_trainVocab)
    operation_vocab = operation_vectorizer.get_vocabulary()
    print("Vocab includes: ", operation_vocab)
    v_op_train = operation_vectorizer(op_train)
    v_op_train = tf.reshape(v_op_train, (len(v_op_train), include_lines, 1))
    v_op_test = operation_vectorizer(op_test)
    v_op_test = tf.reshape(v_op_test, (len(v_op_test), include_lines, 1))
    return (v_op_train, v_op_test)
def vectorize_results(result_dataSet):
    res_train, res_test = train_test_split(result_dataSet, test_size=test_split, random_state=55)
    res_trainVocab = tf.ragged.constant(res_train).flat_values
    res_train = tf.convert_to_tensor(res_train)
    res_test = tf.convert_to_tensor(res_test)
    result_vectorizer = keras.layers.TextVectorization(standardize=None, split=None, output_mode="int")
    result_vectorizer.adapt(res_trainVocab)
    result_vocab = result_vectorizer.get_vocabulary()
    print("Vocab includes: ", result_vocab)
    v_res_train = result_vectorizer(res_train)
    v_res_train = tf.reshape(v_res_train, (len(v_res_train), include_lines, 1))
    v_res_test = result_vectorizer(res_test)
    v_res_test = tf.reshape(v_res_test, (len(v_res_test), include_lines, 1))
    return (v_res_train, v_res_test)
def normalize_durations(duration_dataSet):
    dur_train, dur_test = train_test_split(duration_dataSet, test_size=test_split, random_state=55)
    #dur_trainVocab = tf.ragged.constant(dur_train).flat_values
    dur_train = tf.convert_to_tensor(dur_train)
    dur_test = tf.convert_to_tensor(dur_test)
    duration_normalizer = keras.layers.Normalization(axis=None)
    duration_normalizer.adapt(dur_train)
    n_dur_train = duration_normalizer(dur_train)
    n_dur_train = tf.reshape(n_dur_train, (len(n_dur_train), include_lines, 1))
    n_dur_test = duration_normalizer(dur_test)
    n_dur_test = tf.reshape(n_dur_test, (len(n_dur_test), include_lines, 1))
    return (n_dur_train, n_dur_test)
# standardizes + splits text data ready for vectorization <- vectorize features separately (aside from paths) + only use training data for adapting
def standardize_data(pre_standardized_dataset):
    standardized_array = []
    standardized_array.append(path_lists_to_token_list(pre_standardized_dataset[:, :, 0]))   # <- standardized paths [0]
    standardized_array.append(multi_to_single_word_list(pre_standardized_dataset[:, :, 1]))  # <- standardized operations [1]
    standardized_array.append(multi_to_single_word_list(pre_standardized_dataset[:, :, 2]))  # <- standardized results [2]
    standardized_array.append(details_splitting(pre_standardized_dataset[:, :, 3]))          # <- standardized details [3]
    standardized_array.append(string_to_float_lists(pre_standardized_dataset[:, :, 4]))      # <- standardized durations [4]
    return standardized_array
def details_splitting(details_lists):
    standardized_details_list = []
    for x in range(0, len(details_lists)):  # <- x = 1 entire timeline
        standardized_details_list.append([])  # create timeline entry
        for y in range(0, len(details_lists[x])):
            tokenized_list = re.split(r', |: ', details_lists[x][y])
            curated_list = []
            for token in tokenized_list:
                detail = token.lower()
                detail = detail.replace(" ", "")
                detail = detail.replace(",", "")
                detail = detail.replace("0x", "")
                # hex values become decimal digit strings, so the digit filter below drops them
                try:
                    detail = str(int(detail, 16))
                except ValueError:
                    pass
                # drop overly long tokens (e.g. random UUIDs) and purely numeric tokens
                if(len(detail) > 15 or detail.isdigit()):
                    continue
                curated_list.append(detail)
            standardized_details_list[x].append(curated_list)
    return standardized_details_list
def string_to_float_lists(string_lists):
    standardized_float_list = []
    for x in range(0, len(string_lists)):  # <- x = 1 entire timeline
        standardized_float_list.append([])  # create timeline entry
        for y in range(0, len(string_lists[x])):  # <- y = 1 instance of activity
            duration = float(string_lists[x][y])
            standardized_float_list[x].append(duration)
    return standardized_float_list
def multi_to_single_word_list(word_list):
    standardized_word_list = []
    for x in range(0, len(word_list)):  # <- x = 1 entire timeline
        standardized_word_list.append([])  # create timeline entry
        for y in range(0, len(word_list[x])):  # <- y = 1 instance of activity
            word = word_list[x][y].replace(" ", "")
            word = word.lower()
            standardized_word_list[x].append(word)
    return standardized_word_list
def path_lists_to_token_list(path_list):  # in: a list of all paths, out: a list of lists containing each path split into tokens
    standardized_path_list = []
    for x in range(0, len(path_list)):  # <- x = 1 entire timeline
        standardized_path_list.append([])  # create timeline entry
        for y in range(0, len(path_list[x])):  # <- y = 1 instance of activity
            standardized_path_list[x].append(split_path_data_to_tokens(path_list[x][y]))
    return standardized_path_list
def split_path_data_to_tokens(path_data):
    # drop the first 3 chars (the Windows drive prefix), then split the path into tokens on '\' and '.'
    if(path_data[:3] == 'C:\\'):
        path_data = path_data[3:]
    path_data = re.split(r'\\|\.', path_data)
    curated_paths = []
    # keep token lengths reasonable (<= 10 chars) and drop purely numeric tokens, to stop random UUIDs being included
    for token in path_data:
        if(len(token) > 10 or token.isdigit()):
            continue
        curated_paths.append(token.lower())
    return curated_paths
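# Illustrative example of the tokenization above:
#   split_path_data_to_tokens('C:\\Windows\\System32\\drivers\\etc\\hosts')
#   -> ['windows', 'system32', 'drivers', 'etc', 'hosts']
# (drive prefix dropped, split on '\' and '.', long/numeric tokens filtered, everything lowercased)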
## testing
base_dataset, base_labels = construct_dataset_fromdir(dataset_path)
base_dataset = np.asarray(base_dataset)
input_lists = standardize_data(base_dataset)
base_dataset = None
train_dataset, test_dataset,labels_train,labels_test, vocabs = vectorize_feature_datasets(input_lists, base_labels)
label_encoder = LabelEncoder()
enc_labels_train = label_encoder.fit_transform(labels_train)
enc_labels_test = label_encoder.transform(labels_test)
print(enc_labels_train)
# HERE BEGINS THE BRAIN # PATH INPUT IS BEING CONSIDERED FOR REMOVAL, AS THE LAUNCH WILL ALWAYS BE FROM AN SMB SHARE.
path_in = keras.layers.Input(shape=(include_lines,len(vocabs[0])))
detail_in = keras.layers.Input(shape=(include_lines,len(vocabs[1])))
result_in = keras.layers.Input(shape=(include_lines,1))
operation_in = keras.layers.Input(shape=(include_lines,1))
duration_in = keras.layers.Input(shape=(include_lines,1))
lstm_path = tf.keras.layers.LSTM(units=LSTM_UNITS_PATH)(path_in)
lstm_details = tf.keras.layers.LSTM(units=LSTM_UNITS_DETAILS)(detail_in)
lstm_result = tf.keras.layers.LSTM(units=LSTM_UNITS_RESULTS)(result_in)
lstm_operation = tf.keras.layers.LSTM(units=LSTM_UNITS_OPERATION)(operation_in)
lstm_duration = tf.keras.layers.LSTM(units=LSTM_UNITS_DURATION)(duration_in)
dropped_path = keras.layers.Dropout(0.8)(lstm_path)
dropped_details = keras.layers.Dropout(0.8)(lstm_details)
dropped_op = keras.layers.Dropout(0.8)(lstm_operation)
merged_lstm = keras.layers.concatenate([dropped_path,dropped_details,lstm_result,dropped_op,lstm_duration])
dropped = keras.layers.Dropout(0.5)(merged_lstm)
dense = keras.layers.Dense(DENSE_UNITS, activation='relu')(dropped)
output = keras.layers.Dense(1, activation='sigmoid')(dense)  # sigmoid classification head on top of the dense layer
tm = tf.keras.Model(inputs=[path_in, detail_in, result_in, operation_in, duration_in], outputs=output)
# END OF BRAIN #
print("Saving...")
tf.keras.saving.save_model(tm,'/home/elliot/Documents/Work/uni/final_proj/LSTMalware/models/demo.keras', overwrite=True, save_format='keras')
print(tm.summary())
# Borrowed code #
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))
# End borrowed code #
callback = tf.keras.callbacks.ModelCheckpoint(filepath='/mnt/1tb/unishare/checkpoints/'+checkpoint_path,save_best_only=True,verbose=1)
tm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_m,precision_m, recall_m])
model_history = tm.fit(train_dataset,enc_labels_train,validation_data=(test_dataset,enc_labels_test),epochs=fit_epochs,batch_size=fit_batches,callbacks=[callback])
tf.keras.saving.save_model(tm,'/mnt/1tb/unishare/very_small.keras', overwrite=True, save_format='keras')
print("Preparing results data...")
fin_table = []
for x in range(0, fit_epochs):
    fin_table.append(['Epoch ' + str(x + 1),
                      model_history.history['val_f1_m'][x],
                      model_history.history['val_accuracy'][x],
                      model_history.history['val_loss'][x],
                      model_history.history['val_precision_m'][x],
                      model_history.history['val_recall_m'][x]])
print(tabulate(fin_table, headers=['Epoch', 'F1', 'Accuracy', 'Loss', 'Precision', 'Recall']))
exit()
# --- need to run standardization on each feature while keeping the dataset the same shape (rewrite the current standardization functions to accommodate this) - use dataset.map
# -- then split the datasets here, post-standardization, to produce training/test datasets for adapting the vectorization layers
# -- map the training features to the appropriate vectorizations
# -- remap the vectorized text back to the original shape
'''
outline:
data = dataset_fromdir
x_train, x_test = split(data)
// the following standardized lists are [[ALL FEATURE1 TENSOR],[ALL FEATURE2 TENSOR],..]
// these must be reshaped into the original dataset's shape once vectorization/normalization has taken place.
standard_train = standardize(x_train)
standard_test = standardize(x_test)
// the vectorization process should take both train and test data simultaneously but only adapt based
// on the training data; this prevents test data from leaking into the adapted vocabularies.
vectorized_train, vectorized_test = vectorize(standard_train, standard_test)
// once vectorization is finished, both datasets should be converted back to the original tensor shape
rebuilt_train, rebuilt_test = rebuild_dataset(vectorized_train, vectorized_test)
// after this there should be 2 datasets
rebuilt_train = [
    [[feature1_vectorized,feature2_vectorized],[feature1_vectorized,feature2_vectorized]],
    [[feature1_vectorized,feature2_vectorized],[feature1_vectorized,feature2_vectorized]]
]
rebuilt_test = [
    [[feature1_vectorized,feature2_vectorized],[feature1_vectorized,feature2_vectorized]],
    [[feature1_vectorized,feature2_vectorized],[feature1_vectorized,feature2_vectorized]]
]
'''
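# A minimal sketch of the `vectorize(standard_train, standard_test)` step from the outline above,
# for one single-token-per-timestep text feature (a hypothetical helper, not the final
# implementation): the layer adapts on the training split only, and the same fitted layer then
# transforms both splits, so no test tokens ever leak into the vocabulary.
def vectorize_split_feature(feature_train, feature_test):
    vectorizer = keras.layers.TextVectorization(standardize=None, split=None, output_mode="int")
    vectorizer.adapt(tf.ragged.constant(feature_train).flat_values)  # vocabulary from training data only
    v_train = vectorizer(tf.convert_to_tensor(feature_train))
    v_test = vectorizer(tf.convert_to_tensor(feature_test))          # reuses the train-fitted vocabulary
    return v_train, v_test, vectorizer.get_vocabulary()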