-
Notifications
You must be signed in to change notification settings - Fork 2
/
preprocess.py
109 lines (94 loc) · 2.93 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import sys
# Validate the command line BEFORE indexing sys.argv, so that a missing
# argument prints the usage text instead of raising IndexError.
if len(sys.argv) < 3:
    print('Usage: python preprocess.py <INPUT_FOLDER> <OUTPUT_FOLDER>')
    sys.exit(-1)

INPUT_FOLDER = sys.argv[1]   # root directory that is walked for input files
OUTPUT_FOLDER = sys.argv[2]  # directory the per-command output files go to
def preprocess_log(filetype, log_files=('dataset/mcBSC/Antti_Palojarvi/Examples/Training1/Switch_logs/no_passive_missing/EXT2_BMT.log',)):
    """Split log files into chunks grouped by the command that started them.

    Every file is read as ISO-8859-1 text and scanned line by line. A line
    containing ';' starts a new chunk. If one of that line's tokens starts
    with 'Z', the part of the token before the first ':' (cut down to its
    first 4 characters when it is longer than 5) becomes the current command
    name, and the chunk collected so far is stored under the previous
    command name. The last chunk of each file is stored when the file ends.

    Args:
        filetype: Not used by this function; kept so existing callers work.
        log_files: Iterable of paths of the log files to process.

    Returns:
        A dict mapping command name to a list of chunks. Each chunk is a
        list of strings made of the line number, a tab, and the original
        line (including its line ending). Line numbers start at 1 and keep
        counting across all files in ``log_files``.
    """
    cmd_log = {}
    line_num = 1
    for log_filename in log_files:
        # The context manager closes the file even if reading fails.
        with open(log_filename, 'r', encoding="ISO-8859-1") as log_file:
            content = log_file.readlines()

        collected_log = []
        prev_cmd = None  # command names do not carry over between files
        for line in content:
            numbered_line = str(line_num) + '\t' + line
            if ";" in line:
                for word in word_tokenize(line):
                    if word.startswith('Z'):
                        w = word.split(':')[0]
                        if len(w) > 5:
                            w = w[:4]
                        # A new command begins: file the finished chunk
                        # under the command that produced it.
                        if prev_cmd is not None:
                            cmd_log.setdefault(prev_cmd, []).append(collected_log)
                        prev_cmd = w
                        break
                # NOTE(review): a ';' line without a 'Z' token also restarts
                # the chunk, dropping what was collected so far without
                # storing it. Kept as in the original; confirm it is intended.
                collected_log = [numbered_line]
            else:
                collected_log.append(numbered_line)
            line_num += 1

        # Store the trailing chunk of this file.
        if prev_cmd is not None:
            cmd_log.setdefault(prev_cmd, []).append(collected_log)
    return cmd_log
def zero_digits(s):
    """Return ``s`` with every digit character replaced by '0'."""
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    return re.sub(r'\d', '0', s)
filetypes = {}  # file type (file name with digits zeroed) -> list of file paths


def walk_directory(rootdir=INPUT_FOLDER):
    """Group the files found under ``rootdir`` by their file type.

    The file type of a file is its name with every digit replaced by '0'
    (see ``zero_digits``). A file is skipped when its name ends with one of
    the excluded suffixes, when its file type ends with 'S000', or when it
    is named 'info.txt'.

    Args:
        rootdir: Directory that is walked recursively with ``os.walk``.

    Returns:
        The module-level ``filetypes`` dict, updated in place, mapping each
        file type to the list of full paths of the files of that type.
    """
    # Matched with str.endswith so that suffixes of any length work; the
    # previous ``file[-4:]`` comparison could never match the 5-character
    # '.xlsx' entry.
    excluded_suffixes = (
        '.zip', '.ZIP', '.bin', '.BIN', '.rar', '.MAP', '.BAK', '.BBX',
        '.tgz', '.DAT', '.SHL', 'TEST', '0.HW', '.xlsx', '.Z',
    )
    for subdir, _dirs, files in os.walk(rootdir):
        for file in files:
            filetype = zero_digits(file)
            if (file == 'info.txt'
                    or file.endswith(excluded_suffixes)
                    or filetype.endswith('S000')):
                continue
            filetypes.setdefault(filetype, []).append(os.path.join(subdir, file))
    return filetypes
# Driver: group the input files by type, split each group into per-command
# chunks, and write one output file per (command, file type) pair.
filetypes = walk_directory()

directory = OUTPUT_FOLDER
# exist_ok=True replaces the separate existence check, which could race with
# another process creating the directory.
os.makedirs(directory, exist_ok=True)

for ftype in filetypes:
    if ftype == '':
        continue
    print('processing ftype:', ftype)
    flog = preprocess_log(ftype, filetypes[ftype])
    for cmd, logs in flog.items():
        if cmd == '':
            continue
        out_path = os.path.join(directory, str(cmd) + '_' + str(ftype))
        # NOTE(review): the input is decoded as ISO-8859-1 but the output uses
        # the platform default encoding; confirm that is what consumers expect.
        with open(out_path, 'w') as out_file:
            for log in logs:
                out_file.write(''.join(log))
# ftype = 'IP_configurations.txt'
# print(preprocess_log(ftype, filetypes[ftype][:2]))
# process_onekind()
# print(cmd_log)