#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This file is part of Cockpit.
#
# Copyright (C) 2017 Slavek Kabrda
#
# Cockpit is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Cockpit is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Cockpit; If not, see <http://www.gnu.org/licenses/>.

# WARNING: When you change this code, increment this version number so
# that the machine learning model uses a new place to store the model
VERSION = 3

# This code extracts features from log items. In particular it normalizes
# and extracts the log.
#
# TODO: We could weight log lines using TF-IDF, but that would require
# a distance function that could apply that weight between lines. The
# NCD distance we use cannot do that.

import calendar
import re
import time

import sklearn.feature_extraction.text

# Ignore lines that appear in at least this fraction of logs
IGNORE_THRESHHOLD = 0.17

# Choose only one out of every N tracked items. These have
# already been manually "clustered" elsewhere, and we only need
# some cluster seeds
TRACKER_SPARSE = 100

NOISE = (
# 512 bit hashes
("x" * 128, re.compile('[0-9a-f]{128}')),
("X" * 128, re.compile('[0-9A-F]{128}')),
# 256 bit hashes
("x" * 64, re.compile('[0-9a-f]{64}')),
("X" * 64, re.compile('[0-9A-F]{64}')),
# 224 bit hashes
("x" * 56, re.compile('[0-9a-f]{56}')),
("X" * 56, re.compile('[0-9A-F]{56}')),
# 160 bit hashes
("x" * 40, re.compile('[0-9a-f]{40}')),
("X" * 40, re.compile('[0-9A-F]{40}')),
# 128 bit hashes
("x" * 32, re.compile('[0-9a-f]{32}')),
("X" * 32, re.compile('[0-9A-F]{32}')),
# GUIDs
('xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx',
re.compile('[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}')),
# Digits
    ('000', re.compile(r'\d+')),
# Filenames
    (r'\g<base>', re.compile(r'(/[a-zA-Z0-9_\.-]+)+/(?P<base>[a-zA-Z0-9_.-]+)\b'))
)
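
# As an illustrative example of the substitutions above (the sample line is
# made up), tokenize() below would collapse
#     Deleted /tmp/cockpit-test/log.txt after 12 seconds
# into
#     Deleted log.txt after 000 seconds
# so lines that differ only in paths, counters, hashes or GUIDs compare equal.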

# Various features extracted
FEATURE_LOG = 0 # string: The normalized and collapsed log extracted
FEATURE_INDEX = 1 # number: Unique index of the item
FEATURE_URL = 2 # string: The full URL to the test result
FEATURE_NAME = 3 # string: The name of the test run
FEATURE_CONTEXT = 4 # string: The context in which the test is run
FEATURE_TRACKER = 5 # string: A tracker issue for this
FEATURE_MERGED = 6 # number: 1 if merged, 0 if not, -1 if unknown
FEATURE_TIMESTAMP = 7 # number: The time since epoch at which test was run
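
# Each tuple produced by Extractor.transform() below is indexed by these
# constants, e.g. feature[FEATURE_URL] gives the test result URL of an item.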

# Return already tokenized data: the log is pre-split into lines,
# so the vectorizer's tokenizer has nothing to do
def noop(value):
    return value

# Select which items we want to operate on.
#
# Because we have so many tracked failures, we only bring a sparse
# sample of them into our clustering algorithm. We can assume that
# the tracked failures already form clusters.
tracked = { }
def select(item):
if item.get("status") != "failure":
return False
tracker = item.get("tracker")
if not tracker:
return True
count = tracked[tracker] = tracked.get(tracker, 0) + 1
return count % TRACKER_SPARSE == 0 # Only every Nth for tracked failures
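
# For example, with TRACKER_SPARSE = 100 a tracker with 250 recorded failures
# only contributes its 100th and 200th items as cluster seeds, while untracked
# failures are always selected.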

# The actual feature extractor. Currently only extracts a
# normalized log from each item. By using fit() you can train
# the extractor to ignore frequently found lines.
class Extractor():
def __init__(self, verbose=False):
self.extract = sklearn.feature_extraction.text.CountVectorizer(
analyzer='word',
tokenizer=noop,
lowercase=False,
max_df=IGNORE_THRESHHOLD)
self.verbose = verbose

    @staticmethod
def tokenize(item):
result = [ ]
value = item["log"] or ""
for line in value.replace('\r\n', '\n').replace('\r', '\n').split('\n'):
line = line.strip()
for (substitute, pattern) in NOISE:
line = pattern.sub(substitute, line)
result.append(line)
return result

    def fit(self, items, tokenized=None):
tokenized = tokenized or map(Extractor.tokenize, items)
self.extract.fit(tokenized)

    def transform(self, items, tokenized=None, limit=None):
tokenized = list(tokenized or map(Extractor.tokenize, items))
results = [ ]
for index, item in enumerate(items):
if not select(item):
continue
lines = tokenized[index]
filtered = filter(lambda line: line not in self.extract.stop_words_, lines)
try:
timestamp = calendar.timegm(time.strptime(item.get("date", ""), "%Y-%m-%dT%H:%M:%SZ"))
except ValueError:
timestamp = -1
merged = item.get("merged")
if merged is None:
merged = -1
else:
                merged = 1 if merged else 0
results.append((
"\n".join(filtered), # FEATURE_LOG
index, # FEATURE_INDEX
item.get("url", ""), # FEATURE_URL
item.get("test", ""), # FEATURE_NAME
item.get("context", ""), # FEATURE_CONTEXT
item.get("tracker", ""), # FEATURE_TRACKER
merged, # FEATURE_MERGED
timestamp # FEATURE_TIMESTAMP
))
if limit:
results = results[-limit:]
return results

    def fit_transform(self, items, limit=None):
tokenized = list(map(Extractor.tokenize, items))
self.fit(items, tokenized)
return self.transform(items, tokenized, limit=limit)

    def stop_tokens(self):
return self.extract.stop_words_
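
# A minimal usage sketch (assuming "items" is a list of dicts like those
# produced by data.load() below, each with at least "log" and "status"):
#
#     extractor = Extractor()
#     features = extractor.fit_transform(items)
#     for feature in features:
#         print(feature[FEATURE_NAME], feature[FEATURE_URL])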

# This is a helpful debugging tool for diagnosing the data and figuring out
# whether the above threshold and regular expressions are right
if __name__ == '__main__':
import data
import argparse
parser = argparse.ArgumentParser(description="Look for noise lines in input jsonl")
parser.add_argument("--only", action="append", help="Only analyze these statuses")
parser.add_argument("-v", "--verbose", action="store_true", help="Print verbose progress output")
parser.add_argument("-t", "--tokenize", action="store_true", help="Just tokenize a raw file")
parser.add_argument("filename", help="The filename in JSONL gzip format or raw file for --tokenize")
opts = parser.parse_args()
if opts.tokenize:
with open(opts.filename, "r") as fp:
contents = fp.read()
print("\n".join(Extractor.tokenize({ "log": contents })))
else:
        # The kinds of statuses to include
if not opts.only:
only = None
else:
only = lambda item: item.get("status") in opts.only
# Load the actual data
items = data.load(opts.filename, only=only, verbose=opts.verbose)
# Print out all lines we think are stop lines in the data
extract = Extractor()
extract.fit(items)
for stop in extract.stop_tokens():
print(stop)