From cda33e764e96537e4e0fd13ef3219f3ccc9cb0af Mon Sep 17 00:00:00 2001 From: Aaron Crickenberger Date: Tue, 19 May 2020 15:39:05 -0700 Subject: [PATCH] triage: bail on certain global clusters after 30s triage works by clustering test failures in two stages: - locally: create clusters of test failures for each unique test - globally: merge each test's clusters into a global set of clusters The clustering/merging is done by computing edit distance between the failure text of each test failure or failure cluster and accepting the first pair that has an edit distance of 10% of their combined length. This can add up in the worst case, where edit distance is going to be computed for every existing cluster before creating a new cluster. We've arbitrarily handled it thus far by: - truncating failure text to ~200k~ 10k chars - bailing out on local clustering after 60s per unique test This PR adds: - bailing out on global clustering of pathological / low value clusters after 30s - more logging to see where clustering is working vs. not --- triage/summarize.py | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/triage/summarize.py b/triage/summarize.py index 6e14f193a590..77b52ed1731f 100755 --- a/triage/summarize.py +++ b/triage/summarize.py @@ -49,6 +49,10 @@ r'|(?<=minion-group-|default-pool-)[-0-9a-z]{4,}' # node names ) +LONG_OUTPUT_LEN = 10000 +TRUNCATED_SEP = '\n...[truncated]...\n' +MAX_CLUSTER_TEXT_LEN = LONG_OUTPUT_LEN + len(TRUNCATED_SEP) + def normalize(s): """ @@ -84,12 +88,12 @@ def repl(m): s = flakeReasonOrdinalRE.sub(repl, s) - if len(s) > 10000: + if len(s) > LONG_OUTPUT_LEN: # for long strings, remove repeated lines! s = re.sub(r'(?m)^(.*\n)\1+', r'\1', s) - if len(s) > 10000: # ridiculously long test output - s = s[:5000] + '\n...[truncated]...\n' + s[-5000:] + if len(s) > LONG_OUTPUT_LEN: # ridiculously long test output + s = s[:int(LONG_OUTPUT_LEN/2)] + TRUNCATED_SEP + s[-int(LONG_OUTPUT_LEN/2):] return s @@ -287,7 +291,7 @@ def cluster_local(failed_tests): reverse=True), 1): num_failures += len(tests) - logging.info('%4d/%4d, %d failures, %s', n, len(failed_tests), len(tests), test_name) + logging.info('%4d/%4d tests, %5d failures, %s', n, len(failed_tests), len(tests), test_name) sys.stdout.flush() clustered[test_name] = cluster_test(tests) elapsed = time.time() - start @@ -333,19 +337,37 @@ def cluster_global(clustered, previous_clustered): key=lambda kv: sum(len(x) for x in kv[1].values()), reverse=True), 1): - logging.info('%4d/%4d, %d clusters, %s', n, len(clustered), len(test_clusters), test_name) + logging.info('%4d/%4d tests, %4d clusters, %s', n, len(clustered), len(test_clusters), test_name) + test_start = time.time() # Look at clusters with the most failures first - for key, tests in sorted(test_clusters.items(), - key=lambda x: len(x[1]), reverse=True): - num_failures += len(tests) + for m, (key, tests) in enumerate( + sorted(test_clusters.items(), + key=lambda x: len(x[1]), + reverse=True), + 1): + cluster_start = time.time() + ftext_len = len(key) + num_clusters = len(test_clusters) + num_tests = len(tests) + cluster_case = "" + logging.info(' %4d/%4d clusters, %5d chars failure text, %5d failures ...', m, num_clusters, ftext_len, num_tests) + num_failures += num_tests if key in clusters: + cluster_case = "EXISTING" clusters[key].setdefault(test_name, []).extend(tests) + # if we've taken longer than 30 seconds for this test, bail on pathological / low value cases + elif time.time() > test_start + 30 and ftext_len > MAX_CLUSTER_TEXT_LEN/2 and num_tests == 1: + cluster_case = "BAILED" else: other = find_match(key, clusters) if other: + cluster_case = "OTHER" clusters[other].setdefault(test_name, []).extend(tests) else: + cluster_case = "NEW" clusters[key] = {test_name: list(tests)} + cluster_dur = time.time() - cluster_start + logging.info(' %4d/%4d clusters, %5d chars failure text, %5d failures, cluster:%s in %d sec, test: %s', m, num_clusters, ftext_len, num_tests, cluster_case, cluster_dur, test_name) # If we seeded clusters using the previous run's keys, some of those # clusters may have disappeared. Remove the resulting empty entries.