diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java index 01fa529234bf4..e611806682cda 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java @@ -6,6 +6,7 @@ package org.elasticsearch.xpack.ml.job.categorization; import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.elasticsearch.grok.Grok; import java.util.ArrayList; @@ -25,8 +26,11 @@ */ public final class GrokPatternCreator { + private static final Logger logger = LogManager.getLogger(GrokPatternCreator.class); + private static final String PREFACE = "preface"; private static final String EPILOGUE = "epilogue"; + private static final int MAX_RECURSE_DEPTH = 10; /** * The first match in this list will be chosen, so it needs to be ordered @@ -89,6 +93,11 @@ public static String findBestGrokMatchFromExamples(String jobId, String regex, C // E.g., ".*?cat.+?sat.+?mat.*" -> [ "", "cat", "sat", "mat" ] String[] fixedRegexBits = regex.split("\\.[*+]\\??"); + // If there are no fixed regex bits, that implies there would only be a single capture group + // And that would mean a pretty uninteresting grok pattern + if (fixedRegexBits.length == 0) { + return regex; + } // Create a pattern that will capture the bits in between the fixed parts of the regex // // E.g., ".*?cat.+?sat.+?mat.*" -> Pattern (.*?)cat(.+?)sat(.+?)mat(.*) @@ -112,8 +121,7 @@ public static String findBestGrokMatchFromExamples(String jobId, String regex, C // We should never get here. If we do it implies a bug in the original categorization, // as it's produced a regex that doesn't match the examples. assert matcher.matches() : exampleProcessor.pattern() + " did not match " + example; - LogManager.getLogger(GrokPatternCreator.class).error("[{}] Pattern [{}] did not match example [{}]", jobId, - exampleProcessor.pattern(), example); + logger.error("[{}] Pattern [{}] did not match example [{}]", jobId, exampleProcessor.pattern(), example); } } @@ -125,19 +133,19 @@ public static String findBestGrokMatchFromExamples(String jobId, String regex, C // Remember (from the first comment in this method) that the first element in this array is // always the empty string overallGrokPatternBuilder.append(fixedRegexBits[inBetweenBitNum]); - appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, inBetweenBitNum == 0, + appendBestGrokMatchForStrings(jobId, fieldNameCountStore, overallGrokPatternBuilder, inBetweenBitNum == 0, inBetweenBitNum == fixedRegexBits.length - 1, groupsMatchesFromExamples.get(inBetweenBitNum)); } return overallGrokPatternBuilder.toString(); } - /** - * Given a collection of strings, work out which (if any) of the grok patterns we're allowed - * to use matches it best. Then append the appropriate grok language to represent that finding - * onto the supplied string builder. - */ - static void appendBestGrokMatchForStrings(Map fieldNameCountStore, StringBuilder overallGrokPatternBuilder, - boolean isFirst, boolean isLast, Collection mustMatchStrings) { + private static void appendBestGrokMatchForStrings(String jobId, + Map fieldNameCountStore, + StringBuilder overallGrokPatternBuilder, + boolean isFirst, + boolean isLast, + Collection mustMatchStrings, + int numRecurse) { GrokPatternCandidate bestCandidate = null; if (mustMatchStrings.isEmpty() == false) { @@ -149,7 +157,10 @@ static void appendBestGrokMatchForStrings(Map fieldNameCountSto } } - if (bestCandidate == null) { + if (bestCandidate == null || numRecurse >= MAX_RECURSE_DEPTH) { + if (bestCandidate != null) { + logger.warn("[{}] exited grok discovery early, reached max depth [{}]", jobId, MAX_RECURSE_DEPTH); + } if (isLast) { overallGrokPatternBuilder.append(".*"); } else if (isFirst || mustMatchStrings.stream().anyMatch(String::isEmpty)) { @@ -161,13 +172,39 @@ static void appendBestGrokMatchForStrings(Map fieldNameCountSto Collection prefaces = new ArrayList<>(); Collection epilogues = new ArrayList<>(); populatePrefacesAndEpilogues(mustMatchStrings, bestCandidate.grok, prefaces, epilogues); - appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, isFirst, false, prefaces); + appendBestGrokMatchForStrings(jobId, + fieldNameCountStore, + overallGrokPatternBuilder, + isFirst, + false, + prefaces, + numRecurse + 1); overallGrokPatternBuilder.append("%{").append(bestCandidate.grokPatternName).append(':') - .append(buildFieldName(fieldNameCountStore, bestCandidate.fieldName)).append('}'); - appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, isLast, epilogues); + .append(buildFieldName(fieldNameCountStore, bestCandidate.fieldName)).append('}'); + appendBestGrokMatchForStrings(jobId, + fieldNameCountStore, + overallGrokPatternBuilder, + false, isLast, + epilogues, + numRecurse + 1); } } + + /** + * Given a collection of strings, work out which (if any) of the grok patterns we're allowed + * to use matches it best. Then append the appropriate grok language to represent that finding + * onto the supplied string builder. + */ + static void appendBestGrokMatchForStrings(String jobId, + Map fieldNameCountStore, + StringBuilder overallGrokPatternBuilder, + boolean isFirst, + boolean isLast, + Collection mustMatchStrings) { + appendBestGrokMatchForStrings(jobId, fieldNameCountStore, overallGrokPatternBuilder, isFirst, isLast, mustMatchStrings, 0); + } + /** * Given a collection of strings, and a grok pattern that matches some part of them all, * return collections of the bits that come before (prefaces) and after (epilogues) the diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java index 381a02e4580ad..5e3fd41de6782 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java @@ -14,6 +14,7 @@ import java.util.HashMap; import java.util.Map; +import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.Matchers.containsInAnyOrder; public class GrokPatternCreatorTests extends ESTestCase { @@ -71,7 +72,8 @@ public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() { Map fieldNameCountStore = new HashMap<>(); StringBuilder overallGrokPatternBuilder = new StringBuilder(); - GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false, + false, mustMatchStrings); assertEquals(".+?%{TIMESTAMP_ISO8601:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString()); } @@ -87,7 +89,8 @@ public void testAppendBestGrokMatchForStringsGivenTomcatDatestamps() { Map fieldNameCountStore = new HashMap<>(); StringBuilder overallGrokPatternBuilder = new StringBuilder(); - GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false, + false, mustMatchStrings); assertEquals(".*?%{TOMCAT_DATESTAMP:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString()); } @@ -105,7 +108,8 @@ public void testAppendBestGrokMatchForStringsGivenTrappyFloatCandidates() { Map fieldNameCountStore = new HashMap<>(); StringBuilder overallGrokPatternBuilder = new StringBuilder(); - GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false, + false, mustMatchStrings); assertEquals(".+?", overallGrokPatternBuilder.toString()); } @@ -120,7 +124,8 @@ public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() { Map fieldNameCountStore = new HashMap<>(); StringBuilder overallGrokPatternBuilder = new StringBuilder(); - GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false, + false, mustMatchStrings); assertEquals(".+?%{NUMBER:field}.+?", overallGrokPatternBuilder.toString()); } @@ -134,7 +139,8 @@ public void testAppendBestGrokMatchForStringsGivenNegativeNumbersWithoutBreak() Map fieldNameCountStore = new HashMap<>(); StringBuilder overallGrokPatternBuilder = new StringBuilder(); - GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false, + false, mustMatchStrings); // It seems sensible that we don't detect these suffices as either base 10 or base 16 numbers assertEquals(".+?", overallGrokPatternBuilder.toString()); @@ -150,7 +156,8 @@ public void testAppendBestGrokMatchForStringsGivenHexNumbers() { Map fieldNameCountStore = new HashMap<>(); StringBuilder overallGrokPatternBuilder = new StringBuilder(); - GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false, + false, mustMatchStrings); assertEquals(".*?%{BASE16NUM:field}.*?", overallGrokPatternBuilder.toString()); } @@ -163,7 +170,8 @@ public void testAppendBestGrokMatchForStringsGivenHostnamesWithNumbers() { Map fieldNameCountStore = new HashMap<>(); StringBuilder overallGrokPatternBuilder = new StringBuilder(); - GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false, + false, mustMatchStrings); // We don't want the .1. in the middle to get detected as a hex number assertEquals(".+?", overallGrokPatternBuilder.toString()); @@ -178,7 +186,8 @@ public void testAppendBestGrokMatchForStringsGivenEmailAddresses() { Map fieldNameCountStore = new HashMap<>(); StringBuilder overallGrokPatternBuilder = new StringBuilder(); - GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false, + false, mustMatchStrings); assertEquals(".*?%{EMAILADDRESS:email}.*?", overallGrokPatternBuilder.toString()); } @@ -192,7 +201,8 @@ public void testAppendBestGrokMatchForStringsGivenUris() { Map fieldNameCountStore = new HashMap<>(); StringBuilder overallGrokPatternBuilder = new StringBuilder(); - GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false, + false, mustMatchStrings); assertEquals(".*?%{URI:uri}.*?", overallGrokPatternBuilder.toString()); } @@ -206,7 +216,8 @@ public void testAppendBestGrokMatchForStringsGivenPaths() { Map fieldNameCountStore = new HashMap<>(); StringBuilder overallGrokPatternBuilder = new StringBuilder(); - GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false, + false, mustMatchStrings); assertEquals(".+?%{PATH:path}.*?", overallGrokPatternBuilder.toString()); } @@ -263,4 +274,74 @@ public void testFindBestGrokMatchFromExamplesGivenMultiTimestampLogs() { "%{IP:ipaddress}.+?Authpriv.+?Info.+?sshd.+?subsystem.+?request.+?for.+?sftp.*", GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples)); } + + public void testFindBestGrokMatchFromExamplesGivenAdversarialInputRecurseDepth() { + String regex = ".*?combo.+?rpc\\.statd.+?gethostbyname.+?error.+?for.+?X.+?X.+?Z.+?Z.+?hn.+?hn.*"; + // Two timestamps: one local, one UTC + Collection examples = Arrays.asList( + "combo rpc.statd[1605]: gethostbyname error for ^X^X^Z^Z%8x%8x%8x%8x%8x%8x%8x%8x%8x%62716x%hn%51859x%hn" + + "\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\22...", + "combo rpc.statd[1608]: gethostbyname error for ^X^X^Z^Z%8x%8x%8x%8x%8x%8x%8x%8x%8x%62716x%hn%51859x%hn" + + "\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\22...", + "combo rpc.statd[1635]: gethostbyname error for ^X^X^Z^Z%8x%8x%8x%8x%8x%8x%8x%8x%8x%62716x%hn%51859x%hn" + + "\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" + + "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\22..."); + assertEquals( + ".*?combo.+?rpc\\.statd.+?%{NUMBER:field}.+?gethostbyname.+?error.+?for.+?X.+?X.+?Z.+?Z.+?hn.+?hn.+?%{NUMBER:field2}" + + ".+?%{NUMBER:field3}.+?%{NUMBER:field4}.+?%{NUMBER:field5}.+?%{NUMBER:field6}.+?%{NUMBER:field7}.+?%{NUMBER:field8}" + + ".+?%{NUMBER:field9}.+?%{NUMBER:field10}.+?%{NUMBER:field11}.*", + GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples)); + } + + public void testFindBestGrokMatchFromExamplesGivenMatchAllRegex() { + String regex = ".*"; + // Two timestamps: one local, one UTC + Collection examples = Arrays.asList( + "Killing job [count_tweets]", + "Killing job [tweets_by_location]", + "[count_tweets] Killing job", + "[tweets_by_location] Killing job"); + assertThat(GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples), equalTo(regex)); + } }