From 7f2bbc6e8d5b637dafff8f856c6ec1162a8a098d Mon Sep 17 00:00:00 2001 From: David Roberts Date: Sun, 13 Oct 2019 20:06:42 +0100 Subject: [PATCH] [ML] Fix detection of syslog-like timestamp in find_file_structure (#47970) Usually syslog timestamps have two spaces before a single-digit day-of-month. However, in some non-syslog cases where syslog-like timestamps are used there is only one space. The grok pattern supports this, so the timestamp parser should too. This change makes the timestamp parser used by the find_file_structure endpoint accept the single-space form as well. It also fixes another problem that the same test case exposed in the find_file_structure endpoint: the exclude_lines_pattern for delimited files was always created on the assumption that the delimiter was a comma. Now it is based on the actual delimiter. --- .../DelimitedFileStructureFinder.java | 4 +- .../TimestampFormatFinder.java | 8 ++-- .../DelimitedFileStructureFinderTests.java | 42 +++++++++++++++++++ .../FileStructureUtilsTests.java | 4 +- .../TimestampFormatFinderTests.java | 11 ++--- 5 files changed, 57 insertions(+), 12 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java index 7e5b660c8da74..b947ed2d9cffe 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java @@ -139,9 +139,11 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List optQuote + column.replace(quote, twoQuotes).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + optQuote) - .collect(Collectors.joining(","))); + .collect(Collectors.joining(delimiterMatcher))); } boolean needClientTimeZone = timeField.v2().hasTimezoneDependentParsing(); diff --git 
a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java index 08686ed26a31c..fef24b2ac73a4 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java @@ -145,7 +145,7 @@ public final class TimestampFormatFinder { example -> CandidateTimestampFormat.expandDayAndAdjustFractionalSecondsFromExample(example, "MMM dd HH:mm:ss"), "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)(?:[:.,][0-9]{3,9})?\\b", "SYSLOGTIMESTAMP", - Arrays.asList(" 11 11 11 11", " 1 11 11 11"), 4, 10), + Arrays.asList(" 11 11 11 11", " 1 11 11 11"), 6, 10), new CandidateTimestampFormat(example -> Collections.singletonList("dd/MMM/yyyy:HH:mm:ss XX"), "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "\\b%{MONTHDAY}/%{MONTH}/%{YEAR}:%{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) [+-]?%{HOUR}%{MINUTE}\\b", "HTTPDATE", @@ -154,10 +154,10 @@ public final class TimestampFormatFinder { "\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "%{MONTH} %{MONTHDAY}, 20\\d{2} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:AM|PM)\\b", "CATALINA_DATESTAMP", Arrays.asList(" 11 1111 1 11 11", " 11 1111 11 11 11"), 0, 3), - new CandidateTimestampFormat(example -> Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), + new CandidateTimestampFormat(example -> Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), "\\b[A-Z]\\S{2} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", "%{MONTH} +%{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b", "CISCOTIMESTAMP", - Arrays.asList(" 11 1111 11 11 11", " 1 1111 11 11 11"), 0, 0), + Arrays.asList(" 11 1111 11 11 11", " 1 1111 11 11 11"), 1, 0), new 
CandidateTimestampFormat(CandidateTimestampFormat::indeterminateDayMonthFormatFromExample, "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "\\b%{DATESTAMP}\\b", "DATESTAMP", // In DATESTAMP the month may be 1 or 2 digits, but the day must be 2 @@ -1467,7 +1467,7 @@ private static String adjustFractionalSecondsFromEndOfExample(String example, St static List expandDayAndAdjustFractionalSecondsFromExample(String example, String formatWithddAndNoFraction) { String formatWithdd = adjustFractionalSecondsFromEndOfExample(example, formatWithddAndNoFraction); - return Arrays.asList(formatWithdd, formatWithdd.replace(" dd", " d")); + return Arrays.asList(formatWithdd, formatWithdd.replace(" dd", " d"), formatWithdd.replace(" dd", " d")); } static List indeterminateDayMonthFormatFromExample(String example) { diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java index 01a45b67e8784..993343084a848 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java @@ -24,6 +24,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', '"', 2, false); + private FileStructureFinderFactory tsvFactory = new DelimitedFileStructureFinderFactory('\t', '"', 3, false); public void testCreateConfigsGivenCompleteCsv() throws Exception { String sample = "time,message\n" + @@ -368,6 +369,47 @@ public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception { assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss.SSSSSS"), structure.getJodaTimestampFormats()); } + 
public void testCreateConfigsGivenTsvWithSyslogLikeTimestamp() throws Exception { + String sample = "Latitude\tLongitude\tloc\tTimestamp\n" + + "25.78042\t18.441196\t\"25.7804200000,18.4411960000\"\tJun 30 2019 13:21:24\n" + + "25.743484\t18.443047\t\"25.7434840000,18.4430470000\"\tJun 30 2019 06:02:35\n" + + "25.744583\t18.442783\t\"25.7445830000,18.4427830000\"\tJun 30 2019 06:02:35\n" + + "25.754593\t18.431637\t\"25.7545930000,18.4316370000\"\tJul 1 2019 06:02:43\n" + + "25.768574\t18.433483\t\"25.7685740000,18.4334830000\"\tJul 1 2019 06:21:28\n" + + "25.757736\t18.438683\t\"25.7577360000,18.4386830000\"\tJul 1 2019 12:06:08\n" + + "25.76615\t18.436565\t\"25.7661500000,18.4365650000\"\tJul 1 2019 12:06:08\n" + + "25.76896\t18.43586\t\"25.7689600000,18.4358600000\"\tJul 1 2019 12:13:50\n" + + "25.76423\t18.43705\t\"25.7642300000,18.4370500000\"\tJul 1 2019 12:39:10\n"; + assertTrue(tsvFactory.canCreateFromSample(explanation, sample)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = tsvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + + FileStructure structure = structureFinder.getStructure(); + + assertEquals(FileStructure.Format.DELIMITED, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if (hasByteOrderMarker == null) { + assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertEquals("^\"?Latitude\"?\\t\"?Longitude\"?\\t\"?loc\"?\\t\"?Timestamp\"?", + structure.getExcludeLinesPattern()); + assertNull(structure.getMultilineStartPattern()); + assertEquals(Character.valueOf('\t'), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); + 
assertTrue(structure.getHasHeaderRow()); + assertNull(structure.getShouldTrimFields()); + assertEquals(Arrays.asList("Latitude", "Longitude", "loc", "Timestamp"), structure.getColumnNames()); + assertNull(structure.getGrokPattern()); + assertEquals("Timestamp", structure.getTimestampField()); + assertEquals(Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"), + structure.getJodaTimestampFormats()); + } + public void testCreateConfigsGivenDotInFieldName() throws Exception { String sample = "time.iso8601,message\n" + "2018-05-17T13:41:23,hello\n" + diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java index 91568573c9d71..a0f54c6b6f24f 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java @@ -194,7 +194,7 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDist EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("time", match.v1()); - assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss")); + assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss")); assertEquals("CISCOTIMESTAMP", match.v2().getGrokPatternName()); } @@ -227,7 +227,7 @@ public void testGuessTimestampGivenSamplesWithManyFieldsInconsistentAndConsisten EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("time2", match.v1()); - assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss")); + assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy 
HH:mm:ss", "MMM d yyyy HH:mm:ss")); assertEquals("CISCOTIMESTAMP", match.v2().getGrokPatternName()); } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java index d669a888f61d2..7c5d6833efbb2 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java @@ -661,9 +661,9 @@ public void testFindFormatGivenOnlyKnownTimestampFormat() { "\\b[A-Z]\\S{2} [A-Z]\\S{2} \\d{2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", "EEE MMM dd HH:mm:ss yyyy", 1526400896000L); validateTimestampMatch("May 15 17:14:56.725", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", - Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"), 1526400896725L); + Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"), 1526400896725L); validateTimestampMatch("May 15 17:14:56", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", - Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), 1526400896000L); + Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss", "MMM d HH:mm:ss"), 1526400896000L); validateTimestampMatch("15/May/2018:17:14:56 +0100", "HTTPDATE", "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "dd/MMM/yyyy:HH:mm:ss XX", 1526400896000L); @@ -672,7 +672,7 @@ public void testFindFormatGivenOnlyKnownTimestampFormat() { "\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "MMM dd, yyyy h:mm:ss a", 1526400896000L); validateTimestampMatch("May 15 2018 17:14:56", "CISCOTIMESTAMP", "\\b[A-Z]\\S{2} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", - Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), 1526400896000L); + Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy 
HH:mm:ss", "MMM d yyyy HH:mm:ss"), 1526400896000L); validateTimestampMatch("05/15/2018 17:14:56,374", "DATESTAMP", "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "MM/dd/yyyy HH:mm:ss,SSS", 1526400896374L); @@ -799,7 +799,8 @@ public void testFindFormatGivenRealLogMessages() { validateFindInFullMessage("Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' " + "opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed", "", "SYSLOGTIMESTAMP", - "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss")); + "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", + Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss", "MMM d HH:mm:ss")); validateFindInFullMessage("559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t" + "192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp", "559550912540598297\t", "TIMESTAMP_ISO8601", @@ -807,7 +808,7 @@ public void testFindFormatGivenRealLogMessages() { validateFindInFullMessage("Sep 8 11:55:35 dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving " + "'www.elastic.co/A/IN': 95.110.68.206#53", "", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", - Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss")); + Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss", "MMM d HH:mm:ss")); validateFindInFullMessage("10-28-2016 16:22:47.636 +0200 ERROR Network - " + "Error encountered for connection from src=192.168.0.1:12345. Local side shutting down", "", "DATESTAMP",