From dfe5af04115063f564255a3509e308d50caaebdd Mon Sep 17 00:00:00 2001 From: David Roberts Date: Tue, 25 Sep 2018 12:52:51 +0100 Subject: [PATCH] [ML] Return both Joda and Java formats from structure finder (#33900) Previously the timestamp_formats field in the response from the find_file_structure endpoint contained Joda timestamp formats. This change makes that clear by renaming the field to joda_timestamp_formats, and also adds a java_timestamp_formats field containing the equivalent Java time format strings. --- .../ml/apis/find-file-structure.asciidoc | 792 +++++++++++++++++- .../ml/filestructurefinder/FileStructure.java | 84 +- .../FileStructureTests.java | 3 +- .../DelimitedFileStructureFinder.java | 3 +- .../FileStructureUtils.java | 2 +- .../JsonFileStructureFinder.java | 3 +- .../TextLogFileStructureFinder.java | 7 +- .../TimestampFormatFinder.java | 220 +++-- .../XmlFileStructureFinder.java | 3 +- .../DelimitedFileStructureFinderTests.java | 18 +- .../FileStructureUtilsTests.java | 18 +- .../JsonFileStructureFinderTests.java | 2 +- .../TextLogFileStructureFinderTests.java | 24 +- .../TimestampFormatFinderTests.java | 298 +++++-- .../XmlFileStructureFinderTests.java | 2 +- .../api/xpack.ml.find_file_structure.json | 2 +- .../test/ml/find_file_structure.yml | 6 +- 17 files changed, 1254 insertions(+), 233 deletions(-) diff --git a/docs/reference/ml/apis/find-file-structure.asciidoc b/docs/reference/ml/apis/find-file-structure.asciidoc index 61987221a7602..0da485a3f0a30 100644 --- a/docs/reference/ml/apis/find-file-structure.asciidoc +++ b/docs/reference/ml/apis/find-file-structure.asciidoc @@ -159,7 +159,7 @@ format corresponds to the primary timestamp, but you do not want to specify the full `grok_pattern`. If this parameter is not specified, the structure finder chooses the best format from -the formats it knows, which are: +the formats it knows, which are these Joda formats and their Java time equivalents: * `dd/MMM/YYYY:HH:mm:ss Z` * `EEE MMM dd HH:mm zzz YYYY` @@ -178,7 +178,7 @@ the formats it knows, which are: * `MMM dd HH:mm:ss` * `MMM dd HH:mm:ss,SSS` * `MMM dd YYYY HH:mm:ss` -* `MMM dd, YYYY K:mm:ss a` +* `MMM dd, YYYY h:mm:ss a` * `TAI64N` * `UNIX` * `UNIX_MS` @@ -487,3 +487,791 @@ If the request does not encounter errors, you receive the following result: may provide clues that the data needs to be cleaned or transformed prior to use by other {ml} functionality. +The next example shows how it's possible to find the structure of some New York +City yellow cab trip data. The first `curl` command downloads the data, the +first 20000 lines of which are then piped into the `find_file_structure` +endpoint. The `lines_to_sample` query parameter of the endpoint is set to 20000 +to match what is specified in the `head` command. + +[source,js] +---- +curl -s "s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2018-06.csv" | head -20000 | curl -s -H "Content-Type: application/json" -XPOST "localhost:9200/_xpack/ml/find_file_structure?pretty&lines_to_sample=20000" -T - +---- +// NOTCONSOLE +// Not converting to console because this shows how curl can be used + +-- +NOTE: The `Content-Type: application/json` header must be set even though in +this case the data is not JSON. (Alternatively the `Content-Type` can be set +to any other supported by Elasticsearch, but it must be set.) +-- + +If the request does not encounter errors, you receive the following result: +[source,js] +---- +{ + "num_lines_analyzed" : 20000, + "num_messages_analyzed" : 19998, <1> + "sample_start" : "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount\n\n1,2018-06-01 00:15:40,2018-06-01 00:16:46,1,.00,1,N,145,145,2,3,0.5,0.5,0,0,0.3,4.3\n", + "charset" : "UTF-8", + "has_byte_order_marker" : false, + "format" : "delimited", <2> + "multiline_start_pattern" : "^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", + "exclude_lines_pattern" : "^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?,\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?,\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?", + "column_names" : [ <3> + "VendorID", + "tpep_pickup_datetime", + "tpep_dropoff_datetime", + "passenger_count", + "trip_distance", + "RatecodeID", + "store_and_fwd_flag", + "PULocationID", + "DOLocationID", + "payment_type", + "fare_amount", + "extra", + "mta_tax", + "tip_amount", + "tolls_amount", + "improvement_surcharge", + "total_amount" + ], + "has_header_row" : true, <4> + "delimiter" : ",", <5> + "quote" : "\"", <6> + "timestamp_field" : "tpep_pickup_datetime", <7> + "joda_timestamp_formats" : [ <8> + "YYYY-MM-dd HH:mm:ss" + ], + "java_timestamp_formats" : [ <9> + "yyyy-MM-dd HH:mm:ss" + ], + "need_client_timezone" : true, <10> + "mappings" : { + "@timestamp" : { + "type" : "date" + }, + "DOLocationID" : { + "type" : "long" + }, + "PULocationID" : { + "type" : "long" + }, + "RatecodeID" : { + "type" : "long" + }, + "VendorID" : { + "type" : "long" + }, + "extra" : { + "type" : "double" + }, + "fare_amount" : { + "type" : "double" + }, + "improvement_surcharge" : { + "type" : "double" + }, + "mta_tax" : { + "type" : "double" + }, + "passenger_count" : { + "type" : "long" + }, + "payment_type" : { + "type" : "long" + }, + "store_and_fwd_flag" : { + "type" : "keyword" + }, + "tip_amount" : { + "type" : "double" + }, + "tolls_amount" : { + "type" : "double" + }, + "total_amount" : { + "type" : "double" + }, + "tpep_dropoff_datetime" : { + "type" : "date", + "format" : "YYYY-MM-dd HH:mm:ss" + }, + "tpep_pickup_datetime" : { + "type" : "date", + "format" : "YYYY-MM-dd HH:mm:ss" + }, + "trip_distance" : { + "type" : "double" + } + }, + "field_stats" : { + "DOLocationID" : { + "count" : 19998, + "cardinality" : 240, + "min_value" : 1, + "max_value" : 265, + "mean_value" : 150.26532653265312, + "median_value" : 148, + "top_hits" : [ + { + "value" : 79, + "count" : 760 + }, + { + "value" : 48, + "count" : 683 + }, + { + "value" : 68, + "count" : 529 + }, + { + "value" : 170, + "count" : 506 + }, + { + "value" : 107, + "count" : 468 + }, + { + "value" : 249, + "count" : 457 + }, + { + "value" : 230, + "count" : 441 + }, + { + "value" : 186, + "count" : 432 + }, + { + "value" : 141, + "count" : 409 + }, + { + "value" : 263, + "count" : 386 + } + ] + }, + "PULocationID" : { + "count" : 19998, + "cardinality" : 154, + "min_value" : 1, + "max_value" : 265, + "mean_value" : 153.4042404240424, + "median_value" : 148, + "top_hits" : [ + { + "value" : 79, + "count" : 1067 + }, + { + "value" : 230, + "count" : 949 + }, + { + "value" : 148, + "count" : 940 + }, + { + "value" : 132, + "count" : 897 + }, + { + "value" : 48, + "count" : 853 + }, + { + "value" : 161, + "count" : 820 + }, + { + "value" : 234, + "count" : 750 + }, + { + "value" : 249, + "count" : 722 + }, + { + "value" : 164, + "count" : 663 + }, + { + "value" : 114, + "count" : 646 + } + ] + }, + "RatecodeID" : { + "count" : 19998, + "cardinality" : 5, + "min_value" : 1, + "max_value" : 5, + "mean_value" : 1.0656565656565653, + "median_value" : 1, + "top_hits" : [ + { + "value" : 1, + "count" : 19311 + }, + { + "value" : 2, + "count" : 468 + }, + { + "value" : 5, + "count" : 195 + }, + { + "value" : 4, + "count" : 17 + }, + { + "value" : 3, + "count" : 7 + } + ] + }, + "VendorID" : { + "count" : 19998, + "cardinality" : 2, + "min_value" : 1, + "max_value" : 2, + "mean_value" : 1.59005900590059, + "median_value" : 2, + "top_hits" : [ + { + "value" : 2, + "count" : 11800 + }, + { + "value" : 1, + "count" : 8198 + } + ] + }, + "extra" : { + "count" : 19998, + "cardinality" : 3, + "min_value" : -0.5, + "max_value" : 0.5, + "mean_value" : 0.4815981598159816, + "median_value" : 0.5, + "top_hits" : [ + { + "value" : 0.5, + "count" : 19281 + }, + { + "value" : 0, + "count" : 698 + }, + { + "value" : -0.5, + "count" : 19 + } + ] + }, + "fare_amount" : { + "count" : 19998, + "cardinality" : 208, + "min_value" : -100, + "max_value" : 300, + "mean_value" : 13.937719771977209, + "median_value" : 9.5, + "top_hits" : [ + { + "value" : 6, + "count" : 1004 + }, + { + "value" : 6.5, + "count" : 935 + }, + { + "value" : 5.5, + "count" : 909 + }, + { + "value" : 7, + "count" : 903 + }, + { + "value" : 5, + "count" : 889 + }, + { + "value" : 7.5, + "count" : 854 + }, + { + "value" : 4.5, + "count" : 802 + }, + { + "value" : 8.5, + "count" : 790 + }, + { + "value" : 8, + "count" : 789 + }, + { + "value" : 9, + "count" : 711 + } + ] + }, + "improvement_surcharge" : { + "count" : 19998, + "cardinality" : 3, + "min_value" : -0.3, + "max_value" : 0.3, + "mean_value" : 0.29915991599159913, + "median_value" : 0.3, + "top_hits" : [ + { + "value" : 0.3, + "count" : 19964 + }, + { + "value" : -0.3, + "count" : 22 + }, + { + "value" : 0, + "count" : 12 + } + ] + }, + "mta_tax" : { + "count" : 19998, + "cardinality" : 3, + "min_value" : -0.5, + "max_value" : 0.5, + "mean_value" : 0.4962246224622462, + "median_value" : 0.5, + "top_hits" : [ + { + "value" : 0.5, + "count" : 19868 + }, + { + "value" : 0, + "count" : 109 + }, + { + "value" : -0.5, + "count" : 21 + } + ] + }, + "passenger_count" : { + "count" : 19998, + "cardinality" : 7, + "min_value" : 0, + "max_value" : 6, + "mean_value" : 1.6201620162016201, + "median_value" : 1, + "top_hits" : [ + { + "value" : 1, + "count" : 14219 + }, + { + "value" : 2, + "count" : 2886 + }, + { + "value" : 5, + "count" : 1047 + }, + { + "value" : 3, + "count" : 804 + }, + { + "value" : 6, + "count" : 523 + }, + { + "value" : 4, + "count" : 406 + }, + { + "value" : 0, + "count" : 113 + } + ] + }, + "payment_type" : { + "count" : 19998, + "cardinality" : 4, + "min_value" : 1, + "max_value" : 4, + "mean_value" : 1.315631563156316, + "median_value" : 1, + "top_hits" : [ + { + "value" : 1, + "count" : 13936 + }, + { + "value" : 2, + "count" : 5857 + }, + { + "value" : 3, + "count" : 160 + }, + { + "value" : 4, + "count" : 45 + } + ] + }, + "store_and_fwd_flag" : { + "count" : 19998, + "cardinality" : 2, + "top_hits" : [ + { + "value" : "N", + "count" : 19910 + }, + { + "value" : "Y", + "count" : 88 + } + ] + }, + "tip_amount" : { + "count" : 19998, + "cardinality" : 717, + "min_value" : 0, + "max_value" : 128, + "mean_value" : 2.010959095909593, + "median_value" : 1.45, + "top_hits" : [ + { + "value" : 0, + "count" : 6917 + }, + { + "value" : 1, + "count" : 1178 + }, + { + "value" : 2, + "count" : 624 + }, + { + "value" : 3, + "count" : 248 + }, + { + "value" : 1.56, + "count" : 206 + }, + { + "value" : 1.46, + "count" : 205 + }, + { + "value" : 1.76, + "count" : 196 + }, + { + "value" : 1.45, + "count" : 195 + }, + { + "value" : 1.36, + "count" : 191 + }, + { + "value" : 1.5, + "count" : 187 + } + ] + }, + "tolls_amount" : { + "count" : 19998, + "cardinality" : 26, + "min_value" : 0, + "max_value" : 35, + "mean_value" : 0.2729697969796978, + "median_value" : 0, + "top_hits" : [ + { + "value" : 0, + "count" : 19107 + }, + { + "value" : 5.76, + "count" : 791 + }, + { + "value" : 10.5, + "count" : 36 + }, + { + "value" : 2.64, + "count" : 21 + }, + { + "value" : 11.52, + "count" : 8 + }, + { + "value" : 5.54, + "count" : 4 + }, + { + "value" : 8.5, + "count" : 4 + }, + { + "value" : 17.28, + "count" : 4 + }, + { + "value" : 2, + "count" : 2 + }, + { + "value" : 2.16, + "count" : 2 + } + ] + }, + "total_amount" : { + "count" : 19998, + "cardinality" : 1267, + "min_value" : -100.3, + "max_value" : 389.12, + "mean_value" : 17.499898989898995, + "median_value" : 12.35, + "top_hits" : [ + { + "value" : 7.3, + "count" : 478 + }, + { + "value" : 8.3, + "count" : 443 + }, + { + "value" : 8.8, + "count" : 420 + }, + { + "value" : 6.8, + "count" : 406 + }, + { + "value" : 7.8, + "count" : 405 + }, + { + "value" : 6.3, + "count" : 371 + }, + { + "value" : 9.8, + "count" : 368 + }, + { + "value" : 5.8, + "count" : 362 + }, + { + "value" : 9.3, + "count" : 332 + }, + { + "value" : 10.3, + "count" : 332 + } + ] + }, + "tpep_dropoff_datetime" : { + "count" : 19998, + "cardinality" : 9066, + "top_hits" : [ + { + "value" : "2018-06-01 01:12:12", + "count" : 10 + }, + { + "value" : "2018-06-01 00:32:15", + "count" : 9 + }, + { + "value" : "2018-06-01 00:44:27", + "count" : 9 + }, + { + "value" : "2018-06-01 00:46:42", + "count" : 9 + }, + { + "value" : "2018-06-01 01:03:22", + "count" : 9 + }, + { + "value" : "2018-06-01 01:05:13", + "count" : 9 + }, + { + "value" : "2018-06-01 00:11:20", + "count" : 8 + }, + { + "value" : "2018-06-01 00:16:03", + "count" : 8 + }, + { + "value" : "2018-06-01 00:19:47", + "count" : 8 + }, + { + "value" : "2018-06-01 00:25:17", + "count" : 8 + } + ] + }, + "tpep_pickup_datetime" : { + "count" : 19998, + "cardinality" : 8760, + "top_hits" : [ + { + "value" : "2018-06-01 00:01:23", + "count" : 12 + }, + { + "value" : "2018-06-01 00:04:31", + "count" : 10 + }, + { + "value" : "2018-06-01 00:05:38", + "count" : 10 + }, + { + "value" : "2018-06-01 00:09:50", + "count" : 10 + }, + { + "value" : "2018-06-01 00:12:01", + "count" : 10 + }, + { + "value" : "2018-06-01 00:14:17", + "count" : 10 + }, + { + "value" : "2018-06-01 00:00:34", + "count" : 9 + }, + { + "value" : "2018-06-01 00:00:40", + "count" : 9 + }, + { + "value" : "2018-06-01 00:02:53", + "count" : 9 + }, + { + "value" : "2018-06-01 00:05:40", + "count" : 9 + } + ] + }, + "trip_distance" : { + "count" : 19998, + "cardinality" : 1687, + "min_value" : 0, + "max_value" : 64.63, + "mean_value" : 3.6521062106210715, + "median_value" : 2.16, + "top_hits" : [ + { + "value" : 0.9, + "count" : 335 + }, + { + "value" : 0.8, + "count" : 320 + }, + { + "value" : 1.1, + "count" : 316 + }, + { + "value" : 0.7, + "count" : 304 + }, + { + "value" : 1.2, + "count" : 303 + }, + { + "value" : 1, + "count" : 296 + }, + { + "value" : 1.3, + "count" : 280 + }, + { + "value" : 1.5, + "count" : 268 + }, + { + "value" : 1.6, + "count" : 268 + }, + { + "value" : 0.6, + "count" : 256 + } + ] + } + } +} +---- +// NOTCONSOLE + +<1> `num_messages_analyzed` is 2 lower than `num_lines_analyzed` because only + data records count as messages. The first line contains the column names + and in this sample the second line is blank. +<2> Unlike the first example, in this case the `format` has been identified as + `delimited`. +<3> Because the `format` is `delimited`, the `column_names` field in the output + lists the column names in the order they appear in the sample. +<4> `has_header_row` indicates that for this sample the column names were in + the first row of the sample. (If they hadn't been then it would have been + a good idea to specify them in the `column_names` query parameter.) +<5> The `delimiter` for this sample is a comma, as it's a CSV file. +<6> The `quote` character is the default double quote. (The structure finder + does not attempt to deduce any other quote character, so if you have a + delimited file that's quoted with some other character you must specify it + using the `quote` query parameter.) +<7> The `timestamp_field` has been chosen to be `tpep_pickup_datetime`. + `tpep_dropoff_datetime` would work just as well, but `tpep_pickup_datetime` + was chosen because it comes first in the column order. If you prefer + `tpep_dropoff_datetime` then force it to be chosen using the + `timestamp_field` query parameter. +<8> `joda_timestamp_formats` are used to tell Logstash and Ingest pipeline how + to parse timestamps. +<9> `java_timestamp_formats` are the Java time formats recognized in the time + fields. In future Ingest pipeline will switch to use this format. +<10> The timestamp format in this sample doesn't specify a timezone, so to + accurately convert them to UTC timestamps to store in Elasticsearch it's + necessary to supply the timezone they relate to. `need_client_timezone` + will be `false` for timestamp formats that include the timezone. + diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java index db5f29f3b1b63..1ac9f081ebe6d 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java @@ -99,7 +99,8 @@ public String toString() { public static final ParseField SHOULD_TRIM_FIELDS = new ParseField("should_trim_fields"); public static final ParseField GROK_PATTERN = new ParseField("grok_pattern"); public static final ParseField TIMESTAMP_FIELD = new ParseField("timestamp_field"); - public static final ParseField TIMESTAMP_FORMATS = new ParseField("timestamp_formats"); + public static final ParseField JODA_TIMESTAMP_FORMATS = new ParseField("joda_timestamp_formats"); + public static final ParseField JAVA_TIMESTAMP_FORMATS = new ParseField("java_timestamp_formats"); public static final ParseField NEED_CLIENT_TIMEZONE = new ParseField("need_client_timezone"); public static final ParseField MAPPINGS = new ParseField("mappings"); public static final ParseField FIELD_STATS = new ParseField("field_stats"); @@ -123,7 +124,8 @@ public String toString() { PARSER.declareBoolean(Builder::setShouldTrimFields, SHOULD_TRIM_FIELDS); PARSER.declareString(Builder::setGrokPattern, GROK_PATTERN); PARSER.declareString(Builder::setTimestampField, TIMESTAMP_FIELD); - PARSER.declareStringArray(Builder::setTimestampFormats, TIMESTAMP_FORMATS); + PARSER.declareStringArray(Builder::setJodaTimestampFormats, JODA_TIMESTAMP_FORMATS); + PARSER.declareStringArray(Builder::setJavaTimestampFormats, JAVA_TIMESTAMP_FORMATS); PARSER.declareBoolean(Builder::setNeedClientTimezone, NEED_CLIENT_TIMEZONE); PARSER.declareObject(Builder::setMappings, (p, c) -> new TreeMap<>(p.map()), MAPPINGS); PARSER.declareObject(Builder::setFieldStats, (p, c) -> { @@ -150,7 +152,8 @@ public String toString() { private final Character quote; private final Boolean shouldTrimFields; private final String grokPattern; - private final List timestampFormats; + private final List jodaTimestampFormats; + private final List javaTimestampFormats; private final String timestampField; private final boolean needClientTimezone; private final SortedMap mappings; @@ -160,8 +163,9 @@ public String toString() { public FileStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampleStart, String charset, Boolean hasByteOrderMarker, Format format, String multilineStartPattern, String excludeLinesPattern, List columnNames, Boolean hasHeaderRow, Character delimiter, Character quote, Boolean shouldTrimFields, String grokPattern, - String timestampField, List timestampFormats, boolean needClientTimezone, Map mappings, - Map fieldStats, List explanation) { + String timestampField, List jodaTimestampFormats, List javaTimestampFormats, + boolean needClientTimezone, Map mappings, Map fieldStats, + List explanation) { this.numLinesAnalyzed = numLinesAnalyzed; this.numMessagesAnalyzed = numMessagesAnalyzed; @@ -178,7 +182,10 @@ public FileStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampl this.shouldTrimFields = shouldTrimFields; this.grokPattern = grokPattern; this.timestampField = timestampField; - this.timestampFormats = (timestampFormats == null) ? null : Collections.unmodifiableList(new ArrayList<>(timestampFormats)); + this.jodaTimestampFormats = + (jodaTimestampFormats == null) ? null : Collections.unmodifiableList(new ArrayList<>(jodaTimestampFormats)); + this.javaTimestampFormats = + (javaTimestampFormats == null) ? null : Collections.unmodifiableList(new ArrayList<>(javaTimestampFormats)); this.needClientTimezone = needClientTimezone; this.mappings = Collections.unmodifiableSortedMap(new TreeMap<>(mappings)); this.fieldStats = Collections.unmodifiableSortedMap(new TreeMap<>(fieldStats)); @@ -200,7 +207,8 @@ public FileStructure(StreamInput in) throws IOException { quote = in.readBoolean() ? (char) in.readVInt() : null; shouldTrimFields = in.readOptionalBoolean(); grokPattern = in.readOptionalString(); - timestampFormats = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null; + jodaTimestampFormats = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null; + javaTimestampFormats = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null; timestampField = in.readOptionalString(); needClientTimezone = in.readBoolean(); mappings = Collections.unmodifiableSortedMap(new TreeMap<>(in.readMap())); @@ -239,11 +247,17 @@ public void writeTo(StreamOutput out) throws IOException { } out.writeOptionalBoolean(shouldTrimFields); out.writeOptionalString(grokPattern); - if (timestampFormats == null) { + if (jodaTimestampFormats == null) { out.writeBoolean(false); } else { out.writeBoolean(true); - out.writeCollection(timestampFormats, StreamOutput::writeString); + out.writeCollection(jodaTimestampFormats, StreamOutput::writeString); + } + if (javaTimestampFormats == null) { + out.writeBoolean(false); + } else { + out.writeBoolean(true); + out.writeCollection(javaTimestampFormats, StreamOutput::writeString); } out.writeOptionalString(timestampField); out.writeBoolean(needClientTimezone); @@ -312,8 +326,12 @@ public String getTimestampField() { return timestampField; } - public List getTimestampFormats() { - return timestampFormats; + public List getJodaTimestampFormats() { + return jodaTimestampFormats; + } + + public List getJavaTimestampFormats() { + return javaTimestampFormats; } public boolean needClientTimezone() { @@ -371,8 +389,11 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws if (timestampField != null && timestampField.isEmpty() == false) { builder.field(TIMESTAMP_FIELD.getPreferredName(), timestampField); } - if (timestampFormats != null && timestampFormats.isEmpty() == false) { - builder.field(TIMESTAMP_FORMATS.getPreferredName(), timestampFormats); + if (jodaTimestampFormats != null && jodaTimestampFormats.isEmpty() == false) { + builder.field(JODA_TIMESTAMP_FORMATS.getPreferredName(), jodaTimestampFormats); + } + if (javaTimestampFormats != null && javaTimestampFormats.isEmpty() == false) { + builder.field(JAVA_TIMESTAMP_FORMATS.getPreferredName(), javaTimestampFormats); } builder.field(NEED_CLIENT_TIMEZONE.getPreferredName(), needClientTimezone); builder.field(MAPPINGS.getPreferredName(), mappings); @@ -396,7 +417,7 @@ public int hashCode() { return Objects.hash(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format, multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern, - timestampField, timestampFormats, needClientTimezone, mappings, fieldStats, explanation); + timestampField, jodaTimestampFormats, javaTimestampFormats, needClientTimezone, mappings, fieldStats, explanation); } @Override @@ -413,7 +434,6 @@ public boolean equals(Object other) { FileStructure that = (FileStructure) other; return this.numLinesAnalyzed == that.numLinesAnalyzed && this.numMessagesAnalyzed == that.numMessagesAnalyzed && - this.needClientTimezone == that.needClientTimezone && Objects.equals(this.sampleStart, that.sampleStart) && Objects.equals(this.charset, that.charset) && Objects.equals(this.hasByteOrderMarker, that.hasByteOrderMarker) && @@ -427,7 +447,9 @@ public boolean equals(Object other) { Objects.equals(this.shouldTrimFields, that.shouldTrimFields) && Objects.equals(this.grokPattern, that.grokPattern) && Objects.equals(this.timestampField, that.timestampField) && - Objects.equals(this.timestampFormats, that.timestampFormats) && + Objects.equals(this.jodaTimestampFormats, that.jodaTimestampFormats) && + Objects.equals(this.javaTimestampFormats, that.javaTimestampFormats) && + this.needClientTimezone == that.needClientTimezone && Objects.equals(this.mappings, that.mappings) && Objects.equals(this.fieldStats, that.fieldStats) && Objects.equals(this.explanation, that.explanation); @@ -450,7 +472,8 @@ public static class Builder { private Boolean shouldTrimFields; private String grokPattern; private String timestampField; - private List timestampFormats; + private List jodaTimestampFormats; + private List javaTimestampFormats; private boolean needClientTimezone; private Map mappings; private Map fieldStats = Collections.emptyMap(); @@ -539,8 +562,13 @@ public Builder setTimestampField(String timestampField) { return this; } - public Builder setTimestampFormats(List timestampFormats) { - this.timestampFormats = timestampFormats; + public Builder setJodaTimestampFormats(List jodaTimestampFormats) { + this.jodaTimestampFormats = jodaTimestampFormats; + return this; + } + + public Builder setJavaTimestampFormats(List javaTimestampFormats) { + this.javaTimestampFormats = javaTimestampFormats; return this; } @@ -652,11 +680,21 @@ public FileStructure build() { throw new IllegalStateException("enum value [" + format + "] missing from switch."); } - if ((timestampField == null) != (timestampFormats == null || timestampFormats.isEmpty())) { - throw new IllegalArgumentException("Timestamp field and timestamp formats must both be specified or neither be specified."); + boolean isTimestampFieldSpecified = (timestampField != null); + boolean isJodaTimestampFormatsSpecified = (jodaTimestampFormats != null && jodaTimestampFormats.isEmpty() == false); + boolean isJavaTimestampFormatsSpecified = (javaTimestampFormats != null && javaTimestampFormats.isEmpty() == false); + + if (isTimestampFieldSpecified != isJodaTimestampFormatsSpecified) { + throw new IllegalArgumentException( + "Timestamp field and Joda timestamp formats must both be specified or neither be specified."); + } + + if (isTimestampFieldSpecified != isJavaTimestampFormatsSpecified) { + throw new IllegalArgumentException( + "Timestamp field and Java timestamp formats must both be specified or neither be specified."); } - if (needClientTimezone && timestampField == null) { + if (needClientTimezone && isTimestampFieldSpecified == false) { throw new IllegalArgumentException("Client timezone cannot be needed if there is no timestamp field."); } @@ -670,7 +708,7 @@ public FileStructure build() { return new FileStructure(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format, multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern, - timestampField, timestampFormats, needClientTimezone, mappings, fieldStats, explanation); + timestampField, jodaTimestampFormats, javaTimestampFormats, needClientTimezone, mappings, fieldStats, explanation); } } } diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java index ac6c647136bd7..d008b31f9a6f7 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java @@ -63,7 +63,8 @@ public static FileStructure createTestFileStructure() { if (format == FileStructure.Format.SEMI_STRUCTURED_TEXT || randomBoolean()) { builder.setTimestampField(randomAlphaOfLength(10)); - builder.setTimestampFormats(Arrays.asList(generateRandomStringArray(3, 20, false, false))); + builder.setJodaTimestampFormats(Arrays.asList(generateRandomStringArray(3, 20, false, false))); + builder.setJavaTimestampFormats(Arrays.asList(generateRandomStringArray(3, 20, false, false))); builder.setNeedClientTimezone(randomBoolean()); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java index a103560480d06..870cb28570bb5 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java @@ -141,7 +141,8 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List guessScalarMapping(List explanation, String f Iterator iter = fieldValues.iterator(); TimestampMatch timestampMatch = TimestampFormatFinder.findFirstFullMatch(iter.next()); while (timestampMatch != null && iter.hasNext()) { - // To be mapped as type date all the values must match the same date format - it is + // To be mapped as type date all the values must match the same timestamp format - it is // not acceptable for all values to be dates, but with different formats if (timestampMatch.equals(TimestampFormatFinder.findFirstFullMatch(iter.next(), timestampMatch.candidateIndex)) == false) { timestampMatch = null; diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinder.java index b20658f872b65..48c446c90f451 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinder.java @@ -55,7 +55,8 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List explanat Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides); if (timeField != null) { structureBuilder.setTimestampField(timeField.v1()) - .setTimestampFormats(timeField.v2().dateFormats) + .setJodaTimestampFormats(timeField.v2().jodaTimestampFormats) + .setJavaTimestampFormats(timeField.v2().javaTimestampFormats) .setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing()); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java index e6e445a3ff6b1..b9386fab72f9b 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java @@ -112,7 +112,8 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex FileStructure structure = structureBuilder .setTimestampField(interimTimestampField) - .setTimestampFormats(bestTimestamp.v1().dateFormats) + .setJodaTimestampFormats(bestTimestamp.v1().jodaTimestampFormats) + .setJavaTimestampFormats(bestTimestamp.v1().javaTimestampFormats) .setNeedClientTimezone(bestTimestamp.v1().hasTimezoneDependentParsing()) .setGrokPattern(grokPattern) .setMappings(mappings) @@ -147,8 +148,8 @@ static Tuple> mostLikelyTimestamp(String[] sampleLin for (String sampleLine : sampleLines) { TimestampMatch match = TimestampFormatFinder.findFirstMatch(sampleLine, overrides.getTimestampFormat()); if (match != null) { - TimestampMatch pureMatch = new TimestampMatch(match.candidateIndex, "", match.dateFormats, match.simplePattern, - match.grokPatternName, ""); + TimestampMatch pureMatch = new TimestampMatch(match.candidateIndex, "", match.jodaTimestampFormats, + match.javaTimestampFormats, match.simplePattern, match.grokPatternName, ""); timestampMatches.compute(pureMatch, (k, v) -> { if (v == null) { return new Tuple<>(weightForMatch(match.preface), new HashSet<>(Collections.singletonList(match.preface))); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java index 363b1352a54cb..7ed95f656b4ee 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java @@ -12,7 +12,6 @@ import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.regex.Matcher; @@ -30,8 +29,12 @@ public final class TimestampFormatFinder { private static final String PREFACE = "preface"; private static final String EPILOGUE = "epilogue"; - private static final Pattern FRACTIONAL_SECOND_INTERPRETER = Pattern.compile("([:.,])(\\d{3,9})"); + private static final String FRACTIONAL_SECOND_SEPARATORS = ":.,"; + private static final Pattern FRACTIONAL_SECOND_INTERPRETER = Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "])(\\d{3,9})"); private static final char DEFAULT_FRACTIONAL_SECOND_SEPARATOR = ','; + private static final Pattern FRACTIONAL_SECOND_TIMESTAMP_FORMAT_PATTERN = + Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "]S{3,9})"); + private static final String DEFAULT_FRACTIONAL_SECOND_FORMAT = DEFAULT_FRACTIONAL_SECOND_SEPARATOR + "SSS"; /** * The timestamp patterns are complex and it can be slow to prove they do not @@ -48,7 +51,9 @@ public final class TimestampFormatFinder { // The end of some number (likely year or day) followed by a space then HH:mm Pattern.compile("\\d \\d{2}:\\d{2}\\b"), // HH:mm:ss surrounded by spaces - Pattern.compile(" \\d{2}:\\d{2}:\\d{2} ") + Pattern.compile(" \\d{2}:\\d{2}:\\d{2} "), + // Literal 'T' surrounded by numbers + Pattern.compile("\\dT\\d") ); /** @@ -59,81 +64,107 @@ public final class TimestampFormatFinder { // The TOMCAT_DATESTAMP format has to come before ISO8601 because it's basically ISO8601 but // with a space before the timezone, and because the timezone is optional in ISO8601 it will // be recognised as that with the timezone missed off if ISO8601 is checked first - new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSS Z", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", + new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSS Z", "yyyy-MM-dd HH:mm:ss,SSS XX", + "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "\\b20\\d{2}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9} (?:Z|[+-]%{HOUR}%{MINUTE})\\b", "TOMCAT_DATESTAMP", Arrays.asList(0, 1)), // The Elasticsearch ISO8601 parser requires a literal T between the date and time, so // longhand formats are needed if there's a space instead - new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSSZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", + new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSSZ", "yyyy-MM-dd HH:mm:ss,SSSXX", + "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}(?:Z|[+-]%{HOUR}%{MINUTE})\\b", "TIMESTAMP_ISO8601", Arrays.asList(0, 1)), - new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSSZZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", + new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSSZZ", "yyyy-MM-dd HH:mm:ss,SSSXXX", + "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}[+-]%{HOUR}:%{MINUTE}\\b", "TIMESTAMP_ISO8601", Arrays.asList(0, 1)), - new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSS", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", + new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSS", "yyyy-MM-dd HH:mm:ss,SSS", + "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}\\b", "TIMESTAMP_ISO8601", Arrays.asList(0, 1)), - new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ssZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", + new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ssZ", "yyyy-MM-dd HH:mm:ssXX", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)(?:Z|[+-]%{HOUR}%{MINUTE})\\b", "TIMESTAMP_ISO8601", Arrays.asList(0, 1)), - new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ssZZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", + new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ssZZ", "yyyy-MM-dd HH:mm:ssXXX", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[+-]%{HOUR}:%{MINUTE}\\b", "TIMESTAMP_ISO8601", Arrays.asList(0, 1)), - new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", + new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss", "yyyy-MM-dd HH:mm:ss", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)\\b", "TIMESTAMP_ISO8601", Arrays.asList(0, 1)), - new CandidateTimestampFormat("ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "\\b%{TIMESTAMP_ISO8601}\\b", - "TIMESTAMP_ISO8601"), - new CandidateTimestampFormat("EEE MMM dd YYYY HH:mm:ss zzz", + // When using Java time the Elasticsearch ISO8601 parser for fractional time requires that the fractional + // separator match the current JVM locale, which is too restrictive for arbitrary log file parsing + new CandidateTimestampFormat("ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSSXX", + "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", + "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY}T%{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}(?:Z|[+-]%{HOUR}%{MINUTE})\\b", + "TIMESTAMP_ISO8601", Collections.singletonList(3)), + new CandidateTimestampFormat("ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSSXXX", + "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", + "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY}T%{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}[+-]%{HOUR}:%{MINUTE}\\b", + "TIMESTAMP_ISO8601", Collections.singletonList(3)), + new CandidateTimestampFormat("ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSS", + "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", + "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY}T%{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}\\b", "TIMESTAMP_ISO8601", + Collections.singletonList(3)), + new CandidateTimestampFormat("ISO8601", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "\\b%{TIMESTAMP_ISO8601}\\b", + "TIMESTAMP_ISO8601", Collections.singletonList(3)), + new CandidateTimestampFormat("EEE MMM dd YYYY HH:mm:ss zzz", "EEE MMM dd yyyy HH:mm:ss zzz", "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "\\b%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{TZ}\\b", "DATESTAMP_RFC822", Arrays.asList(1, 2)), - new CandidateTimestampFormat("EEE MMM dd YYYY HH:mm zzz", "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2} ", + new CandidateTimestampFormat("EEE MMM dd YYYY HH:mm zzz", "EEE MMM dd yyyy HH:mm zzz", + "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2} ", "\\b%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE} %{TZ}\\b", "DATESTAMP_RFC822", Collections.singletonList(1)), - new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm:ss ZZ", + new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm:ss ZZ", "EEE, dd MMM yyyy HH:mm:ss XXX", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:Z|[+-]%{HOUR}:%{MINUTE})\\b", "DATESTAMP_RFC2822", Arrays.asList(1, 2)), - new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm:ss Z", + new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm:ss Z", "EEE, dd MMM yyyy HH:mm:ss XX", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:Z|[+-]%{HOUR}%{MINUTE})\\b", "DATESTAMP_RFC2822", Arrays.asList(1, 2)), - new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm ZZ", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", + new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm ZZ", "EEE, dd MMM yyyy HH:mm XXX", + "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE} (?:Z|[+-]%{HOUR}:%{MINUTE})\\b", "DATESTAMP_RFC2822", Collections.singletonList(1)), - new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm Z", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", + new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm Z", "EEE, dd MMM yyyy HH:mm XX", + "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE} (?:Z|[+-]%{HOUR}%{MINUTE})\\b", "DATESTAMP_RFC2822", Collections.singletonList(1)), - new CandidateTimestampFormat("EEE MMM dd HH:mm:ss zzz YYYY", + new CandidateTimestampFormat("EEE MMM dd HH:mm:ss zzz YYYY", "EEE MMM dd HH:mm:ss zzz yyyy", "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{TZ} %{YEAR}\\b", "DATESTAMP_OTHER", Arrays.asList(1, 2)), - new CandidateTimestampFormat("EEE MMM dd HH:mm zzz YYYY", + new CandidateTimestampFormat("EEE MMM dd HH:mm zzz YYYY", "EEE MMM dd HH:mm zzz yyyy", "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE} %{TZ} %{YEAR}\\b", "DATESTAMP_OTHER", Collections.singletonList(1)), - new CandidateTimestampFormat("YYYYMMddHHmmss", "\\b\\d{14}\\b", + new CandidateTimestampFormat("YYYYMMddHHmmss", "yyyyMMddHHmmss", "\\b\\d{14}\\b", "\\b20\\d{2}%{MONTHNUM2}(?:(?:0[1-9])|(?:[12][0-9])|(?:3[01]))(?:2[0123]|[01][0-9])%{MINUTE}(?:[0-5][0-9]|60)\\b", "DATESTAMP_EVENTLOG"), - new CandidateTimestampFormat("EEE MMM dd HH:mm:ss YYYY", + new CandidateTimestampFormat("EEE MMM dd HH:mm:ss YYYY", "EEE MMM dd HH:mm:ss yyyy", "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{YEAR}\\b", "HTTPDERROR_DATE", Arrays.asList(1, 2)), new CandidateTimestampFormat(Arrays.asList("MMM dd HH:mm:ss,SSS", "MMM d HH:mm:ss,SSS"), + Arrays.asList("MMM dd HH:mm:ss,SSS", "MMM d HH:mm:ss,SSS"), "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}\\b", "SYSLOGTIMESTAMP", Collections.singletonList(1)), new CandidateTimestampFormat(Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), + Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b", "SYSLOGTIMESTAMP", Collections.singletonList(1)), - new CandidateTimestampFormat("dd/MMM/YYYY:HH:mm:ss Z", "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", + new CandidateTimestampFormat("dd/MMM/YYYY:HH:mm:ss Z", "dd/MMM/yyyy:HH:mm:ss XX", + "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "\\b%{MONTHDAY}/%{MONTH}/%{YEAR}:%{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) [+-]?%{HOUR}%{MINUTE}\\b", "HTTPDATE"), - new CandidateTimestampFormat("MMM dd, YYYY K:mm:ss a", "\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", + new CandidateTimestampFormat("MMM dd, YYYY h:mm:ss a", "MMM dd, yyyy h:mm:ss a", + "\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "%{MONTH} %{MONTHDAY}, 20\\d{2} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:AM|PM)\\b", "CATALINA_DATESTAMP"), new CandidateTimestampFormat(Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"), + Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", "%{MONTH} +%{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b", "CISCOTIMESTAMP", Collections.singletonList(1)), - new CandidateTimestampFormat("UNIX_MS", "\\b\\d{13}\\b", "\\b\\d{13}\\b", "POSINT"), - new CandidateTimestampFormat("UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "\\b\\d{10}\\.(?:\\d{3}){1,3}\\b", "NUMBER"), - new CandidateTimestampFormat("UNIX", "\\b\\d{10}\\b", "\\b\\d{10}\\b", "POSINT"), - new CandidateTimestampFormat("TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM") + new CandidateTimestampFormat("UNIX_MS", "UNIX_MS", "\\b\\d{13}\\b", "\\b\\d{13}\\b", "POSINT"), + new CandidateTimestampFormat("UNIX", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "\\b\\d{10}\\.(?:\\d{3}){1,3}\\b", "NUMBER"), + new CandidateTimestampFormat("UNIX", "UNIX", "\\b\\d{10}\\b", "\\b\\d{10}\\b", "POSINT"), + new CandidateTimestampFormat("TAI64N", "TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM") ); private TimestampFormatFinder() { @@ -151,7 +182,7 @@ public static TimestampMatch findFirstMatch(String text) { /** * Find the first timestamp format that matches part of the supplied value. * @param text The value that the returned timestamp format must exist within. - * @param requiredFormat A date format that any returned match must support. + * @param requiredFormat A timestamp format that any returned match must support. * @return The timestamp format, or null if none matches. */ public static TimestampMatch findFirstMatch(String text, String requiredFormat) { @@ -174,14 +205,19 @@ public static TimestampMatch findFirstMatch(String text, int ignoreCandidates) { * excluding a specified number of candidate formats. * @param text The value that the returned timestamp format must exist within. * @param ignoreCandidates The number of candidate formats to exclude from the search. - * @param requiredFormat A date format that any returned match must support. + * @param requiredFormat A timestamp format that any returned match must support. * @return The timestamp format, or null if none matches. */ public static TimestampMatch findFirstMatch(String text, int ignoreCandidates, String requiredFormat) { + if (ignoreCandidates >= ORDERED_CANDIDATE_FORMATS.size()) { + return null; + } Boolean[] quickRuleoutMatches = new Boolean[QUICK_RULE_OUT_PATTERNS.size()]; int index = ignoreCandidates; + String adjustedRequiredFormat = adjustRequiredFormat(requiredFormat); for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) { - if (requiredFormat == null || candidate.dateFormats.contains(requiredFormat)) { + if (adjustedRequiredFormat == null || candidate.jodaTimestampFormats.contains(adjustedRequiredFormat) || + candidate.javaTimestampFormats.contains(adjustedRequiredFormat)) { boolean quicklyRuledOut = false; for (Integer quickRuleOutIndex : candidate.quickRuleOutIndices) { if (quickRuleoutMatches[quickRuleOutIndex] == null) { @@ -219,7 +255,7 @@ public static TimestampMatch findFirstFullMatch(String text) { /** * Find the best timestamp format for matching an entire field value. * @param text The value that the returned timestamp format must match in its entirety. - * @param requiredFormat A date format that any returned match must support. + * @param requiredFormat A timestamp format that any returned match must support. * @return The timestamp format, or null if none matches. */ public static TimestampMatch findFirstFullMatch(String text, String requiredFormat) { @@ -242,13 +278,18 @@ public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidate * excluding a specified number of candidate formats. * @param text The value that the returned timestamp format must match in its entirety. * @param ignoreCandidates The number of candidate formats to exclude from the search. - * @param requiredFormat A date format that any returned match must support. + * @param requiredFormat A timestamp format that any returned match must support. * @return The timestamp format, or null if none matches. */ public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidates, String requiredFormat) { + if (ignoreCandidates >= ORDERED_CANDIDATE_FORMATS.size()) { + return null; + } int index = ignoreCandidates; + String adjustedRequiredFormat = adjustRequiredFormat(requiredFormat); for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) { - if (requiredFormat == null || candidate.dateFormats.contains(requiredFormat)) { + if (adjustedRequiredFormat == null || candidate.jodaTimestampFormats.contains(adjustedRequiredFormat) || + candidate.javaTimestampFormats.contains(adjustedRequiredFormat)) { Map captures = candidate.strictFullMatchGrok.captures(text); if (captures != null) { return makeTimestampMatch(candidate, index, "", text, ""); @@ -259,16 +300,32 @@ public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidate return null; } + /** + * If a required timestamp format contains a fractional seconds component, adjust it to the + * fractional seconds format that's in the candidate timestamp formats, i.e. ",SSS". So, for + * example, "YYYY-MM-dd HH:mm:ss.SSSSSSSSS Z" would get adjusted to "YYYY-MM-dd HH:mm:ss,SSS Z". + */ + static String adjustRequiredFormat(String requiredFormat) { + + return (requiredFormat == null) ? null : + FRACTIONAL_SECOND_TIMESTAMP_FORMAT_PATTERN.matcher(requiredFormat).replaceFirst(DEFAULT_FRACTIONAL_SECOND_FORMAT); + } + private static TimestampMatch makeTimestampMatch(CandidateTimestampFormat chosenTimestampFormat, int chosenIndex, String preface, String matchedDate, String epilogue) { Tuple fractionalSecondsInterpretation = interpretFractionalSeconds(matchedDate); - List dateFormats = chosenTimestampFormat.dateFormats; + List jodaTimestampFormats = chosenTimestampFormat.jodaTimestampFormats; + List javaTimestampFormats = chosenTimestampFormat.javaTimestampFormats; Pattern simplePattern = chosenTimestampFormat.simplePattern; char separator = fractionalSecondsInterpretation.v1(); if (separator != DEFAULT_FRACTIONAL_SECOND_SEPARATOR) { - dateFormats = dateFormats.stream().map(dateFormat -> dateFormat.replace(DEFAULT_FRACTIONAL_SECOND_SEPARATOR, separator)) + jodaTimestampFormats = jodaTimestampFormats.stream() + .map(jodaTimestampFormat -> jodaTimestampFormat.replace(DEFAULT_FRACTIONAL_SECOND_SEPARATOR, separator)) + .collect(Collectors.toList()); + javaTimestampFormats = javaTimestampFormats.stream() + .map(javaTimestampFormat -> javaTimestampFormat.replace(DEFAULT_FRACTIONAL_SECOND_SEPARATOR, separator)) .collect(Collectors.toList()); - if (dateFormats.stream().noneMatch(dateFormat -> dateFormat.startsWith("UNIX"))) { + if (jodaTimestampFormats.stream().noneMatch(jodaTimestampFormat -> jodaTimestampFormat.startsWith("UNIX"))) { String patternStr = simplePattern.pattern(); int separatorPos = patternStr.lastIndexOf(DEFAULT_FRACTIONAL_SECOND_SEPARATOR); if (separatorPos >= 0) { @@ -281,11 +338,15 @@ private static TimestampMatch makeTimestampMatch(CandidateTimestampFormat chosen int numberOfDigitsInFractionalComponent = fractionalSecondsInterpretation.v2(); if (numberOfDigitsInFractionalComponent > 3) { String fractionalSecondsFormat = "SSSSSSSSS".substring(0, numberOfDigitsInFractionalComponent); - dateFormats = dateFormats.stream().map(dateFormat -> dateFormat.replace("SSS", fractionalSecondsFormat)) + jodaTimestampFormats = jodaTimestampFormats.stream() + .map(jodaTimestampFormat -> jodaTimestampFormat.replace("SSS", fractionalSecondsFormat)) + .collect(Collectors.toList()); + javaTimestampFormats = javaTimestampFormats.stream() + .map(javaTimestampFormat -> javaTimestampFormat.replace("SSS", fractionalSecondsFormat)) .collect(Collectors.toList()); } - return new TimestampMatch(chosenIndex, preface, dateFormats, simplePattern, chosenTimestampFormat.standardGrokPatternName, - epilogue); + return new TimestampMatch(chosenIndex, preface, jodaTimestampFormats, javaTimestampFormats, simplePattern, + chosenTimestampFormat.standardGrokPatternName, epilogue); } /** @@ -323,7 +384,12 @@ public static final class TimestampMatch { /** * Time format specifier(s) that will work with Logstash and Ingest pipeline date parsers. */ - public final List dateFormats; + public final List jodaTimestampFormats; + + /** + * Time format specifier(s) that will work with Logstash and Ingest pipeline date parsers. + */ + public final List javaTimestampFormats; /** * A simple regex that will work in many languages to detect whether the timestamp format @@ -341,25 +407,25 @@ public static final class TimestampMatch { */ public final String epilogue; - TimestampMatch(int candidateIndex, String preface, String dateFormat, String simpleRegex, String grokPatternName, String epilogue) { - this(candidateIndex, preface, Collections.singletonList(dateFormat), simpleRegex, grokPatternName, epilogue); - } - - TimestampMatch(int candidateIndex, String preface, String dateFormat, String simpleRegex, String grokPatternName, String epilogue, - boolean hasFractionalComponentSmallerThanMillisecond) { - this(candidateIndex, preface, Collections.singletonList(dateFormat), simpleRegex, grokPatternName, epilogue); + TimestampMatch(int candidateIndex, String preface, String jodaTimestampFormat, String javaTimestampFormat, String simpleRegex, + String grokPatternName, String epilogue) { + this(candidateIndex, preface, Collections.singletonList(jodaTimestampFormat), Collections.singletonList(javaTimestampFormat), + simpleRegex, grokPatternName, epilogue); } - TimestampMatch(int candidateIndex, String preface, List dateFormats, String simpleRegex, String grokPatternName, - String epilogue) { - this(candidateIndex, preface, dateFormats, Pattern.compile(simpleRegex), grokPatternName, epilogue); + TimestampMatch(int candidateIndex, String preface, List jodaTimestampFormats, List javaTimestampFormats, + String simpleRegex, String grokPatternName, String epilogue) { + this(candidateIndex, preface, jodaTimestampFormats, javaTimestampFormats, Pattern.compile(simpleRegex), grokPatternName, + epilogue); } - TimestampMatch(int candidateIndex, String preface, List dateFormats, Pattern simplePattern, String grokPatternName, + TimestampMatch(int candidateIndex, String preface, List jodaTimestampFormats, List javaTimestampFormats, + Pattern simplePattern, String grokPatternName, String epilogue) { this.candidateIndex = candidateIndex; this.preface = preface; - this.dateFormats = dateFormats; + this.jodaTimestampFormats = Collections.unmodifiableList(jodaTimestampFormats); + this.javaTimestampFormats = Collections.unmodifiableList(javaTimestampFormats); this.simplePattern = simplePattern; this.grokPatternName = grokPatternName; this.epilogue = epilogue; @@ -370,8 +436,8 @@ public static final class TimestampMatch { * I.e., does the textual representation NOT define the timezone? */ public boolean hasTimezoneDependentParsing() { - return dateFormats.stream() - .anyMatch(dateFormat -> dateFormat.contains("HH") && dateFormat.toLowerCase(Locale.ROOT).indexOf('z') == -1); + return javaTimestampFormats.stream().anyMatch(javaTimestampFormat -> + javaTimestampFormat.indexOf('X') == -1 && javaTimestampFormat.indexOf('z') == -1 && javaTimestampFormat.contains("mm")); } /** @@ -380,13 +446,13 @@ public boolean hasTimezoneDependentParsing() { * and possibly also a "format" setting. */ public Map getEsDateMappingTypeWithFormat() { - if (dateFormats.contains("TAI64N")) { - // There's no format for TAI64N in the date formats used in mappings + if (jodaTimestampFormats.contains("TAI64N")) { + // There's no format for TAI64N in the timestamp formats used in mappings return Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"); } Map mapping = new LinkedHashMap<>(); mapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); - String formats = dateFormats.stream().flatMap(format -> { + String formats = jodaTimestampFormats.stream().flatMap(format -> { switch (format) { case "ISO8601": return Stream.empty(); @@ -406,7 +472,8 @@ public Map getEsDateMappingTypeWithFormat() { @Override public int hashCode() { - return Objects.hash(candidateIndex, preface, dateFormats, simplePattern.pattern(), grokPatternName, epilogue); + return Objects.hash(candidateIndex, preface, jodaTimestampFormats, javaTimestampFormats, simplePattern.pattern(), + grokPatternName, epilogue); } @Override @@ -421,7 +488,8 @@ public boolean equals(Object other) { TimestampMatch that = (TimestampMatch) other; return this.candidateIndex == that.candidateIndex && Objects.equals(this.preface, that.preface) && - Objects.equals(this.dateFormats, that.dateFormats) && + Objects.equals(this.jodaTimestampFormats, that.jodaTimestampFormats) && + Objects.equals(this.javaTimestampFormats, that.javaTimestampFormats) && Objects.equals(this.simplePattern.pattern(), that.simplePattern.pattern()) && Objects.equals(this.grokPatternName, that.grokPatternName) && Objects.equals(this.epilogue, that.epilogue); @@ -430,7 +498,8 @@ public boolean equals(Object other) { @Override public String toString() { return "index = " + candidateIndex + (preface.isEmpty() ? "" : ", preface = '" + preface + "'") + - ", date formats = " + dateFormats.stream().collect(Collectors.joining("', '", "[ '", "' ]")) + + ", Joda timestamp formats = " + jodaTimestampFormats.stream().collect(Collectors.joining("', '", "[ '", "' ]")) + + ", Java timestamp formats = " + javaTimestampFormats.stream().collect(Collectors.joining("', '", "[ '", "' ]")) + ", simple pattern = '" + simplePattern.pattern() + "', grok pattern = '" + grokPatternName + "'" + (epilogue.isEmpty() ? "" : ", epilogue = '" + epilogue + "'"); } @@ -438,29 +507,36 @@ public String toString() { static final class CandidateTimestampFormat { - final List dateFormats; + final List jodaTimestampFormats; + final List javaTimestampFormats; final Pattern simplePattern; final Grok strictSearchGrok; final Grok strictFullMatchGrok; final String standardGrokPatternName; final List quickRuleOutIndices; - CandidateTimestampFormat(String dateFormat, String simpleRegex, String strictGrokPattern, String standardGrokPatternName) { - this(Collections.singletonList(dateFormat), simpleRegex, strictGrokPattern, standardGrokPatternName); + CandidateTimestampFormat(String jodaTimestampFormat, String javaTimestampFormat, String simpleRegex, String strictGrokPattern, + String standardGrokPatternName) { + this(Collections.singletonList(jodaTimestampFormat), Collections.singletonList(javaTimestampFormat), simpleRegex, + strictGrokPattern, standardGrokPatternName); } - CandidateTimestampFormat(String dateFormat, String simpleRegex, String strictGrokPattern, String standardGrokPatternName, - List quickRuleOutIndices) { - this(Collections.singletonList(dateFormat), simpleRegex, strictGrokPattern, standardGrokPatternName, quickRuleOutIndices); + CandidateTimestampFormat(String jodaTimestampFormat, String javaTimestampFormat, String simpleRegex, String strictGrokPattern, + String standardGrokPatternName, List quickRuleOutIndices) { + this(Collections.singletonList(jodaTimestampFormat), Collections.singletonList(javaTimestampFormat), simpleRegex, + strictGrokPattern, standardGrokPatternName, quickRuleOutIndices); } - CandidateTimestampFormat(List dateFormats, String simpleRegex, String strictGrokPattern, String standardGrokPatternName) { - this(dateFormats, simpleRegex, strictGrokPattern, standardGrokPatternName, Collections.emptyList()); + CandidateTimestampFormat(List jodaTimestampFormats, List javaTimestampFormats, String simpleRegex, + String strictGrokPattern, String standardGrokPatternName) { + this(jodaTimestampFormats, javaTimestampFormats, simpleRegex, strictGrokPattern, standardGrokPatternName, + Collections.emptyList()); } - CandidateTimestampFormat(List dateFormats, String simpleRegex, String strictGrokPattern, String standardGrokPatternName, - List quickRuleOutIndices) { - this.dateFormats = dateFormats; + CandidateTimestampFormat(List jodaTimestampFormats, List javaTimestampFormats, String simpleRegex, + String strictGrokPattern, String standardGrokPatternName, List quickRuleOutIndices) { + this.jodaTimestampFormats = jodaTimestampFormats; + this.javaTimestampFormats = javaTimestampFormats; this.simplePattern = Pattern.compile(simpleRegex, Pattern.MULTILINE); // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java this.strictSearchGrok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + strictGrokPattern + diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java index d5e3fba34c972..66e6bbae88666 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java @@ -93,7 +93,8 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List explanatio Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides); if (timeField != null) { structureBuilder.setTimestampField(timeField.v1()) - .setTimestampFormats(timeField.v2().dateFormats) + .setJodaTimestampFormats(timeField.v2().jodaTimestampFormats) + .setJavaTimestampFormats(timeField.v2().javaTimestampFormats) .setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing()); } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java index decc61a5397a5..9f6699fe71bd7 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java @@ -50,7 +50,7 @@ public void testCreateConfigsGivenCompleteCsv() throws Exception { assertEquals(Arrays.asList("time", "message"), structure.getColumnNames()); assertNull(structure.getGrokPattern()); assertEquals("time", structure.getTimestampField()); - assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); + assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats()); } public void testCreateConfigsGivenCompleteCsvAndColumnNamesOverride() throws Exception { @@ -84,7 +84,7 @@ public void testCreateConfigsGivenCompleteCsvAndColumnNamesOverride() throws Exc assertEquals(Arrays.asList("my_time", "my_message"), structure.getColumnNames()); assertNull(structure.getGrokPattern()); assertEquals("my_time", structure.getTimestampField()); - assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); + assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats()); } public void testCreateConfigsGivenCompleteCsvAndHasHeaderRowOverride() throws Exception { @@ -120,7 +120,7 @@ public void testCreateConfigsGivenCompleteCsvAndHasHeaderRowOverride() throws Ex assertEquals(Arrays.asList("column1", "column2"), structure.getColumnNames()); assertNull(structure.getGrokPattern()); assertNull(structure.getTimestampField()); - assertNull(structure.getTimestampFormats()); + assertNull(structure.getJodaTimestampFormats()); } public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception { @@ -153,7 +153,7 @@ public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception assertEquals(Arrays.asList("message", "time", "count"), structure.getColumnNames()); assertNull(structure.getGrokPattern()); assertEquals("time", structure.getTimestampField()); - assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); + assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats()); } public void testCreateConfigsGivenCsvWithTrailingNulls() throws Exception { @@ -193,7 +193,7 @@ public void testCreateConfigsGivenCsvWithTrailingNulls() throws Exception { "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "column18", "column19"), structure.getColumnNames()); assertNull(structure.getGrokPattern()); assertEquals("tpep_pickup_datetime", structure.getTimestampField()); - assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats()); + assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getJodaTimestampFormats()); } public void testCreateConfigsGivenCsvWithTrailingNullsAndOverriddenTimeField() throws Exception { @@ -237,7 +237,7 @@ public void testCreateConfigsGivenCsvWithTrailingNullsAndOverriddenTimeField() t "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "column18", "column19"), structure.getColumnNames()); assertNull(structure.getGrokPattern()); assertEquals("tpep_dropoff_datetime", structure.getTimestampField()); - assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats()); + assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getJodaTimestampFormats()); } public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exception { @@ -277,7 +277,7 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exce "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount"), structure.getColumnNames()); assertNull(structure.getGrokPattern()); assertEquals("tpep_pickup_datetime", structure.getTimestampField()); - assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats()); + assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getJodaTimestampFormats()); } public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeaderAndColumnNamesOverride() throws Exception { @@ -324,7 +324,7 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeaderAndColumnNames structure.getColumnNames()); assertNull(structure.getGrokPattern()); assertEquals("my_tpep_pickup_datetime", structure.getTimestampField()); - assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats()); + assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getJodaTimestampFormats()); } public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception { @@ -357,7 +357,7 @@ public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception { assertEquals(Arrays.asList("pos_id", "trip_id", "latitude", "longitude", "altitude", "timestamp"), structure.getColumnNames()); assertNull(structure.getGrokPattern()); assertEquals("timestamp", structure.getTimestampField()); - assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss.SSSSSS"), structure.getTimestampFormats()); + assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss.SSSSSS"), structure.getJodaTimestampFormats()); } public void testFindHeaderFromSampleGivenHeaderInSample() throws IOException { diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java index 901fca83ffd06..698b53cf7c741 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java @@ -39,7 +39,7 @@ public void testGuessTimestampGivenSingleSampleSingleField() { FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("field1", match.v1()); - assertThat(match.v2().dateFormats, contains("ISO8601")); + assertThat(match.v2().jodaTimestampFormats, contains("ISO8601")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } @@ -52,7 +52,7 @@ public void testGuessTimestampGivenSingleSampleSingleFieldAndConsistentTimeField FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides); assertNotNull(match); assertEquals("field1", match.v1()); - assertThat(match.v2().dateFormats, contains("ISO8601")); + assertThat(match.v2().jodaTimestampFormats, contains("ISO8601")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } @@ -76,7 +76,7 @@ public void testGuessTimestampGivenSingleSampleSingleFieldAndConsistentTimeForma FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides); assertNotNull(match); assertEquals("field1", match.v1()); - assertThat(match.v2().dateFormats, contains("ISO8601")); + assertThat(match.v2().jodaTimestampFormats, contains("ISO8601")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } @@ -99,7 +99,7 @@ public void testGuessTimestampGivenSamplesWithSameSingleTimeField() { FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("field1", match.v1()); - assertThat(match.v2().dateFormats, contains("ISO8601")); + assertThat(match.v2().jodaTimestampFormats, contains("ISO8601")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } @@ -128,7 +128,7 @@ public void testGuessTimestampGivenSingleSampleManyFieldsOneTimeFormat() { FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("time", match.v1()); - assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS")); + assertThat(match.v2().jodaTimestampFormats, contains("YYYY-MM-dd HH:mm:ss,SSS")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } @@ -145,7 +145,7 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormat() { FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("time", match.v1()); - assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS")); + assertThat(match.v2().jodaTimestampFormats, contains("YYYY-MM-dd HH:mm:ss,SSS")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } @@ -176,7 +176,7 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDist FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("time", match.v1()); - assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS")); + assertThat(match.v2().jodaTimestampFormats, contains("YYYY-MM-dd HH:mm:ss,SSS")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } @@ -193,7 +193,7 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDist FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("time", match.v1()); - assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss")); + assertThat(match.v2().jodaTimestampFormats, contains("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss")); assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName); } @@ -226,7 +226,7 @@ public void testGuessTimestampGivenSamplesWithManyFieldsInconsistentAndConsisten FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("time2", match.v1()); - assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss")); + assertThat(match.v2().jodaTimestampFormats, contains("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss")); assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName); } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java index 6856e9a60214b..ce401c182eabd 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java @@ -38,6 +38,6 @@ public void testCreateConfigsGivenGoodJson() throws Exception { assertNull(structure.getShouldTrimFields()); assertNull(structure.getGrokPattern()); assertEquals("timestamp", structure.getTimestampField()); - assertEquals(Collections.singletonList("UNIX_MS"), structure.getTimestampFormats()); + assertEquals(Collections.singletonList("UNIX_MS"), structure.getJodaTimestampFormats()); } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java index 5bc40a165117e..b4c28eda1496f 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java @@ -119,14 +119,14 @@ public void testCreateConfigsGivenElasticsearchLog() throws Exception { assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); } assertNull(structure.getExcludeLinesPattern()); - assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", structure.getMultilineStartPattern()); assertNull(structure.getDelimiter()); assertNull(structure.getQuote()); assertNull(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertEquals("\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} \\]\\[.*", structure.getGrokPattern()); assertEquals("timestamp", structure.getTimestampField()); - assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); + assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats()); } public void testCreateConfigsGivenElasticsearchLogAndTimestampFieldOverride() throws Exception { @@ -149,14 +149,14 @@ public void testCreateConfigsGivenElasticsearchLogAndTimestampFieldOverride() th assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); } assertNull(structure.getExcludeLinesPattern()); - assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", structure.getMultilineStartPattern()); assertNull(structure.getDelimiter()); assertNull(structure.getQuote()); assertNull(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertEquals("\\[%{TIMESTAMP_ISO8601:my_time}\\]\\[%{LOGLEVEL:loglevel} \\]\\[.*", structure.getGrokPattern()); assertEquals("my_time", structure.getTimestampField()); - assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); + assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats()); } public void testCreateConfigsGivenElasticsearchLogAndGrokPatternOverride() throws Exception { @@ -180,7 +180,7 @@ public void testCreateConfigsGivenElasticsearchLogAndGrokPatternOverride() throw assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); } assertNull(structure.getExcludeLinesPattern()); - assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", structure.getMultilineStartPattern()); assertNull(structure.getDelimiter()); assertNull(structure.getQuote()); assertNull(structure.getHasHeaderRow()); @@ -188,7 +188,7 @@ public void testCreateConfigsGivenElasticsearchLogAndGrokPatternOverride() throw assertEquals("\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} *\\]" + "\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}", structure.getGrokPattern()); assertEquals("timestamp", structure.getTimestampField()); - assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); + assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats()); } public void testCreateConfigsGivenElasticsearchLogAndImpossibleGrokPatternOverride() { @@ -312,8 +312,8 @@ public void testMostLikelyTimestampGivenAllSame() { Tuple> mostLikelyMatch = TextLogFileStructureFinder.mostLikelyTimestamp(sample.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES); assertNotNull(mostLikelyMatch); - assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), - mostLikelyMatch.v1()); + assertEquals(new TimestampMatch(9, "", "ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSS", + "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), mostLikelyMatch.v1()); } public void testMostLikelyTimestampGivenExceptionTrace() { @@ -325,8 +325,8 @@ public void testMostLikelyTimestampGivenExceptionTrace() { // Even though many lines have a timestamp near the end (in the Lucene version information), // these are so far along the lines that the weight of the timestamp near the beginning of the // first line should take precedence - assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), - mostLikelyMatch.v1()); + assertEquals(new TimestampMatch(9, "", "ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSS", + "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), mostLikelyMatch.v1()); } public void testMostLikelyTimestampGivenExceptionTraceAndTimestampFormatOverride() { @@ -338,8 +338,8 @@ public void testMostLikelyTimestampGivenExceptionTraceAndTimestampFormatOverride assertNotNull(mostLikelyMatch); // The override should force the seemingly inferior choice of timestamp - assertEquals(new TimestampMatch(6, "", "YYYY-MM-dd HH:mm:ss", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", - ""), mostLikelyMatch.v1()); + assertEquals(new TimestampMatch(6, "", "YYYY-MM-dd HH:mm:ss", "yyyy-MM-dd HH:mm:ss", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", + "TIMESTAMP_ISO8601", ""), mostLikelyMatch.v1()); } public void testMostLikelyTimestampGivenExceptionTraceAndImpossibleTimestampFormatOverride() { diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java index bf27912b9db26..4b9b0b36c3426 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java @@ -6,14 +6,11 @@ package org.elasticsearch.xpack.ml.filestructurefinder; import org.elasticsearch.common.collect.Tuple; +import org.elasticsearch.common.time.DateFormatters; import org.elasticsearch.xpack.ml.filestructurefinder.TimestampFormatFinder.TimestampMatch; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; -import org.joda.time.format.ISODateTimeFormat; import java.util.Arrays; +import java.util.List; import java.util.Locale; public class TimestampFormatFinderTests extends FileStructureTestCase { @@ -28,199 +25,206 @@ public void testFindFirstMatchGivenNoMatch() { public void testFindFirstMatchGivenOnlyIso8601() { - TimestampMatch expected = new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", - ""); + validateTimestampMatch(new TimestampMatch(7, "", "ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSSXX", + "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15T16:14:56,374Z", + 1526400896374L); + validateTimestampMatch(new TimestampMatch(7, "", "ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSSXX", + "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15T17:14:56,374+0100", + 1526400896374L); + validateTimestampMatch(new TimestampMatch(8, "", "ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSSXXX", + "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15T17:14:56,374+01:00", + 1526400896374L); + validateTimestampMatch(new TimestampMatch(9, "", "ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSS", + "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15T17:14:56,374", 1526400896374L); - checkAndValidateDateFormat(expected, "2018-05-15T16:14:56,374Z", 1526400896374L); - checkAndValidateDateFormat(expected, "2018-05-15T17:14:56,374+0100", 1526400896374L); - checkAndValidateDateFormat(expected, "2018-05-15T17:14:56,374+01:00", 1526400896374L); - checkAndValidateDateFormat(expected, "2018-05-15T17:14:56,374", 1526400896374L); - checkAndValidateDateFormat(expected, "2018-05-15T16:14:56Z", 1526400896000L); - checkAndValidateDateFormat(expected, "2018-05-15T17:14:56+0100", 1526400896000L); - checkAndValidateDateFormat(expected, "2018-05-15T17:14:56+01:00", 1526400896000L); - checkAndValidateDateFormat(expected, "2018-05-15T17:14:56", 1526400896000L); + TimestampMatch pureIso8601Expected = new TimestampMatch(10, "", "ISO8601", "ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""); - checkAndValidateDateFormat(new TimestampMatch(1, "", "YYYY-MM-dd HH:mm:ss,SSSZ", + validateTimestampMatch(pureIso8601Expected, "2018-05-15T16:14:56Z", 1526400896000L); + validateTimestampMatch(pureIso8601Expected, "2018-05-15T17:14:56+0100", 1526400896000L); + validateTimestampMatch(pureIso8601Expected, "2018-05-15T17:14:56+01:00", 1526400896000L); + validateTimestampMatch(pureIso8601Expected, "2018-05-15T17:14:56", 1526400896000L); + + validateTimestampMatch(new TimestampMatch(1, "", "YYYY-MM-dd HH:mm:ss,SSSZ", "yyyy-MM-dd HH:mm:ss,SSSXX", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 16:14:56,374Z", 1526400896374L); - checkAndValidateDateFormat(new TimestampMatch(1, "", "YYYY-MM-dd HH:mm:ss,SSSZ", + validateTimestampMatch(new TimestampMatch(1, "", "YYYY-MM-dd HH:mm:ss,SSSZ", "yyyy-MM-dd HH:mm:ss,SSSXX", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374+0100", 1526400896374L); - checkAndValidateDateFormat(new TimestampMatch(2, "", "YYYY-MM-dd HH:mm:ss,SSSZZ", + validateTimestampMatch(new TimestampMatch(2, "", "YYYY-MM-dd HH:mm:ss,SSSZZ", "yyyy-MM-dd HH:mm:ss,SSSXXX", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374+01:00", 1526400896374L); - checkAndValidateDateFormat(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss,SSS", + validateTimestampMatch(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss,SSS", "yyyy-MM-dd HH:mm:ss,SSS", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374", 1526400896374L); - checkAndValidateDateFormat(new TimestampMatch(4, "", "YYYY-MM-dd HH:mm:ssZ", + validateTimestampMatch(new TimestampMatch(4, "", "YYYY-MM-dd HH:mm:ssZ", "yyyy-MM-dd HH:mm:ssXX", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 16:14:56Z", 1526400896000L); - checkAndValidateDateFormat(new TimestampMatch(4, "", "YYYY-MM-dd HH:mm:ssZ", + validateTimestampMatch(new TimestampMatch(4, "", "YYYY-MM-dd HH:mm:ssZ", "yyyy-MM-dd HH:mm:ssXX", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56+0100", 1526400896000L); - checkAndValidateDateFormat(new TimestampMatch(5, "", "YYYY-MM-dd HH:mm:ssZZ", + validateTimestampMatch(new TimestampMatch(5, "", "YYYY-MM-dd HH:mm:ssZZ", "yyyy-MM-dd HH:mm:ssXXX", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56+01:00", 1526400896000L); - checkAndValidateDateFormat(new TimestampMatch(6, "", "YYYY-MM-dd HH:mm:ss", + validateTimestampMatch(new TimestampMatch(6, "", "YYYY-MM-dd HH:mm:ss", "yyyy-MM-dd HH:mm:ss", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56", 1526400896000L); } - public void testFindFirstMatchGivenOnlyKnownDateFormat() { + public void testFindFirstMatchGivenOnlyKnownTimestampFormat() { // Note: some of the time formats give millisecond accuracy, some second accuracy and some minute accuracy - checkAndValidateDateFormat(new TimestampMatch(0, "", "YYYY-MM-dd HH:mm:ss,SSS Z", + validateTimestampMatch(new TimestampMatch(0, "", "YYYY-MM-dd HH:mm:ss,SSS Z", "yyyy-MM-dd HH:mm:ss,SSS XX", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TOMCAT_DATESTAMP", ""), "2018-05-15 17:14:56,374 +0100", 1526400896374L); - checkAndValidateDateFormat(new TimestampMatch(8, "", "EEE MMM dd YYYY HH:mm:ss zzz", + validateTimestampMatch(new TimestampMatch(11, "", "EEE MMM dd YYYY HH:mm:ss zzz", "EEE MMM dd yyyy HH:mm:ss zzz", "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC822", ""), "Tue May 15 2018 16:14:56 UTC", 1526400896000L); - checkAndValidateDateFormat(new TimestampMatch(9, "", "EEE MMM dd YYYY HH:mm zzz", + validateTimestampMatch(new TimestampMatch(12, "", "EEE MMM dd YYYY HH:mm zzz", "EEE MMM dd yyyy HH:mm zzz", "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC822", ""), "Tue May 15 2018 16:14 UTC", 1526400840000L); - checkAndValidateDateFormat(new TimestampMatch(10, "", "EEE, dd MMM YYYY HH:mm:ss ZZ", + validateTimestampMatch(new TimestampMatch(13, "", "EEE, dd MMM YYYY HH:mm:ss ZZ", "EEE, dd MMM yyyy HH:mm:ss XXX", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""), "Tue, 15 May 2018 17:14:56 +01:00", 1526400896000L); - checkAndValidateDateFormat(new TimestampMatch(11, "", "EEE, dd MMM YYYY HH:mm:ss Z", + validateTimestampMatch(new TimestampMatch(14, "", "EEE, dd MMM YYYY HH:mm:ss Z", "EEE, dd MMM yyyy HH:mm:ss XX", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""), "Tue, 15 May 2018 17:14:56 +0100", 1526400896000L); - checkAndValidateDateFormat(new TimestampMatch(12, "", "EEE, dd MMM YYYY HH:mm ZZ", + validateTimestampMatch(new TimestampMatch(15, "", "EEE, dd MMM YYYY HH:mm ZZ", "EEE, dd MMM yyyy HH:mm XXX", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""), "Tue, 15 May 2018 17:14 +01:00", 1526400840000L); - checkAndValidateDateFormat(new TimestampMatch(13, "", "EEE, dd MMM YYYY HH:mm Z", + validateTimestampMatch(new TimestampMatch(16, "", "EEE, dd MMM YYYY HH:mm Z", "EEE, dd MMM yyyy HH:mm XX", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""), "Tue, 15 May 2018 17:14 +0100", 1526400840000L); - checkAndValidateDateFormat(new TimestampMatch(14, "", "EEE MMM dd HH:mm:ss zzz YYYY", + validateTimestampMatch(new TimestampMatch(17, "", "EEE MMM dd HH:mm:ss zzz YYYY", "EEE MMM dd HH:mm:ss zzz yyyy", "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", "DATESTAMP_OTHER", ""), "Tue May 15 16:14:56 UTC 2018", 1526400896000L); - checkAndValidateDateFormat(new TimestampMatch(15, "", "EEE MMM dd HH:mm zzz YYYY", + validateTimestampMatch(new TimestampMatch(18, "", "EEE MMM dd HH:mm zzz YYYY", "EEE MMM dd HH:mm zzz yyyy", "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", "DATESTAMP_OTHER", ""), "Tue May 15 16:14 UTC 2018", 1526400840000L); - checkAndValidateDateFormat(new TimestampMatch(16, "", "YYYYMMddHHmmss", "\\b\\d{14}\\b", "DATESTAMP_EVENTLOG", ""), + validateTimestampMatch(new TimestampMatch(19, "", "YYYYMMddHHmmss", "yyyyMMddHHmmss", "\\b\\d{14}\\b", + "DATESTAMP_EVENTLOG", ""), "20180515171456", 1526400896000L); - checkAndValidateDateFormat(new TimestampMatch(17, "", "EEE MMM dd HH:mm:ss YYYY", + validateTimestampMatch(new TimestampMatch(20, "", "EEE MMM dd HH:mm:ss YYYY", "EEE MMM dd HH:mm:ss yyyy", "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", "HTTPDERROR_DATE", ""), "Tue May 15 17:14:56 2018", 1526400896000L); - checkAndValidateDateFormat(new TimestampMatch(18, "", Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"), + validateTimestampMatch(new TimestampMatch(21, "", Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"), + Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"), "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}", "SYSLOGTIMESTAMP", ""), "May 15 17:14:56.725", 1526400896725L); - checkAndValidateDateFormat(new TimestampMatch(19, "", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), + validateTimestampMatch(new TimestampMatch(22, "", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), + Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP", ""), "May 15 17:14:56", 1526400896000L); - checkAndValidateDateFormat(new TimestampMatch(20, "", "dd/MMM/YYYY:HH:mm:ss Z", + validateTimestampMatch(new TimestampMatch(23, "", "dd/MMM/YYYY:HH:mm:ss Z", "dd/MMM/yyyy:HH:mm:ss XX", "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "HTTPDATE", ""), "15/May/2018:17:14:56 +0100", 1526400896000L); - checkAndValidateDateFormat(new TimestampMatch(21, "", "MMM dd, YYYY K:mm:ss a", + validateTimestampMatch(new TimestampMatch(24, "", "MMM dd, YYYY h:mm:ss a", "MMM dd, yyyy h:mm:ss a", "\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "CATALINA_DATESTAMP", ""), "May 15, 2018 5:14:56 PM", 1526400896000L); - checkAndValidateDateFormat(new TimestampMatch(22, "", Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"), + validateTimestampMatch(new TimestampMatch(25, "", Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"), + Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", "CISCOTIMESTAMP", ""), "May 15 2018 17:14:56", 1526400896000L); } public void testFindFirstMatchGivenOnlySystemDate() { - assertEquals(new TimestampMatch(23, "", "UNIX_MS", "\\b\\d{13}\\b", "POSINT", ""), + assertEquals(new TimestampMatch(26, "", "UNIX_MS", "UNIX_MS", "\\b\\d{13}\\b", "POSINT", ""), TimestampFormatFinder.findFirstMatch("1526400896374")); - assertEquals(new TimestampMatch(23, "", "UNIX_MS", "\\b\\d{13}\\b", "POSINT", ""), + assertEquals(new TimestampMatch(26, "", "UNIX_MS", "UNIX_MS", "\\b\\d{13}\\b", "POSINT", ""), TimestampFormatFinder.findFirstFullMatch("1526400896374")); - assertEquals(new TimestampMatch(24, "", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "NUMBER", ""), + assertEquals(new TimestampMatch(27, "", "UNIX", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "NUMBER", ""), TimestampFormatFinder.findFirstMatch("1526400896.736")); - assertEquals(new TimestampMatch(24, "", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "NUMBER", ""), + assertEquals(new TimestampMatch(27, "", "UNIX", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "NUMBER", ""), TimestampFormatFinder.findFirstFullMatch("1526400896.736")); - assertEquals(new TimestampMatch(25, "", "UNIX", "\\b\\d{10}\\b", "POSINT", ""), + assertEquals(new TimestampMatch(28, "", "UNIX", "UNIX", "\\b\\d{10}\\b", "POSINT", ""), TimestampFormatFinder.findFirstMatch("1526400896")); - assertEquals(new TimestampMatch(25, "", "UNIX", "\\b\\d{10}\\b", "POSINT", ""), + assertEquals(new TimestampMatch(28, "", "UNIX", "UNIX", "\\b\\d{10}\\b", "POSINT", ""), TimestampFormatFinder.findFirstFullMatch("1526400896")); - assertEquals(new TimestampMatch(26, "", "TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM", ""), + assertEquals(new TimestampMatch(29, "", "TAI64N", "TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM", ""), TimestampFormatFinder.findFirstMatch("400000005afb159a164ac980")); - assertEquals(new TimestampMatch(26, "", "TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM", ""), + assertEquals(new TimestampMatch(29, "", "TAI64N", "TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM", ""), TimestampFormatFinder.findFirstFullMatch("400000005afb159a164ac980")); } - private void checkAndValidateDateFormat(TimestampMatch expected, String text, long expectedEpochMs) { - - assertEquals(expected, TimestampFormatFinder.findFirstMatch(text)); - assertEquals(expected, TimestampFormatFinder.findFirstFullMatch(text)); - - // All the test times are for Tue May 15 2018 16:14:56 UTC, which is 17:14:56 in London - DateTimeZone zone = DateTimeZone.forID("Europe/London"); - DateTime parsed; - for (int i = 0; i < expected.dateFormats.size(); ++i) { - try { - String dateFormat = expected.dateFormats.get(i); - switch (dateFormat) { - case "ISO8601": - parsed = ISODateTimeFormat.dateTimeParser().withZone(zone).withDefaultYear(2018).parseDateTime(text); - break; - default: - DateTimeFormatter parser = DateTimeFormat.forPattern(dateFormat).withZone(zone).withLocale(Locale.UK); - parsed = parser.withDefaultYear(2018).parseDateTime(text); - break; - } - if (expectedEpochMs == parsed.getMillis()) { - break; - } - // If the last one isn't right then propagate - if (i == expected.dateFormats.size() - 1) { - assertEquals(expectedEpochMs, parsed.getMillis()); - } - } catch (RuntimeException e) { - // If the last one throws then propagate - if (i == expected.dateFormats.size() - 1) { - throw e; - } - } - } - assertTrue(expected.simplePattern.matcher(text).find()); - } - public void testFindFirstMatchGivenRealLogMessages() { - assertEquals(new TimestampMatch(7, "[", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", + assertEquals(new TimestampMatch(9, "[", "ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSS", + "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", "][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [3.9gb], compressed ordinary object pointers [true]"), TimestampFormatFinder.findFirstMatch("[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment ] [node-0] " + "heap size [3.9gb], compressed ordinary object pointers [true]")); - assertEquals(new TimestampMatch(20, "192.168.62.101 - - [", "dd/MMM/YYYY:HH:mm:ss Z", + assertEquals(new TimestampMatch(23, "192.168.62.101 - - [", "dd/MMM/YYYY:HH:mm:ss Z", "dd/MMM/yyyy:HH:mm:ss XX", "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "HTTPDATE", "] \"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384"), TimestampFormatFinder.findFirstMatch("192.168.62.101 - - [29/Jun/2016:12:11:31 +0000] " + "\"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384")); - assertEquals(new TimestampMatch(21, "", "MMM dd, YYYY K:mm:ss a", + assertEquals(new TimestampMatch(24, "", "MMM dd, YYYY h:mm:ss a", "MMM dd, yyyy h:mm:ss a", "\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "CATALINA_DATESTAMP", " org.apache.tomcat.util.http.Parameters processParameters"), TimestampFormatFinder.findFirstMatch("Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters")); - assertEquals(new TimestampMatch(19, "", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), + assertEquals(new TimestampMatch(22, "", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), + Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP", " esxi1.acme.com Vpxa: " + "[3CB3FB90 verbose 'vpxavpxaInvtVm' opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed"), TimestampFormatFinder.findFirstMatch("Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' " + "opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed")); - assertEquals(new TimestampMatch(7, "559550912540598297\t", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", + assertEquals(new TimestampMatch(10, "559550912540598297\t", "ISO8601", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", "\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp"), TimestampFormatFinder.findFirstMatch("559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t" + "192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp")); - assertEquals(new TimestampMatch(19, "", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), + assertEquals(new TimestampMatch(22, "", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), + Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP", " dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53"), TimestampFormatFinder.findFirstMatch("Sep 8 11:55:35 dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving " + "'www.elastic.co/A/IN': 95.110.68.206#53")); - assertEquals(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss.SSSSSS", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}", - "TIMESTAMP_ISO8601", + assertEquals(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss.SSSSSS", "yyyy-MM-dd HH:mm:ss.SSSSSS", + "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}", "TIMESTAMP_ISO8601", "|INFO |VirtualServer |1 |client 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client " + "'User1'(id:2) in channel '3er Instanz'(id:2)"), TimestampFormatFinder.findFirstMatch("2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " + " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)")); + + // Differs from the above as the required format is specified + assertEquals(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss.SSSSSS", "yyyy-MM-dd HH:mm:ss.SSSSSS", + "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}", "TIMESTAMP_ISO8601", + "|INFO |VirtualServer |1 |client 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client " + + "'User1'(id:2) in channel '3er Instanz'(id:2)"), + TimestampFormatFinder.findFirstMatch("2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " + + " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)", + randomFrom("YYYY-MM-dd HH:mm:ss.SSSSSS", "yyyy-MM-dd HH:mm:ss.SSSSSS"))); + + // Non-matching required format specified + assertNull(TimestampFormatFinder.findFirstMatch("2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " + + " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)", + randomFrom("UNIX", "EEE MMM dd YYYY HH:mm zzz"))); + } + + public void testAdjustRequiredFormat() { + assertEquals("YYYY-MM-dd HH:mm:ss,SSS Z", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss,SSS Z")); + assertEquals("YYYY-MM-dd HH:mm:ss,SSS Z", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss,SSSSSS Z")); + assertEquals("YYYY-MM-dd HH:mm:ss,SSS Z", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss,SSSSSSSSS Z")); + assertEquals("YYYY-MM-dd HH:mm:ss,SSS Z", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss.SSS Z")); + assertEquals("YYYY-MM-dd HH:mm:ss,SSS Z", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss.SSSSSS Z")); + assertEquals("YYYY-MM-dd HH:mm:ss,SSS Z", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss.SSSSSSSSS Z")); + assertEquals("YYYY-MM-dd HH:mm:ss,SSS", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss,SSS")); + assertEquals("YYYY-MM-dd HH:mm:ss,SSS", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss,SSSSSS")); + assertEquals("YYYY-MM-dd HH:mm:ss,SSS", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss,SSSSSSSSS")); + assertEquals("YYYY-MM-dd HH:mm:ss,SSS", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss.SSS")); + assertEquals("YYYY-MM-dd HH:mm:ss,SSS", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss.SSSSSS")); + assertEquals("YYYY-MM-dd HH:mm:ss,SSS", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss.SSSSSSSSS")); } public void testInterpretFractionalSeconds() { @@ -239,4 +243,112 @@ public void testInterpretFractionalSeconds() { assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764 Z")); assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764 Z")); } + + private void validateTimestampMatch(TimestampMatch expected, String text, long expectedEpochMs) { + + assertEquals(expected, TimestampFormatFinder.findFirstMatch(text)); + assertEquals(expected, TimestampFormatFinder.findFirstFullMatch(text)); + assertEquals(expected, TimestampFormatFinder.findFirstMatch(text, expected.candidateIndex)); + assertEquals(expected, TimestampFormatFinder.findFirstFullMatch(text, expected.candidateIndex)); + assertNull(TimestampFormatFinder.findFirstMatch(text, Integer.MAX_VALUE)); + assertNull(TimestampFormatFinder.findFirstFullMatch(text, Integer.MAX_VALUE)); + assertEquals(expected, TimestampFormatFinder.findFirstMatch(text, randomFrom(expected.jodaTimestampFormats))); + assertEquals(expected, TimestampFormatFinder.findFirstFullMatch(text, randomFrom(expected.jodaTimestampFormats))); + assertEquals(expected, TimestampFormatFinder.findFirstMatch(text, randomFrom(expected.javaTimestampFormats))); + assertEquals(expected, TimestampFormatFinder.findFirstFullMatch(text, randomFrom(expected.javaTimestampFormats))); + assertNull(TimestampFormatFinder.findFirstMatch(text, "wrong format")); + assertNull(TimestampFormatFinder.findFirstFullMatch(text, "wrong format")); + + validateJodaTimestampFormats(expected.jodaTimestampFormats, text, expectedEpochMs); + validateJavaTimestampFormats(expected.javaTimestampFormats, text, expectedEpochMs); + + assertTrue(expected.simplePattern.matcher(text).find()); + } + + private void validateJodaTimestampFormats(List jodaTimestampFormats, String text, long expectedEpochMs) { + + // All the test times are for Tue May 15 2018 16:14:56 UTC, which is 17:14:56 in London. + // This is the timezone that will be used for any text representations that don't include it. + org.joda.time.DateTimeZone defaultZone = org.joda.time.DateTimeZone.forID("Europe/London"); + org.joda.time.DateTime parsed; + for (int i = 0; i < jodaTimestampFormats.size(); ++i) { + try { + String timestampFormat = jodaTimestampFormats.get(i); + switch (timestampFormat) { + case "ISO8601": + parsed = org.joda.time.format.ISODateTimeFormat.dateTimeParser() + .withZone(defaultZone).withDefaultYear(2018).parseDateTime(text); + break; + default: + org.joda.time.format.DateTimeFormatter parser = + org.joda.time.format.DateTimeFormat.forPattern(timestampFormat).withZone(defaultZone).withLocale(Locale.ROOT); + parsed = parser.withDefaultYear(2018).parseDateTime(text); + break; + } + if (expectedEpochMs == parsed.getMillis()) { + break; + } + // If the last one isn't right then propagate + if (i == jodaTimestampFormats.size() - 1) { + assertEquals(expectedEpochMs, parsed.getMillis()); + } + } catch (RuntimeException e) { + // If the last one throws then propagate + if (i == jodaTimestampFormats.size() - 1) { + throw e; + } + } + } + } + + private void validateJavaTimestampFormats(List javaTimestampFormats, String text, long expectedEpochMs) { + + // All the test times are for Tue May 15 2018 16:14:56 UTC, which is 17:14:56 in London. + // This is the timezone that will be used for any text representations that don't include it. + java.time.ZoneId defaultZone = java.time.ZoneId.of("Europe/London"); + java.time.temporal.TemporalAccessor parsed; + for (int i = 0; i < javaTimestampFormats.size(); ++i) { + try { + String timestampFormat = javaTimestampFormats.get(i); + switch (timestampFormat) { + case "ISO8601": + parsed = DateFormatters.forPattern("strict_date_optional_time_nanos").withZone(defaultZone).parse(text); + break; + default: + java.time.format.DateTimeFormatter parser = new java.time.format.DateTimeFormatterBuilder() + .appendPattern(timestampFormat).parseDefaulting(java.time.temporal.ChronoField.YEAR_OF_ERA, 2018) + .toFormatter(Locale.ROOT); + // This next line parses the textual date without any default timezone, so if + // the text doesn't contain the timezone then the resulting temporal accessor + // will be incomplete (i.e. impossible to convert to an Instant). You would + // hope that it would be possible to specify a timezone to be used only in this + // case, and in Java 9 and 10 it is, by adding withZone(zone) before the + // parse(text) call. However, with Java 8 this overrides any timezone parsed + // from the text. The solution is to parse twice, once without a default + // timezone and then again with a default timezone if the first parse didn't + // find one in the text. + parsed = parser.parse(text); + if (parsed.query(java.time.temporal.TemporalQueries.zone()) == null) { + // TODO: when Java 8 is no longer supported remove the two + // lines and comment above and the closing brace below + parsed = parser.withZone(defaultZone).parse(text); + } + break; + } + long actualEpochMs = java.time.Instant.from(parsed).toEpochMilli(); + if (expectedEpochMs == actualEpochMs) { + break; + } + // If the last one isn't right then propagate + if (i == javaTimestampFormats.size() - 1) { + assertEquals(expectedEpochMs, actualEpochMs); + } + } catch (RuntimeException e) { + // If the last one throws then propagate + if (i == javaTimestampFormats.size() - 1) { + throw e; + } + } + } + } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java index 01c44147b0430..2429da0901832 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java @@ -38,6 +38,6 @@ public void testCreateConfigsGivenGoodXml() throws Exception { assertNull(structure.getShouldTrimFields()); assertNull(structure.getGrokPattern()); assertEquals("timestamp", structure.getTimestampField()); - assertEquals(Collections.singletonList("UNIX_MS"), structure.getTimestampFormats()); + assertEquals(Collections.singletonList("UNIX_MS"), structure.getJodaTimestampFormats()); } } diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/api/xpack.ml.find_file_structure.json b/x-pack/plugin/src/test/resources/rest-api-spec/api/xpack.ml.find_file_structure.json index 20a5c8e0c2a01..e630b34ddf764 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/api/xpack.ml.find_file_structure.json +++ b/x-pack/plugin/src/test/resources/rest-api-spec/api/xpack.ml.find_file_structure.json @@ -49,7 +49,7 @@ }, "timestamp_format": { "type": "string", - "description": "Optional parameter to specify the timestamp format in the file" + "description": "Optional parameter to specify the timestamp format in the file - may be either a Joda or Java time format" }, "explain": { "type": "boolean", diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml index fcacdb9cdc7fd..8f1d9a4dc9cac 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml @@ -27,7 +27,8 @@ - match: { has_byte_order_marker: false } - match: { format: json } - match: { timestamp_field: time } - - match: { timestamp_formats.0: UNIX } + - match: { joda_timestamp_formats.0: UNIX } + - match: { java_timestamp_formats.0: UNIX } - match: { need_client_timezone: false } - match: { mappings.airline.type: keyword } - match: { mappings.responsetime.type: double } @@ -83,7 +84,8 @@ - match: { has_byte_order_marker: false } - match: { format: json } - match: { timestamp_field: time } - - match: { timestamp_formats.0: UNIX } + - match: { joda_timestamp_formats.0: UNIX } + - match: { java_timestamp_formats.0: UNIX } - match: { need_client_timezone: false } - match: { mappings.airline.type: keyword } - match: { mappings.responsetime.type: double }