diff --git a/docs/reference/ml/apis/find-file-structure.asciidoc b/docs/reference/ml/apis/find-file-structure.asciidoc index 0da485a3f0a30..e8887fd1e3a4f 100644 --- a/docs/reference/ml/apis/find-file-structure.asciidoc +++ b/docs/reference/ml/apis/find-file-structure.asciidoc @@ -22,7 +22,7 @@ This API provides a starting point for ingesting data into {es} in a format that is suitable for subsequent use with other {ml} functionality. Unlike other {es} endpoints, the data that is posted to this endpoint does not -need to be UTF-8 encoded and in JSON format. It must, however, be text; binary +need to be UTF-8 encoded and in JSON format. It must, however, be text; binary file formats are not currently supported. The response from the API contains: @@ -122,6 +122,11 @@ to request analysis of 100000 lines to achieve some variety. is not specified and the delimiter is pipe (`|`), the default value is `true`. Otherwise, the default value is `false`. +`timeout`:: + (time) Sets the maximum amount of time that the structure analysis make take. + If the analysis is still running when the timeout expires then it will be + aborted. The default value is 25 seconds. + `timestamp_field`:: (string) The name of the field that contains the primary timestamp of each record in the file. In particular, if the file were ingested into an index, @@ -197,7 +202,7 @@ the formats it knows, which are these Joda formats and their Java time equivalen The text file that you want to analyze. It must contain data that is suitable to be ingested into {es}. It does not need to be in JSON format and it does not -need to be UTF-8 encoded. The size is limited to the {es} HTTP receive buffer +need to be UTF-8 encoded. The size is limited to the {es} HTTP receive buffer size, which defaults to 100 Mb. @@ -245,6 +250,7 @@ POST _xpack/ml/find_file_structure // TEST If the request does not encounter errors, you receive the following result: + [source,js] ---- { @@ -483,7 +489,7 @@ If the request does not encounter errors, you receive the following result: `keyword` type as it is not considered specific enough to convert to the `date` type. <9> `field_stats` contains the most common values of each field, plus basic - numeric statistics for the numeric `page_count` field. This information + numeric statistics for the numeric `page_count` field. This information may provide clues that the data needs to be cleaned or transformed prior to use by other {ml} functionality. @@ -502,11 +508,12 @@ curl -s "s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2018-06.csv" | head -- NOTE: The `Content-Type: application/json` header must be set even though in -this case the data is not JSON. (Alternatively the `Content-Type` can be set +this case the data is not JSON. (Alternatively the `Content-Type` can be set to any other supported by Elasticsearch, but it must be set.) -- If the request does not encounter errors, you receive the following result: + [source,js] ---- { @@ -1269,9 +1276,405 @@ If the request does not encounter errors, you receive the following result: <8> `joda_timestamp_formats` are used to tell Logstash and Ingest pipeline how to parse timestamps. <9> `java_timestamp_formats` are the Java time formats recognized in the time - fields. In future Ingest pipeline will switch to use this format. + fields. In future Ingest pipeline will switch to use this format. <10> The timestamp format in this sample doesn't specify a timezone, so to accurately convert them to UTC timestamps to store in Elasticsearch it's necessary to supply the timezone they relate to. `need_client_timezone` will be `false` for timestamp formats that include the timezone. +If you try to analyze a lot of data then the analysis will take a long time. +If you want to limit the amount of processing your {es} cluster performs for +a request, use the timeout query parameter. The analysis will be aborted and +an error returned when the timeout expires. For example, you can replace 20000 +lines in the previous example with 200000 and set a 1 second timeout on the +analysis: + +[source,js] +---- +curl -s "s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2018-06.csv" | head -200000 | curl -s -H "Content-Type: application/json" -XPOST "localhost:9200/_xpack/ml/find_file_structure?pretty&lines_to_sample=200000&timeout=1s" -T - +---- +// NOTCONSOLE +// Not converting to console because this shows how curl can be used + +Unless you are using an incredibly fast computer you'll receive a timeout error: + +[source,js] +---- +{ + "error" : { + "root_cause" : [ + { + "type" : "timeout_exception", + "reason" : "Aborting structure analysis during [delimited record parsing] as it has taken longer than the timeout of [1s]" + } + ], + "type" : "timeout_exception", + "reason" : "Aborting structure analysis during [delimited record parsing] as it has taken longer than the timeout of [1s]" + }, + "status" : 500 +} +---- +// NOTCONSOLE + +-- +NOTE: If you try the example above yourself you will note that the overall +running time of the `curl` commands is considerably longer than 1 second. This +is because it takes a while to download 200000 lines of CSV from the internet, +and the timeout is measured from the time this endpoint starts to process the +data. +-- + +This is an example of analyzing {es}'s own log file: + +[source,js] +---- +curl -s -H "Content-Type: application/json" -XPOST "localhost:9200/_xpack/ml/find_file_structure?pretty" -T "$ES_HOME/logs/elasticsearch.log" +---- +// NOTCONSOLE +// Not converting to console because this shows how curl can be used + +If the request does not encounter errors, the result will look something like +this: + +[source,js] +---- +{ + "num_lines_analyzed" : 53, + "num_messages_analyzed" : 53, + "sample_start" : "[2018-09-27T14:39:28,518][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], net usable_space [165.4gb], net total_space [464.7gb], types [hfs]\n[2018-09-27T14:39:28,521][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [494.9mb], compressed ordinary object pointers [true]\n", + "charset" : "UTF-8", + "has_byte_order_marker" : false, + "format" : "semi_structured_text", <1> + "multiline_start_pattern" : "^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", <2> + "grok_pattern" : "\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel}.*", <3> + "timestamp_field" : "timestamp", + "joda_timestamp_formats" : [ + "ISO8601" + ], + "java_timestamp_formats" : [ + "yyyy-MM-dd'T'HH:mm:ss,SSS" + ], + "need_client_timezone" : true, + "mappings" : { + "@timestamp" : { + "type" : "date" + }, + "loglevel" : { + "type" : "keyword" + }, + "message" : { + "type" : "text" + } + }, + "field_stats" : { + "loglevel" : { + "count" : 53, + "cardinality" : 3, + "top_hits" : [ + { + "value" : "INFO", + "count" : 51 + }, + { + "value" : "DEBUG", + "count" : 1 + }, + { + "value" : "WARN", + "count" : 1 + } + ] + }, + "timestamp" : { + "count" : 53, + "cardinality" : 28, + "top_hits" : [ + { + "value" : "2018-09-27T14:39:29,859", + "count" : 10 + }, + { + "value" : "2018-09-27T14:39:29,860", + "count" : 9 + }, + { + "value" : "2018-09-27T14:39:29,858", + "count" : 6 + }, + { + "value" : "2018-09-27T14:39:28,523", + "count" : 3 + }, + { + "value" : "2018-09-27T14:39:34,234", + "count" : 2 + }, + { + "value" : "2018-09-27T14:39:28,518", + "count" : 1 + }, + { + "value" : "2018-09-27T14:39:28,521", + "count" : 1 + }, + { + "value" : "2018-09-27T14:39:28,522", + "count" : 1 + }, + { + "value" : "2018-09-27T14:39:29,861", + "count" : 1 + }, + { + "value" : "2018-09-27T14:39:32,786", + "count" : 1 + } + ] + } + } +} +---- +// NOTCONSOLE + +<1> This time the `format` has been identified as `semi_structured_text`. +<2> The `multiline_start_pattern` is set on the basis that the timestamp appears + in the first line of each multi-line log message. +<3> A very simple `grok_pattern` has been created, which extracts the timestamp + and recognizable fields that appear in every analyzed message. In this case + the only field that was recognized beyond the timestamp was the log level. + +If you recognize more fields than the simple `grok_pattern` produced by the +structure finder unaided then you can resubmit the request specifying a more +advanced `grok_pattern` as a query parameter and the structure finder will +calculate `field_stats` for your additional fields. + +In the case of the {es} log a more complete Grok pattern is +`\[%{TIMESTAMP_ISO8601:timestamp}\]\[%{LOGLEVEL:loglevel} *\]\[%{JAVACLASS:class} *\] \[%{HOSTNAME:node}\] %{JAVALOGMESSAGE:message}`. +You can analyze the same log file again, submitting this `grok_pattern` as a +query parameter (appropriately URL escaped): + +[source,js] +---- +curl -s -H "Content-Type: application/json" -XPOST "localhost:9200/_xpack/ml/find_file_structure?pretty&format=semi_structured_text&grok_pattern=%5C%5B%25%7BTIMESTAMP_ISO8601:timestamp%7D%5C%5D%5C%5B%25%7BLOGLEVEL:loglevel%7D%20*%5C%5D%5C%5B%25%7BJAVACLASS:class%7D%20*%5C%5D%20%5C%5B%25%7BHOSTNAME:node%7D%5C%5D%20%25%7BJAVALOGMESSAGE:message%7D" -T "$ES_HOME/logs/elasticsearch.log" +---- +// NOTCONSOLE +// Not converting to console because this shows how curl can be used + +If the request does not encounter errors, the result will look something like +this: + +[source,js] +---- +{ + "num_lines_analyzed" : 53, + "num_messages_analyzed" : 53, + "sample_start" : "[2018-09-27T14:39:28,518][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], net usable_space [165.4gb], net total_space [464.7gb], types [hfs]\n[2018-09-27T14:39:28,521][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [494.9mb], compressed ordinary object pointers [true]\n", + "charset" : "UTF-8", + "has_byte_order_marker" : false, + "format" : "semi_structured_text", + "multiline_start_pattern" : "^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", + "grok_pattern" : "\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} *\\]\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}", <1> + "timestamp_field" : "timestamp", + "joda_timestamp_formats" : [ + "ISO8601" + ], + "java_timestamp_formats" : [ + "yyyy-MM-dd'T'HH:mm:ss,SSS" + ], + "need_client_timezone" : true, + "mappings" : { + "@timestamp" : { + "type" : "date" + }, + "class" : { + "type" : "keyword" + }, + "loglevel" : { + "type" : "keyword" + }, + "message" : { + "type" : "text" + }, + "node" : { + "type" : "keyword" + } + }, + "field_stats" : { <1> + "class" : { + "count" : 53, + "cardinality" : 14, + "top_hits" : [ + { + "value" : "o.e.p.PluginsService", + "count" : 26 + }, + { + "value" : "o.e.c.m.MetaDataIndexTemplateService", + "count" : 8 + }, + { + "value" : "o.e.n.Node", + "count" : 7 + }, + { + "value" : "o.e.e.NodeEnvironment", + "count" : 2 + }, + { + "value" : "o.e.a.ActionModule", + "count" : 1 + }, + { + "value" : "o.e.c.s.ClusterApplierService", + "count" : 1 + }, + { + "value" : "o.e.c.s.MasterService", + "count" : 1 + }, + { + "value" : "o.e.d.DiscoveryModule", + "count" : 1 + }, + { + "value" : "o.e.g.GatewayService", + "count" : 1 + }, + { + "value" : "o.e.l.LicenseService", + "count" : 1 + } + ] + }, + "loglevel" : { + "count" : 53, + "cardinality" : 3, + "top_hits" : [ + { + "value" : "INFO", + "count" : 51 + }, + { + "value" : "DEBUG", + "count" : 1 + }, + { + "value" : "WARN", + "count" : 1 + } + ] + }, + "message" : { + "count" : 53, + "cardinality" : 53, + "top_hits" : [ + { + "value" : "Using REST wrapper from plugin org.elasticsearch.xpack.security.Security", + "count" : 1 + }, + { + "value" : "adding template [.monitoring-alerts] for index patterns [.monitoring-alerts-6]", + "count" : 1 + }, + { + "value" : "adding template [.monitoring-beats] for index patterns [.monitoring-beats-6-*]", + "count" : 1 + }, + { + "value" : "adding template [.monitoring-es] for index patterns [.monitoring-es-6-*]", + "count" : 1 + }, + { + "value" : "adding template [.monitoring-kibana] for index patterns [.monitoring-kibana-6-*]", + "count" : 1 + }, + { + "value" : "adding template [.monitoring-logstash] for index patterns [.monitoring-logstash-6-*]", + "count" : 1 + }, + { + "value" : "adding template [.triggered_watches] for index patterns [.triggered_watches*]", + "count" : 1 + }, + { + "value" : "adding template [.watch-history-9] for index patterns [.watcher-history-9*]", + "count" : 1 + }, + { + "value" : "adding template [.watches] for index patterns [.watches*]", + "count" : 1 + }, + { + "value" : "starting ...", + "count" : 1 + } + ] + }, + "node" : { + "count" : 53, + "cardinality" : 1, + "top_hits" : [ + { + "value" : "node-0", + "count" : 53 + } + ] + }, + "timestamp" : { + "count" : 53, + "cardinality" : 28, + "top_hits" : [ + { + "value" : "2018-09-27T14:39:29,859", + "count" : 10 + }, + { + "value" : "2018-09-27T14:39:29,860", + "count" : 9 + }, + { + "value" : "2018-09-27T14:39:29,858", + "count" : 6 + }, + { + "value" : "2018-09-27T14:39:28,523", + "count" : 3 + }, + { + "value" : "2018-09-27T14:39:34,234", + "count" : 2 + }, + { + "value" : "2018-09-27T14:39:28,518", + "count" : 1 + }, + { + "value" : "2018-09-27T14:39:28,521", + "count" : 1 + }, + { + "value" : "2018-09-27T14:39:28,522", + "count" : 1 + }, + { + "value" : "2018-09-27T14:39:29,861", + "count" : 1 + }, + { + "value" : "2018-09-27T14:39:32,786", + "count" : 1 + } + ] + } + } +} +---- +// NOTCONSOLE + +<1> The `grok_pattern` in the output is now the overridden one supplied in the + query parameter. +<2> The returned `field_stats` include entries for the fields from the + overridden `grok_pattern`. + +The URL escaping is hard, so if you are working interactively it is best to use +the {ml} UI! diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java index d10fedfb58975..78fcc4939ca36 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java @@ -16,6 +16,7 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.xcontent.StatusToXContentObject; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.rest.RestStatus; @@ -112,6 +113,7 @@ public boolean equals(Object other) { public static class Request extends ActionRequest { public static final ParseField LINES_TO_SAMPLE = new ParseField("lines_to_sample"); + public static final ParseField TIMEOUT = new ParseField("timeout"); public static final ParseField CHARSET = FileStructure.CHARSET; public static final ParseField FORMAT = FileStructure.FORMAT; public static final ParseField COLUMN_NAMES = FileStructure.COLUMN_NAMES; @@ -128,6 +130,7 @@ public static class Request extends ActionRequest { "[%s] may only be specified if [" + FORMAT.getPreferredName() + "] is [%s]"; private Integer linesToSample; + private TimeValue timeout; private String charset; private FileStructure.Format format; private List columnNames; @@ -151,6 +154,14 @@ public void setLinesToSample(Integer linesToSample) { this.linesToSample = linesToSample; } + public TimeValue getTimeout() { + return timeout; + } + + public void setTimeout(TimeValue timeout) { + this.timeout = timeout; + } + public String getCharset() { return charset; } @@ -313,6 +324,7 @@ public ActionRequestValidationException validate() { public void readFrom(StreamInput in) throws IOException { super.readFrom(in); linesToSample = in.readOptionalVInt(); + timeout = in.readOptionalTimeValue(); charset = in.readOptionalString(); format = in.readBoolean() ? in.readEnum(FileStructure.Format.class) : null; columnNames = in.readBoolean() ? in.readList(StreamInput::readString) : null; @@ -330,6 +342,7 @@ public void readFrom(StreamInput in) throws IOException { public void writeTo(StreamOutput out) throws IOException { super.writeTo(out); out.writeOptionalVInt(linesToSample); + out.writeOptionalTimeValue(timeout); out.writeOptionalString(charset); if (format == null) { out.writeBoolean(false); @@ -365,7 +378,7 @@ public void writeTo(StreamOutput out) throws IOException { @Override public int hashCode() { - return Objects.hash(linesToSample, charset, format, columnNames, hasHeaderRow, delimiter, grokPattern, timestampFormat, + return Objects.hash(linesToSample, timeout, charset, format, columnNames, hasHeaderRow, delimiter, grokPattern, timestampFormat, timestampField, sample); } @@ -382,6 +395,7 @@ public boolean equals(Object other) { Request that = (Request) other; return Objects.equals(this.linesToSample, that.linesToSample) && + Objects.equals(this.timeout, that.timeout) && Objects.equals(this.charset, that.charset) && Objects.equals(this.format, that.format) && Objects.equals(this.columnNames, that.columnNames) && diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java index ec37a2b7481f6..0906af9a80d46 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java @@ -48,10 +48,10 @@ protected void doExecute(Task task, FindFileStructureAction.Request request, private FindFileStructureAction.Response buildFileStructureResponse(FindFileStructureAction.Request request) throws Exception { - FileStructureFinderManager structureFinderManager = new FileStructureFinderManager(); + FileStructureFinderManager structureFinderManager = new FileStructureFinderManager(threadPool.scheduler()); FileStructureFinder fileStructureFinder = structureFinderManager.findFileStructure(request.getLinesToSample(), - request.getSample().streamInput(), new FileStructureOverrides(request)); + request.getSample().streamInput(), new FileStructureOverrides(request), request.getTimeout()); return new FindFileStructureAction.Response(fileStructureFinder.getStructure()); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java index 870cb28570bb5..8cdbd030eb5dd 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java @@ -41,10 +41,11 @@ public class DelimitedFileStructureFinder implements FileStructureFinder { static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, CsvPreference csvPreference, - boolean trimFields, FileStructureOverrides overrides) + boolean trimFields, FileStructureOverrides overrides, + TimeoutChecker timeoutChecker) throws IOException { - Tuple>, List> parsed = readRows(sample, csvPreference); + Tuple>, List> parsed = readRows(sample, csvPreference, timeoutChecker); List> rows = parsed.v1(); List lineNumbers = parsed.v2(); @@ -106,7 +107,8 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides); + Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides, + timeoutChecker); if (timeField != null) { String timeLineRegex = null; StringBuilder builder = new StringBuilder("^"); @@ -148,7 +150,7 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List, SortedMap> mappingsAndFieldStats = - FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords); + FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords, timeoutChecker); SortedMap mappings = mappingsAndFieldStats.v1(); if (timeField != null) { @@ -183,7 +185,8 @@ public FileStructure getStructure() { return structure; } - static Tuple>, List> readRows(String sample, CsvPreference csvPreference) throws IOException { + static Tuple>, List> readRows(String sample, CsvPreference csvPreference, TimeoutChecker timeoutChecker) + throws IOException { int fieldsInFirstRow = -1; @@ -204,6 +207,7 @@ static Tuple>, List> readRows(String sample, CsvPrefe } } rows.add(row); + timeoutChecker.check("delimited record parsing"); lineNumbers.add(csvReader.getLineNumber()); } } catch (SuperCsvException e) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java index 62e5eff517e90..982a6ff703572 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java @@ -62,8 +62,8 @@ public boolean canCreateFromSample(List explanation, String sample) { @Override public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides) throws IOException { + FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException { return DelimitedFileStructureFinder.makeDelimitedFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, - csvPreference, trimFields, overrides); + csvPreference, trimFields, overrides, timeoutChecker); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java index bff4b2115b0fd..8790b8f526864 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java @@ -39,9 +39,10 @@ public interface FileStructureFinderFactory { * @param hasByteOrderMarker Did the sample have a byte order marker? null means "not relevant". * @param overrides Stores structure decisions that have been made by the end user, and should * take precedence over anything the {@link FileStructureFinder} may decide. + * @param timeoutChecker Will abort the operation if its timeout is exceeded. * @return A {@link FileStructureFinder} object suitable for determining the structure of the supplied sample. * @throws Exception if something goes wrong during creation. */ FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides) throws Exception; + FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws Exception; } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java index 7949998d16e01..a508735af07f3 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java @@ -7,7 +7,9 @@ import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; +import org.elasticsearch.ElasticsearchTimeoutException; import org.elasticsearch.common.collect.Tuple; +import org.elasticsearch.common.unit.TimeValue; import java.io.BufferedInputStream; import java.io.BufferedReader; @@ -23,15 +25,17 @@ import java.util.HashSet; import java.util.List; import java.util.Locale; +import java.util.Objects; import java.util.Optional; import java.util.Set; +import java.util.concurrent.ScheduledExecutorService; import java.util.stream.Collectors; /** * Runs the high-level steps needed to create ingest configs for the specified file. In order: * 1. Determine the most likely character set (UTF-8, UTF-16LE, ISO-8859-2, etc.) * 2. Load a sample of the file, consisting of the first 1000 lines of the file - * 3. Determine the most likely file structure - one of ND-JSON, XML, CSV, TSV or semi-structured text + * 3. Determine the most likely file structure - one of ND-JSON, XML, delimited or semi-structured text * 4. Create an appropriate structure object and delegate writing configs to it */ public final class FileStructureFinderManager { @@ -81,8 +85,18 @@ public final class FileStructureFinderManager { private static final int BUFFER_SIZE = 8192; + private final ScheduledExecutorService scheduler; + + /** + * Create the file structure manager. + * @param scheduler Used for checking timeouts. + */ + public FileStructureFinderManager(ScheduledExecutorService scheduler) { + this.scheduler = Objects.requireNonNull(scheduler); + } + public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile) throws Exception { - return findFileStructure(idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES); + return findFileStructure(idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, null); } /** @@ -95,42 +109,49 @@ public FileStructureFinder findFileStructure(Integer idealSampleLineCount, Input * @param overrides Aspects of the file structure that are known in advance. These take precedence over * values determined by structure analysis. An exception will be thrown if the file structure * is incompatible with an overridden value. + * @param timeout The maximum time the analysis is permitted to take. If it takes longer than this an + * {@link ElasticsearchTimeoutException} may be thrown (although not necessarily immediately + * the timeout is exceeded). * @return A {@link FileStructureFinder} object from which the structure and messages can be queried. * @throws Exception A variety of problems could occur at various stages of the structure finding process. */ - public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile, FileStructureOverrides overrides) + public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile, FileStructureOverrides overrides, + TimeValue timeout) throws Exception { return findFileStructure(new ArrayList<>(), (idealSampleLineCount == null) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount, - fromFile, overrides); + fromFile, overrides, timeout); } public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, InputStream fromFile) throws Exception { - return findFileStructure(new ArrayList<>(), idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES); + return findFileStructure(explanation, idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, null); } public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, InputStream fromFile, - FileStructureOverrides overrides) throws Exception { - - String charsetName = overrides.getCharset(); - Reader sampleReader; - if (charsetName != null) { - // Creating the reader will throw if the specified character set does not exist - sampleReader = new InputStreamReader(fromFile, charsetName); - explanation.add("Using specified character encoding [" + charsetName + "]"); - } else { - CharsetMatch charsetMatch = findCharset(explanation, fromFile); - charsetName = charsetMatch.getName(); - sampleReader = charsetMatch.getReader(); - } + FileStructureOverrides overrides, TimeValue timeout) throws Exception { + + try (TimeoutChecker timeoutChecker = new TimeoutChecker("structure analysis", timeout, scheduler)) { + + String charsetName = overrides.getCharset(); + Reader sampleReader; + if (charsetName != null) { + // Creating the reader will throw if the specified character set does not exist + sampleReader = new InputStreamReader(fromFile, charsetName); + explanation.add("Using specified character encoding [" + charsetName + "]"); + } else { + CharsetMatch charsetMatch = findCharset(explanation, fromFile, timeoutChecker); + charsetName = charsetMatch.getName(); + sampleReader = charsetMatch.getReader(); + } - Tuple sampleInfo = sampleFile(sampleReader, charsetName, MIN_SAMPLE_LINE_COUNT, - Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount)); + Tuple sampleInfo = sampleFile(sampleReader, charsetName, MIN_SAMPLE_LINE_COUNT, + Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount), timeoutChecker); - return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2(), overrides); + return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2(), overrides, timeoutChecker); + } } - CharsetMatch findCharset(List explanation, InputStream inputStream) throws Exception { + CharsetMatch findCharset(List explanation, InputStream inputStream, TimeoutChecker timeoutChecker) throws Exception { // We need an input stream that supports mark and reset, so wrap the argument // in a BufferedInputStream if it doesn't already support this feature @@ -141,6 +162,7 @@ CharsetMatch findCharset(List explanation, InputStream inputStream) thro // This is from ICU4J CharsetDetector charsetDetector = new CharsetDetector().setText(inputStream); CharsetMatch[] charsetMatches = charsetDetector.detectAll(); + timeoutChecker.check("character set detection"); // Determine some extra characteristics of the input to compensate for some deficiencies of ICU4J boolean pureAscii = true; @@ -164,6 +186,7 @@ CharsetMatch findCharset(List explanation, InputStream inputStream) thro remainingLength -= bytesRead; } while (containsZeroBytes == false && remainingLength > 0); inputStream.reset(); + timeoutChecker.check("character set detection"); if (pureAscii) { // If the input is pure ASCII then many single byte character sets will match. We want to favour @@ -220,7 +243,7 @@ CharsetMatch findCharset(List explanation, InputStream inputStream) thro } FileStructureFinder makeBestStructureFinder(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides) throws Exception { + FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws Exception { Character delimiter = overrides.getDelimiter(); Character quote = overrides.getQuote(); @@ -250,8 +273,9 @@ FileStructureFinder makeBestStructureFinder(List explanation, String sam } for (FileStructureFinderFactory factory : factories) { + timeoutChecker.check("high level format detection"); if (factory.canCreateFromSample(explanation, sample)) { - return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, overrides); + return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, overrides, timeoutChecker); } } @@ -259,7 +283,8 @@ FileStructureFinder makeBestStructureFinder(List explanation, String sam ((overrides.getFormat() == null) ? "any known formats" : "the specified format [" + overrides.getFormat() + "]")); } - private Tuple sampleFile(Reader reader, String charsetName, int minLines, int maxLines) throws IOException { + private Tuple sampleFile(Reader reader, String charsetName, int minLines, int maxLines, TimeoutChecker timeoutChecker) + throws IOException { int lineCount = 0; BufferedReader bufferedReader = new BufferedReader(reader); @@ -283,6 +308,7 @@ private Tuple sampleFile(Reader reader, String charsetName, int String line; while ((line = bufferedReader.readLine()) != null && ++lineCount <= maxLines) { sample.append(line).append('\n'); + timeoutChecker.check("sample line splitting"); } if (lineCount < minLines) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java index d19353e4b57ca..796587a9c58cb 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java @@ -54,11 +54,12 @@ private FileStructureUtils() { * @param overrides Aspects of the file structure that are known in advance. These take precedence over * values determined by structure analysis. An exception will be thrown if the file structure * is incompatible with an overridden value. + * @param timeoutChecker Will abort the operation if its timeout is exceeded. * @return A tuple of (field name, timestamp format) if one can be found, or null if * there is no consistent timestamp. */ static Tuple guessTimestampField(List explanation, List> sampleRecords, - FileStructureOverrides overrides) { + FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { if (sampleRecords.isEmpty()) { return null; } @@ -80,6 +81,8 @@ static Tuple guessTimestampField(List explanatio break; } + timeoutChecker.check("timestamp field determination"); + TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(fieldValue.toString(), overrides.getTimestampFormat()); if (match == null || match.candidateIndex != candidate.v2().candidateIndex) { if (overrides.getTimestampFormat() != null) { @@ -143,11 +146,14 @@ private static List> findCandidates(List e /** * Given the sampled records, guess appropriate Elasticsearch mappings. + * @param explanation List of reasons for making decisions. May contain items when passed and new reasons + * can be appended by this method. * @param sampleRecords The sampled records. + * @param timeoutChecker Will abort the operation if its timeout is exceeded. * @return A map of field name to mapping settings. */ - static Tuple, SortedMap> - guessMappingsAndCalculateFieldStats(List explanation, List> sampleRecords) { + static Tuple, SortedMap> guessMappingsAndCalculateFieldStats( + List explanation, List> sampleRecords, TimeoutChecker timeoutChecker) { SortedMap mappings = new TreeMap<>(); SortedMap fieldStats = new TreeMap<>(); @@ -163,7 +169,7 @@ private static List> findCandidates(List e ).collect(Collectors.toList()); Tuple, FieldStats> mappingAndFieldStats = - guessMappingAndCalculateFieldStats(explanation, fieldName, fieldValues); + guessMappingAndCalculateFieldStats(explanation, fieldName, fieldValues, timeoutChecker); if (mappingAndFieldStats != null) { if (mappingAndFieldStats.v1() != null) { mappings.put(fieldName, mappingAndFieldStats.v1()); @@ -178,7 +184,8 @@ private static List> findCandidates(List e } static Tuple, FieldStats> guessMappingAndCalculateFieldStats(List explanation, - String fieldName, List fieldValues) { + String fieldName, List fieldValues, + TimeoutChecker timeoutChecker) { if (fieldValues == null || fieldValues.isEmpty()) { // We can get here if all the records that contained a given field had a null value for it. // In this case it's best not to make any statement about what the mapping type should be. @@ -196,11 +203,13 @@ static Tuple, FieldStats> guessMappingAndCalculateFieldStats if (fieldValues.stream().anyMatch(value -> value instanceof List || value instanceof Object[])) { // Elasticsearch fields can be either arrays or single values, but array values must all have the same type return guessMappingAndCalculateFieldStats(explanation, fieldName, - fieldValues.stream().flatMap(FileStructureUtils::flatten).collect(Collectors.toList())); + fieldValues.stream().flatMap(FileStructureUtils::flatten).collect(Collectors.toList()), timeoutChecker); } Collection fieldValuesAsStrings = fieldValues.stream().map(Object::toString).collect(Collectors.toList()); - return new Tuple<>(guessScalarMapping(explanation, fieldName, fieldValuesAsStrings), calculateFieldStats(fieldValuesAsStrings)); + Map mapping = guessScalarMapping(explanation, fieldName, fieldValuesAsStrings); + timeoutChecker.check("mapping determination"); + return new Tuple<>(mapping, calculateFieldStats(fieldValuesAsStrings, timeoutChecker)); } private static Stream flatten(Object value) { @@ -278,12 +287,14 @@ else if (fieldValues.stream().allMatch(IP_GROK::match)) { /** * Calculate stats for a set of field values. * @param fieldValues Values of the field for which field stats are to be calculated. + * @param timeoutChecker Will abort the operation if its timeout is exceeded. * @return The stats calculated from the field values. */ - static FieldStats calculateFieldStats(Collection fieldValues) { + static FieldStats calculateFieldStats(Collection fieldValues, TimeoutChecker timeoutChecker) { FieldStatsCalculator calculator = new FieldStatsCalculator(); calculator.accept(fieldValues); + timeoutChecker.check("field stats calculation"); return calculator.calculate(NUM_TOP_HITS); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java index 54be5079c9d2c..4c6549ad3934c 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java @@ -123,6 +123,7 @@ public final class GrokPatternCreator { private final Map fieldStats; private final Map fieldNameCountStore = new HashMap<>(); private final StringBuilder overallGrokPatternBuilder = new StringBuilder(); + private final TimeoutChecker timeoutChecker; /** * @@ -130,14 +131,16 @@ public final class GrokPatternCreator { * can be appended by the methods of this class. * @param sampleMessages Sample messages that any Grok pattern found must match. * @param mappings Will be updated with mappings appropriate for the returned pattern, if non-null. + * @param timeoutChecker Will abort the operation if its timeout is exceeded. * @param fieldStats Will be updated with field stats for the fields in the returned pattern, if non-null. */ public GrokPatternCreator(List explanation, Collection sampleMessages, Map mappings, - Map fieldStats) { + Map fieldStats, TimeoutChecker timeoutChecker) { this.explanation = explanation; this.sampleMessages = Collections.unmodifiableCollection(sampleMessages); this.mappings = mappings; this.fieldStats = fieldStats; + this.timeoutChecker = timeoutChecker; } /** @@ -150,8 +153,8 @@ public Tuple findFullLineGrokPattern(String timestampField) { for (FullMatchGrokPatternCandidate candidate : FULL_MATCH_GROK_PATTERNS) { if (timestampField == null || timestampField.equals(candidate.getTimeField())) { - if (candidate.matchesAll(sampleMessages)) { - return candidate.processMatch(explanation, sampleMessages, mappings, fieldStats); + if (candidate.matchesAll(sampleMessages, timeoutChecker)) { + return candidate.processMatch(explanation, sampleMessages, mappings, fieldStats, timeoutChecker); } } } @@ -169,8 +172,8 @@ public Tuple findFullLineGrokPattern(String timestampField) { public void validateFullLineGrokPattern(String grokPattern, String timestampField) { FullMatchGrokPatternCandidate candidate = FullMatchGrokPatternCandidate.fromGrokPattern(grokPattern, timestampField); - if (candidate.matchesAll(sampleMessages)) { - candidate.processMatch(explanation, sampleMessages, mappings, fieldStats); + if (candidate.matchesAll(sampleMessages, timeoutChecker)) { + candidate.processMatch(explanation, sampleMessages, mappings, fieldStats, timeoutChecker); } else { throw new IllegalArgumentException("Supplied Grok pattern [" + grokPattern + "] does not match sample messages"); } @@ -213,7 +216,7 @@ private void processCandidateAndSplit(GrokPatternCandidate chosenPattern, boolea Collection prefaces = new ArrayList<>(); Collection epilogues = new ArrayList<>(); String patternBuilderContent = - chosenPattern.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, mappings, fieldStats); + chosenPattern.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, mappings, fieldStats, timeoutChecker); appendBestGrokMatchForStrings(false, prefaces, ignoreKeyValueCandidateLeft, ignoreValueOnlyCandidatesLeft); overallGrokPatternBuilder.append(patternBuilderContent); appendBestGrokMatchForStrings(isLast, epilogues, ignoreKeyValueCandidateRight, ignoreValueOnlyCandidatesRight); @@ -407,7 +410,8 @@ interface GrokPatternCandidate { * @return The string that needs to be incorporated into the overall Grok pattern for the line. */ String processCaptures(Map fieldNameCountStore, Collection snippets, Collection prefaces, - Collection epilogues, Map mappings, Map fieldStats); + Collection epilogues, Map mappings, Map fieldStats, + TimeoutChecker timeoutChecker); } /** @@ -464,7 +468,8 @@ public boolean matchesAll(Collection snippets) { */ @Override public String processCaptures(Map fieldNameCountStore, Collection snippets, Collection prefaces, - Collection epilogues, Map mappings, Map fieldStats) { + Collection epilogues, Map mappings, Map fieldStats, + TimeoutChecker timeoutChecker) { Collection values = new ArrayList<>(); for (String snippet : snippets) { Map captures = grok.captures(snippet); @@ -475,6 +480,7 @@ public String processCaptures(Map fieldNameCountStore, Collecti prefaces.add(captures.getOrDefault(PREFACE, "").toString()); values.add(captures.getOrDefault(VALUE, "").toString()); epilogues.add(captures.getOrDefault(EPILOGUE, "").toString()); + timeoutChecker.check("full message Grok pattern field extraction"); } String adjustedFieldName = buildFieldName(fieldNameCountStore, fieldName); if (mappings != null) { @@ -485,11 +491,12 @@ public String processCaptures(Map fieldNameCountStore, Collecti if (timestampMatch != null) { fullMappingType = timestampMatch.getEsDateMappingTypeWithFormat(); } + timeoutChecker.check("mapping determination"); } mappings.put(adjustedFieldName, fullMappingType); } if (fieldStats != null) { - fieldStats.put(adjustedFieldName, FileStructureUtils.calculateFieldStats(values)); + fieldStats.put(adjustedFieldName, FileStructureUtils.calculateFieldStats(values, timeoutChecker)); } return "%{" + grokPatternName + ":" + adjustedFieldName + "}"; } @@ -535,7 +542,8 @@ public boolean matchesAll(Collection snippets) { @Override public String processCaptures(Map fieldNameCountStore, Collection snippets, Collection prefaces, - Collection epilogues, Map mappings, Map fieldStats) { + Collection epilogues, Map mappings, Map fieldStats, + TimeoutChecker timeoutChecker) { if (fieldName == null) { throw new IllegalStateException("Cannot process KV matches until a field name has been determined"); } @@ -551,13 +559,15 @@ public String processCaptures(Map fieldNameCountStore, Collecti prefaces.add(captures.getOrDefault(PREFACE, "").toString()); values.add(captures.getOrDefault(VALUE, "").toString()); epilogues.add(captures.getOrDefault(EPILOGUE, "").toString()); + timeoutChecker.check("full message Grok pattern field extraction"); } String adjustedFieldName = buildFieldName(fieldNameCountStore, fieldName); if (mappings != null) { mappings.put(adjustedFieldName, FileStructureUtils.guessScalarMapping(explanation, adjustedFieldName, values)); + timeoutChecker.check("mapping determination"); } if (fieldStats != null) { - fieldStats.put(adjustedFieldName, FileStructureUtils.calculateFieldStats(values)); + fieldStats.put(adjustedFieldName, FileStructureUtils.calculateFieldStats(values, timeoutChecker)); } return "\\b" + fieldName + "=%{USER:" + adjustedFieldName + "}"; } @@ -574,8 +584,9 @@ static class NoMappingGrokPatternCandidate extends ValueOnlyGrokPatternCandidate @Override public String processCaptures(Map fieldNameCountStore, Collection snippets, Collection prefaces, - Collection epilogues, Map mappings, Map fieldStats) { - return super.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, null, fieldStats); + Collection epilogues, Map mappings, Map fieldStats, + TimeoutChecker timeoutChecker) { + return super.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, null, fieldStats, timeoutChecker); } } @@ -606,16 +617,22 @@ public String getTimeField() { return timeField; } - public boolean matchesAll(Collection sampleMessages) { - return sampleMessages.stream().allMatch(grok::match); + public boolean matchesAll(Collection sampleMessages, TimeoutChecker timeoutChecker) { + for (String sampleMessage : sampleMessages) { + if (grok.match(sampleMessage) == false) { + return false; + } + timeoutChecker.check("full message Grok pattern matching"); + } + return true; } /** * This must only be called if {@link #matchesAll} returns true. * @return A tuple of (time field name, Grok string). */ - public Tuple processMatch(List explanation, Collection sampleMessages, - Map mappings, Map fieldStats) { + public Tuple processMatch(List explanation, Collection sampleMessages, Map mappings, + Map fieldStats, TimeoutChecker timeoutChecker) { explanation.add("A full message Grok pattern [" + grokPattern.substring(2, grokPattern.length() - 1) + "] looks appropriate"); @@ -641,6 +658,7 @@ public Tuple processMatch(List explanation, Collection> valuesForField : valuesPerField.entrySet()) { @@ -650,10 +668,11 @@ public Tuple processMatch(List explanation, Collection explanation, String sample, String charsetName, - Boolean hasByteOrderMarker, FileStructureOverrides overrides) - throws IOException { + Boolean hasByteOrderMarker, FileStructureOverrides overrides, + TimeoutChecker timeoutChecker) throws IOException { List> sampleRecords = new ArrayList<>(); @@ -43,6 +43,7 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List explanat XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY, DeprecationHandler.THROW_UNSUPPORTED_OPERATION, sampleMessage); sampleRecords.add(parser.mapOrdered()); + timeoutChecker.check("JSON parsing"); } FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.JSON) @@ -52,7 +53,8 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List explanat .setNumLinesAnalyzed(sampleMessages.size()) .setNumMessagesAnalyzed(sampleRecords.size()); - Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides); + Tuple timeField = + FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides, timeoutChecker); if (timeField != null) { structureBuilder.setTimestampField(timeField.v1()) .setJodaTimestampFormats(timeField.v2().jodaTimestampFormats) @@ -61,7 +63,7 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List explanat } Tuple, SortedMap> mappingsAndFieldStats = - FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords); + FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords, timeoutChecker); SortedMap mappings = mappingsAndFieldStats.v1(); if (timeField != null) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderFactory.java index cfeaa222679c0..e49f597a83c3b 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderFactory.java @@ -68,8 +68,9 @@ DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader( @Override public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides) throws IOException { - return JsonFileStructureFinder.makeJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides); + FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException { + return JsonFileStructureFinder.makeJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides, + timeoutChecker); } private static class ContextPrintingStringReader extends StringReader { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java index b9386fab72f9b..2d3072dda39e5 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java @@ -28,10 +28,11 @@ public class TextLogFileStructureFinder implements FileStructureFinder { private final FileStructure structure; static TextLogFileStructureFinder makeTextLogFileStructureFinder(List explanation, String sample, String charsetName, - Boolean hasByteOrderMarker, FileStructureOverrides overrides) { + Boolean hasByteOrderMarker, FileStructureOverrides overrides, + TimeoutChecker timeoutChecker) { String[] sampleLines = sample.split("\n"); - Tuple> bestTimestamp = mostLikelyTimestamp(sampleLines, overrides); + Tuple> bestTimestamp = mostLikelyTimestamp(sampleLines, overrides, timeoutChecker); if (bestTimestamp == null) { // Is it appropriate to treat a file that is neither structured nor has // a regular pattern of timestamps as a log file? Probably not... @@ -68,6 +69,7 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex ++linesInMessage; } } + timeoutChecker.check("multi-line message determination"); if (sampleMessages.size() < 2) { preamble.append(sampleLine).append('\n'); } @@ -88,7 +90,7 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex SortedMap fieldStats = new TreeMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, fieldStats); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, fieldStats, timeoutChecker); // We can't parse directly into @timestamp using Grok, so parse to some other time field, which the date filter will then remove String interimTimestampField = overrides.getTimestampField(); String grokPattern = overrides.getGrokPattern(); @@ -98,7 +100,8 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex } grokPatternCreator.validateFullLineGrokPattern(grokPattern, interimTimestampField); } else { - Tuple timestampFieldAndFullMatchGrokPattern = grokPatternCreator.findFullLineGrokPattern(interimTimestampField); + Tuple timestampFieldAndFullMatchGrokPattern = + grokPatternCreator.findFullLineGrokPattern(interimTimestampField); if (timestampFieldAndFullMatchGrokPattern != null) { interimTimestampField = timestampFieldAndFullMatchGrokPattern.v1(); grokPattern = timestampFieldAndFullMatchGrokPattern.v2(); @@ -139,7 +142,8 @@ public FileStructure getStructure() { return structure; } - static Tuple> mostLikelyTimestamp(String[] sampleLines, FileStructureOverrides overrides) { + static Tuple> mostLikelyTimestamp(String[] sampleLines, FileStructureOverrides overrides, + TimeoutChecker timeoutChecker) { Map>> timestampMatches = new LinkedHashMap<>(); @@ -160,6 +164,7 @@ static Tuple> mostLikelyTimestamp(String[] sampleLin }); differenceBetweenTwoHighestWeights = findDifferenceBetweenTwoHighestWeights(timestampMatches.values()); } + timeoutChecker.check("timestamp format determination"); // The highest possible weight is 1, so if the difference between the two highest weights // is less than the number of lines remaining then the leader cannot possibly be overtaken if (differenceBetweenTwoHighestWeights > --remainingLines) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java index b92b705aaffdf..5931fea5f1abf 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java @@ -41,8 +41,8 @@ public boolean canCreateFromSample(List explanation, String sample) { @Override public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides) { + FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { return TextLogFileStructureFinder.makeTextLogFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, - overrides); + overrides, timeoutChecker); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimeoutChecker.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimeoutChecker.java new file mode 100644 index 0000000000000..30c018827292d --- /dev/null +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimeoutChecker.java @@ -0,0 +1,78 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ +package org.elasticsearch.xpack.ml.filestructurefinder; + +import org.elasticsearch.ElasticsearchTimeoutException; +import org.elasticsearch.common.unit.TimeValue; +import org.elasticsearch.common.util.concurrent.FutureUtils; +import org.elasticsearch.grok.Grok; + +import java.io.Closeable; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.TimeUnit; + +/** + * This class can be used to keep track of when a long running operation started and + * to check whether it has run for longer than permitted. + * + * An object should be constructed at the beginning of the operation and then the + * {@link #check} method called periodically during the processing of the operation. + * + * This class does not use the {@link Thread#interrupt} mechanism because some other + * methods already convert interruptions to other types of exceptions (for example + * {@link Grok#captures}) and this would lead to non-uniform exception types and + * misleading error messages in the event that the interrupt was handled by one of + * these methods. The code in the long running operation would still have to + * periodically call {@link Thread#interrupted}, so it is not much more of an + * inconvenience to have to periodically call this class's {@link #check} method. + */ +public class TimeoutChecker implements Closeable { + + private final String operation; + private final ScheduledFuture future; + private final TimeValue timeout; + private volatile boolean timeoutExceeded; + + /** + * The constructor should be called at the start of the operation whose duration + * is to be checked, as the timeout is measured relative to time of construction. + * @param operation A description of the operation whose duration is to be checked. + * @param timeout The timeout period. If null then there is no timeout. + * @param scheduler Used to schedule the timer. This may be null + * in the case where {@code timeout} is also null. + */ + public TimeoutChecker(String operation, TimeValue timeout, ScheduledExecutorService scheduler) { + this.operation = operation; + this.timeout = timeout; + this.future = (timeout != null) ? scheduler.schedule(this::setTimeoutExceeded, timeout.nanos(), TimeUnit.NANOSECONDS) : null; + } + + /** + * Stops the timer if running. + */ + @Override + public void close() { + FutureUtils.cancel(future); + } + + /** + * Check whether the operation has been running longer than the permitted time. + * @param where Which stage of the operation is currently in progress? + * @throws ElasticsearchTimeoutException If the operation is found to have taken longer than the permitted time. + */ + public void check(String where) { + + if (timeoutExceeded) { + throw new ElasticsearchTimeoutException("Aborting " + operation + " during [" + where + + "] as it has taken longer than the timeout of [" + timeout + "]"); + } + } + + private void setTimeoutExceeded() { + timeoutExceeded = true; + } +} diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java index 66e6bbae88666..1022d6d0ec0d7 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java @@ -38,7 +38,8 @@ public class XmlFileStructureFinder implements FileStructureFinder { private final FileStructure structure; static XmlFileStructureFinder makeXmlFileStructureFinder(List explanation, String sample, String charsetName, - Boolean hasByteOrderMarker, FileStructureOverrides overrides) + Boolean hasByteOrderMarker, FileStructureOverrides overrides, + TimeoutChecker timeoutChecker) throws IOException, ParserConfigurationException, SAXException { String messagePrefix; @@ -66,6 +67,7 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List explanatio sampleRecords.add(docToMap(docBuilder.parse(is))); sampleMessages.add(sampleDoc); linesConsumed += numNewlinesIn(sampleDoc); + timeoutChecker.check("XML parsing"); } catch (SAXException e) { // Tolerate an incomplete last record as long as we have one complete record if (sampleRecords.isEmpty() || i < sampleDocEnds.length - 1) { @@ -90,7 +92,8 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List explanatio .setNumMessagesAnalyzed(sampleRecords.size()) .setMultilineStartPattern("^\\s*<" + topLevelTag); - Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides); + Tuple timeField = + FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides, timeoutChecker); if (timeField != null) { structureBuilder.setTimestampField(timeField.v1()) .setJodaTimestampFormats(timeField.v2().jodaTimestampFormats) @@ -99,7 +102,7 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List explanatio } Tuple, SortedMap> mappingsAndFieldStats = - FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords); + FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords, timeoutChecker); if (mappingsAndFieldStats.v2() != null) { structureBuilder.setFieldStats(mappingsAndFieldStats.v2()); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java index 3079f53931db6..9f52e666a3399 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java @@ -122,8 +122,9 @@ public boolean canCreateFromSample(List explanation, String sample) { @Override public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - FileStructureOverrides overrides) + FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException, ParserConfigurationException, SAXException { - return XmlFileStructureFinder.makeXmlFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides); + return XmlFileStructureFinder.makeXmlFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides, + timeoutChecker); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java index 316a4b56e4a07..0528c30e059c2 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java @@ -8,6 +8,7 @@ import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.client.node.NodeClient; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.rest.BaseRestHandler; import org.elasticsearch.rest.RestController; import org.elasticsearch.rest.RestRequest; @@ -20,9 +21,12 @@ import java.io.IOException; import java.util.Collections; import java.util.Set; +import java.util.concurrent.TimeUnit; public class RestFindFileStructureAction extends BaseRestHandler { + private static final TimeValue DEFAULT_TIMEOUT = new TimeValue(25, TimeUnit.SECONDS); + public RestFindFileStructureAction(Settings settings, RestController controller) { super(settings); controller.registerHandler(RestRequest.Method.POST, MachineLearning.BASE_PATH + "find_file_structure", this); @@ -39,6 +43,8 @@ protected RestChannelConsumer prepareRequest(RestRequest restRequest, NodeClient FindFileStructureAction.Request request = new FindFileStructureAction.Request(); request.setLinesToSample(restRequest.paramAsInt(FindFileStructureAction.Request.LINES_TO_SAMPLE.getPreferredName(), FileStructureFinderManager.DEFAULT_IDEAL_SAMPLE_LINE_COUNT)); + request.setTimeout(TimeValue.parseTimeValue(restRequest.param(FindFileStructureAction.Request.TIMEOUT.getPreferredName()), + DEFAULT_TIMEOUT, FindFileStructureAction.Request.TIMEOUT.getPreferredName())); request.setCharset(restRequest.param(FindFileStructureAction.Request.CHARSET.getPreferredName())); request.setFormat(restRequest.param(FindFileStructureAction.Request.FORMAT.getPreferredName())); request.setColumnNames(restRequest.paramAsStringArray(FindFileStructureAction.Request.COLUMN_NAMES.getPreferredName(), null)); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java index 9f6699fe71bd7..10bdf0d16d8eb 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java @@ -30,7 +30,7 @@ public void testCreateConfigsGivenCompleteCsv() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES); + FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -64,7 +64,8 @@ public void testCreateConfigsGivenCompleteCsvAndColumnNamesOverride() throws Exc String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides, + NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -100,7 +101,8 @@ public void testCreateConfigsGivenCompleteCsvAndHasHeaderRowOverride() throws Ex String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides, + NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -133,7 +135,7 @@ public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES); + FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -168,7 +170,7 @@ public void testCreateConfigsGivenCsvWithTrailingNulls() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES); + FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -212,7 +214,8 @@ public void testCreateConfigsGivenCsvWithTrailingNullsAndOverriddenTimeField() t String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides, + NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -252,7 +255,7 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exce String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES); + FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -298,7 +301,8 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeaderAndColumnNames String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides, + NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -336,7 +340,7 @@ public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES); + FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -368,20 +372,21 @@ public void testFindHeaderFromSampleGivenHeaderInSample() throws IOException { "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n"; Tuple header = DelimitedFileStructureFinder.findHeaderFromSample(explanation, - DelimitedFileStructureFinder.readRows(withHeader, CsvPreference.EXCEL_PREFERENCE).v1(), FileStructureOverrides.EMPTY_OVERRIDES); + DelimitedFileStructureFinder.readRows(withHeader, CsvPreference.EXCEL_PREFERENCE, NOOP_TIMEOUT_CHECKER).v1(), + FileStructureOverrides.EMPTY_OVERRIDES); assertTrue(header.v1()); assertThat(header.v2(), arrayContaining("time", "airline", "responsetime", "sourcetype")); } public void testFindHeaderFromSampleGivenHeaderNotInSample() throws IOException { - String withoutHeader = "2014-06-23 00:00:00Z,AAL,132.2046,farequote\n" + + String noHeader = "2014-06-23 00:00:00Z,AAL,132.2046,farequote\n" + "2014-06-23 00:00:00Z,JZA,990.4628,farequote\n" + "2014-06-23 00:00:01Z,JBU,877.5927,farequote\n" + "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n"; Tuple header = DelimitedFileStructureFinder.findHeaderFromSample(explanation, - DelimitedFileStructureFinder.readRows(withoutHeader, CsvPreference.EXCEL_PREFERENCE).v1(), + DelimitedFileStructureFinder.readRows(noHeader, CsvPreference.EXCEL_PREFERENCE, NOOP_TIMEOUT_CHECKER).v1(), FileStructureOverrides.EMPTY_OVERRIDES); assertFalse(header.v1()); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java index 00929ff474cce..5e0aa02d6f0e0 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java @@ -6,26 +6,50 @@ package org.elasticsearch.xpack.ml.filestructurefinder; import com.ibm.icu.text.CharsetMatch; +import org.elasticsearch.ElasticsearchTimeoutException; +import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.junit.After; +import org.junit.Before; import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.Arrays; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledThreadPoolExecutor; +import java.util.concurrent.TimeUnit; import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides.EMPTY_OVERRIDES; +import static org.hamcrest.Matchers.endsWith; import static org.hamcrest.Matchers.startsWith; import static org.hamcrest.core.IsInstanceOf.instanceOf; public class FileStructureFinderManagerTests extends FileStructureTestCase { - private FileStructureFinderManager structureFinderManager = new FileStructureFinderManager(); + private ScheduledExecutorService scheduler; + private FileStructureFinderManager structureFinderManager; + + @Before + public void setup() { + scheduler = new ScheduledThreadPoolExecutor(1); + structureFinderManager = new FileStructureFinderManager(scheduler); + } + + @After + public void shutdownScheduler() { + scheduler.shutdown(); + } public void testFindCharsetGivenCharacterWidths() throws Exception { for (Charset charset : Arrays.asList(StandardCharsets.UTF_8, StandardCharsets.UTF_16LE, StandardCharsets.UTF_16BE)) { CharsetMatch charsetMatch = structureFinderManager.findCharset(explanation, - new ByteArrayInputStream(TEXT_SAMPLE.getBytes(charset))); + new ByteArrayInputStream(TEXT_SAMPLE.getBytes(charset)), NOOP_TIMEOUT_CHECKER); assertEquals(charset.name(), charsetMatch.getName()); } } @@ -41,7 +65,8 @@ public void testFindCharsetGivenBinary() throws Exception { } try { - CharsetMatch charsetMatch = structureFinderManager.findCharset(explanation, new ByteArrayInputStream(binaryBytes)); + CharsetMatch charsetMatch = structureFinderManager.findCharset(explanation, new ByteArrayInputStream(binaryBytes), + NOOP_TIMEOUT_CHECKER); assertThat(charsetMatch.getName(), startsWith("UTF-16")); } catch (IllegalArgumentException e) { assertEquals("Could not determine a usable character encoding for the input - could it be binary data?", e.getMessage()); @@ -50,7 +75,7 @@ public void testFindCharsetGivenBinary() throws Exception { public void testMakeBestStructureGivenJson() throws Exception { assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - EMPTY_OVERRIDES), instanceOf(JsonFileStructureFinder.class)); + EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(JsonFileStructureFinder.class)); } public void testMakeBestStructureGivenJsonAndDelimitedOverride() throws Exception { @@ -61,12 +86,12 @@ public void testMakeBestStructureGivenJsonAndDelimitedOverride() throws Exceptio .setFormat(FileStructure.Format.DELIMITED).setQuote('\'').build(); assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - overrides), instanceOf(DelimitedFileStructureFinder.class)); + overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class)); } public void testMakeBestStructureGivenXml() throws Exception { assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - EMPTY_OVERRIDES), instanceOf(XmlFileStructureFinder.class)); + EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(XmlFileStructureFinder.class)); } public void testMakeBestStructureGivenXmlAndTextOverride() throws Exception { @@ -74,12 +99,12 @@ public void testMakeBestStructureGivenXmlAndTextOverride() throws Exception { FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.SEMI_STRUCTURED_TEXT).build(); assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - overrides), instanceOf(TextLogFileStructureFinder.class)); + overrides, NOOP_TIMEOUT_CHECKER), instanceOf(TextLogFileStructureFinder.class)); } public void testMakeBestStructureGivenCsv() throws Exception { assertThat(structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - EMPTY_OVERRIDES), instanceOf(DelimitedFileStructureFinder.class)); + EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class)); } public void testMakeBestStructureGivenCsvAndJsonOverride() { @@ -88,14 +113,14 @@ public void testMakeBestStructureGivenCsvAndJsonOverride() { IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - overrides)); + overrides, NOOP_TIMEOUT_CHECKER)); assertEquals("Input did not match the specified format [json]", e.getMessage()); } public void testMakeBestStructureGivenText() throws Exception { assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - EMPTY_OVERRIDES), instanceOf(TextLogFileStructureFinder.class)); + EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(TextLogFileStructureFinder.class)); } public void testMakeBestStructureGivenTextAndDelimitedOverride() throws Exception { @@ -105,6 +130,48 @@ public void testMakeBestStructureGivenTextAndDelimitedOverride() throws Exceptio .setFormat(FileStructure.Format.DELIMITED).setDelimiter(':').build(); assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - overrides), instanceOf(DelimitedFileStructureFinder.class)); + overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class)); + } + + public void testFindFileStructureTimeout() throws IOException, InterruptedException { + + // The number of lines might need increasing in the future if computers get really fast, + // but currently we're not even close to finding the structure of this much data in 10ms + int linesOfJunk = 10000; + TimeValue timeout = new TimeValue(10, TimeUnit.MILLISECONDS); + + try (PipedOutputStream generator = new PipedOutputStream()) { + + Thread junkProducer = new Thread(() -> { + try { + // This is not just junk; this is comma separated junk + for (int count = 0; count < linesOfJunk; ++count) { + generator.write(randomAlphaOfLength(100).getBytes(StandardCharsets.UTF_8)); + generator.write(','); + generator.write(randomAlphaOfLength(100).getBytes(StandardCharsets.UTF_8)); + generator.write(','); + generator.write(randomAlphaOfLength(100).getBytes(StandardCharsets.UTF_8)); + generator.write('\n'); + } + } catch (IOException e) { + // Expected if timeout occurs and the input stream is closed before junk generation is complete + } + }); + junkProducer.start(); + + try (InputStream bigInput = new PipedInputStream(generator)) { + + ElasticsearchTimeoutException e = expectThrows(ElasticsearchTimeoutException.class, + () -> structureFinderManager.findFileStructure(explanation, linesOfJunk - 1, bigInput, EMPTY_OVERRIDES, timeout)); + + assertThat(e.getMessage(), startsWith("Aborting structure analysis during [")); + assertThat(e.getMessage(), endsWith("] as it has taken longer than the timeout of [" + timeout + "]")); + explanation.add(e.getMessage()); + } + + // This shouldn't take anything like 10 seconds, but VMs can stall so it's best to + // set the timeout fairly high to avoid the work that spurious failures cause + junkProducer.join(10000L); + } } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureTestCase.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureTestCase.java index 6246a7ad01e6a..1a0da875e67e2 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureTestCase.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureTestCase.java @@ -68,6 +68,9 @@ public abstract class FileStructureTestCase extends ESTestCase { "\n" + "\n"; + // This doesn't need closing because it has an infinite timeout + protected static final TimeoutChecker NOOP_TIMEOUT_CHECKER = new TimeoutChecker("unit test", null, null); + protected List explanation; @Before diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java index 698b53cf7c741..c0e175f27b2c8 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java @@ -35,8 +35,8 @@ public void testMoreLikelyGivenKeyword() { public void testGuessTimestampGivenSingleSampleSingleField() { Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); - Tuple match = - FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), EMPTY_OVERRIDES); + Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), + EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("field1", match.v1()); assertThat(match.v2().jodaTimestampFormats, contains("ISO8601")); @@ -48,8 +48,8 @@ public void testGuessTimestampGivenSingleSampleSingleFieldAndConsistentTimeField FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("field1").build(); Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); - Tuple match = - FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides); + Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), + overrides, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("field1", match.v1()); assertThat(match.v2().jodaTimestampFormats, contains("ISO8601")); @@ -62,7 +62,8 @@ public void testGuessTimestampGivenSingleSampleSingleFieldAndImpossibleTimeField Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides)); + () -> FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides, + NOOP_TIMEOUT_CHECKER)); assertEquals("Specified timestamp field [field2] is not present in record [{field1=2018-05-24T17:28:31,735}]", e.getMessage()); } @@ -72,8 +73,8 @@ public void testGuessTimestampGivenSingleSampleSingleFieldAndConsistentTimeForma FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("ISO8601").build(); Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); - Tuple match = - FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides); + Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), + overrides, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("field1", match.v1()); assertThat(match.v2().jodaTimestampFormats, contains("ISO8601")); @@ -86,7 +87,8 @@ public void testGuessTimestampGivenSingleSampleSingleFieldAndImpossibleTimeForma Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides)); + () -> FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides, + NOOP_TIMEOUT_CHECKER)); assertEquals("Specified timestamp format [EEE MMM dd HH:mm:ss YYYY] does not match for record [{field1=2018-05-24T17:28:31,735}]", e.getMessage()); @@ -95,8 +97,8 @@ public void testGuessTimestampGivenSingleSampleSingleFieldAndImpossibleTimeForma public void testGuessTimestampGivenSamplesWithSameSingleTimeField() { Map sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Map sample2 = Collections.singletonMap("field1", "2018-05-24T17:33:39,406"); - Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("field1", match.v1()); assertThat(match.v2().jodaTimestampFormats, contains("ISO8601")); @@ -106,16 +108,16 @@ public void testGuessTimestampGivenSamplesWithSameSingleTimeField() { public void testGuessTimestampGivenSamplesWithOneSingleTimeFieldDifferentFormat() { Map sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Map sample2 = Collections.singletonMap("field1", "2018-05-24 17:33:39,406"); - Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNull(match); } public void testGuessTimestampGivenSamplesWithDifferentSingleTimeField() { Map sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Map sample2 = Collections.singletonMap("another_field", "2018-05-24T17:33:39,406"); - Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNull(match); } @@ -124,8 +126,8 @@ public void testGuessTimestampGivenSingleSampleManyFieldsOneTimeFormat() { sample.put("foo", "not a time"); sample.put("time", "2018-05-24 17:28:31,735"); sample.put("bar", 42); - Tuple match = - FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), EMPTY_OVERRIDES); + Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), + EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().jodaTimestampFormats, contains("YYYY-MM-dd HH:mm:ss,SSS")); @@ -141,8 +143,8 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormat() { sample2.put("foo", "whatever"); sample2.put("time", "2018-05-29 11:53:02,837"); sample2.put("bar", 17); - Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().jodaTimestampFormats, contains("YYYY-MM-dd HH:mm:ss,SSS")); @@ -158,8 +160,8 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameTimeFieldDifferentTi sample2.put("foo", "whatever"); sample2.put("time", "May 29 2018 11:53:02"); sample2.put("bar", 17); - Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNull(match); } @@ -172,8 +174,8 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDist sample2.put("red_herring", "whatever"); sample2.put("time", "2018-05-29 11:53:02,837"); sample2.put("bar", 17); - Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().jodaTimestampFormats, contains("YYYY-MM-dd HH:mm:ss,SSS")); @@ -189,8 +191,8 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDist sample2.put("foo", "whatever"); sample2.put("time", "May 29 2018 11:53:02"); sample2.put("red_herring", "17"); - Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().jodaTimestampFormats, contains("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss")); @@ -206,8 +208,8 @@ public void testGuessTimestampGivenSamplesWithManyFieldsInconsistentTimeFields() sample2.put("foo", "whatever"); sample2.put("time2", "May 29 2018 11:53:02"); sample2.put("bar", 42); - Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNull(match); } @@ -222,8 +224,8 @@ public void testGuessTimestampGivenSamplesWithManyFieldsInconsistentAndConsisten sample2.put("time2", "May 10 2018 11:53:02"); sample2.put("time3", "Thu, May 10 2018 11:53:02"); sample2.put("bar", 42); - Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("time2", match.v1()); assertThat(match.v2().jodaTimestampFormats, contains("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss")); @@ -320,7 +322,8 @@ public void testGuessMappingsAndCalculateFieldStats() { sample2.put("nothing", null); Tuple, SortedMap> mappingsAndFieldStats = - FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, Arrays.asList(sample1, sample2), + NOOP_TIMEOUT_CHECKER); assertNotNull(mappingsAndFieldStats); Map mappings = mappingsAndFieldStats.v1(); @@ -343,8 +346,8 @@ public void testGuessMappingsAndCalculateFieldStats() { } private Map guessMapping(List explanation, String fieldName, List fieldValues) { - Tuple, FieldStats> mappingAndFieldStats = - FileStructureUtils.guessMappingAndCalculateFieldStats(explanation, fieldName, fieldValues); + Tuple, FieldStats> mappingAndFieldStats = FileStructureUtils.guessMappingAndCalculateFieldStats(explanation, + fieldName, fieldValues, NOOP_TIMEOUT_CHECKER); return (mappingAndFieldStats == null) ? null : mappingAndFieldStats.v1(); } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java index 271e071fc2717..dc48662fb35f7 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java @@ -43,7 +43,7 @@ public void testPopulatePrefacesAndEpiloguesGivenTimestamp() { Collection prefaces = new ArrayList<>(); Collection epilogues = new ArrayList<>(); - candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null, null); + candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null, null, NOOP_TIMEOUT_CHECKER); assertThat(prefaces, containsInAnyOrder("[", "[", "junk [", "[")); assertThat(epilogues, containsInAnyOrder("] DEBUG ", "] ERROR ", "] INFO ", "] DEBUG ")); @@ -60,7 +60,7 @@ public void testPopulatePrefacesAndEpiloguesGivenEmailAddress() { Collection prefaces = new ArrayList<>(); Collection epilogues = new ArrayList<>(); - candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null, null); + candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null, null, NOOP_TIMEOUT_CHECKER); assertThat(prefaces, containsInAnyOrder("before ", "abc ", "")); assertThat(epilogues, containsInAnyOrder(" after", " xyz", "")); @@ -73,7 +73,7 @@ public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() { "junk [2018-01-22T07:33:23] INFO ", "[2018-01-21T03:33:23] DEBUG "); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, NOOP_TIMEOUT_CHECKER); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); assertEquals(".*?\\[%{TIMESTAMP_ISO8601:extra_timestamp}\\] %{LOGLEVEL:loglevel} ", @@ -87,7 +87,7 @@ public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() { " (4)", " (-5) "); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, NOOP_TIMEOUT_CHECKER); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); assertEquals(".*?\\(%{INT:field}\\).*?", grokPatternCreator.getOverallGrokPatternBuilder().toString()); @@ -99,7 +99,7 @@ public void testAppendBestGrokMatchForStringsGivenNegativeNumbersWithoutBreak() "prior to-3", "-4"); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, NOOP_TIMEOUT_CHECKER); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); // It seems sensible that we don't detect these suffices as either base 10 or base 16 numbers @@ -113,7 +113,7 @@ public void testAppendBestGrokMatchForStringsGivenHexNumbers() { " -123", "1f is hex"); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, NOOP_TIMEOUT_CHECKER); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); assertEquals(".*?%{BASE16NUM:field}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString()); @@ -124,7 +124,7 @@ public void testAppendBestGrokMatchForStringsGivenHostnamesWithNumbers() { Collection snippets = Arrays.asList(" mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, + NOOP_TIMEOUT_CHECKER); assertEquals("%{SYSLOGTIMESTAMP:timestamp} .*? .*?\\[%{INT:field}\\]: %{LOGLEVEL:loglevel} \\(.*? .*? .*?\\) .*? " + "%{QUOTEDSTRING:field2}: %{IP:ipaddress}#%{INT:field3}", @@ -215,7 +216,8 @@ public void testCreateGrokPatternFromExamplesGivenCatalinaLogs() { "Invalid chunk ignored."); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, + NOOP_TIMEOUT_CHECKER); assertEquals("%{CATALINA_DATESTAMP:timestamp} .*? .*?\\n%{LOGLEVEL:loglevel}: .*", grokPatternCreator.createGrokPatternFromExamples("CATALINA_DATESTAMP", "timestamp")); @@ -237,7 +239,8 @@ public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogs() { "Info\tsshd\tsubsystem request for sftp"); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, + NOOP_TIMEOUT_CHECKER); assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", @@ -270,7 +273,8 @@ public void testFindFullLineGrokPatternGivenApacheCombinedLogs() { "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\""); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, + NOOP_TIMEOUT_CHECKER); assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), grokPatternCreator.findFullLineGrokPattern(randomBoolean() ? "timestamp" : null)); @@ -300,7 +304,7 @@ public void testAdjustForPunctuationGivenCommonPrefix() { ",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"" ); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, NOOP_TIMEOUT_CHECKER); Collection adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets); assertEquals("\",", grokPatternCreator.getOverallGrokPatternBuilder().toString()); @@ -317,7 +321,7 @@ public void testAdjustForPunctuationGivenNoCommonPrefix() { "was added by 'User1'(id:2) to servergroup 'GAME'(id:9)" ); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, NOOP_TIMEOUT_CHECKER); Collection adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets); assertEquals("", grokPatternCreator.getOverallGrokPatternBuilder().toString()); @@ -343,7 +347,8 @@ public void testValidateFullLineGrokPatternGivenValid() { "Info\tsshd\tsubsystem request for sftp"); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, + NOOP_TIMEOUT_CHECKER); grokPatternCreator.validateFullLineGrokPattern(grokPattern, timestampField); assertEquals(9, mappings.size()); @@ -371,7 +376,8 @@ public void testValidateFullLineGrokPatternGivenInvalid() { "Sep 8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53"); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, + NOOP_TIMEOUT_CHECKER); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> grokPatternCreator.validateFullLineGrokPattern(grokPattern, timestampField)); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java index ce401c182eabd..55074e8c38272 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java @@ -19,7 +19,7 @@ public void testCreateConfigsGivenGoodJson() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES); + FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java index b4c28eda1496f..a848f384e2e5f 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java @@ -107,7 +107,7 @@ public void testCreateConfigsGivenElasticsearchLog() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES); + FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -137,7 +137,8 @@ public void testCreateConfigsGivenElasticsearchLogAndTimestampFieldOverride() th String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides); + FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides, + NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -168,7 +169,8 @@ public void testCreateConfigsGivenElasticsearchLogAndGrokPatternOverride() throw String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides); + FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides, + NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); @@ -202,7 +204,7 @@ public void testCreateConfigsGivenElasticsearchLogAndImpossibleGrokPatternOverri String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides)); + () -> factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides, NOOP_TIMEOUT_CHECKER)); assertEquals("Supplied Grok pattern [\\[%{LOGLEVEL:loglevel} *\\]\\[%{HOSTNAME:node}\\]\\[%{TIMESTAMP_ISO8601:timestamp}\\] " + "\\[%{JAVACLASS:class} *\\] %{JAVALOGMESSAGE:message}] does not match sample messages", e.getMessage()); @@ -310,7 +312,8 @@ public void testMostLikelyTimestampGivenAllSame() { "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] no plugins loaded\n"; Tuple> mostLikelyMatch = - TextLogFileStructureFinder.mostLikelyTimestamp(sample.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES); + TextLogFileStructureFinder.mostLikelyTimestamp(sample.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER); assertNotNull(mostLikelyMatch); assertEquals(new TimestampMatch(9, "", "ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSS", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), mostLikelyMatch.v1()); @@ -319,7 +322,8 @@ public void testMostLikelyTimestampGivenAllSame() { public void testMostLikelyTimestampGivenExceptionTrace() { Tuple> mostLikelyMatch = - TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES); + TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER); assertNotNull(mostLikelyMatch); // Even though many lines have a timestamp near the end (in the Lucene version information), @@ -334,7 +338,7 @@ public void testMostLikelyTimestampGivenExceptionTraceAndTimestampFormatOverride FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("YYYY-MM-dd HH:mm:ss").build(); Tuple> mostLikelyMatch = - TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), overrides); + TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), overrides, NOOP_TIMEOUT_CHECKER); assertNotNull(mostLikelyMatch); // The override should force the seemingly inferior choice of timestamp @@ -347,7 +351,7 @@ public void testMostLikelyTimestampGivenExceptionTraceAndImpossibleTimestampForm FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("MMM dd HH:mm:ss").build(); Tuple> mostLikelyMatch = - TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), overrides); + TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), overrides, NOOP_TIMEOUT_CHECKER); assertNull(mostLikelyMatch); } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimeoutCheckerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimeoutCheckerTests.java new file mode 100644 index 0000000000000..125aab7e45ee7 --- /dev/null +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimeoutCheckerTests.java @@ -0,0 +1,60 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ +package org.elasticsearch.xpack.ml.filestructurefinder; + +import org.elasticsearch.ElasticsearchTimeoutException; +import org.elasticsearch.common.unit.TimeValue; +import org.junit.After; +import org.junit.Before; + +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledThreadPoolExecutor; + +public class TimeoutCheckerTests extends FileStructureTestCase { + + private ScheduledExecutorService scheduler; + + @Before + public void createScheduler() { + scheduler = new ScheduledThreadPoolExecutor(1); + } + + @After + public void shutdownScheduler() { + scheduler.shutdown(); + } + + public void testCheckNoTimeout() { + + NOOP_TIMEOUT_CHECKER.check("should never happen"); + } + + public void testCheckTimeoutNotExceeded() throws InterruptedException { + + TimeValue timeout = TimeValue.timeValueSeconds(10); + try (TimeoutChecker timeoutChecker = new TimeoutChecker("timeout not exceeded test", timeout, scheduler)) { + + for (int count = 0; count < 10; ++count) { + timeoutChecker.check("should not timeout"); + Thread.sleep(randomIntBetween(1, 10)); + } + } + } + + public void testCheckTimeoutExceeded() throws Exception { + + TimeValue timeout = TimeValue.timeValueMillis(10); + try (TimeoutChecker timeoutChecker = new TimeoutChecker("timeout exceeded test", timeout, scheduler)) { + + assertBusy(() -> { + ElasticsearchTimeoutException e = expectThrows(ElasticsearchTimeoutException.class, + () -> timeoutChecker.check("should timeout")); + assertEquals("Aborting timeout exceeded test during [should timeout] as it has taken longer than the timeout of [" + + timeout + "]", e.getMessage()); + }); + } + } +} diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java index 2429da0901832..b6f93a6e39b1d 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java @@ -19,7 +19,7 @@ public void testCreateConfigsGivenGoodXml() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); FileStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE, charset, hasByteOrderMarker, - FileStructureOverrides.EMPTY_OVERRIDES); + FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); FileStructure structure = structureFinder.getStructure(); diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/api/xpack.ml.find_file_structure.json b/x-pack/plugin/src/test/resources/rest-api-spec/api/xpack.ml.find_file_structure.json index e630b34ddf764..fd1cbb986a617 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/api/xpack.ml.find_file_structure.json +++ b/x-pack/plugin/src/test/resources/rest-api-spec/api/xpack.ml.find_file_structure.json @@ -8,7 +8,13 @@ "params": { "lines_to_sample": { "type": "int", - "description": "Optional parameter to specify how many lines of the file to include in the analysis" + "description": "How many lines of the file should be included in the analysis", + "default": 1000 + }, + "timeout": { + "type": "time", + "description": "Timeout after which the analysis will be aborted", + "default": "25s" }, "charset": { "type": "string", @@ -53,7 +59,8 @@ }, "explain": { "type": "boolean", - "description": "Optional parameter to include a commentary on how the structure was derived" + "description": "Whether to include a commentary on how the structure was derived", + "default": false } } }, diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml index 8f1d9a4dc9cac..6a0414fe9dd61 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml @@ -7,6 +7,7 @@ Content-Type: "application/json" xpack.ml.find_file_structure: lines_to_sample: 3 + timeout: 10s body: - airline: AAL responsetime: 132.2046