From 0ed19dc339eaddb3cc81eec12e32e6cf5f246d87 Mon Sep 17 00:00:00 2001 From: Jake Landis Date: Wed, 5 Sep 2018 09:43:44 -0700 Subject: [PATCH] ingest: Dissect processor and lib for 6x (#33422) * Introduce the dissect library (#32297) The dissect library will be used for the ingest node as an alternative to Grok to split a string based on a pattern. Dissect differs from Grok such that regular expressions are not used to split the string. Note - Regular expressions are used during construction of the objects, but not in the hot path. A dissect pattern takes the form of: '%{a} %{b},%{c}' which is composed of 3 keys (a,b,c) and two delimiters (space and comma). This dissect pattern will match a string of the form: 'foo bar,baz' and will result a key/value pairing of 'a=foo, b=bar, and c=baz'. See the comments in DissectParser for a full explanation. This commit does not include the ingest node processor that will consume it. However, the consumption should be a trivial mapping between the key/value pairing returned by the parser and the key/value pairing needed for the IngestDocument. * ingest: Introduce the dissect processor (#32884) The ingest node dissect processor is an alternative to Grok to split a string based on a pattern. Dissect differs from Grok such that regular expressions are not used to split the string. Dissect can be used to parse a source text field with a simpler pattern, and is often faster the Grok for basic string parsing. This processor uses the dissect library which does most of the work. * ingest: minor - update test to include dissect (#33211) This change also includes placing the bytes processor in the correct order (helps to avoid merge conflict when back patching processors) --- docs/reference/ingest/ingest-node.asciidoc | 431 +++++++++++++----- libs/dissect/build.gradle | 50 ++ libs/dissect/src/main/eclipse-build.gradle | 3 + .../dissect/DissectException.java | 57 +++ .../org/elasticsearch/dissect/DissectKey.java | 191 ++++++++ .../elasticsearch/dissect/DissectMatch.java | 198 ++++++++ .../elasticsearch/dissect/DissectParser.java | 310 +++++++++++++ libs/dissect/src/test/eclipse-build.gradle | 7 + .../dissect/DissectKeyTests.java | 178 ++++++++ .../dissect/DissectMatchTests.java | 93 ++++ .../dissect/DissectParserTests.java | 386 ++++++++++++++++ .../test/resources/specification/tests.json | 363 +++++++++++++++ modules/ingest-common/build.gradle | 1 + .../ingest/common/DissectProcessor.java | 76 +++ .../ingest/common/IngestCommonPlugin.java | 1 + .../common/DissectProcessorFactoryTests.java | 92 ++++ .../ingest/common/DissectProcessorTests.java | 114 +++++ .../rest-api-spec/test/ingest/10_basic.yml | 37 +- .../test/ingest/200_dissect_processor.yml | 90 ++++ 19 files changed, 2541 insertions(+), 137 deletions(-) create mode 100644 libs/dissect/build.gradle create mode 100644 libs/dissect/src/main/eclipse-build.gradle create mode 100644 libs/dissect/src/main/java/org/elasticsearch/dissect/DissectException.java create mode 100644 libs/dissect/src/main/java/org/elasticsearch/dissect/DissectKey.java create mode 100644 libs/dissect/src/main/java/org/elasticsearch/dissect/DissectMatch.java create mode 100644 libs/dissect/src/main/java/org/elasticsearch/dissect/DissectParser.java create mode 100644 libs/dissect/src/test/eclipse-build.gradle create mode 100644 libs/dissect/src/test/java/org/elasticsearch/dissect/DissectKeyTests.java create mode 100644 libs/dissect/src/test/java/org/elasticsearch/dissect/DissectMatchTests.java create mode 100644 
libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java create mode 100644 libs/dissect/src/test/resources/specification/tests.json create mode 100644 modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/DissectProcessor.java create mode 100644 modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorFactoryTests.java create mode 100644 modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorTests.java create mode 100644 modules/ingest-common/src/test/resources/rest-api-spec/test/ingest/200_dissect_processor.yml diff --git a/docs/reference/ingest/ingest-node.asciidoc b/docs/reference/ingest/ingest-node.asciidoc index 700f72850169d..bf5d291f959d9 100644 --- a/docs/reference/ingest/ingest-node.asciidoc +++ b/docs/reference/ingest/ingest-node.asciidoc @@ -1049,6 +1049,318 @@ understands this to mean `2016-04-01` as is explained in the <>, dissect also extracts structured fields out of a single text field +within a document. However unlike the <>, dissect does not use +https://en.wikipedia.org/wiki/Regular_expression[Regular Expressions]. This allows dissect's syntax to be simple and for +some cases faster than the <>. + +Dissect matches a single text field against a defined pattern. + +For example the following pattern: +[source,txt] +-------------------------------------------------- +%{clientip} %{ident} %{auth} [%{@timestamp}] \"%{verb} %{request} HTTP/%{httpversion}\" %{status} %{size} +-------------------------------------------------- +will match a log line of this format: +[source,txt] +-------------------------------------------------- +1.2.3.4 - - [30/Apr/1998:22:00:52 +0000] \"GET /english/venues/cities/images/montpellier/18.gif HTTP/1.0\" 200 3171 +-------------------------------------------------- +and result in a document with the following fields: +[source,js] +-------------------------------------------------- +"doc": { + "_index": "_index", + "_type": "_type", + "_id": "_id", + "_source": { + "request": "/english/venues/cities/images/montpellier/18.gif", + "auth": "-", + "ident": "-", + "verb": "GET", + "@timestamp": "30/Apr/1998:22:00:52 +0000", + "size": "3171", + "clientip": "1.2.3.4", + "httpversion": "1.0", + "status": "200" + } +} +-------------------------------------------------- +// NOTCONSOLE + +A dissect pattern is defined by the parts of the string that will be discarded. In the example above the first part +to be discarded is a single space. Dissect finds this space, then assigns the value of `clientip` is everything up +until that space. +Later dissect matches the `[` and then `]` and then assigns `@timestamp` to everything in-between `[` and `]`. +Paying special attention the parts of the string to discard will help build successful dissect patterns. + +Successful matches require all keys in a pattern to have a value. If any of the `%{keyname}` defined in the pattern do +not have a value, then an exception is thrown and may be handled by the <> directive. +An empty key `%{}` or a <> can be used to match values, but exclude the value from +the final document. All matched values are represented as string data types. The <> +may be used to convert to expected data type. + +Dissect also supports <> that can change dissect's default +behavior. For example you can instruct dissect to ignore certain fields, append fields, skip over padding, etc. +See <> for more information. 
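+
+Under the hood this processor delegates to the new `libs/dissect` library, whose public entry point is the `DissectParser`
+class introduced by this change. As a rough illustration of the key/value pairing described above, the following Java
+sketch drives that library directly (the class name and variable names are only for the example):
+
+[source,java]
+--------------------------------------------------
+import org.elasticsearch.dissect.DissectParser;
+
+import java.util.Map;
+
+public class DissectExample {
+    public static void main(String[] args) {
+        // four keys: the first three separated by spaces, the last wrapped in literal brackets
+        DissectParser parser = new DissectParser("%{clientip} %{ident} %{auth} [%{@timestamp}]", null);
+        // fields: clientip=1.2.3.4, ident=-, auth=-, @timestamp=30/Apr/1998:22:00:52 +0000
+        Map<String, String> fields = parser.parse("1.2.3.4 - - [30/Apr/1998:22:00:52 +0000]");
+        System.out.println(fields);
+    }
+}
+--------------------------------------------------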
+ +[[dissect-options]] +.Dissect Options +[options="header"] +|====== +| Name | Required | Default | Description +| `field` | yes | - | The field to dissect +| `pattern` | yes | - | The pattern to apply to the field +| `append_separator`| no | "" (empty string) | The character(s) that separate the appended fields. +| `ignore_missing` | no | false | If `true` and `field` does not exist or is `null`, the processor quietly exits without modifying the document +| ` +|====== + +[source,js] +-------------------------------------------------- +{ + "dissect": { + "field": "message", + "pattern" : "%{clientip} %{ident} %{auth} [%{@timestamp}] \"%{verb} %{request} HTTP/%{httpversion}\" %{status} %{size}" + } +} +-------------------------------------------------- +// NOTCONSOLE +[[dissect-key-modifiers]] +==== Dissect key modifiers +Key modifiers can change the default behavior for dissection. Key modifiers may be found on the left or right +of the `%{keyname}` always inside the `%{` and `}`. For example `%{+keyname ->}` has the append and right padding +modifiers. + +.Dissect Key Modifiers +[options="header"] +|====== +| Modifier | Name | Position | Example | Description | Details +| `->` | Skip right padding | (far) right | `%{keyname1->}` | Skips any repeated characters to the right | <> +| `+` | Append | left | `%{+keyname} %{+keyname}` | Appends two or more fields together | <> +| `+` with `/n` | Append with order | left and right | `%{+keyname/2} %{+keyname/1}` | Appends two or more fields together in the order specified | <> +| `?` | Named skip key | left | `%{?ignoreme}` | Skips the matched value in the output. Same behavior as `%{}`| <> +| `*` and `&` | Reference keys | left | `%{*r1} %{&r1}` | Sets the output key as value of `*` and output value of `&` | <> +| ` +|====== + +[[dissect-modifier-skip-right-padding]] +===== Right padding modifier (`->`) + +The algorithm that performs the dissection is very strict in that it requires all characters in the pattern to match +the source string. For example, the pattern `%{fookey} %{barkey}` (1 space), will match the string "foo{nbsp}bar" +(1 space), but will not match the string "foo{nbsp}{nbsp}bar" (2 spaces) since the pattern has only 1 space and the +source string has 2 spaces. + +The right padding modifier helps with this case. Adding the right padding modifier to the pattern `%{fookey->} %{barkey}`, +It will now will match "foo{nbsp}bar" (1 space) and "foo{nbsp}{nbsp}bar" (2 spaces) +and even "foo{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}bar" (10 spaces). + +Use the right padding modifier to allow for repetition of the characters after a `%{keyname->}`. + +The right padding modifier may be placed on any key with any other modifiers. It should always be the furthest right +modifier. For example: `%{+keyname/1->}` and `%{->}` + +Right padding modifier example +|====== +| *Pattern* | `%{ts->} %{level}` +| *Input* | 1998-08-10T17:15:42,466{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}WARN +| *Result* a| +* ts = 1998-08-10T17:15:42,466 +* level = WARN +|====== + +The right padding modifier may be used with an empty key to help skip unwanted data. For example, the same input string, but wrapped with brackets requires the use of an empty right padded key to achieve the same result. 
+ +Right padding modifier with empty key example +|====== +| *Pattern* | `[%{ts}]%{->}[%{level}]` +| *Input* | [1998-08-10T17:15:42,466]{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}[WARN] +| *Result* a| +* ts = 1998-08-10T17:15:42,466 +* level = WARN +|====== + +===== Append modifier (`+`) +[[dissect-modifier-append-key]] +Dissect supports appending two or more results together for the output. +Values are appended left to right. An append separator can be specified. +In this example the append_separator is defined as a space. + +Append modifier example +|====== +| *Pattern* | `%{+name} %{+name} %{+name} %{+name}` +| *Input* | john jacob jingleheimer schmidt +| *Result* a| +* name = john jacob jingleheimer schmidt +|====== + +===== Append with order modifier (`+` and `/n`) +[[dissect-modifier-append-key-with-order]] +Dissect supports appending two or more results together for the output. +Values are appended based on the order defined (`/n`). An append separator can be specified. +In this example the append_separator is defined as a comma. + +Append with order modifier example +|====== +| *Pattern* | `%{+name/2} %{+name/4} %{+name/3} %{+name/1}` +| *Input* | john jacob jingleheimer schmidt +| *Result* a| +* name = schmidt,john,jingleheimer,jacob +|====== + +===== Named skip key (`?`) +[[dissect-modifier-named-skip-key]] +Dissect supports ignoring matches in the final result. This can be done with an empty key `%{}`, but for readability +it may be desired to give that empty key a name. + +Named skip key modifier example +|====== +| *Pattern* | `%{clientip} %{?ident} %{?auth} [%{@timestamp}]` +| *Input* | 1.2.3.4 - - [30/Apr/1998:22:00:52 +0000] +| *Result* a| +* ip = 1.2.3.4 +* @timestamp = 30/Apr/1998:22:00:52 +0000 +|====== + +===== Reference keys (`*` and `&`) +[[dissect-modifier-reference-keys]] +Dissect support using parsed values as the key/value pairings for the structured content. Imagine a system that +partially logs in key/value pairs. Reference keys allow you to maintain that key/value relationship. + +Reference key modifier example +|====== +| *Pattern* | `[%{ts}] [%{level}] %{*p1}:%{&p1} %{*p2}:%{&p2}` +| *Input* | [2018-08-10T17:15:42,466] [ERR] ip:1.2.3.4 error:REFUSED +| *Result* a| +* ts = 1998-08-10T17:15:42,466 +* level = ERR +* ip = 1.2.3.4 +* error = REFUSED +|====== + +[[dot-expand-processor]] +=== Dot Expander Processor + +Expands a field with dots into an object field. This processor allows fields +with dots in the name to be accessible by other processors in the pipeline. +Otherwise these <> can't be accessed by any processor. + +[[dot-expender-options]] +.Dot Expand Options +[options="header"] +|====== +| Name | Required | Default | Description +| `field` | yes | - | The field to expand into an object field +| `path` | no | - | The field that contains the field to expand. Only required if the field to expand is part another object field, because the `field` option can only understand leaf fields. 
+|====== + +[source,js] +-------------------------------------------------- +{ + "dot_expander": { + "field": "foo.bar" + } +} +-------------------------------------------------- +// NOTCONSOLE + +For example the dot expand processor would turn this document: + +[source,js] +-------------------------------------------------- +{ + "foo.bar" : "value" +} +-------------------------------------------------- +// NOTCONSOLE + +into: + +[source,js] +-------------------------------------------------- +{ + "foo" : { + "bar" : "value" + } +} +-------------------------------------------------- +// NOTCONSOLE + +If there is already a `bar` field nested under `foo` then +this processor merges the `foo.bar` field into it. If the field is +a scalar value then it will turn that field into an array field. + +For example, the following document: + +[source,js] +-------------------------------------------------- +{ + "foo.bar" : "value2", + "foo" : { + "bar" : "value1" + } +} +-------------------------------------------------- +// NOTCONSOLE + +is transformed by the `dot_expander` processor into: + +[source,js] +-------------------------------------------------- +{ + "foo" : { + "bar" : ["value1", "value2"] + } +} +-------------------------------------------------- +// NOTCONSOLE + +If any field outside of the leaf field conflicts with a pre-existing field of the same name, +then that field needs to be renamed first. + +Consider the following document: + +[source,js] +-------------------------------------------------- +{ + "foo": "value1", + "foo.bar": "value2" +} +-------------------------------------------------- +// NOTCONSOLE + +Then the `foo` needs to be renamed first before the `dot_expander` +processor is applied. So in order for the `foo.bar` field to properly +be expanded into the `bar` field under the `foo` field the following +pipeline should be used: + +[source,js] +-------------------------------------------------- +{ + "processors" : [ + { + "rename" : { + "field" : "foo", + "target_field" : "foo.bar"" + } + }, + { + "dot_expander": { + "field": "foo.bar" + } + } + ] +} +-------------------------------------------------- +// NOTCONSOLE + +The reason for this is that Ingest doesn't know how to automatically cast +a scalar field to an object field. + [[fail-processor]] === Fail Processor Raises an exception. This is useful for when @@ -2058,125 +2370,6 @@ Converts a string to its uppercase equivalent. -------------------------------------------------- // NOTCONSOLE -[[dot-expand-processor]] -=== Dot Expander Processor - -Expands a field with dots into an object field. This processor allows fields -with dots in the name to be accessible by other processors in the pipeline. -Otherwise these <> can't be accessed by any processor. - -[[dot-expender-options]] -.Dot Expand Options -[options="header"] -|====== -| Name | Required | Default | Description -| `field` | yes | - | The field to expand into an object field -| `path` | no | - | The field that contains the field to expand. Only required if the field to expand is part another object field, because the `field` option can only understand leaf fields. 
-|====== - -[source,js] --------------------------------------------------- -{ - "dot_expander": { - "field": "foo.bar" - } -} --------------------------------------------------- -// NOTCONSOLE - -For example the dot expand processor would turn this document: - -[source,js] --------------------------------------------------- -{ - "foo.bar" : "value" -} --------------------------------------------------- -// NOTCONSOLE - -into: - -[source,js] --------------------------------------------------- -{ - "foo" : { - "bar" : "value" - } -} --------------------------------------------------- -// NOTCONSOLE - -If there is already a `bar` field nested under `foo` then -this processor merges the `foo.bar` field into it. If the field is -a scalar value then it will turn that field into an array field. - -For example, the following document: - -[source,js] --------------------------------------------------- -{ - "foo.bar" : "value2", - "foo" : { - "bar" : "value1" - } -} --------------------------------------------------- -// NOTCONSOLE - -is transformed by the `dot_expander` processor into: - -[source,js] --------------------------------------------------- -{ - "foo" : { - "bar" : ["value1", "value2"] - } -} --------------------------------------------------- -// NOTCONSOLE - -If any field outside of the leaf field conflicts with a pre-existing field of the same name, -then that field needs to be renamed first. - -Consider the following document: - -[source,js] --------------------------------------------------- -{ - "foo": "value1", - "foo.bar": "value2" -} --------------------------------------------------- -// NOTCONSOLE - -Then the `foo` needs to be renamed first before the `dot_expander` -processor is applied. So in order for the `foo.bar` field to properly -be expanded into the `bar` field under the `foo` field the following -pipeline should be used: - -[source,js] --------------------------------------------------- -{ - "processors" : [ - { - "rename" : { - "field" : "foo", - "target_field" : "foo.bar"" - } - }, - { - "dot_expander": { - "field": "foo.bar" - } - } - ] -} --------------------------------------------------- -// NOTCONSOLE - -The reason for this is that Ingest doesn't know how to automatically cast -a scalar field to an object field. - [[urldecode-processor]] === URL Decode Processor URL-decodes a string diff --git a/libs/dissect/build.gradle b/libs/dissect/build.gradle new file mode 100644 index 0000000000000..577f9bdb337a7 --- /dev/null +++ b/libs/dissect/build.gradle @@ -0,0 +1,50 @@ +import org.elasticsearch.gradle.precommit.PrecommitTasks + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +archivesBaseName = 'elasticsearch-dissect' + +dependencies { + if (isEclipse == false || project.path == ":libs:dissect-tests") { + testCompile("org.elasticsearch.test:framework:${version}") { + exclude group: 'org.elasticsearch', module: 'dissect' + } + } + testCompile "com.fasterxml.jackson.core:jackson-core:${versions.jackson}" + testCompile("com.fasterxml.jackson.core:jackson-annotations:${versions.jackson}") + testCompile("com.fasterxml.jackson.core:jackson-databind:${versions.jackson}") +} + +forbiddenApisMain { + replaceSignatureFiles 'jdk-signatures' +} + +if (isEclipse) { + // in eclipse the project is under a fake root, we need to change around the source sets + sourceSets { + if (project.path == ":libs:dissect") { + main.java.srcDirs = ['java'] + main.resources.srcDirs = ['resources'] + } else { + test.java.srcDirs = ['java'] + test.resources.srcDirs = ['resources'] + } + } +} diff --git a/libs/dissect/src/main/eclipse-build.gradle b/libs/dissect/src/main/eclipse-build.gradle new file mode 100644 index 0000000000000..c2b72bd21e1f1 --- /dev/null +++ b/libs/dissect/src/main/eclipse-build.gradle @@ -0,0 +1,3 @@ + +// this is just shell gradle file for eclipse to have separate projects for dissect src and tests +apply from: '../../build.gradle' diff --git a/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectException.java b/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectException.java new file mode 100644 index 0000000000000..a2f1ab336401b --- /dev/null +++ b/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectException.java @@ -0,0 +1,57 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.dissect; + +/** + * Parent class for all dissect related exceptions. Consumers may catch this exception or more specific child exceptions. 
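+ * <p>For example, a caller can guard both pattern construction and matching with a single catch block
+ * (a minimal sketch; {@code pattern} and {@code input} stand in for caller-supplied values):
+ * <pre>{@code
+ * try {
+ *     Map<String, String> results = new DissectParser(pattern, null).parse(input);
+ * } catch (DissectException e) {
+ *     // covers PatternParse, KeyParse and FindMatch failures
+ * }
+ * }</pre>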
+ */ +public abstract class DissectException extends RuntimeException { + DissectException(String message) { + super(message); + } + + /** + * Error while parsing a dissect pattern + */ + static class PatternParse extends DissectException { + PatternParse(String pattern, String reason) { + super("Unable to parse pattern: " + pattern + " Reason: " + reason); + } + } + + /** + * Error while parsing a dissect key + */ + static class KeyParse extends DissectException { + KeyParse(String key, String reason) { + super("Unable to parse key: " + key + " Reason: " + reason); + } + } + + /** + * Unable to find a match between pattern and source string + */ + static class FindMatch extends DissectException { + FindMatch(String pattern, String source) { + super("Unable to find match for dissect pattern: " + pattern + " against source: " + source); + + } + } +} diff --git a/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectKey.java b/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectKey.java new file mode 100644 index 0000000000000..67a6842182da7 --- /dev/null +++ b/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectKey.java @@ -0,0 +1,191 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.dissect; + +import java.util.EnumSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + *

+ * <p>A Key of a dissect pattern. This class models the name and modifiers and provides some validation.
+ * <p>For dissect pattern of {@code %{a} %{+a} %{b}} the dissect keys are:
+ * <ul>
+ * <li>{@code a}</li>
+ * <li>{@code +a}</li>
+ * <li>{@code b}</li>
+ * </ul>
+ * This class represents a single key.
+ *
A single key is composed of a name and it's modifiers. For the key {@code +a}, {@code a} is the name and {@code +} is the modifier. + * @see DissectParser + */ +public final class DissectKey { + private static final Pattern LEFT_MODIFIER_PATTERN = Pattern.compile("([+*&?])(.*?)(->)?$", Pattern.DOTALL); + private static final Pattern RIGHT_PADDING_PATTERN = Pattern.compile("^(.*?)(->)?$", Pattern.DOTALL); + private static final Pattern APPEND_WITH_ORDER_PATTERN = Pattern.compile("[+](.*?)(/)([0-9]+)(->)?$", Pattern.DOTALL); + private final Modifier modifier; + private boolean skip; + private boolean skipRightPadding; + private int appendPosition; + private String name; + + /** + * Constructor - parses the String key into it's name and modifier(s) + * + * @param key The key without the leading %{ or trailing }, for example {@code a->} + */ + DissectKey(String key) { + skip = key == null || key.isEmpty(); + modifier = Modifier.findModifier(key); + switch (modifier) { + case NONE: + Matcher matcher = RIGHT_PADDING_PATTERN.matcher(key); + while (matcher.find()) { + name = matcher.group(1); + skipRightPadding = matcher.group(2) != null; + } + skip = name.isEmpty(); + break; + case NAMED_SKIP: + matcher = LEFT_MODIFIER_PATTERN.matcher(key); + while (matcher.find()) { + name = matcher.group(2); + skipRightPadding = matcher.group(3) != null; + } + skip = true; + break; + case APPEND: + matcher = LEFT_MODIFIER_PATTERN.matcher(key); + while (matcher.find()) { + name = matcher.group(2); + skipRightPadding = matcher.group(3) != null; + } + break; + case FIELD_NAME: + matcher = LEFT_MODIFIER_PATTERN.matcher(key); + while (matcher.find()) { + name = matcher.group(2); + skipRightPadding = matcher.group(3) != null; + } + break; + case FIELD_VALUE: + matcher = LEFT_MODIFIER_PATTERN.matcher(key); + while (matcher.find()) { + name = matcher.group(2); + skipRightPadding = matcher.group(3) != null; + } + break; + case APPEND_WITH_ORDER: + matcher = APPEND_WITH_ORDER_PATTERN.matcher(key); + while (matcher.find()) { + name = matcher.group(1); + appendPosition = Short.valueOf(matcher.group(3)); + skipRightPadding = matcher.group(4) != null; + } + break; + } + + if (name == null || (name.isEmpty() && !skip)) { + throw new DissectException.KeyParse(key, "The key name could be determined"); + } + } + + /** + * Copy constructor to explicitly override the modifier. 
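+ * For example, {@link DissectParser} uses this constructor to promote a plain key to the {@code APPEND} modifier when
+ * other keys in the same pattern append to that key's name.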
+ * @param key The key to copy (except for the modifier) + * @param modifier the modifer to use for this copy + */ + DissectKey(DissectKey key, DissectKey.Modifier modifier){ + this.modifier = modifier; + this.skipRightPadding = key.skipRightPadding; + this.skip = key.skip; + this.name = key.name; + this.appendPosition = key.appendPosition; + } + + Modifier getModifier() { + return modifier; + } + + boolean skip() { + return skip; + } + + boolean skipRightPadding() { + return skipRightPadding; + } + + int getAppendPosition() { + return appendPosition; + } + + String getName() { + return name; + } + + //generated + @Override + public String toString() { + return "DissectKey{" + + "modifier=" + modifier + + ", skip=" + skip + + ", appendPosition=" + appendPosition + + ", name='" + name + '\'' + + '}'; + } + + public enum Modifier { + NONE(""), APPEND_WITH_ORDER("/"), APPEND("+"), FIELD_NAME("*"), FIELD_VALUE("&"), NAMED_SKIP("?"); + + private static final Pattern MODIFIER_PATTERN = Pattern.compile("[/+*&?]"); + + private final String modifier; + + @Override + public String toString() { + return modifier; + } + + Modifier(final String modifier) { + this.modifier = modifier; + } + + //package private for testing + static Modifier fromString(String modifier) { + return EnumSet.allOf(Modifier.class).stream().filter(km -> km.modifier.equals(modifier)) + .findFirst().orElseThrow(() -> new IllegalArgumentException("Found invalid modifier.")); //throw should never happen + } + + private static Modifier findModifier(String key) { + Modifier modifier = Modifier.NONE; + if (key != null && !key.isEmpty()) { + Matcher matcher = MODIFIER_PATTERN.matcher(key); + int matches = 0; + while (matcher.find()) { + Modifier priorModifier = modifier; + modifier = Modifier.fromString(matcher.group()); + if (++matches > 1 && !(APPEND.equals(priorModifier) && APPEND_WITH_ORDER.equals(modifier))) { + throw new DissectException.KeyParse(key, "multiple modifiers are not allowed."); + } + } + } + return modifier; + } + } +} diff --git a/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectMatch.java b/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectMatch.java new file mode 100644 index 0000000000000..9217413e07557 --- /dev/null +++ b/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectMatch.java @@ -0,0 +1,198 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.dissect; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * Represents the matches of a {@link DissectParser#parse(String)}. Handles the appending and referencing based on the key instruction. 
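+ * <p>A rough sketch of how {@link DissectParser} drives this class: the parser creates one {@code DissectMatch} per call
+ * to {@code parse}, calls {@link #add(DissectKey, String)} once per matched key, then reads the merged key/value map via
+ * {@link #getResults()} and checks it with {@link #isValid(Map)}.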
+ */ +final class DissectMatch { + + private final String appendSeparator; + private final Map results; + private final Map simpleResults; + private final Map referenceResults; + private final Map appendResults; + private int implicitAppendOrder = -1000; + private final int maxMatches; + private final int maxResults; + private final int appendCount; + private final int referenceCount; + private final int simpleCount; + private int matches = 0; + + DissectMatch(String appendSeparator, int maxMatches, int maxResults, int appendCount, int referenceCount) { + if (maxMatches <= 0 || maxResults <= 0) { + throw new IllegalArgumentException("Expected results are zero, can not construct DissectMatch");//should never happen + } + this.maxMatches = maxMatches; + this.maxResults = maxResults; + this.appendCount = appendCount; + this.referenceCount = referenceCount; + this.appendSeparator = appendSeparator; + results = new HashMap<>(maxResults); + this.simpleCount = maxMatches - referenceCount - appendCount; + simpleResults = simpleCount <= 0 ? null : new HashMap<>(simpleCount); + referenceResults = referenceCount <= 0 ? null : new HashMap<>(referenceCount); + appendResults = appendCount <= 0 ? null : new HashMap<>(appendCount); + } + + /** + * Add the key/value that was found as result of the parsing + * @param key the {@link DissectKey} + * @param value the discovered value for the key + */ + void add(DissectKey key, String value) { + matches++; + if (key.skip()) { + return; + } + switch (key.getModifier()) { + case NONE: + simpleResults.put(key.getName(), value); + break; + case APPEND: + appendResults.computeIfAbsent(key.getName(), k -> new AppendResult(appendSeparator)).addValue(value, implicitAppendOrder++); + break; + case APPEND_WITH_ORDER: + appendResults.computeIfAbsent(key.getName(), + k -> new AppendResult(appendSeparator)).addValue(value, key.getAppendPosition()); + break; + case FIELD_NAME: + referenceResults.computeIfAbsent(key.getName(), k -> new ReferenceResult()).setKey(value); + break; + case FIELD_VALUE: + referenceResults.computeIfAbsent(key.getName(), k -> new ReferenceResult()).setValue(value); + break; + } + } + + boolean fullyMatched() { + return matches == maxMatches; + } + + /** + * Checks if results are valid. + * @param results the results to check + * @return true if all dissect keys have been matched and the results are of the expected size. + */ + boolean isValid(Map results) { + return fullyMatched() && results.size() == maxResults; + } + + /** + * Gets all the current matches. Pass the results of this to isValid to determine if a fully successful match has occured. + * + * @return the map of the results. + */ + Map getResults() { + results.clear(); + if (simpleCount > 0) { + results.putAll(simpleResults); + } + if (referenceCount > 0) { + referenceResults.forEach((k, v) -> results.put(v.getKey(), v.getValue())); + } + if (appendCount > 0) { + appendResults.forEach((k, v) -> results.put(k, v.getAppendResult())); + } + + return results; + } + + /** + * a result that will need to be part of an append operation. 
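+ * Values are collected together with an explicit or implicit order, then sorted by that order and joined with the
+ * configured append separator when the result is read back.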
+ */ + private final class AppendResult { + private final List values = new ArrayList<>(); + private final String appendSeparator; + + private AppendResult(String appendSeparator) { + this.appendSeparator = appendSeparator; + } + + private void addValue(String value, int order) { + values.add(new AppendValue(value, order)); + } + + private String getAppendResult() { + Collections.sort(values); + return values.stream().map(AppendValue::getValue).collect(Collectors.joining(appendSeparator)); + } + } + + /** + * An appendable value that can be sorted based on the provided order + */ + private final class AppendValue implements Comparable { + private final String value; + private final int order; + + private AppendValue(String value, int order) { + this.value = value; + this.order = order; + } + + private String getValue() { + return value; + } + + private int getOrder() { + return order; + } + + @Override + public int compareTo(AppendValue o) { + return Integer.compare(this.order, o.getOrder()); + } + } + + /** + * A result that needs to be converted to a key/value reference + */ + private final class ReferenceResult { + + private String key; + + private String getKey() { + return key; + } + + private String getValue() { + return value; + } + + private String value; + + private void setValue(String value) { + this.value = value; + } + + private void setKey(String key) { + this.key = key; + } + } +} diff --git a/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectParser.java b/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectParser.java new file mode 100644 index 0000000000000..407d73134b611 --- /dev/null +++ b/libs/dissect/src/main/java/org/elasticsearch/dissect/DissectParser.java @@ -0,0 +1,310 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.dissect; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.EnumSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.Function; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +/** + *

+ * <p>Splits (dissects) a string into its parts based on a pattern.
+ * <p>A dissect pattern is composed of a set of keys and delimiters.
+ * For example the dissect pattern: <pre>%{a} %{b},%{c}</pre> has 3 keys (a,b,c) and two delimiters (space and comma). This pattern will
+ * match a string of the form: <pre>foo bar,baz</pre> and will result in a key/value pairing of <pre>a=foo, b=bar, and c=baz.</pre>
+ * <p>Matches are all or nothing. For example, the same pattern will NOT match <pre>foo bar baz</pre> since all of the delimiters did not
+ * match (the comma did not match).
+ * <p>Dissect patterns can optionally have modifiers. These modifiers instruct the parser to change its behavior. For example the
+ * dissect pattern of <pre>%{a},%{b}:%{c}</pre> would not match <pre>foo,bar,baz</pre> since the colon never matches.
+ * <p>Modifiers appear to the left or the right of the key name. The supported modifiers are:
+ * <ul>
+ * <li>{@code ->} Instructs the parser to ignore repeating delimiters to the right of the key. Example: <pre>
+ * pattern: {@code %{a->} %{b} %{c}}
+ * string: {@code foo         bar baz}
+ * result: {@code a=foo, b=bar, c=baz}
+ * </pre></li>
+ * <li>{@code +} Instructs the parser to append this key's value to the value of the prior key with the same name.
+ * Example: <pre>
+ * pattern: {@code %{a} %{+a} %{+a}}
+ * string: {@code foo bar baz}
+ * result: {@code a=foobarbaz}
+ * </pre></li>
+ * <li>{@code /} Instructs the parser to append this key's value to the value of a key based on the order specified after the
+ * {@code /}. Requires the {@code +} modifier to also be present in the key. Example: <pre>
+ * pattern: {@code %{a} %{+a/2} %{+a/1}}
+ * string: {@code foo bar baz}
+ * result: {@code a=foobazbar}
+ * </pre></li>
+ * <li>{@code *} Instructs the parser to ignore the name of this key and instead use the matched value as the key name.
+ * Requires another key with the same name and the {@code &} modifier to supply the value. Example: <pre>
+ * pattern: {@code %{*a} %{b} %{&a}}
+ * string: {@code foo bar baz}
+ * result: {@code foo=baz, b=bar}
+ * </pre></li>
+ * <li>{@code &} Instructs the parser to ignore this key name and place the matched value under the key of the same name with the
+ * {@code *} modifier. Requires another key with the same name and the {@code *} modifier. Example: <pre>
+ * pattern: {@code %{*a} %{b} %{&a}}
+ * string: {@code foo bar baz}
+ * result: {@code foo=baz, b=bar}
+ * </pre></li>
+ * <li>{@code ?} Instructs the parser to ignore this key. The key name exists only for the purpose of human readability. Example: <pre>
+ * pattern: {@code %{a} %{?skipme} %{c}}
+ * string: {@code foo bar baz}
+ * result: {@code a=foo, c=baz}
+ * </pre></li>
+ * </ul>
+ * <p>Patterns with empty key names are also supported. They behave just like the {@code ?} modifier, except the name is not required.
+ * The result will simply be ignored. Example: <pre>
+ * pattern: {@code %{a} %{} %{c}}
+ * string: {@code foo bar baz}
+ * result: {@code a=foo, c=baz}
+ * </pre>
+ *
+ * Inspired by the Logstash Dissect Filter by Guy Boertje + */ +public final class DissectParser { + private static final Pattern LEADING_DELIMITER_PATTERN = Pattern.compile("^(.*?)%"); + private static final Pattern KEY_DELIMITER_FIELD_PATTERN = Pattern.compile("%\\{([^}]*?)}([^%]*)", Pattern.DOTALL); + private static final EnumSet ASSOCIATE_MODIFIERS = EnumSet.of( + DissectKey.Modifier.FIELD_NAME, + DissectKey.Modifier.FIELD_VALUE); + private static final EnumSet APPEND_MODIFIERS = EnumSet.of( + DissectKey.Modifier.APPEND, + DissectKey.Modifier.APPEND_WITH_ORDER); + private static final Function KEY_NAME = val -> val.getKey().getName(); + private final List matchPairs; + private final String pattern; + private String leadingDelimiter = ""; + private final int maxMatches; + private final int maxResults; + private final int appendCount; + private final int referenceCount; + private final String appendSeparator; + + public DissectParser(String pattern, String appendSeparator) { + this.pattern = pattern; + this.appendSeparator = appendSeparator == null ? "" : appendSeparator; + Matcher matcher = LEADING_DELIMITER_PATTERN.matcher(pattern); + while (matcher.find()) { + leadingDelimiter = matcher.group(1); + } + List matchPairs = new ArrayList<>(); + matcher = KEY_DELIMITER_FIELD_PATTERN.matcher(pattern.substring(leadingDelimiter.length())); + while (matcher.find()) { + DissectKey key = new DissectKey(matcher.group(1)); + String delimiter = matcher.group(2); + matchPairs.add(new DissectPair(key, delimiter)); + } + this.maxMatches = matchPairs.size(); + this.maxResults = Long.valueOf(matchPairs.stream() + .filter(dissectPair -> !dissectPair.getKey().skip()).map(KEY_NAME).distinct().count()).intValue(); + if (this.maxMatches == 0 || maxResults == 0) { + throw new DissectException.PatternParse(pattern, "Unable to find any keys or delimiters."); + } + //append validation - look through all of the keys to see if there are any keys that need to participate in an append operation + // but don't have the '+' defined + Set appendKeyNames = matchPairs.stream() + .filter(dissectPair -> APPEND_MODIFIERS.contains(dissectPair.getKey().getModifier())) + .map(KEY_NAME).distinct().collect(Collectors.toSet()); + if (appendKeyNames.size() > 0) { + List modifiedMatchPairs = new ArrayList<>(matchPairs.size()); + for (DissectPair p : matchPairs) { + if (p.getKey().getModifier().equals(DissectKey.Modifier.NONE) && appendKeyNames.contains(p.getKey().getName())) { + modifiedMatchPairs.add(new DissectPair(new DissectKey(p.getKey(), DissectKey.Modifier.APPEND), p.getDelimiter())); + } else { + modifiedMatchPairs.add(p); + } + } + matchPairs = modifiedMatchPairs; + } + appendCount = appendKeyNames.size(); + + //reference validation - ensure that '*' and '&' come in pairs + Map> referenceGroupings = matchPairs.stream() + .filter(dissectPair -> ASSOCIATE_MODIFIERS.contains(dissectPair.getKey().getModifier())) + .collect(Collectors.groupingBy(KEY_NAME)); + for (Map.Entry> entry : referenceGroupings.entrySet()) { + if (entry.getValue().size() != 2) { + throw new DissectException.PatternParse(pattern, "Found invalid key/reference associations: '" + + entry.getValue().stream().map(KEY_NAME).collect(Collectors.joining(",")) + + "' Please ensure each '*' is matched with a matching '&"); + } + } + + referenceCount = referenceGroupings.size() * 2; + this.matchPairs = Collections.unmodifiableList(matchPairs); + } + + + /** + *

Entry point to dissect a string into its parts.
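+ * <p>A minimal usage sketch, using the pattern and input from the class documentation above:
+ * <pre>{@code
+ * DissectParser parser = new DissectParser("%{a} %{b},%{c}", null);
+ * Map<String, String> results = parser.parse("foo bar,baz");
+ * // results: a=foo, b=bar, c=baz
+ * }</pre>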

+ * + * @param inputString The string to dissect + * @return the key/value Map of the results + * @throws DissectException if unable to dissect a pair into it's parts. + */ + public Map parse(String inputString) { + /** + * + * This implements a naive string matching algorithm. The string is walked left to right, comparing each byte against + * another string's bytes looking for matches. If the bytes match, then a second cursor looks ahead to see if all the bytes + * of the other string matches. If they all match, record it and advances the primary cursor to the match point. If it can not match + * all of the bytes then progress the main cursor. Repeat till the end of the input string. Since the string being searching for + * (the delimiter) is generally small and rare the naive approach is efficient. + * + * In this case the the string that is walked is the input string, and the string being searched for is the current delimiter. + * For example for a dissect pattern of {@code %{a},%{b}:%{c}} the delimiters (comma then colon) are searched for in the + * input string. At class construction the list of keys+delimiters are found (dissectPairs), which allows the use of that ordered + * list to know which delimiter to use for the search. The delimiters is progressed once the current delimiter is matched. + * + * There are two special cases that requires additional parsing beyond the standard naive algorithm. Consecutive delimiters should + * results in a empty matches unless the {@code ->} is provided. For example given the dissect pattern of + * {@code %{a},%{b},%{c},%{d}} and input string of {@code foo,,,} the match should be successful with empty values for b,c and d. + * However, if the key modifier {@code ->}, is present it will simply skip over any delimiters just to the right of the key + * without assigning any values. For example {@code %{a->},{%b}} will match the input string of {@code foo,,,,,,bar} with a=foo and + * b=bar. 
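+ *
+ * As a concrete walk-through of the simple case, dissecting {@code foo bar,baz} with the pattern {@code %{a} %{b},%{c}}:
+ * the cursor scans for the first delimiter (a space), finds it after {@code foo} and records a=foo; it then scans for
+ * the comma, finds it after {@code bar} and records b=bar; the final key has no trailing delimiter, so the remainder
+ * {@code baz} is assigned to c.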
+ * + */ + DissectMatch dissectMatch = new DissectMatch(appendSeparator, maxMatches, maxResults, appendCount, referenceCount); + Iterator it = matchPairs.iterator(); + //ensure leading delimiter matches + if (inputString != null && inputString.length() > leadingDelimiter.length() + && leadingDelimiter.equals(inputString.substring(0, leadingDelimiter.length()))) { + byte[] input = inputString.getBytes(StandardCharsets.UTF_8); + //grab the first key/delimiter pair + DissectPair dissectPair = it.next(); + DissectKey key = dissectPair.getKey(); + byte[] delimiter = dissectPair.getDelimiter().getBytes(StandardCharsets.UTF_8); + //start dissection after the first delimiter + int i = leadingDelimiter.length(); + int valueStart = i; + int lookAheadMatches; + //start walking the input string byte by byte, look ahead for matches where needed + //if a match is found jump forward to the end of the match + for (; i < input.length; i++) { + lookAheadMatches = 0; + //potential match between delimiter and input string + if (delimiter.length > 0 && input[i] == delimiter[0]) { + //look ahead to see if the entire delimiter matches the input string + for (int j = 0; j < delimiter.length; j++) { + if (i + j < input.length && input[i + j] == delimiter[j]) { + lookAheadMatches++; + } + } + //found a full delimiter match + if (lookAheadMatches == delimiter.length) { + //record the key/value tuple + byte[] value = Arrays.copyOfRange(input, valueStart, i); + dissectMatch.add(key, new String(value, StandardCharsets.UTF_8)); + //jump to the end of the match + i += lookAheadMatches; + //look for consecutive delimiters (e.g. a,,,,d,e) + while (i < input.length) { + lookAheadMatches = 0; + for (int j = 0; j < delimiter.length; j++) { + if (i + j < input.length && input[i + j] == delimiter[j]) { + lookAheadMatches++; + } + } + //found consecutive delimiters + if (lookAheadMatches == delimiter.length) { + //jump to the end of the match + i += lookAheadMatches; + if (!key.skipRightPadding()) { + //progress the keys/delimiter if possible + if (!it.hasNext()) { + break; //the while loop + } + dissectPair = it.next(); + key = dissectPair.getKey(); + //add the key with an empty value for the empty delimiter + dissectMatch.add(key, ""); + } + } else { + break; //the while loop + } + } + //progress the keys/delimiter if possible + if (!it.hasNext()) { + break; //the for loop + } + dissectPair = it.next(); + key = dissectPair.getKey(); + delimiter = dissectPair.getDelimiter().getBytes(StandardCharsets.UTF_8); + //i is always one byte after the last found delimiter, aka the start of the next value + valueStart = i; + } + } + } + //the last key, grab the rest of the input (unless consecutive delimiters already grabbed the last key) + //and there is no trailing delimiter + if (!dissectMatch.fullyMatched() && delimiter.length == 0 ) { + byte[] value = Arrays.copyOfRange(input, valueStart, input.length); + String valueString = new String(value, StandardCharsets.UTF_8); + dissectMatch.add(key, valueString); + } + } + Map results = dissectMatch.getResults(); + + if (!dissectMatch.isValid(results)) { + throw new DissectException.FindMatch(pattern, inputString); + } + return results; + } + + /** + * A tuple class to hold the dissect key and delimiter + */ + private class DissectPair { + + private final DissectKey key; + private final String delimiter; + + private DissectPair(DissectKey key, String delimiter) { + this.key = key; + this.delimiter = delimiter; + } + + private DissectKey getKey() { + return key; + } + + private String 
getDelimiter() { + return delimiter; + } + } + +} + + + diff --git a/libs/dissect/src/test/eclipse-build.gradle b/libs/dissect/src/test/eclipse-build.gradle new file mode 100644 index 0000000000000..56d632f23b129 --- /dev/null +++ b/libs/dissect/src/test/eclipse-build.gradle @@ -0,0 +1,7 @@ + +// this is just shell gradle file for eclipse to have separate projects for dissect src and tests +apply from: '../../build.gradle' + +dependencies { + testCompile project(':libs:dissect') +} diff --git a/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectKeyTests.java b/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectKeyTests.java new file mode 100644 index 0000000000000..0f3f7ed041df5 --- /dev/null +++ b/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectKeyTests.java @@ -0,0 +1,178 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.dissect; + +import org.elasticsearch.test.ESTestCase; +import org.hamcrest.CoreMatchers; + +import java.util.EnumSet; +import java.util.List; +import java.util.stream.Collectors; + +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.is; + +public class DissectKeyTests extends ESTestCase { + + public void testNoModifier() { + String keyName = randomAlphaOfLengthBetween(1, 10); + DissectKey dissectKey = new DissectKey(keyName); + assertThat(dissectKey.getModifier(), equalTo(DissectKey.Modifier.NONE)); + assertThat(dissectKey.skip(), is(false)); + assertThat(dissectKey.skipRightPadding(), is(false)); + assertThat(dissectKey.getAppendPosition(), equalTo(0)); + assertThat(dissectKey.getName(), equalTo(keyName)); + } + + public void testAppendModifier() { + String keyName = randomAlphaOfLengthBetween(1, 10); + DissectKey dissectKey = new DissectKey("+" + keyName); + assertThat(dissectKey.getModifier(), equalTo(DissectKey.Modifier.APPEND)); + assertThat(dissectKey.skip(), is(false)); + assertThat(dissectKey.skipRightPadding(), is(false)); + assertThat(dissectKey.getAppendPosition(), equalTo(0)); + assertThat(dissectKey.getName(), equalTo(keyName)); + } + + public void testAppendWithOrderModifier() { + String keyName = randomAlphaOfLengthBetween(1, 10); + int length = randomIntBetween(1, 100); + DissectKey dissectKey = new DissectKey("+" + keyName + "/" + length); + assertThat(dissectKey.getModifier(), equalTo(DissectKey.Modifier.APPEND_WITH_ORDER)); + assertThat(dissectKey.skip(), is(false)); + assertThat(dissectKey.skipRightPadding(), is(false)); + assertThat(dissectKey.getAppendPosition(), equalTo(length)); + assertThat(dissectKey.getName(), equalTo(keyName)); + } + + public void testAppendWithOrderModifierNoName() { + int length = randomIntBetween(1, 100); + DissectException e = expectThrows(DissectException.class, () -> new DissectKey("+/" + 
length)); + assertThat(e.getMessage(), CoreMatchers.containsString("Unable to parse key")); + } + + public void testOrderModifierWithoutAppend() { + String keyName = randomAlphaOfLengthBetween(1, 10); + int length = randomIntBetween(1, 100); + DissectException e = expectThrows(DissectException.class, () -> new DissectKey(keyName + "/" + length)); + assertThat(e.getMessage(), CoreMatchers.containsString("Unable to parse key")); + } + + public void testFieldNameModifier() { + String keyName = randomAlphaOfLengthBetween(1, 10); + DissectKey dissectKey = new DissectKey("*" + keyName); + assertThat(dissectKey.getModifier(), equalTo(DissectKey.Modifier.FIELD_NAME)); + assertThat(dissectKey.skip(), is(false)); + assertThat(dissectKey.skipRightPadding(), is(false)); + assertThat(dissectKey.getAppendPosition(), equalTo(0)); + assertThat(dissectKey.getName(), equalTo(keyName)); + } + + public void testFieldValueModifiers() { + String keyName = randomAlphaOfLengthBetween(1, 10); + DissectKey dissectKey = new DissectKey("&" + keyName); + assertThat(dissectKey.getModifier(), equalTo(DissectKey.Modifier.FIELD_VALUE)); + assertThat(dissectKey.skip(), is(false)); + assertThat(dissectKey.skipRightPadding(), is(false)); + assertThat(dissectKey.getAppendPosition(), equalTo(0)); + assertThat(dissectKey.getName(), equalTo(keyName)); + } + + public void testRightPaddingModifiers() { + String keyName = randomAlphaOfLengthBetween(1, 10); + DissectKey dissectKey = new DissectKey(keyName + "->"); + assertThat(dissectKey.getModifier(), equalTo(DissectKey.Modifier.NONE)); + assertThat(dissectKey.skip(), is(false)); + assertThat(dissectKey.skipRightPadding(), is(true)); + assertThat(dissectKey.getAppendPosition(), equalTo(0)); + assertThat(dissectKey.getName(), equalTo(keyName)); + + dissectKey = new DissectKey("*" + keyName + "->"); + assertThat(dissectKey.skipRightPadding(), is(true)); + + dissectKey = new DissectKey("&" + keyName + "->"); + assertThat(dissectKey.skipRightPadding(), is(true)); + + dissectKey = new DissectKey("+" + keyName + "->"); + assertThat(dissectKey.skipRightPadding(), is(true)); + + dissectKey = new DissectKey("?" + keyName + "->"); + assertThat(dissectKey.skipRightPadding(), is(true)); + + dissectKey = new DissectKey("+" + keyName + "/2->"); + assertThat(dissectKey.skipRightPadding(), is(true)); + } + + public void testMultipleLeftModifiers() { + String keyName = randomAlphaOfLengthBetween(1, 10); + List validModifiers = EnumSet.allOf(DissectKey.Modifier.class).stream() + .filter(m -> !m.equals(DissectKey.Modifier.NONE)) + .map(DissectKey.Modifier::toString) + .collect(Collectors.toList()); + String modifier1 = randomFrom(validModifiers); + String modifier2 = randomFrom(validModifiers); + DissectException e = expectThrows(DissectException.class, () -> new DissectKey(modifier1 + modifier2 + keyName)); + assertThat(e.getMessage(), CoreMatchers.containsString("Unable to parse key")); + } + + public void testSkipKey() { + String keyName = ""; + DissectKey dissectKey = new DissectKey(keyName); + assertThat(dissectKey.getModifier(), equalTo(DissectKey.Modifier.NONE)); + assertThat(dissectKey.skip(), is(true)); + assertThat(dissectKey.skipRightPadding(), is(false)); + assertThat(dissectKey.getAppendPosition(), equalTo(0)); + assertThat(dissectKey.getName(), equalTo(keyName)); + } + public void testNamedSkipKey() { + String keyName = "myname"; + DissectKey dissectKey = new DissectKey("?" 
+keyName); + assertThat(dissectKey.getModifier(), equalTo(DissectKey.Modifier.NAMED_SKIP)); + assertThat(dissectKey.skip(), is(true)); + assertThat(dissectKey.skipRightPadding(), is(false)); + assertThat(dissectKey.getAppendPosition(), equalTo(0)); + assertThat(dissectKey.getName(), equalTo(keyName)); + } + + public void testSkipKeyWithPadding() { + String keyName = ""; + DissectKey dissectKey = new DissectKey(keyName + "->"); + assertThat(dissectKey.getModifier(), equalTo(DissectKey.Modifier.NONE)); + assertThat(dissectKey.skip(), is(true)); + assertThat(dissectKey.skipRightPadding(), is(true)); + assertThat(dissectKey.getAppendPosition(), equalTo(0)); + assertThat(dissectKey.getName(), equalTo(keyName)); + } + public void testNamedEmptySkipKeyWithPadding() { + String keyName = ""; + DissectKey dissectKey = new DissectKey("?" +keyName + "->"); + assertThat(dissectKey.getModifier(), equalTo(DissectKey.Modifier.NAMED_SKIP)); + assertThat(dissectKey.skip(), is(true)); + assertThat(dissectKey.skipRightPadding(), is(true)); + assertThat(dissectKey.getAppendPosition(), equalTo(0)); + assertThat(dissectKey.getName(), equalTo(keyName)); + } + + public void testInvalidModifiers() { + //should never happen due to regex + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> DissectKey.Modifier.fromString("x")); + assertThat(e.getMessage(), CoreMatchers.containsString("invalid modifier")); + } +} diff --git a/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectMatchTests.java b/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectMatchTests.java new file mode 100644 index 0000000000000..d562afb636308 --- /dev/null +++ b/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectMatchTests.java @@ -0,0 +1,93 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.dissect; + +import org.elasticsearch.common.collect.MapBuilder; +import org.elasticsearch.test.ESTestCase; + +import java.nio.charset.StandardCharsets; +import java.util.Map; +import java.util.stream.IntStream; + +import static org.hamcrest.Matchers.equalTo; + +public class DissectMatchTests extends ESTestCase { + + public void testIllegalArgs() { + expectThrows(IllegalArgumentException.class, () -> new DissectMatch("", 0, 1, 0, 0)); + expectThrows(IllegalArgumentException.class, () -> new DissectMatch("", 1, 0, 0, 0)); + } + + public void testValidAndFullyMatched() { + int expectedMatches = randomIntBetween(1, 26); + DissectMatch dissectMatch = new DissectMatch("", expectedMatches, expectedMatches, 0, 0); + IntStream.range(97, 97 + expectedMatches) //allow for a-z values + .forEach(i -> dissectMatch.add(new DissectKey(new String(new byte[]{(byte) i}, StandardCharsets.UTF_8)), "")); + assertThat(dissectMatch.fullyMatched(), equalTo(true)); + assertThat(dissectMatch.isValid(dissectMatch.getResults()), equalTo(true)); + } + + public void testNotValidAndFullyMatched() { + int expectedMatches = randomIntBetween(1, 26); + DissectMatch dissectMatch = new DissectMatch("", expectedMatches, expectedMatches, 0, 0); + IntStream.range(97, 97 + expectedMatches - 1) //allow for a-z values + .forEach(i -> dissectMatch.add(new DissectKey(new String(new byte[]{(byte) i}, StandardCharsets.UTF_8)), "")); + assertThat(dissectMatch.fullyMatched(), equalTo(false)); + assertThat(dissectMatch.isValid(dissectMatch.getResults()), equalTo(false)); + } + + public void testGetResultsIdempotent(){ + int expectedMatches = randomIntBetween(1, 26); + DissectMatch dissectMatch = new DissectMatch("", expectedMatches, expectedMatches, 0, 0); + IntStream.range(97, 97 + expectedMatches) //allow for a-z values + .forEach(i -> dissectMatch.add(new DissectKey(new String(new byte[]{(byte) i}, StandardCharsets.UTF_8)), "")); + assertThat(dissectMatch.getResults(), equalTo(dissectMatch.getResults())); + } + + public void testAppend(){ + DissectMatch dissectMatch = new DissectMatch("-", 3, 1, 3, 0); + dissectMatch.add(new DissectKey("+a"), "x"); + dissectMatch.add(new DissectKey("+a"), "y"); + dissectMatch.add(new DissectKey("+a"), "z"); + Map results = dissectMatch.getResults(); + assertThat(dissectMatch.isValid(results), equalTo(true)); + assertThat(results, equalTo(MapBuilder.newMapBuilder().put("a", "x-y-z").map())); + } + + public void testAppendWithOrder(){ + DissectMatch dissectMatch = new DissectMatch("-", 3, 1, 3, 0); + dissectMatch.add(new DissectKey("+a/3"), "x"); + dissectMatch.add(new DissectKey("+a"), "y"); + dissectMatch.add(new DissectKey("+a/1"), "z"); + Map results = dissectMatch.getResults(); + assertThat(dissectMatch.isValid(results), equalTo(true)); + assertThat(results, equalTo(MapBuilder.newMapBuilder().put("a", "y-z-x").map())); + } + + public void testReference(){ + DissectMatch dissectMatch = new DissectMatch("-", 2, 1, 0, 1); + dissectMatch.add(new DissectKey("&a"), "x"); + dissectMatch.add(new DissectKey("*a"), "y"); + Map results = dissectMatch.getResults(); + assertThat(dissectMatch.isValid(results), equalTo(true)); + assertThat(results, equalTo(MapBuilder.newMapBuilder().put("y", "x").map())); + } + +} diff --git a/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java b/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java new file mode 100644 index 0000000000000..c22cec98eb79a --- /dev/null +++ 
b/libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java @@ -0,0 +1,386 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.dissect; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.elasticsearch.test.ESTestCase; +import org.hamcrest.CoreMatchers; +import org.hamcrest.Matchers; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import static com.carrotsearch.randomizedtesting.RandomizedTest.randomAsciiAlphanumOfLengthBetween; + +public class DissectParserTests extends ESTestCase { + + public void testJavaDocExamples() { + assertMatch("%{a} %{b},%{c}", "foo bar,baz", Arrays.asList("a", "b", "c"), Arrays.asList("foo", "bar", "baz")); + assertMiss("%{a},%{b}:%{c}", "foo,bar,baz"); + assertMatch("%{a->} %{b} %{c}", "foo bar baz", Arrays.asList("a", "b", "c"), Arrays.asList("foo", "bar", "baz")); + assertMatch("%{a} %{+a} %{+a}", "foo bar baz", Arrays.asList("a"), Arrays.asList("foobarbaz")); + assertMatch("%{a} %{+a/2} %{+a/1}", "foo bar baz", Arrays.asList("a"), Arrays.asList("foobazbar")); + assertMatch("%{*a} %{b} %{&a}", "foo bar baz", Arrays.asList("foo", "b"), Arrays.asList("baz", "bar")); + assertMatch("%{a} %{} %{c}", "foo bar baz", Arrays.asList("a", "c"), Arrays.asList("foo", "baz")); + assertMatch("%{a} %{?skipme} %{c}", "foo bar baz", Arrays.asList("a", "c"), Arrays.asList("foo", "baz")); + assertMatch("%{a},%{b},%{c},%{d}", "foo,,,", Arrays.asList("a", "b", "c", "d"), Arrays.asList("foo", "", "", "")); + assertMatch("%{a->},%{b}", "foo,,,,,,bar", Arrays.asList("a", "b"), Arrays.asList("foo", "bar")); + } + + /** + * Borrowed from Logstash's test cases: + * https://github.com/logstash-plugins/logstash-filter-dissect/blob/master/src/test/java/org/logstash/dissect/DissectorTest.java + * Append Note - Logstash appends with the delimiter as the separator between values, this uses a user defined separator + */ + public void testLogstashSpecs() { + assertMatch("%{a} %{b->} %{c}", "foo bar baz", Arrays.asList("a", "b", "c"), Arrays.asList("foo", "bar", "baz")); + assertMiss("%{a}%{b} %{c}", null); + assertMiss("%{a} %{b}%{c} %{d}", "foo bar baz"); + assertMiss("%{a} %{b} %{c}%{d}", "foo bar baz quux"); + assertMatch("%{a} %{b->} %{c}", "foo bar baz", Arrays.asList("a", "b", "c"), Arrays.asList("foo", "bar", "baz")); + assertMatch("%{a} %{} %{c}", "foo bar baz", Arrays.asList("a", "c"), Arrays.asList("foo", "baz")); + assertMatch("%{a} %{b} %{+b} %{z}", "foo bar baz quux", Arrays.asList("a", "b", "z"), Arrays.asList("foo", "bar baz", "quux"), " "); + assertMatch("%{a}------->%{b}", "foo------->bar baz quux", 
Arrays.asList("a", "b"), Arrays.asList("foo", "bar baz quux")); + assertMatch("%{a}------->%{}", "foo------->bar baz quux", Arrays.asList("a"), Arrays.asList("foo")); + assertMatch("%{a} » %{b}»%{c}€%{d}", "foo » bar»baz€quux", + Arrays.asList("a", "b", "c", "d"), Arrays.asList("foo", "bar", "baz", "quux")); + assertMatch("%{a} %{b} %{+a}", "foo bar baz quux", Arrays.asList("a", "b"), Arrays.asList("foo baz quux", "bar"), " "); + //Logstash supports implicit ordering based anchored by the the key without the '+' + //This implementation will only honor implicit ordering for appending right to left else explicit order (/N) is required. + //The results of this test differ from Logstash. + assertMatch("%{+a} %{a} %{+a} %{b}", "December 31 1999 quux", + Arrays.asList("a", "b"), Arrays.asList("December 31 1999", "quux"), " "); + //Same test as above, but with same result as Logstash using explicit ordering in the pattern + assertMatch("%{+a/1} %{a} %{+a/2} %{b}", "December 31 1999 quux", + Arrays.asList("a", "b"), Arrays.asList("31 December 1999", "quux"), " "); + assertMatch("%{+a/2} %{+a/4} %{+a/1} %{+a/3}", "bar quux foo baz", Arrays.asList("a"), Arrays.asList("foo bar baz quux"), " "); + assertMatch("%{+a} %{b}", "foo bar", Arrays.asList("a", "b"), Arrays.asList("foo", "bar")); + assertMatch("%{+a} %{b} %{+a} %{c}", "foo bar baz quux", + Arrays.asList("a", "b", "c"), Arrays.asList("foo baz", "bar", "quux"), " "); + assertMatch("%{} %{syslog_timestamp} %{hostname} %{rt}: %{reason} %{+reason} %{src_ip}/%{src_port}->%{dst_ip}/%{dst_port} " + + "%{polrt} %{+polrt} %{+polrt} %{from_zone} %{to_zone} %{rest}", + "42 2016-05-25T14:47:23Z host.name.com RT_FLOW - RT_FLOW_SESSION_DENY: session denied 2.2.2.20/60000->1.1.1.10/8090 None " + + "6(0) DEFAULT-DENY ZONE-UNTRUST ZONE-DMZ UNKNOWN UNKNOWN N/A(N/A) ge-0/0/0.0", + Arrays.asList("syslog_timestamp", "hostname", "rt", "reason", "src_ip", "src_port", "dst_ip", "dst_port", "polrt" + , "from_zone", "to_zone", "rest"), + Arrays.asList("2016-05-25T14:47:23Z", "host.name.com", "RT_FLOW - RT_FLOW_SESSION_DENY", "session denied", "2.2.2.20", "60000" + , "1.1.1.10", "8090", "None 6(0) DEFAULT-DENY", "ZONE-UNTRUST", "ZONE-DMZ", "UNKNOWN UNKNOWN N/A(N/A) ge-0/0/0.0"), " "); + assertBadKey("%{+/2}"); + assertBadKey("%{&+a_field}"); + assertMatch("%{a->} %{b->}---%{c}", "foo bar------------baz", + Arrays.asList("a", "b", "c"), Arrays.asList("foo", "bar", "baz")); + assertMatch("%{->}-%{a}", "-----666", Arrays.asList("a"), Arrays.asList("666")); + assertMatch("%{?skipme->}-%{a}", "-----666", Arrays.asList("a"), Arrays.asList("666")); + assertMatch("%{a},%{b},%{c},%{d},%{e},%{f}", "111,,333,,555,666", + Arrays.asList("a", "b", "c", "d", "e", "f"), Arrays.asList("111", "", "333", "", "555", "666")); + assertMatch("%{a}.࿏.%{b}", "⟳༒.࿏.༒⟲", Arrays.asList("a", "b"), Arrays.asList("⟳༒", "༒⟲")); + assertMatch("%{a}", "子", Arrays.asList("a"), Arrays.asList("子")); + assertMatch("%{a}{\n}%{b}", "aaa{\n}bbb", Arrays.asList("a", "b"), Arrays.asList("aaa", "bbb")); + assertMiss("MACHINE[%{a}] %{b}", "1234567890 MACHINE[foo] bar"); + assertMiss("%{a} %{b} %{c}", "foo:bar:baz"); + assertMatch("/var/%{key1}/log/%{key2}.log", "/var/foo/log/bar.log", Arrays.asList("key1", "key2"), Arrays.asList("foo", "bar")); + assertMatch("%{a->} %{b}-.-%{c}-%{d}-..-%{e}-%{f}-%{g}-%{h}", "foo bar-.-baz-1111-..-22-333-4444-55555", + Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h"), + Arrays.asList("foo", "bar", "baz", "1111", "22", "333", "4444", "55555")); + } + + public void 
testBasicMatch() { + String valueFirstInput = ""; + String keyFirstPattern = ""; + String delimiterFirstInput = ""; + String delimiterFirstPattern = ""; + //parallel arrays + List expectedKeys = Arrays.asList(generateRandomStringArray(100, 10, false, false)); + List expectedValues = new ArrayList<>(expectedKeys.size()); + for (String key : expectedKeys) { + String value = randomAsciiAlphanumOfLengthBetween(1, 100); + String delimiter = Integer.toString(randomInt()); //int to ensures values and delimiters don't overlap, else validation can fail + keyFirstPattern += "%{" + key + "}" + delimiter; + valueFirstInput += value + delimiter; + delimiterFirstPattern += delimiter + "%{" + key + "}"; + delimiterFirstInput += delimiter + value; + expectedValues.add(value); + } + assertMatch(keyFirstPattern, valueFirstInput, expectedKeys, expectedValues); + assertMatch(delimiterFirstPattern, delimiterFirstInput, expectedKeys, expectedValues); + } + + public void testBasicMatchUnicode() { + String valueFirstInput = ""; + String keyFirstPattern = ""; + String delimiterFirstInput = ""; + String delimiterFirstPattern = ""; + //parallel arrays + List expectedKeys = new ArrayList<>(); + List expectedValues = new ArrayList<>(); + for (int i = 0; i < randomIntBetween(1, 100); i++) { + String key = randomAsciiAlphanumOfLengthBetween(1, 100); + String value = randomRealisticUnicodeOfCodepointLengthBetween(1, 100); + String delimiter = Integer.toString(randomInt()); //int to ensures values and delimiters don't overlap, else validation can fail + keyFirstPattern += "%{" + key + "}" + delimiter; + valueFirstInput += value + delimiter; + delimiterFirstPattern += delimiter + "%{" + key + "}"; + delimiterFirstInput += delimiter + value; + expectedKeys.add(key); + expectedValues.add(value); + } + assertMatch(keyFirstPattern, valueFirstInput, expectedKeys, expectedValues); + assertMatch(delimiterFirstPattern, delimiterFirstInput, expectedKeys, expectedValues); + } + + public void testMatchUnicode() { + assertMatch("%{a} %{b}", "foo 子", Arrays.asList("a", "b"), Arrays.asList("foo", "子")); + assertMatch("%{a}࿏%{b} %{c}", "⟳༒࿏༒⟲ 子", Arrays.asList("a", "b", "c"), Arrays.asList("⟳༒", "༒⟲", "子")); + assertMatch("%{a}࿏%{+a} %{+a}", "⟳༒࿏༒⟲ 子", Arrays.asList("a"), Arrays.asList("⟳༒༒⟲子")); + assertMatch("%{a}࿏%{+a/2} %{+a/1}", "⟳༒࿏༒⟲ 子", Arrays.asList("a"), Arrays.asList("⟳༒子༒⟲")); + assertMatch("%{a->}࿏%{b}", "⟳༒࿏࿏࿏࿏࿏༒⟲", Arrays.asList("a", "b"), Arrays.asList("⟳༒", "༒⟲")); + assertMatch("%{*a}࿏%{&a}", "⟳༒࿏༒⟲", Arrays.asList("⟳༒"), Arrays.asList("༒⟲")); + assertMatch("%{}࿏%{a}", "⟳༒࿏༒⟲", Arrays.asList("a"), Arrays.asList("༒⟲")); + } + + public void testMatchRemainder() { + assertMatch("%{a}", "foo bar the rest", Arrays.asList("a"), Arrays.asList("foo bar the rest")); + assertMatch("%{a} %{b}", "foo bar the rest", Arrays.asList("a", "b"), Arrays.asList("foo", "bar the rest")); + assertMatch("%{} %{b}", "foo bar the rest", Arrays.asList("b"), Arrays.asList("bar the rest")); + assertMatch("%{a} %{b->}", "foo bar the rest", Arrays.asList("a", "b"), Arrays.asList("foo", "bar the rest")); + assertMatch("%{*a} %{&a}", "foo bar the rest", Arrays.asList("foo"), Arrays.asList("bar the rest")); + assertMatch("%{a} %{+a}", "foo bar the rest", Arrays.asList("a"), Arrays.asList("foo bar the rest"), " "); + } + + public void testAppend() { + assertMatch("%{a} %{+a} %{+a}", "foo bar baz", Arrays.asList("a"), Arrays.asList("foobarbaz")); + assertMatch("%{a} %{+a} %{b} %{+b}", "foo bar baz lol", Arrays.asList("a", "b"), Arrays.asList("foobar", 
"bazlol")); + assertMatch("%{a} %{+a/2} %{+a/1}", "foo bar baz", Arrays.asList("a"), Arrays.asList("foobazbar")); + assertMatch("%{a} %{+a/2} %{+a/1}", "foo bar baz", Arrays.asList("a"), Arrays.asList("foo baz bar"), " "); + } + + public void testAssociate() { + assertMatch("%{*a} %{&a}", "foo bar", Arrays.asList("foo"), Arrays.asList("bar")); + assertMatch("%{&a} %{*a}", "foo bar", Arrays.asList("bar"), Arrays.asList("foo")); + assertMatch("%{*a} %{&a} %{*b} %{&b}", "foo bar baz lol", Arrays.asList("foo", "baz"), Arrays.asList("bar", "lol")); + assertMatch("%{*a} %{&a} %{c} %{*b} %{&b}", "foo bar x baz lol", + Arrays.asList("foo", "baz", "c"), Arrays.asList("bar", "lol", "x")); + assertBadPattern("%{*a} %{a}"); + assertBadPattern("%{a} %{&a}"); + assertMiss("%{*a} %{&a} {a} %{*b} %{&b}", "foo bar x baz lol"); + } + + public void testAppendAndAssociate() { + assertMatch("%{a} %{+a} %{*b} %{&b}", "foo bar baz lol", Arrays.asList("a", "baz"), Arrays.asList("foobar", "lol")); + assertMatch("%{a->} %{+a/2} %{+a/1} %{*b} %{&b}", "foo bar baz lol x", + Arrays.asList("a", "lol"), Arrays.asList("foobazbar", "x")); + } + + public void testEmptyKey() { + assertMatch("%{} %{b}", "foo bar", Arrays.asList("b"), Arrays.asList("bar")); + assertMatch("%{a} %{}", "foo bar", Arrays.asList("a"), Arrays.asList("foo")); + assertMatch("%{->} %{b}", "foo bar", Arrays.asList("b"), Arrays.asList("bar")); + assertMatch("%{->} %{b}", " bar", Arrays.asList("b"), Arrays.asList("bar")); + assertMatch("%{a} %{->}", "foo bar ", Arrays.asList("a"), Arrays.asList("foo")); + } + + public void testNamedSkipKey() { + assertMatch("%{?foo} %{b}", "foo bar", Arrays.asList("b"), Arrays.asList("bar")); + assertMatch("%{?} %{b}", "foo bar", Arrays.asList("b"), Arrays.asList("bar")); + assertMatch("%{a} %{?bar}", "foo bar", Arrays.asList("a"), Arrays.asList("foo")); + assertMatch("%{?foo->} %{b}", "foo bar", Arrays.asList("b"), Arrays.asList("bar")); + assertMatch("%{?->} %{b}", "foo bar", Arrays.asList("b"), Arrays.asList("bar")); + assertMatch("%{?foo->} %{b}", " bar", Arrays.asList("b"), Arrays.asList("bar")); + assertMatch("%{a} %{->?bar}", "foo bar ", Arrays.asList("a"), Arrays.asList("foo")); + assertMatch("%{a} %{?skipme} %{?skipme}", "foo bar baz", Arrays.asList("a"), Arrays.asList("foo")); + assertMatch("%{a} %{?} %{?}", "foo bar baz", Arrays.asList("a"), Arrays.asList("foo")); + } + + public void testConsecutiveDelimiters() { + //leading + assertMatch("%{->},%{a}", ",,,,,foo", Arrays.asList("a"), Arrays.asList("foo")); + assertMatch("%{a->},%{b}", ",,,,,foo", Arrays.asList("a", "b"), Arrays.asList("", "foo")); + //trailing + assertMatch("%{a->},", "foo,,,,,", Arrays.asList("a"), Arrays.asList("foo")); + assertMatch("%{a} %{b},", "foo bar,,,,,", Arrays.asList("a", "b"), Arrays.asList("foo", "bar")); + assertMatch("%{a} %{b->},", "foo bar,,,,,", Arrays.asList("a", "b"), Arrays.asList("foo", "bar")); + //middle + assertMatch("%{a->},%{b}", "foo,,,,,bar", Arrays.asList("a", "b"), Arrays.asList("foo", "bar")); + assertMatch("%{a->} %{b}", "foo bar", Arrays.asList("a", "b"), Arrays.asList("foo", "bar")); + assertMatch("%{a->}x%{b}", "fooxxxxxbar", Arrays.asList("a", "b"), Arrays.asList("foo", "bar")); + assertMatch("%{a->} xyz%{b}", "foo xyz xyz xyz xyz xyzbar", Arrays.asList("a", "b"), Arrays.asList("foo", "bar")); + //skipped with empty values + assertMatch("%{a},%{b},%{c},%{d}", "foo,,,", Arrays.asList("a", "b", "c", "d"), Arrays.asList("foo", "", "", "")); + assertMatch("%{a},%{b},%{c},%{d}", "foo,,bar,baz", 
Arrays.asList("a", "b", "c", "d"), Arrays.asList("foo", "", "bar", "baz")); + assertMatch("%{a},%{b},%{c},%{d}", "foo,,,baz", Arrays.asList("a", "b", "c", "d"), Arrays.asList("foo", "", "", "baz")); + assertMatch("%{a},%{b},%{c},%{d}", ",bar,,baz", Arrays.asList("a", "b", "c", "d"), Arrays.asList("", "bar", "", "baz")); + assertMatch("%{->},%{a->},%{b}", ",,,bar,,baz", Arrays.asList("a", "b"), Arrays.asList("bar", "baz")); + } + + public void testAppendWithConsecutiveDelimiters() { + assertMatch("%{+a/1},%{+a/3}-%{+a/2} %{b}", "foo,bar----baz lol", Arrays.asList("a", "b"), Arrays.asList("foobar", "")); + assertMatch("%{+a/1},%{+a/3->}-%{+a/2} %{b}", "foo,bar----baz lol", Arrays.asList("a", "b"), Arrays.asList("foobazbar", "lol")); + } + + public void testSkipRightPadding() { + assertMatch("%{a->} %{b}", "foo bar", Arrays.asList("a", "b"), Arrays.asList("foo", "bar")); + assertMatch("%{a->} %{b}", "foo bar", Arrays.asList("a", "b"), Arrays.asList("foo", "bar")); + assertMatch("%{->} %{a}", "foo bar", Arrays.asList("a"), Arrays.asList("bar")); + assertMatch("%{a->} %{+a->} %{*b->} %{&b->} %{c}", "foo bar baz lol x", + Arrays.asList("a", "baz", "c"), Arrays.asList("foobar", "lol", "x")); + } + + public void testTrimmedEnd() { + assertMatch("%{a} %{b}", "foo bar", Arrays.asList("a", "b"), Arrays.asList("foo", "bar")); + assertMatch("%{a} %{b->} ", "foo bar ", Arrays.asList("a", "b"), Arrays.asList("foo", "bar")); + //only whitespace is trimmed in the absence of trailing characters + assertMatch("%{a} %{b->}", "foo bar,,,,,,", Arrays.asList("a", "b"), Arrays.asList("foo", "bar,,,,,,")); + //consecutive delimiters + right padding can be used to skip over the trailing delimiters + assertMatch("%{a} %{b->},", "foo bar,,,,,,", Arrays.asList("a", "b"), Arrays.asList("foo", "bar")); + } + + public void testLeadingDelimiter() { + assertMatch(",,,%{a} %{b}", ",,,foo bar", Arrays.asList("a", "b"), Arrays.asList("foo", "bar")); + assertMatch(",%{a} %{b}", ",,foo bar", Arrays.asList("a", "b"), Arrays.asList(",foo", "bar")); + } + + /** + * Runtime errors + */ + public void testMiss() { + assertMiss("%{a}%{b}", "foo"); + assertMiss("%{a},%{b}", "foo bar"); + assertMiss("%{a}, %{b}", "foo,bar"); + assertMiss("x%{a},%{b}", "foo,bar"); + assertMiss("x%{},%{b}", "foo,bar"); + assertMiss("leading_delimiter_long%{a}", "foo"); + assertMiss("%{a}trailing_delimiter_long", "foo"); + assertMiss("leading_delimiter_long%{a}trailing_delimiter_long", "foo"); + assertMiss("%{a}x", "foo"); + assertMiss("%{a},%{b}x", "foo,bar"); + } + + /** + * Construction errors + */ + public void testBadPatternOrKey() { + assertBadPattern(""); + assertBadPattern("{}"); + assertBadPattern("%{*a} %{&b}"); + assertBadKey("%{*}"); + assertBadKey("%{++}"); + } + + public void testSyslog() { + assertMatch("%{timestamp} %{+timestamp} %{+timestamp} %{logsource} %{program}[%{pid}]: %{message}", + "Mar 16 00:01:25 evita postfix/smtpd[1713]: connect from camomile.cloud9.net[168.100.1.3]", + Arrays.asList("timestamp", "logsource", "program", "pid", "message"), + Arrays.asList("Mar 16 00:01:25", "evita", "postfix/smtpd", "1713", "connect from camomile.cloud9.net[168.100.1.3]"), " "); + } + + public void testApacheLog() { + assertMatch("%{clientip} %{ident} %{auth} [%{timestamp}] \"%{verb} %{request} HTTP/%{httpversion}\" %{response} %{bytes}" + + " \"%{referrer}\" \"%{agent}\" %{->}", + "31.184.238.164 - - [24/Jul/2014:05:35:37 +0530] \"GET /logs/access.log HTTP/1.0\" 200 69849 " + + "\"http://8rursodiol.enjin.com\" \"Mozilla/5.0 (Windows NT 6.1; 
WOW64) AppleWebKit/537.36 (KHTML, like Gecko) " + + "Chrome/30.0.1599.12785 YaBrowser/13.12.1599.12785 Safari/537.36\" \"www.dlwindianrailways.com\"", + Arrays.asList("clientip", "ident", "auth", "timestamp", "verb", "request", "httpversion", "response", "bytes", + "referrer", "agent"), + Arrays.asList("31.184.238.164", "-", "-", "24/Jul/2014:05:35:37 +0530", "GET", "/logs/access.log", "1.0", "200", "69849", + "http://8rursodiol.enjin.com", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36" + + " (KHTML, like Gecko) Chrome/30.0.1599.12785 YaBrowser/13.12.1599.12785 Safari/537.36")); + } + + /** + * Shared specification between Beats, Logstash, and Ingest node + */ + public void testJsonSpecification() throws Exception { + ObjectMapper mapper = new ObjectMapper(); + JsonNode rootNode = mapper.readTree(this.getClass().getResourceAsStream("/specification/tests.json")); + Iterator tests = rootNode.elements(); + while (tests.hasNext()) { + JsonNode test = tests.next(); + boolean skip = test.path("skip").asBoolean(); + if (!skip) { + String name = test.path("name").asText(); + logger.debug("Running Json specification: " + name); + String pattern = test.path("tok").asText(); + String input = test.path("msg").asText(); + String append = test.path("append").asText(); + boolean fail = test.path("fail").asBoolean(); + Iterator> expected = test.path("expected").fields(); + List expectedKeys = new ArrayList<>(); + List expectedValues = new ArrayList<>(); + expected.forEachRemaining(entry -> { + expectedKeys.add(entry.getKey()); + expectedValues.add(entry.getValue().asText()); + }); + if (fail) { + assertFail(pattern, input); + } else { + assertMatch(pattern, input, expectedKeys, expectedValues, append); + } + } + } + } + + private DissectException assertFail(String pattern, String input){ + return expectThrows(DissectException.class, () -> new DissectParser(pattern, null).parse(input)); + } + + private void assertMiss(String pattern, String input) { + DissectException e = assertFail(pattern, input); + assertThat(e.getMessage(), CoreMatchers.containsString("Unable to find match for dissect pattern")); + assertThat(e.getMessage(), CoreMatchers.containsString(pattern)); + assertThat(e.getMessage(), input == null ? 
CoreMatchers.containsString("null") : CoreMatchers.containsString(input)); + } + + private void assertBadPattern(String pattern) { + DissectException e = assertFail(pattern, null); + assertThat(e.getMessage(), CoreMatchers.containsString("Unable to parse pattern")); + assertThat(e.getMessage(), CoreMatchers.containsString(pattern)); + } + + private void assertBadKey(String pattern, String key) { + DissectException e = assertFail(pattern, null); + assertThat(e.getMessage(), CoreMatchers.containsString("Unable to parse key")); + assertThat(e.getMessage(), CoreMatchers.containsString(key)); + } + + private void assertBadKey(String pattern) { + assertBadKey(pattern, pattern.replace("%{", "").replace("}", "")); + } + + private void assertMatch(String pattern, String input, List expectedKeys, List expectedValues) { + assertMatch(pattern, input, expectedKeys, expectedValues, null); + } + + private void assertMatch(String pattern, String input, List expectedKeys, List expectedValues, String appendSeperator) { + Map results = new DissectParser(pattern, appendSeperator).parse(input); + List foundKeys = new ArrayList<>(results.keySet()); + List foundValues = new ArrayList<>(results.values()); + Collections.sort(foundKeys); + Collections.sort(foundValues); + Collections.sort(expectedKeys); + Collections.sort(expectedValues); + assertThat(foundKeys, Matchers.equalTo(expectedKeys)); + assertThat(foundValues, Matchers.equalTo(expectedValues)); + } +} diff --git a/libs/dissect/src/test/resources/specification/tests.json b/libs/dissect/src/test/resources/specification/tests.json new file mode 100644 index 0000000000000..1cb85ce651940 --- /dev/null +++ b/libs/dissect/src/test/resources/specification/tests.json @@ -0,0 +1,363 @@ +[ + { + "name": "When all the defined fields are captured by we have remaining data", + "tok": "level=%{level} ts=%{timestamp} caller=%{caller} msg=\"%{message}\"", + "msg": "level=info ts=2018-06-27T17:19:13.036579993Z caller=main.go:222 msg=\"Starting OK\" version=\"(version=2.3.1, branch=HEAD, revision=188ca45bd85ce843071e768d855722a9d9dabe03)\"}", + "expected": { + "caller": "main.go:222", + "level": "info", + "message": "Starting OK", + "timestamp": "2018-06-27T17:19:13.036579993Z" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "Complex stack trace", + "tok": "%{day}-%{month}-%{year} %{hour} %{severity} [%{thread_id}] %{origin} %{message}", + "msg": "18-Apr-2018 06:53:20.411 INFO [http-nio-8080-exec-1] org.apache.coyote.http11.Http11Processor.service Error parsing HTTP request header\n Note: further occurrences of HTTP header parsing errors will be logged at DEBUG level.\n java.lang.IllegalArgumentException: Invalid character found in method name. 
HTTP method names must be tokens\n at org.apache.coyote.http11.Http11InputBuffer.parseRequestLine(Http11InputBuffer.java:426)\n at org.apache.coyote.http11.Http11Processor.service(Http11Processor.java:687)\n at org.apache.coyote.AbstractProcessorLight.process(AbstractProcessorLight.java:66)\n at org.apache.coyote.AbstractProtocol$ConnectionHandler.process(AbstractProtocol.java:790)\n at org.apache.tomcat.util.net.NioEndpoint$SocketProcessor.doRun(NioEndpoint.java:1459)\n at org.apache.tomcat.util.net.SocketProcessorBase.run(SocketProcessorBase.java:49)\n at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n at org.apache.tomcat.util.threads.TaskThread$WrappingRunnable.run(TaskThread.java:61)\n at java.lang.Thread.run(Thread.java:748)", + "expected": { + "day": "18", + "hour": "06:53:20.411", + "message": "Error parsing HTTP request header\n Note: further occurrences of HTTP header parsing errors will be logged at DEBUG level.\n java.lang.IllegalArgumentException: Invalid character found in method name. HTTP method names must be tokens\n at org.apache.coyote.http11.Http11InputBuffer.parseRequestLine(Http11InputBuffer.java:426)\n at org.apache.coyote.http11.Http11Processor.service(Http11Processor.java:687)\n at org.apache.coyote.AbstractProcessorLight.process(AbstractProcessorLight.java:66)\n at org.apache.coyote.AbstractProtocol$ConnectionHandler.process(AbstractProtocol.java:790)\n at org.apache.tomcat.util.net.NioEndpoint$SocketProcessor.doRun(NioEndpoint.java:1459)\n at org.apache.tomcat.util.net.SocketProcessorBase.run(SocketProcessorBase.java:49)\n at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n at org.apache.tomcat.util.threads.TaskThread$WrappingRunnable.run(TaskThread.java:61)\n at java.lang.Thread.run(Thread.java:748)", + "month": "Apr", + "origin": "org.apache.coyote.http11.Http11Processor.service", + "severity": "INFO", + "thread_id": "http-nio-8080-exec-1", + "year": "2018" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "success when delimiter found at the beginning and end of the string", + "tok": "/var/log/%{key}.log", + "msg": "/var/log/foobar.log", + "expected": { + "key": "foobar" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "fails when delimiter is not found at the beginning of the string", + "tok": "/var/log/%{key}.log", + "msg": "foobar", + "expected": null, + "skip": false, + "fail": true, + "append": "" + }, + { + "name": "fails when delimiter is not found after the key", + "tok": "/var/log/%{key}.log", + "msg": "/var/log/foobar", + "expected": null, + "skip": false, + "fail": true, + "append": "" + }, + { + "name": "simple dissect", + "tok": "%{key}", + "msg": "foobar", + "expected": { + "key": "foobar" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "dissect two replacement", + "tok": "%{key1} %{key2}", + "msg": "foo bar", + "expected": { + "key1": "foo", + "key2": "bar" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "fail on partial match", + "tok": "%{key1} %{key2} %{key3}", + "msg": "foo bar", + "expected": null, + "skip": false, + "fail": true, + "append": "" + }, + { + "name": "one level dissect not end of string", + "tok": "/var/%{key}/log", + "msg": "/var/foobar/log", + "expected": { + "key": "foobar" + }, + "skip": false, 
+ "fail": false, + "append": "" + }, + { + "name": "one level dissect", + "tok": "/var/%{key}", + "msg": "/var/foobar/log", + "expected": { + "key": "foobar/log" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "multiple keys dissect end of string", + "tok": "/var/%{key}/log/%{key1}", + "msg": "/var/foobar/log/apache", + "expected": { + "key": "foobar", + "key1": "apache" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "multiple keys not end of string", + "tok": "/var/%{key}/log/%{key1}.log", + "msg": "/var/foobar/log/apache.log", + "expected": { + "key": "foobar", + "key1": "apache" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "append with order", + "tok": "%{+key/3} %{+key/1} %{+key/2}", + "msg": "1 2 3", + "expected": { + "key": "231" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "append with order and separator", + "tok": "%{+key/3} %{+key/1} %{+key/2}", + "msg": "1 2 3", + "expected": { + "key": "2::3::1" + }, + "skip": false, + "fail": false, + "append": "::" + }, + { + "name": "append with order and right padding", + "tok": "%{+key/3} %{+key/1-\u003e} %{+key/2}", + "msg": "1 2 3", + "expected": { + "key": "231" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "simple append", + "tok": "%{key}-%{+key}-%{+key}", + "msg": "1-2-3", + "expected": { + "key": "123" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "simple append with separator", + "tok": "%{key}-%{+key}-%{+key}", + "msg": "1-2-3", + "expected": { + "key": "1,2,3" + }, + "skip": false, + "fail": false, + "append": "," + }, + { + "name": "reference field", + "tok": "%{*key} %{\u0026key}", + "msg": "hello world", + "expected": { + "hello": "world" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "reference field alt order", + "tok": "%{\u0026key} %{*key}", + "msg": "hello world", + "expected": { + "world": "hello" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "nameless skip field", + "tok": "%{} %{key}", + "msg": "hello world", + "expected": { + "key": "world" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "named skip field", + "tok": "%{?skipme} %{key}", + "msg": "hello world", + "expected": { + "key": "world" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "reference without pairing", + "tok": "%{key} %{\u0026key}", + "msg": "hello world", + "expected": null, + "skip": false, + "fail": true, + "append": "" + }, + { + "name": "missing fields (consecutive delimiters)", + "tok": "%{name},%{addr1},%{addr2},%{addr3},%{city},%{zip}", + "msg": "Jane Doe,4321 Fifth Avenue,,,New York,87432", + "expected": { + "addr1": "4321 Fifth Avenue", + "addr2": "", + "addr3": "", + "city": "New York", + "name": "Jane Doe", + "zip": "87432" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "missing fields with right padding (consecutive delimiters)", + "tok": "%{name},%{addr1-\u003e},%{city},%{zip}", + "msg": "Jane Doe,4321 Fifth Avenue,,,New York,87432", + "expected": { + "addr1": "4321 Fifth Avenue", + "city": "New York", + "name": "Jane Doe", + "zip": "87432" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "ignore right padding", + "tok": "%{id} %{function-\u003e} %{server}", + "msg": "00000043 ViewReceive machine-321", + "expected": { + "function": "ViewReceive", + "id": "00000043", + "server": "machine-321" + }, + "skip": false, + "fail": false, 
+ "append": "" + }, + { + "name": "padding on the last key need a delimiter", + "tok": "%{id} %{function} %{server-\u003e} ", + "msg": "00000043 ViewReceive machine-321 ", + "expected": { + "function": "ViewReceive", + "id": "00000043", + "server": "machine-321" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "ignore left padding", + "tok": "%{id-\u003e} %{function} %{server}", + "msg": "00000043 ViewReceive machine-321", + "expected": { + "function": "ViewReceive", + "id": "00000043", + "server": "machine-321" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "when the delimiters contains `{` and `}`", + "tok": "{%{a}}{%{b}} %{rest}", + "msg": "{c}{d} anything", + "expected": { + "a": "c", + "b": "d", + "rest": "anything" + }, + "skip": false, + "fail": false, + "append": "" + }, + { + "name": "no keys defined", + "tok": "anything", + "msg": "anything", + "expected": null, + "skip": false, + "fail": true, + "append": "" + }, + { + "name": "invalid key", + "tok": "%{some?thing}", + "msg": "anything", + "expected": null, + "skip": false, + "fail": true, + "append": "" + }, + { + "name": "matches non-ascii", + "tok": "%{a}࿏%{b} %{c}", + "msg": "⟳༒࿏༒⟲ 子", + "expected": { + "a": "⟳༒", + "b": "༒⟲", + "c": "子" + }, + "skip": false, + "fail": false, + "append": "" + } + +] \ No newline at end of file diff --git a/modules/ingest-common/build.gradle b/modules/ingest-common/build.gradle index 4f35bbee28dfc..1681258e7c7ee 100644 --- a/modules/ingest-common/build.gradle +++ b/modules/ingest-common/build.gradle @@ -26,6 +26,7 @@ esplugin { dependencies { compileOnly project(':modules:lang-painless') compile project(':libs:grok') + compile project(':libs:dissect') } compileJava.options.compilerArgs << "-Xlint:-unchecked,-rawtypes" diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/DissectProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/DissectProcessor.java new file mode 100644 index 0000000000000..58f04ccdd431f --- /dev/null +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/DissectProcessor.java @@ -0,0 +1,76 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.dissect.DissectParser; +import org.elasticsearch.ingest.AbstractProcessor; +import org.elasticsearch.ingest.ConfigurationUtils; +import org.elasticsearch.ingest.IngestDocument; +import org.elasticsearch.ingest.Processor; + +import java.util.Map; + +public final class DissectProcessor extends AbstractProcessor { + + public static final String TYPE = "dissect"; + //package private members for testing + final String field; + final boolean ignoreMissing; + final String pattern; + final String appendSeparator; + final DissectParser dissectParser; + + DissectProcessor(String tag, String field, String pattern, String appendSeparator, boolean ignoreMissing) { + super(tag); + this.field = field; + this.ignoreMissing = ignoreMissing; + this.pattern = pattern; + this.appendSeparator = appendSeparator; + this.dissectParser = new DissectParser(pattern, appendSeparator); + } + + @Override + public void execute(IngestDocument ingestDocument) { + String input = ingestDocument.getFieldValue(field, String.class, ignoreMissing); + if (input == null && ignoreMissing) { + return; + } else if (input == null) { + throw new IllegalArgumentException("field [" + field + "] is null, cannot process it."); + } + dissectParser.parse(input).forEach(ingestDocument::setFieldValue); + } + + @Override + public String getType() { + return TYPE; + } + + public static final class Factory implements Processor.Factory { + + @Override + public DissectProcessor create(Map registry, String processorTag, Map config) { + String field = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field"); + String pattern = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "pattern"); + String appendSeparator = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "append_separator", ""); + boolean ignoreMissing = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false); + return new DissectProcessor(processorTag, field, pattern, appendSeparator, ignoreMissing); + } + } +} diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java index 78cb0416108ff..8b048282814ea 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java @@ -83,6 +83,7 @@ public Map getProcessors(Processor.Parameters paramet processors.put(URLDecodeProcessor.TYPE, new URLDecodeProcessor.Factory()); processors.put(BytesProcessor.TYPE, new BytesProcessor.Factory()); processors.put(PipelineProcessor.TYPE, new PipelineProcessor.Factory(parameters.ingestService)); + processors.put(DissectProcessor.TYPE, new DissectProcessor.Factory()); return Collections.unmodifiableMap(processors); } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorFactoryTests.java new file mode 100644 index 0000000000000..ba1b2bd1eb576 --- /dev/null +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorFactoryTests.java @@ -0,0 +1,92 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.dissect.DissectException; +import org.elasticsearch.ingest.RandomDocumentPicks; +import org.elasticsearch.test.ESTestCase; +import org.hamcrest.Matchers; + +import java.util.HashMap; +import java.util.Map; + +import static org.hamcrest.CoreMatchers.equalTo; +import static org.hamcrest.CoreMatchers.notNullValue; +import static org.hamcrest.Matchers.is; + +public class DissectProcessorFactoryTests extends ESTestCase { + + public void testCreate() { + DissectProcessor.Factory factory = new DissectProcessor.Factory(); + String fieldName = RandomDocumentPicks.randomFieldName(random()); + String processorTag = randomAlphaOfLength(10); + String pattern = "%{a},%{b},%{c}"; + String appendSeparator = ":"; + + Map config = new HashMap<>(); + config.put("field", fieldName); + config.put("pattern", pattern); + config.put("append_separator", appendSeparator); + config.put("ignore_missing", true); + + DissectProcessor processor = factory.create(null, processorTag, config); + assertThat(processor.getTag(), equalTo(processorTag)); + assertThat(processor.field, equalTo(fieldName)); + assertThat(processor.pattern, equalTo(pattern)); + assertThat(processor.appendSeparator, equalTo(appendSeparator)); + assertThat(processor.dissectParser, is(notNullValue())); + assertThat(processor.ignoreMissing, is(true)); + } + + public void testCreateMissingField() { + DissectProcessor.Factory factory = new DissectProcessor.Factory(); + Map config = new HashMap<>(); + config.put("pattern", "%{a},%{b},%{c}"); + Exception e = expectThrows(ElasticsearchParseException.class, () -> factory.create(null, "_tag", config)); + assertThat(e.getMessage(), Matchers.equalTo("[field] required property is missing")); + } + + public void testCreateMissingPattern() { + DissectProcessor.Factory factory = new DissectProcessor.Factory(); + Map config = new HashMap<>(); + config.put("field", randomAlphaOfLength(10)); + Exception e = expectThrows(ElasticsearchParseException.class, () -> factory.create(null, "_tag", config)); + assertThat(e.getMessage(), Matchers.equalTo("[pattern] required property is missing")); + } + + public void testCreateMissingOptionals() { + DissectProcessor.Factory factory = new DissectProcessor.Factory(); + Map config = new HashMap<>(); + config.put("pattern", "%{a},%{b},%{c}"); + config.put("field", randomAlphaOfLength(10)); + DissectProcessor processor = factory.create(null, "_tag", config); + assertThat(processor.appendSeparator, equalTo("")); + assertThat(processor.ignoreMissing, is(false)); + } + + public void testCreateBadPattern() { + DissectProcessor.Factory factory = new DissectProcessor.Factory(); + Map config = new HashMap<>(); + config.put("pattern", "no keys defined"); + 
config.put("field", randomAlphaOfLength(10)); + expectThrows(DissectException.class, () -> factory.create(null, "_tag", config)); + } +} diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorTests.java new file mode 100644 index 0000000000000..d5fedb7b5abe2 --- /dev/null +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorTests.java @@ -0,0 +1,114 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.common.collect.MapBuilder; +import org.elasticsearch.dissect.DissectException; +import org.elasticsearch.ingest.IngestDocument; +import org.elasticsearch.ingest.Processor; +import org.elasticsearch.ingest.RandomDocumentPicks; +import org.elasticsearch.test.ESTestCase; +import org.hamcrest.CoreMatchers; + +import java.util.Collections; +import java.util.HashMap; + +import static org.elasticsearch.ingest.IngestDocumentMatcher.assertIngestDocument; +import static org.hamcrest.Matchers.equalTo; + +/** + * Basic tests for the {@link DissectProcessor}. See the {@link org.elasticsearch.dissect.DissectParser} test suite for a comprehensive + * set of dissect tests. 
+ */ +public class DissectProcessorTests extends ESTestCase { + + public void testMatch() { + IngestDocument ingestDocument = new IngestDocument("_index", "_type", "_id", null, null, null, null, + Collections.singletonMap("message", "foo,bar,baz")); + DissectProcessor dissectProcessor = new DissectProcessor("", "message", "%{a},%{b},%{c}", "", true); + dissectProcessor.execute(ingestDocument); + assertThat(ingestDocument.getFieldValue("a", String.class), equalTo("foo")); + assertThat(ingestDocument.getFieldValue("b", String.class), equalTo("bar")); + assertThat(ingestDocument.getFieldValue("c", String.class), equalTo("baz")); + } + + public void testMatchOverwrite() { + IngestDocument ingestDocument = new IngestDocument("_index", "_type", "_id", null, null, null, null, + MapBuilder.newMapBuilder() + .put("message", "foo,bar,baz") + .put("a", "willgetstompped") + .map()); + assertThat(ingestDocument.getFieldValue("a", String.class), equalTo("willgetstompped")); + DissectProcessor dissectProcessor = new DissectProcessor("", "message", "%{a},%{b},%{c}", "", true); + dissectProcessor.execute(ingestDocument); + assertThat(ingestDocument.getFieldValue("a", String.class), equalTo("foo")); + assertThat(ingestDocument.getFieldValue("b", String.class), equalTo("bar")); + assertThat(ingestDocument.getFieldValue("c", String.class), equalTo("baz")); + } + + public void testAdvancedMatch() { + IngestDocument ingestDocument = new IngestDocument("_index", "_type", "_id", null, null, null, null, + Collections.singletonMap("message", "foo bar,,,,,,,baz nope:notagain 😊 🐇 🙃")); + DissectProcessor dissectProcessor = + new DissectProcessor("", "message", "%{a->} %{*b->},%{&b} %{}:%{?skipme} %{+smile/2} 🐇 %{+smile/1}", "::::", true); + dissectProcessor.execute(ingestDocument); + assertThat(ingestDocument.getFieldValue("a", String.class), equalTo("foo")); + assertThat(ingestDocument.getFieldValue("bar", String.class), equalTo("baz")); + expectThrows(IllegalArgumentException.class, () -> ingestDocument.getFieldValue("nope", String.class)); + expectThrows(IllegalArgumentException.class, () -> ingestDocument.getFieldValue("notagain", String.class)); + assertThat(ingestDocument.getFieldValue("smile", String.class), equalTo("🙃::::😊")); + } + + public void testMiss() { + IngestDocument ingestDocument = new IngestDocument("_index", "_type", "_id", null, null, null, null, + Collections.singletonMap("message", "foo:bar,baz")); + DissectProcessor dissectProcessor = new DissectProcessor("", "message", "%{a},%{b},%{c}", "", true); + DissectException e = expectThrows(DissectException.class, () -> dissectProcessor.execute(ingestDocument)); + assertThat(e.getMessage(), CoreMatchers.containsString("Unable to find match for dissect pattern")); + } + + public void testNonStringValueWithIgnoreMissing() { + String fieldName = RandomDocumentPicks.randomFieldName(random()); + Processor processor = new DissectProcessor("", fieldName, "%{a},%{b},%{c}", "", true); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>()); + ingestDocument.setFieldValue(fieldName, randomInt()); + Exception e = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); + assertThat(e.getMessage(), equalTo("field [" + fieldName + "] of type [java.lang.Integer] cannot be cast to [java.lang.String]")); + } + + public void testNullValueWithIgnoreMissing() throws Exception { + String fieldName = RandomDocumentPicks.randomFieldName(random()); + Processor processor = new DissectProcessor("", 
fieldName, "%{a},%{b},%{c}", "", true); + IngestDocument originalIngestDocument = RandomDocumentPicks + .randomIngestDocument(random(), Collections.singletonMap(fieldName, null)); + IngestDocument ingestDocument = new IngestDocument(originalIngestDocument); + processor.execute(ingestDocument); + assertIngestDocument(originalIngestDocument, ingestDocument); + } + + public void testNullValueWithOutIgnoreMissing() { + String fieldName = RandomDocumentPicks.randomFieldName(random()); + Processor processor = new DissectProcessor("", fieldName, "%{a},%{b},%{c}", "", false); + IngestDocument originalIngestDocument = RandomDocumentPicks + .randomIngestDocument(random(), Collections.singletonMap(fieldName, null)); + IngestDocument ingestDocument = new IngestDocument(originalIngestDocument); + expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); + } +} diff --git a/modules/ingest-common/src/test/resources/rest-api-spec/test/ingest/10_basic.yml b/modules/ingest-common/src/test/resources/rest-api-spec/test/ingest/10_basic.yml index c6f9860c1efcc..a102634db2f4b 100644 --- a/modules/ingest-common/src/test/resources/rest-api-spec/test/ingest/10_basic.yml +++ b/modules/ingest-common/src/test/resources/rest-api-spec/test/ingest/10_basic.yml @@ -14,22 +14,23 @@ - match: { nodes.$master.ingest.processors.2.type: convert } - match: { nodes.$master.ingest.processors.3.type: date } - match: { nodes.$master.ingest.processors.4.type: date_index_name } - - match: { nodes.$master.ingest.processors.5.type: dot_expander } - - match: { nodes.$master.ingest.processors.6.type: fail } - - match: { nodes.$master.ingest.processors.7.type: foreach } - - match: { nodes.$master.ingest.processors.8.type: grok } - - match: { nodes.$master.ingest.processors.9.type: gsub } - - match: { nodes.$master.ingest.processors.10.type: join } - - match: { nodes.$master.ingest.processors.11.type: json } - - match: { nodes.$master.ingest.processors.12.type: kv } - - match: { nodes.$master.ingest.processors.13.type: lowercase } - - match: { nodes.$master.ingest.processors.14.type: pipeline } - - match: { nodes.$master.ingest.processors.15.type: remove } - - match: { nodes.$master.ingest.processors.16.type: rename } - - match: { nodes.$master.ingest.processors.17.type: script } - - match: { nodes.$master.ingest.processors.18.type: set } - - match: { nodes.$master.ingest.processors.19.type: sort } - - match: { nodes.$master.ingest.processors.20.type: split } - - match: { nodes.$master.ingest.processors.21.type: trim } - - match: { nodes.$master.ingest.processors.22.type: uppercase } + - match: { nodes.$master.ingest.processors.5.type: dissect } + - match: { nodes.$master.ingest.processors.6.type: dot_expander } + - match: { nodes.$master.ingest.processors.7.type: fail } + - match: { nodes.$master.ingest.processors.8.type: foreach } + - match: { nodes.$master.ingest.processors.9.type: grok } + - match: { nodes.$master.ingest.processors.10.type: gsub } + - match: { nodes.$master.ingest.processors.11.type: join } + - match: { nodes.$master.ingest.processors.12.type: json } + - match: { nodes.$master.ingest.processors.13.type: kv } + - match: { nodes.$master.ingest.processors.14.type: lowercase } + - match: { nodes.$master.ingest.processors.15.type: pipeline } + - match: { nodes.$master.ingest.processors.16.type: remove } + - match: { nodes.$master.ingest.processors.17.type: rename } + - match: { nodes.$master.ingest.processors.18.type: script } + - match: { nodes.$master.ingest.processors.19.type: set } + - match: 
{ nodes.$master.ingest.processors.20.type: sort } + - match: { nodes.$master.ingest.processors.21.type: split } + - match: { nodes.$master.ingest.processors.22.type: trim } + - match: { nodes.$master.ingest.processors.23.type: uppercase } diff --git a/modules/ingest-common/src/test/resources/rest-api-spec/test/ingest/200_dissect_processor.yml b/modules/ingest-common/src/test/resources/rest-api-spec/test/ingest/200_dissect_processor.yml new file mode 100644 index 0000000000000..1a7c2e593d43c --- /dev/null +++ b/modules/ingest-common/src/test/resources/rest-api-spec/test/ingest/200_dissect_processor.yml @@ -0,0 +1,90 @@ +--- +teardown: +- do: + ingest.delete_pipeline: + id: "my_pipeline" + ignore: 404 + +--- +"Test dissect processor match": +- do: + ingest.put_pipeline: + id: "my_pipeline" + body: > + { + "description": "_description", + "processors": [ + { + "dissect" : { + "field" : "message", + "pattern" : "%{a} %{b} %{c}" + } + } + ] + } +- match: { acknowledged: true } + +- do: + index: + index: test + type: test + id: 1 + pipeline: "my_pipeline" + body: {message: "foo bar baz"} + +- do: + get: + index: test + type: test + id: 1 +- match: { _source.message: "foo bar baz" } +- match: { _source.a: "foo" } +- match: { _source.b: "bar" } +- match: { _source.c: "baz" } +--- +"Test dissect processor mismatch": +- do: + ingest.put_pipeline: + id: "my_pipeline" + body: > + { + "description": "_description", + "processors": [ + { + "dissect" : { + "field" : "message", + "pattern" : "%{a},%{b},%{c}" + } + } + ] + } +- match: { acknowledged: true } + +- do: + catch: '/Unable to find match for dissect pattern: \%\{a\},\%\{b\},\%\{c\} against source: foo bar baz/' + index: + index: test + type: test + id: 2 + pipeline: "my_pipeline" + body: {message: "foo bar baz"} + +--- +"Test fail to create dissect processor": +- do: + catch: '/Unable to parse pattern/' + ingest.put_pipeline: + id: "my_pipeline" + body: > + { + "description": "_description", + "processors": [ + { + "dissect" : { + "field" : "message", + "pattern" : "bad pattern" + } + } + ] + } +
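
For reference, a minimal usage sketch of the dissect library added by this change, for trying the parser on its own outside an ingest pipeline. It uses only calls that appear in the patch above (the `DissectParser` constructor taking a pattern plus an optional append separator, and `parse(String)`); the `Map<String, String>` result type is inferred from how `DissectProcessor` and the tests consume the parsed output, and the class name `DissectQuickStart` is purely illustrative.

[source,java]
--------------------------------------------------
import org.elasticsearch.dissect.DissectParser;

import java.util.Map;

public class DissectQuickStart {
    public static void main(String[] args) {
        // The pattern from the tests above: three keys (a, b, c) separated by a
        // space and a comma. A null append separator keeps the default behavior
        // of concatenating appended values directly.
        DissectParser parser = new DissectParser("%{a} %{b},%{c}", null);

        // Parsing returns a key/value pairing: a=foo, b=bar, c=baz.
        Map<String, String> results = parser.parse("foo bar,baz");
        results.forEach((key, value) -> System.out.println(key + "=" + value));

        // Append keys (+) concatenate their values using the separator passed to
        // the constructor, so this prints "foo-bar-baz".
        DissectParser appending = new DissectParser("%{a} %{+a} %{+a}", "-");
        System.out.println(appending.parse("foo bar baz").get("a"));
    }
}
--------------------------------------------------

Inside a pipeline, the same pattern is supplied through the processor's `field`, `pattern`, and optional `append_separator` and `ignore_missing` options, as exercised by `200_dissect_processor.yml` above.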