From d1a9a14e9583aeab5ae900cfa47323f79fb68fdf Mon Sep 17 00:00:00 2001 From: Jake Landis Date: Tue, 28 Aug 2018 07:11:20 -0700 Subject: [PATCH] ingest: Introduce the dissect processor (#32884) The ingest node dissect processor is an alternative to Grok to split a string based on a pattern. Dissect differs from Grok such that regular expressions are not used to split the string. Dissect can be used to parse a source text field with a simpler pattern, and is often faster the Grok for basic string parsing. This processor uses the dissect library which does most of the work. --- docs/reference/ingest/ingest-node.asciidoc | 431 +++++++++++++----- libs/dissect/build.gradle | 2 +- modules/ingest-common/build.gradle | 1 + .../ingest/common/DissectProcessor.java | 76 +++ .../ingest/common/IngestCommonPlugin.java | 3 +- .../common/DissectProcessorFactoryTests.java | 92 ++++ .../ingest/common/DissectProcessorTests.java | 114 +++++ .../test/ingest/200_dissect_processor.yml | 90 ++++ 8 files changed, 688 insertions(+), 121 deletions(-) create mode 100644 modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/DissectProcessor.java create mode 100644 modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorFactoryTests.java create mode 100644 modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorTests.java create mode 100644 modules/ingest-common/src/test/resources/rest-api-spec/test/ingest/200_dissect_processor.yml diff --git a/docs/reference/ingest/ingest-node.asciidoc b/docs/reference/ingest/ingest-node.asciidoc index 700f72850169d..bf5d291f959d9 100644 --- a/docs/reference/ingest/ingest-node.asciidoc +++ b/docs/reference/ingest/ingest-node.asciidoc @@ -1049,6 +1049,318 @@ understands this to mean `2016-04-01` as is explained in the <>, dissect also extracts structured fields out of a single text field +within a document. 
However unlike the <<grok-processor,Grok Processor>>, dissect does not use +https://en.wikipedia.org/wiki/Regular_expression[Regular Expressions]. This allows dissect's syntax to be simple and for +some cases faster than the <<grok-processor,Grok Processor>>. + +Dissect matches a single text field against a defined pattern. + +For example the following pattern: +[source,txt] +-------------------------------------------------- +%{clientip} %{ident} %{auth} [%{@timestamp}] \"%{verb} %{request} HTTP/%{httpversion}\" %{status} %{size} +-------------------------------------------------- +will match a log line of this format: +[source,txt] +-------------------------------------------------- +1.2.3.4 - - [30/Apr/1998:22:00:52 +0000] \"GET /english/venues/cities/images/montpellier/18.gif HTTP/1.0\" 200 3171 +-------------------------------------------------- +and result in a document with the following fields: +[source,js] +-------------------------------------------------- +"doc": { + "_index": "_index", + "_type": "_type", + "_id": "_id", + "_source": { + "request": "/english/venues/cities/images/montpellier/18.gif", + "auth": "-", + "ident": "-", + "verb": "GET", + "@timestamp": "30/Apr/1998:22:00:52 +0000", + "size": "3171", + "clientip": "1.2.3.4", + "httpversion": "1.0", + "status": "200" + } +} +-------------------------------------------------- +// NOTCONSOLE + +A dissect pattern is defined by the parts of the string that will be discarded. In the example above the first part +to be discarded is a single space. Dissect finds this space, then assigns `clientip` the value of everything up +until that space. +Later dissect matches the `[` and then `]` and then assigns `@timestamp` to everything in-between `[` and `]`. +Paying special attention to the parts of the string to discard will help build successful dissect patterns. + +Successful matches require all keys in a pattern to have a value. 
If any of the `%{keyname}` defined in the pattern do +not have a value, then an exception is thrown and may be handled by the <<handling-failure-in-pipelines,on_failure>> directive. +An empty key `%{}` or a <<dissect-modifier-named-skip-key,named skip key>> can be used to match values, but exclude the value from +the final document. All matched values are represented as string data types. The <<convert-processor,convert processor>> +may be used to convert to the expected data type. + +Dissect also supports <<dissect-key-modifiers,key modifiers>> that can change dissect's default +behavior. For example you can instruct dissect to ignore certain fields, append fields, skip over padding, etc. +See <<dissect-key-modifiers,below>> for more information. + +[[dissect-options]] +.Dissect Options +[options="header"] +|====== +| Name | Required | Default | Description +| `field` | yes | - | The field to dissect +| `pattern` | yes | - | The pattern to apply to the field +| `append_separator`| no | "" (empty string) | The character(s) that separate the appended fields. +| `ignore_missing` | no | false | If `true` and `field` does not exist or is `null`, the processor quietly exits without modifying the document +| ` +|====== + +[source,js] +-------------------------------------------------- +{ + "dissect": { + "field": "message", + "pattern" : "%{clientip} %{ident} %{auth} [%{@timestamp}] \"%{verb} %{request} HTTP/%{httpversion}\" %{status} %{size}" + } +} +-------------------------------------------------- +// NOTCONSOLE +[[dissect-key-modifiers]] +==== Dissect key modifiers +Key modifiers can change the default behavior for dissection. Key modifiers may be found on the left or right +of the `%{keyname}` always inside the `%{` and `}`. For example `%{+keyname ->}` has the append and right padding +modifiers. 
+ +.Dissect Key Modifiers +[options="header"] +|====== +| Modifier | Name | Position | Example | Description | Details +| `->` | Skip right padding | (far) right | `%{keyname1->}` | Skips any repeated characters to the right | <<dissect-modifier-skip-right-padding,link>> +| `+` | Append | left | `%{+keyname} %{+keyname}` | Appends two or more fields together | <<dissect-modifier-append-key,link>> +| `+` with `/n` | Append with order | left and right | `%{+keyname/2} %{+keyname/1}` | Appends two or more fields together in the order specified | <<dissect-modifier-append-key-with-order,link>> +| `?` | Named skip key | left | `%{?ignoreme}` | Skips the matched value in the output. Same behavior as `%{}`| <<dissect-modifier-named-skip-key,link>> +| `*` and `&` | Reference keys | left | `%{*r1} %{&r1}` | Sets the output key as value of `*` and output value of `&` | <<dissect-modifier-reference-keys,link>> +| ` +|====== + +[[dissect-modifier-skip-right-padding]] +===== Right padding modifier (`->`) + +The algorithm that performs the dissection is very strict in that it requires all characters in the pattern to match +the source string. For example, the pattern `%{fookey} %{barkey}` (1 space), will match the string "foo{nbsp}bar" +(1 space), but will not match the string "foo{nbsp}{nbsp}bar" (2 spaces) since the pattern has only 1 space and the +source string has 2 spaces. + +The right padding modifier helps with this case. After adding the right padding modifier, the pattern `%{fookey->} %{barkey}` +will now match "foo{nbsp}bar" (1 space) and "foo{nbsp}{nbsp}bar" (2 spaces) +and even "foo{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}bar" (10 spaces). + +Use the right padding modifier to allow for repetition of the characters after a `%{keyname->}`. + +The right padding modifier may be placed on any key with any other modifiers. It should always be the furthest right +modifier. 
For example: `%{+keyname/1->}` and `%{->}` + +Right padding modifier example +|====== +| *Pattern* | `%{ts->} %{level}` +| *Input* | 1998-08-10T17:15:42,466{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}WARN +| *Result* a| +* ts = 1998-08-10T17:15:42,466 +* level = WARN +|====== + +The right padding modifier may be used with an empty key to help skip unwanted data. For example, the same input string, but wrapped with brackets requires the use of an empty right padded key to achieve the same result. + +Right padding modifier with empty key example +|====== +| *Pattern* | `[%{ts}]%{->}[%{level}]` +| *Input* | [1998-08-10T17:15:42,466]{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}{nbsp}[WARN] +| *Result* a| +* ts = 1998-08-10T17:15:42,466 +* level = WARN +|====== + +===== Append modifier (`+`) +[[dissect-modifier-append-key]] +Dissect supports appending two or more results together for the output. +Values are appended left to right. An append separator can be specified. +In this example the append_separator is defined as a space. + +Append modifier example +|====== +| *Pattern* | `%{+name} %{+name} %{+name} %{+name}` +| *Input* | john jacob jingleheimer schmidt +| *Result* a| +* name = john jacob jingleheimer schmidt +|====== + +===== Append with order modifier (`+` and `/n`) +[[dissect-modifier-append-key-with-order]] +Dissect supports appending two or more results together for the output. +Values are appended based on the order defined (`/n`). An append separator can be specified. +In this example the append_separator is defined as a comma. + +Append with order modifier example +|====== +| *Pattern* | `%{+name/2} %{+name/4} %{+name/3} %{+name/1}` +| *Input* | john jacob jingleheimer schmidt +| *Result* a| +* name = schmidt,john,jingleheimer,jacob +|====== + +===== Named skip key (`?`) +[[dissect-modifier-named-skip-key]] +Dissect supports ignoring matches in the final result. 
This can be done with an empty key `%{}`, but for readability +it may be desired to give that empty key a name. + +Named skip key modifier example +|====== +| *Pattern* | `%{clientip} %{?ident} %{?auth} [%{@timestamp}]` +| *Input* | 1.2.3.4 - - [30/Apr/1998:22:00:52 +0000] +| *Result* a| +* clientip = 1.2.3.4 +* @timestamp = 30/Apr/1998:22:00:52 +0000 +|====== + +===== Reference keys (`*` and `&`) +[[dissect-modifier-reference-keys]] +Dissect supports using parsed values as the key/value pairings for the structured content. Imagine a system that +partially logs in key/value pairs. Reference keys allow you to maintain that key/value relationship. + +Reference key modifier example +|====== +| *Pattern* | `[%{ts}] [%{level}] %{*p1}:%{&p1} %{*p2}:%{&p2}` +| *Input* | [2018-08-10T17:15:42,466] [ERR] ip:1.2.3.4 error:REFUSED +| *Result* a| +* ts = 2018-08-10T17:15:42,466 +* level = ERR +* ip = 1.2.3.4 +* error = REFUSED +|====== + +[[dot-expand-processor]] +=== Dot Expander Processor + +Expands a field with dots into an object field. This processor allows fields +with dots in the name to be accessible by other processors in the pipeline. +Otherwise these <<accessing-data-in-pipelines,fields>> can't be accessed by any processor. + +[[dot-expender-options]] +.Dot Expand Options +[options="header"] +|====== +| Name | Required | Default | Description +| `field` | yes | - | The field to expand into an object field +| `path` | no | - | The field that contains the field to expand. Only required if the field to expand is part of another object field, because the `field` option can only understand leaf fields. 
+|====== + +[source,js] +-------------------------------------------------- +{ + "dot_expander": { + "field": "foo.bar" + } +} +-------------------------------------------------- +// NOTCONSOLE + +For example the dot expand processor would turn this document: + +[source,js] +-------------------------------------------------- +{ + "foo.bar" : "value" +} +-------------------------------------------------- +// NOTCONSOLE + +into: + +[source,js] +-------------------------------------------------- +{ + "foo" : { + "bar" : "value" + } +} +-------------------------------------------------- +// NOTCONSOLE + +If there is already a `bar` field nested under `foo` then +this processor merges the `foo.bar` field into it. If the field is +a scalar value then it will turn that field into an array field. + +For example, the following document: + +[source,js] +-------------------------------------------------- +{ + "foo.bar" : "value2", + "foo" : { + "bar" : "value1" + } +} +-------------------------------------------------- +// NOTCONSOLE + +is transformed by the `dot_expander` processor into: + +[source,js] +-------------------------------------------------- +{ + "foo" : { + "bar" : ["value1", "value2"] + } +} +-------------------------------------------------- +// NOTCONSOLE + +If any field outside of the leaf field conflicts with a pre-existing field of the same name, +then that field needs to be renamed first. + +Consider the following document: + +[source,js] +-------------------------------------------------- +{ + "foo": "value1", + "foo.bar": "value2" +} +-------------------------------------------------- +// NOTCONSOLE + +Then the `foo` needs to be renamed first before the `dot_expander` +processor is applied. 
So in order for the `foo.bar` field to properly +be expanded into the `bar` field under the `foo` field the following +pipeline should be used: + +[source,js] +-------------------------------------------------- +{ + "processors" : [ + { + "rename" : { + "field" : "foo", + "target_field" : "foo.bar" + } + }, + { + "dot_expander": { + "field": "foo.bar" + } + } + ] +} +-------------------------------------------------- +// NOTCONSOLE + +The reason for this is that Ingest doesn't know how to automatically cast +a scalar field to an object field. + [[fail-processor]] === Fail Processor Raises an exception. This is useful for when @@ -2058,125 +2370,6 @@ Converts a string to its uppercase equivalent. -------------------------------------------------- // NOTCONSOLE -[[dot-expand-processor]] -=== Dot Expander Processor - -Expands a field with dots into an object field. This processor allows fields -with dots in the name to be accessible by other processors in the pipeline. -Otherwise these <<accessing-data-in-pipelines,fields>> can't be accessed by any processor. - -[[dot-expender-options]] -.Dot Expand Options -[options="header"] -|====== -| Name | Required | Default | Description -| `field` | yes | - | The field to expand into an object field -| `path` | no | - | The field that contains the field to expand. Only required if the field to expand is part another object field, because the `field` option can only understand leaf fields. 
-|====== - -[source,js] --------------------------------------------------- -{ - "dot_expander": { - "field": "foo.bar" - } -} --------------------------------------------------- -// NOTCONSOLE - -For example the dot expand processor would turn this document: - -[source,js] --------------------------------------------------- -{ - "foo.bar" : "value" -} --------------------------------------------------- -// NOTCONSOLE - -into: - -[source,js] --------------------------------------------------- -{ - "foo" : { - "bar" : "value" - } -} --------------------------------------------------- -// NOTCONSOLE - -If there is already a `bar` field nested under `foo` then -this processor merges the `foo.bar` field into it. If the field is -a scalar value then it will turn that field into an array field. - -For example, the following document: - -[source,js] --------------------------------------------------- -{ - "foo.bar" : "value2", - "foo" : { - "bar" : "value1" - } -} --------------------------------------------------- -// NOTCONSOLE - -is transformed by the `dot_expander` processor into: - -[source,js] --------------------------------------------------- -{ - "foo" : { - "bar" : ["value1", "value2"] - } -} --------------------------------------------------- -// NOTCONSOLE - -If any field outside of the leaf field conflicts with a pre-existing field of the same name, -then that field needs to be renamed first. - -Consider the following document: - -[source,js] --------------------------------------------------- -{ - "foo": "value1", - "foo.bar": "value2" -} --------------------------------------------------- -// NOTCONSOLE - -Then the `foo` needs to be renamed first before the `dot_expander` -processor is applied. 
So in order for the `foo.bar` field to properly -be expanded into the `bar` field under the `foo` field the following -pipeline should be used: - -[source,js] --------------------------------------------------- -{ - "processors" : [ - { - "rename" : { - "field" : "foo", - "target_field" : "foo.bar"" - } - }, - { - "dot_expander": { - "field": "foo.bar" - } - } - ] -} --------------------------------------------------- -// NOTCONSOLE - -The reason for this is that Ingest doesn't know how to automatically cast -a scalar field to an object field. - [[urldecode-processor]] === URL Decode Processor URL-decodes a string diff --git a/libs/dissect/build.gradle b/libs/dissect/build.gradle index c09a2a4ebd1b3..577f9bdb337a7 100644 --- a/libs/dissect/build.gradle +++ b/libs/dissect/build.gradle @@ -33,7 +33,7 @@ dependencies { } forbiddenApisMain { - signaturesURLs = [PrecommitTasks.getResource('/forbidden/jdk-signatures.txt')] + replaceSignatureFiles 'jdk-signatures' } if (isEclipse) { diff --git a/modules/ingest-common/build.gradle b/modules/ingest-common/build.gradle index 4f35bbee28dfc..1681258e7c7ee 100644 --- a/modules/ingest-common/build.gradle +++ b/modules/ingest-common/build.gradle @@ -26,6 +26,7 @@ esplugin { dependencies { compileOnly project(':modules:lang-painless') compile project(':libs:grok') + compile project(':libs:dissect') } compileJava.options.compilerArgs << "-Xlint:-unchecked,-rawtypes" diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/DissectProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/DissectProcessor.java new file mode 100644 index 0000000000000..58f04ccdd431f --- /dev/null +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/DissectProcessor.java @@ -0,0 +1,76 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.dissect.DissectParser; +import org.elasticsearch.ingest.AbstractProcessor; +import org.elasticsearch.ingest.ConfigurationUtils; +import org.elasticsearch.ingest.IngestDocument; +import org.elasticsearch.ingest.Processor; + +import java.util.Map; + +public final class DissectProcessor extends AbstractProcessor { + + public static final String TYPE = "dissect"; + //package private members for testing + final String field; + final boolean ignoreMissing; + final String pattern; + final String appendSeparator; + final DissectParser dissectParser; + + DissectProcessor(String tag, String field, String pattern, String appendSeparator, boolean ignoreMissing) { + super(tag); + this.field = field; + this.ignoreMissing = ignoreMissing; + this.pattern = pattern; + this.appendSeparator = appendSeparator; + this.dissectParser = new DissectParser(pattern, appendSeparator); + } + + @Override + public void execute(IngestDocument ingestDocument) { + String input = ingestDocument.getFieldValue(field, String.class, ignoreMissing); + if (input == null && ignoreMissing) { + return; + } else if (input == null) { + throw new IllegalArgumentException("field [" + field + "] is null, cannot process it."); + } + 
dissectParser.parse(input).forEach(ingestDocument::setFieldValue); + } + + @Override + public String getType() { + return TYPE; + } + + public static final class Factory implements Processor.Factory { + + @Override + public DissectProcessor create(Map registry, String processorTag, Map config) { + String field = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field"); + String pattern = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "pattern"); + String appendSeparator = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "append_separator", ""); + boolean ignoreMissing = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false); + return new DissectProcessor(processorTag, field, pattern, appendSeparator, ignoreMissing); + } + } +} diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java index 78cb0416108ff..3e0c1c9f455cf 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java @@ -82,7 +82,8 @@ public Map getProcessors(Processor.Parameters paramet processors.put(KeyValueProcessor.TYPE, new KeyValueProcessor.Factory()); processors.put(URLDecodeProcessor.TYPE, new URLDecodeProcessor.Factory()); processors.put(BytesProcessor.TYPE, new BytesProcessor.Factory()); - processors.put(PipelineProcessor.TYPE, new PipelineProcessor.Factory(parameters.ingestService)); + processors.put(PipelineProcessor.TYPE, new PipelineProcessor.Factory(parameters.ingestService)); + processors.put(DissectProcessor.TYPE, new DissectProcessor.Factory()); return Collections.unmodifiableMap(processors); } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorFactoryTests.java 
b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorFactoryTests.java new file mode 100644 index 0000000000000..ba1b2bd1eb576 --- /dev/null +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorFactoryTests.java @@ -0,0 +1,92 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.dissect.DissectException; +import org.elasticsearch.ingest.RandomDocumentPicks; +import org.elasticsearch.test.ESTestCase; +import org.hamcrest.Matchers; + +import java.util.HashMap; +import java.util.Map; + +import static org.hamcrest.CoreMatchers.equalTo; +import static org.hamcrest.CoreMatchers.notNullValue; +import static org.hamcrest.Matchers.is; + +public class DissectProcessorFactoryTests extends ESTestCase { + + public void testCreate() { + DissectProcessor.Factory factory = new DissectProcessor.Factory(); + String fieldName = RandomDocumentPicks.randomFieldName(random()); + String processorTag = randomAlphaOfLength(10); + String pattern = "%{a},%{b},%{c}"; + String appendSeparator = ":"; + + Map config = new HashMap<>(); + config.put("field", fieldName); + config.put("pattern", pattern); + config.put("append_separator", appendSeparator); + config.put("ignore_missing", true); + + DissectProcessor processor = factory.create(null, processorTag, config); + assertThat(processor.getTag(), equalTo(processorTag)); + assertThat(processor.field, equalTo(fieldName)); + assertThat(processor.pattern, equalTo(pattern)); + assertThat(processor.appendSeparator, equalTo(appendSeparator)); + assertThat(processor.dissectParser, is(notNullValue())); + assertThat(processor.ignoreMissing, is(true)); + } + + public void testCreateMissingField() { + DissectProcessor.Factory factory = new DissectProcessor.Factory(); + Map config = new HashMap<>(); + config.put("pattern", "%{a},%{b},%{c}"); + Exception e = expectThrows(ElasticsearchParseException.class, () -> factory.create(null, "_tag", config)); + assertThat(e.getMessage(), Matchers.equalTo("[field] required property is missing")); + } + + public void testCreateMissingPattern() { + DissectProcessor.Factory factory = new DissectProcessor.Factory(); + Map config = new HashMap<>(); + config.put("field", 
randomAlphaOfLength(10)); + Exception e = expectThrows(ElasticsearchParseException.class, () -> factory.create(null, "_tag", config)); + assertThat(e.getMessage(), Matchers.equalTo("[pattern] required property is missing")); + } + + public void testCreateMissingOptionals() { + DissectProcessor.Factory factory = new DissectProcessor.Factory(); + Map config = new HashMap<>(); + config.put("pattern", "%{a},%{b},%{c}"); + config.put("field", randomAlphaOfLength(10)); + DissectProcessor processor = factory.create(null, "_tag", config); + assertThat(processor.appendSeparator, equalTo("")); + assertThat(processor.ignoreMissing, is(false)); + } + + public void testCreateBadPattern() { + DissectProcessor.Factory factory = new DissectProcessor.Factory(); + Map config = new HashMap<>(); + config.put("pattern", "no keys defined"); + config.put("field", randomAlphaOfLength(10)); + expectThrows(DissectException.class, () -> factory.create(null, "_tag", config)); + } +} diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorTests.java new file mode 100644 index 0000000000000..d5fedb7b5abe2 --- /dev/null +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/DissectProcessorTests.java @@ -0,0 +1,114 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.common.collect.MapBuilder; +import org.elasticsearch.dissect.DissectException; +import org.elasticsearch.ingest.IngestDocument; +import org.elasticsearch.ingest.Processor; +import org.elasticsearch.ingest.RandomDocumentPicks; +import org.elasticsearch.test.ESTestCase; +import org.hamcrest.CoreMatchers; + +import java.util.Collections; +import java.util.HashMap; + +import static org.elasticsearch.ingest.IngestDocumentMatcher.assertIngestDocument; +import static org.hamcrest.Matchers.equalTo; + +/** + * Basic tests for the {@link DissectProcessor}. See the {@link org.elasticsearch.dissect.DissectParser} test suite for a comprehensive + * set of dissect tests. 
+ */ +public class DissectProcessorTests extends ESTestCase { + + public void testMatch() { + IngestDocument ingestDocument = new IngestDocument("_index", "_type", "_id", null, null, null, null, + Collections.singletonMap("message", "foo,bar,baz")); + DissectProcessor dissectProcessor = new DissectProcessor("", "message", "%{a},%{b},%{c}", "", true); + dissectProcessor.execute(ingestDocument); + assertThat(ingestDocument.getFieldValue("a", String.class), equalTo("foo")); + assertThat(ingestDocument.getFieldValue("b", String.class), equalTo("bar")); + assertThat(ingestDocument.getFieldValue("c", String.class), equalTo("baz")); + } + + public void testMatchOverwrite() { + IngestDocument ingestDocument = new IngestDocument("_index", "_type", "_id", null, null, null, null, + MapBuilder.newMapBuilder() + .put("message", "foo,bar,baz") + .put("a", "willgetstompped") + .map()); + assertThat(ingestDocument.getFieldValue("a", String.class), equalTo("willgetstompped")); + DissectProcessor dissectProcessor = new DissectProcessor("", "message", "%{a},%{b},%{c}", "", true); + dissectProcessor.execute(ingestDocument); + assertThat(ingestDocument.getFieldValue("a", String.class), equalTo("foo")); + assertThat(ingestDocument.getFieldValue("b", String.class), equalTo("bar")); + assertThat(ingestDocument.getFieldValue("c", String.class), equalTo("baz")); + } + + public void testAdvancedMatch() { + IngestDocument ingestDocument = new IngestDocument("_index", "_type", "_id", null, null, null, null, + Collections.singletonMap("message", "foo bar,,,,,,,baz nope:notagain 😊 🐇 🙃")); + DissectProcessor dissectProcessor = + new DissectProcessor("", "message", "%{a->} %{*b->},%{&b} %{}:%{?skipme} %{+smile/2} 🐇 %{+smile/1}", "::::", true); + dissectProcessor.execute(ingestDocument); + assertThat(ingestDocument.getFieldValue("a", String.class), equalTo("foo")); + assertThat(ingestDocument.getFieldValue("bar", String.class), equalTo("baz")); + expectThrows(IllegalArgumentException.class, () -> 
ingestDocument.getFieldValue("nope", String.class)); + expectThrows(IllegalArgumentException.class, () -> ingestDocument.getFieldValue("notagain", String.class)); + assertThat(ingestDocument.getFieldValue("smile", String.class), equalTo("🙃::::😊")); + } + + public void testMiss() { + IngestDocument ingestDocument = new IngestDocument("_index", "_type", "_id", null, null, null, null, + Collections.singletonMap("message", "foo:bar,baz")); + DissectProcessor dissectProcessor = new DissectProcessor("", "message", "%{a},%{b},%{c}", "", true); + DissectException e = expectThrows(DissectException.class, () -> dissectProcessor.execute(ingestDocument)); + assertThat(e.getMessage(), CoreMatchers.containsString("Unable to find match for dissect pattern")); + } + + public void testNonStringValueWithIgnoreMissing() { + String fieldName = RandomDocumentPicks.randomFieldName(random()); + Processor processor = new DissectProcessor("", fieldName, "%{a},%{b},%{c}", "", true); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>()); + ingestDocument.setFieldValue(fieldName, randomInt()); + Exception e = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); + assertThat(e.getMessage(), equalTo("field [" + fieldName + "] of type [java.lang.Integer] cannot be cast to [java.lang.String]")); + } + + public void testNullValueWithIgnoreMissing() throws Exception { + String fieldName = RandomDocumentPicks.randomFieldName(random()); + Processor processor = new DissectProcessor("", fieldName, "%{a},%{b},%{c}", "", true); + IngestDocument originalIngestDocument = RandomDocumentPicks + .randomIngestDocument(random(), Collections.singletonMap(fieldName, null)); + IngestDocument ingestDocument = new IngestDocument(originalIngestDocument); + processor.execute(ingestDocument); + assertIngestDocument(originalIngestDocument, ingestDocument); + } + + public void testNullValueWithOutIgnoreMissing() { + String fieldName = 
RandomDocumentPicks.randomFieldName(random()); + Processor processor = new DissectProcessor("", fieldName, "%{a},%{b},%{c}", "", false); + IngestDocument originalIngestDocument = RandomDocumentPicks + .randomIngestDocument(random(), Collections.singletonMap(fieldName, null)); + IngestDocument ingestDocument = new IngestDocument(originalIngestDocument); + expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); + } +} diff --git a/modules/ingest-common/src/test/resources/rest-api-spec/test/ingest/200_dissect_processor.yml b/modules/ingest-common/src/test/resources/rest-api-spec/test/ingest/200_dissect_processor.yml new file mode 100644 index 0000000000000..1a7c2e593d43c --- /dev/null +++ b/modules/ingest-common/src/test/resources/rest-api-spec/test/ingest/200_dissect_processor.yml @@ -0,0 +1,90 @@ +--- +teardown: +- do: + ingest.delete_pipeline: + id: "my_pipeline" + ignore: 404 + +--- +"Test dissect processor match": +- do: + ingest.put_pipeline: + id: "my_pipeline" + body: > + { + "description": "_description", + "processors": [ + { + "dissect" : { + "field" : "message", + "pattern" : "%{a} %{b} %{c}" + } + } + ] + } +- match: { acknowledged: true } + +- do: + index: + index: test + type: test + id: 1 + pipeline: "my_pipeline" + body: {message: "foo bar baz"} + +- do: + get: + index: test + type: test + id: 1 +- match: { _source.message: "foo bar baz" } +- match: { _source.a: "foo" } +- match: { _source.b: "bar" } +- match: { _source.c: "baz" } +--- +"Test dissect processor mismatch": +- do: + ingest.put_pipeline: + id: "my_pipeline" + body: > + { + "description": "_description", + "processors": [ + { + "dissect" : { + "field" : "message", + "pattern" : "%{a},%{b},%{c}" + } + } + ] + } +- match: { acknowledged: true } + +- do: + catch: '/Unable to find match for dissect pattern: \%\{a\},\%\{b\},\%\{c\} against source: foo bar baz/' + index: + index: test + type: test + id: 2 + pipeline: "my_pipeline" + body: {message: "foo bar baz"} + 
+--- +"Test fail to create dissect processor": +- do: + catch: '/Unable to parse pattern/' + ingest.put_pipeline: + id: "my_pipeline" + body: > + { + "description": "_description", + "processors": [ + { + "dissect" : { + "field" : "message", + "pattern" : "bad pattern" + } + } + ] + } +