-
Notifications
You must be signed in to change notification settings - Fork 24.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ingest: Dissect processor and lib for 6x (#33422)
* Introduce the dissect library (#32297) The dissect library will be used for the ingest node as an alternative to Grok to split a string based on a pattern. Dissect differs from Grok in that regular expressions are not used to split the string. Note - Regular expressions are used during construction of the objects, but not in the hot path. A dissect pattern takes the form of: '%{a} %{b},%{c}' which is composed of 3 keys (a,b,c) and two delimiters (space and comma). This dissect pattern will match a string of the form: 'foo bar,baz' and will result in a key/value pairing of 'a=foo, b=bar, and c=baz'. See the comments in DissectParser for a full explanation. This commit does not include the ingest node processor that will consume it. However, the consumption should be a trivial mapping between the key/value pairing returned by the parser and the key/value pairing needed for the IngestDocument. * ingest: Introduce the dissect processor (#32884) The ingest node dissect processor is an alternative to Grok to split a string based on a pattern. Dissect differs from Grok in that regular expressions are not used to split the string. Dissect can be used to parse a source text field with a simpler pattern, and is often faster than Grok for basic string parsing. This processor uses the dissect library which does most of the work. * ingest: minor - update test to include dissect (#33211) This change also includes placing the bytes processor in the correct order (helps to avoid merge conflict when back patching processors)
- Loading branch information
1 parent
6357a57
commit 0ed19dc
Showing
19 changed files
with
2,541 additions
and
137 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import org.elasticsearch.gradle.precommit.PrecommitTasks | ||
|
||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
// Build script for the dissect library: names the produced archives, declares test-only dependencies, | ||
// configures the forbidden-APIs check for main sources and adjusts the source sets when built from Eclipse. | ||
archivesBaseName = 'elasticsearch-dissect' | ||
|
||
dependencies { | ||
// The test framework is only added when not running under Eclipse, or for the ':libs:dissect-tests' project. | ||
// The 'dissect' module is excluded from the framework's own dependencies | ||
// (NOTE(review): presumably to avoid a circular dependency on this library - confirm). | ||
if (isEclipse == false || project.path == ":libs:dissect-tests") { | ||
testCompile("org.elasticsearch.test:framework:${version}") { | ||
exclude group: 'org.elasticsearch', module: 'dissect' | ||
} | ||
} | ||
// Jackson is declared for the test configuration only; the version comes from the shared 'versions' map. | ||
testCompile "com.fasterxml.jackson.core:jackson-core:${versions.jackson}" | ||
testCompile("com.fasterxml.jackson.core:jackson-annotations:${versions.jackson}") | ||
testCompile("com.fasterxml.jackson.core:jackson-databind:${versions.jackson}") | ||
} | ||
|
||
// NOTE(review): replaceSignatureFiles is not defined in this script; it appears to swap the signature set | ||
// used by the forbidden-APIs check on main sources for 'jdk-signatures' only - confirm against the build plugin. | ||
forbiddenApisMain { | ||
replaceSignatureFiles 'jdk-signatures' | ||
} | ||
|
||
if (isEclipse) { | ||
// in eclipse the project is under a fake root, we need to change around the source sets | ||
sourceSets { | ||
// ':libs:dissect' maps the main sources; any other project applying this script maps the test sources. | ||
if (project.path == ":libs:dissect") { | ||
main.java.srcDirs = ['java'] | ||
main.resources.srcDirs = ['resources'] | ||
} else { | ||
test.java.srcDirs = ['java'] | ||
test.resources.srcDirs = ['resources'] | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
|
||
// This is just a shell gradle file that lets Eclipse have separate projects for the dissect sources and tests; | ||
// everything else is pulled in from the build.gradle applied below. | ||
apply from: '../../build.gradle' |
57 changes: 57 additions & 0 deletions
57
libs/dissect/src/main/java/org/elasticsearch/dissect/DissectException.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.dissect; | ||
|
||
/**
 * Common base type for every exception raised by the dissect library. Callers can catch this type to handle any
 * dissect failure, or one of the nested subclasses to handle a specific kind of failure.
 */
public abstract class DissectException extends RuntimeException {
    DissectException(String message) {
        super(message);
    }

    /**
     * Assembles a two part message of the form {@code <lead><first><separator><second>}.
     * {@code null} values are rendered as the text {@code null}, exactly like string concatenation would.
     */
    private static String describe(String lead, String first, String separator, String second) {
        final StringBuilder text = new StringBuilder(lead);
        text.append(first);
        text.append(separator);
        text.append(second);
        return text.toString();
    }

    /**
     * Raised when a dissect pattern can not be parsed.
     */
    static class PatternParse extends DissectException {
        PatternParse(String pattern, String reason) {
            super(describe("Unable to parse pattern: ", pattern, " Reason: ", reason));
        }
    }

    /**
     * Raised when a single key of a dissect pattern can not be parsed.
     */
    static class KeyParse extends DissectException {
        KeyParse(String key, String reason) {
            super(describe("Unable to parse key: ", key, " Reason: ", reason));
        }
    }

    /**
     * Raised when the source string does not match the dissect pattern.
     */
    static class FindMatch extends DissectException {
        FindMatch(String pattern, String source) {
            super(describe("Unable to find match for dissect pattern: ", pattern, " against source: ", source));
        }
    }
}
191 changes: 191 additions & 0 deletions
191
libs/dissect/src/main/java/org/elasticsearch/dissect/DissectKey.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.dissect; | ||
|
||
import java.util.EnumSet; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
/** | ||
* <p>A Key of a dissect pattern. This class models the name and modifiers and provides some validation.</p> | ||
* <p>For dissect pattern of {@code %{a} %{+a} %{b}} the dissect keys are: | ||
* <ul> | ||
* <li>{@code a}</li> | ||
* <li>{@code +a}</li> | ||
* <li>{@code b}</li> | ||
* </ul> | ||
* This class represents a single key. | ||
* <p>A single key is composed of a name and it's modifiers. For the key {@code +a}, {@code a} is the name and {@code +} is the modifier. | ||
* @see DissectParser | ||
*/ | ||
public final class DissectKey { | ||
private static final Pattern LEFT_MODIFIER_PATTERN = Pattern.compile("([+*&?])(.*?)(->)?$", Pattern.DOTALL); | ||
private static final Pattern RIGHT_PADDING_PATTERN = Pattern.compile("^(.*?)(->)?$", Pattern.DOTALL); | ||
private static final Pattern APPEND_WITH_ORDER_PATTERN = Pattern.compile("[+](.*?)(/)([0-9]+)(->)?$", Pattern.DOTALL); | ||
private final Modifier modifier; | ||
private boolean skip; | ||
private boolean skipRightPadding; | ||
private int appendPosition; | ||
private String name; | ||
|
||
/** | ||
* Constructor - parses the String key into it's name and modifier(s) | ||
* | ||
* @param key The key without the leading <code>%{</code> or trailing <code>}</code>, for example {@code a->} | ||
*/ | ||
DissectKey(String key) { | ||
skip = key == null || key.isEmpty(); | ||
modifier = Modifier.findModifier(key); | ||
switch (modifier) { | ||
case NONE: | ||
Matcher matcher = RIGHT_PADDING_PATTERN.matcher(key); | ||
while (matcher.find()) { | ||
name = matcher.group(1); | ||
skipRightPadding = matcher.group(2) != null; | ||
} | ||
skip = name.isEmpty(); | ||
break; | ||
case NAMED_SKIP: | ||
matcher = LEFT_MODIFIER_PATTERN.matcher(key); | ||
while (matcher.find()) { | ||
name = matcher.group(2); | ||
skipRightPadding = matcher.group(3) != null; | ||
} | ||
skip = true; | ||
break; | ||
case APPEND: | ||
matcher = LEFT_MODIFIER_PATTERN.matcher(key); | ||
while (matcher.find()) { | ||
name = matcher.group(2); | ||
skipRightPadding = matcher.group(3) != null; | ||
} | ||
break; | ||
case FIELD_NAME: | ||
matcher = LEFT_MODIFIER_PATTERN.matcher(key); | ||
while (matcher.find()) { | ||
name = matcher.group(2); | ||
skipRightPadding = matcher.group(3) != null; | ||
} | ||
break; | ||
case FIELD_VALUE: | ||
matcher = LEFT_MODIFIER_PATTERN.matcher(key); | ||
while (matcher.find()) { | ||
name = matcher.group(2); | ||
skipRightPadding = matcher.group(3) != null; | ||
} | ||
break; | ||
case APPEND_WITH_ORDER: | ||
matcher = APPEND_WITH_ORDER_PATTERN.matcher(key); | ||
while (matcher.find()) { | ||
name = matcher.group(1); | ||
appendPosition = Short.valueOf(matcher.group(3)); | ||
skipRightPadding = matcher.group(4) != null; | ||
} | ||
break; | ||
} | ||
|
||
if (name == null || (name.isEmpty() && !skip)) { | ||
throw new DissectException.KeyParse(key, "The key name could be determined"); | ||
} | ||
} | ||
|
||
/** | ||
* Copy constructor to explicitly override the modifier. | ||
* @param key The key to copy (except for the modifier) | ||
* @param modifier the modifer to use for this copy | ||
*/ | ||
DissectKey(DissectKey key, DissectKey.Modifier modifier){ | ||
this.modifier = modifier; | ||
this.skipRightPadding = key.skipRightPadding; | ||
this.skip = key.skip; | ||
this.name = key.name; | ||
this.appendPosition = key.appendPosition; | ||
} | ||
|
||
Modifier getModifier() { | ||
return modifier; | ||
} | ||
|
||
boolean skip() { | ||
return skip; | ||
} | ||
|
||
boolean skipRightPadding() { | ||
return skipRightPadding; | ||
} | ||
|
||
int getAppendPosition() { | ||
return appendPosition; | ||
} | ||
|
||
String getName() { | ||
return name; | ||
} | ||
|
||
//generated | ||
@Override | ||
public String toString() { | ||
return "DissectKey{" + | ||
"modifier=" + modifier + | ||
", skip=" + skip + | ||
", appendPosition=" + appendPosition + | ||
", name='" + name + '\'' + | ||
'}'; | ||
} | ||
|
||
public enum Modifier { | ||
NONE(""), APPEND_WITH_ORDER("/"), APPEND("+"), FIELD_NAME("*"), FIELD_VALUE("&"), NAMED_SKIP("?"); | ||
|
||
private static final Pattern MODIFIER_PATTERN = Pattern.compile("[/+*&?]"); | ||
|
||
private final String modifier; | ||
|
||
@Override | ||
public String toString() { | ||
return modifier; | ||
} | ||
|
||
Modifier(final String modifier) { | ||
this.modifier = modifier; | ||
} | ||
|
||
//package private for testing | ||
static Modifier fromString(String modifier) { | ||
return EnumSet.allOf(Modifier.class).stream().filter(km -> km.modifier.equals(modifier)) | ||
.findFirst().orElseThrow(() -> new IllegalArgumentException("Found invalid modifier.")); //throw should never happen | ||
} | ||
|
||
private static Modifier findModifier(String key) { | ||
Modifier modifier = Modifier.NONE; | ||
if (key != null && !key.isEmpty()) { | ||
Matcher matcher = MODIFIER_PATTERN.matcher(key); | ||
int matches = 0; | ||
while (matcher.find()) { | ||
Modifier priorModifier = modifier; | ||
modifier = Modifier.fromString(matcher.group()); | ||
if (++matches > 1 && !(APPEND.equals(priorModifier) && APPEND_WITH_ORDER.equals(modifier))) { | ||
throw new DissectException.KeyParse(key, "multiple modifiers are not allowed."); | ||
} | ||
} | ||
} | ||
return modifier; | ||
} | ||
} | ||
} |
Oops, something went wrong.