Skip to content

Commit

Permalink
ingest: Dissect processor and lib for 6x (#33422)
Browse files Browse the repository at this point in the history
* Introduce the dissect library (#32297)

The dissect library will be used for the ingest node as an alternative
to Grok to split a string based on a pattern. Dissect differs from
Grok such that regular expressions are not used to split the string.
Note - Regular expressions are used during construction of the
objects, but not in the hot path.

A dissect pattern takes the form of: '%{a} %{b},%{c}' which is
composed of 3 keys (a,b,c) and two delimiters (space and comma).
This dissect pattern will match a string of the form: 'foo bar,baz'
and will result in a key/value pairing of 'a=foo, b=bar, and c=baz'.
See the comments in DissectParser for a full explanation.

This commit does not include the ingest node processor that will consume
it. However, the consumption should be a trivial mapping between the
key/value pairing returned by the parser and the key/value pairing
needed for the IngestDocument.

* ingest: Introduce the dissect processor (#32884)

The ingest node dissect processor is an alternative to Grok
to split a string based on a pattern. Dissect differs from
Grok such that regular expressions are not used to split the
string.

Dissect can be used to parse a source text field with a
simpler pattern, and is often faster than Grok for basic string
parsing. This processor uses the dissect library which
does most of the work.

* ingest: minor - update test to include dissect (#33211)

This change also includes placing the bytes processor in the correct
order (helps to avoid merge conflict when back patching processors)
  • Loading branch information
jakelandis authored Sep 5, 2018
1 parent 6357a57 commit 0ed19dc
Show file tree
Hide file tree
Showing 19 changed files with 2,541 additions and 137 deletions.
431 changes: 312 additions & 119 deletions docs/reference/ingest/ingest-node.asciidoc

Large diffs are not rendered by default.

50 changes: 50 additions & 0 deletions libs/dissect/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import org.elasticsearch.gradle.precommit.PrecommitTasks

/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

// Base name used for the archives (jar) this project produces.
archivesBaseName = 'elasticsearch-dissect'

dependencies {
// The test framework is added everywhere except inside Eclipse, where it is only added for the
// separate ":libs:dissect-tests" project (see the Eclipse source set handling at the bottom of this file).
if (isEclipse == false || project.path == ":libs:dissect-tests") {
testCompile("org.elasticsearch.test:framework:${version}") {
// Keep the framework from bringing in its own 'dissect' module.
// NOTE(review): presumably this avoids a circular/duplicate dependency on this very library - confirm.
exclude group: 'org.elasticsearch', module: 'dissect'
}
}
// Jackson is needed by the tests only; all three artifacts share the version from versions.jackson.
testCompile "com.fasterxml.jackson.core:jackson-core:${versions.jackson}"
testCompile("com.fasterxml.jackson.core:jackson-annotations:${versions.jackson}")
testCompile("com.fasterxml.jackson.core:jackson-databind:${versions.jackson}")
}

// Replace the signature files checked by the forbidden APIs task for the main source set with 'jdk-signatures'.
// NOTE(review): presumably because this library does not depend on the Elasticsearch server classes - confirm.
forbiddenApisMain {
replaceSignatureFiles 'jdk-signatures'
}

if (isEclipse) {
// in eclipse the project is under a fake root, we need to change around the source sets
sourceSets {
// The ":libs:dissect" project holds the main sources; any other project applying this script
// (the tests project) holds the test sources.
if (project.path == ":libs:dissect") {
main.java.srcDirs = ['java']
main.resources.srcDirs = ['resources']
} else {
test.java.srcDirs = ['java']
test.resources.srcDirs = ['resources']
}
}
}
3 changes: 3 additions & 0 deletions libs/dissect/src/main/eclipse-build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

// this is just a shell gradle file for eclipse to have separate projects for dissect src and tests
apply from: '../../build.gradle'
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.dissect;

/**
 * Common base type for every exception raised by the dissect library. Callers can catch this type to handle any dissect failure,
 * or catch one of the nested subclasses to react to a specific kind of failure.
 */
public abstract class DissectException extends RuntimeException {
    DissectException(String message) {
        super(message);
    }

    /**
     * Assembles an exception message out of its four parts, in order.
     *
     * @param lead    the fixed text that opens the message
     * @param subject the value the failure is about (a pattern or a key)
     * @param joiner  the fixed text placed between subject and detail
     * @param detail  the reason, or the text the subject was applied to
     * @return the concatenated message; {@code null} parts are rendered as the text "null"
     */
    private static String describe(String lead, String subject, String joiner, String detail) {
        StringBuilder text = new StringBuilder(lead);
        text.append(subject);
        text.append(joiner);
        text.append(detail);
        return text.toString();
    }

    /**
     * Raised when a dissect pattern cannot be parsed.
     */
    static class PatternParse extends DissectException {
        PatternParse(String pattern, String reason) {
            super(describe("Unable to parse pattern: ", pattern, " Reason: ", reason));
        }
    }

    /**
     * Raised when a single dissect key cannot be parsed.
     */
    static class KeyParse extends DissectException {
        KeyParse(String key, String reason) {
            super(describe("Unable to parse key: ", key, " Reason: ", reason));
        }
    }

    /**
     * Raised when the source string does not match the dissect pattern.
     */
    static class FindMatch extends DissectException {
        FindMatch(String pattern, String source) {
            super(describe("Unable to find match for dissect pattern: ", pattern, " against source: ", source));
        }
    }
}
191 changes: 191 additions & 0 deletions libs/dissect/src/main/java/org/elasticsearch/dissect/DissectKey.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.dissect;

import java.util.EnumSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * <p>A Key of a dissect pattern. This class models the name and modifiers and provides some validation.</p>
 * <p>For dissect pattern of {@code %{a} %{+a} %{b}} the dissect keys are:
 * <ul>
 * <li>{@code a}</li>
 * <li>{@code +a}</li>
 * <li>{@code b}</li>
 * </ul>
 * This class represents a single key.
 * <p>A single key is composed of a name and its modifiers. For the key {@code +a}, {@code a} is the name and {@code +} is the modifier.
 * @see DissectParser
 */
public final class DissectKey {
    // one leading modifier character, the name, and an optional trailing "->" (skip right padding)
    private static final Pattern LEFT_MODIFIER_PATTERN = Pattern.compile("([+*&?])(.*?)(->)?$", Pattern.DOTALL);
    // no leading modifier: just the name and an optional trailing "->"
    private static final Pattern RIGHT_PADDING_PATTERN = Pattern.compile("^(.*?)(->)?$", Pattern.DOTALL);
    // "+name/N" with an optional trailing "->", where N is the append position
    private static final Pattern APPEND_WITH_ORDER_PATTERN = Pattern.compile("[+](.*?)(/)([0-9]+)(->)?$", Pattern.DOTALL);
    private final Modifier modifier;
    private boolean skip;
    private boolean skipRightPadding;
    private int appendPosition;
    private String name;

    /**
     * Constructor - parses the String key into its name and modifier(s)
     *
     * @param key The key without the leading <code>%{</code> or trailing <code>}</code>, for example {@code a->}.
     *            A {@code null} key is handled exactly like an empty key, i.e. it results in a skip key with an empty name.
     * @throws DissectException.KeyParse if more than one modifier is present or no key name can be determined
     * @throws NumberFormatException if the append position of a {@code +name/N} key does not fit into a {@code short}
     */
    DissectKey(String key) {
        // The skip flag below has always considered null, but the matchers do not accept null.
        // Normalizing here makes a null key behave like the empty key instead of failing with a NullPointerException.
        final String text = key == null ? "" : key;
        skip = text.isEmpty();
        modifier = Modifier.findModifier(text);
        final Matcher matcher;
        switch (modifier) {
            case NONE:
                matcher = RIGHT_PADDING_PATTERN.matcher(text);
                while (matcher.find()) {
                    name = matcher.group(1);
                    skipRightPadding = matcher.group(2) != null;
                }
                // a key without a name (for example "" or "->") is an unnamed skip key
                skip = name.isEmpty();
                break;
            case NAMED_SKIP:
                matcher = LEFT_MODIFIER_PATTERN.matcher(text);
                while (matcher.find()) {
                    name = matcher.group(2);
                    skipRightPadding = matcher.group(3) != null;
                }
                skip = true;
                break;
            case APPEND:
            case FIELD_NAME:
            case FIELD_VALUE:
                // these three modifiers share the same "<modifier><name>[->]" layout
                matcher = LEFT_MODIFIER_PATTERN.matcher(text);
                while (matcher.find()) {
                    name = matcher.group(2);
                    skipRightPadding = matcher.group(3) != null;
                }
                break;
            case APPEND_WITH_ORDER:
                matcher = APPEND_WITH_ORDER_PATTERN.matcher(text);
                while (matcher.find()) {
                    name = matcher.group(1);
                    // parsed as a short on purpose: positions beyond the short range are rejected with a NumberFormatException
                    appendPosition = Short.parseShort(matcher.group(3));
                    skipRightPadding = matcher.group(4) != null;
                }
                break;
        }

        // name stays null when the pattern for the detected modifier did not match at all
        if (name == null || (name.isEmpty() && !skip)) {
            throw new DissectException.KeyParse(key, "The key name could not be determined");
        }
    }

    /**
     * Copy constructor to explicitly override the modifier.
     * @param key The key to copy (except for the modifier)
     * @param modifier the modifier to use for this copy
     */
    DissectKey(DissectKey key, DissectKey.Modifier modifier){
        this.modifier = modifier;
        this.skipRightPadding = key.skipRightPadding;
        this.skip = key.skip;
        this.name = key.name;
        this.appendPosition = key.appendPosition;
    }

    /** @return the modifier of this key, {@link Modifier#NONE} if the key has none */
    Modifier getModifier() {
        return modifier;
    }

    /** @return true if this is a skip key (unnamed, or named with the {@code ?} modifier) */
    boolean skip() {
        return skip;
    }

    /** @return true if the key ended with {@code ->} */
    boolean skipRightPadding() {
        return skipRightPadding;
    }

    /** @return the position parsed from a {@code +name/N} key, 0 if no position was given */
    int getAppendPosition() {
        return appendPosition;
    }

    /** @return the name of the key without any modifiers; empty for an unnamed skip key */
    String getName() {
        return name;
    }

    //generated
    @Override
    public String toString() {
        return "DissectKey{" +
            "modifier=" + modifier +
            ", skip=" + skip +
            ", appendPosition=" + appendPosition +
            ", name='" + name + '\'' +
            '}';
    }

    /**
     * The modifiers a key can carry, each identified by the single character used in the pattern.
     */
    public enum Modifier {
        NONE(""), APPEND_WITH_ORDER("/"), APPEND("+"), FIELD_NAME("*"), FIELD_VALUE("&"), NAMED_SKIP("?");

        private static final Pattern MODIFIER_PATTERN = Pattern.compile("[/+*&?]");

        private final String modifier;

        @Override
        public String toString() {
            return modifier;
        }

        Modifier(final String modifier) {
            this.modifier = modifier;
        }

        //package private for testing
        static Modifier fromString(String modifier) {
            return EnumSet.allOf(Modifier.class).stream().filter(km -> km.modifier.equals(modifier))
                .findFirst().orElseThrow(() -> new IllegalArgumentException("Found invalid modifier.")); //throw should never happen
        }

        /**
         * Scans the key for modifier characters and returns the modifier that applies.
         * The only combination of two modifier characters that is accepted is {@code +} followed by {@code /},
         * which yields {@link #APPEND_WITH_ORDER}.
         *
         * @param key the raw key, may be null or empty
         * @return the detected modifier, {@link #NONE} when the key is null, empty or has no modifier character
         * @throws DissectException.KeyParse if any other combination of multiple modifier characters is found
         */
        private static Modifier findModifier(String key) {
            Modifier modifier = Modifier.NONE;
            if (key != null && !key.isEmpty()) {
                Matcher matcher = MODIFIER_PATTERN.matcher(key);
                int matches = 0;
                while (matcher.find()) {
                    Modifier priorModifier = modifier;
                    modifier = Modifier.fromString(matcher.group());
                    if (++matches > 1 && !(APPEND.equals(priorModifier) && APPEND_WITH_ORDER.equals(modifier))) {
                        throw new DissectException.KeyParse(key, "multiple modifiers are not allowed.");
                    }
                }
            }
            return modifier;
        }
    }
}
Loading

0 comments on commit 0ed19dc

Please sign in to comment.