Skip to content

Commit

Permalink
Support much larger filters (backport of #72277) (#72389)
Browse files Browse the repository at this point in the history
Kibana's long had a problems sending Elasticsearch thousands of fields
to the source filtering API. It's probably not right to send *thousands*
of fields to it, but, in turn, we don't really have a good excuse for
rejecting them. I mean, we do. We reject them to keep ourselves from
allocating too much memory and crashing. But we can do better. This
makes us tollerate significantly larger field lists much more
efficiently.

First, this changes the way we generate the "guts" of the filter API so
that we use a *ton* less memory thanks to Daciuk, Mihov, Watson,
and Watson: https://www.aclweb.org/anthology/J00-1002.pdf
Lucene has an implementation of their algorithm built right in. We just
have to plug it in. With this a list of four thousand fields generates
thirty four thousand automata states instead of hundreds of thousdands.

Next we replace the algorithm that the source filtering uses to match
dotted field names with one that doesn't duplicate the automata we
build. This cuts that thirty four thousand states to seventeen thousand.

Finally we bump the maximum number of automata states we accept from
10,000 to 50,000 which is *comfortable* enough to handle the number of
state need for that four thousand field list. 50,000 state automata will
likely weigh in around a megabyte of heap. Heavy, but fine. Thanks to
Daciuk, et al, you can throw absolutely massive lists of fields at us
before we get that large. I couldn't do it with copy and paste. I don't
have that kind of creativity. I expect folks will shy away from sending
field lists that are megabytes long. But I never expected thousands of
fields, so what do I know.

Those the are enough for to take most of what kibana's historically sent
us without any problems. They are still huge requests and they'd be
better off not sending massive lists, but at least now the problem isn't
so pressing. That four thousand field list was 120kb over the wire.
That's big request!
  • Loading branch information
nik9000 authored Apr 28, 2021
1 parent 16ad6bb commit dc2e63f
Show file tree
Hide file tree
Showing 5 changed files with 4,069 additions and 7 deletions.
24 changes: 23 additions & 1 deletion server/src/main/java/org/elasticsearch/common/regex/Regex.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@

package org.elasticsearch.common.regex;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.Strings;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -57,9 +59,29 @@ public static Automaton simpleMatchToAutomaton(String... patterns) {
if (patterns.length < 1) {
throw new IllegalArgumentException("There must be at least one pattern, zero given");
}

List<BytesRef> simpleStrings = new ArrayList<>();
List<Automaton> automata = new ArrayList<>();
for (String pattern : patterns) {
automata.add(simpleMatchToAutomaton(pattern));
// Strings longer than 1000 characters aren't supported by makeStringUnion
if (isSimpleMatchPattern(pattern) || pattern.length() >= 1000) {
automata.add(simpleMatchToAutomaton(pattern));
} else {
simpleStrings.add(new BytesRef(pattern));
}
}
if (false == simpleStrings.isEmpty()) {
Automaton simpleStringsAutomaton;
if (simpleStrings.size() > 0) {
Collections.sort(simpleStrings);
simpleStringsAutomaton = Automata.makeStringUnion(simpleStrings);
} else {
simpleStringsAutomaton = Automata.makeString(simpleStrings.get(0).utf8ToString());
}
if (automata.isEmpty()) {
return simpleStringsAutomaton;
}
automata.add(simpleStringsAutomaton);
}
return Operations.union(automata);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,24 @@
import org.elasticsearch.common.unit.TimeValue;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;

public class XContentMapValues {
/**
* Maximum number of states allowed in the two automata that we use to
* perform the map filtering. This about a megabyte or so worth of
* automata. That's about eight thousand long-ish source paths. That's
* <strong>heavy</strong> but it shouldn't knock over the node or
* anything.
* <p>
* For what it is worth, 50,000 states is way, way, way too many to
* visualize.
*/
private static final int MAX_DETERMINIZED_STATES = 50_000;

/**
* Extracts raw values (string, int, and so on) based on the path provided returning all of them
Expand Down Expand Up @@ -241,7 +251,7 @@ public static Map<String, Object> filter(Map<String, ?> map, String[] includes,
} else {
Automaton includeA = Regex.simpleMatchToAutomaton(includes);
includeA = makeMatchDotsInFieldNames(includeA);
include = new CharacterRunAutomaton(includeA);
include = new CharacterRunAutomaton(includeA, MAX_DETERMINIZED_STATES);
}

Automaton excludeA;
Expand All @@ -251,7 +261,7 @@ public static Map<String, Object> filter(Map<String, ?> map, String[] includes,
excludeA = Regex.simpleMatchToAutomaton(excludes);
excludeA = makeMatchDotsInFieldNames(excludeA);
}
CharacterRunAutomaton exclude = new CharacterRunAutomaton(excludeA);
CharacterRunAutomaton exclude = new CharacterRunAutomaton(excludeA, MAX_DETERMINIZED_STATES);

// NOTE: We cannot use Operations.minus because of the special case that
// we want all sub properties to match as soon as an object matches
Expand All @@ -266,9 +276,15 @@ public static Map<String, Object> filter(Map<String, ?> map, String[] includes,
* For instance, if the original simple regex is `foo`, this will translate
* it into `foo` OR `foo.*`. */
private static Automaton makeMatchDotsInFieldNames(Automaton automaton) {
return Operations.union(
automaton,
Operations.concatenate(Arrays.asList(automaton, Automata.makeChar('.'), Automata.makeAnyString())));
/*
* We presume `automaton` is quite large compared to the mechanisms
* to match the trailing `.*` bits so we duplicate it only once.
*/
Automaton tail = Operations.union(
Automata.makeEmptyString(),
Operations.concatenate(Automata.makeChar('.'), Automata.makeAnyString())
);
return Operations.concatenate(automaton, tail);
}

private static int step(CharacterRunAutomaton automaton, String key, int state) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@
*/
package org.elasticsearch.common.regex;

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.elasticsearch.test.ESTestCase;

import java.io.IOException;
import java.util.Locale;
import java.util.Random;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -79,4 +82,95 @@ public void testSimpleMatch() {
assertFalse("[" + pattern + "] should not match [" + matchingString + "]", Regex.simpleMatch(pattern, matchingString));
}
}

public void testSimpleMatchToAutomaton() {
assertTrue(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("ddd")).run("ddd"));
assertFalse(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("ddd")).run("Ddd"));
assertTrue(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("d*d*d")).run("dadd"));
assertTrue(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("**ddd")).run("dddd"));
assertFalse(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("**ddd")).run("fff"));
assertTrue(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("fff*ddd")).run("fffabcddd"));
assertTrue(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("fff**ddd")).run("fffabcddd"));
assertFalse(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("fff**ddd")).run("fffabcdd"));
assertTrue(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("fff*******ddd")).run("fffabcddd"));
assertFalse(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("fff******ddd")).run("fffabcdd"));
}

public void testSimpleMatchManyToAutomaton() {
assertMatchesAll(Regex.simpleMatchToAutomaton("ddd", "fff"), "ddd", "fff");
assertMatchesNone(Regex.simpleMatchToAutomaton("ddd", "fff"), "Ddd", "Fff");
assertMatchesAll(Regex.simpleMatchToAutomaton("d*d*d", "cc"), "dadd", "cc");
assertMatchesNone(Regex.simpleMatchToAutomaton("d*d*d", "cc"), "dadc", "Cc");
}

public void testThousands() throws IOException {
String[] patterns = new String[10000];
for (int i = 0; i < patterns.length; i++) {
patterns[i] = Integer.toString(i, Character.MAX_RADIX);
}
Automaton automaton = Regex.simpleMatchToAutomaton(patterns);
CharacterRunAutomaton run = new CharacterRunAutomaton(automaton);
for (String pattern : patterns) {
assertTrue("matches " + pattern, run.run(pattern));
}
for (int i = 0; i < 100000; i++) {
int idx = between(0, Integer.MAX_VALUE);
String pattern = Integer.toString(idx, Character.MAX_RADIX);
if (idx < patterns.length) {
assertTrue("matches " + pattern, run.run(pattern));
} else {
assertFalse("matches " + pattern, run.run(pattern));
}
}
}

public void testThousandsAndPattern() throws IOException {
int patternCount = 10000;
String[] patterns = new String[patternCount + 2];
for (int i = 0; i < patternCount; i++) {
patterns[i] = Integer.toString(i, Character.MAX_RADIX);
}
patterns[patternCount] = "foo*bar";
patterns[patternCount + 1] = "baz*bort";
Automaton automaton = Regex.simpleMatchToAutomaton(patterns);
CharacterRunAutomaton run = new CharacterRunAutomaton(automaton);
for (int i = 0; i < patternCount; i++) {
assertTrue("matches " + patterns[i], run.run(patterns[i]));
}
assertTrue("matches foobar", run.run("foobar"));
assertTrue("matches foostuffbar", run.run("foostuffbar"));
assertTrue("matches bazbort", run.run("bazbort"));
assertTrue("matches bazstuffbort", run.run("bazstuffbort"));
for (int i = 0; i < 100000; i++) {
int idx = between(0, Integer.MAX_VALUE);
String pattern = Integer.toString(idx, Character.MAX_RADIX);
if ((pattern.startsWith("foo") && pattern.endsWith("bar")) || (pattern.startsWith("baz") && pattern.endsWith("bort"))) {
fail();
}
if (idx < patternCount
|| (pattern.startsWith("foo") && pattern.endsWith("bar")) // 948437811
|| (pattern.startsWith("baz") && pattern.endsWith("bort")) // Can't match, but you get the idea
) {
assertTrue("matches " + pattern, run.run(pattern));
} else {
assertFalse("matches " + pattern, run.run(pattern));
}
assertTrue("matches " + pattern, run.run("foo" + pattern + "bar"));
assertTrue("matches " + pattern, run.run("baz" + pattern + "bort"));
}
}

private void assertMatchesAll(Automaton automaton, String... strings) {
CharacterRunAutomaton run = new CharacterRunAutomaton(automaton);
for (String s : strings) {
assertTrue(run.run(s));
}
}

private void assertMatchesNone(Automaton automaton, String... strings) {
CharacterRunAutomaton run = new CharacterRunAutomaton(automaton);
for (String s : strings) {
assertFalse(run.run(s));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,15 @@
import org.elasticsearch.test.ESTestCase;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import org.elasticsearch.common.io.PathUtils;
import java.util.Set;

import static java.util.Collections.emptySet;
import static java.util.Collections.singleton;
import static java.util.stream.Collectors.toSet;

/**
* Tests for {@link XContent} filtering.
Expand Down Expand Up @@ -1416,4 +1421,27 @@ public void testFilterPrefix() throws IOException {
.endObject();
testFilter(expected, sample, singleton("photosCount"), emptySet());
}

public void testManyFilters() throws IOException, URISyntaxException {
Builder deep = builder -> builder.startObject()
.startObject("system")
.startObject("process")
.startObject("cgroup")
.startObject("memory")
.startObject("stats")
.startObject("mapped_file")
.field("bytes", 100)
.endObject()
.endObject()
.endObject()
.endObject()
.endObject()
.endObject()
.endObject();
Set<String> manyFilters = Files.readAllLines(
PathUtils.get(AbstractFilteringTestCase.class.getResource("many_filters.txt").toURI()),
StandardCharsets.UTF_8
).stream().filter(s -> false == s.startsWith("#")).collect(toSet());
testFilter(deep, deep, manyFilters, emptySet());
}
}
Loading

0 comments on commit dc2e63f

Please sign in to comment.