Support much larger filters (backport of #72277) (#72389)

Kibana's long had a problems sending Elasticsearch thousands of fields to the source filtering API. It's probably not right to send *thousands* of fields to it, but, in turn, we don't really have a good excuse for rejecting them. I mean, we do. We reject them to keep ourselves from allocating too much memory and crashing. But we can do better. This makes us tollerate significantly larger field lists much more efficiently. First, this changes the way we generate the "guts" of the filter API so that we use a *ton* less memory thanks to Daciuk, Mihov, Watson, and Watson: https://www.aclweb.org/anthology/J00-1002.pdf Lucene has an implementation of their algorithm built right in. We just have to plug it in. With this a list of four thousand fields generates thirty four thousand automata states instead of hundreds of thousdands. Next we replace the algorithm that the source filtering uses to match dotted field names with one that doesn't duplicate the automata we build. This cuts that thirty four thousand states to seventeen thousand. Finally we bump the maximum number of automata states we accept from 10,000 to 50,000 which is *comfortable* enough to handle the number of state need for that four thousand field list. 50,000 state automata will likely weigh in around a megabyte of heap. Heavy, but fine. Thanks to Daciuk, et al, you can throw absolutely massive lists of fields at us before we get that large. I couldn't do it with copy and paste. I don't have that kind of creativity. I expect folks will shy away from sending field lists that are megabytes long. But I never expected thousands of fields, so what do I know. Those the are enough for to take most of what kibana's historically sent us without any problems. They are still huge requests and they'd be better off not sending massive lists, but at least now the problem isn't so pressing. That four thousand field list was 120kb over the wire. That's big request!
elastic · Apr 28, 2021 · dc2e63f · dc2e63f
1 parent 16ad6bb
commit dc2e63f
Show file tree

Hide file tree

Showing 5 changed files with 4,069 additions and 7 deletions.
diff --git a/server/src/main/java/org/elasticsearch/common/regex/Regex.java b/server/src/main/java/org/elasticsearch/common/regex/Regex.java
@@ -8,12 +8,14 @@
 
 package org.elasticsearch.common.regex;
 
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.Operations;
 import org.elasticsearch.common.Strings;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.Locale;
 import java.util.regex.Pattern;
@@ -57,9 +59,29 @@ public static Automaton simpleMatchToAutomaton(String... patterns) {
         if (patterns.length < 1) {
             throw new IllegalArgumentException("There must be at least one pattern, zero given");
         }
+
+        List<BytesRef> simpleStrings = new ArrayList<>();
         List<Automaton> automata = new ArrayList<>();
         for (String pattern : patterns) {
-            automata.add(simpleMatchToAutomaton(pattern));
+            // Strings longer than 1000 characters aren't supported by makeStringUnion
+            if (isSimpleMatchPattern(pattern) || pattern.length() >= 1000) {
+                automata.add(simpleMatchToAutomaton(pattern));
+            } else {
+                simpleStrings.add(new BytesRef(pattern));
+            }
+        }
+        if (false == simpleStrings.isEmpty()) {
+            Automaton simpleStringsAutomaton;
+            if (simpleStrings.size() > 0) {
+                Collections.sort(simpleStrings);
+                simpleStringsAutomaton = Automata.makeStringUnion(simpleStrings);
+            } else {
+                simpleStringsAutomaton = Automata.makeString(simpleStrings.get(0).utf8ToString());
+            }
+            if (automata.isEmpty()) {
+                return simpleStringsAutomaton;
+            }
+            automata.add(simpleStringsAutomaton);
         }
         return Operations.union(automata);
     }

diff --git a/server/src/main/java/org/elasticsearch/common/xcontent/support/XContentMapValues.java b/server/src/main/java/org/elasticsearch/common/xcontent/support/XContentMapValues.java
@@ -20,14 +20,24 @@
 import org.elasticsearch.common.unit.TimeValue;
 
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.function.Function;
 
 public class XContentMapValues {
+    /**
+     * Maximum number of states allowed in the two automata that we use to
+     * perform the map filtering. This about a megabyte or so worth of
+     * automata. That's about eight thousand long-ish source paths. That's
+     * <strong>heavy</strong> but it shouldn't knock over the node or
+     * anything.
+     * <p>
+     * For what it is worth, 50,000 states is way, way, way too many to
+     * visualize.
+     */
+    private static final int MAX_DETERMINIZED_STATES = 50_000;
 
     /**
      * Extracts raw values (string, int, and so on) based on the path provided returning all of them
@@ -241,7 +251,7 @@ public static Map<String, Object> filter(Map<String, ?> map, String[] includes,
         } else {
             Automaton includeA = Regex.simpleMatchToAutomaton(includes);
             includeA = makeMatchDotsInFieldNames(includeA);
-            include = new CharacterRunAutomaton(includeA);
+            include = new CharacterRunAutomaton(includeA, MAX_DETERMINIZED_STATES);
         }
 
         Automaton excludeA;
@@ -251,7 +261,7 @@ public static Map<String, Object> filter(Map<String, ?> map, String[] includes,
             excludeA = Regex.simpleMatchToAutomaton(excludes);
             excludeA = makeMatchDotsInFieldNames(excludeA);
         }
-        CharacterRunAutomaton exclude = new CharacterRunAutomaton(excludeA);
+        CharacterRunAutomaton exclude = new CharacterRunAutomaton(excludeA, MAX_DETERMINIZED_STATES);
 
         // NOTE: We cannot use Operations.minus because of the special case that
         // we want all sub properties to match as soon as an object matches
@@ -266,9 +276,15 @@ public static Map<String, Object> filter(Map<String, ?> map, String[] includes,
      *  For instance, if the original simple regex is `foo`, this will translate
      *  it into `foo` OR `foo.*`. */
     private static Automaton makeMatchDotsInFieldNames(Automaton automaton) {
-        return Operations.union(
-                automaton,
-                Operations.concatenate(Arrays.asList(automaton, Automata.makeChar('.'), Automata.makeAnyString())));
+        /*
+         * We presume `automaton` is quite large compared to the mechanisms
+         * to match the trailing `.*` bits so we duplicate it only once.
+         */
+        Automaton tail = Operations.union(
+            Automata.makeEmptyString(),
+            Operations.concatenate(Automata.makeChar('.'), Automata.makeAnyString())
+        );
+        return Operations.concatenate(automaton, tail);
     }
 
     private static int step(CharacterRunAutomaton automaton, String key, int state) {

diff --git a/server/src/test/java/org/elasticsearch/common/regex/RegexTests.java b/server/src/test/java/org/elasticsearch/common/regex/RegexTests.java
@@ -7,8 +7,11 @@
  */
 package org.elasticsearch.common.regex;
 
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.elasticsearch.test.ESTestCase;
 
+import java.io.IOException;
 import java.util.Locale;
 import java.util.Random;
 import java.util.regex.Pattern;
@@ -79,4 +82,95 @@ public void testSimpleMatch() {
             assertFalse("[" + pattern + "] should not match [" + matchingString + "]", Regex.simpleMatch(pattern, matchingString));
         }
     }
+
+    public void testSimpleMatchToAutomaton() {
+        assertTrue(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("ddd")).run("ddd"));
+        assertFalse(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("ddd")).run("Ddd"));
+        assertTrue(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("d*d*d")).run("dadd"));
+        assertTrue(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("**ddd")).run("dddd"));
+        assertFalse(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("**ddd")).run("fff"));
+        assertTrue(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("fff*ddd")).run("fffabcddd"));
+        assertTrue(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("fff**ddd")).run("fffabcddd"));
+        assertFalse(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("fff**ddd")).run("fffabcdd"));
+        assertTrue(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("fff*******ddd")).run("fffabcddd"));
+        assertFalse(new CharacterRunAutomaton(Regex.simpleMatchToAutomaton("fff******ddd")).run("fffabcdd"));
+    }
+
+    public void testSimpleMatchManyToAutomaton() {
+        assertMatchesAll(Regex.simpleMatchToAutomaton("ddd", "fff"), "ddd", "fff");
+        assertMatchesNone(Regex.simpleMatchToAutomaton("ddd", "fff"), "Ddd", "Fff");
+        assertMatchesAll(Regex.simpleMatchToAutomaton("d*d*d", "cc"), "dadd", "cc");
+        assertMatchesNone(Regex.simpleMatchToAutomaton("d*d*d", "cc"), "dadc", "Cc");
+    }
+
+    public void testThousands() throws IOException {
+        String[] patterns = new String[10000];
+        for (int i = 0; i < patterns.length; i++) {
+            patterns[i] = Integer.toString(i, Character.MAX_RADIX);
+        }
+        Automaton automaton = Regex.simpleMatchToAutomaton(patterns);
+        CharacterRunAutomaton run = new CharacterRunAutomaton(automaton);
+        for (String pattern : patterns) {
+            assertTrue("matches " + pattern, run.run(pattern));
+        }
+        for (int i = 0; i < 100000; i++) {
+            int idx = between(0, Integer.MAX_VALUE);
+            String pattern = Integer.toString(idx, Character.MAX_RADIX);
+            if (idx < patterns.length) {
+                assertTrue("matches " + pattern, run.run(pattern));
+            } else {
+                assertFalse("matches " + pattern, run.run(pattern));
+            }
+        }
+    }
+
+    public void testThousandsAndPattern() throws IOException {
+        int patternCount = 10000;
+        String[] patterns = new String[patternCount + 2];
+        for (int i = 0; i < patternCount; i++) {
+            patterns[i] = Integer.toString(i, Character.MAX_RADIX);
+        }
+        patterns[patternCount] = "foo*bar";
+        patterns[patternCount + 1] = "baz*bort";
+        Automaton automaton = Regex.simpleMatchToAutomaton(patterns);
+        CharacterRunAutomaton run = new CharacterRunAutomaton(automaton);
+        for (int i = 0; i < patternCount; i++) {
+            assertTrue("matches " + patterns[i], run.run(patterns[i]));
+        }
+        assertTrue("matches foobar", run.run("foobar"));
+        assertTrue("matches foostuffbar", run.run("foostuffbar"));
+        assertTrue("matches bazbort", run.run("bazbort"));
+        assertTrue("matches bazstuffbort", run.run("bazstuffbort"));
+        for (int i = 0; i < 100000; i++) {
+            int idx = between(0, Integer.MAX_VALUE);
+            String pattern = Integer.toString(idx, Character.MAX_RADIX);
+            if ((pattern.startsWith("foo") && pattern.endsWith("bar")) || (pattern.startsWith("baz") && pattern.endsWith("bort"))) {
+                fail();
+            }
+            if (idx < patternCount
+                || (pattern.startsWith("foo") && pattern.endsWith("bar"))  // 948437811
+                || (pattern.startsWith("baz") && pattern.endsWith("bort")) // Can't match, but you get the idea
+            ) {
+                assertTrue("matches " + pattern, run.run(pattern));
+            } else {
+                assertFalse("matches " + pattern, run.run(pattern));
+            }
+            assertTrue("matches " + pattern, run.run("foo" + pattern + "bar"));
+            assertTrue("matches " + pattern, run.run("baz" + pattern + "bort"));
+        }
+    }
+
+    private void assertMatchesAll(Automaton automaton, String... strings) {
+        CharacterRunAutomaton run = new CharacterRunAutomaton(automaton);
+        for (String s : strings) {
+            assertTrue(run.run(s));
+        }
+    }
+
+    private void assertMatchesNone(Automaton automaton, String... strings) {
+        CharacterRunAutomaton run = new CharacterRunAutomaton(automaton);
+        for (String s : strings) {
+            assertFalse(run.run(s));
+        }
+    }
 }
diff --git a/...er/src/test/java/org/elasticsearch/common/xcontent/support/AbstractFilteringTestCase.java b/...er/src/test/java/org/elasticsearch/common/xcontent/support/AbstractFilteringTestCase.java
@@ -16,10 +16,15 @@
 import org.elasticsearch.test.ESTestCase;
 
 import java.io.IOException;
+import java.net.URISyntaxException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import org.elasticsearch.common.io.PathUtils;
 import java.util.Set;
 
 import static java.util.Collections.emptySet;
 import static java.util.Collections.singleton;
+import static java.util.stream.Collectors.toSet;
 
 /**
  * Tests for {@link XContent} filtering.
@@ -1416,4 +1421,27 @@ public void testFilterPrefix() throws IOException {
                 .endObject();
         testFilter(expected, sample, singleton("photosCount"), emptySet());
     }
+
+    public void testManyFilters() throws IOException, URISyntaxException {
+        Builder deep = builder -> builder.startObject()
+            .startObject("system")
+            .startObject("process")
+            .startObject("cgroup")
+            .startObject("memory")
+            .startObject("stats")
+            .startObject("mapped_file")
+            .field("bytes", 100)
+            .endObject()
+            .endObject()
+            .endObject()
+            .endObject()
+            .endObject()
+            .endObject()
+            .endObject();
+        Set<String> manyFilters = Files.readAllLines(
+            PathUtils.get(AbstractFilteringTestCase.class.getResource("many_filters.txt").toURI()),
+            StandardCharsets.UTF_8
+        ).stream().filter(s -> false == s.startsWith("#")).collect(toSet());
+        testFilter(deep, deep, manyFilters, emptySet());
+    }
 }