From 5dfaae85eff0991343b61bc7ed8120a34732182f Mon Sep 17 00:00:00 2001
From: Tim Vernum <tim.vernum@elastic.co>
Date: Thu, 31 Dec 2020 17:19:14 +1100
Subject: [PATCH] Build complex automatons more efficiently (#66901)

This change substantially reduces the CPU and Heap usage of
StringMatcher when processing large complex patterns.

The improvement is achieved by switching the order in which we
perform concatenation and union for common styles of wildcard patterns.

Given a set of wildcard strings:
- "*-logs-*"
- "*-metrics-*"
- "web-*-prod-*"
- "web-*-staging-*"

The old implementation would perform steps roughly like:

    minimize {
        union {
            concatenate { MATCH_ANY, "-logs-", MATCH_ANY }
            concatenate { MATCH_ANY, "-metrics-", MATCH_ANY }
            concatenate { "web-", MATCH_ANY, "prod-", MATCH_ANY }
            concatenate { "web-", MATCH_ANY, "staging-", MATCH_ANY }
        }
    }

The outer minimize would require determinizing the automaton, which
was highly inefficient

The new implementation is:

    minimize {
        union {
            concatenate {
                MATCH_ANY ,
                minimize {
                    union { "-logs-", "-metrics"- }
                }
                MATCH_ANY
            }
            concatenate {
                minimize {
                    union {
                        concatenate { "web-", MATCH_ANY, "prod-" }
                        concatenate { "web-", MATCH_ANY, "staging-" }
                    }
                }
                MATCH_ANY
            }
        }
    }

By performing a union of the inner strings before concatenating the
MATCH_ANY ("*") the time & heap space spent on determinizing the
automaton is greatly reduced.

Backport of: #66724
---
 .../core/security/support/Automatons.java     | 98 +++++++++++++++++--
 .../core/security/support/StringMatcher.java  |  2 +-
 2 files changed, 90 insertions(+), 10 deletions(-)
diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/Automatons.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/Automatons.java
index c0e4af238b1bd..a738a35411655 100644
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/Automatons.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/Automatons.java
@@ -8,6 +8,8 @@
 import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.MinimizationOperations;
+import org.apache.lucene.util.automaton.Operations;
 import org.apache.lucene.util.automaton.RegExp;
 import org.elasticsearch.common.cache.Cache;
 import org.elasticsearch.common.cache.CacheBuilder;
@@ -19,11 +21,13 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.ExecutionException;
+import java.util.function.Function;
 import java.util.function.Predicate;
 
-import static org.apache.lucene.util.automaton.MinimizationOperations.minimize;
 import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
 import static org.apache.lucene.util.automaton.Operations.concatenate;
 import static org.apache.lucene.util.automaton.Operations.intersection;
@@ -84,10 +88,82 @@ public static Automaton patterns(Collection<String> patterns) {
     }
 
     private static Automaton buildAutomaton(Collection<String> patterns) {
-        List<Automaton> automata = new ArrayList<>(patterns.size());
-        for (String pattern : patterns) {
-            final Automaton patternAutomaton = pattern(pattern);
-            automata.add(patternAutomaton);
+        if (patterns.size() == 1) {
+            return minimize(pattern(patterns.iterator().next()));
+        }
+
+        final Function<Collection<String>, Automaton> build = strings -> {
+            List<Automaton> automata = new ArrayList<>(strings.size());
+            for (String pattern : strings) {
+                final Automaton patternAutomaton = pattern(pattern);
+                automata.add(patternAutomaton);
+            }
+            return unionAndMinimize(automata);
+        };
+
+        // We originally just compiled each automaton separately and then unioned them all.
+        // However, that approach can be quite slow, and very memory intensive.
+        // It is far more efficient if
+        //   1. we strip leading/trailing "*"
+        //   2. union the automaton produced from the remaining text
+        //   3. append/prepend MatchAnyString automatons as appropriate
+        // That is:
+        //  - `MATCH_ALL + (bullseye|daredevil) + MATCH_ALL`
+        //  can be determinized more efficiently than
+        //  - `(MATCH_ALL + bullseye + MATCH_ALL)|(MATCH_ALL + daredevil + MATCH_ALL)`
+
+        final Set<String> prefix = new HashSet<>();
+        final Set<String> infix = new HashSet<>();
+        final Set<String> suffix = new HashSet<>();
+        final Set<String> misc = new HashSet<>();
+
+        for (String p : patterns) {
+            if (p.length() <= 1) {
+                // Single character strings (like "x" or "*"), or stray empty strings
+                misc.add(p);
+                continue;
+            }
+
+            final char first = p.charAt(0);
+            final char last = p.charAt(p.length() - 1);
+            if (first == '/') {
+                // regex ("/something/")
+                misc.add(p);
+            } else if (first == '*') {
+                if (last == '*') {
+                    // *something*
+                    infix.add(p.substring(1, p.length() - 1));
+                } else {
+                    // *something
+                    suffix.add(p.substring(1));
+                }
+            } else if (last == '*' && p.indexOf('*') != p.length() - 1) {
+                // some*thing*
+                // For simple prefix patterns ("something*") it's more efficient to do a single pass
+                // Lucene can efficiently determinize automata that share a trailing MATCH_ANY accept state,
+                // If we were to handle them here, we would run 2 minimize operations (one for the union of strings,
+                // then another after concatenating MATCH_ANY), which is substantially slower.
+                // However, that's not true if the string has an embedded '*' in it - in that case it is more efficient to determinize
+                //   the set of prefixes (with the embedded MATCH_ANY) and then concatenate another MATCH_ANY and minimize.
+                prefix.add(p.substring(0, p.length() - 1));
+            } else {
+                // something* / some*thing / some?thing / etc
+                misc.add(p);
+            }
+        }
+
+        final List<Automaton> automata = new ArrayList<>();
+        if (prefix.isEmpty() == false) {
+            automata.add(Operations.concatenate(build.apply(prefix), Automata.makeAnyString()));
+        }
+        if (suffix.isEmpty() == false) {
+            automata.add(Operations.concatenate(Automata.makeAnyString(), build.apply(suffix)));
+        }
+        if (infix.isEmpty() == false) {
+            automata.add(Operations.concatenate(Arrays.asList(Automata.makeAnyString(), build.apply(infix), Automata.makeAnyString())));
+        }
+        if (misc.isEmpty() == false) {
+            automata.add(build.apply(misc));
         }
         return unionAndMinimize(automata);
     }
@@ -172,18 +248,22 @@ static Automaton wildcard(String text) {
     }
 
     public static Automaton unionAndMinimize(Collection<Automaton> automata) {
-        Automaton res = union(automata);
-        return minimize(res, maxDeterminizedStates);
+        Automaton res = automata.size() == 1 ? automata.iterator().next() : union(automata);
+        return minimize(res);
     }
 
     public static Automaton minusAndMinimize(Automaton a1, Automaton a2) {
         Automaton res = minus(a1, a2, maxDeterminizedStates);
-        return minimize(res, maxDeterminizedStates);
+        return minimize(res);
     }
 
     public static Automaton intersectAndMinimize(Automaton a1, Automaton a2) {
         Automaton res = intersection(a1, a2);
-        return minimize(res, maxDeterminizedStates);
+        return minimize(res);
+    }
+
+    private static Automaton minimize(Automaton automaton) {
+        return MinimizationOperations.minimize(automaton, maxDeterminizedStates);
     }
 
     public static Predicate<String> predicate(String... patterns) {
diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/StringMatcher.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/StringMatcher.java
index a1b6877a9f191..051a6e84a751e 100644
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/StringMatcher.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/StringMatcher.java
@@ -162,7 +162,7 @@ private static Predicate<String> buildAutomataPredicate(Collection<String> patte
                 if (description.length() > 80) {
                     description = Strings.cleanTruncate(description, 80) + "...";
                 }
-                throw new ElasticsearchSecurityException("The set patterns [{}] is too complex to evaluate", e, description);
+                throw new ElasticsearchSecurityException("The set of patterns [{}] is too complex to evaluate", e, description);
             }
         }
     }