From 5dfaae85eff0991343b61bc7ed8120a34732182f Mon Sep 17 00:00:00 2001 From: Tim Vernum Date: Thu, 31 Dec 2020 17:19:14 +1100 Subject: [PATCH] Build complex automatons more efficiently (#66901) This change substantially reduces the CPU and Heap usage of StringMatcher when processing large complex patterns. The improvement is achieved by switching the order in which we perform concatenation and union for common styles of wildcard patterns. Given a set of wildcard strings: - "*-logs-*" - "*-metrics-*" - "web-*-prod-*" - "web-*-staging-*" The old implementation would perform steps roughly like: minimize { union { concatenate { MATCH_ANY, "-logs-", MATCH_ANY } concatenate { MATCH_ANY, "-metrics-", MATCH_ANY } concatenate { "web-", MATCH_ANY, "prod-", MATCH_ANY } concatenate { "web-", MATCH_ANY, "staging-", MATCH_ANY } } } The outer minimize would require determinizing the automaton, which was highly inefficient The new implementation is: minimize { union { concatenate { MATCH_ANY , minimize { union { "-logs-", "-metrics"- } } MATCH_ANY } concatenate { minimize { union { concatenate { "web-", MATCH_ANY, "prod-" } concatenate { "web-", MATCH_ANY, "staging-" } } } MATCH_ANY } } } By performing a union of the inner strings before concatenating the MATCH_ANY ("*") the time & heap space spent on determinizing the automaton is greatly reduced. Backport of: #66724 --- .../core/security/support/Automatons.java | 98 +++++++++++++++++-- .../core/security/support/StringMatcher.java | 2 +- 2 files changed, 90 insertions(+), 10 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/Automatons.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/Automatons.java index c0e4af238b1bd..a738a35411655 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/Automatons.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/Automatons.java @@ -8,6 +8,8 @@ import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.apache.lucene.util.automaton.MinimizationOperations; +import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.RegExp; import org.elasticsearch.common.cache.Cache; import org.elasticsearch.common.cache.CacheBuilder; @@ -19,11 +21,13 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.concurrent.ExecutionException; +import java.util.function.Function; import java.util.function.Predicate; -import static org.apache.lucene.util.automaton.MinimizationOperations.minimize; import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES; import static org.apache.lucene.util.automaton.Operations.concatenate; import static org.apache.lucene.util.automaton.Operations.intersection; @@ -84,10 +88,82 @@ public static Automaton patterns(Collection patterns) { } private static Automaton buildAutomaton(Collection patterns) { - List automata = new ArrayList<>(patterns.size()); - for (String pattern : patterns) { - final Automaton patternAutomaton = pattern(pattern); - automata.add(patternAutomaton); + if (patterns.size() == 1) { + return minimize(pattern(patterns.iterator().next())); + } + + final Function, Automaton> build = strings -> { + List automata = new ArrayList<>(strings.size()); + for (String pattern : strings) { + final Automaton patternAutomaton = pattern(pattern); + automata.add(patternAutomaton); + } + return unionAndMinimize(automata); + }; + + // We originally just compiled each automaton separately and then unioned them all. + // However, that approach can be quite slow, and very memory intensive. + // It is far more efficient if + // 1. we strip leading/trailing "*" + // 2. union the automaton produced from the remaining text + // 3. append/prepend MatchAnyString automatons as appropriate + // That is: + // - `MATCH_ALL + (bullseye|daredevil) + MATCH_ALL` + // can be determinized more efficiently than + // - `(MATCH_ALL + bullseye + MATCH_ALL)|(MATCH_ALL + daredevil + MATCH_ALL)` + + final Set prefix = new HashSet<>(); + final Set infix = new HashSet<>(); + final Set suffix = new HashSet<>(); + final Set misc = new HashSet<>(); + + for (String p : patterns) { + if (p.length() <= 1) { + // Single character strings (like "x" or "*"), or stray empty strings + misc.add(p); + continue; + } + + final char first = p.charAt(0); + final char last = p.charAt(p.length() - 1); + if (first == '/') { + // regex ("/something/") + misc.add(p); + } else if (first == '*') { + if (last == '*') { + // *something* + infix.add(p.substring(1, p.length() - 1)); + } else { + // *something + suffix.add(p.substring(1)); + } + } else if (last == '*' && p.indexOf('*') != p.length() - 1) { + // some*thing* + // For simple prefix patterns ("something*") it's more efficient to do a single pass + // Lucene can efficiently determinize automata that share a trailing MATCH_ANY accept state, + // If we were to handle them here, we would run 2 minimize operations (one for the union of strings, + // then another after concatenating MATCH_ANY), which is substantially slower. + // However, that's not true if the string has an embedded '*' in it - in that case it is more efficient to determinize + // the set of prefixes (with the embedded MATCH_ANY) and then concatenate another MATCH_ANY and minimize. + prefix.add(p.substring(0, p.length() - 1)); + } else { + // something* / some*thing / some?thing / etc + misc.add(p); + } + } + + final List automata = new ArrayList<>(); + if (prefix.isEmpty() == false) { + automata.add(Operations.concatenate(build.apply(prefix), Automata.makeAnyString())); + } + if (suffix.isEmpty() == false) { + automata.add(Operations.concatenate(Automata.makeAnyString(), build.apply(suffix))); + } + if (infix.isEmpty() == false) { + automata.add(Operations.concatenate(Arrays.asList(Automata.makeAnyString(), build.apply(infix), Automata.makeAnyString()))); + } + if (misc.isEmpty() == false) { + automata.add(build.apply(misc)); } return unionAndMinimize(automata); } @@ -172,18 +248,22 @@ static Automaton wildcard(String text) { } public static Automaton unionAndMinimize(Collection automata) { - Automaton res = union(automata); - return minimize(res, maxDeterminizedStates); + Automaton res = automata.size() == 1 ? automata.iterator().next() : union(automata); + return minimize(res); } public static Automaton minusAndMinimize(Automaton a1, Automaton a2) { Automaton res = minus(a1, a2, maxDeterminizedStates); - return minimize(res, maxDeterminizedStates); + return minimize(res); } public static Automaton intersectAndMinimize(Automaton a1, Automaton a2) { Automaton res = intersection(a1, a2); - return minimize(res, maxDeterminizedStates); + return minimize(res); + } + + private static Automaton minimize(Automaton automaton) { + return MinimizationOperations.minimize(automaton, maxDeterminizedStates); } public static Predicate predicate(String... patterns) { diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/StringMatcher.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/StringMatcher.java index a1b6877a9f191..051a6e84a751e 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/StringMatcher.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/StringMatcher.java @@ -162,7 +162,7 @@ private static Predicate buildAutomataPredicate(Collection patte if (description.length() > 80) { description = Strings.cleanTruncate(description, 80) + "..."; } - throw new ElasticsearchSecurityException("The set patterns [{}] is too complex to evaluate", e, description); + throw new ElasticsearchSecurityException("The set of patterns [{}] is too complex to evaluate", e, description); } } }