Build complex automatons more efficiently (#66901)
This change substantially reduces the CPU and heap usage of
StringMatcher when processing large, complex patterns.

The improvement is achieved by switching the order in which we
perform concatenation and union for common styles of wildcard patterns.

Given a set of wildcard strings:
- "*-logs-*"
- "*-metrics-*"
- "web-*-prod-*"
- "web-*-staging-*"

The old implementation would perform steps roughly like:

    minimize {
        union {
            concatenate { MATCH_ANY, "-logs-", MATCH_ANY }
            concatenate { MATCH_ANY, "-metrics-", MATCH_ANY }
            concatenate { "web-", MATCH_ANY, "prod-", MATCH_ANY }
            concatenate { "web-", MATCH_ANY, "staging-", MATCH_ANY }
        }
    }

The outer minimize would require determinizing the automaton, which
was highly inefficient: every branch of the union starts with
MATCH_ANY, so the subset construction has to track all branches
simultaneously and generates a very large number of states.

The new implementation is:

    minimize {
        union {
            concatenate {
                MATCH_ANY,
                minimize {
                    union { "-logs-", "-metrics"- }
                },
                MATCH_ANY
            }
            concatenate {
                minimize {
                    union {
                        concatenate { "web-", MATCH_ANY, "prod-" }
                        concatenate { "web-", MATCH_ANY, "staging-" }
                    }
                },
                MATCH_ANY
            }
        }
    }

By performing a union of the inner strings before concatenating the
MATCH_ANY ("*"), the time and heap space spent determinizing the
automaton are greatly reduced.
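
As an illustrative sketch (not code from this change), the two shapes can be
written directly against Lucene's automaton API. Automata, Operations, and
MinimizationOperations are real Lucene classes; MAX_STATES and the class name
are placeholders:

    import java.util.List;

    import org.apache.lucene.util.automaton.Automata;
    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.MinimizationOperations;
    import org.apache.lucene.util.automaton.Operations;

    public class ReorderingSketch {
        static final int MAX_STATES = 10_000; // assumed determinization budget

        // Old shape: wrap each literal in MATCH_ANY, union, then minimize last.
        static Automaton oldShape() {
            Automaton logs = Operations.concatenate(
                List.of(Automata.makeAnyString(), Automata.makeString("-logs-"), Automata.makeAnyString()));
            Automaton metrics = Operations.concatenate(
                List.of(Automata.makeAnyString(), Automata.makeString("-metrics-"), Automata.makeAnyString()));
            return MinimizationOperations.minimize(Operations.union(List.of(logs, metrics)), MAX_STATES);
        }

        // New shape: union and minimize the small literals first, then attach MATCH_ANY once.
        static Automaton newShape() {
            Automaton inner = MinimizationOperations.minimize(
                Operations.union(List.of(Automata.makeString("-logs-"), Automata.makeString("-metrics-"))), MAX_STATES);
            return MinimizationOperations.minimize(
                Operations.concatenate(List.of(Automata.makeAnyString(), inner, Automata.makeAnyString())), MAX_STATES);
        }
    }

Both methods accept the same language; the second determinizes a much smaller,
less nondeterministic intermediate automaton.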

Backport of: #66724
tvernum authored Dec 31, 2020
1 parent 288db85 commit 5dfaae8
Showing 2 changed files with 90 additions and 10 deletions.
@@ -8,6 +8,8 @@
 import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.MinimizationOperations;
+import org.apache.lucene.util.automaton.Operations;
 import org.apache.lucene.util.automaton.RegExp;
 import org.elasticsearch.common.cache.Cache;
 import org.elasticsearch.common.cache.CacheBuilder;
@@ -19,11 +21,13 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.ExecutionException;
+import java.util.function.Function;
 import java.util.function.Predicate;

-import static org.apache.lucene.util.automaton.MinimizationOperations.minimize;
 import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
 import static org.apache.lucene.util.automaton.Operations.concatenate;
 import static org.apache.lucene.util.automaton.Operations.intersection;
@@ -84,10 +88,82 @@ public static Automaton patterns(Collection<String> patterns) {
     }

     private static Automaton buildAutomaton(Collection<String> patterns) {
-        List<Automaton> automata = new ArrayList<>(patterns.size());
-        for (String pattern : patterns) {
-            final Automaton patternAutomaton = pattern(pattern);
-            automata.add(patternAutomaton);
+        if (patterns.size() == 1) {
+            return minimize(pattern(patterns.iterator().next()));
         }
+
+        final Function<Collection<String>, Automaton> build = strings -> {
+            List<Automaton> automata = new ArrayList<>(strings.size());
+            for (String pattern : strings) {
+                final Automaton patternAutomaton = pattern(pattern);
+                automata.add(patternAutomaton);
+            }
+            return unionAndMinimize(automata);
+        };
+
+        // We originally just compiled each automaton separately and then unioned them all.
+        // However, that approach can be quite slow, and very memory intensive.
+        // It is far more efficient if
+        // 1. we strip leading/trailing "*"
+        // 2. union the automata produced from the remaining text
+        // 3. append/prepend MatchAnyString automatons as appropriate
+        // That is:
+        // - `MATCH_ALL + (bullseye|daredevil) + MATCH_ALL`
+        // can be determinized more efficiently than
+        // - `(MATCH_ALL + bullseye + MATCH_ALL)|(MATCH_ALL + daredevil + MATCH_ALL)`
+
+        final Set<String> prefix = new HashSet<>();
+        final Set<String> infix = new HashSet<>();
+        final Set<String> suffix = new HashSet<>();
+        final Set<String> misc = new HashSet<>();
+
+        for (String p : patterns) {
+            if (p.length() <= 1) {
+                // Single character strings (like "x" or "*"), or stray empty strings
+                misc.add(p);
+                continue;
+            }
+
+            final char first = p.charAt(0);
+            final char last = p.charAt(p.length() - 1);
+            if (first == '/') {
+                // regex ("/something/")
+                misc.add(p);
+            } else if (first == '*') {
+                if (last == '*') {
+                    // *something*
+                    infix.add(p.substring(1, p.length() - 1));
+                } else {
+                    // *something
+                    suffix.add(p.substring(1));
+                }
+            } else if (last == '*' && p.indexOf('*') != p.length() - 1) {
+                // some*thing*
+                // For simple prefix patterns ("something*") it's more efficient to compile them in a single pass (see misc below):
+                // Lucene can efficiently determinize automata that share a trailing MATCH_ANY accept state.
+                // If we were to handle them here, we would run 2 minimize operations (one for the union of strings,
+                // then another after concatenating MATCH_ANY), which is substantially slower.
+                // However, that's not true if the string has an embedded '*' in it - in that case it is more efficient to determinize
+                // the set of prefixes (with the embedded MATCH_ANY) and then concatenate another MATCH_ANY and minimize.
+                prefix.add(p.substring(0, p.length() - 1));
+            } else {
+                // something* / some*thing / some?thing / etc
+                misc.add(p);
+            }
+        }
+
+        final List<Automaton> automata = new ArrayList<>();
+        if (prefix.isEmpty() == false) {
+            automata.add(Operations.concatenate(build.apply(prefix), Automata.makeAnyString()));
+        }
+        if (suffix.isEmpty() == false) {
+            automata.add(Operations.concatenate(Automata.makeAnyString(), build.apply(suffix)));
+        }
+        if (infix.isEmpty() == false) {
+            automata.add(Operations.concatenate(Arrays.asList(Automata.makeAnyString(), build.apply(infix), Automata.makeAnyString())));
+        }
+        if (misc.isEmpty() == false) {
+            automata.add(build.apply(misc));
+        }
         return unionAndMinimize(automata);
     }
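
As a worked example (annotation added here, not part of the diff): given the
patterns from the commit message, plus a few hypothetical extras, the loop
above buckets them as follows:

    // "*-logs-*"        -> infix:  "-logs-"         (leading and trailing '*' stripped)
    // "*-metrics-*"     -> infix:  "-metrics-"
    // "web-*-prod-*"    -> prefix: "web-*-prod-"    (trailing '*' stripped, embedded '*' kept)
    // "web-*-staging-*" -> prefix: "web-*-staging-"
    // "*.kibana"        -> suffix: ".kibana"        (hypothetical)
    // "logstash-*"      -> misc (simple prefix pattern, compiled directly; hypothetical)
    // "/log-[0-9]+/"    -> misc (regex syntax; hypothetical)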
@@ -172,18 +248,22 @@ static Automaton wildcard(String text) {
     }

     public static Automaton unionAndMinimize(Collection<Automaton> automata) {
-        Automaton res = union(automata);
-        return minimize(res, maxDeterminizedStates);
+        Automaton res = automata.size() == 1 ? automata.iterator().next() : union(automata);
+        return minimize(res);
     }

     public static Automaton minusAndMinimize(Automaton a1, Automaton a2) {
         Automaton res = minus(a1, a2, maxDeterminizedStates);
-        return minimize(res, maxDeterminizedStates);
+        return minimize(res);
     }

     public static Automaton intersectAndMinimize(Automaton a1, Automaton a2) {
         Automaton res = intersection(a1, a2);
-        return minimize(res, maxDeterminizedStates);
+        return minimize(res);
     }

+    private static Automaton minimize(Automaton automaton) {
+        return MinimizationOperations.minimize(automaton, maxDeterminizedStates);
+    }
+
     public static Predicate<String> predicate(String... patterns) {
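
A hedged usage sketch of the predicate entry point shown above (assuming the
file is the Automatons class in org.elasticsearch.xpack.core.security.support;
the inputs are invented):

    import java.util.function.Predicate;

    import org.elasticsearch.xpack.core.security.support.Automatons;

    public class AutomatonsUsageSketch {
        public static void main(String[] args) {
            // Patterns are taken from the commit message; the inputs are made up.
            Predicate<String> matcher = Automatons.predicate("*-logs-*", "web-*-prod-*");
            System.out.println(matcher.test("app-logs-2020.12.31")); // true: "*-logs-*"
            System.out.println(matcher.test("web-eu-prod-0001"));    // true: "web-*-prod-*"
            System.out.println(matcher.test("web-eu-staging-7"));    // false: neither pattern matches
        }
    }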
@@ -162,7 +162,7 @@ private static Predicate<String> buildAutomataPredicate(Collection<String> patte
             if (description.length() > 80) {
                 description = Strings.cleanTruncate(description, 80) + "...";
             }
-            throw new ElasticsearchSecurityException("The set patterns [{}] is too complex to evaluate", e, description);
+            throw new ElasticsearchSecurityException("The set of patterns [{}] is too complex to evaluate", e, description);
         }
     }
 }
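
For background (an illustration, not part of this commit): the exception
wrapped above typically originates as Lucene's TooComplexToDeterminizeException,
raised when determinization exceeds the maxDeterminizedStates budget. A minimal
sketch with a deliberately tiny limit:

    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.Operations;
    import org.apache.lucene.util.automaton.RegExp;
    import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;

    public class TooComplexSketch {
        public static void main(String[] args) {
            // A classic pattern whose minimal DFA is exponential in the repetition length.
            Automaton nfa = new RegExp("(a|b)*a(a|b){20}").toAutomaton();
            try {
                Operations.determinize(nfa, 100); // tiny state budget, will overflow
            } catch (TooComplexToDeterminizeException e) {
                // StringMatcher rethrows this as an ElasticsearchSecurityException
                // with a truncated description of the offending patterns.
                System.out.println("too complex: " + e.getMessage());
            }
        }
    }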
