Skip to content

Commit

Permalink
Wildcard field regex query fix (#78839)
Browse files Browse the repository at this point in the history
Fix for wildcard field query optimisation that rewrites to a match all for regexes like .*

A bug was found in this complex rewrite logic so we have simplified the detection of .* type regexes by examining the Automaton for the regex rather than our parsed form of it which is expressed as a Lucene BooleanQuery. The old logic relied on a recursive "simplify" function on the BooleanQuery which has now been removed. We now rely on Lucene's query rewrite logic to simplify expressions at query time and consequently some of the tests had to change to do some of this rewriting before running test comparisons.

Closes #78391
  • Loading branch information
markharwood authored Oct 11, 2021
1 parent db4d1c5 commit 216f135
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 150 deletions.

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.RegExp.Kind;
import org.elasticsearch.ElasticsearchParseException;
Expand Down Expand Up @@ -348,26 +350,25 @@ public Query regexpQuery(String value, int syntaxFlags, int matchFlags, int maxD
if (value.length() == 0) {
return new MatchNoDocsQuery();
}

//Check for simple "match all expressions e.g. .*
RegExp regExp = new RegExp(value, syntaxFlags, matchFlags);
Automaton a = regExp.toAutomaton();
a = Operations.determinize(a, maxDeterminizedStates);
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
if (Operations.isTotal(a)) { // Will match all
return existsQuery(context);
}

RegExp ngramRegex = new RegExp(addLineEndChars(value), syntaxFlags, matchFlags);


Query approxBooleanQuery = toApproximationQuery(ngramRegex);
Query approxNgramQuery = rewriteBoolToNgramQuery(approxBooleanQuery);

// MatchAll is a special case meaning the regex is known to match everything .* and
// there is no need for verification.
if (approxNgramQuery instanceof MatchAllDocsQuery) {
return existsQuery(context);
}
RegExp regex = new RegExp(value, syntaxFlags, matchFlags);
Automaton automaton = regex.toAutomaton(maxDeterminizedStates);

// MatchAllButRequireVerificationQuery is a special case meaning the regex is reduced to a single
// clause which we can't accelerate at all and needs verification. Example would be ".."
if (approxNgramQuery instanceof MatchAllButRequireVerificationQuery) {
return new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(), name(), value, automaton);
}

// We can accelerate execution with the ngram query
return new BinaryDvConfirmedAutomatonQuery(approxNgramQuery, name(), value, automaton);
}
Expand Down Expand Up @@ -414,12 +415,12 @@ public static Query toApproximationQuery(RegExp r) throws IllegalArgumentExcepti
// plain TermQuery objects together. Boolean queries are interpreted as a black box and not
// concatenated.
BooleanQuery.Builder wrapper = new BooleanQuery.Builder();
wrapper.add(result, Occur.MUST);
wrapper.add(result, Occur.FILTER);
result = wrapper.build();
}
} else {
// Expressions like (a){0,3} match empty string or up to 3 a's.
result = new MatchAllButRequireVerificationQuery();
result = new MatchAllDocsQuery();
}
break;
case REGEXP_ANYSTRING:
Expand All @@ -436,7 +437,7 @@ public static Query toApproximationQuery(RegExp r) throws IllegalArgumentExcepti
case REGEXP_INTERVAL:
case REGEXP_EMPTY:
case REGEXP_AUTOMATON:
result = new MatchAllButRequireVerificationQuery();
result = new MatchAllDocsQuery();
break;
}
assert result != null; // All regex types are understood and translated to a query.
Expand All @@ -456,21 +457,21 @@ private static Query createConcatenationQuery(RegExp r) {
sequence.append(tq.getTerm().text());
} else {
if (sequence.length() > 0) {
bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST);
bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.FILTER);
sequence = new StringBuilder();
}
bAnd.add(query, Occur.MUST);
bAnd.add(query, Occur.FILTER);
}
}
if (sequence.length() > 0) {
bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST);
bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.FILTER);
}
BooleanQuery combined = bAnd.build();
if (combined.clauses().size() > 0) {
return combined;
}
// There's something in the regex we couldn't represent as a query - resort to a match all with verification
return new MatchAllButRequireVerificationQuery();
return new MatchAllDocsQuery();

}

Expand Down Expand Up @@ -498,7 +499,7 @@ private static Query createUnionQuery(RegExp r) {
}
}
// There's something in the regex we couldn't represent as a query - resort to a match all with verification
return new MatchAllButRequireVerificationQuery();
return new MatchAllDocsQuery();
}

private static void findLeaves(RegExp exp, Kind kind, List<Query> queries) {
Expand Down Expand Up @@ -529,7 +530,7 @@ private Query rewriteBoolToNgramQuery(Query approxQuery) {
for (BooleanClause clause : bq) {
Query q = rewriteBoolToNgramQuery(clause.getQuery());
if (q != null) {
if (clause.getOccur().equals(Occur.MUST)) {
if (clause.getOccur().equals(Occur.FILTER)) {
// Can't drop "should" clauses because it can elevate a sibling optional item
// to mandatory (shoulds with 1 clause) causing false negatives
// Dropping MUSTs increase false positives which are OK because are verified anyway.
Expand All @@ -541,91 +542,31 @@ private Query rewriteBoolToNgramQuery(Query approxQuery) {
rewritten.add(q, clause.getOccur());
}
}
return simplify(rewritten.build());
return rewritten.build();
}
if (approxQuery instanceof TermQuery) {
TermQuery tq = (TermQuery) approxQuery;

//Remove simple terms that are only string beginnings or ends.
String s = tq.getTerm().text();
if (s.equals(WildcardFieldMapper.TOKEN_START_STRING) || s.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
return new MatchAllButRequireVerificationQuery();
return new MatchAllDocsQuery();
}

// Break term into tokens
Set<String> tokens = new LinkedHashSet<>();
getNgramTokens(tokens, s);
BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
for (String string : tokens) {
addClause(string, rewritten, Occur.MUST);
addClause(string, rewritten, Occur.FILTER);
}
return simplify(rewritten.build());
return rewritten.build();
}
if (isMatchAll(approxQuery)) {
if (approxQuery instanceof MatchAllDocsQuery) {
return approxQuery;
}
throw new IllegalStateException("Invalid query type found parsing regex query:" + approxQuery);
}

static Query simplify(Query input) {
if (input instanceof BooleanQuery == false) {
return input;
}
BooleanQuery result = (BooleanQuery) input;
if (result.clauses().size() == 0) {
// A ".*" clause can produce zero clauses in which case we return MatchAll
return new MatchAllDocsQuery();
}
if (result.clauses().size() == 1) {
return simplify(result.clauses().get(0).getQuery());
}

// We may have a mix of MatchAll and concrete queries - assess if we can simplify
int matchAllCount = 0;
int verifyCount = 0;
boolean allConcretesAreOptional = true;
for (BooleanClause booleanClause : result.clauses()) {
Query q = booleanClause.getQuery();
if (q instanceof MatchAllDocsQuery) {
matchAllCount++;
} else if (q instanceof MatchAllButRequireVerificationQuery) {
verifyCount++;
} else {
// Concrete query
if (booleanClause.getOccur() != Occur.SHOULD) {
allConcretesAreOptional = false;
}
}
}

if ((allConcretesAreOptional && matchAllCount > 0)) {
// Any match all expression takes precedence over all optional concrete queries.
return new MatchAllDocsQuery();
}

if ((allConcretesAreOptional && verifyCount > 0)) {
// Any match all expression that needs verification takes precedence over all optional concrete queries.
return new MatchAllButRequireVerificationQuery();
}

// We have some mandatory concrete queries - strip out the superfluous match all expressions
if (allConcretesAreOptional == false && matchAllCount + verifyCount > 0) {
BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
for (BooleanClause booleanClause : result.clauses()) {
if (isMatchAll(booleanClause.getQuery()) == false) {
rewritten.add(booleanClause);
}
}
return simplify(rewritten.build());
}
return result;
}


static boolean isMatchAll(Query q) {
return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery;
}

protected void getNgramTokens(Set<String> tokens, String fragment) {
if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) {
// If a regex is a form of match-all e.g. ".*" we only produce the token start/end markers as search
Expand Down Expand Up @@ -666,7 +607,7 @@ private void addClause(String token, BooleanQuery.Builder bqBuilder, Occur occur
if (tokenSize < 2 || token.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
// there's something concrete to be searched but it's too short
// Require verification.
bqBuilder.add(new BooleanClause(new MatchAllButRequireVerificationQuery(), occur));
bqBuilder.add(new BooleanClause(new MatchAllDocsQuery(), occur));
return;
}
if (tokenSize == NGRAM_SIZE) {
Expand Down Expand Up @@ -723,11 +664,11 @@ public Query rangeQuery(

if (tokenSize == NGRAM_SIZE) {
TermQuery tq = new TermQuery(new Term(name(), token));
bqBuilder.add(new BooleanClause(tq, Occur.MUST));
bqBuilder.add(new BooleanClause(tq, Occur.FILTER));
} else {
PrefixQuery wq = new PrefixQuery(new Term(name(), token));
wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
bqBuilder.add(new BooleanClause(wq, Occur.MUST));
bqBuilder.add(new BooleanClause(wq, Occur.FILTER));
}
}
BooleanQuery bq = bqBuilder.build();
Expand Down
Loading

0 comments on commit 216f135

Please sign in to comment.