Skip to content

Commit

Permalink
Search performance - better caching logic for queries on wildcard fie…
Browse files Browse the repository at this point in the history
…ld (#76035)

Remove use of BooleanQuery to prevent query caching framework from running contained verification query clauses across whole index at great expense.
The new BinaryDVConfirmedAutomatonQuery provides a replacement wrapper that plays better with caching logic

Closes #75848
  • Loading branch information
markharwood authored Aug 5, 2021
1 parent 26b41f2 commit d805c67
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 53 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
Expand All @@ -27,32 +30,64 @@
import java.util.Objects;

/**
* Query that runs an Automaton across all binary doc values.
* Expensive to run so normally used in conjunction with more selective query clauses.
* Query that runs an Automaton across all binary doc values (but only for docs that also
* match a provided approximation query which is key to getting good performance).
*/
public class AutomatonQueryOnBinaryDv extends Query {
public class BinaryDvConfirmedAutomatonQuery extends Query {

private final String field;
private final String matchPattern;
private final ByteRunAutomaton bytesMatcher;
private final Query approxQuery;

public AutomatonQueryOnBinaryDv(String field, String matchPattern, Automaton automaton) {
public BinaryDvConfirmedAutomatonQuery(Query approximation, String field, String matchPattern, Automaton automaton) {
this.approxQuery = approximation;
this.field = field;
this.matchPattern = matchPattern;
bytesMatcher = new ByteRunAutomaton(automaton);
}

private BinaryDvConfirmedAutomatonQuery(Query approximation, String field, String matchPattern, ByteRunAutomaton bytesMatcher) {
this.approxQuery = approximation;
this.field = field;
this.matchPattern = matchPattern;
this.bytesMatcher = bytesMatcher;
}

@Override
public Query rewrite(IndexReader reader) throws IOException {
Query approxRewrite = approxQuery.rewrite(reader);
if (approxQuery != approxRewrite ) {
return new BinaryDvConfirmedAutomatonQuery(approxRewrite, field, matchPattern, bytesMatcher);
}
return this;
}

@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
final Weight approxWeight = approxQuery.createWeight(searcher, scoreMode, boost);

return new ConstantScoreWeight(this, boost) {

@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
ByteArrayDataInput badi = new ByteArrayDataInput();
final BinaryDocValues values = DocValues.getBinary(context.reader(), field);
TwoPhaseIterator twoPhase = new TwoPhaseIterator(values) {
Scorer approxScorer = approxWeight.scorer(context);
if (approxScorer == null) {
// No matches to be had
return null;
}
DocIdSetIterator approxDisi = approxScorer.iterator();
TwoPhaseIterator twoPhase = new TwoPhaseIterator(approxDisi) {
@Override
public boolean matches() throws IOException {
if (values.advanceExact(approxDisi.docID()) == false)
{
// Bug if we have an indexed value (i.e an approxQuery) but no doc value.
assert approxQuery instanceof MatchAllDocsQuery;
return false;
}
BytesRef arrayOfValues = values.binaryValue();
badi.reset(arrayOfValues.bytes);
badi.setPosition(arrayOfValues.offset);
Expand Down Expand Up @@ -93,14 +128,19 @@ public boolean equals(Object obj) {
if (sameClassAs(obj) == false) {
return false;
}
AutomatonQueryOnBinaryDv other = (AutomatonQueryOnBinaryDv) obj;
BinaryDvConfirmedAutomatonQuery other = (BinaryDvConfirmedAutomatonQuery) obj;
return Objects.equals(field, other.field) && Objects.equals(matchPattern, other.matchPattern)
&& Objects.equals(bytesMatcher, other.bytesMatcher);
&& Objects.equals(bytesMatcher, other.bytesMatcher)
&& Objects.equals(approxQuery, other.approxQuery);
}

@Override
public int hashCode() {
return Objects.hash(classHash(), field, matchPattern, bytesMatcher);
return Objects.hash(classHash(), field, matchPattern, bytesMatcher, approxQuery);
}

Query getApproximationQuery() {
return approxQuery;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -330,19 +330,15 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, boolean
Automaton automaton = caseInsensitive
? AutomatonQueries.toCaseInsensitiveWildcardAutomaton(new Term(name(), wildcardPattern), Integer.MAX_VALUE)
: WildcardQuery.toAutomaton(new Term(name(), wildcardPattern));
AutomatonQueryOnBinaryDv verifyingQuery = new AutomatonQueryOnBinaryDv(name(), wildcardPattern, automaton);
if (clauseCount > 0) {
// We can accelerate execution with the ngram query
BooleanQuery approxQuery = rewritten.build();
BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
verifyingBuilder.add(new BooleanClause(approxQuery, Occur.MUST));
verifyingBuilder.add(new BooleanClause(verifyingQuery, Occur.MUST));
return new ConstantScoreQuery(verifyingBuilder.build());
return new BinaryDvConfirmedAutomatonQuery(approxQuery, name(), wildcardPattern, automaton);
} else if (numWildcardChars == 0 || numWildcardStrings > 0) {
// We have no concrete characters and we're not a pure length query e.g. ???
return new DocValuesFieldExistsQuery(name());
}
return verifyingQuery;
return new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(), name(), wildcardPattern, automaton);

}

Expand All @@ -365,19 +361,15 @@ public Query regexpQuery(String value, int syntaxFlags, int matchFlags, int maxD
}
RegExp regex = new RegExp(value, syntaxFlags, matchFlags);
Automaton automaton = regex.toAutomaton(maxDeterminizedStates);
AutomatonQueryOnBinaryDv verifyingQuery = new AutomatonQueryOnBinaryDv(name(), value, automaton);

// MatchAllButRequireVerificationQuery is a special case meaning the regex is reduced to a single
// clause which we can't accelerate at all and needs verification. Example would be ".."
if (approxNgramQuery instanceof MatchAllButRequireVerificationQuery) {
return verifyingQuery;
return new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(), name(), value, automaton);
}

// We can accelerate execution with the ngram query
BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
verifyingBuilder.add(new BooleanClause(approxNgramQuery, Occur.MUST));
verifyingBuilder.add(new BooleanClause(verifyingQuery, Occur.MUST));
return verifyingBuilder.build();
return new BinaryDvConfirmedAutomatonQuery(approxNgramQuery, name(), value, automaton);
}

// Convert a regular expression to a simplified query consisting of BooleanQuery and TermQuery objects
Expand Down Expand Up @@ -745,16 +737,12 @@ public Query rangeQuery(
}
}
Automaton automaton = TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper);
AutomatonQueryOnBinaryDv slowQuery = new AutomatonQueryOnBinaryDv(name(), lower + "-" + upper, automaton);

if (accelerationQuery == null) {
return slowQuery;
return new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(),
name(), lower + "-" + upper, automaton);
}

BooleanQuery.Builder qBuilder = new BooleanQuery.Builder();
qBuilder.add(accelerationQuery, Occur.MUST);
qBuilder.add(slowQuery, Occur.MUST);
return qBuilder.build();
return new BinaryDvConfirmedAutomatonQuery(accelerationQuery, name(), lower + "-" + upper, automaton);
}

@Override
Expand All @@ -768,7 +756,6 @@ public Query fuzzyQuery(
) {
String searchTerm = BytesRefs.toString(value);
try {
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
//The approximation query can have a prefix and any number of ngrams.
BooleanQuery.Builder approxBuilder = new BooleanQuery.Builder();

Expand Down Expand Up @@ -824,9 +811,6 @@ public Query fuzzyQuery(
}

BooleanQuery ngramQ = approxBuilder.build();
if (ngramQ.clauses().size()>0) {
bqBuilder.add(ngramQ, Occur.MUST);
}

// Verification query
FuzzyQuery fq = new FuzzyQuery(
Expand All @@ -836,9 +820,12 @@ public Query fuzzyQuery(
maxExpansions,
transpositions
);
bqBuilder.add(new AutomatonQueryOnBinaryDv(name(), searchTerm, fq.getAutomata().automaton), Occur.MUST);
if (ngramQ.clauses().size() == 0) {
return new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(),
name(), searchTerm, fq.getAutomata().automaton);
}

return bqBuilder.build();
return new BinaryDvConfirmedAutomatonQuery(ngramQ, name(), searchTerm, fq.getAutomata().automaton);
} catch (IOException ioe) {
throw new ElasticsearchParseException("Error parsing wildcard field fuzzy string [" + searchTerm + "]");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
Expand Down Expand Up @@ -522,8 +521,9 @@ public void testRegexAcceleration() throws IOException, ParseException {
String matchAllButVerifyTests[]= { "..", "(a)?","(a|b){0,3}", "((foo)?|(foo|bar)?)", "@&~(abc.+)", "aaa.+&.+bbb"};
for (String regex : matchAllButVerifyTests) {
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 0, 20000, null, MOCK_CONTEXT);
BinaryDvConfirmedAutomatonQuery q = (BinaryDvConfirmedAutomatonQuery)wildcardFieldQuery;
assertTrue(regex +" was not a pure verify query " +formatQuery(wildcardFieldQuery),
wildcardFieldQuery instanceof AutomatonQueryOnBinaryDv);
q.getApproximationQuery() instanceof MatchAllDocsQuery);
}


Expand Down Expand Up @@ -575,17 +575,18 @@ public void testWildcardAcceleration() throws IOException, ParseException {
String expectedAccelerationQueryString = test[1].replaceAll("_", "" + WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_CONTEXT);
testExpectedAccelerationQuery(pattern, wildcardFieldQuery, expectedAccelerationQueryString);
assertTrue(unwrapAnyConstantScore(wildcardFieldQuery) instanceof BooleanQuery);
assertTrue(unwrapAnyConstantScore(wildcardFieldQuery) instanceof BinaryDvConfirmedAutomatonQuery);
}

// TODO All these expressions have no acceleration at all and could be improved
String slowPatterns[] = { "??" };
for (String pattern : slowPatterns) {
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_CONTEXT);
wildcardFieldQuery = unwrapAnyConstantScore(wildcardFieldQuery);
BinaryDvConfirmedAutomatonQuery q = (BinaryDvConfirmedAutomatonQuery) wildcardFieldQuery;
assertTrue(
pattern + " was not as slow as we assumed " + formatQuery(wildcardFieldQuery),
wildcardFieldQuery instanceof AutomatonQueryOnBinaryDv
q.getApproximationQuery() instanceof MatchAllDocsQuery
);
}

Expand All @@ -599,14 +600,17 @@ public void testQueryCachingEquality() throws IOException, ParseException {
new Term("field", pattern),
Integer.MAX_VALUE
);
AutomatonQueryOnBinaryDv csQ = new AutomatonQueryOnBinaryDv("field", pattern, caseSensitiveAutomaton);
AutomatonQueryOnBinaryDv ciQ = new AutomatonQueryOnBinaryDv("field", pattern, caseInSensitiveAutomaton);
BinaryDvConfirmedAutomatonQuery csQ = new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(),
"field", pattern, caseSensitiveAutomaton);
BinaryDvConfirmedAutomatonQuery ciQ = new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(),
"field", pattern, caseInSensitiveAutomaton);
assertNotEquals(csQ, ciQ);
assertNotEquals(csQ.hashCode(), ciQ.hashCode());

// Same query should be equal
Automaton caseSensitiveAutomaton2 = WildcardQuery.toAutomaton(new Term("field", pattern));
AutomatonQueryOnBinaryDv csQ2 = new AutomatonQueryOnBinaryDv("field", pattern, caseSensitiveAutomaton2);
BinaryDvConfirmedAutomatonQuery csQ2 = new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(),
"field", pattern, caseSensitiveAutomaton2);
assertEquals(csQ, csQ2);
assertEquals(csQ.hashCode(), csQ2.hashCode());
}
Expand Down Expand Up @@ -778,19 +782,8 @@ private Query unwrapAnyConstantScore(Query q) {
}

void testExpectedAccelerationQuery(String regex, Query combinedQuery, Query expectedAccelerationQuery) throws ParseException {
BooleanQuery cq = (BooleanQuery) unwrapAnyConstantScore(combinedQuery);
assert cq.clauses().size() == 2;
Query approximationQuery = null;
boolean verifyQueryFound = false;
for (BooleanClause booleanClause : cq.clauses()) {
Query q = booleanClause.getQuery();
if (q instanceof AutomatonQueryOnBinaryDv) {
verifyQueryFound = true;
} else {
approximationQuery = q;
}
}
assert verifyQueryFound;
BinaryDvConfirmedAutomatonQuery cq = (BinaryDvConfirmedAutomatonQuery) unwrapAnyConstantScore(combinedQuery);
Query approximationQuery = cq.getApproximationQuery();

String message = "regex: "+ regex +"\nactual query: " + formatQuery(approximationQuery) +
"\nexpected query: " + formatQuery(expectedAccelerationQuery) + "\n";
Expand Down

0 comments on commit d805c67

Please sign in to comment.