diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/MatchAllButRequireVerificationQuery.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/MatchAllButRequireVerificationQuery.java deleted file mode 100644 index 053d36c65c38..000000000000 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/MatchAllButRequireVerificationQuery.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License - * 2.0; you may not use this file except in compliance with the Elastic License - * 2.0. - */ - -package org.elasticsearch.xpack.wildcard.mapper; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.MatchAllDocsQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.QueryVisitor; - -import java.io.IOException; - -/** - * A query that matches all documents. The class is more of a marker - * that we encountered something that will need verification. - * (A MatchAllDocs query is used to indicate we can match all - * _without_ verification) - */ -public final class MatchAllButRequireVerificationQuery extends Query { - - @Override - public Query rewrite(IndexReader reader) throws IOException { - return new MatchAllDocsQuery(); - } - - @Override - public String toString(String field) { - return "*:* (tbc)"; - } - - @Override - public boolean equals(Object o) { - return sameClassAs(o); - } - - @Override - public int hashCode() { - return classHash(); - } - - @Override - public void visit(QueryVisitor visitor) { - visitor.visitLeaf(this); - } - - -} diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 787e614ebb90..1924eef6c56f 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -37,6 +37,8 @@ import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.MinimizationOperations; +import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.RegExp; import org.apache.lucene.util.automaton.RegExp.Kind; import org.elasticsearch.ElasticsearchParseException; @@ -348,26 +350,25 @@ public Query regexpQuery(String value, int syntaxFlags, int matchFlags, int maxD if (value.length() == 0) { return new MatchNoDocsQuery(); } + + //Check for simple "match all expressions e.g. .* + RegExp regExp = new RegExp(value, syntaxFlags, matchFlags); + Automaton a = regExp.toAutomaton(); + a = Operations.determinize(a, maxDeterminizedStates); + a = MinimizationOperations.minimize(a, maxDeterminizedStates); + if (Operations.isTotal(a)) { // Will match all + return existsQuery(context); + } RegExp ngramRegex = new RegExp(addLineEndChars(value), syntaxFlags, matchFlags); + Query approxBooleanQuery = toApproximationQuery(ngramRegex); Query approxNgramQuery = rewriteBoolToNgramQuery(approxBooleanQuery); - // MatchAll is a special case meaning the regex is known to match everything .* and - // there is no need for verification. - if (approxNgramQuery instanceof MatchAllDocsQuery) { - return existsQuery(context); - } RegExp regex = new RegExp(value, syntaxFlags, matchFlags); Automaton automaton = regex.toAutomaton(maxDeterminizedStates); - // MatchAllButRequireVerificationQuery is a special case meaning the regex is reduced to a single - // clause which we can't accelerate at all and needs verification. Example would be ".." - if (approxNgramQuery instanceof MatchAllButRequireVerificationQuery) { - return new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(), name(), value, automaton); - } - // We can accelerate execution with the ngram query return new BinaryDvConfirmedAutomatonQuery(approxNgramQuery, name(), value, automaton); } @@ -414,12 +415,12 @@ public static Query toApproximationQuery(RegExp r) throws IllegalArgumentExcepti // plain TermQuery objects together. Boolean queries are interpreted as a black box and not // concatenated. BooleanQuery.Builder wrapper = new BooleanQuery.Builder(); - wrapper.add(result, Occur.MUST); + wrapper.add(result, Occur.FILTER); result = wrapper.build(); } } else { // Expressions like (a){0,3} match empty string or up to 3 a's. - result = new MatchAllButRequireVerificationQuery(); + result = new MatchAllDocsQuery(); } break; case REGEXP_ANYSTRING: @@ -436,7 +437,7 @@ public static Query toApproximationQuery(RegExp r) throws IllegalArgumentExcepti case REGEXP_INTERVAL: case REGEXP_EMPTY: case REGEXP_AUTOMATON: - result = new MatchAllButRequireVerificationQuery(); + result = new MatchAllDocsQuery(); break; } assert result != null; // All regex types are understood and translated to a query. @@ -456,21 +457,21 @@ private static Query createConcatenationQuery(RegExp r) { sequence.append(tq.getTerm().text()); } else { if (sequence.length() > 0) { - bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST); + bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.FILTER); sequence = new StringBuilder(); } - bAnd.add(query, Occur.MUST); + bAnd.add(query, Occur.FILTER); } } if (sequence.length() > 0) { - bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST); + bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.FILTER); } BooleanQuery combined = bAnd.build(); if (combined.clauses().size() > 0) { return combined; } // There's something in the regex we couldn't represent as a query - resort to a match all with verification - return new MatchAllButRequireVerificationQuery(); + return new MatchAllDocsQuery(); } @@ -498,7 +499,7 @@ private static Query createUnionQuery(RegExp r) { } } // There's something in the regex we couldn't represent as a query - resort to a match all with verification - return new MatchAllButRequireVerificationQuery(); + return new MatchAllDocsQuery(); } private static void findLeaves(RegExp exp, Kind kind, List queries) { @@ -529,7 +530,7 @@ private Query rewriteBoolToNgramQuery(Query approxQuery) { for (BooleanClause clause : bq) { Query q = rewriteBoolToNgramQuery(clause.getQuery()); if (q != null) { - if (clause.getOccur().equals(Occur.MUST)) { + if (clause.getOccur().equals(Occur.FILTER)) { // Can't drop "should" clauses because it can elevate a sibling optional item // to mandatory (shoulds with 1 clause) causing false negatives // Dropping MUSTs increase false positives which are OK because are verified anyway. @@ -541,7 +542,7 @@ private Query rewriteBoolToNgramQuery(Query approxQuery) { rewritten.add(q, clause.getOccur()); } } - return simplify(rewritten.build()); + return rewritten.build(); } if (approxQuery instanceof TermQuery) { TermQuery tq = (TermQuery) approxQuery; @@ -549,7 +550,7 @@ private Query rewriteBoolToNgramQuery(Query approxQuery) { //Remove simple terms that are only string beginnings or ends. String s = tq.getTerm().text(); if (s.equals(WildcardFieldMapper.TOKEN_START_STRING) || s.equals(WildcardFieldMapper.TOKEN_END_STRING)) { - return new MatchAllButRequireVerificationQuery(); + return new MatchAllDocsQuery(); } // Break term into tokens @@ -557,75 +558,15 @@ private Query rewriteBoolToNgramQuery(Query approxQuery) { getNgramTokens(tokens, s); BooleanQuery.Builder rewritten = new BooleanQuery.Builder(); for (String string : tokens) { - addClause(string, rewritten, Occur.MUST); + addClause(string, rewritten, Occur.FILTER); } - return simplify(rewritten.build()); + return rewritten.build(); } - if (isMatchAll(approxQuery)) { + if (approxQuery instanceof MatchAllDocsQuery) { return approxQuery; } throw new IllegalStateException("Invalid query type found parsing regex query:" + approxQuery); } - - static Query simplify(Query input) { - if (input instanceof BooleanQuery == false) { - return input; - } - BooleanQuery result = (BooleanQuery) input; - if (result.clauses().size() == 0) { - // A ".*" clause can produce zero clauses in which case we return MatchAll - return new MatchAllDocsQuery(); - } - if (result.clauses().size() == 1) { - return simplify(result.clauses().get(0).getQuery()); - } - - // We may have a mix of MatchAll and concrete queries - assess if we can simplify - int matchAllCount = 0; - int verifyCount = 0; - boolean allConcretesAreOptional = true; - for (BooleanClause booleanClause : result.clauses()) { - Query q = booleanClause.getQuery(); - if (q instanceof MatchAllDocsQuery) { - matchAllCount++; - } else if (q instanceof MatchAllButRequireVerificationQuery) { - verifyCount++; - } else { - // Concrete query - if (booleanClause.getOccur() != Occur.SHOULD) { - allConcretesAreOptional = false; - } - } - } - - if ((allConcretesAreOptional && matchAllCount > 0)) { - // Any match all expression takes precedence over all optional concrete queries. - return new MatchAllDocsQuery(); - } - - if ((allConcretesAreOptional && verifyCount > 0)) { - // Any match all expression that needs verification takes precedence over all optional concrete queries. - return new MatchAllButRequireVerificationQuery(); - } - - // We have some mandatory concrete queries - strip out the superfluous match all expressions - if (allConcretesAreOptional == false && matchAllCount + verifyCount > 0) { - BooleanQuery.Builder rewritten = new BooleanQuery.Builder(); - for (BooleanClause booleanClause : result.clauses()) { - if (isMatchAll(booleanClause.getQuery()) == false) { - rewritten.add(booleanClause); - } - } - return simplify(rewritten.build()); - } - return result; - } - - - static boolean isMatchAll(Query q) { - return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery; - } - protected void getNgramTokens(Set tokens, String fragment) { if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) { // If a regex is a form of match-all e.g. ".*" we only produce the token start/end markers as search @@ -666,7 +607,7 @@ private void addClause(String token, BooleanQuery.Builder bqBuilder, Occur occur if (tokenSize < 2 || token.equals(WildcardFieldMapper.TOKEN_END_STRING)) { // there's something concrete to be searched but it's too short // Require verification. - bqBuilder.add(new BooleanClause(new MatchAllButRequireVerificationQuery(), occur)); + bqBuilder.add(new BooleanClause(new MatchAllDocsQuery(), occur)); return; } if (tokenSize == NGRAM_SIZE) { @@ -723,11 +664,11 @@ public Query rangeQuery( if (tokenSize == NGRAM_SIZE) { TermQuery tq = new TermQuery(new Term(name(), token)); - bqBuilder.add(new BooleanClause(tq, Occur.MUST)); + bqBuilder.add(new BooleanClause(tq, Occur.FILTER)); } else { PrefixQuery wq = new PrefixQuery(new Term(name(), token)); wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE); - bqBuilder.add(new BooleanClause(wq, Occur.MUST)); + bqBuilder.add(new BooleanClause(wq, Occur.FILTER)); } } BooleanQuery bq = bqBuilder.build(); diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java index 66c070da60eb..6289b0a877d0 100644 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java @@ -19,8 +19,10 @@ import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.DocValuesFieldExistsQuery; import org.apache.lucene.search.IndexSearcher; @@ -34,6 +36,7 @@ import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.store.BaseDirectoryWrapper; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; @@ -97,6 +100,8 @@ static SearchExecutionContext createMockSearchExecutionContext(boolean allowExpe static WildcardFieldMapper wildcardFieldType; static WildcardFieldMapper wildcardFieldType79; static KeywordFieldMapper keywordFieldType; + private DirectoryReader rewriteReader; + private BaseDirectoryWrapper rewriteDir; @Override protected Collection getPlugins() { @@ -120,8 +125,38 @@ public void setUp() throws Exception { org.elasticsearch.index.mapper.KeywordFieldMapper.Builder kwBuilder = new KeywordFieldMapper.Builder(KEYWORD_FIELD_NAME); keywordFieldType = kwBuilder.build(MapperBuilderContext.ROOT); + + rewriteDir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER_7_10); + RandomIndexWriter iw = new RandomIndexWriter(random(), rewriteDir, iwc); + + // Create a string that is too large and will not be indexed + String docContent = "a"; + Document doc = new Document(); + LuceneDocument parseDoc = new LuceneDocument(); + addFields(parseDoc, doc, docContent); + indexDoc(parseDoc, doc, iw); + + iw.forceMerge(1); + rewriteReader = iw.getReader(); + iw.close(); + super.setUp(); } + + + + @Override + public void tearDown() throws Exception { + try { + rewriteReader.close(); + rewriteDir.close(); + } catch (Exception ignoreCloseFailure) { + // allow any superclass tear down logic to continue + } + // TODO Auto-generated method stub + super.tearDown(); + } public void testTooBigKeywordField() throws IOException { Directory dir = newDirectory(); @@ -484,10 +519,10 @@ public void testRangeQueryVersusKeywordField() throws IOException { public void testRegexAcceleration() throws IOException, ParseException { // All these expressions should rewrite to a match all with no verification step required at all - String superfastRegexes[]= { ".*", "...*..", "(foo|bar|.*)", "@"}; + String superfastRegexes[]= { ".*", "(foo|bar|.*)", "@"}; for (String regex : superfastRegexes) { Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 0, 20000, null, MOCK_CONTEXT); - assertTrue(wildcardFieldQuery instanceof DocValuesFieldExistsQuery); + assertTrue(regex + "should have been accelerated", wildcardFieldQuery instanceof DocValuesFieldExistsQuery); } String matchNoDocsRegexes[]= { ""}; for (String regex : matchNoDocsRegexes) { @@ -518,12 +553,14 @@ public void testRegexAcceleration() throws IOException, ParseException { // All these expressions should rewrite to just the verification query (there's no ngram acceleration) // TODO we can possibly improve on some of these - String matchAllButVerifyTests[]= { "..", "(a)?","(a|b){0,3}", "((foo)?|(foo|bar)?)", "@&~(abc.+)", "aaa.+&.+bbb"}; + String matchAllButVerifyTests[]= { "..", "(a)?","(a|b){0,3}", "((foo)?|(foo|bar)?)", "@&~(abc.+)", "aaa.+&.+bbb", "a*", "...*.."}; for (String regex : matchAllButVerifyTests) { Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 0, 20000, null, MOCK_CONTEXT); BinaryDvConfirmedAutomatonQuery q = (BinaryDvConfirmedAutomatonQuery)wildcardFieldQuery; + Query approximationQuery = unwrapAnyBoost(q.getApproximationQuery()); + approximationQuery = getSimplifiedApproximationQuery(q.getApproximationQuery()); assertTrue(regex +" was not a pure verify query " +formatQuery(wildcardFieldQuery), - q.getApproximationQuery() instanceof MatchAllDocsQuery); + approximationQuery instanceof MatchAllDocsQuery); } @@ -532,8 +569,8 @@ public void testRegexAcceleration() throws IOException, ParseException { String suboptimalTests[][] = { // TODO short wildcards like a* OR b* aren't great so we just drop them. // Ideally we would attach to successors to create (acd OR bcd) - { "[ab]cd", "+cc_ +c__"} - }; + { "[ab]cd", "+(+cc_ +c__) +*:*"} + }; for (String[] test : suboptimalTests) { String regex = test[0]; String expectedAccelerationQueryString = test[1].replaceAll("_", ""+WildcardFieldMapper.TOKEN_START_OR_END_CHAR); @@ -569,7 +606,7 @@ public void testWildcardAcceleration() throws IOException, ParseException { { "foo*bar", "+_eo +eoo +aaq +aq_ +q__" }, { "foo?bar", "+_eo +eoo +aaq +aq_ +q__" }, { "?foo*bar?", "+eoo +aaq" }, - { "*c", "+c__" } }; + { "*c", "c__" } }; for (String[] test : tests) { String pattern = test[0]; String expectedAccelerationQueryString = test[1].replaceAll("_", "" + WildcardFieldMapper.TOKEN_START_OR_END_CHAR); @@ -714,7 +751,7 @@ public void testFuzzyAcceleration() throws IOException, ParseException { }; for (FuzzyTest test : tests) { Query wildcardFieldQuery = test.getFuzzyQuery(); - testExpectedAccelerationQuery(test.pattern, wildcardFieldQuery, test.getExpectedApproxQuery()); + testExpectedAccelerationQuery(test.pattern, wildcardFieldQuery, getSimplifiedApproximationQuery(test.getExpectedApproxQuery())); } } @@ -765,7 +802,8 @@ public void testRangeAcceleration() throws IOException, ParseException { } } - void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException { + void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException, + IOException { QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer()); Query expectedAccelerationQuery = qsp.parse(expectedAccelerationQueryString); @@ -780,16 +818,63 @@ private Query unwrapAnyConstantScore(Query q) { return q; } } + private Query unwrapAnyBoost(Query q) { + if (q instanceof BoostQuery) { + BoostQuery csq = (BoostQuery) q; + return csq.getQuery(); + } else { + return q; + } + } + - void testExpectedAccelerationQuery(String regex, Query combinedQuery, Query expectedAccelerationQuery) throws ParseException { + void testExpectedAccelerationQuery(String regex, Query combinedQuery, Query expectedAccelerationQuery) throws ParseException, + IOException { BinaryDvConfirmedAutomatonQuery cq = (BinaryDvConfirmedAutomatonQuery) unwrapAnyConstantScore(combinedQuery); Query approximationQuery = cq.getApproximationQuery(); - + approximationQuery = getSimplifiedApproximationQuery(approximationQuery); String message = "regex: "+ regex +"\nactual query: " + formatQuery(approximationQuery) + "\nexpected query: " + formatQuery(expectedAccelerationQuery) + "\n"; assertEquals(message, expectedAccelerationQuery, approximationQuery); } + // For comparison purposes rewrite and unwrap various superfluous parts to get to raw logic + protected Query getSimplifiedApproximationQuery(Query approximationQuery) throws IOException { + int numRewrites = 0; + int maxNumRewrites = 100; + for (; numRewrites < maxNumRewrites; numRewrites++) { + Query newApprox = approximationQuery.rewrite(rewriteReader); + if (newApprox == approximationQuery) { + break; + } + approximationQuery = newApprox; + + } + assertTrue(numRewrites < maxNumRewrites); + approximationQuery = rewriteFiltersToMustsForComparisonPurposes(approximationQuery); + return approximationQuery; + } + + private Query rewriteFiltersToMustsForComparisonPurposes(Query q) { + q = unwrapAnyBoost(q); + q = unwrapAnyConstantScore(q); + if (q instanceof BooleanQuery) { + BooleanQuery.Builder result = new BooleanQuery.Builder(); + BooleanQuery bq = (BooleanQuery)q; + for (BooleanClause cq : bq.clauses()){ + Query rewritten = rewriteFiltersToMustsForComparisonPurposes(cq.getQuery()); + if(cq.getOccur() == Occur.FILTER) { + result.add(rewritten, Occur.MUST); + } else { + result.add(rewritten, cq.getOccur()); + } + } + return result.build(); + } + + return q; + } + private String getRandomFuzzyPattern(HashSet values, int edits, int prefixLength) { assert edits >=0 && edits <=2; // Pick one of the indexed document values to focus our queries on.