Skip to content

Commit

Permalink
Search - add range query support to wildcard field (#57881) (#57988)
Browse files Browse the repository at this point in the history
Backport to add range query support to wildcard field

Closes #57816
  • Loading branch information
markharwood authored Jun 12, 2020
1 parent db03e7c commit 2da8e57
Show file tree
Hide file tree
Showing 2 changed files with 250 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,19 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.RegExp.Kind;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.geo.ShapeRelation;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.time.DateMathParser;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
Expand Down Expand Up @@ -70,6 +74,7 @@

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
Expand Down Expand Up @@ -613,6 +618,12 @@ static Query simplify(Query input) {
/**
 * Returns true if the given query matches all documents, whether or not a
 * later verification step is still required.
 */
static boolean isMatchAll(Query q) {
    if (q instanceof MatchAllDocsQuery) {
        return true;
    }
    return q instanceof MatchAllButRequireVerificationQuery;
}

/**
 * Returns the first ngram token produced for the given fragment.
 * A {@link LinkedHashSet} is used so insertion order is preserved and the
 * token emitted first by {@code getNgramTokens} is the one returned.
 */
protected String firstNgramToken(String fragment) {
    LinkedHashSet<String> orderedNgrams = new LinkedHashSet<>();
    getNgramTokens(orderedNgrams, fragment);
    return orderedNgrams.iterator().next();
}

protected void getNgramTokens(Set<String> tokens, String fragment) {
if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) {
Expand Down Expand Up @@ -678,6 +689,90 @@ private void addClause(String token, BooleanQuery.Builder bqBuilder, Occur occur
}
}

/**
 * Builds a range query over the wildcard field as the conjunction of:
 * 1) a fast "acceleration" query over the indexed ngrams that narrows candidates, and
 * 2) a slow doc-values automaton query that verifies the exact range bounds.
 * Throws if expensive queries are disabled; relation/timeZone/parser are unused here.
 */
@Override
public Query rangeQuery(
    Object lowerTerm,
    Object upperTerm,
    boolean includeLower,
    boolean includeUpper,
    ShapeRelation relation,
    ZoneId timeZone,
    DateMathParser parser,
    QueryShardContext context
) {
    // Range scans over wildcard fields are costly; honour the cluster-wide opt-out.
    if (context.allowExpensiveQueries() == false) {
        throw new ElasticsearchException("[range] queries on [wildcard] fields cannot be executed when '" +
            ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false.");
    }
    BytesRef lower = lowerTerm == null ? null : BytesRefs.toBytesRef(lowerTerm);
    BytesRef upper = upperTerm == null ? null : BytesRefs.toBytesRef(upperTerm);
    Query accelerationQuery = null;
    if (lowerTerm != null && upperTerm != null) {
        // Long common prefixes e.g. "C:/Program Files/a.txt" to "C:/Program Files/z.txt"
        // can be accelerated by searching for all the common leading ngrams e.g. c:/, /pr, rog, gra etc
        StringBuilder commonPrefix = new StringBuilder();
        // Normalise both bounds the same way values were indexed (lowercased, start/end markers added).
        String lowerS = addLineEndChars(toLowerCase(lower.utf8ToString()));
        String upperS = addLineEndChars(toLowerCase(upper.utf8ToString()));
        // Walk code point by code point (not char by char) so surrogate pairs are compared whole.
        for (int i = 0; i < Math.min(lowerS.length(), upperS.length());) {
            final int cL = lowerS.codePointAt(i);
            final int cU = upperS.codePointAt(i);
            if (cL == cU) {
                commonPrefix.append(Character.toChars(cL));
            } else {
                break;
            }
            int length = Character.charCount(cL);
            i += length;
        }

        if (commonPrefix.length() > 0) {
            Set<String> tokens = new HashSet<>();
            getNgramTokens(tokens, commonPrefix.toString());
            BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
            for (String token : tokens) {
                int tokenSize = token.codePointCount(0, token.length());
                // Single-character tokens and the bare end-of-string marker are too
                // unselective to be worth a clause.
                if (tokenSize < 2 || token.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
                    continue;
                }

                if (tokenSize == NGRAM_SIZE) {
                    // Full-size ngram: exact term lookup.
                    TermQuery tq = new TermQuery(new Term(name(), token));
                    bqBuilder.add(new BooleanClause(tq, Occur.MUST));
                } else {
                    // Shorter fragment (e.g. trailing partial ngram): match any term starting with it.
                    PrefixQuery wq = new PrefixQuery(new Term(name(), token));
                    wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
                    bqBuilder.add(new BooleanClause(wq, Occur.MUST));
                }
            }
            BooleanQuery bq = bqBuilder.build();
            if (bq.clauses().size() > 0) {
                accelerationQuery = bq;
            }
        }
    }
    if (accelerationQuery == null) {
        // Fallback - if there is no common prefix sequence then we look for the range of ngrams that appear at the start
        // of the string e.g. given 100 to 999 we would search for ngrams in the range
        //   TOKEN_START_OR_END_CHAR + "10" to
        //   TOKEN_START_OR_END_CHAR + "99"
        BytesRef lowerNgram = lower == null ? null : new BytesRef(firstNgramToken(
            addLineEndChars(toLowerCase(lower.utf8ToString()))));
        BytesRef upperNgram = upper == null ? null : new BytesRef(firstNgramToken(
            addLineEndChars(toLowerCase(upper.utf8ToString()))));
        // Inclusive on both ends regardless of includeLower/includeUpper: the acceleration
        // query only needs to be a superset; the slow query below enforces exact bounds.
        accelerationQuery = new TermRangeQuery(name(), lowerNgram, upperNgram, true, true);
    }

    // Build the automaton lazily - it is only needed if the acceleration query matches something.
    Supplier <Automaton> deferredAutomatonSupplier = ()->{
        return TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper);
    };
    AutomatonQueryOnBinaryDv slowQuery = new AutomatonQueryOnBinaryDv(name(), lower + "-" + upper, deferredAutomatonSupplier);

    // Candidates must pass BOTH the cheap ngram filter and the exact doc-values check.
    BooleanQuery.Builder qBuilder = new BooleanQuery.Builder();
    qBuilder.add(accelerationQuery, Occur.MUST);
    qBuilder.add(slowQuery, Occur.MUST);
    return qBuilder.build();
}

@Override
public Query fuzzyQuery(
Object value,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
Expand Down Expand Up @@ -214,7 +215,7 @@ public void testSearchResultsVersusKeywordField() throws IOException {
Query wildcardFieldQuery = null;
Query keywordFieldQuery = null;
String pattern = null;
switch (randomInt(3)) {
switch (randomInt(4)) {
case 0:
pattern = getRandomWildcardPattern();
wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
Expand Down Expand Up @@ -259,6 +260,14 @@ public void testSearchResultsVersusKeywordField() throws IOException {
keywordFieldQuery = keywordFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50,
transpositions, MOCK_QSC);
break;
case 4:
TermRangeQuery trq = getRandomRange(values);
wildcardFieldQuery = wildcardFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(),
trq.includesUpper(), null, null, null, MOCK_QSC);
keywordFieldQuery = keywordFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(),
trq.includesUpper(), null, null, null, MOCK_QSC);
break;

}
TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.RELEVANCE);
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.RELEVANCE);
Expand Down Expand Up @@ -294,6 +303,76 @@ public void testSearchResultsVersusKeywordField() throws IOException {
dir.close();
}

// Convenience overload: index a single string value, populating both the Lucene
// document and the parse-context document before handing off to the main indexDoc.
private void indexDoc(RandomIndexWriter iw, String value) throws IOException {
    Document luceneDoc = new Document();
    ParseContext.Document parseContextDoc = new ParseContext.Document();
    addFields(parseContextDoc, luceneDoc, value);
    indexDoc(parseContextDoc, luceneDoc, iw);
}

/**
 * Cross-checks wildcard-field range queries against the keyword field's range
 * queries on the same documents: both must return exactly the same doc ids.
 * Covers both acceleration strategies (long common prefix vs. no common prefix).
 */
public void testRangeQueryVersusKeywordField() throws IOException {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
    iwc.setMergePolicy(newTieredMergePolicy(random()));
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

    // Tests for acceleration strategy based on long common prefix
    indexDoc(iw, "C:\\Program Files\\a.txt");
    indexDoc(iw, "C:\\Program Files\\n.txt");
    indexDoc(iw, "C:\\Program Files\\z.txt");

    // Tests for acceleration strategy based on no common prefix
    indexDoc(iw, "a.txt");
    indexDoc(iw, "n.txt");
    indexDoc(iw, "z.txt");

    // Merge to a single segment so doc ids are directly comparable between queries.
    iw.forceMerge(1);
    DirectoryReader reader = iw.getReader();
    IndexSearcher searcher = newSearcher(reader);
    iw.close();


    // {lower, upper} bound pairs; null means an open-ended bound.
    String [][] rangeTests = {
        {"C:\\Program Files\\a", "C:\\Program Files\\z"},
        {"C:\\Program Files\\a", "C:\\Program Files\\n"},
        {null, "C:\\Program Files\\z"},
        {"C:\\Program Files\\a", null},

        {"a.txt", "z.txt"},
        {"a.txt", "n.txt"},
        {null, "z.txt"},
        {"a.txt", null}
    };

    for (String[] bounds : rangeTests) {
        BytesRef lower = bounds[0] == null ? null :new BytesRef(bounds[0]);
        BytesRef upper = bounds[1] == null ? null :new BytesRef(bounds[1]);
        // TermRangeQuery is used only as a convenient holder for the randomised bounds/flags.
        TermRangeQuery trq = new TermRangeQuery(WILDCARD_FIELD_NAME, lower, upper, randomBoolean(), randomBoolean());
        Query wildcardFieldQuery = wildcardFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(),
            trq.includesUpper(), null, null, null, MOCK_QSC);
        Query keywordFieldQuery = keywordFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(),
            trq.includesUpper(), null, null, null, MOCK_QSC);


        TopDocs kwTopDocs = searcher.search(keywordFieldQuery, 10, Sort.RELEVANCE);
        TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.RELEVANCE);
        // Same hit count...
        assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(kwTopDocs.totalHits.value));

        // ...and the same set of doc ids (order-independent comparison via set removal).
        HashSet<Integer> expectedDocs = new HashSet<>();
        for (ScoreDoc topDoc : kwTopDocs.scoreDocs) {
            expectedDocs.add(topDoc.doc);
        }
        for (ScoreDoc wcTopDoc : wildcardFieldTopDocs.scoreDocs) {
            assertTrue(expectedDocs.remove(wcTopDoc.doc));
        }
        assertThat(expectedDocs.size(), equalTo(0));

    }
    reader.close();
    dir.close();
}


public void testRegexAcceleration() throws IOException, ParseException {
// All these expressions should rewrite to a match all with no verification step required at all
String superfastRegexes[]= { ".*", "...*..", "(foo|bar|.*)", "@"};
Expand Down Expand Up @@ -485,6 +564,54 @@ public void testFuzzyAcceleration() throws IOException, ParseException {
}
}


/**
 * Fixture for a single range-acceleration case: the two range bounds plus the
 * space-separated list of ngram tokens expected in the acceleration query
 * ('_' stands in for the start-of-string marker character).
 */
static class RangeTest {
    String lower;
    String upper;
    String ngrams;

    RangeTest(String lower, String upper, String ngrams) {
        this.lower = lower;
        this.upper = upper;
        this.ngrams = ngrams;
    }

    // Builds the wildcard-field range query (inclusive bounds) under test.
    Query getRangeQuery() {
        return wildcardFieldType.fieldType().rangeQuery(lower, upper, true, true, null, null, null, MOCK_QSC);
    }

    // Builds the acceleration query we expect: one MUST term clause per listed ngram.
    Query getExpectedApproxQuery() throws ParseException {
        BooleanQuery.Builder expected = new BooleanQuery.Builder();
        if (ngrams != null) {
            for (String token : ngrams.split(" ")) {
                String indexedToken = token.replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING);
                expected.add(new TermQuery(new Term(WILDCARD_FIELD_NAME, indexedToken)), Occur.MUST);
            }
        }
        return expected.build();
    }
}

/**
 * Verifies the ngram acceleration clauses generated for range queries with a
 * long common prefix between the bounds.
 */
public void testRangeAcceleration() throws IOException, ParseException {
    RangeTest[] cases = {
        new RangeTest("c:/a.txt", "c:/z.txt", "_c: c:/"),
        new RangeTest("C:/ProgramFiles/a.txt", "C:/ProgramFiles/z.txt", "_c: :/p pro ogr ram mfi ile es/"),
    };
    for (RangeTest rangeCase : cases) {
        Query actualQuery = rangeCase.getRangeQuery();
        testExpectedAccelerationQuery(rangeCase.lower + "-" + rangeCase.upper, actualQuery, rangeCase.getExpectedApproxQuery());
    }
}

void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException {

QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer());
Expand Down Expand Up @@ -530,6 +657,33 @@ private String getRandomFuzzyPattern(HashSet<String> values, int edits, int pref
}
return randomValue;
}

/**
 * Builds a random range query anchored on one of the indexed values: the lower
 * bound is an existing value, the upper bound is that value with a random slice
 * shifted lexicographically upwards ('a' chars replaced with 'z').
 */
private TermRangeQuery getRandomRange(HashSet<String> values) {
    // Anchor the range on a real indexed value so queries are likely to match something.
    String lowerValue = values.toArray(new String[0])[randomIntBetween(0, values.size() - 1)];

    // Pick a random slice of the string to perturb.
    int sliceStart = randomIntBetween(0, lowerValue.length() - 1);
    int sliceLength = randomIntBetween(1, Math.min(10, lowerValue.length() - sliceStart));

    String head = lowerValue.substring(0, sliceStart);
    // Shift the slice upwards by replacing every 'a' with 'z' (may be a no-op if the slice has no 'a').
    String shiftedMiddle = lowerValue.substring(sliceStart, sliceStart + sliceLength).replaceAll("a", "z");
    String tail = lowerValue.substring(sliceStart + sliceLength);
    String upperValue = head + shiftedMiddle + tail;

    return new TermRangeQuery(WILDCARD_FIELD_NAME, new BytesRef(lowerValue), new BytesRef(upperValue),
        randomBoolean(), randomBoolean());
}


private String getRandomRegexPattern(HashSet<String> values) {
// Pick one of the indexed document values to focus our queries on.
Expand Down

0 comments on commit 2da8e57

Please sign in to comment.