From 2da8e57f59d060e315adcdea61e0f74bc1cc6756 Mon Sep 17 00:00:00 2001 From: markharwood Date: Fri, 12 Jun 2020 11:30:54 +0100 Subject: [PATCH] Search - add range query support to wildcard field (#57881) (#57988) Backport to add range query support to wildcard field Closes #57816 --- .../wildcard/mapper/WildcardFieldMapper.java | 95 +++++++++++ .../mapper/WildcardFieldMapperTests.java | 156 +++++++++++++++++- 2 files changed, 250 insertions(+), 1 deletion(-) diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index b279a69385577..b157d81e0b3a7 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -30,15 +30,19 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.RegExp; import org.apache.lucene.util.automaton.RegExp.Kind; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.common.geo.ShapeRelation; import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.time.DateMathParser; import org.elasticsearch.common.unit.Fuzziness; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentParser; @@ -70,6 +74,7 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.time.ZoneId; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; @@ -613,6 +618,12 @@ static Query simplify(Query input) { static boolean isMatchAll(Query q) { return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery; } + + protected String firstNgramToken(String fragment) { + LinkedHashSet tokens = new LinkedHashSet<>(); + getNgramTokens(tokens, fragment); + return tokens.iterator().next(); + } protected void getNgramTokens(Set tokens, String fragment) { if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) { @@ -678,6 +689,90 @@ private void addClause(String token, BooleanQuery.Builder bqBuilder, Occur occur } } + @Override + public Query rangeQuery( + Object lowerTerm, + Object upperTerm, + boolean includeLower, + boolean includeUpper, + ShapeRelation relation, + ZoneId timeZone, + DateMathParser parser, + QueryShardContext context + ) { + if (context.allowExpensiveQueries() == false) { + throw new ElasticsearchException("[range] queries on [wildcard] fields cannot be executed when '" + + ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false."); + } + BytesRef lower = lowerTerm == null ? null : BytesRefs.toBytesRef(lowerTerm); + BytesRef upper = upperTerm == null ? null : BytesRefs.toBytesRef(upperTerm); + Query accelerationQuery = null; + if (lowerTerm != null && upperTerm != null) { + // Long common prefixes e.g. "C:/Program Files/a,txt" to "C:/Program Files/z,txt" + // can be accelerated by searching for all the common leading ngrams e.g. c:/, /pr, rog, gra etc + StringBuilder commonPrefix = new StringBuilder(); + String lowerS = addLineEndChars(toLowerCase(lower.utf8ToString())); + String upperS = addLineEndChars(toLowerCase(upper.utf8ToString())); + for (int i = 0; i < Math.min(lowerS.length(), upperS.length());) { + final int cL = lowerS.codePointAt(i); + final int cU = upperS.codePointAt(i); + if (cL == cU) { + commonPrefix.append(Character.toChars(cL)); + } else { + break; + } + int length = Character.charCount(cL); + i += length; + } + + if (commonPrefix.length() > 0) { + Set tokens = new HashSet<>(); + getNgramTokens(tokens, commonPrefix.toString()); + BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); + for (String token : tokens) { + int tokenSize = token.codePointCount(0, token.length()); + if (tokenSize < 2 || token.equals(WildcardFieldMapper.TOKEN_END_STRING)) { + continue; + } + + if (tokenSize == NGRAM_SIZE) { + TermQuery tq = new TermQuery(new Term(name(), token)); + bqBuilder.add(new BooleanClause(tq, Occur.MUST)); + } else { + PrefixQuery wq = new PrefixQuery(new Term(name(), token)); + wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE); + bqBuilder.add(new BooleanClause(wq, Occur.MUST)); + } + } + BooleanQuery bq = bqBuilder.build(); + if (bq.clauses().size() > 0) { + accelerationQuery = bq; + } + } + } + if (accelerationQuery == null) { + // Fallback - if there is no common prefix sequence then we look for the range of ngrams that appear at the start + // of the string e.g. given 100 to 999 we would search for ngrams in the range + // TOKEN_START_OR_END_CHAR + "10" to + // TOKEN_START_OR_END_CHAR + "99" + BytesRef lowerNgram = lower == null ? null : new BytesRef(firstNgramToken( + addLineEndChars(toLowerCase(lower.utf8ToString())))); + BytesRef upperNgram = upper == null ? null : new BytesRef(firstNgramToken( + addLineEndChars(toLowerCase(upper.utf8ToString())))); + accelerationQuery = new TermRangeQuery(name(), lowerNgram, upperNgram, true, true); + } + + Supplier deferredAutomatonSupplier = ()->{ + return TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper); + }; + AutomatonQueryOnBinaryDv slowQuery = new AutomatonQueryOnBinaryDv(name(), lower + "-" + upper, deferredAutomatonSupplier); + + BooleanQuery.Builder qBuilder = new BooleanQuery.Builder(); + qBuilder.add(accelerationQuery, Occur.MUST); + qBuilder.add(slowQuery, Occur.MUST); + return qBuilder.build(); + } + @Override public Query fuzzyQuery( Object value, diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java index 6af2e499c677c..a65297bfd889c 100644 --- a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java @@ -30,6 +30,7 @@ import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.store.Directory; @@ -214,7 +215,7 @@ public void testSearchResultsVersusKeywordField() throws IOException { Query wildcardFieldQuery = null; Query keywordFieldQuery = null; String pattern = null; - switch (randomInt(3)) { + switch (randomInt(4)) { case 0: pattern = getRandomWildcardPattern(); wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC); @@ -259,6 +260,14 @@ public void testSearchResultsVersusKeywordField() throws IOException { keywordFieldQuery = keywordFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50, transpositions, MOCK_QSC); break; + case 4: + TermRangeQuery trq = getRandomRange(values); + wildcardFieldQuery = wildcardFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(), + trq.includesUpper(), null, null, null, MOCK_QSC); + keywordFieldQuery = keywordFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(), + trq.includesUpper(), null, null, null, MOCK_QSC); + break; + } TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.RELEVANCE); TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.RELEVANCE); @@ -294,6 +303,76 @@ public void testSearchResultsVersusKeywordField() throws IOException { dir.close(); } + private void indexDoc(RandomIndexWriter iw, String value) throws IOException { + Document doc = new Document(); + ParseContext.Document parseDoc = new ParseContext.Document(); + addFields(parseDoc, doc, value); + indexDoc(parseDoc, doc, iw); + } + + public void testRangeQueryVersusKeywordField() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER); + iwc.setMergePolicy(newTieredMergePolicy(random())); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + // Tests for acceleration strategy based on long common prefix + indexDoc(iw, "C:\\Program Files\\a.txt"); + indexDoc(iw, "C:\\Program Files\\n.txt"); + indexDoc(iw, "C:\\Program Files\\z.txt"); + + // Tests for acceleration strategy based on no common prefix + indexDoc(iw, "a.txt"); + indexDoc(iw, "n.txt"); + indexDoc(iw, "z.txt"); + + iw.forceMerge(1); + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); + iw.close(); + + + String [][] rangeTests = { + {"C:\\Program Files\\a", "C:\\Program Files\\z"}, + {"C:\\Program Files\\a", "C:\\Program Files\\n"}, + {null, "C:\\Program Files\\z"}, + {"C:\\Program Files\\a", null}, + + {"a.txt", "z.txt"}, + {"a.txt", "n.txt"}, + {null, "z.txt"}, + {"a.txt", null} + }; + + for (String[] bounds : rangeTests) { + BytesRef lower = bounds[0] == null ? null :new BytesRef(bounds[0]); + BytesRef upper = bounds[1] == null ? null :new BytesRef(bounds[1]); + TermRangeQuery trq = new TermRangeQuery(WILDCARD_FIELD_NAME, lower, upper, randomBoolean(), randomBoolean()); + Query wildcardFieldQuery = wildcardFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(), + trq.includesUpper(), null, null, null, MOCK_QSC); + Query keywordFieldQuery = keywordFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(), + trq.includesUpper(), null, null, null, MOCK_QSC); + + + TopDocs kwTopDocs = searcher.search(keywordFieldQuery, 10, Sort.RELEVANCE); + TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.RELEVANCE); + assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(kwTopDocs.totalHits.value)); + + HashSet expectedDocs = new HashSet<>(); + for (ScoreDoc topDoc : kwTopDocs.scoreDocs) { + expectedDocs.add(topDoc.doc); + } + for (ScoreDoc wcTopDoc : wildcardFieldTopDocs.scoreDocs) { + assertTrue(expectedDocs.remove(wcTopDoc.doc)); + } + assertThat(expectedDocs.size(), equalTo(0)); + + } + reader.close(); + dir.close(); + } + + public void testRegexAcceleration() throws IOException, ParseException { // All these expressions should rewrite to a match all with no verification step required at all String superfastRegexes[]= { ".*", "...*..", "(foo|bar|.*)", "@"}; @@ -485,6 +564,54 @@ public void testFuzzyAcceleration() throws IOException, ParseException { } } + + static class RangeTest { + String lower; + String upper; + String ngrams; + + RangeTest( + String lower, + String upper, + String ngrams + ) { + super(); + this.lower = lower; + this.upper = upper; + this.ngrams = ngrams; + } + + Query getRangeQuery() { + return wildcardFieldType.fieldType().rangeQuery(lower, upper, true, true, null, null, null, MOCK_QSC); + } + + Query getExpectedApproxQuery() throws ParseException { + BooleanQuery.Builder bq = new BooleanQuery.Builder(); + if (ngrams != null) { + String[] tokens = ngrams.split(" "); + for (String token : tokens) { + Query ngramQuery = new TermQuery( + new Term(WILDCARD_FIELD_NAME, token.replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING)) + ); + bq.add(ngramQuery, Occur.MUST); + } + } + return bq.build(); + } + } + + public void testRangeAcceleration() throws IOException, ParseException { + + RangeTest[] tests = { + new RangeTest("c:/a.txt", "c:/z.txt", "_c: c:/"), + new RangeTest("C:/ProgramFiles/a.txt", "C:/ProgramFiles/z.txt", "_c: :/p pro ogr ram mfi ile es/"), + }; + for (RangeTest test : tests) { + Query wildcardFieldQuery = test.getRangeQuery(); + testExpectedAccelerationQuery(test.lower + "-" + test.upper, wildcardFieldQuery, test.getExpectedApproxQuery()); + } + } + void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException { QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer()); @@ -530,6 +657,33 @@ private String getRandomFuzzyPattern(HashSet values, int edits, int pref } return randomValue; } + + private TermRangeQuery getRandomRange(HashSet values) { + // Pick one of the indexed document values to focus our queries on. + String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size()-1)]; + StringBuilder upper = new StringBuilder(); + //Pick a part of the string to change + int substitutionPoint = randomIntBetween(0, randomValue.length()-1); + int substitutionLength = randomIntBetween(1, Math.min(10, randomValue.length() - substitutionPoint)); + + //Add any head to the result, unchanged + if(substitutionPoint >0) { + upper.append(randomValue.substring(0,substitutionPoint)); + } + + // Modify the middle... + String replacementPart = randomValue.substring(substitutionPoint, substitutionPoint+substitutionLength); + // .-replace all a chars with z + upper.append(replacementPart.replaceAll("a", "z")); + + //add any remaining tail, unchanged + if(substitutionPoint + substitutionLength <= randomValue.length()-1) { + upper.append(randomValue.substring(substitutionPoint + substitutionLength)); + } + return new TermRangeQuery(WILDCARD_FIELD_NAME, new BytesRef(randomValue), new BytesRef(upper.toString()), + randomBoolean(), randomBoolean()); + } + private String getRandomRegexPattern(HashSet values) { // Pick one of the indexed document values to focus our queries on.