Skip to content

Commit

Permalink
New TermsEnum API for discovering terms in the index. (#66452)
Browse files Browse the repository at this point in the history
New api designed for use by apps like Kibana for auto-complete use cases.
A search string is supplied which is used as prefix for matching terms found in a given field in the index.
Supported field types are keyword, constant_keyword and flattened.
A timeout can limit the amount of time spent looking for matches (default 1s) and an `index_filter` query can limit indices e.g. those in the hot or warm tier by querying the `_tier` field

Closes #59137
  • Loading branch information
markharwood authored May 6, 2021
1 parent c9ca64c commit 73e0662
Show file tree
Hide file tree
Showing 29 changed files with 2,770 additions and 1 deletion.
97 changes: 97 additions & 0 deletions docs/reference/search/terms-enum.asciidoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
[[search-terms-enum]]
=== Terms enum API

The terms enum API can be used to discover terms in the index that match
a partial string. This is used for auto-complete:

[source,console]
--------------------------------------------------
POST stackoverflow/_terms_enum
{
"field" : "tags",
"string" : "kiba"
}
--------------------------------------------------
// TEST[setup:stackoverflow]


The API returns the following response:

[source,console-result]
--------------------------------------------------
{
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"terms": [
"kibana"
],
"complete" : true
}
--------------------------------------------------

The "complete" flag is false if the time or size constraints were reached
before the full set of available values could be examined.

[[search-terms-enum-api-request]]
==== {api-request-title}

`GET /<target>/_terms_enum`


[[search-terms-enum-api-desc]]
==== {api-description-title}

The terms enum API can be used to discover terms in the index that begin with the provided
string. It is designed for low-latency look-ups used in auto-complete scenarios.


[[search-terms-enum-api-path-params]]
==== {api-path-parms-title}

`<target>`::
(Mandatory, string)
Comma-separated list of data streams, indices, and index aliases to search.
Wildcard (`*`) expressions are supported.
+
To search all data streams or indices in a cluster, omit this parameter or use
`_all` or `*`.

[[search-terms-enum-api-request-body]]
==== {api-request-body-title}

[[terms-enum-field-param]]
`field`::
(Mandatory, string)
Which field to match

[[terms-enum-string-param]]
`string`::
(Mandatory, string)
The string to match at the start of indexed terms

[[terms-enum-size-param]]
`size`::
(Optional, integer)
How many matching terms to return. Defaults to 10

[[terms-enum-timeout-param]]
`timeout`::
(Optional, <<time-units,time value>>)
The maximum length of time to spend collecting results. Defaults to "1s" (one second).
If the timeout is exceeded, the `complete` flag is set to false in the response and the results may
be partial or empty.

[[terms-enum-case_insensitive-param]]
`case_insensitive`::
(Optional, boolean)
When true the provided search string is matched against index terms without case sensitivity.
Defaults to false.

[[terms-enum-index_filter-param]]
`index_filter`::
(Optional, <<query-dsl,query object>>) Allows an index shard to be skipped if the provided
query rewrites to `match_none`.

35 changes: 35 additions & 0 deletions rest-api-spec/src/main/resources/rest-api-spec/api/termsenum.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"termsenum":{
"documentation":{
"url":"https://www.elastic.co/guide/en/elasticsearch/reference/current/terms-enum.html",
"description": "The terms enum API can be used to discover terms in the index that begin with the provided string. It is designed for low-latency look-ups used in auto-complete scenarios."
},
"stability":"beta",
"visibility":"public",
"headers":{
"accept": [ "application/json"],
"content_type": ["application/json"]
},
"url":{
"paths":[
{
"path": "/{index}/_terms_enum",
"methods": [
"GET",
"POST"
],
"parts": {
"index": {
"type": "list",
"description": "A comma-separated list of index names to search; use `_all` or empty string to perform the operation on all indices"
}
}
}
]
},
"params":{},
"body":{
"description":"field name, string which is the prefix expected in matching terms, timeout and size for max number of results"
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,18 @@
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.AutomatonQueries;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
Expand Down Expand Up @@ -248,6 +258,25 @@ public KeywordFieldType(String name, NamedAnalyzer analyzer) {
this.scriptValues = null;
}

@Override
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext) throws IOException {
    IndexReader topReader = queryShardContext.searcher().getTopReaderContext().reader();

    Terms fieldTerms = MultiTerms.getTerms(topReader, name());
    if (fieldTerms == null) {
        // No terms for this field on this shard - nothing to enumerate.
        return null;
    }

    // Accept the user-typed string (optionally ignoring case) ...
    Automaton matcher;
    if (caseInsensitive) {
        matcher = AutomatonQueries.caseInsensitivePrefix(string);
    } else {
        matcher = Automata.makeString(string);
    }
    // ... followed by any suffix, i.e. a prefix match over indexed terms.
    matcher = Operations.concatenate(matcher, Automata.makeAnyString());
    matcher = MinimizationOperations.minimize(matcher, Integer.MAX_VALUE);

    return new CompiledAutomaton(matcher).getTermsEnum(fieldTerms);
}

@Override
public String typeName() {
return CONTENT_TYPE;
Expand Down Expand Up @@ -470,4 +499,6 @@ protected String contentType() {
public FieldMapper.Builder getMergeBuilder() {
return new Builder(simpleName(), indexAnalyzers, scriptCompiler).init(this);
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
Expand Down Expand Up @@ -429,4 +430,20 @@ public enum CollapseType {
KEYWORD,
NUMERIC
}

/**
 * This method is used to support auto-complete services and implementations
 * are expected to find terms beginning with the provided string very quickly.
 * If fields cannot look up matching terms quickly they should return null.
 * The returned TermsEnum should implement next(), term() and docFreq() methods
 * but postings etc are not required.
 * The base implementation here returns null, meaning the field type does not
 * support fast term enumeration; subclasses opt in by overriding.
 * @param caseInsensitive if matches should be case insensitive
 * @param string the partially complete word the user has typed (can be empty)
 * @param queryShardContext the shard context
 * @return null or an enumeration of matching terms and their doc frequencies
 * @throws IOException Errors accessing data
 */
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext) throws IOException {
return null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,27 @@
package org.elasticsearch.index.mapper.flattened;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.AutomatonQueries;
import org.elasticsearch.common.unit.Fuzziness;
Expand Down Expand Up @@ -241,6 +254,29 @@ public Query wildcardQuery(String value,
public Query termQueryCaseInsensitive(Object value, SearchExecutionContext context) {
return AutomatonQueries.caseInsensitiveTermQuery(new Term(name(), indexedValueForSearch(value)));
}

/**
 * Supports auto-complete term discovery for a single key of a flattened field.
 * Flattened fields index terms in "key SEPARATOR value" form, so the automaton
 * must match the key prefix before matching the user-supplied string, and
 * discovered terms must be translated back to bare values before returning.
 */
@Override
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext) throws IOException {
IndexReader reader = queryShardContext.searcher().getTopReaderContext().reader();
Terms terms = MultiTerms.getTerms(reader, name());
if (terms == null) {
// Field does not exist on this shard.
return null;
}

// Every indexed term for this key starts with "key SEPARATOR" - match that literally first.
Automaton a = Automata.makeString(key + FlattenedFieldParser.SEPARATOR);
if (caseInsensitive) {
// caseInsensitivePrefix already appends the any-suffix matching, so no
// extra makeAnyString() concatenation is needed on this branch.
a = Operations.concatenate(a, AutomatonQueries.caseInsensitivePrefix(string));
} else {
a = Operations.concatenate(a, Automata.makeString(string));
a = Operations.concatenate(a, Automata.makeAnyString());
}
a = MinimizationOperations.minimize(a, Integer.MAX_VALUE);

CompiledAutomaton automaton = new CompiledAutomaton(a);
// Wrap result in a class that strips field names from discovered terms
return new TranslatingTermsEnum(automaton.getTermsEnum(terms));
}

@Override
public BytesRef indexedValueForSearch(Object value) {
Expand Down Expand Up @@ -270,6 +306,95 @@ public ValueFetcher valueFetcher(SearchExecutionContext context, String format)
return SourceValueFetcher.identity(rootName + "." + key, context, format);
}
}


// Wraps a raw Lucene TermsEnum to strip values of fieldnames
static class TranslatingTermsEnum extends TermsEnum {

    // The raw enum over keyed "fieldname SEPARATOR value" terms; immutable once set.
    private final TermsEnum delegate;

    TranslatingTermsEnum(TermsEnum delegate) {
        this.delegate = delegate;
    }

    // Shared stripping logic for next() and term(); returns null through unchanged.
    private static BytesRef stripFieldName(BytesRef keyedTerm) {
        return keyedTerm == null ? null : FlattenedFieldParser.extractValue(keyedTerm);
    }

    @Override
    public BytesRef next() throws IOException {
        // Strip the term of the fieldname value
        return stripFieldName(delegate.next());
    }

    @Override
    public BytesRef term() throws IOException {
        // Strip the term of the fieldname value
        return stripFieldName(delegate.term());
    }

    @Override
    public int docFreq() throws IOException {
        return delegate.docFreq();
    }

    //=============== All other TermsEnum methods not supported =================

    @Override
    public AttributeSource attributes() {
        throw new UnsupportedOperationException();
    }

    @Override
    public boolean seekExact(BytesRef text) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public SeekStatus seekCeil(BytesRef text) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public void seekExact(long ord) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public void seekExact(BytesRef term, TermState state) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public long ord() throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public long totalTermFreq() throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public ImpactsEnum impacts(int flags) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public TermState termState() throws IOException {
        throw new UnsupportedOperationException();
    }

}

/**
* A field data implementation that gives access to the values associated with
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,4 +166,15 @@ static BytesRef extractKey(BytesRef keyedValue) {
}
return new BytesRef(keyedValue.bytes, keyedValue.offset, length);
}

/**
 * Extracts the value portion of a keyed flattened term of the form
 * "key SEPARATOR value". Mirrors {@code extractKey} above, which returns
 * the portion before the separator.
 * @param keyedValue a term possibly backed by a slice of a larger byte array
 * @return the bytes following the first separator byte
 */
static BytesRef extractValue(BytesRef keyedValue) {
    int length;
    for (length = 0; length < keyedValue.length; length++) {
        if (keyedValue.bytes[keyedValue.offset + length] == SEPARATOR_BYTE) {
            break;
        }
    }
    // The value starts just past the separator.
    int valueStart = keyedValue.offset + length + 1;
    // BUG FIX: the remaining length must be relative to this term's own
    // length (keyedValue.length - length - 1), not computed from the
    // absolute position valueStart - the original `keyedValue.length -
    // valueStart` is wrong whenever keyedValue.offset != 0, i.e. when the
    // BytesRef is a slice into a shared byte array (see the BytesRef
    // bytes/offset/length contract, and compare extractKey above).
    return new BytesRef(keyedValue.bytes, valueStart, keyedValue.length - length - 1);
}
}
Loading

0 comments on commit 73e0662

Please sign in to comment.