Skip to content

Commit

Permalink
New TermsEnum API for discovering terms in the index. (#66452)
Browse files Browse the repository at this point in the history
New api designed for use by apps like Kibana for auto-complete use cases.
A search string is supplied which is used as prefix for matching terms found in a given field in the index.
Supported field types are keyword, constant_keyword and flattened.
A timeout can limit the amount of time spent looking for matches (default 1s) and an `index_filter` query can limit indices e.g. those in the hot or warm tier by querying the `_tier` field

Closes #59137
  • Loading branch information
markharwood authored May 6, 2021
1 parent c9ca64c commit 73e0662
Show file tree
Hide file tree
Showing 29 changed files with 2,770 additions and 1 deletion.
97 changes: 97 additions & 0 deletions docs/reference/search/terms-enum.asciidoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
[[search-terms-enum]]
=== Terms enum API

The terms enum API can be used to discover terms in the index that match
a partial string. This is used for auto-complete:

[source,console]
--------------------------------------------------
POST stackoverflow/_terms_enum
{
"field" : "tags",
"string" : "kiba"
}
--------------------------------------------------
// TEST[setup:stackoverflow]


The API returns the following response:

[source,console-result]
--------------------------------------------------
{
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"terms": [
"kibana"
],
"complete" : true
}
--------------------------------------------------

The "complete" flag is false if the time or size constraints were reached
before the full set of available values could be examined.

[[search-terms-enum-api-request]]
==== {api-request-title}

`GET /<target>/_terms_enum`


[[search-terms-enum-api-desc]]
==== {api-description-title}

The terms enum API can be used to discover terms in the index that begin with the provided
string. It is designed for low-latency look-ups used in auto-complete scenarios.


[[search-terms-enum-api-path-params]]
==== {api-path-parms-title}

`<target>`::
(Mandatory, string)
Comma-separated list of data streams, indices, and index aliases to search.
Wildcard (`*`) expressions are supported.
+
To search all data streams or indices in a cluster, omit this parameter or use
`_all` or `*`.

[[search-terms-enum-api-request-body]]
==== {api-request-body-title}

[[terms-enum-field-param]]
`field`::
(Mandatory, string)
Which field to match

[[terms-enum-string-param]]
`string`::
(Mandatory, string)
The string to match at the start of indexed terms

[[terms-enum-size-param]]
`size`::
(Optional, integer)
How many matching terms to return. Defaults to 10

[[terms-enum-timeout-param]]
`timeout`::
(Optional, <<time-units,time value>>)
The maximum length of time to spend collecting results. Defaults to "1s" (one second).
If the timeout is exceeded, the `complete` flag is set to false in the response and the results may
be partial or empty.

[[terms-enum-case_insensitive-param]]
`case_insensitive`::
(Optional, boolean)
When true the provided search string is matched against index terms without case sensitivity.
Defaults to false.

[[terms-enum-index_filter-param]]
`index_filter`::
(Optional, <<query-dsl,query object>>) Allows an index shard to be skipped if the provided
query rewrites to `match_none`.

35 changes: 35 additions & 0 deletions rest-api-spec/src/main/resources/rest-api-spec/api/termsenum.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"termsenum":{
"documentation":{
"url":"https://www.elastic.co/guide/en/elasticsearch/reference/current/terms-enum.html",
"description": "The terms enum API can be used to discover terms in the index that begin with the provided string. It is designed for low-latency look-ups used in auto-complete scenarios."
},
"stability":"beta",
"visibility":"public",
"headers":{
"accept": [ "application/json"],
"content_type": ["application/json"]
},
"url":{
"paths":[
{
"path": "/{index}/_terms_enum",
"methods": [
"GET",
"POST"
],
"parts": {
"index": {
"type": "list",
"description": "A comma-separated list of index names to search; use `_all` or empty string to perform the operation on all indices"
}
}
}
]
},
"params":{},
"body":{
"description":"field name, string which is the prefix expected in matching terms, timeout and size for max number of results"
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,18 @@
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.AutomatonQueries;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
Expand Down Expand Up @@ -248,6 +258,25 @@ public KeywordFieldType(String name, NamedAnalyzer analyzer) {
this.scriptValues = null;
}

@Override
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext) throws IOException {
    IndexReader topReader = queryShardContext.searcher().getTopReaderContext().reader();

    Terms fieldTerms = MultiTerms.getTerms(topReader, name());
    if (fieldTerms == null) {
        // No terms for this field on this shard - nothing to enumerate.
        return null;
    }

    // Accept the user-typed string (optionally ignoring case) ...
    Automaton matcher;
    if (caseInsensitive) {
        matcher = AutomatonQueries.caseInsensitivePrefix(string);
    } else {
        matcher = Automata.makeString(string);
    }
    // ... followed by any suffix, i.e. a prefix match over indexed terms.
    matcher = Operations.concatenate(matcher, Automata.makeAnyString());
    matcher = MinimizationOperations.minimize(matcher, Integer.MAX_VALUE);

    return new CompiledAutomaton(matcher).getTermsEnum(fieldTerms);
}

@Override
public String typeName() {
return CONTENT_TYPE;
Expand Down Expand Up @@ -470,4 +499,6 @@ protected String contentType() {
public FieldMapper.Builder getMergeBuilder() {
return new Builder(simpleName(), indexAnalyzers, scriptCompiler).init(this);
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
Expand Down Expand Up @@ -429,4 +430,20 @@ public enum CollapseType {
KEYWORD,
NUMERIC
}

/**
 * This method is used to support auto-complete services and implementations
 * are expected to find terms beginning with the provided string very quickly.
 * If fields cannot look up matching terms quickly they should return null.
 * The returned TermsEnum should implement next(), term() and docFreq() methods
 * but postings etc are not required.
 * The base implementation here returns null, meaning the field type does not
 * support fast term enumeration; subclasses opt in by overriding.
 * @param caseInsensitive if matches should be case insensitive
 * @param string the partially complete word the user has typed (can be empty)
 * @param queryShardContext the shard context
 * @return null or an enumeration of matching terms and their doc frequencies
 * @throws IOException Errors accessing data
 */
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext) throws IOException {
return null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,27 @@
package org.elasticsearch.index.mapper.flattened;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.AutomatonQueries;
import org.elasticsearch.common.unit.Fuzziness;
Expand Down Expand Up @@ -241,6 +254,29 @@ public Query wildcardQuery(String value,
public Query termQueryCaseInsensitive(Object value, SearchExecutionContext context) {
return AutomatonQueries.caseInsensitiveTermQuery(new Term(name(), indexedValueForSearch(value)));
}

/**
 * Supports auto-complete term discovery for a single key of a flattened field.
 * Flattened fields index terms in "key SEPARATOR value" form, so the automaton
 * must match the key prefix before matching the user-supplied string, and
 * discovered terms must be translated back to bare values before returning.
 */
@Override
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext) throws IOException {
IndexReader reader = queryShardContext.searcher().getTopReaderContext().reader();
Terms terms = MultiTerms.getTerms(reader, name());
if (terms == null) {
// Field does not exist on this shard.
return null;
}

// Every indexed term for this key starts with "key SEPARATOR" - match that literally first.
Automaton a = Automata.makeString(key + FlattenedFieldParser.SEPARATOR);
if (caseInsensitive) {
// caseInsensitivePrefix already appends the any-suffix matching, so no
// extra makeAnyString() concatenation is needed on this branch.
a = Operations.concatenate(a, AutomatonQueries.caseInsensitivePrefix(string));
} else {
a = Operations.concatenate(a, Automata.makeString(string));
a = Operations.concatenate(a, Automata.makeAnyString());
}
a = MinimizationOperations.minimize(a, Integer.MAX_VALUE);

CompiledAutomaton automaton = new CompiledAutomaton(a);
// Wrap result in a class that strips field names from discovered terms
return new TranslatingTermsEnum(automaton.getTermsEnum(terms));
}

@Override
public BytesRef indexedValueForSearch(Object value) {
Expand Down Expand Up @@ -270,6 +306,95 @@ public ValueFetcher valueFetcher(SearchExecutionContext context, String format)
return SourceValueFetcher.identity(rootName + "." + key, context, format);
}
}


// Wraps a raw Lucene TermsEnum to strip values of fieldnames
static class TranslatingTermsEnum extends TermsEnum {

    // The raw enum over keyed "fieldname SEPARATOR value" terms; immutable once set.
    private final TermsEnum delegate;

    TranslatingTermsEnum(TermsEnum delegate) {
        this.delegate = delegate;
    }

    // Shared stripping logic for next() and term(); returns null through unchanged.
    private static BytesRef stripFieldName(BytesRef keyedTerm) {
        return keyedTerm == null ? null : FlattenedFieldParser.extractValue(keyedTerm);
    }

    @Override
    public BytesRef next() throws IOException {
        // Strip the term of the fieldname value
        return stripFieldName(delegate.next());
    }

    @Override
    public BytesRef term() throws IOException {
        // Strip the term of the fieldname value
        return stripFieldName(delegate.term());
    }

    @Override
    public int docFreq() throws IOException {
        return delegate.docFreq();
    }

    //=============== All other TermsEnum methods not supported =================

    @Override
    public AttributeSource attributes() {
        throw new UnsupportedOperationException();
    }

    @Override
    public boolean seekExact(BytesRef text) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public SeekStatus seekCeil(BytesRef text) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public void seekExact(long ord) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public void seekExact(BytesRef term, TermState state) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public long ord() throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public long totalTermFreq() throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public ImpactsEnum impacts(int flags) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public TermState termState() throws IOException {
        throw new UnsupportedOperationException();
    }

}

/**
* A field data implementation that gives access to the values associated with
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,4 +166,15 @@ static BytesRef extractKey(BytesRef keyedValue) {
}
return new BytesRef(keyedValue.bytes, keyedValue.offset, length);
}

/**
 * Extracts the value portion of a keyed flattened term of the form
 * "key SEPARATOR value". Mirrors {@code extractKey} above, which returns
 * the portion before the separator.
 * @param keyedValue a term possibly backed by a slice of a larger byte array
 * @return the bytes following the first separator byte
 */
static BytesRef extractValue(BytesRef keyedValue) {
    int length;
    for (length = 0; length < keyedValue.length; length++) {
        if (keyedValue.bytes[keyedValue.offset + length] == SEPARATOR_BYTE) {
            break;
        }
    }
    // The value starts just past the separator.
    int valueStart = keyedValue.offset + length + 1;
    // BUG FIX: the remaining length must be relative to this term's own
    // length (keyedValue.length - length - 1), not computed from the
    // absolute position valueStart - the original `keyedValue.length -
    // valueStart` is wrong whenever keyedValue.offset != 0, i.e. when the
    // BytesRef is a slice into a shared byte array (see the BytesRef
    // bytes/offset/length contract, and compare extractKey above).
    return new BytesRef(keyedValue.bytes, valueStart, keyedValue.length - length - 1);
}
}
Loading

0 comments on commit 73e0662

Please sign in to comment.