Skip to content

Commit

Permalink
Ignore fields with no content when querying wildcard fields (elastic#81985)
Browse files Browse the repository at this point in the history

The query_string, simple_query_string, combined_fields and multi_match
queries all allow you to query a large number of fields, based on wildcard field name
matches. By default, the wildcard match is *, meaning that these queries will try
to match against every single field in your index. This can cause problems if you
have a very large number of fields defined, and your elasticsearch instance has a
fairly low maximum query clause count.

In many cases, users may have many more fields defined in their mappings than are
actually populated in their index. For example, indexes using ECS mappings may
well only use a small subset of these mapped fields for their data. In these situations,
we can put a limit on the number of fields being searched by doing a quick check of
the Lucene index metadata to see if a mapped field actually has content in the index;
if it doesn't exist, we can trivially skip it.

This commit adds a check to QueryParserHelper.resolveMappingField() that strips
out fields with no content if the field name to resolve contains a wildcard. The check
is delegated down to MappedFieldType and by default returns `true`, but the standard
indexable field types (numeric, text, keyword, range, etc) will check their fieldnames
against the names in the underlying lucene FieldInfos and return `false` if they do not
appear there.
  • Loading branch information
romseygeek authored Jan 18, 2022
1 parent 7be74a8 commit d11973b
Show file tree
Hide file tree
Showing 31 changed files with 1,040 additions and 535 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.benchmark.search;

import org.apache.logging.log4j.util.Strings;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterModule;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.core.internal.io.IOUtils;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalyzerScope;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.mapper.IdFieldMapper;
import org.elasticsearch.index.mapper.MapperRegistry;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.ParsedDocument;
import org.elasticsearch.index.mapper.SourceToParse;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.index.search.QueryParserHelper;
import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.index.similarity.SimilarityService;
import org.elasticsearch.indices.IndicesModule;
import org.elasticsearch.indices.breaker.NoneCircuitBreakerService;
import org.elasticsearch.script.Script;
import org.elasticsearch.script.ScriptCompiler;
import org.elasticsearch.script.ScriptContext;
import org.elasticsearch.xcontent.NamedXContentRegistry;
import org.elasticsearch.xcontent.XContentType;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

/**
 * JMH benchmark for {@link QueryParserHelper#resolveMappingFields}, measuring how quickly a
 * wildcard field pattern ({@code "*"}) is expanded against a mapping in which only a subset
 * of the mapped fields actually contain indexed content. The index is built so that roughly
 * half of the mapped {@code long} fields are never populated, exercising the "skip fields
 * with no content" fast path.
 */
@Fork(1)
@Warmup(iterations = 5)
@Measurement(iterations = 5)
@State(Scope.Benchmark)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@BenchmarkMode(Mode.AverageTime)
public class QueryParserHelperBenchmark {

    private static final int NUMBER_OF_MAPPING_FIELDS = 1000;

    private Directory directory;
    private IndexReader indexReader;
    private MapperService mapperService;

    /**
     * Builds a mapping with {@code NUMBER_OF_MAPPING_FIELDS} long fields (field0..fieldN-1),
     * indexes 2000 documents that populate only a subset of them, and opens a reader over
     * the resulting in-memory index.
     */
    @Setup
    public void setup() throws IOException {
        // pre: set up MapperService and SearchExecutionContext
        List<String> fields = new ArrayList<>();
        for (int i = 0; i < NUMBER_OF_MAPPING_FIELDS; i++) {
            // NOTE(review): String.format without an explicit Locale uses the default
            // locale for %d; assumed harmless here since field indices are small ASCII
            // integers in practice — confirm if benchmarks run under exotic locales.
            fields.add(String.format("""
                "field%d":{"type":"long"}""", i));
        }
        String mappings = """
            {"_doc":{"properties":{""" + Strings.join(fields, ',') + "}}}";

        mapperService = createMapperService(mappings);
        IndexWriterConfig iwc = new IndexWriterConfig(IndexShard.buildIndexAnalyzer(mapperService));
        directory = new ByteBuffersDirectory();
        // try-with-resources: the writer must be closed even if document parsing or
        // addDocument throws, otherwise the directory lock and writer resources leak.
        try (IndexWriter iw = new IndexWriter(directory, iwc)) {
            for (int i = 0; i < 2000; i++) {
                ParsedDocument doc = mapperService.documentMapper().parse(buildDoc(i));
                iw.addDocument(doc.rootDoc());
                // periodic commits produce multiple segments, closer to a real index
                if (i % 100 == 0) {
                    iw.commit();
                }
            }
        }

        indexReader = DirectoryReader.open(directory);
    }

    /**
     * Builds the JSON source for one document. Even-numbered fields are never populated;
     * fields whose index is a multiple of 3 are additionally skipped for the first half of
     * the docIds, so the index ends up with mapped-but-empty fields for the benchmark to skip.
     */
    private SourceToParse buildDoc(int docId) {
        List<String> fields = new ArrayList<>();
        for (int i = 0; i < NUMBER_OF_MAPPING_FIELDS; i++) {
            if (i % 2 == 0) continue;
            if (i % 3 == 0 && (docId < (NUMBER_OF_MAPPING_FIELDS / 2))) continue;
            fields.add(String.format("""
                "field%d":1""", i));
        }
        String source = "{" + String.join(",", fields) + "}";
        return new SourceToParse("" + docId, new BytesArray(source), XContentType.JSON);
    }

    /** Releases the reader and directory opened in {@link #setup()}. */
    @TearDown
    public void tearDown() {
        IOUtils.closeWhileHandlingException(indexReader, directory);
    }

    /**
     * Measures wildcard field expansion. The assert (active when run with -ea) checks that
     * expansion returned something but filtered out at least some of the empty mapped fields.
     */
    @Benchmark
    public void expand() {
        Map<String, Float> fields = QueryParserHelper.resolveMappingFields(buildSearchExecutionContext(), Map.of("*", 1f));
        assert fields.size() > 0 && fields.size() < NUMBER_OF_MAPPING_FIELDS;
    }

    /**
     * Builds a minimal {@link SearchExecutionContext} over the benchmark index; most
     * collaborators that field expansion does not touch are passed as null.
     */
    protected SearchExecutionContext buildSearchExecutionContext() {
        final SimilarityService similarityService = new SimilarityService(mapperService.getIndexSettings(), null, Map.of());
        final long nowInMillis = 1;
        return new SearchExecutionContext(
            0,
            0,
            mapperService.getIndexSettings(),
            null,
            (ft, idxName, lookup) -> ft.fielddataBuilder(idxName, lookup)
                .build(new IndexFieldDataCache.None(), new NoneCircuitBreakerService()),
            mapperService,
            mapperService.mappingLookup(),
            similarityService,
            null,
            new NamedXContentRegistry(ClusterModule.getNamedXWriteables()),
            new NamedWriteableRegistry(ClusterModule.getNamedWriteables()),
            null,
            new IndexSearcher(indexReader),
            () -> nowInMillis,
            null,
            null,
            () -> true,
            null,
            Collections.emptyMap()
        );
    }

    /**
     * Creates a standalone {@link MapperService} (no node, no cluster) and merges the given
     * mappings into it. Script compilation is unsupported — the benchmark mappings use none.
     */
    protected final MapperService createMapperService(String mappings) {
        Settings settings = Settings.builder()
            .put("index.number_of_replicas", 0)
            .put("index.number_of_shards", 1)
            .put("index.version.created", Version.CURRENT)
            .build();
        IndexMetadata meta = IndexMetadata.builder("index").settings(settings).build();
        IndexSettings indexSettings = new IndexSettings(meta, settings);
        MapperRegistry mapperRegistry = new IndicesModule(Collections.emptyList()).getMapperRegistry();

        SimilarityService similarityService = new SimilarityService(indexSettings, null, Map.of());
        MapperService mapperService = new MapperService(
            indexSettings,
            new IndexAnalyzers(
                Map.of("default", new NamedAnalyzer("default", AnalyzerScope.INDEX, new StandardAnalyzer())),
                Map.of(),
                Map.of()
            ),
            new NamedXContentRegistry(ClusterModule.getNamedXWriteables()),
            similarityService,
            mapperRegistry,
            () -> { throw new UnsupportedOperationException(); },
            new IdFieldMapper(() -> true),
            new ScriptCompiler() {
                @Override
                public <T> T compile(Script script, ScriptContext<T> scriptContext) {
                    throw new UnsupportedOperationException();
                }
            }
        );

        try {
            mapperService.merge("_doc", new CompressedXContent(mappings), MapperService.MergeReason.MAPPING_UPDATE);
            return mapperService;
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,11 @@ public String typeName() {
return CONTENT_TYPE;
}

@Override
public boolean mayExistInIndex(SearchExecutionContext context) {
    // Ask the context whether this field name actually occurs in the index
    // (per the commit description, this checks the Lucene FieldInfos), so that
    // wildcard field expansion can skip mapped-but-empty fields.
    return context.fieldExistsInIndex(name());
}

@Override
public Query termQuery(Object value, SearchExecutionContext context) {
failIfNotIndexedNorDocValuesFallback(context);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,11 @@ boolean termLengthWithinBounds(int length) {
return length >= minChars - 1 && length <= maxChars;
}

@Override
public boolean mayExistInIndex(SearchExecutionContext context) {
    // Unconditionally false: this internal subfield type should never be picked
    // up by wildcard field expansion, regardless of index content.
    return false;
}

@Override
public Query prefixQuery(
String value,
Expand Down Expand Up @@ -569,6 +574,11 @@ void setPrefixFieldType(PrefixFieldType prefixFieldType) {
this.prefixFieldType = prefixFieldType;
}

@Override
public boolean mayExistInIndex(SearchExecutionContext context) {
    // Unconditionally false: this internal prefix subfield is an implementation
    // detail and must be excluded from wildcard field expansion.
    return false;
}

@Override
public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
// Because this internal field is modelled as a multi-field, SourceValueFetcher will look up its
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,14 @@

package org.elasticsearch.search.query;

import org.apache.lucene.search.IndexSearcher;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.query.Operator;
import org.elasticsearch.index.query.QueryStringQueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentType;
import org.junit.Before;

Expand All @@ -31,10 +27,8 @@

import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
import static org.elasticsearch.test.StreamsUtils.copyToStringFromClasspath;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures;
import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
Expand Down Expand Up @@ -234,90 +228,6 @@ public void testAllFieldsWithSpecifiedLeniency() throws IOException {
assertThat(e.getCause().getMessage(), containsString("unit [D] not supported for date math [-2D]"));
}

/**
 * Checks that wildcard field expansion respects the maximum clause count: expanding up to
 * the limit works (including when unmapped or unpopulated fields are filtered out), while
 * patterns matching more populated fields than the limit fail with a clear error.
 */
public void testLimitOnExpandedFields() throws Exception {

    final int maxClauseCount = randomIntBetween(50, 100);

    // Mapping with 2 * maxClauseCount text fields: field_A0..field_A(n-1), field_B0..field_B(n-1)
    XContentBuilder builder = jsonBuilder();
    builder.startObject();
    {
        builder.startObject("_doc");
        {
            builder.startObject("properties");
            {
                for (int i = 0; i < maxClauseCount; i++) {
                    builder.startObject("field_A" + i).field("type", "text").endObject();
                    builder.startObject("field_B" + i).field("type", "text").endObject();
                }
                builder.endObject();
            }
            builder.endObject();
        }
        builder.endObject();
    }

    // Raise the total-fields limit so the mapping above is accepted
    assertAcked(
        prepareCreate("testindex").setSettings(
            Settings.builder().put(MapperService.INDEX_MAPPING_TOTAL_FIELDS_LIMIT_SETTING.getKey(), maxClauseCount + 100)
        ).setMapping(builder)
    );

    // Only field_A0 is ever populated; every other mapped field stays empty
    client().prepareIndex("testindex").setId("1").setSource("field_A0", "foo bar baz").get();
    refresh();

    // setMaxClauseCount mutates JVM-global static state — must be restored in finally
    int originalMaxClauses = IndexSearcher.getMaxClauseCount();
    try {

        IndexSearcher.setMaxClauseCount(maxClauseCount);

        // single field shouldn't trigger the limit
        doAssertOneHitForQueryString("field_A0:foo");
        // expanding to the limit should work
        doAssertOneHitForQueryString("field_A\\*:foo");

        // adding a non-existing field on top shouldn't overshoot the limit
        doAssertOneHitForQueryString("field_A\\*:foo unmapped:something");

        // the following should exceed the limit
        doAssertLimitExceededException("foo", IndexSearcher.getMaxClauseCount() * 2, "*");
        doAssertLimitExceededException("*:foo", IndexSearcher.getMaxClauseCount() * 2, "*");
        doAssertLimitExceededException("field_\\*:foo", IndexSearcher.getMaxClauseCount() * 2, "field_*");

    } finally {
        IndexSearcher.setMaxClauseCount(originalMaxClauses);
    }
}

/**
 * Runs the given query string against the test index and asserts exactly one hit;
 * randomly also sets an explicit {@code "*"} default field to cover both expansion paths.
 */
private void doAssertOneHitForQueryString(String queryString) {
    QueryStringQueryBuilder queryBuilder = queryStringQuery(queryString);
    if (randomBoolean()) {
        queryBuilder.defaultField("*");
    }
    assertHitCount(client().prepareSearch("testindex").setQuery(queryBuilder).get(), 1);
}

/**
 * Runs the given query string and asserts it fails with an IllegalArgumentException whose
 * message reports the offending field pattern, the clause limit, and the actual match count.
 */
private void doAssertLimitExceededException(String queryString, int exceedingFieldCount, String inputFieldPattern) {
    Exception thrown = expectThrows(Exception.class, () -> {
        QueryStringQueryBuilder queryBuilder = queryStringQuery(queryString);
        if (randomBoolean()) {
            queryBuilder.defaultField("*");
        }
        client().prepareSearch("testindex").setQuery(queryBuilder).get();
    });
    String expectedFragment = "field expansion for ["
        + inputFieldPattern
        + "] matches too many fields, limit: "
        + IndexSearcher.getMaxClauseCount()
        + ", got: "
        + exceedingFieldCount;
    assertThat(ExceptionsHelper.unwrap(thrown, IllegalArgumentException.class).getMessage(), containsString(expectedFragment));
}

public void testFieldAlias() throws Exception {
List<IndexRequestBuilder> indexRequests = new ArrayList<>();
indexRequests.add(client().prepareIndex("test").setId("1").setSource("f3", "text", "f2", "one"));
Expand Down
Loading

0 comments on commit d11973b

Please sign in to comment.