From 4ffdad36d422258f0042f08cab88cb2c4c922738 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 25 Feb 2021 14:15:57 -0500 Subject: [PATCH] Speed up terms agg when alone (#69377) This speeds up the `terms` agg in a very specific case: 1. It has no child aggregations 2. It has no parent aggregations 3. There are no deleted documents 4. You are not using document level security 5. There is no top level query 6. The field has global ordinals 7. There are fewer than one thousand distinct terms That is a lot of restrictions! But the speed-up is pretty substantial because in those cases we can serve the entire aggregation using metadata that Lucene precomputes while it builds the index. In a real Rally track we get a 92% speed improvement, but the index isn't *that* big: ``` | 90th percentile service time | keyword-terms-low-cardinality | 446.031 | 36.7677 | -409.263 | ms | ``` In a Rally track with a larger index I ran some tests by hand and the aggregation went from 2200ms to 8ms. Even though there are 7 restrictions on this, I expect it to come into play often enough to matter. Restriction 6 just means you are aggregating on a `keyword` field. Or an `ip`. And it's fairly common for `keyword`s to have fewer than a thousand distinct values. Certainly not everywhere, but in some places. I expect "cold tier" indices are very likely not to have deleted documents at all. And the optimization works segment by segment, so it'll save some time on each segment without deleted documents, and more time if the entire index doesn't have any. The optimization builds on #68871, which translates `terms` aggregations against low cardinality fields with global ordinals into a `filters` aggregation. This change teaches the `filters` aggregation to recognize when it can get its results from the index metadata. More precisely, it creates the infrastructure to make that fairly simple and applies it to the queries generated by the `terms` aggregation.
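To make the "serve it from metadata" trick concrete, here's a minimal sketch of the idea in plain Lucene, outside the aggregation framework. This is not the patch's code — the real logic lives in `QueryToFilterAdapter` below and `countTerm` is an invented name — and the real adapter also refuses the shortcut when the `_doc_count` field is in use:

```java
import java.io.IOException;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;

// Sketch only: counting the docs that match a single term straight from
// the postings metadata Lucene wrote while building the segment.
class TermCountSketch {
    static long countTerm(LeafReaderContext ctx, Term term) throws IOException {
        if (ctx.reader().getLiveDocs() == null) {
            // No deleted docs and no document level security in this leaf,
            // so the term's document frequency is exactly the number of
            // matching documents. No query execution needed.
            return ctx.reader().docFreq(term);
        }
        // Otherwise we'd have to run the query and count hits the slow way.
        throw new UnsupportedOperationException("slow path elided from this sketch");
    }
}
```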
--- .../370_doc_count_field.yml | 2 +- .../aggregation/AggregationProfilerIT.java | 10 +- .../bucket/filter/FiltersAggregator.java | 297 ++++++-------- .../filter/FiltersAggregatorFactory.java | 16 +- .../bucket/filter/QueryToFilterAdapter.java | 369 ++++++++++++++++++ .../bucket/range/RangeAggregator.java | 10 +- .../StringTermsAggregatorFromFilters.java | 11 +- .../bucket/filter/FiltersAggregatorTests.java | 245 +++++++++++- .../internal/ContextIndexSearcherTests.java | 4 +- .../aggregations/AggregatorTestCase.java | 1 + .../test/security/authz/21_search_doc.yml | 17 + 11 files changed, 759 insertions(+), 223 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/QueryToFilterAdapter.java diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/370_doc_count_field.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/370_doc_count_field.yml index e42d88580bf1e..0a7281a24f4ea 100644 --- a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/370_doc_count_field.yml +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/370_doc_count_field.yml @@ -178,4 +178,4 @@ setup: - match: { aggregations.f.buckets.foo.doc_count: 8 } - match: { aggregations.f.buckets.xyz.doc_count: 5 } - match: { profile.shards.0.aggregations.0.type: FiltersAggregator.FilterByFilter } - - gte: { profile.shards.0.aggregations.0.debug.segments_with_doc_count: 1 } + - gte: { profile.shards.0.aggregations.0.debug.segments_with_doc_count_field: 1 } diff --git a/server/src/internalClusterTest/java/org/elasticsearch/search/profile/aggregation/AggregationProfilerIT.java b/server/src/internalClusterTest/java/org/elasticsearch/search/profile/aggregation/AggregationProfilerIT.java index 5cbf4a79360cd..423fad9c15484 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/search/profile/aggregation/AggregationProfilerIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/search/profile/aggregation/AggregationProfilerIT.java @@ -41,6 +41,8 @@ import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.greaterThanOrEqualTo; import static org.hamcrest.Matchers.hasEntry; +import static org.hamcrest.Matchers.hasKey; +import static org.hamcrest.Matchers.hasSize; import static org.hamcrest.Matchers.notNullValue; @ESIntegTestCase.SuiteScopeTestCase @@ -633,10 +635,16 @@ public void testFilterByFilter() throws InterruptedException, IOException { assertThat(delegate.get("delegate"), equalTo("FiltersAggregator.FilterByFilter")); Map delegateDebug = (Map) delegate.get("delegate_debug"); assertThat(delegateDebug, hasEntry("segments_with_deleted_docs", 0)); - assertThat(delegateDebug, hasEntry("segments_with_doc_count", 0)); + assertThat(delegateDebug, hasEntry("segments_with_doc_count_field", 0)); assertThat(delegateDebug, hasEntry("max_cost", (long) RangeAggregator.DOCS_PER_RANGE_TO_USE_FILTERS * 2)); assertThat(delegateDebug, hasEntry("estimated_cost", (long) RangeAggregator.DOCS_PER_RANGE_TO_USE_FILTERS * 2)); assertThat((long) delegateDebug.get("estimate_cost_time"), greaterThanOrEqualTo(0L)); // ~1,276,734 nanos is normal + List filtersDebug = (List) delegateDebug.get("filters"); + assertThat(filtersDebug, hasSize(1)); + Map queryDebug = (Map) filtersDebug.get(0); + assertThat(queryDebug, hasKey("scorers_prepared_while_estimating_cost")); + assertThat((int) queryDebug.get("scorers_prepared_while_estimating_cost"), greaterThan(0)); + 
assertThat(queryDebug, hasEntry("query", "ConstantScore(DocValuesFieldExistsQuery [field=date])")); } } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/FiltersAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/FiltersAggregator.java index 11924d4713050..a163ec0b37f87 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/FiltersAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/FiltersAggregator.java @@ -9,25 +9,15 @@ package org.elasticsearch.search.aggregations.bucket.filter; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.BulkScorer; import org.apache.lucene.search.CollectionTerminatedException; -import org.apache.lucene.search.IndexOrDocValuesQuery; -import org.apache.lucene.search.IndexSortSortedNumericDocValuesRangeQuery; import org.apache.lucene.search.LeafCollector; -import org.apache.lucene.search.MatchAllDocsQuery; -import org.apache.lucene.search.PointRangeQuery; -import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorable; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.Weight; import org.apache.lucene.util.Bits; +import org.elasticsearch.common.CheckedSupplier; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; -import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.xcontent.ToXContentFragment; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.index.query.QueryBuilder; @@ -44,10 +34,13 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.function.BiConsumer; +import java.util.function.IntPredicate; +import java.util.function.LongPredicate; /** * Aggregator for {@code filters}. 
There are two known subclasses, @@ -134,8 +127,7 @@ public boolean equals(Object obj) { public static FiltersAggregator build( String name, AggregatorFactories factories, - String[] keys, - Query[] filters, + List> filters, boolean keyed, String otherBucketKey, AggregationContext context, @@ -144,12 +136,11 @@ public static FiltersAggregator build( Map metadata ) throws IOException { if (canUseFilterByFilter(parent, factories, otherBucketKey)) { - return buildFilterByFilter(name, factories, keys, filters, keyed, otherBucketKey, context, parent, cardinality, metadata); + return buildFilterByFilter(name, factories, filters, keyed, otherBucketKey, context, parent, cardinality, metadata); } return new FiltersAggregator.Compatible( name, factories, - keys, filters, keyed, otherBucketKey, @@ -178,8 +169,7 @@ public static boolean canUseFilterByFilter(Aggregator parent, AggregatorFactorie public static FilterByFilter buildFilterByFilter( String name, AggregatorFactories factories, - String[] keys, - Query[] filters, + List> filters, boolean keyed, String otherBucketKey, AggregationContext context, @@ -190,10 +180,13 @@ public static FilterByFilter buildFilterByFilter( if (false == canUseFilterByFilter(parent, factories, otherBucketKey)) { throw new IllegalStateException("Can't execute filter-by-filter"); } + List> filtersWithTopLevel = new ArrayList<>(filters.size()); + for (QueryToFilterAdapter f : filters) { + filtersWithTopLevel.add(f.union(context.query())); + } return new FiltersAggregator.FilterByFilter( name, - keys, - filters, + filtersWithTopLevel, keyed, context, parent, @@ -202,25 +195,29 @@ public static FilterByFilter buildFilterByFilter( ); } - private final String[] keys; + private final List> filters; private final boolean keyed; protected final String otherBucketKey; - private FiltersAggregator(String name, AggregatorFactories factories, String[] keys, boolean keyed, + private FiltersAggregator(String name, AggregatorFactories factories, List> filters, boolean keyed, String otherBucketKey, AggregationContext context, Aggregator parent, CardinalityUpperBound cardinality, Map metadata) throws IOException { - super(name, factories, context, parent, cardinality.multiply(keys.length + (otherBucketKey == null ? 0 : 1)), metadata); + super(name, factories, context, parent, cardinality.multiply(filters.size() + (otherBucketKey == null ? 0 : 1)), metadata); + this.filters = List.copyOf(filters); this.keyed = keyed; - this.keys = keys; this.otherBucketKey = otherBucketKey; } + List> filters() { + return filters; + } + @Override public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException { - return buildAggregationsForFixedBucketCount(owningBucketOrds, keys.length + (otherBucketKey == null ? 0 : 1), + return buildAggregationsForFixedBucketCount(owningBucketOrds, filters.size() + (otherBucketKey == null ? 
0 : 1), (offsetInOwningOrd, docCount, subAggregationResults) -> { - if (offsetInOwningOrd < keys.length) { - return new InternalFilters.InternalBucket(keys[offsetInOwningOrd], docCount, + if (offsetInOwningOrd < filters.size()) { + return new InternalFilters.InternalBucket(filters.get(offsetInOwningOrd).key().toString(), docCount, subAggregationResults, keyed); } return new InternalFilters.InternalBucket(otherBucketKey, docCount, subAggregationResults, keyed); @@ -230,9 +227,9 @@ public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws I @Override public InternalAggregation buildEmptyAggregation() { InternalAggregations subAggs = buildEmptySubAggregations(); - List buckets = new ArrayList<>(keys.length); - for (int i = 0; i < keys.length; i++) { - InternalFilters.InternalBucket bucket = new InternalFilters.InternalBucket(keys[i], 0, subAggs, keyed); + List buckets = new ArrayList<>(filters.size() + (otherBucketKey == null ? 0 : 1)); + for (QueryToFilterAdapter filter : filters) { + InternalFilters.InternalBucket bucket = new InternalFilters.InternalBucket(filter.key().toString(), 0, subAggs, keyed); buckets.add(bucket); } @@ -244,6 +241,18 @@ public InternalAggregation buildEmptyAggregation() { return new InternalFilters(name, buckets, keyed, metadata()); } + @Override + public void collectDebugInfo(BiConsumer add) { + super.collectDebugInfo(add); + List> filtersDebug = new ArrayList<>(filters.size()); + for (QueryToFilterAdapter filter : filters) { + Map debug = new HashMap<>(); + filter.collectDebugInfo(debug::put); + filtersDebug.add(debug); + } + add.accept("filters", filtersDebug); + } + /** * Collects results by running each filter against the searcher and doesn't * build any {@link LeafBucketCollector}s which is generally faster than * {@link Compatible} but doesn't support a parent aggregator * or any child aggregators. */ public static class FilterByFilter extends FiltersAggregator { - private final Query[] filters; private final boolean profiling; private long estimatedCost = -1; /** @@ -260,31 +268,23 @@ public static class FilterByFilter extends FiltersAggregator { */ private long maxCost = -1; private long estimateCostTime; - private Weight[] weights; - /** - * If {@link #estimateCost} was called then this'll contain a - * scorer per leaf per filter. If it wasn't then this'll be {@code null}. - */ - private BulkScorer[][] scorers; private int segmentsWithDeletedDocs; /** * Count of segments whose documents have to consult the {@code doc_count} * field. */ - private int segmentsWithDocCount; + private int segmentsWithDocCountField; private FilterByFilter( String name, - String[] keys, - Query[] filters, + List> filters, boolean keyed, AggregationContext context, Aggregator parent, CardinalityUpperBound cardinality, Map metadata ) throws IOException { - super(name, AggregatorFactories.EMPTY, keys, keyed, null, context, parent, cardinality, metadata); - this.filters = filters; + super(name, AggregatorFactories.EMPTY, filters, keyed, null, context, parent, cardinality, metadata); this.profiling = context.profiling(); } @@ -292,50 +292,47 @@ private FilterByFilter( * Estimate the number of documents that this aggregation must visit. We'll * stop counting once we've passed {@code maxEstimatedCost} if we aren't profiling.
*/ + @SuppressWarnings("resource") // We're not in charge of anything Closeable public long estimateCost(long maxCost) throws IOException { this.maxCost = maxCost; if (estimatedCost != -1) { return estimatedCost; } - long limit = profiling ? Long.MAX_VALUE : maxCost; long start = profiling ? System.nanoTime() : 0; estimatedCost = 0; - weights = buildWeights(topLevelQuery(), filters); - List leaves = searcher().getIndexReader().leaves(); - /* - * Its important that we save a copy of the BulkScorer because for - * queries like PointInRangeQuery building the scorer can be a big - * chunk of the run time. - */ - scorers = new BulkScorer[leaves.size()][]; - for (LeafReaderContext ctx : leaves) { - scorers[ctx.ord] = new BulkScorer[filters.length]; - for (int f = 0; f < filters.length; f++) { - scorers[ctx.ord][f] = weights[f].bulkScorer(ctx); - if (scorers[ctx.ord][f] == null) { - // Doesn't find anything in this leaf - continue; + for (LeafReaderContext ctx : searcher().getIndexReader().leaves()) { + CheckedSupplier canUseMetadata = canUseMetadata(ctx); + for (QueryToFilterAdapter filter : filters()) { + estimatedCost += filter.estimateCountCost(ctx, canUseMetadata); + if (estimatedCost < 0) { + // We've overflowed so we cap out and stop counting. + estimatedCost = Long.MAX_VALUE; + if (profiling && estimateCostTime == 0) { + estimateCostTime = System.nanoTime() - start; + } + return estimatedCost; } - if (estimatedCost >= 0 && estimatedCost <= limit) { - // If we've overflowed or are past the limit skip the cost - estimatedCost += scorers[ctx.ord][f].cost(); + if (estimatedCost > maxCost) { + if (profiling) { + /* + * If we're profiling we stop the timer the first + * time we pass the limit but we keep counting so + * we get an accurate estimate. + */ + if (estimateCostTime == 0) { + estimateCostTime = System.nanoTime() - start; + } + } else { + // We're past the limit and not profiling. No use counting further. + return estimatedCost; + } } } } - if (profiling) { + if (profiling && estimateCostTime == 0) { estimateCostTime = System.nanoTime() - start; } - // If we've overflowed use Long.MAX_VALUE - return estimatedCost < 0 ? Long.MAX_VALUE : estimatedCost; - } - - /** - * Are the scorers cached? - *
<p>
- * Package private for testing. - */ - boolean scorersCached() { - return scorers != null; + return estimatedCost; } /** @@ -347,29 +344,13 @@ boolean scorersCached() { */ @Override protected LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException { - if (weights == null) { - weights = buildWeights(topLevelQuery(), filters); - } Bits live = ctx.reader().getLiveDocs(); Counter counter = new Counter(docCountProvider); if (false == docCountProvider.alwaysOne()) { - segmentsWithDocCount++; + segmentsWithDocCountField++; } - for (int filterOrd = 0; filterOrd < filters.length; filterOrd++) { - BulkScorer scorer; - if (scorers == null) { - // No cached scorers - scorer = weights[filterOrd].bulkScorer(ctx); - } else { - // Scorers cached when calling estimateCost - scorer = scorers[ctx.ord][filterOrd]; - } - if (scorer == null) { - // the filter doesn't match any docs - continue; - } - scorer.score(counter, live); - incrementBucketDocCount(filterOrd, counter.readAndReset(ctx)); + for (int filterOrd = 0; filterOrd < filters().size(); filterOrd++) { + incrementBucketDocCount(filterOrd, filters().get(filterOrd).count(ctx, counter, live)); } // Throwing this exception is how we communicate to the collection mechanism that we don't need the segment. throw new CollectionTerminatedException(); @@ -379,7 +360,7 @@ protected LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucket public void collectDebugInfo(BiConsumer add) { super.collectDebugInfo(add); add.accept("segments_with_deleted_docs", segmentsWithDeletedDocs); - add.accept("segments_with_doc_count", segmentsWithDocCount); + add.accept("segments_with_doc_count_field", segmentsWithDocCountField); if (estimatedCost != -1) { // -1 means we didn't estimate it. add.accept("estimated_cost", estimatedCost); @@ -388,52 +369,42 @@ public void collectDebugInfo(BiConsumer add) { } } - /** - * Counts collected documents, delegating to {@link DocCountProvider} for - * how many documents each search hit is "worth". - */ - private static class Counter implements LeafCollector { - private final DocCountProvider docCount; - private long count; - - Counter(DocCountProvider docCount) { - this.docCount = docCount; - } - - public long readAndReset(LeafReaderContext ctx) throws IOException { - long result = count; - count = 0; - docCount.setLeafReaderContext(ctx); - return result; - } + CheckedSupplier canUseMetadata(LeafReaderContext ctx) { + return new CheckedSupplier() { + Boolean canUse; - @Override - public void collect(int doc) throws IOException { - count += docCount.getDocCount(doc); - } + @Override + public Boolean get() throws IOException { + if (canUse == null) { + canUse = canUse(); + } + return canUse; + } - @Override - public void setScorer(Scorable scorer) throws IOException {} + private boolean canUse() throws IOException { + if (ctx.reader().getLiveDocs() != null) { + return false; + } + docCountProvider.setLeafReaderContext(ctx); + return docCountProvider.alwaysOne(); + } + }; } } /** - * Collects results by building a {@link Bits} per filter and testing if + * Collects results by building a {@link IntPredicate} per filter and testing if * each doc sent to its {@link LeafBucketCollector} is in each filter * which is generally slower than {@link FilterByFilter} but is compatible * with parent and child aggregations.
*/ private static class Compatible extends FiltersAggregator { - private final Query[] filters; - private Weight[] filterWeights; - private final int totalNumKeys; Compatible( String name, AggregatorFactories factories, - String[] keys, - Query[] filters, + List> filters, boolean keyed, String otherBucketKey, AggregationContext context, @@ -441,36 +412,32 @@ private static class Compatible extends FiltersAggregator { CardinalityUpperBound cardinality, Map metadata ) throws IOException { - super(name, factories, keys, keyed, otherBucketKey, context, parent, cardinality, metadata); - this.filters = filters; + super(name, factories, filters, keyed, otherBucketKey, context, parent, cardinality, metadata); if (otherBucketKey == null) { - this.totalNumKeys = keys.length; + this.totalNumKeys = filters.size(); } else { - this.totalNumKeys = keys.length + 1; + this.totalNumKeys = filters.size() + 1; } } @Override protected LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException { - if (filterWeights == null) { - filterWeights = buildWeights(new MatchAllDocsQuery(), filters); - } - final Bits[] bits = new Bits[filters.length]; - for (int i = 0; i < filters.length; ++i) { - bits[i] = Lucene.asSequentialAccessBits(ctx.reader().maxDoc(), filterWeights[i].scorerSupplier(ctx)); + IntPredicate[] docFilters = new IntPredicate[filters().size()]; + for (int filterOrd = 0; filterOrd < filters().size(); filterOrd++) { + docFilters[filterOrd] = filters().get(filterOrd).matchingDocIds(ctx); } return new LeafBucketCollectorBase(sub, null) { @Override public void collect(int doc, long bucket) throws IOException { boolean matched = false; - for (int i = 0; i < bits.length; i++) { - if (bits[i].get(doc)) { + for (int i = 0; i < docFilters.length; i++) { + if (docFilters[i].test(doc)) { collectBucket(sub, doc, bucketOrd(bucket, i)); matched = true; } } if (otherBucketKey != null && false == matched) { - collectBucket(sub, doc, bucketOrd(bucket, bits.length)); + collectBucket(sub, doc, bucketOrd(bucket, docFilters.length)); } } }; @@ -481,51 +448,31 @@ final long bucketOrd(long owningBucketOrdinal, int filterOrd) { } } - protected Weight[] buildWeights(Query topLevelQuery, Query filters[]) throws IOException{ - Weight[] weights = new Weight[filters.length]; - for (int i = 0; i < filters.length; ++i) { - Query filter = filterMatchingBoth(topLevelQuery, filters[i]); - weights[i] = searcher().createWeight(searcher().rewrite(filter), ScoreMode.COMPLETE_NO_SCORES, 1); - } - return weights; - } - /** - * Make a filter that matches both queries, merging the - * {@link PointRangeQuery}s together if possible. The "merging together" - * part is provides a fairly substantial speed boost then executing a - * top level query on a date and a filter on a date. This kind of thing - * is very common when visualizing logs and metrics. + * Counts collected documents, delegating to {@link DocCountProvider} for + * how many documents each search hit is "worth". */ - static Query filterMatchingBoth(Query lhs, Query rhs) { - if (lhs instanceof MatchAllDocsQuery) { - return rhs; - } - if (rhs instanceof MatchAllDocsQuery) { - return lhs; - } - Query unwrappedLhs = unwrap(lhs); - Query unwrappedRhs = unwrap(rhs); - if (unwrappedLhs instanceof PointRangeQuery && unwrappedRhs instanceof PointRangeQuery) { - Query merged = MergedPointRangeQuery.merge((PointRangeQuery) unwrappedLhs, (PointRangeQuery) unwrappedRhs); - if (merged != null) { - // Should we rewrap here? 
- return merged; - } + static class Counter implements LeafCollector { + final DocCountProvider docCount; + private long count; + + Counter(DocCountProvider docCount) { + this.docCount = docCount; } - BooleanQuery.Builder builder = new BooleanQuery.Builder(); - builder.add(lhs, BooleanClause.Occur.MUST); - builder.add(rhs, BooleanClause.Occur.MUST); - return builder.build(); - } - private static Query unwrap(Query query) { - if (query instanceof IndexSortSortedNumericDocValuesRangeQuery) { - query = ((IndexSortSortedNumericDocValuesRangeQuery) query).getFallbackQuery(); + public long readAndReset(LeafReaderContext ctx) throws IOException { + long result = count; + count = 0; + docCount.setLeafReaderContext(ctx); + return result; } - if (query instanceof IndexOrDocValuesQuery) { - query = ((IndexOrDocValuesQuery) query).getIndexQuery(); + + @Override + public void collect(int doc) throws IOException { + count += docCount.getDocCount(doc); } - return query; + + @Override + public void setScorer(Scorable scorer) throws IOException {} } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/FiltersAggregatorFactory.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/FiltersAggregatorFactory.java index baf9540cde9c9..b0b2aa166d1df 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/FiltersAggregatorFactory.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/FiltersAggregatorFactory.java @@ -8,7 +8,6 @@ package org.elasticsearch.search.aggregations.bucket.filter; -import org.apache.lucene.search.Query; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.AggregatorFactories; import org.elasticsearch.search.aggregations.AggregatorFactory; @@ -17,13 +16,13 @@ import org.elasticsearch.search.aggregations.support.AggregationContext; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import java.util.Map; public class FiltersAggregatorFactory extends AggregatorFactory { - private final String[] keys; - private final Query[] filters; + private final List> filters; private final boolean keyed; private final boolean otherBucket; private final String otherBucketKey; @@ -35,12 +34,9 @@ public FiltersAggregatorFactory(String name, List filters, boolean this.keyed = keyed; this.otherBucket = otherBucket; this.otherBucketKey = otherBucketKey; - keys = new String[filters.size()]; - this.filters = new Query[filters.size()]; - for (int i = 0; i < filters.size(); ++i) { - KeyedFilter keyedFilter = filters.get(i); - this.keys[i] = keyedFilter.key(); - this.filters[i] = context.buildQuery(keyedFilter.filter()); + this.filters = new ArrayList<>(filters.size()); + for (KeyedFilter f : filters) { + this.filters.add(QueryToFilterAdapter.build(context.searcher(), f.key(), context.buildQuery(f.filter()))); } } @@ -48,7 +44,7 @@ public FiltersAggregatorFactory(String name, List filters, boolean public Aggregator createInternal(Aggregator parent, CardinalityUpperBound cardinality, Map metadata) throws IOException { - return FiltersAggregator.build(name, factories, keys, filters, keyed, + return FiltersAggregator.build(name, factories, filters, keyed, otherBucket ? 
otherBucketKey : null, context, parent, cardinality, metadata); } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/QueryToFilterAdapter.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/QueryToFilterAdapter.java new file mode 100644 index 0000000000000..f930124469bcd --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/QueryToFilterAdapter.java @@ -0,0 +1,369 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.search.aggregations.bucket.filter; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BulkScorer; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.IndexOrDocValuesQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.IndexSortSortedNumericDocValuesRangeQuery; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MatchNoDocsQuery; +import org.apache.lucene.search.PointRangeQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.Weight; +import org.apache.lucene.util.Bits; +import org.elasticsearch.common.CheckedSupplier; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.aggregations.Aggregator; + +import java.io.IOException; +import java.util.function.BiConsumer; +import java.util.function.IntPredicate; + +/** + * Adapts a Lucene {@link Query} to the behaviors used by the + * {@link FiltersAggregator}. In general we try to delegate to {@linkplain Query} + * when we don't have a special optimization. + */ +public class QueryToFilterAdapter { + /** + * Build a filter for the query against the provided searcher. + *
<p>
+ * Note: This method rewrites the query against the {@link IndexSearcher}. + */ + public static QueryToFilterAdapter build(IndexSearcher searcher, String key, Query query) throws IOException { + query = searcher.rewrite(query); + if (query instanceof TermQuery) { + return new TermQueryToFilterAdapter(searcher, key, (TermQuery) query); + } + if (query instanceof MatchAllDocsQuery) { + return new MatchAllQueryToFilterAdapter(searcher, key, (MatchAllDocsQuery) query); + } + if (query instanceof MatchNoDocsQuery) { + return new MatchNoneQueryToFilterAdapter(searcher, key, (MatchNoDocsQuery) query); + } + return new QueryToFilterAdapter<>(searcher, key, query); + } + + private final IndexSearcher searcher; + private final String key; + private final Q query; + /** + * The weight for the query or {@code null} if we haven't built it. Use + * {@link #weight()} to build it when needed. + */ + private Weight weight; + /** + * Scorer for each segment or {@code null} if we haven't built the scorer. + * Use {@link #bulkScorer(LeafReaderContext, Runnable)} to build the scorer + * when needed. + */ + private BulkScorer[] bulkScorers; + /** + * The number of scorers we prepared just to estimate the cost of counting + * documents. For some queries preparing the scorers is very slow so it's + * nice to know how many we built. Exposed by profiling. + */ + private int scorersPreparedWhileEstimatingCost; + + private QueryToFilterAdapter(IndexSearcher searcher, String key, Q query) { + this.searcher = searcher; + this.key = key; + this.query = query; + } + + /** + * The query we're adapting. + *
<p>
+ * Subclasses should use this to fetch the query when making query + * specific optimizations. + */ + Q query() { + return query; + } + + /** + * Key for this filter. + */ + public final String key() { + return key; + } + + /** + * Searcher that this filter is targeting. + */ + protected final IndexSearcher searcher() { + return searcher; + } + + /** + * Would using index metadata like {@link IndexReader#docFreq} + * or {@link IndexReader#maxDoc} to count the number of matching documents + * produce the same answer as collecting the results with a sequence like + * {@code searcher.collect(counter); return counter.readAndReset();}? + */ + protected final boolean countCanUseMetadata(FiltersAggregator.Counter counter, Bits live) { + if (live != null) { + /* + * We can only use metadata if all of the documents in the reader + * are visible. This is done by returning a null `live` bits. The + * name `live` is traditional because most of the time a non-null + * `live` bits means that there are deleted documents. But `live` + * might also be non-null if document level security is enabled. + */ + return false; + } + /* + * We can only use metadata if we're not using the special docCount + * field. Otherwise we wouldn't know how many documents each lucene + * document represents. + */ + return counter.docCount.alwaysOne(); + } + + /** + * Make a filter that matches this filter and the provided query. + *
<p>
+ * Note: This method rewrites the query against the {@link IndexSearcher}. + */ + QueryToFilterAdapter union(Query extraQuery) throws IOException { + /* + * It'd be *wonderful* if Lucene could do fancy optimizations + * when merging queries but it doesn't at the moment. Admittedly, + * we have a much more limited problem. We don't care about score + * here at all. We know which queries it's worth spending time to + * optimize because we know which aggs rewrite into this one. + */ + extraQuery = searcher().rewrite(extraQuery); + if (extraQuery instanceof MatchAllDocsQuery) { + return this; + } + Query unwrappedQuery = unwrap(query); + Query unwrappedExtraQuery = unwrap(extraQuery); + if (unwrappedQuery instanceof PointRangeQuery && unwrappedExtraQuery instanceof PointRangeQuery) { + Query merged = MergedPointRangeQuery.merge((PointRangeQuery) unwrappedQuery, (PointRangeQuery) unwrappedExtraQuery); + if (merged != null) { + // Should we rewrap here? + return new QueryToFilterAdapter<>(searcher(), key(), merged); + } + } + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + builder.add(query, BooleanClause.Occur.MUST); + builder.add(extraQuery, BooleanClause.Occur.MUST); + return new QueryToFilterAdapter<>(searcher(), key(), builder.build()); + } + + private static Query unwrap(Query query) { + while (true) { + if (query instanceof ConstantScoreQuery) { + query = ((ConstantScoreQuery) query).getQuery(); + continue; + } + if (query instanceof IndexSortSortedNumericDocValuesRangeQuery) { + query = ((IndexSortSortedNumericDocValuesRangeQuery) query).getFallbackQuery(); + continue; + } + if (query instanceof IndexOrDocValuesQuery) { + query = ((IndexOrDocValuesQuery) query).getIndexQuery(); + continue; + } + return query; + } + } + + /** + * Build a predicate that the "compatible" implementation of the + * {@link FiltersAggregator} will use to figure out if the filter matches. + *
<p>
+ * Consumers of this method will always call it with non-negative, + * increasing {@code int}s. A sequence like {@code 0, 1, 7, 8, 10} is fine. + * It won't call with {@code 0, 1, 0} or {@code -1, 0, 1}. + */ + @SuppressWarnings("resource") // Closing the reader is someone else's problem + IntPredicate matchingDocIds(LeafReaderContext ctx) throws IOException { + return Lucene.asSequentialAccessBits(ctx.reader().maxDoc(), weight().scorerSupplier(ctx))::get; + } + + /** + * Count the number of documents that match this filter in a leaf. + */ + long count(LeafReaderContext ctx, FiltersAggregator.Counter counter, Bits live) throws IOException { + BulkScorer scorer = bulkScorer(ctx, () -> {}); + if (scorer == null) { + // No hits in this segment. + return 0; + } + scorer.score(counter, live); + return counter.readAndReset(ctx); + } + + /** + * Estimate the cost of calling {@code #count} in a leaf. + */ + long estimateCountCost(LeafReaderContext ctx, CheckedSupplier canUseMetadata) throws IOException { + BulkScorer scorer = bulkScorer(ctx, () -> scorersPreparedWhileEstimatingCost++); + if (scorer == null) { + // There aren't any matches for this filter in this leaf + return 0; + } + return scorer.cost(); // TODO in another PR (please) change this to ScorerSupplier.cost + } + + /** + * Collect profiling information for this filter. Rhymes with + * {@link Aggregator#collectDebugInfo(BiConsumer)}. + *
<p>
+ * Well-behaved implementations will always call the superclass + * implementation just in case it has something interesting. They will + * also only add objects which can be serialized with + * {@link StreamOutput#writeGenericValue(Object)} and + * {@link XContentBuilder#value(Object)}. And they'll have an integration + * test. + */ + void collectDebugInfo(BiConsumer add) { + add.accept("query", query.toString()); + add.accept("scorers_prepared_while_estimating_cost", scorersPreparedWhileEstimatingCost); + } + + private BulkScorer bulkScorer(LeafReaderContext ctx, Runnable onPrepare) throws IOException { + if (bulkScorers == null) { + bulkScorers = new BulkScorer[searcher().getIndexReader().leaves().size()]; + } + if (bulkScorers[ctx.ord] == null) { + onPrepare.run(); + return bulkScorers[ctx.ord] = weight().bulkScorer(ctx); + } + return bulkScorers[ctx.ord]; + } + + private Weight weight() throws IOException { + if (weight == null) { + weight = searcher().createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 1.0f); + } + return weight; + } + + /** + * Special case when the filter can't match anything. + */ + private static class MatchNoneQueryToFilterAdapter extends QueryToFilterAdapter { + private MatchNoneQueryToFilterAdapter(IndexSearcher searcher, String key, MatchNoDocsQuery query) { + super(searcher, key, query); + } + + @Override + QueryToFilterAdapter union(Query extraQuery) throws IOException { + return this; + } + + @Override + IntPredicate matchingDocIds(LeafReaderContext ctx) throws IOException { + return l -> false; + } + + @Override + long count(LeafReaderContext ctx, FiltersAggregator.Counter counter, Bits live) throws IOException { + return 0; + } + + @Override + long estimateCountCost(LeafReaderContext ctx, CheckedSupplier canUseMetadata) throws IOException { + return 0; + } + + @Override + void collectDebugInfo(BiConsumer add) { + super.collectDebugInfo(add); + add.accept("specialized_for", "match_none"); + } + } + + /** + * Filter that matches every document. + */ + private static class MatchAllQueryToFilterAdapter extends QueryToFilterAdapter { + private int resultsFromMetadata; + + private MatchAllQueryToFilterAdapter(IndexSearcher searcher, String key, MatchAllDocsQuery query) { + super(searcher, key, query); + } + + @Override + QueryToFilterAdapter union(Query extraQuery) throws IOException { + return QueryToFilterAdapter.build(searcher(), key(), extraQuery); + } + + @Override + IntPredicate matchingDocIds(LeafReaderContext ctx) throws IOException { + return l -> true; + } + + @Override + long count(LeafReaderContext ctx, FiltersAggregator.Counter counter, Bits live) throws IOException { + if (countCanUseMetadata(counter, live)) { + resultsFromMetadata++; + return ctx.reader().maxDoc(); // TODO we could use numDocs even if live is not null because it provides accurate numDocs. + } + return super.count(ctx, counter, live); + } + + @Override + long estimateCountCost(LeafReaderContext ctx, CheckedSupplier canUseMetadata) throws IOException { + return canUseMetadata.get() ?
0 : ctx.reader().maxDoc(); + } + + @Override + void collectDebugInfo(BiConsumer add) { + super.collectDebugInfo(add); + add.accept("specialized_for", "match_all"); + add.accept("results_from_metadata", resultsFromMetadata); + } + } + + private static class TermQueryToFilterAdapter extends QueryToFilterAdapter { + private int resultsFromMetadata; + + private TermQueryToFilterAdapter(IndexSearcher searcher, String key, TermQuery query) { + super(searcher, key, query); + } + + @Override + long count(LeafReaderContext ctx, FiltersAggregator.Counter counter, Bits live) throws IOException { + if (countCanUseMetadata(counter, live)) { + resultsFromMetadata++; + return ctx.reader().docFreq(query().getTerm()); + } + return super.count(ctx, counter, live); + } + + @Override + long estimateCountCost(LeafReaderContext ctx, CheckedSupplier canUseMetadata) throws IOException { + if (canUseMetadata.get()) { + return 0; + } + return super.estimateCountCost(ctx, canUseMetadata); + } + + @Override + void collectDebugInfo(BiConsumer add) { + super.collectDebugInfo(add); + add.accept("specialized_for", "term"); + add.accept("results_from_metadata", resultsFromMetadata); + } + } +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/range/RangeAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/range/RangeAggregator.java index f40bd1750232c..dd9536f1dc105 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/range/RangeAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/range/RangeAggregator.java @@ -8,7 +8,6 @@ package org.elasticsearch.search.aggregations.bucket.range; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.ScorerSupplier; import org.elasticsearch.common.CheckedFunction; @@ -37,6 +36,7 @@ import org.elasticsearch.search.aggregations.LeafBucketCollectorBase; import org.elasticsearch.search.aggregations.NonCollectingAggregator; import org.elasticsearch.search.aggregations.bucket.BucketsAggregator; +import org.elasticsearch.search.aggregations.bucket.filter.QueryToFilterAdapter; import org.elasticsearch.search.aggregations.bucket.filter.FiltersAggregator; import org.elasticsearch.search.aggregations.bucket.filter.InternalFilters; import org.elasticsearch.search.aggregations.bucket.range.InternalRange.Factory; @@ -345,7 +345,6 @@ public static FromFilters adaptIntoFiltersOrNull( if (averageDocsPerRange < DOCS_PER_RANGE_TO_USE_FILTERS) { return null; } - // TODO bail here for runtime fields. We should check the cost estimates on the Scorer. if (valuesSourceConfig.fieldType() instanceof DateFieldType && ((DateFieldType) valuesSourceConfig.fieldType()).resolution() == Resolution.NANOSECONDS) { // We don't generate sensible Queries for nanoseconds. 
@@ -355,8 +354,7 @@ public static FromFilters adaptIntoFiltersOrNull( return null; } boolean wholeNumbersOnly = false == ((ValuesSource.Numeric) valuesSourceConfig.getValuesSource()).isFloatingPoint(); - String[] keys = new String[ranges.length]; - Query[] filters = new Query[ranges.length]; + List> filters = new ArrayList<>(ranges.length); for (int i = 0; i < ranges.length; i++) { /* * If the bounds on the ranges are too high then the `double`s @@ -371,7 +369,6 @@ public static FromFilters adaptIntoFiltersOrNull( if (wholeNumbersOnly && ranges[i].to != Double.POSITIVE_INFINITY && Math.abs(ranges[i].to) > MAX_ACCURATE_BOUND) { return null; } - keys[i] = Integer.toString(i); /* * Use the native format on the field rather than the one provided * on the valuesSourceConfig because the format on the field is what @@ -383,7 +380,7 @@ public static FromFilters adaptIntoFiltersOrNull( RangeQueryBuilder builder = new RangeQueryBuilder(valuesSourceConfig.fieldType().name()); builder.from(ranges[i].from == Double.NEGATIVE_INFINITY ? null : format.format(ranges[i].from)).includeLower(true); builder.to(ranges[i].to == Double.POSITIVE_INFINITY ? null : format.format(ranges[i].to)).includeUpper(false); - filters[i] = context.buildQuery(builder); + filters.add(QueryToFilterAdapter.build(context.searcher(), Integer.toString(i), context.buildQuery(builder))); } RangeAggregator.FromFilters fromFilters = new RangeAggregator.FromFilters<>( parent, @@ -392,7 +389,6 @@ public static FromFilters adaptIntoFiltersOrNull( return FiltersAggregator.buildFilterByFilter( name, subAggregators, - keys, filters, false, null, diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregatorFromFilters.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregatorFromFilters.java index b4359ec1a7ebb..8a705a47c5036 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregatorFromFilters.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregatorFromFilters.java @@ -10,7 +10,6 @@ import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.Query; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; import org.elasticsearch.common.CheckedFunction; @@ -24,6 +23,7 @@ import org.elasticsearch.search.aggregations.InternalAggregation; import org.elasticsearch.search.aggregations.InternalOrder; import org.elasticsearch.search.aggregations.bucket.MultiBucketsAggregation.Bucket; +import org.elasticsearch.search.aggregations.bucket.filter.QueryToFilterAdapter; import org.elasticsearch.search.aggregations.bucket.filter.FiltersAggregator; import org.elasticsearch.search.aggregations.bucket.filter.InternalFilters; import org.elasticsearch.search.aggregations.bucket.terms.GlobalOrdinalsStringTermsAggregator.OrdBucket; @@ -71,15 +71,13 @@ static StringTermsAggregatorFromFilters adaptIntoFiltersOrNull( if (false == FiltersAggregator.canUseFilterByFilter(parent, factories, null)) { return null; } - List keys = new ArrayList<>(); - List filters = new ArrayList<>(); + List> filters = new ArrayList<>(); TermsEnum terms = values.termsEnum(); for (long ord = 0; ord < values.getValueCount(); ord++) { if (acceptedOrds.test(ord) == false) { continue; } terms.seekExact(ord); - keys.add(Long.toString(ord)); /* * It *feels* like there should be a query that operates * directly on the global 
ordinals but there isn't. Building @@ -92,7 +90,7 @@ static StringTermsAggregatorFromFilters adaptIntoFiltersOrNull( valuesSourceConfig.fieldContext().field(), valuesSourceConfig.format().format(terms.term()) ); - filters.add(context.buildQuery(b)); + filters.add(QueryToFilterAdapter.build(context.searcher(), Long.toString(ord), context.buildQuery(b))); } StringTermsAggregatorFromFilters adapted = new StringTermsAggregatorFromFilters( parent, @@ -100,8 +98,7 @@ static StringTermsAggregatorFromFilters adaptIntoFiltersOrNull( subAggs -> FiltersAggregator.buildFilterByFilter( name, subAggs, - keys.toArray(String[]::new), - filters.toArray(Query[]::new), + filters, false, null, context, diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/filter/FiltersAggregatorTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/filter/FiltersAggregatorTests.java index ca7f9b23c85ee..32490c97060d0 100644 --- a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/filter/FiltersAggregatorTests.java +++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/filter/FiltersAggregatorTests.java @@ -10,32 +10,49 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.LongPoint; +import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.search.CollectionTerminatedException; import org.apache.lucene.search.IndexOrDocValuesQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.CheckedConsumer; +import org.elasticsearch.common.lucene.index.ElasticsearchDirectoryReader; import org.elasticsearch.common.lucene.search.Queries; +import org.elasticsearch.index.cache.bitset.BitsetFilterCache; +import org.elasticsearch.index.mapper.CustomTermFreqField; import org.elasticsearch.index.mapper.DateFieldMapper; import org.elasticsearch.index.mapper.DateFieldMapper.Resolution; +import org.elasticsearch.index.mapper.DocCountFieldMapper; import org.elasticsearch.index.mapper.KeywordFieldMapper; import org.elasticsearch.index.mapper.KeywordFieldMapper.KeywordFieldType; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.ObjectMapper; import org.elasticsearch.index.query.MatchAllQueryBuilder; +import org.elasticsearch.index.query.MatchQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.index.query.RangeQueryBuilder; +import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.index.query.TermQueryBuilder; +import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.search.aggregations.AggregationBuilder; +import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.AggregatorTestCase; +import org.elasticsearch.search.aggregations.InternalAggregation; import org.elasticsearch.search.aggregations.bucket.filter.FiltersAggregator.KeyedFilter; import org.elasticsearch.search.aggregations.bucket.nested.NestedAggregatorTests; +import 
org.elasticsearch.search.aggregations.pipeline.PipelineAggregator.PipelineTree; +import org.elasticsearch.search.aggregations.support.AggregationContext; import org.elasticsearch.search.aggregations.support.AggregationInspectionHelper; +import org.elasticsearch.search.internal.ContextIndexSearcherTests.DocumentSubsetDirectoryReader; import org.junit.Before; import java.io.IOException; @@ -46,6 +63,7 @@ import java.util.Set; import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.hasEntry; import static org.hamcrest.Matchers.hasSize; import static org.hamcrest.Matchers.instanceOf; @@ -203,27 +221,43 @@ public void testRandom() throws Exception { } /** - * Test that we perform the appropriate unwrapping to merged queries. + * Test that we perform the appropriate unwrapping to merge queries. */ - public void testFilterMatchingBoth() throws IOException { - Query topLevelQuery = LongPoint.newRangeQuery( - "test", - DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER.parseMillis("2020-01-01"), - DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER.parseMillis("2020-02-01") - ); - Query filterQuery = LongPoint.newRangeQuery( - "test", - DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER.parseMillis("2020-01-01"), - DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER.parseMillis("2020-02-01") + public void testMergingQueries() throws IOException { + DateFieldMapper.DateFieldType ft = new DateFieldMapper.DateFieldType("test"); + Query topLevelQuery = ft.rangeQuery("2020-01-01", "2020-02-01", true, true, null, null, null, mock(SearchExecutionContext.class)); + FiltersAggregationBuilder builder = new FiltersAggregationBuilder( + "t", + // The range query will be wrapped in IndexOrDocValuesQuery by the date field type + new KeyedFilter("k", new RangeQueryBuilder("test").from("2020-01-01").to("2020-02-01")) ); - Query matchingBoth = FiltersAggregator.filterMatchingBoth(new IndexOrDocValuesQuery(topLevelQuery, mock(Query.class)), filterQuery); - /* - * The topLevelQuery is entirely contained within the filter query so - * it is good enough to match that. See MergedPointRangeQueryTests for - * tons more tests around this. Really in this test we're just excited - * to prove that we unwrapped the IndexOrDocValuesQuery above. - */ - assertThat(matchingBoth, equalTo(topLevelQuery)); + withAggregator(builder, topLevelQuery, iw -> { + /* + * There has to be a document inside the query and one outside + * the query or we'll end up with MatchAll or MatchNone. + */ + long time = DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER.parseMillis("2010-01-02"); + iw.addDocument(List.of(new LongPoint("test", time), new SortedNumericDocValuesField("test", time))); + time = DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER.parseMillis("2020-01-02"); + iw.addDocument(List.of(new LongPoint("test", time), new SortedNumericDocValuesField("test", time))); + time = DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER.parseMillis("2020-01-01"); + iw.addDocument(List.of(new LongPoint("test", time), new SortedNumericDocValuesField("test", time))); + time = DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER.parseMillis("2020-02-01"); + iw.addDocument(List.of(new LongPoint("test", time), new SortedNumericDocValuesField("test", time))); + }, (searcher, aggregator) -> { + /* + * The topLevelQuery is entirely contained within the filter query so + * it is good enough to match that. See MergedPointRangeQueryTests for + * tons more tests around this.
Really in this test we're just excited + * to prove that we unwrapped the IndexOrDocValuesQuery that the date + * field mapper adds. + */ + QueryToFilterAdapter filter = ((FiltersAggregator) aggregator).filters().get(0); + assertThat(filter.query(), equalTo(((IndexOrDocValuesQuery) topLevelQuery).getIndexQuery())); + Map debug = new HashMap<>(); + filter.collectDebugInfo(debug::put); + assertThat(debug, hasEntry("query", ((IndexOrDocValuesQuery) topLevelQuery).getIndexQuery().toString())); + }, ft); } public void testWithMergedPointRangeQueries() throws IOException { @@ -274,13 +308,17 @@ public void testFilterByFilterCost() throws IOException { FiltersAggregator.FilterByFilter filterByFilter = (FiltersAggregator.FilterByFilter) agg; int maxDoc = searcher.getIndexReader().maxDoc(); assertThat(filterByFilter.estimateCost(maxDoc), equalTo(1L)); - assertThat(filterByFilter.scorersCached(), equalTo(true)); Map debug = new HashMap<>(); filterByFilter.collectDebugInfo(debug::put); assertThat(debug, hasEntry("segments_with_deleted_docs", 0)); assertThat(debug, hasEntry("estimated_cost", 1L)); assertThat(debug, hasEntry("max_cost", (long) maxDoc)); assertThat(debug, hasEntry("estimate_cost_time", 0L)); + List filtersDebug = (List) debug.get("filters"); + for (int i = 0; i < filterByFilter.filters().size(); i++) { + Map filterDebug = (Map) filtersDebug.get(i); + assertThat((int) filterDebug.get("scorers_prepared_while_estimating_cost"), greaterThan(0)); + } }, ft ); @@ -319,10 +357,177 @@ public void testNested() throws IOException { ); } + public void testMatchAll() throws IOException { + AggregationBuilder builder = new FiltersAggregationBuilder("test", new KeyedFilter("q1", new MatchAllQueryBuilder())); + CheckedConsumer buildIndex = iw -> { + for (int i = 0; i < 10; i++) { + iw.addDocument(List.of()); + } + }; + withAggregator(builder, new MatchAllDocsQuery(), buildIndex, (searcher, aggregator) -> { + assertThat(aggregator, instanceOf(FiltersAggregator.FilterByFilter.class)); + // The estimated cost is 0 because we're going to read from metadata + assertThat(((FiltersAggregator.FilterByFilter) aggregator).estimateCost(Long.MAX_VALUE), equalTo(0L)); + Map debug = collectAndGetFilterDebugInfo(searcher, aggregator); + assertThat(debug, hasEntry("specialized_for", "match_all")); + assertThat((int) debug.get("results_from_metadata"), greaterThan(0)); + }); + testCase( + builder, + new MatchAllDocsQuery(), + buildIndex, + (InternalFilters result) -> { + assertThat(result.getBuckets(), hasSize(1)); + assertThat(result.getBucketByKey("q1").getDocCount(), equalTo(10L)); + } + ); + } + + public void testMatchAllWithDocCount() throws IOException { + AggregationBuilder builder = new FiltersAggregationBuilder("test", new KeyedFilter("q1", new MatchAllQueryBuilder())); + CheckedConsumer buildIndex = iw -> { + for (int i = 0; i < 10; i++) { + iw.addDocument(List.of(new CustomTermFreqField(DocCountFieldMapper.NAME, DocCountFieldMapper.NAME, i + 1))); + } + }; + withAggregator(builder, new MatchAllDocsQuery(), buildIndex, (searcher, aggregator) -> { + assertThat(aggregator, instanceOf(FiltersAggregator.FilterByFilter.class)); + // The estimated cost is 10 because the doc_count field keeps us from counting from metadata + assertThat(((FiltersAggregator.FilterByFilter) aggregator).estimateCost(Long.MAX_VALUE), equalTo(10L)); + Map debug = collectAndGetFilterDebugInfo(searcher, aggregator); + assertThat(debug, hasEntry("specialized_for", "match_all")); + assertThat(debug, hasEntry("results_from_metadata", 0)); + }); + testCase( + builder,
+ new MatchAllDocsQuery(), + buildIndex, + (InternalFilters result) -> { + assertThat(result.getBuckets(), hasSize(1)); + assertThat(result.getBucketByKey("q1").getDocCount(), equalTo(55L)); + } + ); + } + + /** + * This runs {@code filters} with a single {@code match_all} filter with + * the index set up kind of like document level security. As a bonus, this + * "looks" to the agg just like an index with deleted documents. + */ + public void testMatchAllOnFilteredIndex() throws IOException { + AggregationBuilder builder = new FiltersAggregationBuilder("test", new KeyedFilter("q1", new MatchAllQueryBuilder())); + try (Directory directory = newDirectory()) { + RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory); + for (int i = 0; i < 10; i++) { + indexWriter.addDocument(List.of(new LongPoint("t", i))); + } + indexWriter.close(); + + try (DirectoryReader directoryReader = DirectoryReader.open(directory)) { + BitsetFilterCache bitsetFilterCache = new BitsetFilterCache(createIndexSettings(), new BitsetFilterCache.Listener() { + @Override + public void onRemoval(ShardId shardId, Accountable accountable) {} + + @Override + public void onCache(ShardId shardId, Accountable accountable) {} + }); + IndexReader limitedReader = new DocumentSubsetDirectoryReader( + ElasticsearchDirectoryReader.wrap(directoryReader, new ShardId(bitsetFilterCache.index(), 0)), + bitsetFilterCache, + LongPoint.newRangeQuery("t", 5, Long.MAX_VALUE) + ); + IndexSearcher searcher = newIndexSearcher(limitedReader); + AggregationContext context = createAggregationContext(searcher, new MatchAllDocsQuery()); + FiltersAggregator.FilterByFilter aggregator = createAggregator(builder, context); + // The estimated cost is 10 because the role query filters the reader so we can't count from metadata + assertThat(((FiltersAggregator.FilterByFilter) aggregator).estimateCost(Long.MAX_VALUE), equalTo(10L)); + aggregator.preCollection(); + searcher.search(context.query(), aggregator); + aggregator.postCollection(); + InternalAggregation result = aggregator.buildTopLevel(); + result = result.reduce( + List.of(result), + InternalAggregation.ReduceContext.forFinalReduction( + context.bigArrays(), + getMockScriptService(), + b -> {}, + PipelineTree.EMPTY + ) + ); + InternalFilters filters = (InternalFilters) result; + assertThat(filters.getBuckets(), hasSize(1)); + assertThat(filters.getBucketByKey("q1").getDocCount(), equalTo(5L)); + Map debug = new HashMap<>(); + ((FiltersAggregator.FilterByFilter) aggregator).filters().get(0).collectDebugInfo(debug::put); + assertThat(debug, hasEntry("specialized_for", "match_all")); + assertThat(debug, hasEntry("results_from_metadata", 0)); + } + } + } + + public void testMatchNone() throws IOException { + AggregationBuilder builder = new FiltersAggregationBuilder("test", new KeyedFilter("q1", new RangeQueryBuilder("missing").gte(0))); + CheckedConsumer buildIndex = iw -> { + for (int i = 0; i < 10; i++) { + iw.addDocument(List.of(new LongPoint("t", i))); + } + }; + withAggregator(builder, new MatchAllDocsQuery(), buildIndex, (searcher, aggregator) -> { + assertThat(aggregator, instanceOf(FiltersAggregator.FilterByFilter.class)); + // The estimated cost is 0 because match_none can't match anything + assertThat(((FiltersAggregator.FilterByFilter) aggregator).estimateCost(Long.MAX_VALUE), equalTo(0L)); + Map debug = collectAndGetFilterDebugInfo(searcher, aggregator); + assertThat(debug, hasEntry("specialized_for", "match_none")); + }); + testCase( + builder, + new MatchAllDocsQuery(), + buildIndex, + (InternalFilters
result) -> { + assertThat(result.getBuckets(), hasSize(1)); + assertThat(result.getBucketByKey("q1").getDocCount(), equalTo(0L)); + } + ); + } + + public void testTermQuery() throws IOException { + KeywordFieldMapper.KeywordFieldType ft = new KeywordFieldMapper.KeywordFieldType("f", true, false, null); + AggregationBuilder builder = new FiltersAggregationBuilder("test", new KeyedFilter("q1", new MatchQueryBuilder("f", "0"))); + CheckedConsumer buildIndex = iw -> { + for (int i = 0; i < 10; i++) { + BytesRef bytes = new BytesRef(Integer.toString(i % 3)); + iw.addDocument(List.of(new Field("f", bytes, KeywordFieldMapper.Defaults.FIELD_TYPE))); + } + }; + withAggregator(builder, new MatchAllDocsQuery(), buildIndex, (searcher, aggregator) -> { + assertThat(aggregator, instanceOf(FiltersAggregator.FilterByFilter.class)); + // The estimated cost is 0 because we're going to read from metadata + assertThat(((FiltersAggregator.FilterByFilter) aggregator).estimateCost(Long.MAX_VALUE), equalTo(0L)); + Map debug = collectAndGetFilterDebugInfo(searcher, aggregator); + assertThat(debug, hasEntry("specialized_for", "term")); + assertThat((int) debug.get("results_from_metadata"), greaterThan(0)); + assertThat((int) debug.get("scorers_prepared_while_estimating_cost"), equalTo(0)); + }, ft); + testCase(builder, new MatchAllDocsQuery(), buildIndex, (InternalFilters result) -> { + assertThat(result.getBuckets(), hasSize(1)); + assertThat(result.getBucketByKey("q1").getDocCount(), equalTo(4L)); + }, ft); + } + @Override protected List objectMappers() { return MOCK_OBJECT_MAPPERS; } + private Map collectAndGetFilterDebugInfo(IndexSearcher searcher, Aggregator aggregator) throws IOException { + aggregator.preCollection(); + for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) { + expectThrows(CollectionTerminatedException.class, () -> aggregator.getLeafCollector(ctx)); + } + Map debug = new HashMap<>(); + ((FiltersAggregator.FilterByFilter) aggregator).filters().get(0).collectDebugInfo(debug::put); + return debug; + } + static final List MOCK_OBJECT_MAPPERS = List.of(NestedAggregatorTests.nestedObject("nested_chapters")); } diff --git a/server/src/test/java/org/elasticsearch/search/internal/ContextIndexSearcherTests.java b/server/src/test/java/org/elasticsearch/search/internal/ContextIndexSearcherTests.java index 44cca94dbf3f4..1d5cb729a96ca 100644 --- a/server/src/test/java/org/elasticsearch/search/internal/ContextIndexSearcherTests.java +++ b/server/src/test/java/org/elasticsearch/search/internal/ContextIndexSearcherTests.java @@ -276,11 +276,11 @@ private SparseFixedBitSet query(LeafReaderContext leaf, String field, String val return sparseFixedBitSet; } - private static class DocumentSubsetDirectoryReader extends FilterDirectoryReader { + public static class DocumentSubsetDirectoryReader extends FilterDirectoryReader { private final BitsetFilterCache bitsetFilterCache; private final Query roleQuery; - DocumentSubsetDirectoryReader(DirectoryReader in, BitsetFilterCache bitsetFilterCache, Query roleQuery) throws IOException { + public DocumentSubsetDirectoryReader(DirectoryReader in, BitsetFilterCache bitsetFilterCache, Query roleQuery) throws IOException { super(in, new SubReaderWrapper() { @Override public LeafReader wrap(LeafReader reader) { diff --git a/test/framework/src/main/java/org/elasticsearch/search/aggregations/AggregatorTestCase.java b/test/framework/src/main/java/org/elasticsearch/search/aggregations/AggregatorTestCase.java index 0c0da065a3850..68ecb22f93256 100644 --- 
a/test/framework/src/main/java/org/elasticsearch/search/aggregations/AggregatorTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/search/aggregations/AggregatorTestCase.java @@ -556,6 +556,7 @@ protected void withAggregator( } } + protected void verifyOutputFieldNames(T aggregationBuilder, V agg) throws IOException { if (aggregationBuilder.getOutputFieldNames().isEmpty()) { diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/security/authz/21_search_doc.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/security/authz/21_search_doc.yml index cd7b4772ce949..32172812ab236 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/security/authz/21_search_doc.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/security/authz/21_search_doc.yml @@ -271,3 +271,20 @@ teardown: - match: { responses.2.hits.total: 0 } # no-read - match: { responses.3.hits.total: 1 } # only_read + tag-a - match: { responses.4.error.type: "security_exception" } # only_delete + tag-a + +--- +"filters agg match_all doesn't count invisible docs": + - do: + headers: { Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" } # test_user + search: + rest_total_hits_as_int: true + size: 0 + body: + aggs: + f: + filters: + filters: + - match_all: {} + + - match: { hits.total: 6 } # can-read, read_write, everything + - match: { aggregations.f.buckets.0.doc_count: 6 }
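One note on that last yml test: it passes because document level security shows up to the aggregator as a non-null live-docs bitset — the same signal as deleted documents — so the metadata shortcut turns itself off and the invisible docs are never counted. Here's a minimal sketch of that guard, mirroring the `countCanUseMetadata` check in the new `QueryToFilterAdapter`; the standalone class and the boolean parameter are illustrative, not the shipped signature:

```java
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.util.Bits;

// Sketch of the "is it safe to answer from precomputed metadata?" check.
class MetadataGuardSketch {
    static boolean countCanUseMetadata(LeafReaderContext ctx, boolean docCountAlwaysOne) {
        Bits live = ctx.reader().getLiveDocs();
        if (live != null) {
            // Non-null live bits means deleted docs *or* document level
            // security hiding docs, so precomputed counts would be wrong.
            return false;
        }
        // Metadata counts each Lucene doc exactly once, so the _doc_count
        // field must not be in play either.
        return docCountAlwaysOne;
    }
}
```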