Skip to content

Commit

Permalink
Disable optimization if we aren't sure it's faster (backport of elasti…
Browse files Browse the repository at this point in the history
…c#74260)

This disables the filter-by-filter aggregation optimization used by
`terms`, `range`, `date_histogram`, and `date_range` aggregations unless
we're *sure* that it's faster than the "native" implementation. Mostly this
is when the top level query is empty or we can merge it into the filter
generated by the agg rewrite process.

Now that we have hard and fast rules we can drop the cost estimation
framework without too much fear. So we remove it in this change. It
strips out a bunch of complexity. Sadly, without the cost estimation stuff
we have to add a separate mechanism for blocking the optimization
against runtime fields for which it'd be kind of garbage. For that I
added another rule preventing the filter-by-filter aggregation from
running against the queries made by runtime fields. It's not fool-proof,
but we have control over what queries we pass as a filter, so it's not
wide open.

I spent a lot of time working on an alternative to this that preserved
that fancy filter-by-filter collection mechanism and was much more kind
to the query cache. It detected cases where going full filter-by-filter
was bad and grouped those filters together to collect in one pass with a
funny ORing collector. It *worked*. And, if we were super concerned with
the performance of the `filters` aggregation it'd be the way to go. But
it was very complex and it was actually slower than using the native
aggregation for things like `terms` and `date_histogram`. It was
glorious. But it was wrong for us. Too complex and optimized the wrong
things.

So here we are. Hopefully this is a fairly simple solution to a sneaky
problem.
  • Loading branch information
nik9000 committed Jun 24, 2021
1 parent 7d0ef61 commit 5bc0be4
Show file tree
Hide file tree
Showing 22 changed files with 1,062 additions and 784 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -822,8 +822,8 @@ setup:
---
"string profiler via global ordinals filters implementation":
- skip:
version: " - 7.12.99"
reason: filters implementation first supported with sub-aggregators in 7.13.0
version: " - 7.99.99"
reason: profile info changed in 8.0.0 to be backported to 7.14.0
- do:
indices.create:
index: test_3
Expand Down Expand Up @@ -879,7 +879,7 @@ setup:
- match: { profile.shards.0.aggregations.0.type: StringTermsAggregatorFromFilters }
- match: { profile.shards.0.aggregations.0.description: str_terms }
- match: { profile.shards.0.aggregations.0.breakdown.collect_count: 0 }
- match: { profile.shards.0.aggregations.0.debug.delegate: FiltersAggregator.FilterByFilter }
- match: { profile.shards.0.aggregations.0.debug.delegate: FilterByFilterAggregator }
- match: { profile.shards.0.aggregations.0.debug.delegate_debug.filters.0.query: "str:cow" }
- match: { profile.shards.0.aggregations.0.debug.delegate_debug.filters.1.query: "str:pig" }
- match: { profile.shards.0.aggregations.0.debug.delegate_debug.filters.2.query: "str:sheep" }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,10 @@ setup:
---
"Test filters agg with doc_count":
- skip:
version: " - 7.12.99"
version: " - 7.13.99"
reason: profile info changed in 7.14.0
features: default_shards
reason: "name changed in 7.13.0"

- do:
search:
body:
Expand All @@ -177,7 +178,7 @@ setup:
- match: { aggregations.f.buckets.abc.doc_count: 11 }
- match: { aggregations.f.buckets.foo.doc_count: 8 }
- match: { aggregations.f.buckets.xyz.doc_count: 5 }
- match: { profile.shards.0.aggregations.0.type: FiltersAggregator.FilterByFilter }
- match: { profile.shards.0.aggregations.0.type: FilterByFilterAggregator }
# We can't assert that segments_with_doc_count_field is > 0 because we might
# end up with two shards and all of the documents with the _doc_count field
# may be on one field. We have a test for this in AggregationProfilerIT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.hamcrest.Matchers.notNullValue;

@ESIntegTestCase.SuiteScopeTestCase
Expand Down Expand Up @@ -646,21 +645,17 @@ public void testFilterByFilter() throws InterruptedException, IOException {
"delegate_debug",
matchesMap().entry("average_docs_per_range", equalTo(RangeAggregator.DOCS_PER_RANGE_TO_USE_FILTERS * 2))
.entry("ranges", 1)
.entry("delegate", "FiltersAggregator.FilterByFilter")
.entry("delegate", "FilterByFilterAggregator")
.entry(
"delegate_debug",
matchesMap().entry("segments_with_deleted_docs", 0)
.entry("segments_with_doc_count_field", 0)
.entry("max_cost", (long) RangeAggregator.DOCS_PER_RANGE_TO_USE_FILTERS * 2)
.entry("estimated_cost", (long) RangeAggregator.DOCS_PER_RANGE_TO_USE_FILTERS * 2)
.entry("estimate_cost_time", greaterThanOrEqualTo(0L)) // ~1,276,734 nanos is normal
.entry("segments_counted", 0)
.entry("segments_collected", greaterThan(0))
.entry(
"filters",
matchesList().item(
matchesMap().entry("scorers_prepared_while_estimating_cost", greaterThan(0))
.entry("query", "DocValuesFieldExistsQuery [field=date]")
matchesMap().entry("query", "DocValuesFieldExistsQuery [field=date]")
.entry("specialized_for", "docvalues_field_exists")
.entry("results_from_metadata", 0)
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public KeywordScriptFieldType(String name) {
this(name, StringFieldScript.PARSE_FROM_SOURCE, null, Collections.emptyMap(), (builder, params) -> builder);
}

KeywordScriptFieldType(
public KeywordScriptFieldType(
String name,
StringFieldScript.Factory scriptFactory,
Script script,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public LongScriptFieldType(String name) {
this(name, LongFieldScript.PARSE_FROM_SOURCE, null, Collections.emptyMap(), (builder, params) -> builder);
}

LongScriptFieldType(
public LongScriptFieldType(
String name,
LongFieldScript.Factory scriptFactory,
Script script,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public abstract class AdaptingAggregator extends Aggregator {
public AdaptingAggregator(
Aggregator parent,
AggregatorFactories subAggregators,
CheckedFunction<AggregatorFactories, Aggregator, IOException> delegate
CheckedFunction<AggregatorFactories, ? extends Aggregator, IOException> delegate
) throws IOException {
// Its important we set parent first or else when we build the sub-aggregators they can fail because they'll call this.parent.
this.parent = parent;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.Bits;
import org.elasticsearch.common.CheckedSupplier;

import java.io.IOException;
import java.util.function.BiConsumer;
Expand Down Expand Up @@ -43,14 +42,6 @@ long count(LeafReaderContext ctx, FiltersAggregator.Counter counter, Bits live)
return super.count(ctx, counter, live);
}

@Override
long estimateCountCost(LeafReaderContext ctx, CheckedSupplier<Boolean, IOException> canUseMetadata) throws IOException {
if (canUseMetadata.get() && canCountFromMetadata(ctx)) {
return 0;
}
return super.estimateCountCost(ctx, canUseMetadata);
}

private boolean canCountFromMetadata(LeafReaderContext ctx) throws IOException {
FieldInfo info = ctx.reader().getFieldInfos().fieldInfo(query().getField());
if (info == null) {
Expand Down
Loading

0 comments on commit 5bc0be4

Please sign in to comment.