From 625e9d544d88dae432ae6313871f3891836eef15 Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Thu, 5 Dec 2024 07:54:33 +0100 Subject: [PATCH 1/2] Remove bucketOrd field from InternalTerms and friends --- .../search/aggregations/BucketOrder.java | 8 +- .../search/aggregations/InternalOrder.java | 21 +-- .../countedterms/CountedTermsAggregator.java | 95 +++++++----- .../bucket/terms/BucketPriorityQueue.java | 8 +- .../BucketSignificancePriorityQueue.java | 6 +- .../GlobalOrdinalsStringTermsAggregator.java | 144 +++++++++++------- .../terms/InternalSignificantTerms.java | 15 +- .../bucket/terms/InternalTerms.java | 10 -- .../terms/MapStringTermsAggregator.java | 103 +++++++------ .../bucket/terms/NumericTermsAggregator.java | 115 ++++++++------ .../bucket/terms/TermsAggregator.java | 6 +- .../bucket/terms/TermsAggregatorFactory.java | 6 +- .../multiterms/InternalMultiTerms.java | 3 - .../multiterms/MultiTermsAggregator.java | 103 +++++++------ 14 files changed, 362 insertions(+), 281 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/BucketOrder.java b/server/src/main/java/org/elasticsearch/search/aggregations/BucketOrder.java index 2d360705f75b6..c412ecb5d6361 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/BucketOrder.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/BucketOrder.java @@ -12,6 +12,7 @@ import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.search.aggregations.bucket.MultiBucketsAggregation.Bucket; +import org.elasticsearch.search.aggregations.bucket.terms.BucketAndOrd; import org.elasticsearch.search.aggregations.support.AggregationPath; import org.elasticsearch.xcontent.ToXContentObject; @@ -20,13 +21,12 @@ import java.util.Comparator; import java.util.List; import java.util.function.BiFunction; -import java.util.function.ToLongFunction; /** * {@link Bucket} ordering strategy. Buckets can be order either as * "complete" buckets using {@link #comparator()} or against a combination * of the buckets internals with its ordinal with - * {@link #partiallyBuiltBucketComparator(ToLongFunction, Aggregator)}. + * {@link #partiallyBuiltBucketComparator(Aggregator)}. */ public abstract class BucketOrder implements ToXContentObject, Writeable { /** @@ -102,7 +102,7 @@ public final void validate(Aggregator aggregator) throws AggregationExecutionExc * to validate this order because doing so checks all of the appropriate * paths. */ - partiallyBuiltBucketComparator(null, aggregator); + partiallyBuiltBucketComparator(aggregator); } /** @@ -121,7 +121,7 @@ public final void validate(Aggregator aggregator) throws AggregationExecutionExc * with it all the time. *

*/ - public abstract Comparator partiallyBuiltBucketComparator(ToLongFunction ordinalReader, Aggregator aggregator); + public abstract Comparator> partiallyBuiltBucketComparator(Aggregator aggregator); /** * Build a comparator for fully built buckets. diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/InternalOrder.java b/server/src/main/java/org/elasticsearch/search/aggregations/InternalOrder.java index b2ca4a10dc4b3..3593eb5adf7e4 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/InternalOrder.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/InternalOrder.java @@ -15,6 +15,7 @@ import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.search.aggregations.Aggregator.BucketComparator; import org.elasticsearch.search.aggregations.bucket.MultiBucketsAggregation.Bucket; +import org.elasticsearch.search.aggregations.bucket.terms.BucketAndOrd; import org.elasticsearch.search.aggregations.support.AggregationPath; import org.elasticsearch.search.sort.SortOrder; import org.elasticsearch.search.sort.SortValue; @@ -30,7 +31,6 @@ import java.util.List; import java.util.Objects; import java.util.function.BiFunction; -import java.util.function.ToLongFunction; /** * Implementations for {@link Bucket} ordering strategies. @@ -63,10 +63,10 @@ public AggregationPath path() { } @Override - public Comparator partiallyBuiltBucketComparator(ToLongFunction ordinalReader, Aggregator aggregator) { + public Comparator> partiallyBuiltBucketComparator(Aggregator aggregator) { try { BucketComparator bucketComparator = path.bucketComparator(aggregator, order); - return (lhs, rhs) -> bucketComparator.compare(ordinalReader.applyAsLong(lhs), ordinalReader.applyAsLong(rhs)); + return (lhs, rhs) -> bucketComparator.compare(lhs.ord, rhs.ord); } catch (IllegalArgumentException e) { throw new AggregationExecutionException.InvalidPath("Invalid aggregation order path [" + path + "]. " + e.getMessage(), e); } @@ -188,12 +188,13 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws } @Override - public Comparator partiallyBuiltBucketComparator(ToLongFunction ordinalReader, Aggregator aggregator) { - List> comparators = orderElements.stream() - .map(oe -> oe.partiallyBuiltBucketComparator(ordinalReader, aggregator)) - .toList(); + public Comparator> partiallyBuiltBucketComparator(Aggregator aggregator) { + List>> comparators = new ArrayList<>(orderElements.size()); + for (BucketOrder order : orderElements) { + comparators.add(order.partiallyBuiltBucketComparator(aggregator)); + } return (lhs, rhs) -> { - for (Comparator c : comparators) { + for (Comparator> c : comparators) { int result = c.compare(lhs, rhs); if (result != 0) { return result; @@ -299,9 +300,9 @@ byte id() { } @Override - public Comparator partiallyBuiltBucketComparator(ToLongFunction ordinalReader, Aggregator aggregator) { + public Comparator> partiallyBuiltBucketComparator(Aggregator aggregator) { Comparator comparator = comparator(); - return comparator::compare; + return (lhs, rhs) -> comparator.compare(lhs.bucket, rhs.bucket); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/countedterms/CountedTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/countedterms/CountedTermsAggregator.java index 344b90b06c4f6..5c4b4223027bc 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/countedterms/CountedTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/countedterms/CountedTermsAggregator.java @@ -13,6 +13,7 @@ import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.util.IntArray; import org.elasticsearch.common.util.LongArray; import org.elasticsearch.common.util.ObjectArray; import org.elasticsearch.core.Releasables; @@ -26,6 +27,7 @@ import org.elasticsearch.search.aggregations.InternalOrder; import org.elasticsearch.search.aggregations.LeafBucketCollector; import org.elasticsearch.search.aggregations.LeafBucketCollectorBase; +import org.elasticsearch.search.aggregations.bucket.terms.BucketAndOrd; import org.elasticsearch.search.aggregations.bucket.terms.BucketPriorityQueue; import org.elasticsearch.search.aggregations.bucket.terms.BytesKeyedBucketOrds; import org.elasticsearch.search.aggregations.bucket.terms.InternalTerms; @@ -115,51 +117,64 @@ public InternalAggregation[] buildAggregations(LongArray owningBucketOrds) throw LongArray otherDocCounts = bigArrays().newLongArray(owningBucketOrds.size()); ObjectArray topBucketsPerOrd = bigArrays().newObjectArray(owningBucketOrds.size()) ) { - for (long ordIdx = 0; ordIdx < topBucketsPerOrd.size(); ordIdx++) { - int size = (int) Math.min(bucketOrds.size(), bucketCountThresholds.getShardSize()); - - // as users can't control sort order, in practice we'll always sort by doc count descending - try ( - BucketPriorityQueue ordered = new BucketPriorityQueue<>( - size, - bigArrays(), - partiallyBuiltBucketComparator - ) - ) { - StringTerms.Bucket spare = null; - BytesKeyedBucketOrds.BucketOrdsEnum ordsEnum = bucketOrds.ordsEnum(owningBucketOrds.get(ordIdx)); - Supplier emptyBucketBuilder = () -> new StringTerms.Bucket( - new BytesRef(), - 0, - null, - false, - 0, - format - ); - while (ordsEnum.next()) { - long docCount = bucketDocCount(ordsEnum.ord()); - otherDocCounts.increment(ordIdx, docCount); - if (spare == null) { - checkRealMemoryCBForInternalBucket(); - spare = emptyBucketBuilder.get(); + try (IntArray bucketsToCollect = bigArrays().newIntArray(owningBucketOrds.size())) { + // find how many buckets we are going to collect + long ordsToCollect = 0; + for (long ordIdx = 0; ordIdx < owningBucketOrds.size(); ordIdx++) { + int size = (int) Math.min(bucketOrds.bucketsInOrd(owningBucketOrds.get(ordIdx)), bucketCountThresholds.getShardSize()); + bucketsToCollect.set(ordIdx, size); + ordsToCollect += size; + } + try (LongArray ordsArray = bigArrays().newLongArray(ordsToCollect)) { + long ordsCollected = 0; + for (long ordIdx = 0; ordIdx < topBucketsPerOrd.size(); ordIdx++) { + // as users can't control sort order, in practice we'll always sort by doc count descending + try ( + BucketPriorityQueue ordered = new BucketPriorityQueue<>( + bucketsToCollect.get(ordIdx), + bigArrays(), + order.partiallyBuiltBucketComparator(this) + ) + ) { + BucketAndOrd spare = null; + BytesKeyedBucketOrds.BucketOrdsEnum ordsEnum = bucketOrds.ordsEnum(owningBucketOrds.get(ordIdx)); + Supplier emptyBucketBuilder = () -> new StringTerms.Bucket( + new BytesRef(), + 0, + null, + false, + 0, + format + ); + while (ordsEnum.next()) { + long docCount = bucketDocCount(ordsEnum.ord()); + otherDocCounts.increment(ordIdx, docCount); + if (spare == null) { + checkRealMemoryCBForInternalBucket(); + spare = new BucketAndOrd<>(emptyBucketBuilder.get()); + } + ordsEnum.readValue(spare.bucket.getTermBytes()); + spare.bucket.setDocCount(docCount); + spare.ord = ordsEnum.ord(); + spare = ordered.insertWithOverflow(spare); + } + final int orderedSize = (int) ordered.size(); + final StringTerms.Bucket[] buckets = new StringTerms.Bucket[orderedSize]; + for (int i = (int) ordered.size() - 1; i >= 0; --i) { + BucketAndOrd bucketAndOrd = ordered.pop(); + buckets[i] = bucketAndOrd.bucket; + ordsArray.set(ordsCollected + i, bucketAndOrd.ord); + otherDocCounts.increment(ordIdx, -bucketAndOrd.bucket.getDocCount()); + } + topBucketsPerOrd.set(ordIdx, buckets); + ordsCollected += orderedSize; } - ordsEnum.readValue(spare.getTermBytes()); - spare.setDocCount(docCount); - spare.setBucketOrd(ordsEnum.ord()); - spare = ordered.insertWithOverflow(spare); - } - - topBucketsPerOrd.set(ordIdx, new StringTerms.Bucket[(int) ordered.size()]); - for (int i = (int) ordered.size() - 1; i >= 0; --i) { - topBucketsPerOrd.get(ordIdx)[i] = ordered.pop(); - otherDocCounts.increment(ordIdx, -topBucketsPerOrd.get(ordIdx)[i].getDocCount()); - topBucketsPerOrd.get(ordIdx)[i].setTermBytes(BytesRef.deepCopyOf(topBucketsPerOrd.get(ordIdx)[i].getTermBytes())); } + assert ordsCollected == ordsArray.size(); + buildSubAggsForAllBuckets(topBucketsPerOrd, ordsArray, InternalTerms.Bucket::setAggregations); } } - buildSubAggsForAllBuckets(topBucketsPerOrd, InternalTerms.Bucket::getBucketOrd, InternalTerms.Bucket::setAggregations); - return buildAggregations(Math.toIntExact(owningBucketOrds.size()), ordIdx -> { final BucketOrder reduceOrder; if (isKeyOrder(order) == false) { diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/BucketPriorityQueue.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/BucketPriorityQueue.java index 7f8e5c8c885fa..9550003a5bd1e 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/BucketPriorityQueue.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/BucketPriorityQueue.java @@ -13,17 +13,17 @@ import java.util.Comparator; -public class BucketPriorityQueue extends ObjectArrayPriorityQueue { +public class BucketPriorityQueue extends ObjectArrayPriorityQueue> { - private final Comparator comparator; + private final Comparator> comparator; - public BucketPriorityQueue(int size, BigArrays bigArrays, Comparator comparator) { + public BucketPriorityQueue(int size, BigArrays bigArrays, Comparator> comparator) { super(size, bigArrays); this.comparator = comparator; } @Override - protected boolean lessThan(B a, B b) { + protected boolean lessThan(BucketAndOrd a, BucketAndOrd b) { return comparator.compare(a, b) > 0; // reverse, since we reverse again when adding to a list } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/BucketSignificancePriorityQueue.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/BucketSignificancePriorityQueue.java index fe751c9e79189..4736f52d93622 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/BucketSignificancePriorityQueue.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/BucketSignificancePriorityQueue.java @@ -12,14 +12,14 @@ import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.common.util.ObjectArrayPriorityQueue; -public class BucketSignificancePriorityQueue extends ObjectArrayPriorityQueue { +public class BucketSignificancePriorityQueue extends ObjectArrayPriorityQueue> { public BucketSignificancePriorityQueue(int size, BigArrays bigArrays) { super(size, bigArrays); } @Override - protected boolean lessThan(SignificantTerms.Bucket o1, SignificantTerms.Bucket o2) { - return o1.getSignificanceScore() < o2.getSignificanceScore(); + protected boolean lessThan(BucketAndOrd o1, BucketAndOrd o2) { + return o1.bucket.getSignificanceScore() < o2.bucket.getSignificanceScore(); } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 0ec03a6f56dd9..439b61cc43ddf 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -20,6 +20,7 @@ import org.apache.lucene.util.PriorityQueue; import org.elasticsearch.common.CheckedSupplier; import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.util.IntArray; import org.elasticsearch.common.util.LongArray; import org.elasticsearch.common.util.LongHash; import org.elasticsearch.common.util.ObjectArray; @@ -561,10 +562,10 @@ InternalAggregation[] buildAggregations(LongArray owningBucketOrds) throws IOExc ) { GlobalOrdLookupFunction lookupGlobalOrd = valuesSupplier.get()::lookupOrd; final int size = (int) Math.min(valueCount, bucketCountThresholds.getShardSize()); - try (ObjectArrayPriorityQueue ordered = collectionStrategy.buildPriorityQueue(size)) { + try (ObjectArrayPriorityQueue> ordered = collectionStrategy.buildPriorityQueue(size)) { BucketUpdater updater = collectionStrategy.bucketUpdater(0, lookupGlobalOrd); collect(new BucketInfoConsumer() { - TB spare = null; + BucketAndOrd spare = null; @Override public void accept(long globalOrd, long bucketOrd, long docCount) throws IOException { @@ -572,24 +573,31 @@ public void accept(long globalOrd, long bucketOrd, long docCount) throws IOExcep if (docCount >= bucketCountThresholds.getShardMinDocCount()) { if (spare == null) { checkRealMemoryCBForInternalBucket(); - spare = collectionStrategy.buildEmptyTemporaryBucket(); + spare = new BucketAndOrd<>(collectionStrategy.buildEmptyTemporaryBucket()); } - updater.updateBucket(spare, globalOrd, bucketOrd, docCount); + spare.ord = bucketOrd; + updater.updateBucket(spare.bucket, globalOrd, docCount); spare = ordered.insertWithOverflow(spare); } } }); // Get the top buckets - topBucketsPreOrd.set(0, collectionStrategy.buildBuckets((int) ordered.size())); - for (int i = (int) ordered.size() - 1; i >= 0; --i) { - checkRealMemoryCBForInternalBucket(); - B bucket = collectionStrategy.convertTempBucketToRealBucket(ordered.pop(), lookupGlobalOrd); - topBucketsPreOrd.get(0)[i] = bucket; - otherDocCount.increment(0, -bucket.getDocCount()); + int orderedSize = (int) ordered.size(); + try (LongArray ordsArray = bigArrays().newLongArray(orderedSize)) { + B[] buckets = collectionStrategy.buildBuckets(orderedSize); + for (int i = orderedSize - 1; i >= 0; --i) { + checkRealMemoryCBForInternalBucket(); + BucketAndOrd bucketAndOrd = ordered.pop(); + B bucket = collectionStrategy.convertTempBucketToRealBucket(bucketAndOrd.bucket, lookupGlobalOrd); + ordsArray.set(i, bucketAndOrd.ord); + buckets[i] = bucket; + otherDocCount.increment(0, -bucket.getDocCount()); + } + topBucketsPreOrd.set(0, buckets); + collectionStrategy.buildSubAggs(topBucketsPreOrd, ordsArray); } } - collectionStrategy.buildSubAggs(topBucketsPreOrd); return GlobalOrdinalsStringTermsAggregator.this.buildAggregations( Math.toIntExact(owningBucketOrds.size()), ordIdx -> collectionStrategy.buildResult( @@ -710,39 +718,61 @@ InternalAggregation[] buildAggregations(LongArray owningBucketOrds) throws IOExc LongArray otherDocCount = bigArrays().newLongArray(owningBucketOrds.size(), true); ObjectArray topBucketsPreOrd = collectionStrategy.buildTopBucketsPerOrd(owningBucketOrds.size()) ) { - GlobalOrdLookupFunction lookupGlobalOrd = valuesSupplier.get()::lookupOrd; - for (long ordIdx = 0; ordIdx < topBucketsPreOrd.size(); ordIdx++) { - long owningBucketOrd = owningBucketOrds.get(ordIdx); - collectZeroDocEntriesIfNeeded(owningBucketOrds.get(ordIdx)); - int size = (int) Math.min(bucketOrds.bucketsInOrd(owningBucketOrd), bucketCountThresholds.getShardSize()); - try (ObjectArrayPriorityQueue ordered = collectionStrategy.buildPriorityQueue(size)) { - BucketUpdater updater = collectionStrategy.bucketUpdater(owningBucketOrd, lookupGlobalOrd); - LongKeyedBucketOrds.BucketOrdsEnum ordsEnum = bucketOrds.ordsEnum(owningBucketOrd); - TB spare = null; - while (ordsEnum.next()) { - long docCount = bucketDocCount(ordsEnum.ord()); - otherDocCount.increment(ordIdx, docCount); - if (docCount < bucketCountThresholds.getShardMinDocCount()) { - continue; - } - if (spare == null) { - checkRealMemoryCBForInternalBucket(); - spare = collectionStrategy.buildEmptyTemporaryBucket(); + try (IntArray bucketsToCollect = bigArrays().newIntArray(owningBucketOrds.size())) { + long ordsToCollect = 0; + for (long ordIdx = 0; ordIdx < owningBucketOrds.size(); ordIdx++) { + final long owningBucketOrd = owningBucketOrds.get(ordIdx); + collectZeroDocEntriesIfNeeded(owningBucketOrd); + final int size = (int) Math.min(bucketOrds.bucketsInOrd(owningBucketOrd), bucketCountThresholds.getShardSize()); + ordsToCollect += size; + bucketsToCollect.set(ordIdx, size); + } + try (LongArray ordsArray = bigArrays().newLongArray(ordsToCollect)) { + long ordsCollected = 0; + GlobalOrdLookupFunction lookupGlobalOrd = valuesSupplier.get()::lookupOrd; + for (long ordIdx = 0; ordIdx < topBucketsPreOrd.size(); ordIdx++) { + long owningBucketOrd = owningBucketOrds.get(ordIdx); + try ( + ObjectArrayPriorityQueue> ordered = collectionStrategy.buildPriorityQueue( + bucketsToCollect.get(ordIdx) + ) + ) { + BucketUpdater updater = collectionStrategy.bucketUpdater(owningBucketOrd, lookupGlobalOrd); + LongKeyedBucketOrds.BucketOrdsEnum ordsEnum = bucketOrds.ordsEnum(owningBucketOrd); + BucketAndOrd spare = null; + while (ordsEnum.next()) { + long docCount = bucketDocCount(ordsEnum.ord()); + otherDocCount.increment(ordIdx, docCount); + if (docCount < bucketCountThresholds.getShardMinDocCount()) { + continue; + } + if (spare == null) { + checkRealMemoryCBForInternalBucket(); + spare = new BucketAndOrd<>(collectionStrategy.buildEmptyTemporaryBucket()); + } + updater.updateBucket(spare.bucket, ordsEnum.value(), docCount); + spare.ord = ordsEnum.ord(); + spare = ordered.insertWithOverflow(spare); + } + // Get the top buckets + int orderedSize = (int) ordered.size(); + B[] buckets = collectionStrategy.buildBuckets(orderedSize); + for (int i = orderedSize - 1; i >= 0; --i) { + checkRealMemoryCBForInternalBucket(); + BucketAndOrd bucketAndOrd = ordered.pop(); + B bucket = collectionStrategy.convertTempBucketToRealBucket(bucketAndOrd.bucket, lookupGlobalOrd); + ordsArray.set(ordsCollected + i, bucketAndOrd.ord); + buckets[i] = bucket; + otherDocCount.increment(ordIdx, -bucket.getDocCount()); + } + topBucketsPreOrd.set(ordIdx, buckets); + ordsCollected += orderedSize; } - updater.updateBucket(spare, ordsEnum.value(), ordsEnum.ord(), docCount); - spare = ordered.insertWithOverflow(spare); - } - // Get the top buckets - topBucketsPreOrd.set(ordIdx, collectionStrategy.buildBuckets((int) ordered.size())); - for (int i = (int) ordered.size() - 1; i >= 0; --i) { - checkRealMemoryCBForInternalBucket(); - B bucket = collectionStrategy.convertTempBucketToRealBucket(ordered.pop(), lookupGlobalOrd); - topBucketsPreOrd.get(ordIdx)[i] = bucket; - otherDocCount.increment(ordIdx, -bucket.getDocCount()); } + assert ordsCollected == ordsArray.size(); + collectionStrategy.buildSubAggs(topBucketsPreOrd, ordsArray); } } - collectionStrategy.buildSubAggs(topBucketsPreOrd); return GlobalOrdinalsStringTermsAggregator.this.buildAggregations( Math.toIntExact(owningBucketOrds.size()), ordIdx -> collectionStrategy.buildResult( @@ -791,7 +821,7 @@ abstract class ResultStrategy< * Build a {@link PriorityQueue} to sort the buckets. After we've * collected all of the buckets we'll collect all entries in the queue. */ - abstract ObjectArrayPriorityQueue buildPriorityQueue(int size); + abstract ObjectArrayPriorityQueue> buildPriorityQueue(int size); /** * Build an array to hold the "top" buckets for each ordinal. @@ -813,7 +843,7 @@ abstract class ResultStrategy< * Build the sub-aggregations into the buckets. This will usually * delegate to {@link #buildSubAggsForAllBuckets}. */ - abstract void buildSubAggs(ObjectArray topBucketsPreOrd) throws IOException; + abstract void buildSubAggs(ObjectArray topBucketsPreOrd, LongArray ordsArray) throws IOException; /** * Turn the buckets into an aggregation result. @@ -834,7 +864,7 @@ abstract class ResultStrategy< } interface BucketUpdater { - void updateBucket(TB spare, long globalOrd, long bucketOrd, long docCount) throws IOException; + void updateBucket(TB spare, long globalOrd, long docCount) throws IOException; } /** @@ -868,29 +898,30 @@ OrdBucket buildEmptyTemporaryBucket() { @Override BucketUpdater bucketUpdater(long owningBucketOrd, GlobalOrdLookupFunction lookupGlobalOrd) { - return (spare, globalOrd, bucketOrd, docCount) -> { + return (spare, globalOrd, docCount) -> { spare.globalOrd = globalOrd; - spare.bucketOrd = bucketOrd; spare.docCount = docCount; }; } @Override - ObjectArrayPriorityQueue buildPriorityQueue(int size) { - return new BucketPriorityQueue<>(size, bigArrays(), partiallyBuiltBucketComparator); + ObjectArrayPriorityQueue> buildPriorityQueue(int size) { + return new BucketPriorityQueue<>( + size, + bigArrays(), + order.partiallyBuiltBucketComparator(GlobalOrdinalsStringTermsAggregator.this) + ); } @Override StringTerms.Bucket convertTempBucketToRealBucket(OrdBucket temp, GlobalOrdLookupFunction lookupGlobalOrd) throws IOException { BytesRef term = BytesRef.deepCopyOf(lookupGlobalOrd.apply(temp.globalOrd)); - StringTerms.Bucket result = new StringTerms.Bucket(term, temp.docCount, null, showTermDocCountError, 0, format); - result.bucketOrd = temp.bucketOrd; - return result; + return new StringTerms.Bucket(term, temp.docCount, null, showTermDocCountError, 0, format); } @Override - void buildSubAggs(ObjectArray topBucketsPreOrd) throws IOException { - buildSubAggsForAllBuckets(topBucketsPreOrd, b -> b.bucketOrd, (b, aggs) -> b.aggregations = aggs); + void buildSubAggs(ObjectArray topBucketsPreOrd, LongArray ordsArray) throws IOException { + buildSubAggsForAllBuckets(topBucketsPreOrd, ordsArray, (b, aggs) -> b.aggregations = aggs); } @Override @@ -1005,8 +1036,7 @@ private long subsetSize(long owningBucketOrd) { @Override BucketUpdater bucketUpdater(long owningBucketOrd, GlobalOrdLookupFunction lookupGlobalOrd) { long subsetSize = subsetSize(owningBucketOrd); - return (spare, globalOrd, bucketOrd, docCount) -> { - spare.bucketOrd = bucketOrd; + return (spare, globalOrd, docCount) -> { oversizedCopy(lookupGlobalOrd.apply(globalOrd), spare.termBytes); spare.subsetDf = docCount; spare.supersetDf = backgroundFrequencies.freq(spare.termBytes); @@ -1020,7 +1050,7 @@ BucketUpdater bucketUpdater(long owningBucketOrd, } @Override - ObjectArrayPriorityQueue buildPriorityQueue(int size) { + ObjectArrayPriorityQueue> buildPriorityQueue(int size) { return new BucketSignificancePriorityQueue<>(size, bigArrays()); } @@ -1033,8 +1063,8 @@ SignificantStringTerms.Bucket convertTempBucketToRealBucket( } @Override - void buildSubAggs(ObjectArray topBucketsPreOrd) throws IOException { - buildSubAggsForAllBuckets(topBucketsPreOrd, b -> b.bucketOrd, (b, aggs) -> b.aggregations = aggs); + void buildSubAggs(ObjectArray topBucketsPreOrd, LongArray ordsArray) throws IOException { + buildSubAggsForAllBuckets(topBucketsPreOrd, ordsArray, (b, aggs) -> b.aggregations = aggs); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalSignificantTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalSignificantTerms.java index 78ae2481f5d99..5108793b8a809 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalSignificantTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalSignificantTerms.java @@ -10,12 +10,12 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.util.ObjectArrayPriorityQueue; import org.elasticsearch.common.util.ObjectObjectPagedHashMap; import org.elasticsearch.core.Releasables; import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.AggregationErrors; import org.elasticsearch.search.aggregations.AggregationReduceContext; -import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.AggregatorReducer; import org.elasticsearch.search.aggregations.InternalAggregation; import org.elasticsearch.search.aggregations.InternalAggregations; @@ -58,12 +58,6 @@ public interface Reader> { long subsetDf; long supersetDf; - /** - * Ordinal of the bucket while it is being built. Not used after it is - * returned from {@link Aggregator#buildAggregations(org.elasticsearch.common.util.LongArray)} and not - * serialized. - */ - transient long bucketOrd; double score; protected InternalAggregations aggregations; final transient DocValueFormat format; @@ -235,7 +229,12 @@ canLeadReduction here is essentially checking if this shard returned data. Unma public InternalAggregation get() { final SignificanceHeuristic heuristic = getSignificanceHeuristic().rewrite(reduceContext); final int size = (int) (reduceContext.isFinalReduce() == false ? buckets.size() : Math.min(requiredSize, buckets.size())); - try (BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue<>(size, reduceContext.bigArrays())) { + try (ObjectArrayPriorityQueue ordered = new ObjectArrayPriorityQueue(size, reduceContext.bigArrays()) { + @Override + protected boolean lessThan(B a, B b) { + return a.getSignificanceScore() < b.getSignificanceScore(); + } + }) { buckets.forEach(entry -> { final B b = createBucket( entry.value.subsetDf[0], diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalTerms.java index 739f0b923eaab..de35046691b34 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalTerms.java @@ -38,8 +38,6 @@ public interface Reader> { B read(StreamInput in, DocValueFormat format, boolean showDocCountError) throws IOException; } - long bucketOrd; - protected long docCount; private long docCountError; protected InternalAggregations aggregations; @@ -88,14 +86,6 @@ public void setDocCount(long docCount) { this.docCount = docCount; } - public long getBucketOrd() { - return bucketOrd; - } - - public void setBucketOrd(long bucketOrd) { - this.bucketOrd = bucketOrd; - } - @Override public long getDocCountError() { return docCountError; diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java index b96c495d37489..026912a583ef3 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java @@ -17,6 +17,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.PriorityQueue; +import org.elasticsearch.common.util.IntArray; import org.elasticsearch.common.util.LongArray; import org.elasticsearch.common.util.ObjectArray; import org.elasticsearch.common.util.ObjectArrayPriorityQueue; @@ -43,6 +44,7 @@ import java.io.IOException; import java.util.Arrays; +import java.util.Comparator; import java.util.Map; import java.util.function.BiConsumer; import java.util.function.Function; @@ -287,40 +289,55 @@ private InternalAggregation[] buildAggregations(LongArray owningBucketOrds) thro LongArray otherDocCounts = bigArrays().newLongArray(owningBucketOrds.size(), true); ObjectArray topBucketsPerOrd = buildTopBucketsPerOrd(Math.toIntExact(owningBucketOrds.size())) ) { - for (long ordIdx = 0; ordIdx < topBucketsPerOrd.size(); ordIdx++) { - long owningOrd = owningBucketOrds.get(ordIdx); - collectZeroDocEntriesIfNeeded(owningOrd, excludeDeletedDocs); - int size = (int) Math.min(bucketOrds.size(), bucketCountThresholds.getShardSize()); - - try (ObjectArrayPriorityQueue ordered = buildPriorityQueue(size)) { - B spare = null; - BytesKeyedBucketOrds.BucketOrdsEnum ordsEnum = bucketOrds.ordsEnum(owningOrd); - BucketUpdater bucketUpdater = bucketUpdater(owningOrd); - while (ordsEnum.next()) { - long docCount = bucketDocCount(ordsEnum.ord()); - otherDocCounts.increment(ordIdx, docCount); - if (docCount < bucketCountThresholds.getShardMinDocCount()) { - continue; - } - if (spare == null) { - checkRealMemoryCBForInternalBucket(); - spare = buildEmptyBucket(); + try (IntArray bucketsToCollect = bigArrays().newIntArray(owningBucketOrds.size())) { + long ordsToCollect = 0; + for (long ordIdx = 0; ordIdx < owningBucketOrds.size(); ordIdx++) { + final long owningBucketOrd = owningBucketOrds.get(ordIdx); + collectZeroDocEntriesIfNeeded(owningBucketOrd, excludeDeletedDocs); + final int size = (int) Math.min(bucketOrds.bucketsInOrd(owningBucketOrd), bucketCountThresholds.getShardSize()); + ordsToCollect += size; + bucketsToCollect.set(ordIdx, size); + } + try (LongArray ordsArray = bigArrays().newLongArray(ordsToCollect)) { + long ordsCollected = 0; + for (long ordIdx = 0; ordIdx < topBucketsPerOrd.size(); ordIdx++) { + long owningOrd = owningBucketOrds.get(ordIdx); + try (ObjectArrayPriorityQueue> ordered = buildPriorityQueue(bucketsToCollect.get(ordIdx))) { + BucketAndOrd spare = null; + BytesKeyedBucketOrds.BucketOrdsEnum ordsEnum = bucketOrds.ordsEnum(owningOrd); + BucketUpdater bucketUpdater = bucketUpdater(owningOrd); + while (ordsEnum.next()) { + long docCount = bucketDocCount(ordsEnum.ord()); + otherDocCounts.increment(ordIdx, docCount); + if (docCount < bucketCountThresholds.getShardMinDocCount()) { + continue; + } + if (spare == null) { + checkRealMemoryCBForInternalBucket(); + spare = new BucketAndOrd<>(buildEmptyBucket()); + } + bucketUpdater.updateBucket(spare.bucket, ordsEnum, docCount); + spare.ord = ordsEnum.ord(); + spare = ordered.insertWithOverflow(spare); + } + + final int orderedSize = (int) ordered.size(); + final B[] buckets = buildBuckets(orderedSize); + for (int i = orderedSize - 1; i >= 0; --i) { + BucketAndOrd bucketAndOrd = ordered.pop(); + finalizeBucket(bucketAndOrd.bucket); + buckets[i] = bucketAndOrd.bucket; + ordsArray.set(ordsCollected + i, bucketAndOrd.ord); + otherDocCounts.increment(ordIdx, -bucketAndOrd.bucket.getDocCount()); + } + topBucketsPerOrd.set(ordIdx, buckets); + ordsCollected += orderedSize; } - bucketUpdater.updateBucket(spare, ordsEnum, docCount); - spare = ordered.insertWithOverflow(spare); - } - - topBucketsPerOrd.set(ordIdx, buildBuckets((int) ordered.size())); - for (int i = (int) ordered.size() - 1; i >= 0; --i) { - topBucketsPerOrd.get(ordIdx)[i] = ordered.pop(); - otherDocCounts.increment(ordIdx, -topBucketsPerOrd.get(ordIdx)[i].getDocCount()); - finalizeBucket(topBucketsPerOrd.get(ordIdx)[i]); } + assert ordsCollected == ordsArray.size(); + buildSubAggs(topBucketsPerOrd, ordsArray); } } - - buildSubAggs(topBucketsPerOrd); - return MapStringTermsAggregator.this.buildAggregations( Math.toIntExact(owningBucketOrds.size()), ordIdx -> buildResult(owningBucketOrds.get(ordIdx), otherDocCounts.get(ordIdx), topBucketsPerOrd.get(ordIdx)) @@ -355,7 +372,7 @@ private InternalAggregation[] buildAggregations(LongArray owningBucketOrds) thro * Build a {@link PriorityQueue} to sort the buckets. After we've * collected all of the buckets we'll collect all entries in the queue. */ - abstract ObjectArrayPriorityQueue buildPriorityQueue(int size); + abstract ObjectArrayPriorityQueue> buildPriorityQueue(int size); /** * Update fields in {@code spare} to reflect information collected for @@ -382,9 +399,9 @@ private InternalAggregation[] buildAggregations(LongArray owningBucketOrds) thro /** * Build the sub-aggregations into the buckets. This will usually - * delegate to {@link #buildSubAggsForAllBuckets}. + * delegate to {@link #buildSubAggsForAllBuckets(ObjectArray, LongArray, BiConsumer)}. */ - abstract void buildSubAggs(ObjectArray topBucketsPerOrd) throws IOException; + abstract void buildSubAggs(ObjectArray topBucketsPerOrd, LongArray ordsArray) throws IOException; /** * Turn the buckets into an aggregation result. @@ -407,9 +424,11 @@ interface BucketUpdater */ class StandardTermsResults extends ResultStrategy { private final ValuesSource valuesSource; + private final Comparator> comparator; - StandardTermsResults(ValuesSource valuesSource) { + StandardTermsResults(ValuesSource valuesSource, Aggregator aggregator) { this.valuesSource = valuesSource; + this.comparator = order.partiallyBuiltBucketComparator(aggregator); } @Override @@ -498,8 +517,8 @@ StringTerms.Bucket buildEmptyBucket() { } @Override - ObjectArrayPriorityQueue buildPriorityQueue(int size) { - return new BucketPriorityQueue<>(size, bigArrays(), partiallyBuiltBucketComparator); + ObjectArrayPriorityQueue> buildPriorityQueue(int size) { + return new BucketPriorityQueue<>(size, bigArrays(), comparator); } @Override @@ -507,7 +526,6 @@ BucketUpdater bucketUpdater(long owningBucketOrd) { return (spare, ordsEnum, docCount) -> { ordsEnum.readValue(spare.termBytes); spare.docCount = docCount; - spare.bucketOrd = ordsEnum.ord(); }; } @@ -532,8 +550,8 @@ void finalizeBucket(StringTerms.Bucket bucket) { } @Override - void buildSubAggs(ObjectArray topBucketsPerOrd) throws IOException { - buildSubAggsForAllBuckets(topBucketsPerOrd, b -> b.bucketOrd, (b, a) -> b.aggregations = a); + void buildSubAggs(ObjectArray topBucketsPerOrd, LongArray ordArray) throws IOException { + buildSubAggsForAllBuckets(topBucketsPerOrd, ordArray, (b, a) -> b.aggregations = a); } @Override @@ -625,7 +643,7 @@ SignificantStringTerms.Bucket buildEmptyBucket() { } @Override - ObjectArrayPriorityQueue buildPriorityQueue(int size) { + ObjectArrayPriorityQueue> buildPriorityQueue(int size) { return new BucketSignificancePriorityQueue<>(size, bigArrays()); } @@ -634,7 +652,6 @@ BucketUpdater bucketUpdater(long owningBucketOrd) long subsetSize = subsetSizes.get(owningBucketOrd); return (spare, ordsEnum, docCount) -> { ordsEnum.readValue(spare.termBytes); - spare.bucketOrd = ordsEnum.ord(); spare.subsetDf = docCount; spare.supersetDf = backgroundFrequencies.freq(spare.termBytes); /* @@ -667,8 +684,8 @@ void finalizeBucket(SignificantStringTerms.Bucket bucket) { } @Override - void buildSubAggs(ObjectArray topBucketsPerOrd) throws IOException { - buildSubAggsForAllBuckets(topBucketsPerOrd, b -> b.bucketOrd, (b, a) -> b.aggregations = a); + void buildSubAggs(ObjectArray topBucketsPerOrd, LongArray ordsArray) throws IOException { + buildSubAggsForAllBuckets(topBucketsPerOrd, ordsArray, (b, a) -> b.aggregations = a); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/NumericTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/NumericTermsAggregator.java index 5d4c15d8a3b80..a54053f712f8d 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/NumericTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/NumericTermsAggregator.java @@ -14,6 +14,7 @@ import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.util.NumericUtils; +import org.elasticsearch.common.util.IntArray; import org.elasticsearch.common.util.LongArray; import org.elasticsearch.common.util.ObjectArray; import org.elasticsearch.common.util.ObjectArrayPriorityQueue; @@ -40,6 +41,7 @@ import java.io.IOException; import java.util.Arrays; +import java.util.Comparator; import java.util.Map; import java.util.function.BiConsumer; import java.util.function.Function; @@ -167,42 +169,56 @@ private InternalAggregation[] buildAggregations(LongArray owningBucketOrds) thro LongArray otherDocCounts = bigArrays().newLongArray(owningBucketOrds.size(), true); ObjectArray topBucketsPerOrd = buildTopBucketsPerOrd(owningBucketOrds.size()) ) { - for (long ordIdx = 0; ordIdx < topBucketsPerOrd.size(); ordIdx++) { - final long owningBucketOrd = owningBucketOrds.get(ordIdx); - collectZeroDocEntriesIfNeeded(owningBucketOrd, excludeDeletedDocs); - long bucketsInOrd = bucketOrds.bucketsInOrd(owningBucketOrd); - - int size = (int) Math.min(bucketsInOrd, bucketCountThresholds.getShardSize()); - try (ObjectArrayPriorityQueue ordered = buildPriorityQueue(size)) { - B spare = null; - BucketOrdsEnum ordsEnum = bucketOrds.ordsEnum(owningBucketOrd); - BucketUpdater bucketUpdater = bucketUpdater(owningBucketOrd); - while (ordsEnum.next()) { - long docCount = bucketDocCount(ordsEnum.ord()); - otherDocCounts.increment(ordIdx, docCount); - if (docCount < bucketCountThresholds.getShardMinDocCount()) { - continue; - } - if (spare == null) { - checkRealMemoryCBForInternalBucket(); - spare = buildEmptyBucket(); - } - bucketUpdater.updateBucket(spare, ordsEnum, docCount); - spare = ordered.insertWithOverflow(spare); - } + try (IntArray bucketsToCollect = bigArrays().newIntArray(owningBucketOrds.size())) { + long ordsToCollect = 0; + for (long ordIdx = 0; ordIdx < owningBucketOrds.size(); ordIdx++) { + final long owningBucketOrd = owningBucketOrds.get(ordIdx); + collectZeroDocEntriesIfNeeded(owningBucketOrd, excludeDeletedDocs); + int size = (int) Math.min(bucketOrds.bucketsInOrd(owningBucketOrd), bucketCountThresholds.getShardSize()); + bucketsToCollect.set(ordIdx, size); + ordsToCollect += size; + } + try (LongArray ordsArray = bigArrays().newLongArray(ordsToCollect)) { + long ordsCollected = 0; + for (long ordIdx = 0; ordIdx < topBucketsPerOrd.size(); ordIdx++) { + final long owningBucketOrd = owningBucketOrds.get(ordIdx); + try (ObjectArrayPriorityQueue> ordered = buildPriorityQueue(bucketsToCollect.get(ordIdx))) { + BucketAndOrd spare = null; + BucketOrdsEnum ordsEnum = bucketOrds.ordsEnum(owningBucketOrd); + BucketUpdater bucketUpdater = bucketUpdater(owningBucketOrd); + while (ordsEnum.next()) { + long docCount = bucketDocCount(ordsEnum.ord()); + otherDocCounts.increment(ordIdx, docCount); + if (docCount < bucketCountThresholds.getShardMinDocCount()) { + continue; + } + if (spare == null) { + checkRealMemoryCBForInternalBucket(); + spare = new BucketAndOrd<>(buildEmptyBucket()); + } + bucketUpdater.updateBucket(spare.bucket, ordsEnum, docCount); + spare.ord = ordsEnum.ord(); + spare = ordered.insertWithOverflow(spare); + } + + // Get the top buckets + final int orderedSize = (int) ordered.size(); + final B[] bucketsForOrd = buildBuckets(orderedSize); + for (int b = orderedSize - 1; b >= 0; --b) { + BucketAndOrd bucketAndOrd = ordered.pop(); + bucketsForOrd[b] = bucketAndOrd.bucket; + ordsArray.set(ordsCollected + b, bucketAndOrd.ord); + otherDocCounts.increment(ordIdx, -bucketAndOrd.bucket.getDocCount()); + } + topBucketsPerOrd.set(ordIdx, bucketsForOrd); + ordsCollected += orderedSize; - // Get the top buckets - B[] bucketsForOrd = buildBuckets((int) ordered.size()); - topBucketsPerOrd.set(ordIdx, bucketsForOrd); - for (int b = (int) ordered.size() - 1; b >= 0; --b) { - topBucketsPerOrd.get(ordIdx)[b] = ordered.pop(); - otherDocCounts.increment(ordIdx, -topBucketsPerOrd.get(ordIdx)[b].getDocCount()); + } } + assert ordsCollected == ordsArray.size(); + buildSubAggs(topBucketsPerOrd, ordsArray); } } - - buildSubAggs(topBucketsPerOrd); - return NumericTermsAggregator.this.buildAggregations( Math.toIntExact(owningBucketOrds.size()), ordIdx -> buildResult(owningBucketOrds.get(ordIdx), otherDocCounts.get(ordIdx), topBucketsPerOrd.get(ordIdx)) @@ -254,13 +270,13 @@ private InternalAggregation[] buildAggregations(LongArray owningBucketOrds) thro * Build a {@link ObjectArrayPriorityQueue} to sort the buckets. After we've * collected all of the buckets we'll collect all entries in the queue. */ - abstract ObjectArrayPriorityQueue buildPriorityQueue(int size); + abstract ObjectArrayPriorityQueue> buildPriorityQueue(int size); /** * Build the sub-aggregations into the buckets. This will usually - * delegate to {@link #buildSubAggsForAllBuckets}. + * delegate to {@link #buildSubAggsForAllBuckets(ObjectArray, LongArray, BiConsumer)}. */ - abstract void buildSubAggs(ObjectArray topBucketsPerOrd) throws IOException; + abstract void buildSubAggs(ObjectArray topBucketsPerOrd, LongArray ordsArray) throws IOException; /** * Collect extra entries for "zero" hit documents if they were requested @@ -287,9 +303,11 @@ interface BucketUpdater abstract class StandardTermsResultStrategy, B extends InternalTerms.Bucket> extends ResultStrategy { protected final boolean showTermDocCountError; + private final Comparator> comparator; - StandardTermsResultStrategy(boolean showTermDocCountError) { + StandardTermsResultStrategy(boolean showTermDocCountError, Aggregator aggregator) { this.showTermDocCountError = showTermDocCountError; + this.comparator = order.partiallyBuiltBucketComparator(aggregator); } @Override @@ -298,13 +316,13 @@ final LeafBucketCollector wrapCollector(LeafBucketCollector primary) { } @Override - final ObjectArrayPriorityQueue buildPriorityQueue(int size) { - return new BucketPriorityQueue<>(size, bigArrays(), partiallyBuiltBucketComparator); + final ObjectArrayPriorityQueue> buildPriorityQueue(int size) { + return new BucketPriorityQueue<>(size, bigArrays(), comparator); } @Override - final void buildSubAggs(ObjectArray topBucketsPerOrd) throws IOException { - buildSubAggsForAllBuckets(topBucketsPerOrd, b -> b.bucketOrd, (b, aggs) -> b.aggregations = aggs); + final void buildSubAggs(ObjectArray topBucketsPerOrd, LongArray ordsArray) throws IOException { + buildSubAggsForAllBuckets(topBucketsPerOrd, ordsArray, (b, aggs) -> b.aggregations = aggs); } @Override @@ -340,8 +358,8 @@ public final void close() {} } class LongTermsResults extends StandardTermsResultStrategy { - LongTermsResults(boolean showTermDocCountError) { - super(showTermDocCountError); + LongTermsResults(boolean showTermDocCountError, Aggregator aggregator) { + super(showTermDocCountError, aggregator); } @Override @@ -374,7 +392,6 @@ BucketUpdater bucketUpdater(long owningBucketOrd) { return (LongTerms.Bucket spare, BucketOrdsEnum ordsEnum, long docCount) -> { spare.term = ordsEnum.value(); spare.docCount = docCount; - spare.bucketOrd = ordsEnum.ord(); }; } @@ -424,8 +441,8 @@ LongTerms buildEmptyResult() { class DoubleTermsResults extends StandardTermsResultStrategy { - DoubleTermsResults(boolean showTermDocCountError) { - super(showTermDocCountError); + DoubleTermsResults(boolean showTermDocCountError, Aggregator aggregator) { + super(showTermDocCountError, aggregator); } @Override @@ -458,7 +475,6 @@ BucketUpdater bucketUpdater(long owningBucketOrd) { return (DoubleTerms.Bucket spare, BucketOrdsEnum ordsEnum, long docCount) -> { spare.term = NumericUtils.sortableLongToDouble(ordsEnum.value()); spare.docCount = docCount; - spare.bucketOrd = ordsEnum.ord(); }; } @@ -575,7 +591,6 @@ BucketUpdater bucketUpdater(long owningBucketOrd) { spare.term = ordsEnum.value(); spare.subsetDf = docCount; spare.supersetDf = backgroundFrequencies.freq(spare.term); - spare.bucketOrd = ordsEnum.ord(); // During shard-local down-selection we use subset/superset stats that are for this shard only // Back at the central reducer these properties will be updated with global stats spare.updateScore(significanceHeuristic, subsetSize, supersetSize); @@ -583,13 +598,13 @@ BucketUpdater bucketUpdater(long owningBucketOrd) { } @Override - ObjectArrayPriorityQueue buildPriorityQueue(int size) { + ObjectArrayPriorityQueue> buildPriorityQueue(int size) { return new BucketSignificancePriorityQueue<>(size, bigArrays()); } @Override - void buildSubAggs(ObjectArray topBucketsPerOrd) throws IOException { - buildSubAggsForAllBuckets(topBucketsPerOrd, b -> b.bucketOrd, (b, aggs) -> b.aggregations = aggs); + void buildSubAggs(ObjectArray topBucketsPerOrd, LongArray ordsArray) throws IOException { + buildSubAggsForAllBuckets(topBucketsPerOrd, ordsArray, (b, aggs) -> b.aggregations = aggs); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregator.java index 4922be7cec1ba..c07c0726a4ae1 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregator.java @@ -27,7 +27,6 @@ import org.elasticsearch.xcontent.XContentBuilder; import java.io.IOException; -import java.util.Comparator; import java.util.HashSet; import java.util.Map; import java.util.Objects; @@ -190,7 +189,6 @@ public boolean equals(Object obj) { protected final DocValueFormat format; protected final BucketCountThresholds bucketCountThresholds; protected final BucketOrder order; - protected final Comparator> partiallyBuiltBucketComparator; protected final Set aggsUsedForSorting; protected final SubAggCollectionMode collectMode; @@ -209,7 +207,9 @@ public TermsAggregator( super(name, factories, context, parent, metadata); this.bucketCountThresholds = bucketCountThresholds; this.order = order; - partiallyBuiltBucketComparator = order == null ? null : order.partiallyBuiltBucketComparator(b -> b.bucketOrd, this); + if (order != null) { + order.validate(this); + } this.format = format; if ((subAggsNeedScore() && descendsFromNestedAggregator(parent)) || context.isInSortOrderExecutionRequired()) { /** diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java index 2c7b768fcdbb3..da5ae37b08228 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java @@ -195,12 +195,12 @@ private static TermsAggregatorSupplier numericSupplier() { if (includeExclude != null) { longFilter = includeExclude.convertToDoubleFilter(); } - resultStrategy = agg -> agg.new DoubleTermsResults(showTermDocCountError); + resultStrategy = agg -> agg.new DoubleTermsResults(showTermDocCountError, agg); } else { if (includeExclude != null) { longFilter = includeExclude.convertToLongFilter(valuesSourceConfig.format()); } - resultStrategy = agg -> agg.new LongTermsResults(showTermDocCountError); + resultStrategy = agg -> agg.new LongTermsResults(showTermDocCountError, agg); } return new NumericTermsAggregator( name, @@ -403,7 +403,7 @@ Aggregator create( name, factories, new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSourceConfig), - a -> a.new StandardTermsResults(valuesSourceConfig.getValuesSource()), + a -> a.new StandardTermsResults(valuesSourceConfig.getValuesSource(), a), order, valuesSourceConfig.format(), bucketCountThresholds, diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/multiterms/InternalMultiTerms.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/multiterms/InternalMultiTerms.java index 0d42a2856a10e..85510c8a989c0 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/multiterms/InternalMultiTerms.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/multiterms/InternalMultiTerms.java @@ -37,9 +37,6 @@ public class InternalMultiTerms extends AbstractInternalTerms { - - long bucketOrd; - protected long docCount; protected InternalAggregations aggregations; private long docCountError; diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/multiterms/MultiTermsAggregator.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/multiterms/MultiTermsAggregator.java index 1691aedf543f4..5c10e2c8feeb1 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/multiterms/MultiTermsAggregator.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/multiterms/MultiTermsAggregator.java @@ -20,6 +20,7 @@ import org.elasticsearch.common.io.stream.BytesStreamOutput; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.util.IntArray; import org.elasticsearch.common.util.LongArray; import org.elasticsearch.common.util.ObjectArray; import org.elasticsearch.common.util.ObjectArrayPriorityQueue; @@ -40,6 +41,7 @@ import org.elasticsearch.search.aggregations.LeafBucketCollector; import org.elasticsearch.search.aggregations.LeafBucketCollectorBase; import org.elasticsearch.search.aggregations.bucket.DeferableBucketAggregator; +import org.elasticsearch.search.aggregations.bucket.terms.BucketAndOrd; import org.elasticsearch.search.aggregations.bucket.terms.BucketPriorityQueue; import org.elasticsearch.search.aggregations.bucket.terms.BytesKeyedBucketOrds; import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregator; @@ -72,7 +74,7 @@ class MultiTermsAggregator extends DeferableBucketAggregator { protected final List formats; protected final TermsAggregator.BucketCountThresholds bucketCountThresholds; protected final BucketOrder order; - protected final Comparator partiallyBuiltBucketComparator; + protected final Comparator> partiallyBuiltBucketComparator; protected final Set aggsUsedForSorting; protected final SubAggCollectionMode collectMode; private final List values; @@ -99,7 +101,7 @@ protected MultiTermsAggregator( super(name, factories, context, parent, metadata); this.bucketCountThresholds = bucketCountThresholds; this.order = order; - partiallyBuiltBucketComparator = order == null ? null : order.partiallyBuiltBucketComparator(b -> b.bucketOrd, this); + partiallyBuiltBucketComparator = order == null ? null : order.partiallyBuiltBucketComparator(this); this.formats = formats; this.showTermDocCountError = showTermDocCountError; if (subAggsNeedScore() && descendsFromNestedAggregator(parent) || context.isInSortOrderExecutionRequired()) { @@ -242,52 +244,67 @@ public InternalAggregation[] buildAggregations(LongArray owningBucketOrds) throw LongArray otherDocCounts = bigArrays().newLongArray(owningBucketOrds.size(), true); ObjectArray topBucketsPerOrd = bigArrays().newObjectArray(owningBucketOrds.size()) ) { - for (long ordIdx = 0; ordIdx < owningBucketOrds.size(); ordIdx++) { - final long owningBucketOrd = owningBucketOrds.get(ordIdx); - long bucketsInOrd = bucketOrds.bucketsInOrd(owningBucketOrd); - - int size = (int) Math.min(bucketsInOrd, bucketCountThresholds.getShardSize()); - try ( - ObjectArrayPriorityQueue ordered = new BucketPriorityQueue<>( - size, - bigArrays(), - partiallyBuiltBucketComparator - ) - ) { - InternalMultiTerms.Bucket spare = null; - BytesRef spareKey = null; - BytesKeyedBucketOrds.BucketOrdsEnum ordsEnum = bucketOrds.ordsEnum(owningBucketOrd); - while (ordsEnum.next()) { - long docCount = bucketDocCount(ordsEnum.ord()); - otherDocCounts.increment(ordIdx, docCount); - if (docCount < bucketCountThresholds.getShardMinDocCount()) { - continue; - } - if (spare == null) { - checkRealMemoryCBForInternalBucket(); - spare = new InternalMultiTerms.Bucket(null, 0, null, showTermDocCountError, 0, formats, keyConverters); - spareKey = new BytesRef(); - } - ordsEnum.readValue(spareKey); - spare.terms = unpackTerms(spareKey); - spare.docCount = docCount; - spare.bucketOrd = ordsEnum.ord(); - spare = ordered.insertWithOverflow(spare); - } + try (IntArray bucketsToCollect = bigArrays().newIntArray(owningBucketOrds.size())) { + long ordsToCollect = 0; + for (long ordIdx = 0; ordIdx < owningBucketOrds.size(); ordIdx++) { + int size = (int) Math.min(bucketOrds.bucketsInOrd(owningBucketOrds.get(ordIdx)), bucketCountThresholds.getShardSize()); + ordsToCollect += size; + bucketsToCollect.set(ordIdx, size); + } + try (LongArray ordsArray = bigArrays().newLongArray(ordsToCollect)) { + long ordsCollected = 0; + for (long ordIdx = 0; ordIdx < owningBucketOrds.size(); ordIdx++) { + final long owningBucketOrd = owningBucketOrds.get(ordIdx); + long bucketsInOrd = bucketOrds.bucketsInOrd(owningBucketOrd); + + int size = (int) Math.min(bucketsInOrd, bucketCountThresholds.getShardSize()); + try ( + ObjectArrayPriorityQueue> ordered = new BucketPriorityQueue<>( + size, + bigArrays(), + partiallyBuiltBucketComparator + ) + ) { + BucketAndOrd spare = null; + BytesRef spareKey = null; + BytesKeyedBucketOrds.BucketOrdsEnum ordsEnum = bucketOrds.ordsEnum(owningBucketOrd); + while (ordsEnum.next()) { + long docCount = bucketDocCount(ordsEnum.ord()); + otherDocCounts.increment(ordIdx, docCount); + if (docCount < bucketCountThresholds.getShardMinDocCount()) { + continue; + } + if (spare == null) { + checkRealMemoryCBForInternalBucket(); + spare = new BucketAndOrd<>( + new InternalMultiTerms.Bucket(null, 0, null, showTermDocCountError, 0, formats, keyConverters) + ); + spareKey = new BytesRef(); + } + ordsEnum.readValue(spareKey); + spare.bucket.terms = unpackTerms(spareKey); + spare.bucket.docCount = docCount; + spare.ord = ordsEnum.ord(); + spare = ordered.insertWithOverflow(spare); + } - // Get the top buckets - InternalMultiTerms.Bucket[] bucketsForOrd = new InternalMultiTerms.Bucket[(int) ordered.size()]; - topBucketsPerOrd.set(ordIdx, bucketsForOrd); - for (int b = (int) ordered.size() - 1; b >= 0; --b) { - InternalMultiTerms.Bucket[] buckets = topBucketsPerOrd.get(ordIdx); - buckets[b] = ordered.pop(); - otherDocCounts.increment(ordIdx, -buckets[b].getDocCount()); + // Get the top buckets + int orderedSize = (int) ordered.size(); + InternalMultiTerms.Bucket[] buckets = new InternalMultiTerms.Bucket[orderedSize]; + for (int i = orderedSize - 1; i >= 0; --i) { + BucketAndOrd bucketAndOrd = ordered.pop(); + buckets[i] = bucketAndOrd.bucket; + ordsArray.set(ordsCollected + i, bucketAndOrd.ord); + otherDocCounts.increment(ordIdx, -buckets[i].getDocCount()); + } + topBucketsPerOrd.set(ordIdx, buckets); + ordsCollected += orderedSize; + } } + buildSubAggsForAllBuckets(topBucketsPerOrd, ordsArray, (b, a) -> b.aggregations = a); } } - buildSubAggsForAllBuckets(topBucketsPerOrd, b -> b.bucketOrd, (b, a) -> b.aggregations = a); - return buildAggregations( Math.toIntExact(owningBucketOrds.size()), ordIdx -> buildResult(otherDocCounts.get(ordIdx), topBucketsPerOrd.get(ordIdx)) From 73d4fabe1ab4f1ecfc080ef16939135ff3740f13 Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Thu, 5 Dec 2024 11:32:12 +0100 Subject: [PATCH 2/2] blind --- .../countedterms/CountedTermsAggregator.java | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/countedterms/CountedTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/countedterms/CountedTermsAggregator.java index 5c4b4223027bc..571ce3a9a4519 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/countedterms/CountedTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/countedterms/CountedTermsAggregator.java @@ -40,7 +40,6 @@ import java.util.Arrays; import java.util.Map; import java.util.function.BiConsumer; -import java.util.function.Supplier; import static java.util.Collections.emptyList; import static org.elasticsearch.search.aggregations.InternalOrder.isKeyOrder; @@ -127,7 +126,7 @@ public InternalAggregation[] buildAggregations(LongArray owningBucketOrds) throw } try (LongArray ordsArray = bigArrays().newLongArray(ordsToCollect)) { long ordsCollected = 0; - for (long ordIdx = 0; ordIdx < topBucketsPerOrd.size(); ordIdx++) { + for (long ordIdx = 0; ordIdx < owningBucketOrds.size(); ordIdx++) { // as users can't control sort order, in practice we'll always sort by doc count descending try ( BucketPriorityQueue ordered = new BucketPriorityQueue<>( @@ -138,20 +137,12 @@ public InternalAggregation[] buildAggregations(LongArray owningBucketOrds) throw ) { BucketAndOrd spare = null; BytesKeyedBucketOrds.BucketOrdsEnum ordsEnum = bucketOrds.ordsEnum(owningBucketOrds.get(ordIdx)); - Supplier emptyBucketBuilder = () -> new StringTerms.Bucket( - new BytesRef(), - 0, - null, - false, - 0, - format - ); while (ordsEnum.next()) { long docCount = bucketDocCount(ordsEnum.ord()); otherDocCounts.increment(ordIdx, docCount); if (spare == null) { checkRealMemoryCBForInternalBucket(); - spare = new BucketAndOrd<>(emptyBucketBuilder.get()); + spare = new BucketAndOrd<>(new StringTerms.Bucket(new BytesRef(), 0, null, false, 0, format)); } ordsEnum.readValue(spare.bucket.getTermBytes()); spare.bucket.setDocCount(docCount); @@ -160,11 +151,12 @@ public InternalAggregation[] buildAggregations(LongArray owningBucketOrds) throw } final int orderedSize = (int) ordered.size(); final StringTerms.Bucket[] buckets = new StringTerms.Bucket[orderedSize]; - for (int i = (int) ordered.size() - 1; i >= 0; --i) { + for (int i = orderedSize - 1; i >= 0; --i) { BucketAndOrd bucketAndOrd = ordered.pop(); buckets[i] = bucketAndOrd.bucket; ordsArray.set(ordsCollected + i, bucketAndOrd.ord); otherDocCounts.increment(ordIdx, -bucketAndOrd.bucket.getDocCount()); + bucketAndOrd.bucket.setTermBytes(BytesRef.deepCopyOf(bucketAndOrd.bucket.getTermBytes())); } topBucketsPerOrd.set(ordIdx, buckets); ordsCollected += orderedSize;