Skip to content

Commit

Permalink
Speed up collecting zero document string terms (elastic#110922)
Browse files Browse the repository at this point in the history
Use segment ordinals when possible to collect zero document buckets
  • Loading branch information
iverase authored and salvatore-campagna committed Jul 19, 2024
1 parent 8f40a9f commit c0bdb78
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 12 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/110922.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 110922
summary: Speed up collecting zero document string terms
area: Aggregations
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.PriorityQueue;
Expand Down Expand Up @@ -419,25 +422,66 @@ void collectZeroDocEntriesIfNeeded(long owningBucketOrd, boolean excludeDeletedD
}
// we need to fill-in the blanks
for (LeafReaderContext ctx : searcher().getTopReaderContext().leaves()) {
SortedBinaryDocValues values = valuesSource.bytesValues(ctx);
// brute force
for (int docId = 0; docId < ctx.reader().maxDoc(); ++docId) {
if (excludeDeletedDocs && ctx.reader().getLiveDocs() != null && ctx.reader().getLiveDocs().get(docId) == false) {
continue;
final Bits liveDocs = excludeDeletedDocs ? ctx.reader().getLiveDocs() : null;
if (liveDocs == null && valuesSource.hasOrdinals()) {
final SortedSetDocValues values = ((ValuesSource.Bytes.WithOrdinals) valuesSource).ordinalsValues(ctx);
collectZeroDocEntries(values, owningBucketOrd);
} else {
final SortedBinaryDocValues values = valuesSource.bytesValues(ctx);
final BinaryDocValues singleton = FieldData.unwrapSingleton(values);
if (singleton != null) {
collectZeroDocEntries(singleton, liveDocs, ctx.reader().maxDoc(), owningBucketOrd);
} else {
collectZeroDocEntries(values, liveDocs, ctx.reader().maxDoc(), owningBucketOrd);
}
if (values.advanceExact(docId)) {
int valueCount = values.docValueCount();
for (int i = 0; i < valueCount; ++i) {
BytesRef term = values.nextValue();
if (includeExclude == null || includeExclude.accept(term)) {
bucketOrds.add(owningBucketOrd, term);
}
}
}
}

/**
 * Registers every term in this segment's ordinal dictionary as a (possibly zero-count)
 * bucket, honoring the include/exclude filter. Walks the terms dictionary directly, so
 * no documents are visited at all — safe only when deleted docs need not be excluded.
 */
private void collectZeroDocEntries(SortedSetDocValues values, long owningBucketOrd) throws IOException {
    final TermsEnum termsEnum = values.termsEnum();
    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
        if (includeExclude != null && includeExclude.accept(term) == false) {
            continue;
        }
        bucketOrds.add(owningBucketOrd, term);
    }
}

/**
 * Registers every term of a multi-valued field as a (possibly zero-count) bucket by
 * brute-force scanning all documents of the segment, skipping deleted docs when
 * {@code liveDocs} is non-null and applying the include/exclude filter per term.
 */
private void collectZeroDocEntries(SortedBinaryDocValues values, Bits liveDocs, int maxDoc, long owningBucketOrd)
    throws IOException {
    // brute force: visit each doc and pull all of its values
    for (int doc = 0; doc < maxDoc; ++doc) {
        final boolean skipDeleted = liveDocs != null && liveDocs.get(doc) == false;
        if (skipDeleted || values.advanceExact(doc) == false) {
            continue;
        }
        final int count = values.docValueCount();
        for (int i = 0; i < count; ++i) {
            final BytesRef term = values.nextValue();
            if (includeExclude == null || includeExclude.accept(term)) {
                bucketOrds.add(owningBucketOrd, term);
            }
        }
    }
}

/**
 * Registers every term of a single-valued field as a (possibly zero-count) bucket by
 * brute-force scanning all documents of the segment, skipping deleted docs when
 * {@code liveDocs} is non-null and applying the include/exclude filter per term.
 */
private void collectZeroDocEntries(BinaryDocValues values, Bits liveDocs, int maxDoc, long owningBucketOrd) throws IOException {
    // brute force: guard-clause per doc, single value each
    for (int doc = 0; doc < maxDoc; ++doc) {
        if (liveDocs != null && liveDocs.get(doc) == false) {
            continue;
        }
        if (values.advanceExact(doc) == false) {
            continue;
        }
        final BytesRef term = values.binaryValue();
        if (includeExclude != null && includeExclude.accept(term) == false) {
            continue;
        }
        bucketOrds.add(owningBucketOrd, term);
    }
}

@Override
Supplier<StringTerms.Bucket> emptyBucketBuilder(long owningBucketOrd) {
return () -> new StringTerms.Bucket(new BytesRef(), 0, null, showTermDocCountError, 0, format);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,65 @@ public void testStringShardMinDocCount() throws IOException {
}
}

/**
 * Checks that {@code minDocCount: 0} materializes zero-count buckets for terms that exist in
 * the index but are not matched by the query, for every terms execution mode, covering both
 * single-valued and multi-valued fields, with and without deleted documents
 * ({@code excludeDeletedDocs: true} must drop terms that only occur in deleted docs).
 */
public void testStringShardZeroMinDocCount() throws IOException {
    MappedFieldType fieldType = new KeywordFieldMapper.KeywordFieldType("string", true, true, Collections.emptyMap());
    for (TermsAggregatorFactory.ExecutionMode executionMode : TermsAggregatorFactory.ExecutionMode.values()) {
        // The hint must come from the loop variable; a second executionHint(...) call would
        // silently override it and make every iteration exercise the same mode.
        TermsAggregationBuilder aggregationBuilder = new TermsAggregationBuilder("_name").field("string")
            .executionHint(executionMode.toString())
            .size(2)
            .minDocCount(0)
            .excludeDeletedDocs(true)
            .order(BucketOrder.key(true));

        {
            // single-valued field; force single shard/segment
            boolean delete = randomBoolean();
            testCase(iw -> {
                iw.addDocuments(Arrays.asList(doc(fieldType, "a"), doc(fieldType, "b"), doc(fieldType, "c"), doc(fieldType, "d")));
                if (delete) {
                    iw.deleteDocuments(new TermQuery(new Term("string", "b")));
                }
            }, (InternalTerms<?, ?> result) -> {
                // The query matches nothing, so both returned buckets have doc count 0;
                // when "b" is deleted it must not appear, pushing "c" into the second slot.
                assertEquals(2, result.getBuckets().size());
                assertEquals("a", result.getBuckets().get(0).getKeyAsString());
                assertEquals(0L, result.getBuckets().get(0).getDocCount());
                if (delete) {
                    assertEquals("c", result.getBuckets().get(1).getKeyAsString());
                } else {
                    assertEquals("b", result.getBuckets().get(1).getKeyAsString());
                }
                assertEquals(0L, result.getBuckets().get(1).getDocCount());
            }, new AggTestConfig(aggregationBuilder, fieldType).withQuery(new TermQuery(new Term("string", "e"))));
        }

        {
            // multi-valued field; force single shard/segment
            boolean delete = randomBoolean();
            testCase(iw -> {
                iw.addDocuments(
                    Arrays.asList(doc(fieldType, "a"), doc(fieldType, "c", "d"), doc(fieldType, "b", "d"), doc(fieldType, "b"))
                );
                if (delete) {
                    // "b" only occurs in deleted docs, so it must vanish entirely
                    iw.deleteDocuments(new TermQuery(new Term("string", "b")));
                }
            }, (InternalTerms<?, ?> result) -> {
                assertEquals(2, result.getBuckets().size());
                assertEquals("a", result.getBuckets().get(0).getKeyAsString());
                assertEquals(0L, result.getBuckets().get(0).getDocCount());
                if (delete) {
                    assertEquals("c", result.getBuckets().get(1).getKeyAsString());
                } else {
                    assertEquals("b", result.getBuckets().get(1).getKeyAsString());
                }
                assertEquals(0L, result.getBuckets().get(1).getDocCount());
            }, new AggTestConfig(aggregationBuilder, fieldType).withQuery(new TermQuery(new Term("string", "e"))));
        }
    }
}

public void testManyTerms() throws Exception {
MappedFieldType fieldType = new KeywordFieldMapper.KeywordFieldType("string", randomBoolean(), true, Collections.emptyMap());
TermsAggregationBuilder aggregationBuilder = new TermsAggregationBuilder("_name").executionHint(randomHint()).field("string");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1013,6 +1013,10 @@ public void testZeroMinDocAggregation() throws Exception {
prepareIndex("test").setId("2").setSource("color", "yellow", "fruit", "banana", "count", -2).setRefreshPolicy(IMMEDIATE).get();
prepareIndex("test").setId("3").setSource("color", "green", "fruit", "grape", "count", -3).setRefreshPolicy(IMMEDIATE).get();
prepareIndex("test").setId("4").setSource("color", "red", "fruit", "grape", "count", -4).setRefreshPolicy(IMMEDIATE).get();
prepareIndex("test").setId("5")
.setSource("color", new String[] { "green", "black" }, "fruit", "grape", "count", -5)
.setRefreshPolicy(IMMEDIATE)
.get();
indicesAdmin().prepareForceMerge("test").get();

assertResponse(
Expand Down

0 comments on commit c0bdb78

Please sign in to comment.