From 8e5fc200b5de7298c0824b257b686e9ee4b27955 Mon Sep 17 00:00:00 2001 From: Petar Dzepina Date: Thu, 28 Jul 2022 21:30:28 +0200 Subject: [PATCH] Add doc_count field mapper (#3985) Bucket aggregations compute bucket doc_count values by incrementing the doc_count by 1 for every document collected in the bucket. When using summary fields (such as aggregate_metric_double) one field may represent more than one document. To provide this functionality this commit implements a new field mapper (named doc_count field mapper). This field is a positive integer representing the number of documents aggregated in a single summary field. Bucket aggregations check if a field of type doc_count exists in a document and take this value into consideration when computing doc counts. Note: This originated from upstream PR 64503. Signed-off-by: Petar Dzepina (cherry picked from commit fb7d81a7c00df14223e34511a6b1b5a3eff44e65) --- .../380_doc_count_field.yml | 146 ++++++++++++++++ .../index/mapper/DocCountFieldMapper.java | 164 ++++++++++++++++++ .../org/opensearch/indices/IndicesModule.java | 2 + .../search/aggregations/AggregatorBase.java | 4 +- .../bucket/BucketsAggregator.java | 37 ++-- .../aggregations/bucket/DocCountProvider.java | 62 +++++++ .../adjacency/AdjacencyMatrixAggregator.java | 4 +- .../bucket/composite/CompositeAggregator.java | 5 +- .../CompositeValuesCollectorQueue.java | 22 +-- .../bucket/composite/SortedDocsProducer.java | 6 +- .../bucket/nested/NestedAggregator.java | 3 +- .../GlobalOrdinalsStringTermsAggregator.java | 13 +- .../mapper/DocCountFieldMapperTests.java | 83 +++++++++ .../index/mapper/DocCountFieldTypeTests.java | 69 ++++++++ .../indices/IndicesModuleTests.java | 2 + .../bucket/DocCountProviderTests.java | 107 ++++++++++++ .../CompositeValuesCollectorQueueTests.java | 2 +- 17 files changed, 692 insertions(+), 39 deletions(-) create mode 100644 rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/380_doc_count_field.yml create mode 100644 server/src/main/java/org/opensearch/index/mapper/DocCountFieldMapper.java create mode 100644 server/src/main/java/org/opensearch/search/aggregations/bucket/DocCountProvider.java create mode 100644 server/src/test/java/org/opensearch/index/mapper/DocCountFieldMapperTests.java create mode 100644 server/src/test/java/org/opensearch/index/mapper/DocCountFieldTypeTests.java create mode 100644 server/src/test/java/org/opensearch/search/aggregations/bucket/DocCountProviderTests.java diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/380_doc_count_field.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/380_doc_count_field.yml new file mode 100644 index 0000000000000..a7a796eceeb39 --- /dev/null +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/380_doc_count_field.yml @@ -0,0 +1,146 @@ +setup: + - do: + indices.create: + index: test_1 + body: + settings: + number_of_replicas: 0 + mappings: + properties: + str: + type: keyword + number: + type: integer + + - do: + bulk: + index: test_1 + refresh: true + body: + - '{"index": {}}' + - '{"_doc_count": 10, "str": "abc", "number" : 500, "unmapped": "abc" }' + - '{"index": {}}' + - '{"_doc_count": 5, "str": "xyz", "number" : 100, "unmapped": "xyz" }' + - '{"index": {}}' + - '{"_doc_count": 7, "str": "foo", "number" : 100, "unmapped": "foo" }' + - '{"index": {}}' + - '{"_doc_count": 1, "str": "foo", "number" : 200, "unmapped": "foo" }' + - '{"index": {}}' + - '{"str": "abc", "number" : 500, "unmapped": "abc" }' + +--- +"Test numeric terms agg with doc_count": + - skip: + version: "1.0.0 - " + reason: "until this is released and we know exact version" + + - do: + search: + rest_total_hits_as_int: true + body: { "size" : 0, "aggs" : { "num_terms" : { "terms" : { "field" : "number" } } } } + + - match: { hits.total: 5 } + - length: { aggregations.num_terms.buckets: 3 } + - match: { aggregations.num_terms.buckets.0.key: 100 } + - match: { aggregations.num_terms.buckets.0.doc_count: 12 } + - match: { aggregations.num_terms.buckets.1.key: 500 } + - match: { aggregations.num_terms.buckets.1.doc_count: 11 } + - match: { aggregations.num_terms.buckets.2.key: 200 } + - match: { aggregations.num_terms.buckets.2.doc_count: 1 } + +--- +"Test keyword terms agg with doc_count": + - skip: + version: "1.0.0 - " + reason: "until this is released and we know exact version" + - do: + search: + rest_total_hits_as_int: true + body: { "size" : 0, "aggs" : { "str_terms" : { "terms" : { "field" : "str" } } } } + + - match: { hits.total: 5 } + - length: { aggregations.str_terms.buckets: 3 } + - match: { aggregations.str_terms.buckets.0.key: "abc" } + - match: { aggregations.str_terms.buckets.0.doc_count: 11 } + - match: { aggregations.str_terms.buckets.1.key: "foo" } + - match: { aggregations.str_terms.buckets.1.doc_count: 8 } + - match: { aggregations.str_terms.buckets.2.key: "xyz" } + - match: { aggregations.str_terms.buckets.2.doc_count: 5 } + +--- +"Test unmapped string terms agg with doc_count": + - skip: + version: "1.0.0 - " + reason: "until this is released and we know exact version" + - do: + bulk: + index: test_2 + refresh: true + body: + - '{"index": {}}' + - '{"_doc_count": 10, "str": "abc" }' + - '{"index": {}}' + - '{"str": "abc" }' + - do: + search: + index: test_2 + rest_total_hits_as_int: true + body: { "size" : 0, "aggs" : { "str_terms" : { "terms" : { "field" : "str.keyword" } } } } + + - match: { hits.total: 2 } + - length: { aggregations.str_terms.buckets: 1 } + - match: { aggregations.str_terms.buckets.0.key: "abc" } + - match: { aggregations.str_terms.buckets.0.doc_count: 11 } + +--- +"Test composite str_terms agg with doc_count": + - skip: + version: "1.0.0 - " + reason: "until this is released and we know exact version" + - do: + search: + rest_total_hits_as_int: true + body: { "size" : 0, "aggs" : + { "composite_agg" : { "composite" : + { + "sources": ["str_terms": { "terms": { "field": "str" } }] + } + } + } + } + + - match: { hits.total: 5 } + - length: { aggregations.composite_agg.buckets: 3 } + - match: { aggregations.composite_agg.buckets.0.key.str_terms: "abc" } + - match: { aggregations.composite_agg.buckets.0.doc_count: 11 } + - match: { aggregations.composite_agg.buckets.1.key.str_terms: "foo" } + - match: { aggregations.composite_agg.buckets.1.doc_count: 8 } + - match: { aggregations.composite_agg.buckets.2.key.str_terms: "xyz" } + - match: { aggregations.composite_agg.buckets.2.doc_count: 5 } + +--- +"Test composite num_terms agg with doc_count": + - skip: + version: "1.0.0 - " + reason: "until this is released and we know exact version" + - do: + search: + rest_total_hits_as_int: true + body: { "size" : 0, "aggs" : + { "composite_agg" : + { "composite" : + { + "sources": ["num_terms" : { "terms" : { "field" : "number" } }] + } + } + } + } + + - match: { hits.total: 5 } + - length: { aggregations.composite_agg.buckets: 3 } + - match: { aggregations.composite_agg.buckets.0.key.num_terms: 100 } + - match: { aggregations.composite_agg.buckets.0.doc_count: 12 } + - match: { aggregations.composite_agg.buckets.1.key.num_terms: 200 } + - match: { aggregations.composite_agg.buckets.1.doc_count: 1 } + - match: { aggregations.composite_agg.buckets.2.key.num_terms: 500 } + - match: { aggregations.composite_agg.buckets.2.doc_count: 11 } diff --git a/server/src/main/java/org/opensearch/index/mapper/DocCountFieldMapper.java b/server/src/main/java/org/opensearch/index/mapper/DocCountFieldMapper.java new file mode 100644 index 0000000000000..e3f6f5ef130e9 --- /dev/null +++ b/server/src/main/java/org/opensearch/index/mapper/DocCountFieldMapper.java @@ -0,0 +1,164 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Modifications Copyright OpenSearch Contributors. See + * GitHub history for details. + */ +package org.opensearch.index.mapper; + +import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.search.DocValuesFieldExistsQuery; +import org.apache.lucene.search.Query; +import org.opensearch.common.xcontent.XContentParser; +import org.opensearch.common.xcontent.XContentParserUtils; +import org.opensearch.index.query.QueryShardContext; +import org.opensearch.index.query.QueryShardException; +import org.opensearch.search.lookup.SearchLookup; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; + +/** + * Mapper for the doc_count field. + * + * @opensearch.internal + */ +public class DocCountFieldMapper extends MetadataFieldMapper { + + public static final String NAME = "_doc_count"; + public static final String CONTENT_TYPE = "_doc_count"; + + public static final TypeParser PARSER = new ConfigurableTypeParser( + c -> new DocCountFieldMapper(), + c -> new DocCountFieldMapper.Builder() + ); + + static class Builder extends MetadataFieldMapper.Builder { + + Builder() { + super(NAME); + } + + @Override + protected List> getParameters() { + return Collections.emptyList(); + } + + @Override + public DocCountFieldMapper build(BuilderContext context) { + return new DocCountFieldMapper(); + } + } + + /** + * Field type for DocCount Field Mapper + * + * @opensearch.internal + */ + public static final class DocCountFieldType extends MappedFieldType { + + public static final DocCountFieldType INSTANCE = new DocCountFieldType(); + + private static final Long defaultValue = 1L; + + public DocCountFieldType() { + super(NAME, false, false, true, TextSearchInfo.NONE, Collections.emptyMap()); + } + + @Override + public String typeName() { + return CONTENT_TYPE; + } + + @Override + public String familyTypeName() { + return NumberFieldMapper.NumberType.LONG.typeName(); + } + + @Override + public Query existsQuery(QueryShardContext context) { + return new DocValuesFieldExistsQuery(NAME); + } + + @Override + public Query termQuery(Object value, QueryShardContext context) { + throw new QueryShardException(context, "Field [" + name() + "] of type [" + typeName() + "] is not searchable"); + } + + @Override + public ValueFetcher valueFetcher(QueryShardContext context, SearchLookup searchLookup, String format) { + if (format != null) { + throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] doesn't support formats."); + } + + return new SourceValueFetcher(name(), context, defaultValue) { + @Override + protected Object parseSourceValue(Object value) { + if ("".equals(value)) { + return defaultValue; + } else { + return NumberFieldMapper.NumberType.objectToLong(value, false); + } + } + }; + } + } + + private DocCountFieldMapper() { + super(DocCountFieldType.INSTANCE); + } + + @Override + protected void parseCreateField(ParseContext context) throws IOException { + XContentParser parser = context.parser(); + XContentParserUtils.ensureExpectedToken(XContentParser.Token.VALUE_NUMBER, parser.currentToken(), parser); + + long value = parser.longValue(false); + if (value <= 0) { + throw new IllegalArgumentException("Field [" + fieldType().name() + "] must be a positive integer."); + } + final Field docCount = new NumericDocValuesField(NAME, value); + context.doc().add(docCount); + } + + @Override + public void preParse(ParseContext context) {} + + @Override + public DocCountFieldType fieldType() { + return (DocCountFieldType) super.fieldType(); + } + + @Override + protected String contentType() { + return CONTENT_TYPE; + } + +} diff --git a/server/src/main/java/org/opensearch/indices/IndicesModule.java b/server/src/main/java/org/opensearch/indices/IndicesModule.java index f188c47e7a9de..150f003e1a766 100644 --- a/server/src/main/java/org/opensearch/indices/IndicesModule.java +++ b/server/src/main/java/org/opensearch/indices/IndicesModule.java @@ -47,6 +47,7 @@ import org.opensearch.index.mapper.CompletionFieldMapper; import org.opensearch.index.mapper.DataStreamFieldMapper; import org.opensearch.index.mapper.DateFieldMapper; +import org.opensearch.index.mapper.DocCountFieldMapper; import org.opensearch.index.mapper.FieldAliasMapper; import org.opensearch.index.mapper.FieldNamesFieldMapper; import org.opensearch.index.mapper.GeoPointFieldMapper; @@ -190,6 +191,7 @@ private static Map initBuiltInMetadataMa builtInMetadataMappers.put(NestedPathFieldMapper.NAME, NestedPathFieldMapper.PARSER); builtInMetadataMappers.put(VersionFieldMapper.NAME, VersionFieldMapper.PARSER); builtInMetadataMappers.put(SeqNoFieldMapper.NAME, SeqNoFieldMapper.PARSER); + builtInMetadataMappers.put(DocCountFieldMapper.NAME, DocCountFieldMapper.PARSER); // _field_names must be added last so that it has a chance to see all the other mappers builtInMetadataMappers.put(FieldNamesFieldMapper.NAME, FieldNamesFieldMapper.PARSER); return Collections.unmodifiableMap(builtInMetadataMappers); diff --git a/server/src/main/java/org/opensearch/search/aggregations/AggregatorBase.java b/server/src/main/java/org/opensearch/search/aggregations/AggregatorBase.java index db51f00056f72..1d315980512b4 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/AggregatorBase.java +++ b/server/src/main/java/org/opensearch/search/aggregations/AggregatorBase.java @@ -200,7 +200,7 @@ public Map metadata() { @Override public final LeafBucketCollector getLeafCollector(LeafReaderContext ctx) throws IOException { - preGetSubLeafCollectors(); + preGetSubLeafCollectors(ctx); final LeafBucketCollector sub = collectableSubAggregators.getLeafCollector(ctx); return getLeafCollector(ctx, sub); } @@ -209,7 +209,7 @@ public final LeafBucketCollector getLeafCollector(LeafReaderContext ctx) throws * Can be overridden by aggregator implementations that like the perform an operation before the leaf collectors * of children aggregators are instantiated for the next segment. */ - protected void preGetSubLeafCollectors() throws IOException {} + protected void preGetSubLeafCollectors(LeafReaderContext ctx) throws IOException {} /** * Can be overridden by aggregator implementation to be called back when the collection phase starts. diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/BucketsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/BucketsAggregator.java index 79512586d06de..67af0b13eed3b 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/BucketsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/BucketsAggregator.java @@ -31,9 +31,10 @@ package org.opensearch.search.aggregations.bucket; +import org.apache.lucene.index.LeafReaderContext; import org.opensearch.common.lease.Releasable; import org.opensearch.common.util.BigArrays; -import org.opensearch.common.util.IntArray; +import org.opensearch.common.util.LongArray; import org.opensearch.search.aggregations.AggregationExecutionException; import org.opensearch.search.aggregations.Aggregator; import org.opensearch.search.aggregations.AggregatorBase; @@ -70,7 +71,8 @@ public abstract class BucketsAggregator extends AggregatorBase { private final BigArrays bigArrays; private final IntConsumer multiBucketConsumer; - private IntArray docCounts; + private LongArray docCounts; + protected final DocCountProvider docCountProvider; public BucketsAggregator( String name, @@ -87,7 +89,8 @@ public BucketsAggregator( } else { multiBucketConsumer = (count) -> {}; } - docCounts = bigArrays.newIntArray(1, true); + docCounts = bigArrays.newLongArray(1, true); + docCountProvider = new DocCountProvider(); } /** @@ -116,7 +119,8 @@ public final void collectBucket(LeafBucketCollector subCollector, int doc, long * Same as {@link #collectBucket(LeafBucketCollector, int, long)}, but doesn't check if the docCounts needs to be re-sized. */ public final void collectExistingBucket(LeafBucketCollector subCollector, int doc, long bucketOrd) throws IOException { - if (docCounts.increment(bucketOrd, 1) == 1) { + long docCount = docCountProvider.getDocCount(doc); + if (docCounts.increment(bucketOrd, docCount) == docCount) { // We calculate the final number of buckets only during the reduce phase. But we still need to // trigger bucket consumer from time to time in order to give it a chance to check available memory and break // the execution if we are running out. To achieve that we are passing 0 as a bucket count. @@ -147,11 +151,11 @@ public final void mergeBuckets(long[] mergeMap, long newNumBuckets) { * merge the actual ordinals and doc ID deltas. */ public final void mergeBuckets(long newNumBuckets, LongUnaryOperator mergeMap) { - try (IntArray oldDocCounts = docCounts) { - docCounts = bigArrays.newIntArray(newNumBuckets, true); + try (LongArray oldDocCounts = docCounts) { + docCounts = bigArrays.newLongArray(newNumBuckets, true); docCounts.fill(0, newNumBuckets, 0); for (long i = 0; i < oldDocCounts.size(); i++) { - int docCount = oldDocCounts.get(i); + long docCount = oldDocCounts.get(i); if (docCount == 0) continue; @@ -164,14 +168,14 @@ public final void mergeBuckets(long newNumBuckets, LongUnaryOperator mergeMap) { } } - public IntArray getDocCounts() { + public LongArray getDocCounts() { return docCounts; } /** * Utility method to increment the doc counts of the given bucket (identified by the bucket ordinal) */ - public final void incrementBucketDocCount(long bucketOrd, int inc) { + public final void incrementBucketDocCount(long bucketOrd, long inc) { docCounts = bigArrays.grow(docCounts, bucketOrd + 1); docCounts.increment(bucketOrd, inc); } @@ -179,7 +183,7 @@ public final void incrementBucketDocCount(long bucketOrd, int inc) { /** * Utility method to return the number of documents that fell in the given bucket (identified by the bucket ordinal) */ - public final int bucketDocCount(long bucketOrd) { + public final long bucketDocCount(long bucketOrd) { if (bucketOrd >= docCounts.size()) { // This may happen eg. if no document in the highest buckets is accepted by a sub aggregator. // For example, if there is a long terms agg on 3 terms 1,2,3 with a sub filter aggregator and if no document with 3 as a value @@ -337,7 +341,7 @@ protected final InternalAggregation[] buildAggregationsForFixedBucketCount( */ @FunctionalInterface protected interface BucketBuilderForFixedCount { - B build(int offsetInOwningOrd, int docCount, InternalAggregations subAggregationResults); + B build(int offsetInOwningOrd, long docCount, InternalAggregations subAggregationResults); } /** @@ -432,7 +436,7 @@ protected final InternalAggregation[] buildAggregationsForVariableBuckets( */ @FunctionalInterface protected interface BucketBuilderForVariable { - B build(long bucketValue, int docCount, InternalAggregations subAggregationResults); + B build(long bucketValue, long docCount, InternalAggregations subAggregationResults); } /** @@ -466,7 +470,7 @@ public BucketComparator bucketComparator(String key, SortOrder order) { return super.bucketComparator(key, order); } if (key == null || "doc_count".equals(key)) { - return (lhs, rhs) -> order.reverseMul() * Integer.compare(bucketDocCount(lhs), bucketDocCount(rhs)); + return (lhs, rhs) -> order.reverseMul() * Long.compare(bucketDocCount(lhs), bucketDocCount(rhs)); } throw new IllegalArgumentException( "Ordering on a single-bucket aggregation can only be done on its doc_count. " @@ -478,6 +482,13 @@ public BucketComparator bucketComparator(String key, SortOrder order) { ); } + @Override + protected void preGetSubLeafCollectors(LeafReaderContext ctx) throws IOException { + super.preGetSubLeafCollectors(ctx); + // Set LeafReaderContext to the doc_count provider + docCountProvider.setLeafReaderContext(ctx); + } + public static boolean descendsFromGlobalAggregator(Aggregator parent) { while (parent != null) { if (parent.getClass() == GlobalAggregator.class) { diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/DocCountProvider.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/DocCountProvider.java new file mode 100644 index 0000000000000..30fe16ec4eb0e --- /dev/null +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/DocCountProvider.java @@ -0,0 +1,62 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Modifications Copyright OpenSearch Contributors. See + * GitHub history for details. + */ + +package org.opensearch.search.aggregations.bucket; + +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; +import org.opensearch.index.mapper.DocCountFieldMapper; + +import java.io.IOException; + +/** + * An implementation of a doc_count provider that reads the value + * of the _doc_count field in the document. If a document does not have a + * _doc_count field the implementation will return 1 as the default value. + */ +public class DocCountProvider { + + private NumericDocValues docCountValues; + + public long getDocCount(int doc) throws IOException { + if (docCountValues != null && docCountValues.advanceExact(doc)) { + return docCountValues.longValue(); + } else { + return 1L; + } + } + + public void setLeafReaderContext(LeafReaderContext ctx) throws IOException { + docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME); + } +} diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/adjacency/AdjacencyMatrixAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/adjacency/AdjacencyMatrixAggregator.java index c33887878dbd6..c9efbf7d34348 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/adjacency/AdjacencyMatrixAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/adjacency/AdjacencyMatrixAggregator.java @@ -228,7 +228,7 @@ public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws I List buckets = new ArrayList<>(filters.length); for (int i = 0; i < keys.length; i++) { long bucketOrd = bucketOrd(owningBucketOrds[owningBucketOrdIdx], i); - int docCount = bucketDocCount(bucketOrd); + long docCount = bucketDocCount(bucketOrd); // Empty buckets are not returned because this aggregation will commonly be used under a // a date-histogram where we will look for transactions over time and can expect many // empty buckets. @@ -245,7 +245,7 @@ public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws I for (int i = 0; i < keys.length; i++) { for (int j = i + 1; j < keys.length; j++) { long bucketOrd = bucketOrd(owningBucketOrds[owningBucketOrdIdx], pos); - int docCount = bucketDocCount(bucketOrd); + long docCount = bucketDocCount(bucketOrd); // Empty buckets are not returned due to potential for very sparse matrices if (docCount > 0) { String intersectKey = keys[i] + separator + keys[j]; diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/composite/CompositeAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/composite/CompositeAggregator.java index a907d17a731fe..8b487fc499602 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/composite/CompositeAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/composite/CompositeAggregator.java @@ -196,7 +196,7 @@ public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws I int slot = queue.pop(); CompositeKey key = queue.toCompositeKey(slot); InternalAggregations aggs = subAggsForBuckets[slot]; - int docCount = queue.getDocCount(slot); + long docCount = queue.getDocCount(slot); buckets[queue.size()] = new InternalComposite.InternalBucket( sourceNames, formats, @@ -504,7 +504,8 @@ private LeafBucketCollector getFirstPassCollector(RoaringDocIdSet.Builder builde @Override public void collect(int doc, long bucket) throws IOException { try { - if (queue.addIfCompetitive(indexSortPrefix)) { + long docCount = docCountProvider.getDocCount(doc); + if (queue.addIfCompetitive(indexSortPrefix, docCount)) { if (builder != null && lastDoc != doc) { builder.add(doc); lastDoc = doc; diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueue.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueue.java index ac50869b5d79b..6ee1682a7b196 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueue.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueue.java @@ -38,7 +38,7 @@ import org.opensearch.common.lease.Releasable; import org.opensearch.common.lease.Releasables; import org.opensearch.common.util.BigArrays; -import org.opensearch.common.util.IntArray; +import org.opensearch.common.util.LongArray; import org.opensearch.search.aggregations.LeafBucketCollector; import java.io.IOException; @@ -80,7 +80,7 @@ public int hashCode() { private final Map map; private final SingleDimensionValuesSource[] arrays; - private IntArray docCounts; + private LongArray docCounts; private boolean afterKeyIsSet = false; /** @@ -103,7 +103,7 @@ public int hashCode() { sources[i].setAfter(afterKey.get(i)); } } - this.docCounts = bigArrays.newIntArray(1, false); + this.docCounts = bigArrays.newLongArray(1, false); } @Override @@ -143,19 +143,19 @@ Comparable getUpperValueLeadSource() throws IOException { /** * Returns the document count in slot. */ - int getDocCount(int slot) { + long getDocCount(int slot) { return docCounts.get(slot); } /** * Copies the current value in slot. */ - private void copyCurrent(int slot) { + private void copyCurrent(int slot, long value) { for (int i = 0; i < arrays.length; i++) { arrays[i].copyCurrent(slot); } docCounts = bigArrays.grow(docCounts, slot + 1); - docCounts.set(slot, 1); + docCounts.set(slot, value); } /** @@ -265,8 +265,8 @@ LeafBucketCollector getLeafCollector(Comparable forceLeadSourceValue, LeafReader * Check if the current candidate should be added in the queue. * @return true if the candidate is competitive (added or already in the queue). */ - boolean addIfCompetitive() { - return addIfCompetitive(0); + boolean addIfCompetitive(long inc) { + return addIfCompetitive(0, inc); } /** @@ -279,12 +279,12 @@ boolean addIfCompetitive() { * * @throws CollectionTerminatedException if the current collection can be terminated early due to index sorting. */ - boolean addIfCompetitive(int indexSortSourcePrefix) { + boolean addIfCompetitive(int indexSortSourcePrefix, long inc) { // checks if the candidate key is competitive Integer topSlot = compareCurrent(); if (topSlot != null) { // this key is already in the top N, skip it - docCounts.increment(topSlot, 1); + docCounts.increment(topSlot, inc); return true; } if (afterKeyIsSet) { @@ -325,7 +325,7 @@ boolean addIfCompetitive(int indexSortSourcePrefix) { newSlot = size(); } // move the candidate key to its new slot - copyCurrent(newSlot); + copyCurrent(newSlot, inc); map.put(new Slot(newSlot), newSlot); add(newSlot); return true; diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/composite/SortedDocsProducer.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/composite/SortedDocsProducer.java index 3cdbee6b119ec..bd0a4f13ddf08 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/composite/SortedDocsProducer.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/composite/SortedDocsProducer.java @@ -40,6 +40,7 @@ import org.apache.lucene.util.DocIdSetBuilder; import org.opensearch.common.Nullable; import org.opensearch.search.aggregations.LeafBucketCollector; +import org.opensearch.search.aggregations.bucket.DocCountProvider; import java.io.IOException; @@ -74,6 +75,8 @@ protected boolean processBucket( ) throws IOException { final int[] topCompositeCollected = new int[1]; final boolean[] hasCollected = new boolean[1]; + final DocCountProvider docCountProvider = new DocCountProvider(); + docCountProvider.setLeafReaderContext(context); final LeafBucketCollector queueCollector = new LeafBucketCollector() { int lastDoc = -1; @@ -86,7 +89,8 @@ protected boolean processBucket( @Override public void collect(int doc, long bucket) throws IOException { hasCollected[0] = true; - if (queue.addIfCompetitive()) { + long docCount = docCountProvider.getDocCount(doc); + if (queue.addIfCompetitive(docCount)) { topCompositeCollected[0]++; if (adder != null && doc != lastDoc) { if (remainingBits == 0) { diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/nested/NestedAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/nested/NestedAggregator.java index 5dc4d1c780350..02507f8569ffa 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/nested/NestedAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/nested/NestedAggregator.java @@ -130,7 +130,8 @@ public void collect(int parentDoc, long bucket) throws IOException { } @Override - protected void preGetSubLeafCollectors() throws IOException { + protected void preGetSubLeafCollectors(LeafReaderContext ctx) throws IOException { + super.preGetSubLeafCollectors(ctx); processBufferedDocs(); } diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 4fcc42b6211b2..2475ade68bc07 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -43,7 +43,6 @@ import org.opensearch.common.io.stream.StreamOutput; import org.opensearch.common.lease.Releasable; import org.opensearch.common.lease.Releasables; -import org.opensearch.common.util.IntArray; import org.opensearch.common.util.LongArray; import org.opensearch.common.util.LongHash; import org.opensearch.common.xcontent.XContentBuilder; @@ -297,7 +296,7 @@ protected void doClose() { static class LowCardinality extends GlobalOrdinalsStringTermsAggregator { private LongUnaryOperator mapping; - private IntArray segmentDocCounts; + private LongArray segmentDocCounts; LowCardinality( String name, @@ -332,7 +331,7 @@ static class LowCardinality extends GlobalOrdinalsStringTermsAggregator { metadata ); assert factories == null || factories.countAggregators() == 0; - this.segmentDocCounts = context.bigArrays().newIntArray(1, true); + this.segmentDocCounts = context.bigArrays().newLongArray(1, true); } @Override @@ -356,7 +355,8 @@ public void collect(int doc, long owningBucketOrd) throws IOException { return; } int ord = singleValues.ordValue(); - segmentDocCounts.increment(ord + 1, 1); + long docCount = docCountProvider.getDocCount(doc); + segmentDocCounts.increment(ord + 1, docCount); } }); } @@ -369,7 +369,8 @@ public void collect(int doc, long owningBucketOrd) throws IOException { return; } for (long segmentOrd = segmentOrds.nextOrd(); segmentOrd != NO_MORE_ORDS; segmentOrd = segmentOrds.nextOrd()) { - segmentDocCounts.increment(segmentOrd + 1, 1); + long docCount = docCountProvider.getDocCount(doc); + segmentDocCounts.increment(segmentOrd + 1, docCount); } } }); @@ -392,7 +393,7 @@ private void mapSegmentCountsToGlobalCounts(LongUnaryOperator mapping) throws IO for (long i = 1; i < segmentDocCounts.size(); i++) { // We use set(...) here, because we need to reset the slow to 0. // segmentDocCounts get reused over the segments and otherwise counts would be too high. - int inc = segmentDocCounts.set(i, 0); + long inc = segmentDocCounts.set(i, 0); if (inc == 0) { continue; } diff --git a/server/src/test/java/org/opensearch/index/mapper/DocCountFieldMapperTests.java b/server/src/test/java/org/opensearch/index/mapper/DocCountFieldMapperTests.java new file mode 100644 index 0000000000000..634276a34b9af --- /dev/null +++ b/server/src/test/java/org/opensearch/index/mapper/DocCountFieldMapperTests.java @@ -0,0 +1,83 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Modifications Copyright OpenSearch Contributors. See + * GitHub history for details. + */ +package org.opensearch.index.mapper; + +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.IndexableField; + +import static org.hamcrest.Matchers.containsString; + +public class DocCountFieldMapperTests extends MapperServiceTestCase { + + private static final String CONTENT_TYPE = DocCountFieldMapper.CONTENT_TYPE; + private static final String DOC_COUNT_FIELD = DocCountFieldMapper.NAME; + + /** + * Test parsing field mapping and adding simple field + */ + public void testParseValue() throws Exception { + DocumentMapper mapper = createDocumentMapper(mapping(b -> {})); + ParsedDocument doc = mapper.parse(source(b -> b.field("foo", 500).field(CONTENT_TYPE, 100))); + + IndexableField field = doc.rootDoc().getField(DOC_COUNT_FIELD); + assertEquals(100L, field.numericValue()); + assertEquals(DocValuesType.NUMERIC, field.fieldType().docValuesType()); + assertEquals(1, doc.rootDoc().getFields(DOC_COUNT_FIELD).length); + } + + public void testInvalidDocument_NegativeDocCount() throws Exception { + DocumentMapper mapper = createDocumentMapper(mapping(b -> {})); + Exception e = expectThrows(MapperParsingException.class, () -> mapper.parse(source(b -> b.field(CONTENT_TYPE, -100)))); + assertThat(e.getCause().getMessage(), containsString("Field [_doc_count] must be a positive integer")); + } + + public void testInvalidDocument_ZeroDocCount() throws Exception { + DocumentMapper mapper = createDocumentMapper(mapping(b -> {})); + Exception e = expectThrows(MapperParsingException.class, () -> mapper.parse(source(b -> b.field(CONTENT_TYPE, 0)))); + assertThat(e.getCause().getMessage(), containsString("Field [_doc_count] must be a positive integer")); + } + + public void testInvalidDocument_NonNumericDocCount() throws Exception { + DocumentMapper mapper = createDocumentMapper(mapping(b -> {})); + Exception e = expectThrows(MapperParsingException.class, () -> mapper.parse(source(b -> b.field(CONTENT_TYPE, "foo")))); + assertThat( + e.getCause().getMessage(), + containsString("Failed to parse object: expecting token of type [VALUE_NUMBER] but found [VALUE_STRING]") + ); + } + + public void testInvalidDocument_FractionalDocCount() throws Exception { + DocumentMapper mapper = createDocumentMapper(mapping(b -> {})); + Exception e = expectThrows(MapperParsingException.class, () -> mapper.parse(source(b -> b.field(CONTENT_TYPE, 100.23)))); + assertThat(e.getCause().getMessage(), containsString("100.23 cannot be converted to Long without data loss")); + } +} diff --git a/server/src/test/java/org/opensearch/index/mapper/DocCountFieldTypeTests.java b/server/src/test/java/org/opensearch/index/mapper/DocCountFieldTypeTests.java new file mode 100644 index 0000000000000..4961736ea933c --- /dev/null +++ b/server/src/test/java/org/opensearch/index/mapper/DocCountFieldTypeTests.java @@ -0,0 +1,69 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Modifications Copyright OpenSearch Contributors. See + * GitHub history for details. + */ +package org.opensearch.index.mapper; + +import org.apache.lucene.search.DocValuesFieldExistsQuery; +import org.opensearch.index.query.QueryShardException; + +import java.io.IOException; +import java.util.Arrays; + +public class DocCountFieldTypeTests extends FieldTypeTestCase { + + public void testTermQuery() { + MappedFieldType ft = new DocCountFieldMapper.DocCountFieldType(); + QueryShardException e = expectThrows(QueryShardException.class, () -> ft.termQuery(10L, randomMockShardContext())); + assertEquals("Field [_doc_count] of type [_doc_count] is not searchable", e.getMessage()); + } + + public void testRangeQuery() { + MappedFieldType ft = new DocCountFieldMapper.DocCountFieldType(); + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> ft.rangeQuery(null, null, randomBoolean(), randomBoolean(), null, null, null, null) + ); + assertEquals("Field [_doc_count] of type [_doc_count] does not support range queries", e.getMessage()); + } + + public void testExistsQuery() { + MappedFieldType ft = new DocCountFieldMapper.DocCountFieldType(); + assertTrue(ft.existsQuery(randomMockShardContext()) instanceof DocValuesFieldExistsQuery); + } + + public void testFetchSourceValue() throws IOException { + MappedFieldType fieldType = new DocCountFieldMapper.DocCountFieldType(); + assertEquals(Arrays.asList(14L), fetchSourceValue(fieldType, 14)); + assertEquals(Arrays.asList(14L), fetchSourceValue(fieldType, "14")); + assertEquals(Arrays.asList(1L), fetchSourceValue(fieldType, "")); + assertEquals(Arrays.asList(1L), fetchSourceValue(fieldType, null)); + } +} diff --git a/server/src/test/java/org/opensearch/indices/IndicesModuleTests.java b/server/src/test/java/org/opensearch/indices/IndicesModuleTests.java index afcc6aa006500..ec7ab06ac86a6 100644 --- a/server/src/test/java/org/opensearch/indices/IndicesModuleTests.java +++ b/server/src/test/java/org/opensearch/indices/IndicesModuleTests.java @@ -33,6 +33,7 @@ package org.opensearch.indices; import org.opensearch.Version; +import org.opensearch.index.mapper.DocCountFieldMapper; import org.opensearch.index.mapper.DataStreamFieldMapper; import org.opensearch.index.mapper.FieldNamesFieldMapper; import org.opensearch.index.mapper.IdFieldMapper; @@ -98,6 +99,7 @@ public Map getMetadataMappers() { NestedPathFieldMapper.NAME, VersionFieldMapper.NAME, SeqNoFieldMapper.NAME, + DocCountFieldMapper.NAME, FieldNamesFieldMapper.NAME }; public void testBuiltinMappers() { diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/DocCountProviderTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/DocCountProviderTests.java new file mode 100644 index 0000000000000..708d683fe2484 --- /dev/null +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/DocCountProviderTests.java @@ -0,0 +1,107 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Modifications Copyright OpenSearch Contributors. See + * GitHub history for details. + */ + +package org.opensearch.search.aggregations.bucket; + +import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.opensearch.common.CheckedConsumer; +import org.opensearch.index.mapper.DocCountFieldMapper; +import org.opensearch.index.mapper.MappedFieldType; +import org.opensearch.index.mapper.NumberFieldMapper; +import org.opensearch.search.aggregations.AggregatorTestCase; +import org.opensearch.search.aggregations.bucket.global.GlobalAggregationBuilder; +import org.opensearch.search.aggregations.bucket.global.InternalGlobal; + +import java.io.IOException; +import java.util.List; +import java.util.function.Consumer; + +import static java.util.Collections.singleton; + +public class DocCountProviderTests extends AggregatorTestCase { + + private static final String DOC_COUNT_FIELD = DocCountFieldMapper.NAME; + private static final String NUMBER_FIELD = "number"; + + public void testDocsWithDocCount() throws IOException { + testAggregation(new MatchAllDocsQuery(), iw -> { + iw.addDocument(List.of(new NumericDocValuesField(DOC_COUNT_FIELD, 4), new SortedNumericDocValuesField(NUMBER_FIELD, 1))); + iw.addDocument(List.of(new NumericDocValuesField(DOC_COUNT_FIELD, 5), new SortedNumericDocValuesField(NUMBER_FIELD, 7))); + iw.addDocument( + List.of( + // Intentionally omit doc_count field + new SortedNumericDocValuesField(NUMBER_FIELD, 1) + ) + ); + }, global -> { assertEquals(10, global.getDocCount()); }); + } + + public void testDocsWithoutDocCount() throws IOException { + testAggregation(new MatchAllDocsQuery(), iw -> { + iw.addDocument(singleton(new SortedNumericDocValuesField(NUMBER_FIELD, 1))); + iw.addDocument(singleton(new SortedNumericDocValuesField(NUMBER_FIELD, 7))); + iw.addDocument(singleton(new SortedNumericDocValuesField(NUMBER_FIELD, 1))); + }, global -> { assertEquals(3, global.getDocCount()); }); + } + + public void testQueryFiltering() throws IOException { + testAggregation(IntPoint.newRangeQuery(NUMBER_FIELD, 4, 5), iw -> { + iw.addDocument(List.of(new NumericDocValuesField(DOC_COUNT_FIELD, 4), new IntPoint(NUMBER_FIELD, 6))); + iw.addDocument(List.of(new NumericDocValuesField(DOC_COUNT_FIELD, 2), new IntPoint(NUMBER_FIELD, 5))); + iw.addDocument( + List.of( + // Intentionally omit doc_count field + new IntPoint(NUMBER_FIELD, 1) + ) + ); + iw.addDocument( + List.of( + // Intentionally omit doc_count field + new IntPoint(NUMBER_FIELD, 5) + ) + ); + }, global -> { assertEquals(3, global.getDocCount()); }); + } + + private void testAggregation(Query query, CheckedConsumer indexer, Consumer verify) + throws IOException { + GlobalAggregationBuilder aggregationBuilder = new GlobalAggregationBuilder("_name"); + MappedFieldType fieldType = new NumberFieldMapper.NumberFieldType(NUMBER_FIELD, NumberFieldMapper.NumberType.LONG); + MappedFieldType docCountFieldType = new DocCountFieldMapper.DocCountFieldType(); + testCase(aggregationBuilder, query, indexer, verify, fieldType, docCountFieldType); + } +} diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueueTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueueTests.java index 3dccdf8dab95e..4571ae7496ae1 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueueTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueueTests.java @@ -347,7 +347,7 @@ private void testRandomCase(boolean forceMerge, boolean missingBucket, int index final LeafBucketCollector leafCollector = new LeafBucketCollector() { @Override public void collect(int doc, long bucket) throws IOException { - queue.addIfCompetitive(indexSortSourcePrefix); + queue.addIfCompetitive(indexSortSourcePrefix, 1); } }; final LeafBucketCollector queueCollector = queue.getLeafCollector(leafReaderContext, leafCollector);