From f217322b2899ae1fcdf37e10e90b7c4d9f32297b Mon Sep 17 00:00:00 2001 From: Nhat Nguyen Date: Wed, 11 Sep 2024 17:17:27 -0700 Subject: [PATCH] Estimate segment field usages (#112760) (#112777) We have introduced a new memory estimation method in serverless, based on the number of segments and the fields within them. This new approach works well overall, but it still falls short in cases where most fields are used more than once - for example, in both doc_values and postings, or doc_values and points. This change exposes the total usage of fields in segments, allowing us to adjust the memory estimate for these cases. --- .../codec/DeduplicatingFieldInfosFormat.java | 2 +- .../index/codec/FieldInfosWithUsages.java | 49 +++++++++++++++++++ .../elasticsearch/index/shard/IndexShard.java | 14 +++++- .../index/shard/ShardFieldStats.java | 4 +- .../index/shard/IndexShardTests.java | 9 ++++ 5 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/index/codec/FieldInfosWithUsages.java diff --git a/server/src/main/java/org/elasticsearch/index/codec/DeduplicatingFieldInfosFormat.java b/server/src/main/java/org/elasticsearch/index/codec/DeduplicatingFieldInfosFormat.java index 75ec265a68391..67d98bf30d6ec 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/DeduplicatingFieldInfosFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/DeduplicatingFieldInfosFormat.java @@ -65,7 +65,7 @@ public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segm fi.isParentField() ); } - return new FieldInfos(deduplicated); + return new FieldInfosWithUsages(deduplicated); } private static Map internStringStringMap(Map m) { diff --git a/server/src/main/java/org/elasticsearch/index/codec/FieldInfosWithUsages.java b/server/src/main/java/org/elasticsearch/index/codec/FieldInfosWithUsages.java new file mode 100644 index 0000000000000..e0d4a94ee28a1 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/FieldInfosWithUsages.java @@ -0,0 +1,49 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.index.codec; + +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexOptions; + +public class FieldInfosWithUsages extends FieldInfos { + private final int totalUsages; + + public FieldInfosWithUsages(FieldInfo[] infos) { + super(infos); + this.totalUsages = computeUsages(infos); + } + + public static int computeUsages(FieldInfo[] infos) { + int usages = 0; + for (FieldInfo fi : infos) { + if (fi.getIndexOptions() != IndexOptions.NONE) { + usages++; + } + if (fi.hasNorms()) { + usages++; + } + if (fi.getDocValuesType() != DocValuesType.NONE) { + usages++; + } + if (fi.getPointDimensionCount() > 0) { + usages++; + } + if (fi.getVectorDimension() > 0) { + usages++; + } + } + return usages; + } + + public int getTotalUsages() { + return totalUsages; + } +} diff --git a/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java b/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java index 8f1ae42a7475c..b618ca49a3a0b 100644 --- a/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java @@ -83,6 +83,7 @@ import org.elasticsearch.index.cache.query.TrivialQueryCachingPolicy; import org.elasticsearch.index.cache.request.ShardRequestCache; import org.elasticsearch.index.codec.CodecService; +import org.elasticsearch.index.codec.FieldInfosWithUsages; import org.elasticsearch.index.engine.CommitStats; import org.elasticsearch.index.engine.Engine; import org.elasticsearch.index.engine.Engine.GetResult; @@ -4093,11 +4094,20 @@ public void afterRefresh(boolean didRefresh) { try (var searcher = getEngine().acquireSearcher("shard_field_stats", Engine.SearcherScope.INTERNAL)) { int numSegments = 0; int totalFields = 0; + long usages = 0; for (LeafReaderContext leaf : searcher.getLeafContexts()) { numSegments++; - totalFields += leaf.reader().getFieldInfos().size(); + var fieldInfos = leaf.reader().getFieldInfos(); + totalFields += fieldInfos.size(); + if (fieldInfos instanceof FieldInfosWithUsages ft) { + if (usages != -1) { + usages += ft.getTotalUsages(); + } + } else { + usages = -1; + } } - shardFieldStats = new ShardFieldStats(numSegments, totalFields); + shardFieldStats = new ShardFieldStats(numSegments, totalFields, usages); } catch (AlreadyClosedException ignored) { } diff --git a/server/src/main/java/org/elasticsearch/index/shard/ShardFieldStats.java b/server/src/main/java/org/elasticsearch/index/shard/ShardFieldStats.java index 9c53abb1e95e5..c2122b9f4b0aa 100644 --- a/server/src/main/java/org/elasticsearch/index/shard/ShardFieldStats.java +++ b/server/src/main/java/org/elasticsearch/index/shard/ShardFieldStats.java @@ -14,7 +14,9 @@ * * @param numSegments the number of segments * @param totalFields the total number of fields across the segments + * @param fieldUsages the number of usages for segment-level fields (e.g., doc_values, postings, norms, points) + * -1 if unavailable */ -public record ShardFieldStats(int numSegments, int totalFields) { +public record ShardFieldStats(int numSegments, int totalFields, long fieldUsages) { } diff --git a/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java b/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java index 28497dc4a8a6b..63bd1fd6cdeff 100644 --- a/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java +++ b/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java @@ -1793,6 +1793,7 @@ public void testShardFieldStats() throws IOException { assertNotNull(stats); assertThat(stats.numSegments(), equalTo(0)); assertThat(stats.totalFields(), equalTo(0)); + assertThat(stats.fieldUsages(), equalTo(0L)); // index some documents int numDocs = between(1, 10); for (int i = 0; i < numDocs; i++) { @@ -1809,6 +1810,9 @@ public void testShardFieldStats() throws IOException { assertThat(stats.numSegments(), equalTo(1)); // _id, _source, _version, _primary_term, _seq_no, f1, f1.keyword, f2, f2.keyword, assertThat(stats.totalFields(), equalTo(9)); + // _id(term), _source(0), _version(dv), _primary_term(dv), _seq_no(point,dv), f1(postings,norms), + // f1.keyword(term,dv), f2(postings,norms), f2.keyword(term,dv), + assertThat(stats.fieldUsages(), equalTo(13L)); // don't re-compute on refresh without change if (randomBoolean()) { shard.refresh("test"); @@ -1838,10 +1842,15 @@ public void testShardFieldStats() throws IOException { assertThat(stats.numSegments(), equalTo(2)); // 9 + _id, _source, _version, _primary_term, _seq_no, f1, f1.keyword, f2, f2.keyword, f3, f3.keyword assertThat(stats.totalFields(), equalTo(21)); + // first segment: 13, second segment: 13 + f3(postings,norms) + f3.keyword(term,dv), and __soft_deletes to previous segment + assertThat(stats.fieldUsages(), equalTo(31L)); shard.forceMerge(new ForceMergeRequest().maxNumSegments(1).flush(true)); stats = shard.getShardFieldStats(); assertThat(stats.numSegments(), equalTo(1)); assertThat(stats.totalFields(), equalTo(12)); + // _id(term), _source(0), _version(dv), _primary_term(dv), _seq_no(point,dv), f1(postings,norms), + // f1.keyword(term,dv), f2(postings,norms), f2.keyword(term,dv), f3(postings,norms), f3.keyword(term,dv), __soft_deletes + assertThat(stats.fieldUsages(), equalTo(18L)); closeShards(shard); }