Skip to content

Commit

Permalink
Autoscaling proactive trigger on low watermark (elastic#78941)
Browse files Browse the repository at this point in the history
Add test that we trigger a proactive scale up when low watermark is
exceeded.
  • Loading branch information
henningandersen committed Oct 15, 2021
1 parent d0d91da commit c5b2ce6
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
import java.util.Collection;

public class AutoscalingStorageIntegTestCase extends DiskUsageIntegTestCase {
protected static final long WATERMARK_BYTES = 10240;
protected static final long HIGH_WATERMARK_BYTES = 10240;
protected static final long LOW_WATERMARK_BYTES = 2 * HIGH_WATERMARK_BYTES;

@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
Expand All @@ -36,8 +37,8 @@ protected Collection<Class<? extends Plugin>> nodePlugins() {
@Override
protected Settings nodeSettings(final int nodeOrdinal, final Settings otherSettings) {
final Settings.Builder builder = Settings.builder().put(super.nodeSettings(nodeOrdinal, otherSettings));
builder.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), (WATERMARK_BYTES * 2) + "b")
.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), WATERMARK_BYTES + "b")
builder.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), LOW_WATERMARK_BYTES + "b")
.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), HIGH_WATERMARK_BYTES + "b")
.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "0b")
.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "0ms")
.put(DiskThresholdDecider.ENABLE_FOR_SINGLE_DATA_NODE.getKey(), "true");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ public void testScaleUp() throws IOException, InterruptedException {

final String dsName = randomAlphaOfLength(10).toLowerCase(Locale.ROOT);
createDataStreamAndTemplate(dsName);
for (int i = 0; i < between(1, 5); ++i) {
final int rolloverCount = between(1, 5);
for (int i = 0; i < rolloverCount; ++i) {
indexRandom(
true,
false,
Expand All @@ -95,7 +96,13 @@ public void testScaleUp() throws IOException, InterruptedException {
IndicesStatsResponse stats = client().admin().indices().prepareStats(dsName).clear().setStore(true).get();
long used = stats.getTotal().getStore().getSizeInBytes();
long maxShardSize = Arrays.stream(stats.getShards()).mapToLong(s -> s.getStats().getStore().sizeInBytes()).max().getAsLong();
long enoughSpace = used + WATERMARK_BYTES + 1;
// As long as usage is above the low watermark, we will trigger a proactive scale up, since the simulated shards have an in-sync
// set and therefore allocating them does not skip the low watermark check in the disk threshold decider.
// Fixing this simulation should be done as a separate effort, but we should still ensure that the low watermark is in effect
// at least when replicas are involved.
long enoughSpace = used + (randomBoolean()
? LOW_WATERMARK_BYTES - 1
: randomLongBetween(HIGH_WATERMARK_BYTES, LOW_WATERMARK_BYTES - 1));

setTotalSpace(dataNodeName, enoughSpace);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ public void testScaleUp() throws InterruptedException {
long used = stats.getTotal().getStore().getSizeInBytes();
long minShardSize = Arrays.stream(stats.getShards()).mapToLong(s -> s.getStats().getStore().sizeInBytes()).min().getAsLong();
long maxShardSize = Arrays.stream(stats.getShards()).mapToLong(s -> s.getStats().getStore().sizeInBytes()).max().getAsLong();
long enoughSpace = used + WATERMARK_BYTES + 1;
long enoughSpace = used + HIGH_WATERMARK_BYTES + 1;

setTotalSpace(dataNodeName, enoughSpace);
GetAutoscalingCapacityAction.Response response = capacity();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,12 @@ private SingleForecast forecast(IndexAbstraction.DataStream stream, long forecas
for (int i = 0; i < numberNewIndices; ++i) {
final String uuid = UUIDs.randomBase64UUID();
dataStream = dataStream.rollover(state.metadata(), uuid, Version.CURRENT);

// This unintentionally copies the in-sync allocation ids too. This has the fortunate effect of these indices
// not being regarded as new by the disk threshold decider, thereby respecting the low watermark threshold even for primaries.
// This is highly desirable, so fixing this to clear the in-sync allocation ids will require a more elaborate solution,
// ensuring at least that when replicas are involved, we still respect the low watermark. This is therefore left as-is
// for now with the intention to fix it in a follow-up.
IndexMetadata newIndex = IndexMetadata.builder(writeIndex)
.index(dataStream.getWriteIndex().getName())
.settings(Settings.builder().put(writeIndex.getSettings()).put(IndexMetadata.SETTING_INDEX_UUID, uuid))
Expand Down

0 comments on commit c5b2ce6

Please sign in to comment.