elastic · DaveCTurner · Oct 8, 2019 · Oct 8, 2019 · Oct 8, 2019
diff --git a/docs/reference/migration/migrate_8_0/allocation.asciidoc b/docs/reference/migration/migrate_8_0/allocation.asciidoc
@@ -20,3 +20,15 @@ block is automatically removed when a node drops below the high watermark
 again, but this behaviour could be disabled by setting the system property
 `es.disk.auto_release_flood_stage_block` to `false`. This behaviour is no
 longer optional, and this system property must now not be set.
+
+[float]
+[[breaking_80_allocation_change_include_relocations_removed]]
+==== Accounting for disk usage of relocating shards no longer optional
+
+By default {es} will account for the sizes of relocating shards when making
+allocation decisions based on the disk usage of the nodes in the cluster. In
+earlier versions the `cluster.routing.allocation.disk.include_relocations`
+setting allowed this accounting to be disabled, which would result in poor
+allocation decisions that might overshoot watermarks and require significant
+extra work to correct. This behaviour is no longer optional, and this setting
+has been removed.
diff --git a/docs/reference/modules/cluster/disk_allocator.asciidoc b/docs/reference/modules/cluster/disk_allocator.asciidoc
@@ -67,18 +67,6 @@ PUT /twitter/_settings
     How often Elasticsearch should check on disk usage for each node in the
     cluster. Defaults to `30s`.
 
-`cluster.routing.allocation.disk.include_relocations`::
-
-    deprecated[7.5, Future versions will always account for relocations.]
-    Defaults to +true+, which means that Elasticsearch will take into account
-    shards that are currently being relocated to the target node when computing
-    a node's disk usage. Taking relocating shards' sizes into account may,
-    however, mean that the disk usage for a node is incorrectly estimated on
-    the high side, since the relocation could be 90% complete and a recently
-    retrieved disk usage would include the total size of the relocating shard
-    as well as the space already used by the running relocation.
-
-
 NOTE: Percentage values refer to used disk space, while byte values refer to
 free disk space. This can be confusing, since it flips the meaning of high and
 low. For example, it makes sense to set the low watermark to 10gb and the high
@@ -100,3 +88,16 @@ PUT _cluster/settings
   }
 }
 --------------------------------------------------
+
+{es} accounts for the future disk usage of ongoing shard relocations and
+recoveries to help prevent these shard movements from breaching a watermark.
+This mechanism may double-count some data that has already been relocated onto
+a node. For instance, if a relocation of a 100GB shard is 90% complete then
+{es} has copied 90GB of data onto the target node. This 90GB consumes disk
+space and will be reflected in the node's disk usage statistics. However {es}
+also treats the relocation as if it will consume another full 100GB in the
+future, even though the shard may really only consume a further 10GB of space.
+If the node's disks are close to a watermark then this may temporarily prevent
+other shards from moving onto the same node. Eventually the relocation will
+complete and then {es} will use the node's true disk usage statistics again.
+
diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettings.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettings.java
@@ -55,9 +55,6 @@ public class DiskThresholdSettings {
             (s) -> validWatermarkSetting(s, "cluster.routing.allocation.disk.watermark.flood_stage"),
             new FloodStageValidator(),
             Setting.Property.Dynamic, Setting.Property.NodeScope);
-    public static final Setting<Boolean> CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS_SETTING =
-        Setting.boolSetting("cluster.routing.allocation.disk.include_relocations", true,
-            Setting.Property.Dynamic, Setting.Property.NodeScope, Setting.Property.Deprecated);
     public static final Setting<TimeValue> CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING =
         Setting.positiveTimeSetting("cluster.routing.allocation.disk.reroute_interval", TimeValue.timeValueSeconds(60),
             Setting.Property.Dynamic, Setting.Property.NodeScope);
@@ -68,7 +65,6 @@ public class DiskThresholdSettings {
     private volatile Double freeDiskThresholdHigh;
     private volatile ByteSizeValue freeBytesThresholdLow;
     private volatile ByteSizeValue freeBytesThresholdHigh;
-    private volatile boolean includeRelocations;
     private volatile boolean enabled;
     private volatile TimeValue rerouteInterval;
     private volatile Double freeDiskThresholdFloodStage;
@@ -90,13 +86,11 @@ public DiskThresholdSettings(Settings settings, ClusterSettings clusterSettings)
         setHighWatermark(highWatermark);
         setLowWatermark(lowWatermark);
         setFloodStage(floodStage);
-        this.includeRelocations = CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS_SETTING.get(settings);
         this.rerouteInterval = CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.get(settings);
         this.enabled = CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.get(settings);
         clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING, this::setLowWatermark);
         clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING, this::setHighWatermark);
         clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING, this::setFloodStage);
-        clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS_SETTING, this::setIncludeRelocations);
         clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING, this::setRerouteInterval);
         clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING, this::setEnabled);
     }
@@ -227,10 +221,6 @@ private static void doValidateAsBytes(final String low, final String high, final
         }
     }
 
-    private void setIncludeRelocations(boolean includeRelocations) {
-        this.includeRelocations = includeRelocations;
-    }
-
     private void setRerouteInterval(TimeValue rerouteInterval) {
         this.rerouteInterval = rerouteInterval;
     }
@@ -300,10 +290,6 @@ public ByteSizeValue getFreeBytesThresholdFloodStage() {
         return freeBytesThresholdFloodStage;
     }
 
-    public boolean includeRelocations() {
-        return includeRelocations;
-    }
-
     public boolean isEnabled() {
         return enabled;
     }

diff --git a/.../main/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDecider.java b/.../main/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDecider.java
@@ -335,23 +335,16 @@ private DiskUsage getDiskUsage(RoutingNode node, RoutingAllocation allocation,
             // If there is no usage, and we have other nodes in the cluster,
             // use the average usage for all nodes as the usage for this node
             usage = averageUsage(node, usages);
-            if (logger.isDebugEnabled()) {
-                logger.debug("unable to determine disk usage for {}, defaulting to average across nodes [{} total] [{} free] [{}% free]",
-                        node.nodeId(), usage.getTotalBytes(), usage.getFreeBytes(), usage.getFreeDiskAsPercentage());
-            }
+            logger.debug("unable to determine disk usage for {}, defaulting to average across nodes [{} total] [{} free] [{}% free]",
+                    node.nodeId(), usage.getTotalBytes(), usage.getFreeBytes(), usage.getFreeDiskAsPercentage());
         }
 
-        if (diskThresholdSettings.includeRelocations()) {
-            long relocatingShardsSize = sizeOfRelocatingShards(node, allocation, subtractLeavingShards, usage.getPath());
-            DiskUsage usageIncludingRelocations = new DiskUsage(node.nodeId(), node.node().getName(), usage.getPath(),
-                    usage.getTotalBytes(), usage.getFreeBytes() - relocatingShardsSize);
-            if (logger.isTraceEnabled()) {
-                logger.trace("usage without relocations: {}", usage);
-                logger.trace("usage with relocations: [{} bytes] {}", relocatingShardsSize, usageIncludingRelocations);
-            }
-            usage = usageIncludingRelocations;
-        }
-        return usage;
+        final long relocatingShardsSize = sizeOfRelocatingShards(node, allocation, subtractLeavingShards, usage.getPath());
+        final DiskUsage usageIncludingRelocations = new DiskUsage(node.nodeId(), node.node().getName(), usage.getPath(),
+                usage.getTotalBytes(), usage.getFreeBytes() - relocatingShardsSize);
+        logger.trace("getDiskUsage: usage [{}] with [{}] bytes relocating yields [{}]",
+                     usage, relocatingShardsSize, usageIncludingRelocations);
+        return usageIncludingRelocations;
     }
 
     /**

diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java
@@ -213,7 +213,6 @@ public void apply(Settings value, Settings current, Settings previous) {
             DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING,
             DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING,
             DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING,
-            DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS_SETTING,
             DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING,
             SameShardAllocationDecider.CLUSTER_ROUTING_ALLOCATION_SAME_HOST_SETTING,
             InternalClusterInfoService.INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL_SETTING,

diff --git a/...rc/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettingsTests.java b/...rc/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettingsTests.java
@@ -44,7 +44,6 @@ public void testDefaults() {
         assertEquals(15.0D, diskThresholdSettings.getFreeDiskThresholdLow(), 0.0D);
         assertEquals(60L, diskThresholdSettings.getRerouteInterval().seconds());
         assertTrue(diskThresholdSettings.isEnabled());
-        assertTrue(diskThresholdSettings.includeRelocations());
         assertEquals(zeroBytes, diskThresholdSettings.getFreeBytesThresholdFloodStage());
         assertEquals(5.0D, diskThresholdSettings.getFreeDiskThresholdFloodStage(), 0.0D);
     }
@@ -55,7 +54,6 @@ public void testUpdate() {
 
         Settings newSettings = Settings.builder()
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), false)
-            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS_SETTING.getKey(), false)
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "500mb")
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "1000mb")
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "250mb")
@@ -71,10 +69,6 @@ public void testUpdate() {
         assertEquals(0.0D, diskThresholdSettings.getFreeDiskThresholdFloodStage(), 0.0D);
         assertEquals(30L, diskThresholdSettings.getRerouteInterval().seconds());
         assertFalse(diskThresholdSettings.isEnabled());
-        assertFalse(diskThresholdSettings.includeRelocations());
-
-        assertWarnings("[cluster.routing.allocation.disk.include_relocations] setting was deprecated in Elasticsearch and " +
-            "will be removed in a future release! See the breaking changes documentation for the next major version.");
     }
 
     public void testInvalidConstruction() {