Skip to content

Commit

Permalink
Set recovery rate for dedicated cold nodes (elastic#68480)
Browse files Browse the repository at this point in the history
This commit sets the recovery rate for dedicated cold nodes. The goal
here is to enhance performance of recovery in a dedicated cold tier, where
we expect such nodes to be predominantly using searchable snapshots to
back the indices located on them. This commit follows a simple approach
where we increase the recovery rate as a function of the node size, for
nodes that appear to be dedicated cold nodes.
  • Loading branch information
jasontedor committed Feb 4, 2021
1 parent eab1bda commit 35e2de9
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 3 deletions.
16 changes: 15 additions & 1 deletion docs/reference/modules/indices/recovery.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,21 @@ You can view a list of in-progress and completed recoveries using the
`indices.recovery.max_bytes_per_sec`::
(<<cluster-update-settings,Dynamic>>) Limits total inbound and outbound
recovery traffic for each node. Applies to both peer recoveries as well
as snapshot recoveries (i.e., restores from a snapshot). Defaults to `40mb`
unless the node is a <<cold-tier, dedicated cold node>>, in which case the
default relates to the total memory available to the node:

.Recovery Rate for Cold Nodes
[options="header"]
|======
|total memory | default value
|<= 4 GB | 40 MB/s
|> 4 GB and <= 8 GB | 60 MB/s
|> 8 GB and <= 16 GB | 90 MB/s
|> 16 GB and <= 32 GB | 125 MB/s
|> 32 GB | 250 MB/s
|======

+
This limit applies to each node separately. If multiple nodes in a cluster
perform recoveries at the same time, the cluster's total recovery traffic may
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,69 @@
import org.apache.logging.log4j.Logger;
import org.apache.lucene.store.RateLimiter;
import org.apache.lucene.store.RateLimiter.SimpleRateLimiter;
import org.elasticsearch.bootstrap.JavaVersion;
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Setting.Property;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.monitor.os.OsProbe;
import org.elasticsearch.node.NodeRoleSettings;

import java.util.List;
import java.util.stream.Collectors;

public class RecoverySettings {

    // Class-scoped logger for recovery-settings diagnostics.
    private static final Logger logger = LogManager.getLogger(RecoverySettings.class);

public static final Setting<ByteSizeValue> INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING =
Setting.byteSizeSetting("indices.recovery.max_bytes_per_sec", new ByteSizeValue(40, ByteSizeUnit.MB),
Property.Dynamic, Property.NodeScope);
Setting.byteSizeSetting(
"indices.recovery.max_bytes_per_sec",
s -> {
final ByteSizeValue defaultMaxBytesPerSec = new ByteSizeValue(40, ByteSizeUnit.MB);
final List<DiscoveryNodeRole> roles = NodeRoleSettings.NODE_ROLES_SETTING.get(s);
final List<DiscoveryNodeRole> dataRoles =
roles.stream().filter(DiscoveryNodeRole::canContainData).collect(Collectors.toList());
if (dataRoles.isEmpty()) {
// if the node is not a data node, this value doesn't matter, use the default
return defaultMaxBytesPerSec.getStringRep();
}
if ((dataRoles.size() > 1 || dataRoles.get(0).roleName().equals("data_cold") == false) ||
roles.contains(DiscoveryNodeRole.MASTER_ROLE)) {
// if the node is not a dedicated cold node, use the default
return defaultMaxBytesPerSec.getStringRep();
}
/*
* Now we are looking at a node that has a single data role, that data role is the cold data role, and the node does not
* have the master role. In this case, we are going to set the recovery size as a function of the memory size. We are making
* an assumption here that the size of the instance is correlated with I/O resources. That is we are assuming that the
* larger the instance, the more disk and networking capacity it has available.
*/
if (JavaVersion.current().compareTo(JavaVersion.parse("14")) < 0) {
// prior to JDK 14, the JDK did not take into consideration container memory limits when reporting total system memory
return defaultMaxBytesPerSec.getStringRep();
}
final ByteSizeValue totalPhysicalMemory = new ByteSizeValue(OsProbe.getInstance().getTotalPhysicalMemorySize());
final ByteSizeValue maxBytesPerSec;
if (totalPhysicalMemory.compareTo(new ByteSizeValue(4, ByteSizeUnit.GB)) <= 0) {
maxBytesPerSec = new ByteSizeValue(40, ByteSizeUnit.MB);
} else if (totalPhysicalMemory.compareTo(new ByteSizeValue(8, ByteSizeUnit.GB)) <= 0) {
maxBytesPerSec = new ByteSizeValue(60, ByteSizeUnit.MB);
} else if (totalPhysicalMemory.compareTo(new ByteSizeValue(16, ByteSizeUnit.GB)) <= 0) {
maxBytesPerSec = new ByteSizeValue(90, ByteSizeUnit.MB);
} else if (totalPhysicalMemory.compareTo(new ByteSizeValue(32, ByteSizeUnit.GB)) <= 0) {
maxBytesPerSec = new ByteSizeValue(125, ByteSizeUnit.MB);
} else {
maxBytesPerSec = new ByteSizeValue(250, ByteSizeUnit.MB);
}
return maxBytesPerSec.getStringRep();
},
Property.Dynamic,
Property.NodeScope);

/**
* Controls the maximum number of file chunk requests that can be sent concurrently from the source node to the target node.
Expand Down

0 comments on commit 35e2de9

Please sign in to comment.