Skip to content

Commit

Permalink
Set recovery rate for dedicated cold nodes (elastic#68480)
Browse files Browse the repository at this point in the history
This commit sets the recovery rate for dedicated cold nodes. The goal
here is to enhance performance of recovery in a dedicated cold tier, where
we expect such nodes to be predominantly using searchable snapshots to
back the indices located on them. This commit follows a simple approach
where we increase the recovery rate as a function of the node size, for
nodes that appear to be dedicated cold nodes.
  • Loading branch information
jasontedor committed Feb 4, 2021
1 parent eab1bda commit 35e2de9
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 3 deletions.
16 changes: 15 additions & 1 deletion docs/reference/modules/indices/recovery.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,21 @@ You can view a list of in-progress and completed recoveries using the
`indices.recovery.max_bytes_per_sec`::
(<<cluster-update-settings,Dynamic>>) Limits total inbound and outbound
recovery traffic for each node. Applies to both peer recoveries as well
as snapshot recoveries (i.e., restores from a snapshot). Defaults to `40mb`
unless the node is a <<cold-tier, dedicated cold node>>, in which case the
default relates to the total memory available to the node:

.Recovery Rate for Cold Nodes
[options="header"]
|======
|total memory | default value
|<= 4 GB | 40 MB/s
|> 4 GB and <= 8 GB | 60 MB/s
|> 8 GB and <= 16 GB | 90 MB/s
|> 16 GB and <= 32 GB | 125 MB/s
|> 32 GB | 250 MB/s
|======

+
This limit applies to each node separately. If multiple nodes in a cluster
perform recoveries at the same time, the cluster's total recovery traffic may
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,69 @@
import org.apache.logging.log4j.Logger;
import org.apache.lucene.store.RateLimiter;
import org.apache.lucene.store.RateLimiter.SimpleRateLimiter;
import org.elasticsearch.bootstrap.JavaVersion;
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Setting.Property;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.monitor.os.OsProbe;
import org.elasticsearch.node.NodeRoleSettings;

import java.util.List;
import java.util.stream.Collectors;

public class RecoverySettings {

    // Class-scoped logger for recovery-settings diagnostics.
    private static final Logger logger = LogManager.getLogger(RecoverySettings.class);

public static final Setting<ByteSizeValue> INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING =
Setting.byteSizeSetting("indices.recovery.max_bytes_per_sec", new ByteSizeValue(40, ByteSizeUnit.MB),
Property.Dynamic, Property.NodeScope);
Setting.byteSizeSetting(
"indices.recovery.max_bytes_per_sec",
s -> {
final ByteSizeValue defaultMaxBytesPerSec = new ByteSizeValue(40, ByteSizeUnit.MB);
final List<DiscoveryNodeRole> roles = NodeRoleSettings.NODE_ROLES_SETTING.get(s);
final List<DiscoveryNodeRole> dataRoles =
roles.stream().filter(DiscoveryNodeRole::canContainData).collect(Collectors.toList());
if (dataRoles.isEmpty()) {
// if the node is not a data node, this value doesn't matter, use the default
return defaultMaxBytesPerSec.getStringRep();
}
if ((dataRoles.size() > 1 || dataRoles.get(0).roleName().equals("data_cold") == false) ||
roles.contains(DiscoveryNodeRole.MASTER_ROLE)) {
// if the node is not a dedicated cold node, use the default
return defaultMaxBytesPerSec.getStringRep();
}
/*
* Now we are looking at a node that has a single data role, that data role is the cold data role, and the node does not
* have the master role. In this case, we are going to set the recovery size as a function of the memory size. We are making
* an assumption here that the size of the instance is correlated with I/O resources. That is we are assuming that the
* larger the instance, the more disk and networking capacity it has available.
*/
if (JavaVersion.current().compareTo(JavaVersion.parse("14")) < 0) {
// prior to JDK 14, the JDK did not take into consideration container memory limits when reporting total system memory
return defaultMaxBytesPerSec.getStringRep();
}
final ByteSizeValue totalPhysicalMemory = new ByteSizeValue(OsProbe.getInstance().getTotalPhysicalMemorySize());
final ByteSizeValue maxBytesPerSec;
if (totalPhysicalMemory.compareTo(new ByteSizeValue(4, ByteSizeUnit.GB)) <= 0) {
maxBytesPerSec = new ByteSizeValue(40, ByteSizeUnit.MB);
} else if (totalPhysicalMemory.compareTo(new ByteSizeValue(8, ByteSizeUnit.GB)) <= 0) {
maxBytesPerSec = new ByteSizeValue(60, ByteSizeUnit.MB);
} else if (totalPhysicalMemory.compareTo(new ByteSizeValue(16, ByteSizeUnit.GB)) <= 0) {
maxBytesPerSec = new ByteSizeValue(90, ByteSizeUnit.MB);
} else if (totalPhysicalMemory.compareTo(new ByteSizeValue(32, ByteSizeUnit.GB)) <= 0) {
maxBytesPerSec = new ByteSizeValue(125, ByteSizeUnit.MB);
} else {
maxBytesPerSec = new ByteSizeValue(250, ByteSizeUnit.MB);
}
return maxBytesPerSec.getStringRep();
},
Property.Dynamic,
Property.NodeScope);

/**
* Controls the maximum number of file chunk requests that can be sent concurrently from the source node to the target node.
Expand Down

0 comments on commit 35e2de9

Please sign in to comment.