Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Zen2] Introduce auto_shrink_voting_configuration setting #35217

Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@
import java.util.stream.StreamSupport;

import static java.util.Collections.emptySet;
import static org.elasticsearch.cluster.coordination.Reconfigurator.CLUSTER_MASTER_NODES_FAILURE_TOLERANCE;
import static org.elasticsearch.discovery.DiscoverySettings.NO_MASTER_BLOCK_WRITES;
import static org.elasticsearch.gateway.GatewayService.STATE_NOT_RECOVERED_BLOCK;

Expand Down Expand Up @@ -593,8 +592,6 @@ public void setInitialConfiguration(final VotingConfiguration votingConfiguratio
MetaData.Builder metaDataBuilder = MetaData.builder();
// automatically generate a UID for the metadata if we need to
metaDataBuilder.generateClusterUuidIfNeeded(); // TODO generate UUID in bootstrapping tool?
metaDataBuilder.persistentSettings(Settings.builder().put(CLUSTER_MASTER_NODES_FAILURE_TOLERANCE.getKey(),
(votingConfiguration.getNodeIds().size() - 1) / 2).build()); // TODO set this in bootstrapping tool?
builder.metaData(metaDataBuilder);
coordinationState.get().setInitialState(builder.build());
preVoteCollector.update(getPreVoteResponse(), null); // pick up the change to last-accepted version
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,29 +41,32 @@ public class Reconfigurator extends AbstractComponent {

/**
* The cluster usually requires a vote from at least half of the master nodes in order to commit a cluster state update, and to achieve
* this it makes automatic adjustments to the quorum size as master nodes join or leave the cluster. However, if master nodes leave the
* cluster slowly enough then these automatic adjustments can end up with a single master node; if this last node were to fail then the
* cluster would be rendered permanently unavailable. Instead it may be preferable to stop processing cluster state updates and become
* unavailable when the second-last (more generally, n'th-last) node leaves the cluster, so that the cluster is never in a situation
* where a single node's failure can cause permanent unavailability. This setting determines the size of the smallest set of master
* nodes required to process a cluster state update.
* the best resilience it makes automatic adjustments to the voting configuration as master nodes join or leave the cluster. Adjustments
* that fix or increase the size of the voting configuration are always a good idea, but the wisdom of reducing the voting configuration
* size is less clear. For instance, automatically reducing the voting configuration down to a single node means the cluster requires
* this node to operate, which is not resilient: if it broke we could restore every other master-eligible node in the cluster to health
 * and still the cluster would be unavailable. However, not reducing the voting configuration size can also hamper resilience: in a
* five-node cluster we could lose two nodes and by reducing the voting configuration to the remaining three nodes we could tolerate the
* loss of a further node before failing.
*
* We offer two options: either we auto-shrink the voting configuration as long as it contains more than three nodes, or we don't and we
 * require the user to control the voting configuration manually using the retirement API. The former (default) option guarantees that
* as long as there have been at least three master-eligible nodes in the cluster and no more than one of them is currently unavailable,
* then the cluster will still operate, which is what almost everyone wants. Manual control is for users who want different guarantees.
*/
public static final Setting<Integer> CLUSTER_MASTER_NODES_FAILURE_TOLERANCE =
Setting.intSetting("cluster.master_nodes_failure_tolerance", 0, 0, Property.NodeScope, Property.Dynamic);
// the default is not supposed to be important since we expect to set this setting explicitly at bootstrapping time
// TODO contemplate setting the default to something larger than 0 (1? 1<<30?)
// TODO prevent this being set as a transient or a per-node setting?
public static final Setting<Boolean> CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION =
Setting.boolSetting("cluster.auto_shrink_voting_configuration", true, Property.NodeScope, Property.Dynamic);

private volatile int masterNodesFailureTolerance;
private volatile boolean autoShrinkVotingConfiguration;

public Reconfigurator(Settings settings, ClusterSettings clusterSettings) {
super(settings);
masterNodesFailureTolerance = CLUSTER_MASTER_NODES_FAILURE_TOLERANCE.get(settings);
clusterSettings.addSettingsUpdateConsumer(CLUSTER_MASTER_NODES_FAILURE_TOLERANCE, this::setMasterNodesFailureTolerance);
autoShrinkVotingConfiguration = CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION.get(settings);
clusterSettings.addSettingsUpdateConsumer(CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION, this::setAutoShrinkVotingConfiguration);
}

public void setMasterNodesFailureTolerance(int masterNodesFailureTolerance) {
this.masterNodesFailureTolerance = masterNodesFailureTolerance;
public void setAutoShrinkVotingConfiguration(boolean autoShrinkVotingConfiguration) {
this.autoShrinkVotingConfiguration = autoShrinkVotingConfiguration;
}

private static int roundDownToOdd(int size) {
Expand All @@ -73,7 +76,7 @@ private static int roundDownToOdd(int size) {
@Override
public String toString() {
return "Reconfigurator{" +
"masterNodesFailureTolerance=" + masterNodesFailureTolerance +
"autoShrinkVotingConfiguration=" + autoShrinkVotingConfiguration +
'}';
}

Expand All @@ -92,22 +95,26 @@ public ClusterState.VotingConfiguration reconfigure(Set<DiscoveryNode> liveNodes
ClusterState.VotingConfiguration currentConfig) {
logger.trace("{} reconfiguring {} based on liveNodes={}, retiredNodeIds={}", this, currentConfig, liveNodes, retiredNodeIds);

final int safeConfigurationSize = 2 * masterNodesFailureTolerance + 1;
if (currentConfig.getNodeIds().size() < safeConfigurationSize) {
throw new AssertionError(currentConfig + " is smaller than expected " + safeConfigurationSize);
// The new configuration may only shrink if CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION is true, and even then may not shrink
// to fewer than 3 nodes.
final int minimumConfigurationSize;
if (autoShrinkVotingConfiguration) {
minimumConfigurationSize = Math.min(roundDownToOdd(currentConfig.getNodeIds().size()), 3);
} else {
minimumConfigurationSize = currentConfig.getNodeIds().size();
}

/*
* There are three true/false properties of each node in play: live/non-live, retired/non-retired and in-config/not-in-config.
* Firstly we divide the nodes into disjoint sets based on these properties:
*
* - retiredInConfigNotLiveIds
* - nonRetiredInConfigNotLiveIds
* - retiredInConfigLiveIds
* - nonRetiredInConfigLiveIds
* - nonRetiredLiveNotInConfigIds
*
* The other 3 possibilities are not relevant:
* The other 5 possibilities are not relevant:
* - retired, in-config, live -- retired nodes should be removed from the config
* - retired, in-config, non-live -- retired nodes should be removed from the config
* - retired, not-in-config, live -- cannot add a retired node back to the config
* - retired, not-in-config, non-live -- cannot add a retired node back to the config
* - non-retired, non-live, not-in-config -- no evidence this node exists at all
Expand All @@ -119,39 +126,40 @@ public ClusterState.VotingConfiguration reconfigure(Set<DiscoveryNode> liveNodes
liveInConfigIds.retainAll(liveNodeIds);

final Set<String> inConfigNotLiveIds = Sets.sortedDifference(currentConfig.getNodeIds(), liveInConfigIds);
final Set<String> retiredInConfigNotLiveIds = new TreeSet<>(inConfigNotLiveIds);
retiredInConfigNotLiveIds.retainAll(retiredNodeIds);
final Set<String> nonRetiredInConfigNotLiveIds = new TreeSet<>(inConfigNotLiveIds);
nonRetiredInConfigNotLiveIds.removeAll(retiredInConfigNotLiveIds);
nonRetiredInConfigNotLiveIds.removeAll(retiredNodeIds);

final Set<String> retiredInConfigLiveIds = new TreeSet<>(liveInConfigIds);
retiredInConfigLiveIds.retainAll(retiredNodeIds);
final Set<String> nonRetiredInConfigLiveIds = new TreeSet<>(liveInConfigIds);
nonRetiredInConfigLiveIds.removeAll(retiredInConfigLiveIds);
nonRetiredInConfigLiveIds.removeAll(retiredNodeIds);

final Set<String> nonRetiredLiveNotInConfigIds = Sets.sortedDifference(liveNodeIds, currentConfig.getNodeIds());
nonRetiredLiveNotInConfigIds.removeAll(retiredNodeIds);

/*
* Now we work out how many nodes should be in the configuration:
*/
final int targetSize;

// ideally we want the configuration to be all the non-retired live nodes ...
final int nonRetiredLiveNodeCount = nonRetiredInConfigLiveIds.size() + nonRetiredLiveNotInConfigIds.size();

// ... except one, if even, because odd configurations are slightly more resilient ...
final int votingNodeCount = roundDownToOdd(nonRetiredLiveNodeCount);

// ... except that the new configuration must satisfy CLUSTER_MASTER_NODES_FAILURE_TOLERANCE too:
final int targetSize = Math.max(votingNodeCount, safeConfigurationSize);
final int nonRetiredConfigSize = nonRetiredInConfigLiveIds.size() + nonRetiredInConfigNotLiveIds.size();
if (autoShrinkVotingConfiguration) {
if (nonRetiredLiveNodeCount >= 3) {
targetSize = roundDownToOdd(nonRetiredLiveNodeCount);
} else {
// only have one or two available nodes; may not shrink below 3 nodes automatically, but if
// the config (excluding retired nodes) is already smaller than 3 then it's ok.
targetSize = nonRetiredConfigSize < 3 ? 1 : 3;
}
} else {
targetSize = Math.max(roundDownToOdd(nonRetiredLiveNodeCount), nonRetiredConfigSize);
}

/*
* The new configuration is formed by taking this many nodes in the following preference order:
*/
final ClusterState.VotingConfiguration newConfig = new ClusterState.VotingConfiguration(
Stream.of(nonRetiredInConfigLiveIds, nonRetiredLiveNotInConfigIds, // live nodes first, preferring the current config
retiredInConfigLiveIds, // if we need more, first use retired nodes that are still alive and haven't been removed yet
nonRetiredInConfigNotLiveIds, retiredInConfigNotLiveIds) // if we need more, use non-live nodes
// live nodes first, preferring the current config, and if we need more then use non-live nodes
Stream.of(nonRetiredInConfigLiveIds, nonRetiredLiveNotInConfigIds, nonRetiredInConfigNotLiveIds)
.flatMap(Collection::stream).limit(targetSize).collect(Collectors.toSet()));

if (newConfig.hasQuorum(liveNodeIds)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -452,11 +452,10 @@ public void apply(Settings value, Settings current, Settings previous) {
ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING,
ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING,
ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING,
Coordinator.PUBLISH_TIMEOUT_SETTING,
ElectionSchedulerFactory.ELECTION_DURATION_SETTING,
Coordinator.PUBLISH_TIMEOUT_SETTING,
JoinHelper.JOIN_TIMEOUT_SETTING,
Reconfigurator.CLUSTER_MASTER_NODES_FAILURE_TOLERANCE
Reconfigurator.CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION
)));

public static List<SettingUpgrader<?>> BUILT_IN_SETTING_UPGRADERS = Collections.unmodifiableList(Arrays.asList(
Expand Down
Loading