From d1a4204d9e62357b229230f7316748f040cfde48 Mon Sep 17 00:00:00 2001 From: David Turner Date: Thu, 20 Sep 2018 21:42:55 +0100 Subject: [PATCH 1/9] Calculate optimal cluster configuration We wish to commit a cluster state update after having received a response from more than half of the master-eligible nodes in the cluster. This is optimal: requiring either more or fewer votes than half harms resilience. For instance if we have three master nodes then, we want to be able to commit a cluster state after receiving responses from any two nodes; requiring responses from all three is clearly not resilient to the failure of any node, and if we could commit an update after a response from just one node then that node would be required for every commit, which is also not resilient. However, this means we must adjust the configuration (the set of voting nodes in the cluster) whenever a master-eligible node joins or leaves. The calculation of the best configuration for the cluster is the job of the Reconfigurator, introduced here. --- .../cluster/coordination/Reconfigurator.java | 124 +++++++++++++ .../common/settings/ClusterSettings.java | 4 +- .../coordination/ReconfiguratorTests.java | 167 ++++++++++++++++++ 3 files changed, 294 insertions(+), 1 deletion(-) create mode 100644 server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java create mode 100644 server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java new file mode 100644 index 0000000000000..2557588d71fb5 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java @@ -0,0 +1,124 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.cluster.coordination; + +import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.common.component.AbstractComponent; +import org.elasticsearch.common.settings.ClusterSettings; +import org.elasticsearch.common.settings.Setting; +import org.elasticsearch.common.settings.Setting.Property; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.set.Sets; + +import java.util.Collection; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Computes the optimal configuration of voting nodes in the cluster. + */ +public class Reconfigurator extends AbstractComponent { + + /** + * The cluster usually requires a vote from at least half of the master nodes in order to commit a cluster state update, and to achieve + * this it makes automatic adjustments to the quorum size as master nodes join or leave the cluster. However, if master nodes leave the + * cluster slowly enough then these automatic adjustments can end up with a single master node; if this last node were to fail then the + * cluster would be rendered permanently unavailable. Instead it may be preferable to stop processing cluster state updates and become + * unavailable when the second-last (more generally, n'th-last) node leaves the cluster, so that the cluster is never in a situation + * where a single node's failure can cause permanent unavailability. This setting determines the size of the smallest set of master + * nodes required to process a cluster state update. + */ + public static final Setting MINIMUM_VOTING_MASTER_NODES_SETTING = + Setting.intSetting("cluster.minimum_voting_master_nodes", 1, 1, Property.NodeScope, Property.Dynamic); + + private int minVotingMasterNodes; + + public Reconfigurator(Settings settings, ClusterSettings clusterSettings) { + super(settings); + minVotingMasterNodes = MINIMUM_VOTING_MASTER_NODES_SETTING.get(settings); + clusterSettings.addSettingsUpdateConsumer(MINIMUM_VOTING_MASTER_NODES_SETTING, this::setMinVotingMasterNodes); + } + + public void setMinVotingMasterNodes(int minVotingMasterNodes) { + this.minVotingMasterNodes = minVotingMasterNodes; + } + + private static int roundDownToOdd(int size) { + return size - (size % 2 == 0 ? 1 : 0); + } + + @Override + public String toString() { + return "Reconfigurator{" + + "minVotingMasterNodes=" + minVotingMasterNodes + + '}'; + } + + /** + * Compute an optimal configuration for the cluster. + * @param liveNodes The live nodes in the cluster. The optimal configuration prefers live nodes over non-live nodes as far as possible. + * @param retiredNodeIds Nodes that are leaving the cluster and which should not appear in the configuration if possible. Nodes that are + * retired and not in the current configuration will never appear in the resulting configuration; this is useful + * for shifting the vote in a 2-node cluster so one of the nodes can be restarted without harming availability. + * @param currentConfig The current configuration. As far as possible, we prefer to keep the current config as-is. + * @return An optimal configuration, or leave the current configuration unchanged if the optimal configuration has no live quorum. + */ + public ClusterState.VotingConfiguration reconfigure(Set liveNodes, Set retiredNodeIds, + ClusterState.VotingConfiguration currentConfig) { + logger.trace("{} reconfiguring {} based on liveNodes={}, retiredNodeIds={}", this, currentConfig, liveNodes, retiredNodeIds); + + final Set liveNodeIds = liveNodes.stream() + .filter(DiscoveryNode::isMasterNode).map(DiscoveryNode::getId).collect(Collectors.toSet()); + final Set liveInConfigIds = new TreeSet<>(currentConfig.getNodeIds()); + liveInConfigIds.retainAll(liveNodeIds); + + final Set inConfigNotLiveIds = Sets.sortedDifference(currentConfig.getNodeIds(), liveInConfigIds); + final Set retiredInConfigNotLiveIds = new TreeSet<>(inConfigNotLiveIds); + retiredInConfigNotLiveIds.retainAll(retiredNodeIds); + final Set nonRetiredInConfigNotLiveIds = new TreeSet<>(inConfigNotLiveIds); + nonRetiredInConfigNotLiveIds.removeAll(retiredInConfigNotLiveIds); + + final Set retiredLiveInConfigIds = new TreeSet<>(liveInConfigIds); + retiredLiveInConfigIds.retainAll(retiredNodeIds); + final Set nonRetiredLiveInConfigIds = new TreeSet<>(liveInConfigIds); + nonRetiredLiveInConfigIds.removeAll(retiredLiveInConfigIds); + + final Set nonRetiredLiveNotInConfigIds = Sets.sortedDifference(liveNodeIds, currentConfig.getNodeIds()); + nonRetiredLiveNotInConfigIds.removeAll(retiredNodeIds); + + final int targetSize = Math.max(roundDownToOdd(nonRetiredLiveInConfigIds.size() + nonRetiredLiveNotInConfigIds.size()), + 2 * minVotingMasterNodes - 1); + + final ClusterState.VotingConfiguration newConfig = new ClusterState.VotingConfiguration( + Stream.of(nonRetiredLiveInConfigIds, nonRetiredLiveNotInConfigIds, // live nodes first, preferring the current config + retiredLiveInConfigIds, // if we need more, first use retired nodes that are still alive and haven't been removed yet + nonRetiredInConfigNotLiveIds, retiredInConfigNotLiveIds) // if we need more, use non-live nodes + .flatMap(Collection::stream).limit(targetSize).collect(Collectors.toSet())); + + if (newConfig.hasQuorum(liveNodeIds)) { + return newConfig; + } else { + return currentConfig; + } + } +} diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java index 3ded6f78f1c88..8d9f9ca13da7d 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java @@ -32,6 +32,7 @@ import org.elasticsearch.cluster.action.index.MappingUpdatedAction; import org.elasticsearch.cluster.coordination.Coordinator; import org.elasticsearch.cluster.coordination.ElectionSchedulerFactory; +import org.elasticsearch.cluster.coordination.Reconfigurator; import org.elasticsearch.cluster.metadata.IndexGraveyard; import org.elasticsearch.cluster.metadata.MetaData; import org.elasticsearch.cluster.routing.OperationRouting; @@ -448,7 +449,8 @@ public void apply(Settings value, Settings current, Settings previous) { ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING, ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING, ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING, - Coordinator.PUBLISH_TIMEOUT_SETTING + Coordinator.PUBLISH_TIMEOUT_SETTING, + Reconfigurator.MINIMUM_VOTING_MASTER_NODES_SETTING ))); public static List> BUILT_IN_SETTING_UPGRADERS = Collections.unmodifiableList(Arrays.asList( diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java new file mode 100644 index 0000000000000..f65a8b8837031 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java @@ -0,0 +1,167 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.cluster.coordination; + +import org.elasticsearch.Version; +import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.ClusterState.VotingConfiguration; +import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.common.settings.ClusterSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.set.Sets; +import org.elasticsearch.test.ESTestCase; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import static java.util.Collections.emptySet; +import static org.elasticsearch.cluster.coordination.Reconfigurator.MINIMUM_VOTING_MASTER_NODES_SETTING; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.sameInstance; + +public class ReconfiguratorTests extends ESTestCase { + + public void testReconfigurationExamples() { + check(nodes("a"), conf("a"), randomIntBetween(1, 10), conf("a")); + check(nodes("a", "b"), conf("a"), 1, conf("a")); + check(nodes("a", "b"), conf("a"), randomIntBetween(2, 10), conf("a", "b")); + check(nodes("a", "b", "c"), conf("a"), randomIntBetween(1, 10), conf("a", "b", "c")); + check(nodes("a", "b", "c"), conf("a", "b"), randomIntBetween(1, 10), conf("a", "b", "c")); + check(nodes("a", "b", "c"), conf("a", "b", "c"), randomIntBetween(1, 10), conf("a", "b", "c")); + check(nodes("a", "b", "c", "d"), conf("a", "b", "c"), randomIntBetween(1, 2), conf("a", "b", "c")); + check(nodes("a", "b", "c", "d"), conf("a", "b", "c"), randomIntBetween(3, 10), conf("a", "b", "c", "d")); + check(nodes("a", "b", "c", "d", "e"), conf("a", "b", "c"), randomIntBetween(1, 10), conf("a", "b", "c", "d", "e")); + check(nodes("a", "b"), conf("a", "b", "e"), 1, conf("a")); + check(nodes("a", "b"), conf("a", "b", "e"), randomIntBetween(2, 10), conf("a", "b", "e")); + check(nodes("a", "b", "c"), conf("a", "b", "e"), randomIntBetween(1, 2), conf("a", "b", "c")); + check(nodes("a", "b", "c"), conf("a", "b", "e"), randomIntBetween(3, 10), conf("a", "b", "c", "e")); + check(nodes("a", "b", "c", "d"), conf("a", "b", "e"), randomIntBetween(1, 2), conf("a", "b", "c")); + check(nodes("a", "b", "c", "d"), conf("a", "b", "e"), randomIntBetween(3, 10), conf("a", "b", "c", "d", "e")); + check(nodes("a", "b", "c", "d", "e"), conf("a", "f", "g"), randomIntBetween(1, 3), conf("a", "b", "c", "d", "e")); + check(nodes("a", "b", "c", "d", "e"), conf("a", "f", "g"), randomIntBetween(4, 10), conf("a", "b", "c", "d", "e", "f", "g")); + + // Retiring a single node shifts the votes elsewhere if possible. + check(nodes("a", "b"), retired("a"), conf("a"), 1, conf("b")); + check(nodes("a", "b"), retired("a"), conf("a"), 2, conf("a", "b")); // below min voter size, so no retirement takes place + check(nodes("a", "b"), retired("a"), conf("b"), 2, conf("b")); + check(nodes("a", "b", "c"), retired("a"), conf("a"), 1, conf("b")); + check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), 1, conf("b")); + check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), 2, conf("a", "b", "c")); + + // 7 nodes, one for each combination of live/retired/current. Ideally want the config to be the non-retired live nodes. + // Since there are 2 non-retired live nodes we round down to 1 and just use the one that's already in the config. + check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), 1, conf("a")); + // If we want the config to be at least 3 nodes then we don't retire "c" just yet. + check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), 2, + conf("a", "b", "c")); + // If we want the config to be at least 5 nodes then we keep the old config and add the new node "b" to it. + check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), randomIntBetween(3, 10), + conf("a", "b", "c", "d", "e")); + } + + public void testReconfigurationProperty() { + final String[] allNodes = new String[]{"a", "b", "c", "d", "e", "f", "g"}; + + final String[] liveNodes = new String[randomIntBetween(1, allNodes.length)]; + randomSubsetOf(liveNodes.length, allNodes).toArray(liveNodes); + + final String[] initialVotingNodes = new String[randomIntBetween(1, allNodes.length)]; + randomSubsetOf(initialVotingNodes.length, allNodes).toArray(initialVotingNodes); + + final int minVotingMasterNodes = randomIntBetween(1, allNodes.length + 1); + + final Reconfigurator reconfigurator = makeReconfigurator( + Settings.builder().put(MINIMUM_VOTING_MASTER_NODES_SETTING.getKey(), minVotingMasterNodes).build()); + final Set liveNodesSet = nodes(liveNodes); + final ClusterState.VotingConfiguration initialConfig = conf(initialVotingNodes); + final ClusterState.VotingConfiguration finalConfig = reconfigurator.reconfigure(liveNodesSet, emptySet(), initialConfig); + + // min configuration size comes from MINIMUM_VOTING_MASTER_NODES_SETTING as long as there are enough nodes in play + final Set mentionedNodes = Sets.union(Sets.newHashSet(liveNodes), Sets.newHashSet(initialVotingNodes)); + final int minConfigurationSize = Math.min(2 * minVotingMasterNodes - 1, mentionedNodes.size()); + + // actual size of a quorum: half the configured nodes, which is all the live nodes plus maybe some dead ones to make up numbers + final int quorumSize = Math.max(liveNodes.length, minConfigurationSize) / 2 + 1; + + if (quorumSize > liveNodes.length) { + assertFalse("reconfigure " + liveNodesSet + " from " + initialConfig + " with min of " + minVotingMasterNodes + + " yielded " + finalConfig + " without a live quorum", finalConfig.hasQuorum(Arrays.asList(liveNodes))); + } else { + final List expectedQuorum = randomSubsetOf(quorumSize, liveNodes); + assertTrue("reconfigure " + liveNodesSet + " from " + initialConfig + " with min of " + minVotingMasterNodes + + " yielded " + finalConfig + " with quorum of " + expectedQuorum, + finalConfig.hasQuorum(expectedQuorum)); + } + } + + private ClusterState.VotingConfiguration conf(String... nodes) { + return new ClusterState.VotingConfiguration(Sets.newHashSet(nodes)); + } + + private Set nodes(String... nodes) { + final Set liveNodes = new HashSet<>(); + for (String id : nodes) { + liveNodes.add(new DiscoveryNode(id, buildNewFakeTransportAddress(), Version.CURRENT)); + } + return liveNodes; + } + + private Set retired(String... nodes) { + return Arrays.stream(nodes).collect(Collectors.toSet()); + } + + private void check(Set liveNodes, ClusterState.VotingConfiguration config, int minVoterSize, + ClusterState.VotingConfiguration expectedConfig) { + check(liveNodes, retired(), config, minVoterSize, expectedConfig); + } + + private void check(Set liveNodes, Set retired, ClusterState.VotingConfiguration config, int minVoterSize, + ClusterState.VotingConfiguration expectedConfig) { + final Reconfigurator reconfigurator = makeReconfigurator(Settings.builder() + .put(MINIMUM_VOTING_MASTER_NODES_SETTING.getKey(), minVoterSize) + .build()); + final ClusterState.VotingConfiguration adaptedConfig = reconfigurator.reconfigure(liveNodes, retired, config); + assertEquals(expectedConfig, adaptedConfig); + } + + private Reconfigurator makeReconfigurator(Settings settings) { + return new Reconfigurator(settings, new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS)); + } + + public void testDynamicSetting() { + final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + final Reconfigurator reconfigurator = new Reconfigurator(Settings.EMPTY, clusterSettings); + final VotingConfiguration initialConfig = conf("a", "b", "c", "d", "e"); + + // default min is 1 + assertThat(reconfigurator.reconfigure(nodes("a"), retired(), initialConfig), equalTo(conf("a"))); + + // update min to 2 + clusterSettings.applySettings(Settings.builder().put(MINIMUM_VOTING_MASTER_NODES_SETTING.getKey(), 2).build()); + assertThat(reconfigurator.reconfigure(nodes("a"), retired(), initialConfig), sameInstance(initialConfig)); // cannot reconfigure + + expectThrows(IllegalArgumentException.class, () -> + clusterSettings.applySettings(Settings.builder().put(MINIMUM_VOTING_MASTER_NODES_SETTING.getKey(), 0).build())); + } +} From 2a9b0c4c0373a7f079704c5d66206124dc1965a5 Mon Sep 17 00:00:00 2001 From: David Turner Date: Thu, 20 Sep 2018 23:32:34 +0100 Subject: [PATCH 2/9] Darnit checkstyle, you know I love you, but you've a helluva lot to learn about rock and roll. --- .../elasticsearch/cluster/coordination/ReconfiguratorTests.java | 1 - 1 file changed, 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java index f65a8b8837031..25c31f1430301 100644 --- a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java @@ -36,7 +36,6 @@ import static java.util.Collections.emptySet; import static org.elasticsearch.cluster.coordination.Reconfigurator.MINIMUM_VOTING_MASTER_NODES_SETTING; -import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.sameInstance; From 82d5be4ab24895420a7edb7013e25963d4ed4023 Mon Sep 17 00:00:00 2001 From: David Turner Date: Fri, 21 Sep 2018 17:50:48 +0100 Subject: [PATCH 3/9] Extract variables and add comments --- .../cluster/coordination/Reconfigurator.java | 46 +++++++++++++++---- .../coordination/ReconfiguratorTests.java | 2 +- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java index 2557588d71fb5..30a75eaf1a91a 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java @@ -87,6 +87,22 @@ public ClusterState.VotingConfiguration reconfigure(Set liveNodes ClusterState.VotingConfiguration currentConfig) { logger.trace("{} reconfiguring {} based on liveNodes={}, retiredNodeIds={}", this, currentConfig, liveNodes, retiredNodeIds); + /* + * There are three true/false properties of each node in play: live/non-live, retired/non-retired and in-config/not-in-config. + * Firstly we divide the nodes into disjoint sets based on these properties: + * + * - retiredInConfigNotLiveIds + * - nonRetiredInConfigNotLiveIds + * - retiredInConfigLiveIds + * - nonRetiredInConfigLiveIds + * - nonRetiredLiveNotInConfigIds + * + * The other 3 possibilities are not relevant: + * - retired, not-in-config, live -- cannot add a retired node back to the config + * - retired, not-in-config, non-live -- cannot add a retired node back to the config + * - non-retired, non-live, not-in-config -- no evidence this node exists at all + */ + final Set liveNodeIds = liveNodes.stream() .filter(DiscoveryNode::isMasterNode).map(DiscoveryNode::getId).collect(Collectors.toSet()); final Set liveInConfigIds = new TreeSet<>(currentConfig.getNodeIds()); @@ -98,26 +114,40 @@ public ClusterState.VotingConfiguration reconfigure(Set liveNodes final Set nonRetiredInConfigNotLiveIds = new TreeSet<>(inConfigNotLiveIds); nonRetiredInConfigNotLiveIds.removeAll(retiredInConfigNotLiveIds); - final Set retiredLiveInConfigIds = new TreeSet<>(liveInConfigIds); - retiredLiveInConfigIds.retainAll(retiredNodeIds); - final Set nonRetiredLiveInConfigIds = new TreeSet<>(liveInConfigIds); - nonRetiredLiveInConfigIds.removeAll(retiredLiveInConfigIds); + final Set retiredInConfigLiveIds = new TreeSet<>(liveInConfigIds); + retiredInConfigLiveIds.retainAll(retiredNodeIds); + final Set nonRetiredInConfigLiveIds = new TreeSet<>(liveInConfigIds); + nonRetiredInConfigLiveIds.removeAll(retiredInConfigLiveIds); final Set nonRetiredLiveNotInConfigIds = Sets.sortedDifference(liveNodeIds, currentConfig.getNodeIds()); nonRetiredLiveNotInConfigIds.removeAll(retiredNodeIds); - final int targetSize = Math.max(roundDownToOdd(nonRetiredLiveInConfigIds.size() + nonRetiredLiveNotInConfigIds.size()), - 2 * minVotingMasterNodes - 1); + /* + * Now we work out how many nodes should be in the configuration: + */ + + // ideally we want the configuration to be all the non-retired live nodes ... + final int nonRetiredLiveNodeCount = nonRetiredInConfigLiveIds.size() + nonRetiredLiveNotInConfigIds.size(); + + // ... except one, if even, because odd configurations are slightly more resilient ... + final int votingNodeCount = roundDownToOdd(nonRetiredLiveNodeCount); + + // ... unless there's too few non-retired live nodes to meet the minimum configured resilience level: + final int targetSize = Math.max(votingNodeCount, 2 * minVotingMasterNodes - 1); + /* + * The new configuration is formed by taking this many nodes in the following preference order: + */ final ClusterState.VotingConfiguration newConfig = new ClusterState.VotingConfiguration( - Stream.of(nonRetiredLiveInConfigIds, nonRetiredLiveNotInConfigIds, // live nodes first, preferring the current config - retiredLiveInConfigIds, // if we need more, first use retired nodes that are still alive and haven't been removed yet + Stream.of(nonRetiredInConfigLiveIds, nonRetiredLiveNotInConfigIds, // live nodes first, preferring the current config + retiredInConfigLiveIds, // if we need more, first use retired nodes that are still alive and haven't been removed yet nonRetiredInConfigNotLiveIds, retiredInConfigNotLiveIds) // if we need more, use non-live nodes .flatMap(Collection::stream).limit(targetSize).collect(Collectors.toSet())); if (newConfig.hasQuorum(liveNodeIds)) { return newConfig; } else { + // If there are not enough live nodes to form a quorum in the newly-proposed configuration, it's better to do nothing. return currentConfig; } } diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java index 25c31f1430301..b6899775e1bb1 100644 --- a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java @@ -68,7 +68,7 @@ public void testReconfigurationExamples() { check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), 1, conf("b")); check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), 2, conf("a", "b", "c")); - // 7 nodes, one for each combination of live/retired/current. Ideally want the config to be the non-retired live nodes. + // 7 nodes, one for each combination of live/retired/current. Ideally we want the config to be the non-retired live nodes. // Since there are 2 non-retired live nodes we round down to 1 and just use the one that's already in the config. check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), 1, conf("a")); // If we want the config to be at least 3 nodes then we don't retire "c" just yet. From b301cad3468a468a7b3cc61fc3fb800286430af2 Mon Sep 17 00:00:00 2001 From: David Turner Date: Wed, 3 Oct 2018 10:22:35 +0100 Subject: [PATCH 4/9] Treat the min voter size as a latch not a goal --- .../cluster/coordination/Reconfigurator.java | 7 ++- .../coordination/ReconfiguratorTests.java | 61 +++++++++---------- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java index 30a75eaf1a91a..79312d4023a81 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java @@ -132,8 +132,11 @@ public ClusterState.VotingConfiguration reconfigure(Set liveNodes // ... except one, if even, because odd configurations are slightly more resilient ... final int votingNodeCount = roundDownToOdd(nonRetiredLiveNodeCount); - // ... unless there's too few non-retired live nodes to meet the minimum configured resilience level: - final int targetSize = Math.max(votingNodeCount, 2 * minVotingMasterNodes - 1); + // ... except that if the current configuration satisfies MINIMUM_VOTING_MASTER_NODES_SETTING then the new one must too: + final int safeConfigurationSize = 2 * minVotingMasterNodes - 1; + final int targetSize; + targetSize = currentConfig.getNodeIds().size() >= safeConfigurationSize && votingNodeCount < safeConfigurationSize + ? safeConfigurationSize : votingNodeCount; /* * The new configuration is formed by taking this many nodes in the following preference order: diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java index b6899775e1bb1..6ddfe876b9348 100644 --- a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java @@ -19,6 +19,7 @@ package org.elasticsearch.cluster.coordination; +import org.apache.logging.log4j.message.ParameterizedMessage; import org.elasticsearch.Version; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ClusterState.VotingConfiguration; @@ -42,31 +43,29 @@ public class ReconfiguratorTests extends ESTestCase { public void testReconfigurationExamples() { - check(nodes("a"), conf("a"), randomIntBetween(1, 10), conf("a")); - check(nodes("a", "b"), conf("a"), 1, conf("a")); - check(nodes("a", "b"), conf("a"), randomIntBetween(2, 10), conf("a", "b")); - check(nodes("a", "b", "c"), conf("a"), randomIntBetween(1, 10), conf("a", "b", "c")); - check(nodes("a", "b", "c"), conf("a", "b"), randomIntBetween(1, 10), conf("a", "b", "c")); - check(nodes("a", "b", "c"), conf("a", "b", "c"), randomIntBetween(1, 10), conf("a", "b", "c")); - check(nodes("a", "b", "c", "d"), conf("a", "b", "c"), randomIntBetween(1, 2), conf("a", "b", "c")); - check(nodes("a", "b", "c", "d"), conf("a", "b", "c"), randomIntBetween(3, 10), conf("a", "b", "c", "d")); - check(nodes("a", "b", "c", "d", "e"), conf("a", "b", "c"), randomIntBetween(1, 10), conf("a", "b", "c", "d", "e")); - check(nodes("a", "b"), conf("a", "b", "e"), 1, conf("a")); - check(nodes("a", "b"), conf("a", "b", "e"), randomIntBetween(2, 10), conf("a", "b", "e")); - check(nodes("a", "b", "c"), conf("a", "b", "e"), randomIntBetween(1, 2), conf("a", "b", "c")); - check(nodes("a", "b", "c"), conf("a", "b", "e"), randomIntBetween(3, 10), conf("a", "b", "c", "e")); - check(nodes("a", "b", "c", "d"), conf("a", "b", "e"), randomIntBetween(1, 2), conf("a", "b", "c")); - check(nodes("a", "b", "c", "d"), conf("a", "b", "e"), randomIntBetween(3, 10), conf("a", "b", "c", "d", "e")); - check(nodes("a", "b", "c", "d", "e"), conf("a", "f", "g"), randomIntBetween(1, 3), conf("a", "b", "c", "d", "e")); - check(nodes("a", "b", "c", "d", "e"), conf("a", "f", "g"), randomIntBetween(4, 10), conf("a", "b", "c", "d", "e", "f", "g")); + for (int safetyLevel = 1; safetyLevel <= 3; safetyLevel++) { + check(nodes("a"), conf("a"), safetyLevel, conf("a")); + check(nodes("a", "b"), conf("a"), safetyLevel, conf("a")); + check(nodes("a", "b", "c"), conf("a"), safetyLevel, conf("a", "b", "c")); + check(nodes("a", "b", "c"), conf("a", "b"), safetyLevel, conf("a", "b", "c")); + check(nodes("a", "b", "c"), conf("a", "b", "c"), safetyLevel, conf("a", "b", "c")); + check(nodes("a", "b", "c", "d"), conf("a", "b", "c"), safetyLevel, conf("a", "b", "c")); + check(nodes("a", "b", "c", "d", "e"), conf("a", "b", "c"), safetyLevel, conf("a", "b", "c", "d", "e")); + check(nodes("a", "b"), conf("a", "b", "e"), safetyLevel, safetyLevel == 2 ? conf("a", "b", "e") : conf("a")); + check(nodes("a", "b", "c"), conf("a", "b", "e"), safetyLevel, conf("a", "b", "c")); + check(nodes("a", "b", "c", "d"), conf("a", "b", "e"), safetyLevel, conf("a", "b", "c")); + check(nodes("a", "b", "c", "d", "e"), conf("a", "f", "g"), safetyLevel, conf("a", "b", "c", "d", "e")); + check(nodes("a", "b", "c", "d"), conf("a", "b", "c", "d", "e"), safetyLevel, + safetyLevel <= 2 ? conf("a", "b", "c") : conf("a", "b", "c", "d", "e")); + } // Retiring a single node shifts the votes elsewhere if possible. check(nodes("a", "b"), retired("a"), conf("a"), 1, conf("b")); - check(nodes("a", "b"), retired("a"), conf("a"), 2, conf("a", "b")); // below min voter size, so no retirement takes place + check(nodes("a", "b"), retired("a"), conf("a"), 2, conf("b")); // never reached min voter size, so retirement can take place check(nodes("a", "b"), retired("a"), conf("b"), 2, conf("b")); check(nodes("a", "b", "c"), retired("a"), conf("a"), 1, conf("b")); check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), 1, conf("b")); - check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), 2, conf("a", "b", "c")); + check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), 2, conf("a", "b", "c")); // min voter size prevents retirement // 7 nodes, one for each combination of live/retired/current. Ideally we want the config to be the non-retired live nodes. // Since there are 2 non-retired live nodes we round down to 1 and just use the one that's already in the config. @@ -74,9 +73,8 @@ public void testReconfigurationExamples() { // If we want the config to be at least 3 nodes then we don't retire "c" just yet. check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), 2, conf("a", "b", "c")); - // If we want the config to be at least 5 nodes then we keep the old config and add the new node "b" to it. - check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), randomIntBetween(3, 10), - conf("a", "b", "c", "d", "e")); + // The current config never reached 5 nodes so retirement is allowed + check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), 3, conf("a")); } public void testReconfigurationProperty() { @@ -88,27 +86,26 @@ public void testReconfigurationProperty() { final String[] initialVotingNodes = new String[randomIntBetween(1, allNodes.length)]; randomSubsetOf(initialVotingNodes.length, allNodes).toArray(initialVotingNodes); - final int minVotingMasterNodes = randomIntBetween(1, allNodes.length + 1); + final int minConfiguredVotingMasterNodes = randomIntBetween(1, 3); final Reconfigurator reconfigurator = makeReconfigurator( - Settings.builder().put(MINIMUM_VOTING_MASTER_NODES_SETTING.getKey(), minVotingMasterNodes).build()); + Settings.builder().put(MINIMUM_VOTING_MASTER_NODES_SETTING.getKey(), minConfiguredVotingMasterNodes).build()); final Set liveNodesSet = nodes(liveNodes); final ClusterState.VotingConfiguration initialConfig = conf(initialVotingNodes); final ClusterState.VotingConfiguration finalConfig = reconfigurator.reconfigure(liveNodesSet, emptySet(), initialConfig); - // min configuration size comes from MINIMUM_VOTING_MASTER_NODES_SETTING as long as there are enough nodes in play - final Set mentionedNodes = Sets.union(Sets.newHashSet(liveNodes), Sets.newHashSet(initialVotingNodes)); - final int minConfigurationSize = Math.min(2 * minVotingMasterNodes - 1, mentionedNodes.size()); + // min configuration size comes from MINIMUM_VOTING_MASTER_NODES_SETTING as long as there are enough nodes in the current config + final boolean isSafeConfiguration = initialConfig.getNodeIds().size() >= minConfiguredVotingMasterNodes * 2 - 1; // actual size of a quorum: half the configured nodes, which is all the live nodes plus maybe some dead ones to make up numbers - final int quorumSize = Math.max(liveNodes.length, minConfigurationSize) / 2 + 1; + final int quorumSize = Math.max(liveNodes.length / 2 + 1, isSafeConfiguration ? minConfiguredVotingMasterNodes : 0); if (quorumSize > liveNodes.length) { - assertFalse("reconfigure " + liveNodesSet + " from " + initialConfig + " with min of " + minVotingMasterNodes + assertFalse("reconfigure " + liveNodesSet + " from " + initialConfig + " with min of " + minConfiguredVotingMasterNodes + " yielded " + finalConfig + " without a live quorum", finalConfig.hasQuorum(Arrays.asList(liveNodes))); } else { final List expectedQuorum = randomSubsetOf(quorumSize, liveNodes); - assertTrue("reconfigure " + liveNodesSet + " from " + initialConfig + " with min of " + minVotingMasterNodes + assertTrue("reconfigure " + liveNodesSet + " from " + initialConfig + " with min of " + minConfiguredVotingMasterNodes + " yielded " + finalConfig + " with quorum of " + expectedQuorum, finalConfig.hasQuorum(expectedQuorum)); } @@ -141,7 +138,9 @@ private void check(Set liveNodes, Set retired, ClusterSta .put(MINIMUM_VOTING_MASTER_NODES_SETTING.getKey(), minVoterSize) .build()); final ClusterState.VotingConfiguration adaptedConfig = reconfigurator.reconfigure(liveNodes, retired, config); - assertEquals(expectedConfig, adaptedConfig); + assertEquals(new ParameterizedMessage("[liveNodes={}, retired={}, config={}, minVoterSize={}]", + liveNodes, retired, config, minVoterSize).getFormattedMessage(), + expectedConfig, adaptedConfig); } private Reconfigurator makeReconfigurator(Settings settings) { From 2616847f975ff4e442aae8b0f838d27eaecc560c Mon Sep 17 00:00:00 2001 From: David Turner Date: Wed, 3 Oct 2018 11:07:22 +0100 Subject: [PATCH 5/9] Rename setting to cluster.master_resilience_level ... and use an enum --- .../cluster/coordination/Reconfigurator.java | 36 +++--- .../common/settings/ClusterSettings.java | 2 +- .../coordination/ReconfiguratorTests.java | 106 +++++++++++------- 3 files changed, 89 insertions(+), 55 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java index 79312d4023a81..3d04c37de7e9a 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java @@ -48,19 +48,28 @@ public class Reconfigurator extends AbstractComponent { * where a single node's failure can cause permanent unavailability. This setting determines the size of the smallest set of master * nodes required to process a cluster state update. */ - public static final Setting MINIMUM_VOTING_MASTER_NODES_SETTING = - Setting.intSetting("cluster.minimum_voting_master_nodes", 1, 1, Property.NodeScope, Property.Dynamic); + public static final Setting CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING = + new Setting<>("cluster.master_resilience_level", MasterResilienceLevel.recommended.toString(), MasterResilienceLevel::valueOf, + Property.NodeScope, Property.Dynamic); + + public enum MasterResilienceLevel { + fragile(1), recommended(2), excessive(3); + final int minimumVotingNodes; + MasterResilienceLevel(int minimumVotingNodes) { + this.minimumVotingNodes = minimumVotingNodes; + } + } - private int minVotingMasterNodes; + private MasterResilienceLevel masterResilienceLevel; public Reconfigurator(Settings settings, ClusterSettings clusterSettings) { super(settings); - minVotingMasterNodes = MINIMUM_VOTING_MASTER_NODES_SETTING.get(settings); - clusterSettings.addSettingsUpdateConsumer(MINIMUM_VOTING_MASTER_NODES_SETTING, this::setMinVotingMasterNodes); + masterResilienceLevel = CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING.get(settings); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING, this::setMasterResilienceLevel); } - public void setMinVotingMasterNodes(int minVotingMasterNodes) { - this.minVotingMasterNodes = minVotingMasterNodes; + public void setMasterResilienceLevel(MasterResilienceLevel masterResilienceLevel) { + this.masterResilienceLevel = masterResilienceLevel; } private static int roundDownToOdd(int size) { @@ -70,17 +79,19 @@ private static int roundDownToOdd(int size) { @Override public String toString() { return "Reconfigurator{" + - "minVotingMasterNodes=" + minVotingMasterNodes + + "masterResilienceLevel=" + masterResilienceLevel + '}'; } /** * Compute an optimal configuration for the cluster. - * @param liveNodes The live nodes in the cluster. The optimal configuration prefers live nodes over non-live nodes as far as possible. + * + * @param liveNodes The live nodes in the cluster. The optimal configuration prefers live nodes over non-live nodes as far as + * possible. * @param retiredNodeIds Nodes that are leaving the cluster and which should not appear in the configuration if possible. Nodes that are * retired and not in the current configuration will never appear in the resulting configuration; this is useful * for shifting the vote in a 2-node cluster so one of the nodes can be restarted without harming availability. - * @param currentConfig The current configuration. As far as possible, we prefer to keep the current config as-is. + * @param currentConfig The current configuration. As far as possible, we prefer to keep the current config as-is. * @return An optimal configuration, or leave the current configuration unchanged if the optimal configuration has no live quorum. */ public ClusterState.VotingConfiguration reconfigure(Set liveNodes, Set retiredNodeIds, @@ -133,9 +144,8 @@ public ClusterState.VotingConfiguration reconfigure(Set liveNodes final int votingNodeCount = roundDownToOdd(nonRetiredLiveNodeCount); // ... except that if the current configuration satisfies MINIMUM_VOTING_MASTER_NODES_SETTING then the new one must too: - final int safeConfigurationSize = 2 * minVotingMasterNodes - 1; - final int targetSize; - targetSize = currentConfig.getNodeIds().size() >= safeConfigurationSize && votingNodeCount < safeConfigurationSize + final int safeConfigurationSize = 2 * masterResilienceLevel.minimumVotingNodes - 1; + final int targetSize = currentConfig.getNodeIds().size() >= safeConfigurationSize && votingNodeCount < safeConfigurationSize ? safeConfigurationSize : votingNodeCount; /* diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java index 03e557ed95576..c32c0c43b1fc4 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java @@ -450,7 +450,7 @@ public void apply(Settings value, Settings current, Settings previous) { ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING, ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING, Coordinator.PUBLISH_TIMEOUT_SETTING, - Reconfigurator.MINIMUM_VOTING_MASTER_NODES_SETTING + Reconfigurator.CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING ))); public static List> BUILT_IN_SETTING_UPGRADERS = Collections.unmodifiableList(Arrays.asList( diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java index 6ddfe876b9348..1a64039981681 100644 --- a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java @@ -23,6 +23,7 @@ import org.elasticsearch.Version; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ClusterState.VotingConfiguration; +import org.elasticsearch.cluster.coordination.Reconfigurator.MasterResilienceLevel; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; @@ -36,45 +37,53 @@ import java.util.stream.Collectors; import static java.util.Collections.emptySet; -import static org.elasticsearch.cluster.coordination.Reconfigurator.MINIMUM_VOTING_MASTER_NODES_SETTING; +import static org.elasticsearch.cluster.coordination.Reconfigurator.CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.sameInstance; public class ReconfiguratorTests extends ESTestCase { public void testReconfigurationExamples() { - for (int safetyLevel = 1; safetyLevel <= 3; safetyLevel++) { - check(nodes("a"), conf("a"), safetyLevel, conf("a")); - check(nodes("a", "b"), conf("a"), safetyLevel, conf("a")); - check(nodes("a", "b", "c"), conf("a"), safetyLevel, conf("a", "b", "c")); - check(nodes("a", "b", "c"), conf("a", "b"), safetyLevel, conf("a", "b", "c")); - check(nodes("a", "b", "c"), conf("a", "b", "c"), safetyLevel, conf("a", "b", "c")); - check(nodes("a", "b", "c", "d"), conf("a", "b", "c"), safetyLevel, conf("a", "b", "c")); - check(nodes("a", "b", "c", "d", "e"), conf("a", "b", "c"), safetyLevel, conf("a", "b", "c", "d", "e")); - check(nodes("a", "b"), conf("a", "b", "e"), safetyLevel, safetyLevel == 2 ? conf("a", "b", "e") : conf("a")); - check(nodes("a", "b", "c"), conf("a", "b", "e"), safetyLevel, conf("a", "b", "c")); - check(nodes("a", "b", "c", "d"), conf("a", "b", "e"), safetyLevel, conf("a", "b", "c")); - check(nodes("a", "b", "c", "d", "e"), conf("a", "f", "g"), safetyLevel, conf("a", "b", "c", "d", "e")); - check(nodes("a", "b", "c", "d"), conf("a", "b", "c", "d", "e"), safetyLevel, - safetyLevel <= 2 ? conf("a", "b", "c") : conf("a", "b", "c", "d", "e")); + for (MasterResilienceLevel masterResilienceLevel : new MasterResilienceLevel[] + {MasterResilienceLevel.fragile, MasterResilienceLevel.recommended, MasterResilienceLevel.excessive}) { + check(nodes("a"), conf("a"), masterResilienceLevel, conf("a")); + check(nodes("a", "b"), conf("a"), masterResilienceLevel, conf("a")); + check(nodes("a", "b", "c"), conf("a"), masterResilienceLevel, conf("a", "b", "c")); + check(nodes("a", "b", "c"), conf("a", "b"), masterResilienceLevel, conf("a", "b", "c")); + check(nodes("a", "b", "c"), conf("a", "b", "c"), masterResilienceLevel, conf("a", "b", "c")); + check(nodes("a", "b", "c", "d"), conf("a", "b", "c"), masterResilienceLevel, conf("a", "b", "c")); + check(nodes("a", "b", "c", "d", "e"), conf("a", "b", "c"), masterResilienceLevel, conf("a", "b", "c", "d", "e")); + check(nodes("a", "b"), conf("a", "b", "e"), masterResilienceLevel, + masterResilienceLevel.equals(MasterResilienceLevel.recommended) ? conf("a", "b", "e") : conf("a")); + check(nodes("a", "b", "c"), conf("a", "b", "e"), masterResilienceLevel, conf("a", "b", "c")); + check(nodes("a", "b", "c", "d"), conf("a", "b", "e"), masterResilienceLevel, conf("a", "b", "c")); + check(nodes("a", "b", "c", "d", "e"), conf("a", "f", "g"), masterResilienceLevel, conf("a", "b", "c", "d", "e")); + check(nodes("a", "b", "c", "d"), conf("a", "b", "c", "d", "e"), masterResilienceLevel, + masterResilienceLevel.equals(MasterResilienceLevel.excessive) ? conf("a", "b", "c", "d", "e") : conf("a", "b", "c")); } // Retiring a single node shifts the votes elsewhere if possible. - check(nodes("a", "b"), retired("a"), conf("a"), 1, conf("b")); - check(nodes("a", "b"), retired("a"), conf("a"), 2, conf("b")); // never reached min voter size, so retirement can take place - check(nodes("a", "b"), retired("a"), conf("b"), 2, conf("b")); - check(nodes("a", "b", "c"), retired("a"), conf("a"), 1, conf("b")); - check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), 1, conf("b")); - check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), 2, conf("a", "b", "c")); // min voter size prevents retirement + check(nodes("a", "b"), retired("a"), conf("a"), MasterResilienceLevel.fragile, conf("b")); + + // If the safety level was never reached then retirement can take place + check(nodes("a", "b"), retired("a"), conf("a"), MasterResilienceLevel.recommended, conf("b")); + check(nodes("a", "b"), retired("a"), conf("b"), MasterResilienceLevel.recommended, conf("b")); + + // Retiring a node from a three-node cluster drops down to a one-node configuration if "fragile" + check(nodes("a", "b", "c"), retired("a"), conf("a"), MasterResilienceLevel.fragile, conf("b")); + check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), MasterResilienceLevel.fragile, conf("b")); + + // Retiring is prevented in a three-node cluster if "recommended" + check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), MasterResilienceLevel.recommended, conf("a", "b", "c")); // 7 nodes, one for each combination of live/retired/current. Ideally we want the config to be the non-retired live nodes. // Since there are 2 non-retired live nodes we round down to 1 and just use the one that's already in the config. - check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), 1, conf("a")); + check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), MasterResilienceLevel.fragile, conf("a")); // If we want the config to be at least 3 nodes then we don't retire "c" just yet. - check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), 2, + check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), MasterResilienceLevel.recommended, conf("a", "b", "c")); // The current config never reached 5 nodes so retirement is allowed - check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), 3, conf("a")); + check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), MasterResilienceLevel.excessive, conf("a")); } public void testReconfigurationProperty() { @@ -86,26 +95,27 @@ public void testReconfigurationProperty() { final String[] initialVotingNodes = new String[randomIntBetween(1, allNodes.length)]; randomSubsetOf(initialVotingNodes.length, allNodes).toArray(initialVotingNodes); - final int minConfiguredVotingMasterNodes = randomIntBetween(1, 3); + final MasterResilienceLevel masterResilienceLevel + = randomFrom(MasterResilienceLevel.fragile, MasterResilienceLevel.recommended, MasterResilienceLevel.excessive); final Reconfigurator reconfigurator = makeReconfigurator( - Settings.builder().put(MINIMUM_VOTING_MASTER_NODES_SETTING.getKey(), minConfiguredVotingMasterNodes).build()); + Settings.builder().put(CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING.getKey(), masterResilienceLevel).build()); final Set liveNodesSet = nodes(liveNodes); final ClusterState.VotingConfiguration initialConfig = conf(initialVotingNodes); final ClusterState.VotingConfiguration finalConfig = reconfigurator.reconfigure(liveNodesSet, emptySet(), initialConfig); // min configuration size comes from MINIMUM_VOTING_MASTER_NODES_SETTING as long as there are enough nodes in the current config - final boolean isSafeConfiguration = initialConfig.getNodeIds().size() >= minConfiguredVotingMasterNodes * 2 - 1; + final boolean isSafeConfiguration = initialConfig.getNodeIds().size() >= masterResilienceLevel.minimumVotingNodes * 2 - 1; // actual size of a quorum: half the configured nodes, which is all the live nodes plus maybe some dead ones to make up numbers - final int quorumSize = Math.max(liveNodes.length / 2 + 1, isSafeConfiguration ? minConfiguredVotingMasterNodes : 0); + final int quorumSize = Math.max(liveNodes.length / 2 + 1, isSafeConfiguration ? masterResilienceLevel.minimumVotingNodes : 0); if (quorumSize > liveNodes.length) { - assertFalse("reconfigure " + liveNodesSet + " from " + initialConfig + " with min of " + minConfiguredVotingMasterNodes + assertFalse("reconfigure " + liveNodesSet + " from " + initialConfig + " with safety factor of " + masterResilienceLevel + " yielded " + finalConfig + " without a live quorum", finalConfig.hasQuorum(Arrays.asList(liveNodes))); } else { final List expectedQuorum = randomSubsetOf(quorumSize, liveNodes); - assertTrue("reconfigure " + liveNodesSet + " from " + initialConfig + " with min of " + minConfiguredVotingMasterNodes + assertTrue("reconfigure " + liveNodesSet + " from " + initialConfig + " with safety factor of " + masterResilienceLevel + " yielded " + finalConfig + " with quorum of " + expectedQuorum, finalConfig.hasQuorum(expectedQuorum)); } @@ -127,19 +137,19 @@ private Set retired(String... nodes) { return Arrays.stream(nodes).collect(Collectors.toSet()); } - private void check(Set liveNodes, ClusterState.VotingConfiguration config, int minVoterSize, + private void check(Set liveNodes, ClusterState.VotingConfiguration config, MasterResilienceLevel masterResilienceLevel, ClusterState.VotingConfiguration expectedConfig) { - check(liveNodes, retired(), config, minVoterSize, expectedConfig); + check(liveNodes, retired(), config, masterResilienceLevel, expectedConfig); } - private void check(Set liveNodes, Set retired, ClusterState.VotingConfiguration config, int minVoterSize, - ClusterState.VotingConfiguration expectedConfig) { + private void check(Set liveNodes, Set retired, ClusterState.VotingConfiguration config, + MasterResilienceLevel masterResilienceLevel, ClusterState.VotingConfiguration expectedConfig) { final Reconfigurator reconfigurator = makeReconfigurator(Settings.builder() - .put(MINIMUM_VOTING_MASTER_NODES_SETTING.getKey(), minVoterSize) + .put(CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING.getKey(), masterResilienceLevel) .build()); final ClusterState.VotingConfiguration adaptedConfig = reconfigurator.reconfigure(liveNodes, retired, config); - assertEquals(new ParameterizedMessage("[liveNodes={}, retired={}, config={}, minVoterSize={}]", - liveNodes, retired, config, minVoterSize).getFormattedMessage(), + assertEquals(new ParameterizedMessage("[liveNodes={}, retired={}, config={}, masterResilienceLevel={}]", + liveNodes, retired, config, masterResilienceLevel).getFormattedMessage(), expectedConfig, adaptedConfig); } @@ -152,14 +162,28 @@ public void testDynamicSetting() { final Reconfigurator reconfigurator = new Reconfigurator(Settings.EMPTY, clusterSettings); final VotingConfiguration initialConfig = conf("a", "b", "c", "d", "e"); - // default min is 1 + // default is "recommended" + assertThat(reconfigurator.reconfigure(nodes("a"), retired(), initialConfig), sameInstance(initialConfig)); // cannot reconfigure + assertThat(reconfigurator.reconfigure(nodes("a", "b", "c"), retired(), initialConfig), equalTo(conf("a", "b", "c"))); + + // update to "fragile" + clusterSettings.applySettings(Settings.builder() + .put(CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING.getKey(), MasterResilienceLevel.fragile.toString()).build()); assertThat(reconfigurator.reconfigure(nodes("a"), retired(), initialConfig), equalTo(conf("a"))); - // update min to 2 - clusterSettings.applySettings(Settings.builder().put(MINIMUM_VOTING_MASTER_NODES_SETTING.getKey(), 2).build()); + // update to "excessive" + clusterSettings.applySettings(Settings.builder() + .put(CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING.getKey(), MasterResilienceLevel.excessive.toString()).build()); + assertThat(reconfigurator.reconfigure(nodes("a"), retired(), initialConfig), sameInstance(initialConfig)); // cannot reconfigure + assertThat(reconfigurator.reconfigure(nodes("a", "b", "c"), retired(), initialConfig), equalTo(conf("a", "b", "c", "d", "e"))); + + // explicitly specify "recommended" + clusterSettings.applySettings(Settings.builder() + .put(CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING.getKey(), MasterResilienceLevel.recommended.toString()).build()); assertThat(reconfigurator.reconfigure(nodes("a"), retired(), initialConfig), sameInstance(initialConfig)); // cannot reconfigure + assertThat(reconfigurator.reconfigure(nodes("a", "b", "c"), retired(), initialConfig), equalTo(conf("a", "b", "c"))); expectThrows(IllegalArgumentException.class, () -> - clusterSettings.applySettings(Settings.builder().put(MINIMUM_VOTING_MASTER_NODES_SETTING.getKey(), 0).build())); + clusterSettings.applySettings(Settings.builder().put(CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING.getKey(), "illegal").build())); } } From 7bec1785d7e0e03ebc4883f49ff4e3a0ae86e383 Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 16 Oct 2018 17:10:26 +0100 Subject: [PATCH 6/9] Change to master_nodes_failure_tolerance --- .../cluster/coordination/Reconfigurator.java | 29 ++-- .../common/settings/ClusterSettings.java | 2 +- .../coordination/ReconfiguratorTests.java | 127 +++++++++--------- 3 files changed, 75 insertions(+), 83 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java index 3d04c37de7e9a..d977791238af5 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java @@ -48,28 +48,19 @@ public class Reconfigurator extends AbstractComponent { * where a single node's failure can cause permanent unavailability. This setting determines the size of the smallest set of master * nodes required to process a cluster state update. */ - public static final Setting CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING = - new Setting<>("cluster.master_resilience_level", MasterResilienceLevel.recommended.toString(), MasterResilienceLevel::valueOf, - Property.NodeScope, Property.Dynamic); - - public enum MasterResilienceLevel { - fragile(1), recommended(2), excessive(3); - final int minimumVotingNodes; - MasterResilienceLevel(int minimumVotingNodes) { - this.minimumVotingNodes = minimumVotingNodes; - } - } + public static final Setting CLUSTER_MASTER_NODES_FAILURE_TOLERANCE = + Setting.intSetting("cluster.master_nodes_failure_tolerance", 0, 0, Property.NodeScope, Property.Dynamic); - private MasterResilienceLevel masterResilienceLevel; + private int masterNodesFailureTolerance; public Reconfigurator(Settings settings, ClusterSettings clusterSettings) { super(settings); - masterResilienceLevel = CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING.get(settings); - clusterSettings.addSettingsUpdateConsumer(CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING, this::setMasterResilienceLevel); + masterNodesFailureTolerance = CLUSTER_MASTER_NODES_FAILURE_TOLERANCE.get(settings); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_MASTER_NODES_FAILURE_TOLERANCE, this::setMasterNodesFailureTolerance); } - public void setMasterResilienceLevel(MasterResilienceLevel masterResilienceLevel) { - this.masterResilienceLevel = masterResilienceLevel; + public void setMasterNodesFailureTolerance(int masterNodesFailureTolerance) { + this.masterNodesFailureTolerance = masterNodesFailureTolerance; } private static int roundDownToOdd(int size) { @@ -79,7 +70,7 @@ private static int roundDownToOdd(int size) { @Override public String toString() { return "Reconfigurator{" + - "masterResilienceLevel=" + masterResilienceLevel + + "masterNodesFailureTolerance=" + masterNodesFailureTolerance + '}'; } @@ -143,8 +134,8 @@ public ClusterState.VotingConfiguration reconfigure(Set liveNodes // ... except one, if even, because odd configurations are slightly more resilient ... final int votingNodeCount = roundDownToOdd(nonRetiredLiveNodeCount); - // ... except that if the current configuration satisfies MINIMUM_VOTING_MASTER_NODES_SETTING then the new one must too: - final int safeConfigurationSize = 2 * masterResilienceLevel.minimumVotingNodes - 1; + // ... except that if the current configuration satisfies CLUSTER_MASTER_NODES_FAILURE_TOLERANCE then the new one must too: + final int safeConfigurationSize = 2 * masterNodesFailureTolerance + 1; final int targetSize = currentConfig.getNodeIds().size() >= safeConfigurationSize && votingNodeCount < safeConfigurationSize ? safeConfigurationSize : votingNodeCount; diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java index 4d141a544aedf..0789c50ac606c 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java @@ -455,7 +455,7 @@ public void apply(Settings value, Settings current, Settings previous) { ElectionSchedulerFactory.ELECTION_DURATION_SETTING, Coordinator.PUBLISH_TIMEOUT_SETTING, JoinHelper.JOIN_TIMEOUT_SETTING, - Reconfigurator.CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING + Reconfigurator.CLUSTER_MASTER_NODES_FAILURE_TOLERANCE ))); public static List> BUILT_IN_SETTING_UPGRADERS = Collections.unmodifiableList(Arrays.asList( diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java index 1a64039981681..488df4fb72ddc 100644 --- a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java @@ -23,7 +23,6 @@ import org.elasticsearch.Version; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ClusterState.VotingConfiguration; -import org.elasticsearch.cluster.coordination.Reconfigurator.MasterResilienceLevel; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; @@ -37,53 +36,51 @@ import java.util.stream.Collectors; import static java.util.Collections.emptySet; -import static org.elasticsearch.cluster.coordination.Reconfigurator.CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING; +import static org.elasticsearch.cluster.coordination.Reconfigurator.CLUSTER_MASTER_NODES_FAILURE_TOLERANCE; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.sameInstance; public class ReconfiguratorTests extends ESTestCase { public void testReconfigurationExamples() { - for (MasterResilienceLevel masterResilienceLevel : new MasterResilienceLevel[] - {MasterResilienceLevel.fragile, MasterResilienceLevel.recommended, MasterResilienceLevel.excessive}) { - check(nodes("a"), conf("a"), masterResilienceLevel, conf("a")); - check(nodes("a", "b"), conf("a"), masterResilienceLevel, conf("a")); - check(nodes("a", "b", "c"), conf("a"), masterResilienceLevel, conf("a", "b", "c")); - check(nodes("a", "b", "c"), conf("a", "b"), masterResilienceLevel, conf("a", "b", "c")); - check(nodes("a", "b", "c"), conf("a", "b", "c"), masterResilienceLevel, conf("a", "b", "c")); - check(nodes("a", "b", "c", "d"), conf("a", "b", "c"), masterResilienceLevel, conf("a", "b", "c")); - check(nodes("a", "b", "c", "d", "e"), conf("a", "b", "c"), masterResilienceLevel, conf("a", "b", "c", "d", "e")); - check(nodes("a", "b"), conf("a", "b", "e"), masterResilienceLevel, - masterResilienceLevel.equals(MasterResilienceLevel.recommended) ? conf("a", "b", "e") : conf("a")); - check(nodes("a", "b", "c"), conf("a", "b", "e"), masterResilienceLevel, conf("a", "b", "c")); - check(nodes("a", "b", "c", "d"), conf("a", "b", "e"), masterResilienceLevel, conf("a", "b", "c")); - check(nodes("a", "b", "c", "d", "e"), conf("a", "f", "g"), masterResilienceLevel, conf("a", "b", "c", "d", "e")); - check(nodes("a", "b", "c", "d"), conf("a", "b", "c", "d", "e"), masterResilienceLevel, - masterResilienceLevel.equals(MasterResilienceLevel.excessive) ? conf("a", "b", "c", "d", "e") : conf("a", "b", "c")); + for (int masterNodesFailureTolerance = 0; masterNodesFailureTolerance <= 2; masterNodesFailureTolerance++) { + check(nodes("a"), conf("a"), masterNodesFailureTolerance, conf("a")); + check(nodes("a", "b"), conf("a"), masterNodesFailureTolerance, conf("a")); + check(nodes("a", "b", "c"), conf("a"), masterNodesFailureTolerance, conf("a", "b", "c")); + check(nodes("a", "b", "c"), conf("a", "b"), masterNodesFailureTolerance, conf("a", "b", "c")); + check(nodes("a", "b", "c"), conf("a", "b", "c"), masterNodesFailureTolerance, conf("a", "b", "c")); + check(nodes("a", "b", "c", "d"), conf("a", "b", "c"), masterNodesFailureTolerance, conf("a", "b", "c")); + check(nodes("a", "b", "c", "d", "e"), conf("a", "b", "c"), masterNodesFailureTolerance, conf("a", "b", "c", "d", "e")); + check(nodes("a", "b"), conf("a", "b", "e"), masterNodesFailureTolerance, + masterNodesFailureTolerance == 1 ? conf("a", "b", "e") : conf("a")); + check(nodes("a", "b", "c"), conf("a", "b", "e"), masterNodesFailureTolerance, conf("a", "b", "c")); + check(nodes("a", "b", "c", "d"), conf("a", "b", "e"), masterNodesFailureTolerance, conf("a", "b", "c")); + check(nodes("a", "b", "c", "d", "e"), conf("a", "f", "g"), masterNodesFailureTolerance, conf("a", "b", "c", "d", "e")); + check(nodes("a", "b", "c", "d"), conf("a", "b", "c", "d", "e"), masterNodesFailureTolerance, + masterNodesFailureTolerance == 2 ? conf("a", "b", "c", "d", "e") : conf("a", "b", "c")); } // Retiring a single node shifts the votes elsewhere if possible. - check(nodes("a", "b"), retired("a"), conf("a"), MasterResilienceLevel.fragile, conf("b")); + check(nodes("a", "b"), retired("a"), conf("a"), 0, conf("b")); // If the safety level was never reached then retirement can take place - check(nodes("a", "b"), retired("a"), conf("a"), MasterResilienceLevel.recommended, conf("b")); - check(nodes("a", "b"), retired("a"), conf("b"), MasterResilienceLevel.recommended, conf("b")); + check(nodes("a", "b"), retired("a"), conf("a"), 1, conf("b")); + check(nodes("a", "b"), retired("a"), conf("b"), 1, conf("b")); // Retiring a node from a three-node cluster drops down to a one-node configuration if "fragile" - check(nodes("a", "b", "c"), retired("a"), conf("a"), MasterResilienceLevel.fragile, conf("b")); - check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), MasterResilienceLevel.fragile, conf("b")); + check(nodes("a", "b", "c"), retired("a"), conf("a"), 0, conf("b")); + check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), 0, conf("b")); // Retiring is prevented in a three-node cluster if "recommended" - check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), MasterResilienceLevel.recommended, conf("a", "b", "c")); + check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), 1, conf("a", "b", "c")); // 7 nodes, one for each combination of live/retired/current. Ideally we want the config to be the non-retired live nodes. // Since there are 2 non-retired live nodes we round down to 1 and just use the one that's already in the config. - check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), MasterResilienceLevel.fragile, conf("a")); + check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), 0, conf("a")); // If we want the config to be at least 3 nodes then we don't retire "c" just yet. - check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), MasterResilienceLevel.recommended, - conf("a", "b", "c")); + check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), 1, conf("a", "b", "c")); // The current config never reached 5 nodes so retirement is allowed - check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), MasterResilienceLevel.excessive, conf("a")); + check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), 2, conf("a")); } public void testReconfigurationProperty() { @@ -95,29 +92,36 @@ public void testReconfigurationProperty() { final String[] initialVotingNodes = new String[randomIntBetween(1, allNodes.length)]; randomSubsetOf(initialVotingNodes.length, allNodes).toArray(initialVotingNodes); - final MasterResilienceLevel masterResilienceLevel - = randomFrom(MasterResilienceLevel.fragile, MasterResilienceLevel.recommended, MasterResilienceLevel.excessive); + final int masterNodesFailureTolerance + = randomFrom(0, 1, 2); final Reconfigurator reconfigurator = makeReconfigurator( - Settings.builder().put(CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING.getKey(), masterResilienceLevel).build()); + Settings.builder().put(CLUSTER_MASTER_NODES_FAILURE_TOLERANCE.getKey(), masterNodesFailureTolerance).build()); final Set liveNodesSet = nodes(liveNodes); final ClusterState.VotingConfiguration initialConfig = conf(initialVotingNodes); - final ClusterState.VotingConfiguration finalConfig = reconfigurator.reconfigure(liveNodesSet, emptySet(), initialConfig); - // min configuration size comes from MINIMUM_VOTING_MASTER_NODES_SETTING as long as there are enough nodes in the current config - final boolean isSafeConfiguration = initialConfig.getNodeIds().size() >= masterResilienceLevel.minimumVotingNodes * 2 - 1; + // min configuration size comes from CLUSTER_MASTER_NODES_FAILURE_TOLERANCE as long as there are enough nodes in the current config - // actual size of a quorum: half the configured nodes, which is all the live nodes plus maybe some dead ones to make up numbers - final int quorumSize = Math.max(liveNodes.length / 2 + 1, isSafeConfiguration ? masterResilienceLevel.minimumVotingNodes : 0); + if (initialConfig.getNodeIds().size() >= masterNodesFailureTolerance * 2 + 1) { + // actual size of a quorum: half the configured nodes, which is all the live nodes plus maybe some dead ones to make up numbers + final int quorumSize = Math.max(liveNodes.length / 2 + 1, masterNodesFailureTolerance); - if (quorumSize > liveNodes.length) { - assertFalse("reconfigure " + liveNodesSet + " from " + initialConfig + " with safety factor of " + masterResilienceLevel - + " yielded " + finalConfig + " without a live quorum", finalConfig.hasQuorum(Arrays.asList(liveNodes))); + final ClusterState.VotingConfiguration finalConfig = reconfigurator.reconfigure(liveNodesSet, emptySet(), initialConfig); + + final String description = "reconfigure " + liveNodesSet + " from " + initialConfig + " with safety factor of " + + masterNodesFailureTolerance + " yielded " + finalConfig; + + if (quorumSize > liveNodes.length) { + assertFalse(description + " without a live quorum", finalConfig.hasQuorum(Arrays.asList(liveNodes))); + } else { + final List expectedQuorum = randomSubsetOf(quorumSize, liveNodes); + assertTrue(description + " with quorum of " + expectedQuorum, finalConfig.hasQuorum(expectedQuorum)); + } } else { - final List expectedQuorum = randomSubsetOf(quorumSize, liveNodes); - assertTrue("reconfigure " + liveNodesSet + " from " + initialConfig + " with safety factor of " + masterResilienceLevel - + " yielded " + finalConfig + " with quorum of " + expectedQuorum, - finalConfig.hasQuorum(expectedQuorum)); + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, + () -> reconfigurator.reconfigure(liveNodesSet, emptySet(), initialConfig)); + + assertThat(exception.getMessage(), equalTo("foo")); } } @@ -137,19 +141,20 @@ private Set retired(String... nodes) { return Arrays.stream(nodes).collect(Collectors.toSet()); } - private void check(Set liveNodes, ClusterState.VotingConfiguration config, MasterResilienceLevel masterResilienceLevel, + private void check(Set liveNodes, ClusterState.VotingConfiguration config, int masterNodesFailureTolerance, ClusterState.VotingConfiguration expectedConfig) { - check(liveNodes, retired(), config, masterResilienceLevel, expectedConfig); + check(liveNodes, retired(), config, masterNodesFailureTolerance, expectedConfig); } private void check(Set liveNodes, Set retired, ClusterState.VotingConfiguration config, - MasterResilienceLevel masterResilienceLevel, ClusterState.VotingConfiguration expectedConfig) { + int masterNodesFailureTolerance, ClusterState.VotingConfiguration expectedConfig) { final Reconfigurator reconfigurator = makeReconfigurator(Settings.builder() - .put(CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING.getKey(), masterResilienceLevel) + .put(CLUSTER_MASTER_NODES_FAILURE_TOLERANCE.getKey(), masterNodesFailureTolerance) .build()); + final ClusterState.VotingConfiguration adaptedConfig = reconfigurator.reconfigure(liveNodes, retired, config); - assertEquals(new ParameterizedMessage("[liveNodes={}, retired={}, config={}, masterResilienceLevel={}]", - liveNodes, retired, config, masterResilienceLevel).getFormattedMessage(), + assertEquals(new ParameterizedMessage("[liveNodes={}, retired={}, config={}, masterNodesFailureTolerance={}]", + liveNodes, retired, config, masterNodesFailureTolerance).getFormattedMessage(), expectedConfig, adaptedConfig); } @@ -162,28 +167,24 @@ public void testDynamicSetting() { final Reconfigurator reconfigurator = new Reconfigurator(Settings.EMPTY, clusterSettings); final VotingConfiguration initialConfig = conf("a", "b", "c", "d", "e"); - // default is "recommended" - assertThat(reconfigurator.reconfigure(nodes("a"), retired(), initialConfig), sameInstance(initialConfig)); // cannot reconfigure - assertThat(reconfigurator.reconfigure(nodes("a", "b", "c"), retired(), initialConfig), equalTo(conf("a", "b", "c"))); - - // update to "fragile" - clusterSettings.applySettings(Settings.builder() - .put(CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING.getKey(), MasterResilienceLevel.fragile.toString()).build()); + // default is "0" assertThat(reconfigurator.reconfigure(nodes("a"), retired(), initialConfig), equalTo(conf("a"))); - // update to "excessive" - clusterSettings.applySettings(Settings.builder() - .put(CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING.getKey(), MasterResilienceLevel.excessive.toString()).build()); + // update to "2" + clusterSettings.applySettings(Settings.builder().put(CLUSTER_MASTER_NODES_FAILURE_TOLERANCE.getKey(), "2").build()); assertThat(reconfigurator.reconfigure(nodes("a"), retired(), initialConfig), sameInstance(initialConfig)); // cannot reconfigure assertThat(reconfigurator.reconfigure(nodes("a", "b", "c"), retired(), initialConfig), equalTo(conf("a", "b", "c", "d", "e"))); - // explicitly specify "recommended" - clusterSettings.applySettings(Settings.builder() - .put(CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING.getKey(), MasterResilienceLevel.recommended.toString()).build()); + // update to "1" + clusterSettings.applySettings(Settings.builder().put(CLUSTER_MASTER_NODES_FAILURE_TOLERANCE.getKey(), "1").build()); assertThat(reconfigurator.reconfigure(nodes("a"), retired(), initialConfig), sameInstance(initialConfig)); // cannot reconfigure assertThat(reconfigurator.reconfigure(nodes("a", "b", "c"), retired(), initialConfig), equalTo(conf("a", "b", "c"))); + // explicitly set to "0" + clusterSettings.applySettings(Settings.builder().put(CLUSTER_MASTER_NODES_FAILURE_TOLERANCE.getKey(), "0").build()); + assertThat(reconfigurator.reconfigure(nodes("a"), retired(), initialConfig), equalTo(conf("a"))); + expectThrows(IllegalArgumentException.class, () -> - clusterSettings.applySettings(Settings.builder().put(CLUSTER_MASTER_RESILIENCE_LEVEL_SETTING.getKey(), "illegal").build())); + clusterSettings.applySettings(Settings.builder().put(CLUSTER_MASTER_NODES_FAILURE_TOLERANCE.getKey(), "-1").build())); } } From 0e05ad83b37720174c099d4ef5f4a51a1df8fd32 Mon Sep 17 00:00:00 2001 From: David Turner Date: Wed, 17 Oct 2018 08:32:56 +0100 Subject: [PATCH 7/9] Throw AssertionError on illegal state --- .../cluster/coordination/Reconfigurator.java | 11 +++++--- .../coordination/ReconfiguratorTests.java | 27 +++++++++++-------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java index d977791238af5..edcbd2c52f05d 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java @@ -89,6 +89,11 @@ public ClusterState.VotingConfiguration reconfigure(Set liveNodes ClusterState.VotingConfiguration currentConfig) { logger.trace("{} reconfiguring {} based on liveNodes={}, retiredNodeIds={}", this, currentConfig, liveNodes, retiredNodeIds); + final int safeConfigurationSize = 2 * masterNodesFailureTolerance + 1; + if (currentConfig.getNodeIds().size() < safeConfigurationSize) { + throw new AssertionError(currentConfig + " does not satisfy masterNodesFailureTolerance of " + masterNodesFailureTolerance); + } + /* * There are three true/false properties of each node in play: live/non-live, retired/non-retired and in-config/not-in-config. * Firstly we divide the nodes into disjoint sets based on these properties: @@ -134,10 +139,8 @@ public ClusterState.VotingConfiguration reconfigure(Set liveNodes // ... except one, if even, because odd configurations are slightly more resilient ... final int votingNodeCount = roundDownToOdd(nonRetiredLiveNodeCount); - // ... except that if the current configuration satisfies CLUSTER_MASTER_NODES_FAILURE_TOLERANCE then the new one must too: - final int safeConfigurationSize = 2 * masterNodesFailureTolerance + 1; - final int targetSize = currentConfig.getNodeIds().size() >= safeConfigurationSize && votingNodeCount < safeConfigurationSize - ? safeConfigurationSize : votingNodeCount; + // ... except that the new configuration must satisfy CLUSTER_MASTER_NODES_FAILURE_TOLERANCE too: + final int targetSize = Math.max(votingNodeCount, safeConfigurationSize); /* * The new configuration is formed by taking this many nodes in the following preference order: diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java index 488df4fb72ddc..991468ff89a48 100644 --- a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java @@ -37,6 +37,7 @@ import static java.util.Collections.emptySet; import static org.elasticsearch.cluster.coordination.Reconfigurator.CLUSTER_MASTER_NODES_FAILURE_TOLERANCE; +import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.sameInstance; @@ -104,24 +105,23 @@ public void testReconfigurationProperty() { if (initialConfig.getNodeIds().size() >= masterNodesFailureTolerance * 2 + 1) { // actual size of a quorum: half the configured nodes, which is all the live nodes plus maybe some dead ones to make up numbers - final int quorumSize = Math.max(liveNodes.length / 2 + 1, masterNodesFailureTolerance); + final int quorumSize = Math.max(liveNodes.length / 2 + 1, masterNodesFailureTolerance + 1); final ClusterState.VotingConfiguration finalConfig = reconfigurator.reconfigure(liveNodesSet, emptySet(), initialConfig); - final String description = "reconfigure " + liveNodesSet + " from " + initialConfig + " with safety factor of " + final String description = "reconfigure " + liveNodesSet + " from " + initialConfig + " with failure tolerance of " + masterNodesFailureTolerance + " yielded " + finalConfig; if (quorumSize > liveNodes.length) { assertFalse(description + " without a live quorum", finalConfig.hasQuorum(Arrays.asList(liveNodes))); } else { final List expectedQuorum = randomSubsetOf(quorumSize, liveNodes); - assertTrue(description + " with quorum of " + expectedQuorum, finalConfig.hasQuorum(expectedQuorum)); + assertTrue(description + " with quorum[" + quorumSize + "] of " + expectedQuorum, finalConfig.hasQuorum(expectedQuorum)); } } else { - IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, - () -> reconfigurator.reconfigure(liveNodesSet, emptySet(), initialConfig)); - - assertThat(exception.getMessage(), equalTo("foo")); + assertThat(expectThrows(AssertionError.class, + () -> reconfigurator.reconfigure(liveNodesSet, emptySet(), initialConfig)).getMessage(), + containsString("does not satisfy masterNodesFailureTolerance")); } } @@ -152,10 +152,15 @@ private void check(Set liveNodes, Set retired, ClusterSta .put(CLUSTER_MASTER_NODES_FAILURE_TOLERANCE.getKey(), masterNodesFailureTolerance) .build()); - final ClusterState.VotingConfiguration adaptedConfig = reconfigurator.reconfigure(liveNodes, retired, config); - assertEquals(new ParameterizedMessage("[liveNodes={}, retired={}, config={}, masterNodesFailureTolerance={}]", - liveNodes, retired, config, masterNodesFailureTolerance).getFormattedMessage(), - expectedConfig, adaptedConfig); + if (config.getNodeIds().size() < 2 * masterNodesFailureTolerance + 1) { + assertThat(expectThrows(AssertionError.class, () -> reconfigurator.reconfigure(liveNodes, retired, config)).getMessage(), + containsString("does not satisfy masterNodesFailureTolerance")); + } else { + final ClusterState.VotingConfiguration adaptedConfig = reconfigurator.reconfigure(liveNodes, retired, config); + assertEquals(new ParameterizedMessage("[liveNodes={}, retired={}, config={}, masterNodesFailureTolerance={}]", + liveNodes, retired, config, masterNodesFailureTolerance).getFormattedMessage(), + expectedConfig, adaptedConfig); + } } private Reconfigurator makeReconfigurator(Settings settings) { From 064fbbd3fe97d0cff422c634886bb1f76a0168b5 Mon Sep 17 00:00:00 2001 From: David Turner Date: Wed, 17 Oct 2018 11:57:02 +0100 Subject: [PATCH 8/9] Review feedback --- .../cluster/coordination/Reconfigurator.java | 4 +- .../coordination/ReconfiguratorTests.java | 51 ++++++++----------- 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java index edcbd2c52f05d..097dfc8bfe0b0 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java @@ -51,7 +51,7 @@ public class Reconfigurator extends AbstractComponent { public static final Setting CLUSTER_MASTER_NODES_FAILURE_TOLERANCE = Setting.intSetting("cluster.master_nodes_failure_tolerance", 0, 0, Property.NodeScope, Property.Dynamic); - private int masterNodesFailureTolerance; + private volatile int masterNodesFailureTolerance; public Reconfigurator(Settings settings, ClusterSettings clusterSettings) { super(settings); @@ -91,7 +91,7 @@ public ClusterState.VotingConfiguration reconfigure(Set liveNodes final int safeConfigurationSize = 2 * masterNodesFailureTolerance + 1; if (currentConfig.getNodeIds().size() < safeConfigurationSize) { - throw new AssertionError(currentConfig + " does not satisfy masterNodesFailureTolerance of " + masterNodesFailureTolerance); + throw new AssertionError(currentConfig + " is smaller than expected " + safeConfigurationSize); } /* diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java index 991468ff89a48..b94a3de854c69 100644 --- a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java @@ -44,35 +44,34 @@ public class ReconfiguratorTests extends ESTestCase { public void testReconfigurationExamples() { - for (int masterNodesFailureTolerance = 0; masterNodesFailureTolerance <= 2; masterNodesFailureTolerance++) { - check(nodes("a"), conf("a"), masterNodesFailureTolerance, conf("a")); - check(nodes("a", "b"), conf("a"), masterNodesFailureTolerance, conf("a")); - check(nodes("a", "b", "c"), conf("a"), masterNodesFailureTolerance, conf("a", "b", "c")); - check(nodes("a", "b", "c"), conf("a", "b"), masterNodesFailureTolerance, conf("a", "b", "c")); + + check(nodes("a"), conf("a"), 0, conf("a")); + check(nodes("a", "b"), conf("a"), 0, conf("a")); + check(nodes("a", "b", "c"), conf("a"), 0, conf("a", "b", "c")); + check(nodes("a", "b", "c"), conf("a", "b"), 0, conf("a", "b", "c")); + check(nodes("a", "b"), conf("a", "b", "e"), 0, conf("a")); + check(nodes("a", "b"), conf("a", "b", "e"), 1, conf("a", "b", "e")); + + for (int masterNodesFailureTolerance = 0; masterNodesFailureTolerance <= 1; masterNodesFailureTolerance++) { check(nodes("a", "b", "c"), conf("a", "b", "c"), masterNodesFailureTolerance, conf("a", "b", "c")); check(nodes("a", "b", "c", "d"), conf("a", "b", "c"), masterNodesFailureTolerance, conf("a", "b", "c")); check(nodes("a", "b", "c", "d", "e"), conf("a", "b", "c"), masterNodesFailureTolerance, conf("a", "b", "c", "d", "e")); - check(nodes("a", "b"), conf("a", "b", "e"), masterNodesFailureTolerance, - masterNodesFailureTolerance == 1 ? conf("a", "b", "e") : conf("a")); check(nodes("a", "b", "c"), conf("a", "b", "e"), masterNodesFailureTolerance, conf("a", "b", "c")); check(nodes("a", "b", "c", "d"), conf("a", "b", "e"), masterNodesFailureTolerance, conf("a", "b", "c")); check(nodes("a", "b", "c", "d", "e"), conf("a", "f", "g"), masterNodesFailureTolerance, conf("a", "b", "c", "d", "e")); - check(nodes("a", "b", "c", "d"), conf("a", "b", "c", "d", "e"), masterNodesFailureTolerance, - masterNodesFailureTolerance == 2 ? conf("a", "b", "c", "d", "e") : conf("a", "b", "c")); + check(nodes("a", "b", "c", "d"), conf("a", "b", "c", "d", "e"), masterNodesFailureTolerance, conf("a", "b", "c")); } + check(nodes("a", "b", "c", "d"), conf("a", "b", "c", "d", "e"), 2, conf("a", "b", "c", "d", "e")); + // Retiring a single node shifts the votes elsewhere if possible. check(nodes("a", "b"), retired("a"), conf("a"), 0, conf("b")); - // If the safety level was never reached then retirement can take place - check(nodes("a", "b"), retired("a"), conf("a"), 1, conf("b")); - check(nodes("a", "b"), retired("a"), conf("b"), 1, conf("b")); - - // Retiring a node from a three-node cluster drops down to a one-node configuration if "fragile" + // Retiring a node from a three-node cluster drops down to a one-node configuration if failure tolerance is 0 check(nodes("a", "b", "c"), retired("a"), conf("a"), 0, conf("b")); check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), 0, conf("b")); - // Retiring is prevented in a three-node cluster if "recommended" + // Retiring is prevented in a three-node cluster if failure tolerance is 1 check(nodes("a", "b", "c"), retired("a"), conf("a", "b", "c"), 1, conf("a", "b", "c")); // 7 nodes, one for each combination of live/retired/current. Ideally we want the config to be the non-retired live nodes. @@ -80,8 +79,8 @@ public void testReconfigurationExamples() { check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), 0, conf("a")); // If we want the config to be at least 3 nodes then we don't retire "c" just yet. check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), 1, conf("a", "b", "c")); - // The current config never reached 5 nodes so retirement is allowed - check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e"), 2, conf("a")); + // If we want the config to be at least 5 nodes then we keep "d" and "h". + check(nodes("a", "b", "c", "f"), retired("c", "e", "f", "g"), conf("a", "c", "d", "e", "h"), 2, conf("a", "b", "c", "d", "h")); } public void testReconfigurationProperty() { @@ -93,8 +92,7 @@ public void testReconfigurationProperty() { final String[] initialVotingNodes = new String[randomIntBetween(1, allNodes.length)]; randomSubsetOf(initialVotingNodes.length, allNodes).toArray(initialVotingNodes); - final int masterNodesFailureTolerance - = randomFrom(0, 1, 2); + final int masterNodesFailureTolerance = randomIntBetween(0, 2); final Reconfigurator reconfigurator = makeReconfigurator( Settings.builder().put(CLUSTER_MASTER_NODES_FAILURE_TOLERANCE.getKey(), masterNodesFailureTolerance).build()); @@ -121,7 +119,7 @@ public void testReconfigurationProperty() { } else { assertThat(expectThrows(AssertionError.class, () -> reconfigurator.reconfigure(liveNodesSet, emptySet(), initialConfig)).getMessage(), - containsString("does not satisfy masterNodesFailureTolerance")); + containsString("is smaller than expected")); } } @@ -152,15 +150,10 @@ private void check(Set liveNodes, Set retired, ClusterSta .put(CLUSTER_MASTER_NODES_FAILURE_TOLERANCE.getKey(), masterNodesFailureTolerance) .build()); - if (config.getNodeIds().size() < 2 * masterNodesFailureTolerance + 1) { - assertThat(expectThrows(AssertionError.class, () -> reconfigurator.reconfigure(liveNodes, retired, config)).getMessage(), - containsString("does not satisfy masterNodesFailureTolerance")); - } else { - final ClusterState.VotingConfiguration adaptedConfig = reconfigurator.reconfigure(liveNodes, retired, config); - assertEquals(new ParameterizedMessage("[liveNodes={}, retired={}, config={}, masterNodesFailureTolerance={}]", - liveNodes, retired, config, masterNodesFailureTolerance).getFormattedMessage(), - expectedConfig, adaptedConfig); - } + final ClusterState.VotingConfiguration adaptedConfig = reconfigurator.reconfigure(liveNodes, retired, config); + assertEquals(new ParameterizedMessage("[liveNodes={}, retired={}, config={}, masterNodesFailureTolerance={}]", + liveNodes, retired, config, masterNodesFailureTolerance).getFormattedMessage(), + expectedConfig, adaptedConfig); } private Reconfigurator makeReconfigurator(Settings settings) { From d26133ec896d04a28fb0596a2b498bf49f5b4340 Mon Sep 17 00:00:00 2001 From: David Turner Date: Wed, 17 Oct 2018 13:37:40 +0100 Subject: [PATCH 9/9] Add TODO comment about default --- .../org/elasticsearch/cluster/coordination/Reconfigurator.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java index 097dfc8bfe0b0..64459eadda471 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java @@ -50,6 +50,8 @@ public class Reconfigurator extends AbstractComponent { */ public static final Setting CLUSTER_MASTER_NODES_FAILURE_TOLERANCE = Setting.intSetting("cluster.master_nodes_failure_tolerance", 0, 0, Property.NodeScope, Property.Dynamic); + // the default is not supposed to be important since we expect to set this setting explicitly at bootstrapping time + // TODO contemplate setting the default to something larger than 0 (1? 1<<30?) private volatile int masterNodesFailureTolerance;