From 2e064e0d13241114c3418b031d20943d99e85bbf Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 18 Jun 2019 12:10:01 +0100 Subject: [PATCH] Allow election of nodes outside voting config (#43243) Today we suppress election attempts on master-eligible nodes that are not in the voting configuration. In fact this restriction is not necessary: any master-eligible node can safely become master as long as it has a fresh enough cluster state and can gather a quorum of votes. Moreover, this restriction is sometimes undesirable: there may be a reason why we do not want any of the nodes in the voting configuration to become master. The reason for this restriction is as follows. If you want to shut the master down then you might first exclude it from the voting configuration. When this exclusion succeeds you might reasonably expect that a new master has been elected, since the voting config exclusion is almost always a step towards shutting the node down. If we allow nodes outside the voting configuration to be the master then the excluded node will continue to be master, which is confusing. This commit adjusts the logic to allow master-eligible nodes to attempt an election even if they are not in the voting configuration. If such a master is successfully elected then it adds itself to the voting configuration. This commit also adjusts the logic that causes master nodes to abdicate when they are excluded from the voting configuration, to avoid the confusion described above. Relates #37712, #37802. --- .../cluster/coordination/Coordinator.java | 32 ++++---- .../cluster/coordination/Reconfigurator.java | 10 +-- .../coordination/ReconfiguratorTests.java | 2 +- .../coordination/VotingConfigurationIT.java | 75 ++++++++++++++++++- 4 files changed, 98 insertions(+), 21 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java index f169442e9e20f..2e98195da4314 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java @@ -380,9 +380,8 @@ private void startElection() { // The preVoteCollector is only active while we are candidate, but it does not call this method with synchronisation, so we have // to check our mode again here. if (mode == Mode.CANDIDATE) { - if (electionQuorumContainsLocalNode(getLastAcceptedState()) == false) { - logger.trace("skip election as local node is not part of election quorum: {}", - getLastAcceptedState().coordinationMetaData()); + if (localNodeMayWinElection(getLastAcceptedState()) == false) { + logger.trace("skip election as local node may not win it: {}", getLastAcceptedState().coordinationMetaData()); return; } @@ -415,16 +414,17 @@ private void abdicateTo(DiscoveryNode newMaster) { becomeCandidate("after abdicating to " + newMaster); } - private static boolean electionQuorumContainsLocalNode(ClusterState lastAcceptedState) { + private static boolean localNodeMayWinElection(ClusterState lastAcceptedState) { final DiscoveryNode localNode = lastAcceptedState.nodes().getLocalNode(); assert localNode != null; - return electionQuorumContains(lastAcceptedState, localNode); + return nodeMayWinElection(lastAcceptedState, localNode); } - private static boolean electionQuorumContains(ClusterState lastAcceptedState, DiscoveryNode node) { + private static boolean nodeMayWinElection(ClusterState lastAcceptedState, DiscoveryNode node) { final String nodeId = node.getId(); return lastAcceptedState.getLastCommittedConfiguration().getNodeIds().contains(nodeId) - || lastAcceptedState.getLastAcceptedConfiguration().getNodeIds().contains(nodeId); + || lastAcceptedState.getLastAcceptedConfiguration().getNodeIds().contains(nodeId) + || lastAcceptedState.getVotingConfigExclusions().stream().noneMatch(vce -> vce.getNodeId().equals(nodeId)); } private Optional ensureTermAtLeast(DiscoveryNode sourceNode, long targetTerm) { @@ -867,8 +867,8 @@ public boolean setInitialConfiguration(final VotingConfiguration votingConfigura metaDataBuilder.coordinationMetaData(coordinationMetaData); coordinationState.get().setInitialState(ClusterState.builder(currentState).metaData(metaDataBuilder).build()); - assert electionQuorumContainsLocalNode(getLastAcceptedState()) : - "initial state does not have local node in its election quorum: " + getLastAcceptedState().coordinationMetaData(); + assert localNodeMayWinElection(getLastAcceptedState()) : + "initial state does not allow local node to win election: " + getLastAcceptedState().coordinationMetaData(); preVoteCollector.update(getPreVoteResponse(), null); // pick up the change to last-accepted version startElectionScheduler(); return true; @@ -1164,8 +1164,8 @@ public void run() { if (mode == Mode.CANDIDATE) { final ClusterState lastAcceptedState = coordinationState.get().getLastAcceptedState(); - if (electionQuorumContainsLocalNode(lastAcceptedState) == false) { - logger.trace("skip prevoting as local node is not part of election quorum: {}", + if (localNodeMayWinElection(lastAcceptedState) == false) { + logger.trace("skip prevoting as local node may not win election: {}", lastAcceptedState.coordinationMetaData()); return; } @@ -1329,16 +1329,20 @@ public void onSuccess(String source) { updateMaxTermSeen(getCurrentTerm()); if (mode == Mode.LEADER) { + // if necessary, abdicate to another node or improve the voting configuration + boolean attemptReconfiguration = true; final ClusterState state = getLastAcceptedState(); // committed state - if (electionQuorumContainsLocalNode(state) == false) { + if (localNodeMayWinElection(state) == false) { final List masterCandidates = completedNodes().stream() .filter(DiscoveryNode::isMasterNode) - .filter(node -> electionQuorumContains(state, node)) + .filter(node -> nodeMayWinElection(state, node)) .collect(Collectors.toList()); if (masterCandidates.isEmpty() == false) { abdicateTo(masterCandidates.get(random.nextInt(masterCandidates.size()))); + attemptReconfiguration = false; } - } else { + } + if (attemptReconfiguration) { scheduleReconfigurationIfNeeded(); } } diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java index 7a3a54d73b2fe..fd9bab3af11ec 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java @@ -150,6 +150,11 @@ static class VotingConfigNode implements Comparable { @Override public int compareTo(VotingConfigNode other) { + // prefer current master + final int currentMasterComp = Boolean.compare(other.currentMaster, currentMaster); + if (currentMasterComp != 0) { + return currentMasterComp; + } // prefer nodes that are live final int liveComp = Boolean.compare(other.live, live); if (liveComp != 0) { @@ -160,11 +165,6 @@ public int compareTo(VotingConfigNode other) { if (inCurrentConfigComp != 0) { return inCurrentConfigComp; } - // prefer current master - final int currentMasterComp = Boolean.compare(other.currentMaster, currentMaster); - if (currentMasterComp != 0) { - return currentMasterComp; - } // tiebreak by node id to have stable ordering return id.compareTo(other.id); } diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java index bbd9514222c77..a1d12d98398ca 100644 --- a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java @@ -53,7 +53,7 @@ public void testReconfigurationExamples() { check(nodes("a"), conf("a"), true, conf("a")); check(nodes("a", "b"), conf("a"), true, conf("a")); - check(nodes("a", "b"), conf("b"), true, conf("b")); + check(nodes("a", "b"), conf("b"), true, conf("a")); check(nodes("a", "b"), conf("a", "c"), true, conf("a")); check(nodes("a", "b"), conf("a", "b"), true, conf("a")); check(nodes("a", "b"), conf("a", "b", "e"), true, conf("a", "b", "e")); diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/VotingConfigurationIT.java b/server/src/test/java/org/elasticsearch/cluster/coordination/VotingConfigurationIT.java index 8c6775cb6c91e..c3c3abf8866b7 100644 --- a/server/src/test/java/org/elasticsearch/cluster/coordination/VotingConfigurationIT.java +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/VotingConfigurationIT.java @@ -18,17 +18,38 @@ */ package org.elasticsearch.cluster.coordination; +import org.elasticsearch.ElasticsearchException; import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsAction; import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsRequest; +import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.common.Priority; +import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESIntegTestCase; +import org.elasticsearch.test.transport.MockTransportService; +import org.elasticsearch.transport.TransportService; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Set; import java.util.concurrent.ExecutionException; -@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0) +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.hasItem; +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.nullValue; + +@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0, autoManageMasterNodes = false) public class VotingConfigurationIT extends ESIntegTestCase { + @Override + protected Collection> nodePlugins() { + return Collections.singletonList(MockTransportService.TestPlugin.class); + } + public void testAbdicateAfterVotingConfigExclusionAdded() throws ExecutionException, InterruptedException { + internalCluster().setBootstrapMasterNodeIndex(0); internalCluster().startNodes(2); final String originalMaster = internalCluster().getMasterName(); @@ -38,4 +59,56 @@ public void testAbdicateAfterVotingConfigExclusionAdded() throws ExecutionExcept client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get(); assertNotEquals(originalMaster, internalCluster().getMasterName()); } + + public void testElectsNodeNotInVotingConfiguration() throws Exception { + internalCluster().setBootstrapMasterNodeIndex(0); + final List nodeNames = internalCluster().startNodes(4); + + // a 4-node cluster settles on a 3-node configuration; we then prevent the nodes in the configuration from winning an election + // by failing at the pre-voting stage, so that the extra node must be elected instead when the master shuts down. This extra node + // should then add itself into the voting configuration. + + assertFalse(internalCluster().client().admin().cluster().prepareHealth() + .setWaitForNodes("4").setWaitForEvents(Priority.LANGUID).get().isTimedOut()); + + String excludedNodeName = null; + final ClusterState clusterState + = internalCluster().client().admin().cluster().prepareState().clear().setNodes(true).setMetaData(true).get().getState(); + final Set votingConfiguration = clusterState.getLastCommittedConfiguration().getNodeIds(); + assertThat(votingConfiguration, hasSize(3)); + assertThat(clusterState.nodes().getSize(), equalTo(4)); + assertThat(votingConfiguration, hasItem(clusterState.nodes().getMasterNodeId())); + for (DiscoveryNode discoveryNode : clusterState.nodes()) { + if (votingConfiguration.contains(discoveryNode.getId()) == false) { + assertThat(excludedNodeName, nullValue()); + excludedNodeName = discoveryNode.getName(); + } + } + + for (final String sender : nodeNames) { + if (sender.equals(excludedNodeName)) { + continue; + } + final MockTransportService senderTransportService + = (MockTransportService) internalCluster().getInstance(TransportService.class, sender); + for (final String receiver : nodeNames) { + senderTransportService.addSendBehavior(internalCluster().getInstance(TransportService.class, receiver), + (connection, requestId, action, request, options) -> { + if (action.equals(PreVoteCollector.REQUEST_PRE_VOTE_ACTION_NAME)) { + throw new ElasticsearchException("rejected"); + } + connection.sendRequest(requestId, action, request, options); + }); + } + } + + internalCluster().stopCurrentMasterNode(); + assertFalse(internalCluster().client().admin().cluster().prepareHealth() + .setWaitForNodes("3").setWaitForEvents(Priority.LANGUID).get().isTimedOut()); + + final ClusterState newClusterState + = internalCluster().client().admin().cluster().prepareState().clear().setNodes(true).setMetaData(true).get().getState(); + assertThat(newClusterState.nodes().getMasterNode().getName(), equalTo(excludedNodeName)); + assertThat(newClusterState.getLastCommittedConfiguration().getNodeIds(), hasItem(newClusterState.nodes().getMasterNodeId())); + } }