Skip to content

Commit

Permalink
Allow election of nodes outside voting config (#43243)
Browse files Browse the repository at this point in the history
Today we suppress election attempts on master-eligible nodes that are not in
the voting configuration. In fact this restriction is not necessary: any
master-eligible node can safely become master as long as it has a fresh enough
cluster state and can gather a quorum of votes. Moreover, this restriction is
sometimes undesirable: there may be a reason why we do not want any of the
nodes in the voting configuration to become master.

The reason for this restriction is as follows. If you want to shut the master
down then you might first exclude it from the voting configuration. When this
exclusion succeeds you might reasonably expect that a new master has been
elected, since the voting config exclusion is almost always a step towards
shutting the node down. If we allow nodes outside the voting configuration to
be the master then the excluded node will continue to be master, which is
confusing.

This commit adjusts the logic to allow master-eligible nodes to attempt an
election even if they are not in the voting configuration. If such a master is
successfully elected then it adds itself to the voting configuration. This
commit also adjusts the logic that causes master nodes to abdicate when they
are excluded from the voting configuration, to avoid the confusion described
above.

Relates #37712, #37802.
  • Loading branch information
DaveCTurner committed Jun 18, 2019
1 parent 5a9c483 commit 2e064e0
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -380,9 +380,8 @@ private void startElection() {
// The preVoteCollector is only active while we are candidate, but it does not call this method with synchronisation, so we have
// to check our mode again here.
if (mode == Mode.CANDIDATE) {
if (electionQuorumContainsLocalNode(getLastAcceptedState()) == false) {
logger.trace("skip election as local node is not part of election quorum: {}",
getLastAcceptedState().coordinationMetaData());
if (localNodeMayWinElection(getLastAcceptedState()) == false) {
logger.trace("skip election as local node may not win it: {}", getLastAcceptedState().coordinationMetaData());
return;
}

Expand Down Expand Up @@ -415,16 +414,17 @@ private void abdicateTo(DiscoveryNode newMaster) {
becomeCandidate("after abdicating to " + newMaster);
}

private static boolean electionQuorumContainsLocalNode(ClusterState lastAcceptedState) {
private static boolean localNodeMayWinElection(ClusterState lastAcceptedState) {
final DiscoveryNode localNode = lastAcceptedState.nodes().getLocalNode();
assert localNode != null;
return electionQuorumContains(lastAcceptedState, localNode);
return nodeMayWinElection(lastAcceptedState, localNode);
}

private static boolean electionQuorumContains(ClusterState lastAcceptedState, DiscoveryNode node) {
private static boolean nodeMayWinElection(ClusterState lastAcceptedState, DiscoveryNode node) {
final String nodeId = node.getId();
return lastAcceptedState.getLastCommittedConfiguration().getNodeIds().contains(nodeId)
|| lastAcceptedState.getLastAcceptedConfiguration().getNodeIds().contains(nodeId);
|| lastAcceptedState.getLastAcceptedConfiguration().getNodeIds().contains(nodeId)
|| lastAcceptedState.getVotingConfigExclusions().stream().noneMatch(vce -> vce.getNodeId().equals(nodeId));
}

private Optional<Join> ensureTermAtLeast(DiscoveryNode sourceNode, long targetTerm) {
Expand Down Expand Up @@ -867,8 +867,8 @@ public boolean setInitialConfiguration(final VotingConfiguration votingConfigura
metaDataBuilder.coordinationMetaData(coordinationMetaData);

coordinationState.get().setInitialState(ClusterState.builder(currentState).metaData(metaDataBuilder).build());
assert electionQuorumContainsLocalNode(getLastAcceptedState()) :
"initial state does not have local node in its election quorum: " + getLastAcceptedState().coordinationMetaData();
assert localNodeMayWinElection(getLastAcceptedState()) :
"initial state does not allow local node to win election: " + getLastAcceptedState().coordinationMetaData();
preVoteCollector.update(getPreVoteResponse(), null); // pick up the change to last-accepted version
startElectionScheduler();
return true;
Expand Down Expand Up @@ -1164,8 +1164,8 @@ public void run() {
if (mode == Mode.CANDIDATE) {
final ClusterState lastAcceptedState = coordinationState.get().getLastAcceptedState();

if (electionQuorumContainsLocalNode(lastAcceptedState) == false) {
logger.trace("skip prevoting as local node is not part of election quorum: {}",
if (localNodeMayWinElection(lastAcceptedState) == false) {
logger.trace("skip prevoting as local node may not win election: {}",
lastAcceptedState.coordinationMetaData());
return;
}
Expand Down Expand Up @@ -1329,16 +1329,20 @@ public void onSuccess(String source) {
updateMaxTermSeen(getCurrentTerm());

if (mode == Mode.LEADER) {
// if necessary, abdicate to another node or improve the voting configuration
boolean attemptReconfiguration = true;
final ClusterState state = getLastAcceptedState(); // committed state
if (electionQuorumContainsLocalNode(state) == false) {
if (localNodeMayWinElection(state) == false) {
final List<DiscoveryNode> masterCandidates = completedNodes().stream()
.filter(DiscoveryNode::isMasterNode)
.filter(node -> electionQuorumContains(state, node))
.filter(node -> nodeMayWinElection(state, node))
.collect(Collectors.toList());
if (masterCandidates.isEmpty() == false) {
abdicateTo(masterCandidates.get(random.nextInt(masterCandidates.size())));
attemptReconfiguration = false;
}
} else {
}
if (attemptReconfiguration) {
scheduleReconfigurationIfNeeded();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,11 @@ static class VotingConfigNode implements Comparable<VotingConfigNode> {

@Override
public int compareTo(VotingConfigNode other) {
// prefer current master
final int currentMasterComp = Boolean.compare(other.currentMaster, currentMaster);
if (currentMasterComp != 0) {
return currentMasterComp;
}
// prefer nodes that are live
final int liveComp = Boolean.compare(other.live, live);
if (liveComp != 0) {
Expand All @@ -160,11 +165,6 @@ public int compareTo(VotingConfigNode other) {
if (inCurrentConfigComp != 0) {
return inCurrentConfigComp;
}
// prefer current master
final int currentMasterComp = Boolean.compare(other.currentMaster, currentMaster);
if (currentMasterComp != 0) {
return currentMasterComp;
}
// tiebreak by node id to have stable ordering
return id.compareTo(other.id);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public void testReconfigurationExamples() {

check(nodes("a"), conf("a"), true, conf("a"));
check(nodes("a", "b"), conf("a"), true, conf("a"));
check(nodes("a", "b"), conf("b"), true, conf("b"));
check(nodes("a", "b"), conf("b"), true, conf("a"));
check(nodes("a", "b"), conf("a", "c"), true, conf("a"));
check(nodes("a", "b"), conf("a", "b"), true, conf("a"));
check(nodes("a", "b"), conf("a", "b", "e"), true, conf("a", "b", "e"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,38 @@
*/
package org.elasticsearch.cluster.coordination;

import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsAction;
import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsRequest;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.Priority;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.transport.MockTransportService;
import org.elasticsearch.transport.TransportService;

import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;

@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0)
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasItem;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.nullValue;

@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0, autoManageMasterNodes = false)
public class VotingConfigurationIT extends ESIntegTestCase {

@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
return Collections.singletonList(MockTransportService.TestPlugin.class);
}

public void testAbdicateAfterVotingConfigExclusionAdded() throws ExecutionException, InterruptedException {
internalCluster().setBootstrapMasterNodeIndex(0);
internalCluster().startNodes(2);
final String originalMaster = internalCluster().getMasterName();

Expand All @@ -38,4 +59,56 @@ public void testAbdicateAfterVotingConfigExclusionAdded() throws ExecutionExcept
client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
assertNotEquals(originalMaster, internalCluster().getMasterName());
}

public void testElectsNodeNotInVotingConfiguration() throws Exception {
internalCluster().setBootstrapMasterNodeIndex(0);
final List<String> nodeNames = internalCluster().startNodes(4);

// a 4-node cluster settles on a 3-node configuration; we then prevent the nodes in the configuration from winning an election
// by failing at the pre-voting stage, so that the extra node must be elected instead when the master shuts down. This extra node
// should then add itself into the voting configuration.

assertFalse(internalCluster().client().admin().cluster().prepareHealth()
.setWaitForNodes("4").setWaitForEvents(Priority.LANGUID).get().isTimedOut());

String excludedNodeName = null;
final ClusterState clusterState
= internalCluster().client().admin().cluster().prepareState().clear().setNodes(true).setMetaData(true).get().getState();
final Set<String> votingConfiguration = clusterState.getLastCommittedConfiguration().getNodeIds();
assertThat(votingConfiguration, hasSize(3));
assertThat(clusterState.nodes().getSize(), equalTo(4));
assertThat(votingConfiguration, hasItem(clusterState.nodes().getMasterNodeId()));
for (DiscoveryNode discoveryNode : clusterState.nodes()) {
if (votingConfiguration.contains(discoveryNode.getId()) == false) {
assertThat(excludedNodeName, nullValue());
excludedNodeName = discoveryNode.getName();
}
}

for (final String sender : nodeNames) {
if (sender.equals(excludedNodeName)) {
continue;
}
final MockTransportService senderTransportService
= (MockTransportService) internalCluster().getInstance(TransportService.class, sender);
for (final String receiver : nodeNames) {
senderTransportService.addSendBehavior(internalCluster().getInstance(TransportService.class, receiver),
(connection, requestId, action, request, options) -> {
if (action.equals(PreVoteCollector.REQUEST_PRE_VOTE_ACTION_NAME)) {
throw new ElasticsearchException("rejected");
}
connection.sendRequest(requestId, action, request, options);
});
}
}

internalCluster().stopCurrentMasterNode();
assertFalse(internalCluster().client().admin().cluster().prepareHealth()
.setWaitForNodes("3").setWaitForEvents(Priority.LANGUID).get().isTimedOut());

final ClusterState newClusterState
= internalCluster().client().admin().cluster().prepareState().clear().setNodes(true).setMetaData(true).get().getState();
assertThat(newClusterState.nodes().getMasterNode().getName(), equalTo(excludedNodeName));
assertThat(newClusterState.getLastCommittedConfiguration().getNodeIds(), hasItem(newClusterState.nodes().getMasterNodeId()));
}
}

0 comments on commit 2e064e0

Please sign in to comment.