elastic · DaveCTurner · Nov 6, 2018 · Nov 2, 2018 · Nov 2, 2018 · Nov 6, 2018
diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java
@@ -72,7 +72,6 @@
 import java.util.stream.StreamSupport;
 
 import static java.util.Collections.emptySet;
-import static org.elasticsearch.cluster.coordination.Reconfigurator.CLUSTER_MASTER_NODES_FAILURE_TOLERANCE;
 import static org.elasticsearch.discovery.DiscoverySettings.NO_MASTER_BLOCK_WRITES;
 import static org.elasticsearch.gateway.GatewayService.STATE_NOT_RECOVERED_BLOCK;
 
@@ -593,8 +592,6 @@ public void setInitialConfiguration(final VotingConfiguration votingConfiguratio
             MetaData.Builder metaDataBuilder = MetaData.builder();
             // automatically generate a UID for the metadata if we need to
             metaDataBuilder.generateClusterUuidIfNeeded(); // TODO generate UUID in bootstrapping tool?
-            metaDataBuilder.persistentSettings(Settings.builder().put(CLUSTER_MASTER_NODES_FAILURE_TOLERANCE.getKey(),
-                (votingConfiguration.getNodeIds().size() - 1) / 2).build()); // TODO set this in bootstrapping tool?
             builder.metaData(metaDataBuilder);
             coordinationState.get().setInitialState(builder.build());
             preVoteCollector.update(getPreVoteResponse(), null); // pick up the change to last-accepted version

diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java
@@ -41,29 +41,32 @@ public class Reconfigurator extends AbstractComponent {
 
     /**
      * The cluster usually requires a vote from at least half of the master nodes in order to commit a cluster state update, and to achieve
-     * this it makes automatic adjustments to the quorum size as master nodes join or leave the cluster. However, if master nodes leave the
-     * cluster slowly enough then these automatic adjustments can end up with a single master node; if this last node were to fail then the
-     * cluster would be rendered permanently unavailable. Instead it may be preferable to stop processing cluster state updates and become
-     * unavailable when the second-last (more generally, n'th-last) node leaves the cluster, so that the cluster is never in a situation
-     * where a single node's failure can cause permanent unavailability. This setting determines the size of the smallest set of master
-     * nodes required to process a cluster state update.
+     * the best resilience it makes automatic adjustments to the voting configuration as master nodes join or leave the cluster. Adjustments
+     * that fix or increase the size of the voting configuration are always a good idea, but the wisdom of reducing the voting configuration
+     * size is less clear. For instance, automatically reducing the voting configuration down to a single node means the cluster requires
+     * this node to operate, which is not resilient: if it broke we could restore every other master-eligible node in the cluster to health
+     * and still the cluster would be unavailable. However, not reducing the voting configuration size can also hamper resilience: in a
+     * five-node cluster we could lose two nodes and by reducing the voting configuration to the remaining three nodes we could tolerate the
+     * loss of a further node before failing.
+     *
+     * We offer two options: either we auto-shrink the voting configuration as long as it contains more than three nodes, or we don't and we
+     * require the user to control the voting configuration manually using the retirement API. The former (default) option guarantees that
+     * as long as there have been at least three master-eligible nodes in the cluster and no more than one of them is currently unavailable,
+     * then the cluster will still operate, which is what almost everyone wants. Manual control is for users who want different guarantees.
      */
-    public static final Setting<Integer> CLUSTER_MASTER_NODES_FAILURE_TOLERANCE =
-        Setting.intSetting("cluster.master_nodes_failure_tolerance", 0, 0, Property.NodeScope, Property.Dynamic);
-    // the default is not supposed to be important since we expect to set this setting explicitly at bootstrapping time
-    // TODO contemplate setting the default to something larger than 0 (1? 1<<30?)
-    // TODO prevent this being set as a transient or a per-node setting?
+    public static final Setting<Boolean> CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION =
+        Setting.boolSetting("cluster.auto_shrink_voting_configuration", true, Property.NodeScope, Property.Dynamic);
 
-    private volatile int masterNodesFailureTolerance;
+    private volatile boolean autoShrinkVotingConfiguration;
 
     public Reconfigurator(Settings settings, ClusterSettings clusterSettings) {
         super(settings);
-        masterNodesFailureTolerance = CLUSTER_MASTER_NODES_FAILURE_TOLERANCE.get(settings);
-        clusterSettings.addSettingsUpdateConsumer(CLUSTER_MASTER_NODES_FAILURE_TOLERANCE, this::setMasterNodesFailureTolerance);
+        autoShrinkVotingConfiguration = CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION.get(settings);
+        clusterSettings.addSettingsUpdateConsumer(CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION, this::setAutoShrinkVotingConfiguration);
     }
 
-    public void setMasterNodesFailureTolerance(int masterNodesFailureTolerance) {
-        this.masterNodesFailureTolerance = masterNodesFailureTolerance;
+    public void setAutoShrinkVotingConfiguration(boolean autoShrinkVotingConfiguration) {
+        this.autoShrinkVotingConfiguration = autoShrinkVotingConfiguration;
     }
 
     private static int roundDownToOdd(int size) {
@@ -73,7 +76,7 @@ private static int roundDownToOdd(int size) {
     @Override
     public String toString() {
         return "Reconfigurator{" +
-            "masterNodesFailureTolerance=" + masterNodesFailureTolerance +
+            "autoShrinkVotingConfiguration=" + autoShrinkVotingConfiguration +
             '}';
     }
 
@@ -92,22 +95,26 @@ public ClusterState.VotingConfiguration reconfigure(Set<DiscoveryNode> liveNodes
                                                         ClusterState.VotingConfiguration currentConfig) {
         logger.trace("{} reconfiguring {} based on liveNodes={}, retiredNodeIds={}", this, currentConfig, liveNodes, retiredNodeIds);
 
-        final int safeConfigurationSize = 2 * masterNodesFailureTolerance + 1;
-        if (currentConfig.getNodeIds().size() < safeConfigurationSize) {
-            throw new AssertionError(currentConfig + " is smaller than expected " + safeConfigurationSize);
+        // The new configuration may only shrink if CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION is true, and even then may not shrink
+        // to fewer than 3 nodes.
+        final int minimumConfigurationSize;
+        if (autoShrinkVotingConfiguration) {
+            minimumConfigurationSize = Math.min(roundDownToOdd(currentConfig.getNodeIds().size()), 3);
+        } else {
+            minimumConfigurationSize = currentConfig.getNodeIds().size();
         }
 
         /*
          *  There are three true/false properties of each node in play: live/non-live, retired/non-retired and in-config/not-in-config.
          *  Firstly we divide the nodes into disjoint sets based on these properties:
          *
-         *  -    retiredInConfigNotLiveIds
          *  - nonRetiredInConfigNotLiveIds
-         *  -    retiredInConfigLiveIds
          *  - nonRetiredInConfigLiveIds
          *  - nonRetiredLiveNotInConfigIds
          *
-         *  The other 3 possibilities are not relevant:
+         *  The other 5 possibilities are not relevant:
+         *  - retired, in-config, live             -- retired nodes should be removed from the config
+         *  - retired, in-config, non-live         -- retired nodes should be removed from the config
          *  - retired, not-in-config, live         -- cannot add a retired node back to the config
          *  - retired, not-in-config, non-live     -- cannot add a retired node back to the config
          *  - non-retired, non-live, not-in-config -- no evidence this node exists at all
@@ -119,39 +126,40 @@ public ClusterState.VotingConfiguration reconfigure(Set<DiscoveryNode> liveNodes
         liveInConfigIds.retainAll(liveNodeIds);
 
         final Set<String> inConfigNotLiveIds = Sets.sortedDifference(currentConfig.getNodeIds(), liveInConfigIds);
-        final Set<String> retiredInConfigNotLiveIds = new TreeSet<>(inConfigNotLiveIds);
-        retiredInConfigNotLiveIds.retainAll(retiredNodeIds);
         final Set<String> nonRetiredInConfigNotLiveIds = new TreeSet<>(inConfigNotLiveIds);
-        nonRetiredInConfigNotLiveIds.removeAll(retiredInConfigNotLiveIds);
+        nonRetiredInConfigNotLiveIds.removeAll(retiredNodeIds);
 
-        final Set<String> retiredInConfigLiveIds = new TreeSet<>(liveInConfigIds);
-        retiredInConfigLiveIds.retainAll(retiredNodeIds);
         final Set<String> nonRetiredInConfigLiveIds = new TreeSet<>(liveInConfigIds);
-        nonRetiredInConfigLiveIds.removeAll(retiredInConfigLiveIds);
+        nonRetiredInConfigLiveIds.removeAll(retiredNodeIds);
 
         final Set<String> nonRetiredLiveNotInConfigIds = Sets.sortedDifference(liveNodeIds, currentConfig.getNodeIds());
         nonRetiredLiveNotInConfigIds.removeAll(retiredNodeIds);
 
         /*
          * Now we work out how many nodes should be in the configuration:
          */
+        final int targetSize;
 
-        // ideally we want the configuration to be all the non-retired live nodes ...
         final int nonRetiredLiveNodeCount = nonRetiredInConfigLiveIds.size() + nonRetiredLiveNotInConfigIds.size();
-
-        // ... except one, if even, because odd configurations are slightly more resilient ...
-        final int votingNodeCount = roundDownToOdd(nonRetiredLiveNodeCount);
-
-        // ... except that the new configuration must satisfy CLUSTER_MASTER_NODES_FAILURE_TOLERANCE too:
-        final int targetSize = Math.max(votingNodeCount, safeConfigurationSize);
+        final int nonRetiredConfigSize = nonRetiredInConfigLiveIds.size() + nonRetiredInConfigNotLiveIds.size();
+        if (autoShrinkVotingConfiguration) {
+            if (nonRetiredLiveNodeCount >= 3) {
+                targetSize = roundDownToOdd(nonRetiredLiveNodeCount);
+            } else {
+                // only have one or two available nodes; may not shrink below 3 nodes automatically, but if
+                // the config (excluding retired nodes) is already smaller than 3 then it's ok.
+                targetSize = nonRetiredConfigSize < 3 ? 1 : 3;
+            }
+        } else {
+            targetSize = Math.max(roundDownToOdd(nonRetiredLiveNodeCount), nonRetiredConfigSize);
+        }
 
         /*
          * The new configuration is formed by taking this many nodes in the following preference order:
          */
         final ClusterState.VotingConfiguration newConfig = new ClusterState.VotingConfiguration(
-            Stream.of(nonRetiredInConfigLiveIds, nonRetiredLiveNotInConfigIds, // live nodes first, preferring the current config
-                retiredInConfigLiveIds, // if we need more, first use retired nodes that are still alive and haven't been removed yet
-                nonRetiredInConfigNotLiveIds, retiredInConfigNotLiveIds) // if we need more, use non-live nodes
+            // live nodes first, preferring the current config, and if we need more then use non-live nodes
+            Stream.of(nonRetiredInConfigLiveIds, nonRetiredLiveNotInConfigIds, nonRetiredInConfigNotLiveIds)
                 .flatMap(Collection::stream).limit(targetSize).collect(Collectors.toSet()));
 
         if (newConfig.hasQuorum(liveNodeIds)) {

diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java
@@ -452,11 +452,10 @@ public void apply(Settings value, Settings current, Settings previous) {
                     ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING,
                     ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING,
                     ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING,
-                    Coordinator.PUBLISH_TIMEOUT_SETTING,
                     ElectionSchedulerFactory.ELECTION_DURATION_SETTING,
                     Coordinator.PUBLISH_TIMEOUT_SETTING,
                     JoinHelper.JOIN_TIMEOUT_SETTING,
-                    Reconfigurator.CLUSTER_MASTER_NODES_FAILURE_TOLERANCE
+                    Reconfigurator.CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION
             )));
 
     public static List<SettingUpgrader<?>> BUILT_IN_SETTING_UPGRADERS = Collections.unmodifiableList(Arrays.asList(