elastic · gwbrown · Oct 23, 2018 · Aug 13, 2018 · Sep 25, 2018 · Sep 26, 2018
diff --git a/docs/reference/migration/migrate_7_0/cluster.asciidoc b/docs/reference/migration/migrate_7_0/cluster.asciidoc
@@ -18,3 +18,10 @@ primary shards of the opened index to be allocated.
 [float]
 ==== Shard preferences `_primary`, `_primary_first`, `_replica`, and `_replica_first` are removed
 These shard preferences are removed in favour of the `_prefer_nodes` and `_only_nodes` preferences.
+
+[float]
+==== Cluster-wide shard soft limit
+Clusters now have soft limits on the total number of open shards in the cluster
+based on the number of nodes and the `cluster.max_shards_per_node` cluster
+setting, to prevent accidental operations that would destabilize the cluster.
+More information can be found in the <<misc-cluster,documentation for that setting>>.
diff --git a/docs/reference/modules/cluster/misc.asciidoc b/docs/reference/modules/cluster/misc.asciidoc
@@ -22,6 +22,48 @@ user with access to the <<cluster-update-settings,cluster-update-settings>>
 API can make the cluster read-write again.
 
 
+[[cluster-shard-limit]]
+
+==== Cluster Shard Limit
+
+In Elasticsearch 7.0 and later, there will be a soft limit on the number of
+shards in a cluster, based on the number of nodes in the cluster.  This is
+intended to prevent operations which may unintentionally destabilize the
+cluster. Prior to 7.0, actions which would result in the cluster going over the
+limit will issue a deprecation warning.
+
+NOTE: You can set the system property `es.enforce_max_shards_per_node` to `true`
+to opt in to strict enforcement of the shard limit. If this system property is
+set, actions which would result in the cluster going over the limit will result
+in an error, rather than a deprecation warning. This property will be removed in
+Elasticsearch 7.0, as strict enforcement of the limit will be the default and
+only behavior.
+
+If an operation, such as creating a new index, restoring a snapshot of an index,
+or opening a closed index, would lead to the number of shards in the cluster
+going over this limit, the operation will issue a deprecation warning.
+
+If the cluster is already over the limit, due to changes in node membership or
+setting changes, all operations that create or open indices will issue warnings
+until either the limit is increased as described below, or some indices are
+<<indices-open-close,closed>> or <<indices-delete-index,deleted>> to bring the
+number of shards below the limit.
+
+Replicas count towards this limit, but closed indexes do not. An index with 5
+primary shards and 2 replicas will be counted as 15 shards.  Any closed index
+is counted as 0, no matter how many shards and replicas it contains.
+
+The limit defaults to 1,000 shards per node, and can be dynamically adjusted using
+the following property:
+
+`cluster.max_shards_per_node`::
+
+     Controls the number of shards allowed in the cluster per node.
+
+For example, a 3-node cluster with the default setting would allow 3,000 shards
+total, across all open indexes.  If the above setting is changed to 1,500, then
+the cluster would allow 4,500 shards total.
+
 [[user-defined-data]]
 ==== User Defined Cluster Metadata
 
@@ -109,4 +151,4 @@ Enable or disable allocation for persistent tasks:
 This setting does not affect the persistent tasks that are already being executed.
 Only newly created persistent tasks, or tasks that must be reassigned (after a node
 left the cluster, for example), are impacted by this setting.
---
+--
diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/MetaData.java b/server/src/main/java/org/elasticsearch/cluster/metadata/MetaData.java
@@ -22,7 +22,6 @@
 import com.carrotsearch.hppc.ObjectHashSet;
 import com.carrotsearch.hppc.cursors.ObjectCursor;
 import com.carrotsearch.hppc.cursors.ObjectObjectCursor;
-
 import org.apache.logging.log4j.Logger;
 import org.apache.lucene.util.CollectionUtil;
 import org.elasticsearch.action.AliasesRequest;
@@ -124,9 +123,11 @@ public enum XContentContext {
     public interface Custom extends NamedDiffable<Custom>, ToXContentFragment, ClusterState.FeatureAware {
 
         EnumSet<XContentContext> context();
-
     }
 
+    public static final Setting<Integer> SETTING_CLUSTER_MAX_SHARDS_PER_NODE =
+        Setting.intSetting("cluster.max_shards_per_node", 1000, 1, Property.Dynamic, Property.NodeScope);
+
     public static final Setting<Boolean> SETTING_READ_ONLY_SETTING =
         Setting.boolSetting("cluster.blocks.read_only", false, Property.Dynamic, Property.NodeScope);
 
@@ -162,6 +163,7 @@ public interface Custom extends NamedDiffable<Custom>, ToXContentFragment, Clust
     private final ImmutableOpenMap<String, Custom> customs;
 
     private final transient int totalNumberOfShards; // Transient ? not serializable anyway?
+    private final int totalOpenIndexShards;
     private final int numberOfShards;
 
     private final String[] allIndices;
@@ -183,12 +185,17 @@ public interface Custom extends NamedDiffable<Custom>, ToXContentFragment, Clust
         this.customs = customs;
         this.templates = templates;
         int totalNumberOfShards = 0;
+        int totalOpenIndexShards = 0;
         int numberOfShards = 0;
         for (ObjectCursor<IndexMetaData> cursor : indices.values()) {
             totalNumberOfShards += cursor.value.getTotalNumberOfShards();
             numberOfShards += cursor.value.getNumberOfShards();
+            if (IndexMetaData.State.OPEN.equals(cursor.value.getState())) {
+                totalOpenIndexShards += cursor.value.getTotalNumberOfShards();
+            }
         }
         this.totalNumberOfShards = totalNumberOfShards;
+        this.totalOpenIndexShards = totalOpenIndexShards;
         this.numberOfShards = numberOfShards;
 
         this.allIndices = allIndices;
@@ -667,10 +674,29 @@ public <T extends Custom> T custom(String type) {
     }
 
 
+    /**
+     * Gets the total number of shards from all indices, including replicas and
+     * closed indices.
+     * @return The total number of shards from all indices.
+     */
     public int getTotalNumberOfShards() {
         return this.totalNumberOfShards;
     }
 
+    /**
+     * Gets the total number of shards, including replicas, that belong to
+     * open indices. Shards of closed indices are not counted.
+     * @return The total number of shards in all open indices.
+     */
+    public int getTotalOpenIndexShards() {
+        return this.totalOpenIndexShards;
+    }
+
+    /**
+     * Gets the number of primary shards from all indices, open or closed, not
+     * including replicas.
+     * @return The number of primary shards from all indices.
+     */
     public int getNumberOfShards() {
         return this.numberOfShards;
     }

diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/MetaDataCreateIndexService.java b/server/src/main/java/org/elasticsearch/cluster/metadata/MetaDataCreateIndexService.java
@@ -53,6 +53,7 @@
 import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.compress.CompressedXContent;
 import org.elasticsearch.common.io.PathUtils;
+import org.elasticsearch.common.logging.DeprecationLogger;
 import org.elasticsearch.common.settings.IndexScopedSettings;
 import org.elasticsearch.common.settings.Setting;
 import org.elasticsearch.common.settings.Settings;
@@ -82,6 +83,7 @@
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Optional;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.function.BiFunction;
@@ -587,19 +589,38 @@ public void onFailure(String source, Exception e) {
 
     private void validate(CreateIndexClusterStateUpdateRequest request, ClusterState state) {
         validateIndexName(request.index(), state);
-        validateIndexSettings(request.index(), request.settings(), forbidPrivateIndexSettings);
+        validateIndexSettings(request.index(), request.settings(), state, forbidPrivateIndexSettings);
     }
 
-    public void validateIndexSettings(
-            final String indexName, final Settings settings, final boolean forbidPrivateIndexSettings) throws IndexCreationException {
+    public void validateIndexSettings(String indexName, final Settings settings, final ClusterState clusterState,
+                                      final boolean forbidPrivateIndexSettings) throws IndexCreationException {
         List<String> validationErrors = getIndexSettingsValidationErrors(settings, forbidPrivateIndexSettings);
+
+        Optional<String> shardAllocation = checkShardLimit(settings, clusterState, deprecationLogger);
+        shardAllocation.ifPresent(validationErrors::add);
+
         if (validationErrors.isEmpty() == false) {
             ValidationException validationException = new ValidationException();
             validationException.addValidationErrors(validationErrors);
             throw new IndexCreationException(indexName, validationException);
         }
     }
 
+    /**
+     * Determines whether creating an index with the given settings would push the cluster past its shard limit.
+     *
+     * @param settings The settings of the index that is about to be created.
+     * @param clusterState The cluster state to check against.
+     * @param deprecationLogger The logger handed to the limit check for emitting a deprecation warning, if appropriate.
+     * @return An error message with which to reject the index creation, or empty if the creation may go ahead.
+     */
+    static Optional<String> checkShardLimit(Settings settings, ClusterState clusterState, DeprecationLogger deprecationLogger) {
+        final int primaries = IndexMetaData.INDEX_NUMBER_OF_SHARDS_SETTING.get(settings);
+        final int copiesPerPrimary = 1 + IndexMetaData.INDEX_NUMBER_OF_REPLICAS_SETTING.get(settings);
+
+        return IndicesService.checkShardLimit(primaries * copiesPerPrimary, clusterState, deprecationLogger);
+    }
+
     List<String> getIndexSettingsValidationErrors(final Settings settings, final boolean forbidPrivateIndexSettings) {
         String customPath = IndexMetaData.INDEX_DATA_PATH_SETTING.get(settings);
         List<String> validationErrors = new ArrayList<>();

diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/MetaDataIndexStateService.java b/server/src/main/java/org/elasticsearch/cluster/metadata/MetaDataIndexStateService.java
@@ -36,8 +36,10 @@
 import org.elasticsearch.cluster.routing.allocation.AllocationService;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.Priority;
+import org.elasticsearch.common.ValidationException;
 import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.logging.DeprecationLogger;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.indices.IndicesService;
@@ -50,6 +52,7 @@
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Optional;
 import java.util.Set;
 
 /**
@@ -175,6 +178,8 @@ public ClusterState execute(ClusterState currentState) {
                     }
                 }
 
+                validateShardLimit(currentState, request.indices(), deprecationLogger);
+
                 if (indicesToOpen.isEmpty()) {
                     return currentState;
                 }
@@ -217,4 +222,33 @@ public ClusterState execute(ClusterState currentState) {
         });
     }
 
+    /**
+     * Checks that opening the given indices keeps the cluster within its shard limit.  Only indices that are currently closed
+     * contribute shards to the check; indices that are already open are skipped.
+     *
+     * @param currentState The current cluster state.
+     * @param indices The indices which are to be opened.
+     * @param deprecationLogger The logger passed to the limit check to emit a deprecation warning, if appropriate.
+     * @throws ValidationException If the limit check reports an error for this operation.
+     */
+    static void validateShardLimit(ClusterState currentState, Index[] indices, DeprecationLogger deprecationLogger) {
+        int shardsToOpen = 0;
+        for (Index index : indices) {
+            if (currentState.metaData().index(index).getState().equals(IndexMetaData.State.CLOSE)) {
+                shardsToOpen += getTotalShardCount(currentState, index);
+            }
+        }
+
+        IndicesService.checkShardLimit(shardsToOpen, currentState, deprecationLogger).ifPresent(message -> {
+            ValidationException ex = new ValidationException();
+            ex.addValidationError(message);
+            throw ex;
+        });
+    }
+
+    /** Counts every shard copy of {@code index}: its primaries plus all of their replicas. */
+    private static int getTotalShardCount(ClusterState state, Index index) {
+        final IndexMetaData target = state.metaData().index(index);
+        return (1 + target.getNumberOfReplicas()) * target.getNumberOfShards();
+    }
 }
diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/MetaDataUpdateSettingsService.java b/server/src/main/java/org/elasticsearch/cluster/metadata/MetaDataUpdateSettingsService.java
@@ -33,6 +33,7 @@
 import org.elasticsearch.cluster.routing.allocation.AllocationService;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.Priority;
+import org.elasticsearch.common.ValidationException;
 import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.inject.Inject;
@@ -45,9 +46,11 @@
 import org.elasticsearch.threadpool.ThreadPool;
 
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Optional;
 import java.util.Set;
 
 import static org.elasticsearch.action.support.ContextPreservingActionListener.wrapPreservingContext;
@@ -115,6 +118,7 @@ protected ClusterStateUpdateResponse newResponse(boolean acknowledged) {
 
             @Override
             public ClusterState execute(ClusterState currentState) {
+
                 RoutingTable.Builder routingTableBuilder = RoutingTable.builder(currentState.routingTable());
                 MetaData.Builder metaDataBuilder = MetaData.builder(currentState.metaData());
 
@@ -141,6 +145,18 @@ public ClusterState execute(ClusterState currentState) {
 
                 int updatedNumberOfReplicas = openSettings.getAsInt(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, -1);
                 if (updatedNumberOfReplicas != -1 && preserveExisting == false) {
+
+                    // Verify that this won't take us over the cluster shard limit.
+                    int totalNewShards = Arrays.stream(request.indices())
+                        .mapToInt(i -> getTotalNewShards(i, currentState, updatedNumberOfReplicas))
+                        .sum();
+                    Optional<String> error = IndicesService.checkShardLimit(totalNewShards, currentState, deprecationLogger);
+                    if (error.isPresent()) {
+                        ValidationException ex = new ValidationException();
+                        ex.addValidationError(error.get());
+                        throw ex;
+                    }
+
                     // we do *not* update the in sync allocation ids as they will be removed upon the first index
                     // operation which make these copies stale
                     // TODO: update the list once the data is deleted by the node?
@@ -224,6 +240,14 @@ public ClusterState execute(ClusterState currentState) {
         });
     }
 
+    /** Gets the number of shards added (negative if removed) when {@code index} is set to {@code updatedNumberOfReplicas} replicas. */
+    private static int getTotalNewShards(Index index, ClusterState currentState, int updatedNumberOfReplicas) {
+        IndexMetaData indexMetaData = currentState.metaData().index(index);
+        int replicaIncrease = updatedNumberOfReplicas - indexMetaData.getNumberOfReplicas();
+        // Each primary shard gains (or loses) one copy per replica added (or removed).
+        return replicaIncrease * indexMetaData.getNumberOfShards();
+    }
+
     /**
      * Updates the cluster block only iff the setting exists in the given settings
      */

diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java
@@ -196,6 +196,7 @@ public void apply(Settings value, Settings current, Settings previous) {
                     MappingUpdatedAction.INDICES_MAPPING_DYNAMIC_TIMEOUT_SETTING,
                     MetaData.SETTING_READ_ONLY_SETTING,
                     MetaData.SETTING_READ_ONLY_ALLOW_DELETE_SETTING,
+                    MetaData.SETTING_CLUSTER_MAX_SHARDS_PER_NODE,
                     RecoverySettings.INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING,
                     RecoverySettings.INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING,
                     RecoverySettings.INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING,