From e7029168a2949e160adcd8f96cfd109e3740f793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?danielhuang=28=E9=BB=84=E5=8D=8E=29?= Date: Thu, 16 Jun 2022 16:36:08 +0800 Subject: [PATCH 1/7] Optimize log cluster health performance. --- .../cluster/health/ClusterStateHealth.java | 44 +++++++++++++++++++ .../cluster/routing/IndexRoutingTable.java | 10 ++++- .../routing/allocation/AllocationService.java | 18 +++++--- 3 files changed, 66 insertions(+), 6 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java b/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java index 592bc1a75b3f4..48d579bbd6ed6 100644 --- a/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java +++ b/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java @@ -7,9 +7,14 @@ */ package org.elasticsearch.cluster.health; +import org.apache.logging.log4j.Logger; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.cluster.routing.IndexRoutingTable; +import org.elasticsearch.cluster.routing.IndexShardRoutingTable; +import org.elasticsearch.cluster.routing.RoutingNodes; +import org.elasticsearch.cluster.routing.RoutingTable; +import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; @@ -22,6 +27,8 @@ import java.util.Map; import java.util.Objects; +import static org.elasticsearch.cluster.health.ClusterShardHealth.getInactivePrimaryHealth; + public final class ClusterStateHealth implements Iterable, Writeable { private final int numberOfNodes; @@ -144,6 +151,43 @@ public ClusterStateHealth( this.indices = indices; } + public static ClusterHealthStatus getHealthStatus(final RoutingTable routingTable, + final RoutingNodes routingNodes, + final Logger logger) { + ClusterHealthStatus computeStatus; + if (routingNodes.unassigned().size() == 0 && routingNodes.hasInactiveShards() == false) { + // no unassigned, inactive shards + return ClusterHealthStatus.GREEN; + } + if (routingNodes.hasUnassignedPrimaries() == false && routingNodes.hasInactivePrimaries() == false) { + // no unassigned, inactive primaries, but has unassigned or inactive replicas + return ClusterHealthStatus.YELLOW; + } + + // cluster at least YELLOW + computeStatus = ClusterHealthStatus.YELLOW; + for (IndexRoutingTable indexRoutingTable : routingTable) { + if (indexRoutingTable.allShardsActive()) { + // skip GREEN index + continue; + } + for (int i=0; i { // note, we assume that when the index routing is created, ShardRoutings are created for all possible number of // shards with state set to UNASSIGNED private final IndexShardRoutingTable[] shards; - + // total shard count of this index + private final int totalShardCount; private final List allActiveShards; IndexRoutingTable(Index index, IndexShardRoutingTable[] shards) { this.index = index; this.shuffler = new RotationShardShuffler(Randomness.get().nextInt()); this.shards = shards; + int totalShardCount = 0; List allActiveShards = new ArrayList<>(); for (IndexShardRoutingTable shard : shards) { allActiveShards.addAll(shard.activeShards()); + totalShardCount += shard.size(); } + this.totalShardCount = totalShardCount; this.allActiveShards = CollectionUtils.wrapUnmodifiableOrEmptySingleton(allActiveShards); } @@ -217,6 +221,10 @@ public boolean allPrimaryShardsActive() { return primaryShardsActive() == shards.length; } + public boolean allShardsActive() { + return totalShardCount == allActiveShards.size(); + } + /** * Calculates the number of primary shards in active state in routing table * diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java index 947a0e9fdf9be..b60c311f82a0f 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java @@ -55,6 +55,7 @@ import static java.util.Collections.emptyList; import static java.util.Collections.singletonList; +import static org.elasticsearch.cluster.health.ClusterStateHealth.getHealthStatus; import static org.elasticsearch.cluster.routing.UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING; /** @@ -168,7 +169,7 @@ private static ClusterState buildResultAndLogHealthChange(ClusterState oldState, } final ClusterState newState = newStateBuilder.build(); - logClusterHealthStateChange(new ClusterStateHealth(oldState), new ClusterStateHealth(newState), reason); + logClusterHealthStateChange(oldState, newState, reason); return newState; } @@ -496,12 +497,19 @@ public ClusterState reroute(ClusterState clusterState, String reason) { } private static void logClusterHealthStateChange( - ClusterStateHealth previousStateHealth, - ClusterStateHealth newStateHealth, + final ClusterState previousState, + final ClusterState newState, String reason ) { - ClusterHealthStatus previousHealth = previousStateHealth.getStatus(); - ClusterHealthStatus currentHealth = newStateHealth.getStatus(); + ClusterHealthStatus previousHealth = getHealthStatus( + previousState.routingTable(), + previousState.getRoutingNodes(), + logger); + ClusterHealthStatus currentHealth = getHealthStatus( + newState.routingTable(), + newState.getRoutingNodes(), + logger); + if (previousHealth.equals(currentHealth) == false) { logger.info( new ESLogMessage("Cluster health status changed from [{}] to [{}] (reason: [{}]).").argAndField( From d38ab3c5f37ca62956f9f2b7df028dabbb62b37d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?danielhuang=28=E9=BB=84=E5=8D=8E=29?= Date: Thu, 16 Jun 2022 16:53:39 +0800 Subject: [PATCH 2/7] fix unused import --- .../cluster/routing/allocation/AllocationService.java | 1 - 1 file changed, 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java index b60c311f82a0f..493f3e9eae2c4 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java @@ -14,7 +14,6 @@ import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.RestoreInProgress; import org.elasticsearch.cluster.health.ClusterHealthStatus; -import org.elasticsearch.cluster.health.ClusterStateHealth; import org.elasticsearch.cluster.metadata.AutoExpandReplicas; import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.cluster.metadata.Metadata; From 65ffd634290530009fdd023bfb79b83cf8984728 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?danielhuang=28=E9=BB=84=E5=8D=8E=29?= Date: Thu, 16 Jun 2022 17:48:23 +0800 Subject: [PATCH 3/7] Add change log. --- docs/changelog/87723.yaml | 5 +++++ .../cluster/health/ClusterStateHealth.java | 13 +++++++------ .../routing/allocation/AllocationService.java | 16 +++------------- 3 files changed, 15 insertions(+), 19 deletions(-) create mode 100644 docs/changelog/87723.yaml diff --git a/docs/changelog/87723.yaml b/docs/changelog/87723.yaml new file mode 100644 index 0000000000000..ff2707a8d9f25 --- /dev/null +++ b/docs/changelog/87723.yaml @@ -0,0 +1,5 @@ +pr: 87723 +summary: Optimize log cluster health performance. +area: Cluster Allocation +type: enhancement +issues: [] diff --git a/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java b/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java index 48d579bbd6ed6..c901f44d1a600 100644 --- a/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java +++ b/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java @@ -151,9 +151,11 @@ public ClusterStateHealth( this.indices = indices; } - public static ClusterHealthStatus getHealthStatus(final RoutingTable routingTable, - final RoutingNodes routingNodes, - final Logger logger) { + public static ClusterHealthStatus getHealthStatus( + final RoutingTable routingTable, + final RoutingNodes routingNodes, + final Logger logger + ) { ClusterHealthStatus computeStatus; if (routingNodes.unassigned().size() == 0 && routingNodes.hasInactiveShards() == false) { // no unassigned, inactive shards @@ -171,7 +173,7 @@ public static ClusterHealthStatus getHealthStatus(final RoutingTable routingTabl // skip GREEN index continue; } - for (int i=0; i Date: Tue, 21 Jun 2022 00:09:26 +0800 Subject: [PATCH 4/7] remote routing nodes logic in log health --- .../cluster/health/ClusterStateHealth.java | 26 +++++++------------ .../cluster/routing/IndexRoutingTable.java | 7 +++-- .../routing/allocation/AllocationService.java | 4 +-- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java b/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java index c901f44d1a600..7b6d75008109a 100644 --- a/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java +++ b/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java @@ -12,8 +12,6 @@ import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.cluster.routing.IndexRoutingTable; import org.elasticsearch.cluster.routing.IndexShardRoutingTable; -import org.elasticsearch.cluster.routing.RoutingNodes; -import org.elasticsearch.cluster.routing.RoutingTable; import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; @@ -152,31 +150,27 @@ public ClusterStateHealth( } public static ClusterHealthStatus getHealthStatus( - final RoutingTable routingTable, - final RoutingNodes routingNodes, + final ClusterState clusterState, final Logger logger ) { - ClusterHealthStatus computeStatus; - if (routingNodes.unassigned().size() == 0 && routingNodes.hasInactiveShards() == false) { - // no unassigned, inactive shards - return ClusterHealthStatus.GREEN; - } - if (routingNodes.hasUnassignedPrimaries() == false && routingNodes.hasInactivePrimaries() == false) { - // no unassigned, inactive primaries, but has unassigned or inactive replicas - return ClusterHealthStatus.YELLOW; + if (clusterState.blocks().hasGlobalBlockWithStatus(RestStatus.SERVICE_UNAVAILABLE)) { + return ClusterHealthStatus.RED; } - // cluster at least YELLOW - computeStatus = ClusterHealthStatus.YELLOW; - for (IndexRoutingTable indexRoutingTable : routingTable) { + ClusterHealthStatus computeStatus = ClusterHealthStatus.GREEN; + for (String index : clusterState.metadata().getConcreteAllIndices()) { + IndexRoutingTable indexRoutingTable = clusterState.routingTable().index(index); if (indexRoutingTable.allShardsActive()) { - // skip GREEN index + // GREEN index continue; } + for (int i = 0; i < indexRoutingTable.size(); i++) { IndexShardRoutingTable indexShardRoutingTable = indexRoutingTable.shard(i); ShardRouting primary = indexShardRoutingTable.primaryShard(); if (primary.active()) { + // index has inactive replicas + computeStatus = ClusterHealthStatus.YELLOW; continue; } computeStatus = getInactivePrimaryHealth(primary); diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/IndexRoutingTable.java b/server/src/main/java/org/elasticsearch/cluster/routing/IndexRoutingTable.java index 21ca1f4b63d42..86185a2bb650f 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/IndexRoutingTable.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/IndexRoutingTable.java @@ -61,8 +61,7 @@ public class IndexRoutingTable implements SimpleDiffable { // note, we assume that when the index routing is created, ShardRoutings are created for all possible number of // shards with state set to UNASSIGNED private final IndexShardRoutingTable[] shards; - // total shard count of this index - private final int totalShardCount; + private final boolean allShardsActive; private final List allActiveShards; IndexRoutingTable(Index index, IndexShardRoutingTable[] shards) { @@ -75,8 +74,8 @@ public class IndexRoutingTable implements SimpleDiffable { allActiveShards.addAll(shard.activeShards()); totalShardCount += shard.size(); } - this.totalShardCount = totalShardCount; this.allActiveShards = CollectionUtils.wrapUnmodifiableOrEmptySingleton(allActiveShards); + this.allShardsActive = totalShardCount == allActiveShards.size(); } /** @@ -222,7 +221,7 @@ public boolean allPrimaryShardsActive() { } public boolean allShardsActive() { - return totalShardCount == allActiveShards.size(); + return this.allShardsActive; } /** diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java index 78c30ba8f02f9..3e66b939c32e1 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java @@ -496,8 +496,8 @@ public ClusterState reroute(ClusterState clusterState, String reason) { } private static void logClusterHealthStateChange(final ClusterState previousState, final ClusterState newState, String reason) { - ClusterHealthStatus previousHealth = getHealthStatus(previousState.routingTable(), previousState.getRoutingNodes(), logger); - ClusterHealthStatus currentHealth = getHealthStatus(newState.routingTable(), newState.getRoutingNodes(), logger); + ClusterHealthStatus previousHealth = getHealthStatus(previousState, logger); + ClusterHealthStatus currentHealth = getHealthStatus(newState, logger); if (previousHealth.equals(currentHealth) == false) { logger.info( From 8af1d285692dcfa7fccef605c705a99d16a91930 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?danielhuang=28=E9=BB=84=E5=8D=8E=29?= Date: Wed, 29 Jun 2022 23:37:12 +0800 Subject: [PATCH 5/7] move getHealthStatus to AllocationService --- .../cluster/health/ClusterStateHealth.java | 39 ------------------ .../routing/allocation/AllocationService.java | 40 +++++++++++++++++-- 2 files changed, 37 insertions(+), 42 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java b/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java index 7b6d75008109a..592bc1a75b3f4 100644 --- a/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java +++ b/server/src/main/java/org/elasticsearch/cluster/health/ClusterStateHealth.java @@ -7,12 +7,9 @@ */ package org.elasticsearch.cluster.health; -import org.apache.logging.log4j.Logger; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.cluster.routing.IndexRoutingTable; -import org.elasticsearch.cluster.routing.IndexShardRoutingTable; -import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; @@ -25,8 +22,6 @@ import java.util.Map; import java.util.Objects; -import static org.elasticsearch.cluster.health.ClusterShardHealth.getInactivePrimaryHealth; - public final class ClusterStateHealth implements Iterable, Writeable { private final int numberOfNodes; @@ -149,40 +144,6 @@ public ClusterStateHealth( this.indices = indices; } - public static ClusterHealthStatus getHealthStatus( - final ClusterState clusterState, - final Logger logger - ) { - if (clusterState.blocks().hasGlobalBlockWithStatus(RestStatus.SERVICE_UNAVAILABLE)) { - return ClusterHealthStatus.RED; - } - - ClusterHealthStatus computeStatus = ClusterHealthStatus.GREEN; - for (String index : clusterState.metadata().getConcreteAllIndices()) { - IndexRoutingTable indexRoutingTable = clusterState.routingTable().index(index); - if (indexRoutingTable.allShardsActive()) { - // GREEN index - continue; - } - - for (int i = 0; i < indexRoutingTable.size(); i++) { - IndexShardRoutingTable indexShardRoutingTable = indexRoutingTable.shard(i); - ShardRouting primary = indexShardRoutingTable.primaryShard(); - if (primary.active()) { - // index has inactive replicas - computeStatus = ClusterHealthStatus.YELLOW; - continue; - } - computeStatus = getInactivePrimaryHealth(primary); - if (computeStatus == ClusterHealthStatus.RED) { - logger.debug("One of inactive primary shard {} causes cluster state RED.", primary.shardId()); - return ClusterHealthStatus.RED; - } - } - } - return computeStatus; - } - public int getActiveShards() { return activeShards; } diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java index 4e95f92222073..5472433ba5c7b 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java @@ -20,6 +20,8 @@ import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata; import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata.Type; import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.cluster.routing.IndexRoutingTable; +import org.elasticsearch.cluster.routing.IndexShardRoutingTable; import org.elasticsearch.cluster.routing.RerouteService; import org.elasticsearch.cluster.routing.RoutingNode; import org.elasticsearch.cluster.routing.RoutingNodes; @@ -38,6 +40,7 @@ import org.elasticsearch.common.util.set.Sets; import org.elasticsearch.gateway.GatewayAllocator; import org.elasticsearch.gateway.PriorityComparator; +import org.elasticsearch.rest.RestStatus; import org.elasticsearch.snapshots.SnapshotsInfoService; import java.util.ArrayList; @@ -54,7 +57,7 @@ import static java.util.Collections.emptyList; import static java.util.Collections.singletonList; -import static org.elasticsearch.cluster.health.ClusterStateHealth.getHealthStatus; +import static org.elasticsearch.cluster.health.ClusterShardHealth.getInactivePrimaryHealth; import static org.elasticsearch.cluster.routing.UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING; /** @@ -496,8 +499,8 @@ public ClusterState reroute(ClusterState clusterState, String reason) { } private static void logClusterHealthStateChange(final ClusterState previousState, final ClusterState newState, String reason) { - ClusterHealthStatus previousHealth = getHealthStatus(previousState, logger); - ClusterHealthStatus currentHealth = getHealthStatus(newState, logger); + ClusterHealthStatus previousHealth = getHealthStatus(previousState); + ClusterHealthStatus currentHealth = getHealthStatus(newState); if (previousHealth.equals(currentHealth) == false) { logger.info( @@ -510,6 +513,37 @@ private static void logClusterHealthStateChange(final ClusterState previousState } } + public static ClusterHealthStatus getHealthStatus(final ClusterState clusterState) { + if (clusterState.blocks().hasGlobalBlockWithStatus(RestStatus.SERVICE_UNAVAILABLE)) { + return ClusterHealthStatus.RED; + } + + ClusterHealthStatus computeStatus = ClusterHealthStatus.GREEN; + for (String index : clusterState.metadata().getConcreteAllIndices()) { + IndexRoutingTable indexRoutingTable = clusterState.routingTable().index(index); + if (indexRoutingTable.allShardsActive()) { + // GREEN index + continue; + } + + for (int i = 0; i < indexRoutingTable.size(); i++) { + IndexShardRoutingTable indexShardRoutingTable = indexRoutingTable.shard(i); + ShardRouting primary = indexShardRoutingTable.primaryShard(); + if (primary.active()) { + // index has inactive replicas + computeStatus = ClusterHealthStatus.YELLOW; + continue; + } + computeStatus = getInactivePrimaryHealth(primary); + if (computeStatus == ClusterHealthStatus.RED) { + logger.debug("One of inactive primary shard {} causes cluster state RED.", primary.shardId()); + return ClusterHealthStatus.RED; + } + } + } + return computeStatus; + } + private static boolean hasDeadNodes(RoutingAllocation allocation) { for (RoutingNode routingNode : allocation.routingNodes()) { if (allocation.nodes().getDataNodes().containsKey(routingNode.nodeId()) == false) { From 9d0eecfb15e0943e24f41b8fead292443d38130b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?danielhuang=28=E9=BB=84=E5=8D=8E=29?= Date: Thu, 30 Jun 2022 08:38:49 +0800 Subject: [PATCH 6/7] Update changelog area value. --- docs/changelog/87723.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changelog/87723.yaml b/docs/changelog/87723.yaml index ff2707a8d9f25..9027c17b74a6c 100644 --- a/docs/changelog/87723.yaml +++ b/docs/changelog/87723.yaml @@ -1,5 +1,5 @@ pr: 87723 summary: Optimize log cluster health performance. -area: Cluster Allocation +area: Allocation type: enhancement issues: [] From 5408aec550fef080f70eb3b17f494ad7bed708e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?danielhuang=28=E9=BB=84=E5=8D=8E=29?= Date: Thu, 30 Jun 2022 19:27:04 +0800 Subject: [PATCH 7/7] Fix NPE issue. --- .../cluster/routing/allocation/AllocationService.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java index 5472433ba5c7b..41093785b0c8d 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java @@ -521,6 +521,9 @@ public static ClusterHealthStatus getHealthStatus(final ClusterState clusterStat ClusterHealthStatus computeStatus = ClusterHealthStatus.GREEN; for (String index : clusterState.metadata().getConcreteAllIndices()) { IndexRoutingTable indexRoutingTable = clusterState.routingTable().index(index); + if (indexRoutingTable == null) { + continue; + } if (indexRoutingTable.allShardsActive()) { // GREEN index continue;