From 420825c3b5fbf04bedc4132d4177ece95a69585c Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Thu, 21 Nov 2019 17:37:41 +0100 Subject: [PATCH] Strengthen validateClusterFormed check (#49248) Strengthens the validateClusterFormed check that is used by the test infrastructure to make sure that nodes are properly connected and know about each other. Is used in situations where the cluster is scaled up and down, and where there previously was a network disruption that has been healed. Closes #49243 --- .../test/InternalTestCluster.java | 43 +++++++++++-------- .../test/disruption/NetworkDisruption.java | 8 +--- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java b/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java index f5e1535a2a4c5..991e7f58d15ed 100644 --- a/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java +++ b/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java @@ -161,9 +161,9 @@ import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.greaterThanOrEqualTo; -import static org.junit.Assert.assertEquals; import static org.hamcrest.Matchers.not; import static org.hamcrest.Matchers.nullValue; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; @@ -1239,29 +1239,35 @@ private synchronized void reset(boolean wipeData) throws IOException { /** ensure a cluster is formed with all published nodes. */ public synchronized void validateClusterFormed() { - String name = randomFrom(random, getNodeNames()); - validateClusterFormed(name); - } - - /** ensure a cluster is formed with all published nodes, but do so by using the client of the specified node */ - private synchronized void validateClusterFormed(String viaNode) { - Set expectedNodes = new HashSet<>(); + final Set expectedNodes = new HashSet<>(); for (NodeAndClient nodeAndClient : nodes.values()) { expectedNodes.add(getInstanceFromNode(ClusterService.class, nodeAndClient.node()).localNode()); } - logger.trace("validating cluster formed via [{}], expecting {}", viaNode, expectedNodes); - final Client client = client(viaNode); + logger.trace("validating cluster formed, expecting {}", expectedNodes); + try { assertBusy(() -> { - DiscoveryNodes discoveryNodes = client.admin().cluster().prepareState().get().getState().nodes(); - assertEquals(expectedNodes.size(), discoveryNodes.getSize()); - for (DiscoveryNode expectedNode : expectedNodes) { - assertTrue("Expected node to exist: " + expectedNode, discoveryNodes.nodeExists(expectedNode)); - } + final List states = nodes.values().stream() + .map(node -> getInstanceFromNode(ClusterService.class, node.node())) + .map(ClusterService::state) + .collect(Collectors.toList()); + final String debugString = ", expected nodes: " + expectedNodes + " and actual cluster states " + states; + // all nodes have a master + assertTrue("Missing master" + debugString, states.stream().allMatch(cs -> cs.nodes().getMasterNodeId() != null)); + // all nodes have the same master (in same term) + assertEquals("Not all masters in same term" + debugString, 1, + states.stream().mapToLong(ClusterState::term).distinct().count()); + // all nodes know about all other nodes + states.forEach(cs -> { + DiscoveryNodes discoveryNodes = cs.nodes(); + assertEquals("Node size mismatch" + debugString, expectedNodes.size(), discoveryNodes.getSize()); + for (DiscoveryNode expectedNode : expectedNodes) { + assertTrue("Expected node to exist: " + expectedNode + debugString, discoveryNodes.nodeExists(expectedNode)); + } + }); }, 30, TimeUnit.SECONDS); } catch (AssertionError ae) { - throw new IllegalStateException("cluster failed to form with expected nodes " + expectedNodes + " and actual nodes " + - client.admin().cluster().prepareState().get().getState().nodes()); + throw new IllegalStateException("cluster failed to form", ae); } catch (Exception e) { throw new IllegalStateException(e); } @@ -1821,8 +1827,7 @@ private void restartNode(NodeAndClient nodeAndClient, RestartCallback callback) if (callback.validateClusterForming() || excludedNodeIds.isEmpty() == false) { // we have to validate cluster size to ensure that the restarted node has rejoined the cluster if it was master-eligible; - // we have to do this via the node that was just restarted as it may be that the master didn't yet process the fact that it left - validateClusterFormed(nodeAndClient.name); + validateClusterFormed(); } if (excludedNodeIds.isEmpty() == false) { diff --git a/test/framework/src/main/java/org/elasticsearch/test/disruption/NetworkDisruption.java b/test/framework/src/main/java/org/elasticsearch/test/disruption/NetworkDisruption.java index 66514b1b274ac..cce5ed2ee2882 100644 --- a/test/framework/src/main/java/org/elasticsearch/test/disruption/NetworkDisruption.java +++ b/test/framework/src/main/java/org/elasticsearch/test/disruption/NetworkDisruption.java @@ -20,7 +20,6 @@ package org.elasticsearch.test.disruption; import com.carrotsearch.randomizedtesting.generators.RandomPicks; - import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.cluster.ClusterState; @@ -41,8 +40,6 @@ import java.util.concurrent.CountDownLatch; import java.util.function.BiConsumer; -import static org.junit.Assert.assertFalse; - /** * Network disruptions are modeled using two components: * 1) the {@link DisruptedLinks} represents the links in the network that are to be disrupted @@ -119,10 +116,7 @@ public static void ensureFullyConnectedCluster(InternalTestCluster cluster) { } protected void ensureNodeCount(InternalTestCluster cluster) { - assertFalse("cluster failed to form after disruption was healed", cluster.client().admin().cluster().prepareHealth() - .setWaitForNodes(String.valueOf(cluster.size())) - .setWaitForNoRelocatingShards(true) - .get().isTimedOut()); + cluster.validateClusterFormed(); } @Override