From a0fb86beeda3e38b3333e3fc87d04d2076a8f825 Mon Sep 17 00:00:00 2001 From: Andrei Dan Date: Thu, 14 Nov 2024 17:53:19 +0200 Subject: [PATCH] Fix testSearchAndRelocateConcurrently (#116806) This aims to test we can search through replica shard relocations. However, the way the test was written it was sometimes also starting another data node. The concurrent search requests would sometimes hit this new node, before its cluster state was RECOVERED. The search action throws exception when the cluster state is not recovered as it needs to be able to read the cluster state. This fixes the test to grab a coy of the bootstrapped nodes and use them when calling the _search API before the cluster (potentially) resizes. --- muted-tests.yml | 3 - .../search/basic/SearchWhileRelocatingIT.java | 57 ++++++++++--------- 2 files changed, 31 insertions(+), 29 deletions(-) diff --git a/muted-tests.yml b/muted-tests.yml index bc8585cfddc7..a9dda9ac8078 100644 --- a/muted-tests.yml +++ b/muted-tests.yml @@ -171,9 +171,6 @@ tests: - class: org.elasticsearch.search.basic.SearchWithRandomDisconnectsIT method: testSearchWithRandomDisconnects issue: https://github.com/elastic/elasticsearch/issues/116175 -- class: org.elasticsearch.search.basic.SearchWhileRelocatingIT - method: testSearchAndRelocateConcurrentlyRandomReplicas - issue: https://github.com/elastic/elasticsearch/issues/116145 - class: org.elasticsearch.xpack.deprecation.DeprecationHttpIT method: testDeprecatedSettingsReturnWarnings issue: https://github.com/elastic/elasticsearch/issues/108628 diff --git a/server/src/internalClusterTest/java/org/elasticsearch/search/basic/SearchWhileRelocatingIT.java b/server/src/internalClusterTest/java/org/elasticsearch/search/basic/SearchWhileRelocatingIT.java index 0d06856ca108..4799b4bec0c8 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/search/basic/SearchWhileRelocatingIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/search/basic/SearchWhileRelocatingIT.java @@ -64,6 +64,8 @@ private void testSearchAndRelocateConcurrently(final int numberOfReplicas) throw } indexRandom(true, indexBuilders.toArray(new IndexRequestBuilder[indexBuilders.size()])); assertHitCount(prepareSearch(), (numDocs)); + // hold a copy of the node names before a new node is potentially added later + String[] nodeNamesBeforeClusterResize = internalCluster().getNodeNames(); final int numIters = scaledRandomIntBetween(5, 20); for (int i = 0; i < numIters; i++) { final AtomicBoolean stop = new AtomicBoolean(false); @@ -76,34 +78,37 @@ private void testSearchAndRelocateConcurrently(final int numberOfReplicas) throw public void run() { try { while (stop.get() == false) { - assertResponse(prepareSearch().setSize(numDocs), response -> { - if (response.getHits().getTotalHits().value() != numDocs) { - // if we did not search all shards but had no serious failures that is potentially fine - // if only the hit-count is wrong. this can happen if the cluster-state is behind when the - // request comes in. It's a small window but a known limitation. - if (response.getTotalShards() != response.getSuccessfulShards() - && Stream.of(response.getShardFailures()) - .allMatch(ssf -> ssf.getCause() instanceof NoShardAvailableActionException)) { - nonCriticalExceptions.add( - "Count is " - + response.getHits().getTotalHits().value() - + " but " - + numDocs - + " was expected. " - + formatShardStatus(response) - ); - } else { - assertHitCount(response, numDocs); + assertResponse( + client(randomFrom(nodeNamesBeforeClusterResize)).prepareSearch().setSize(numDocs), + response -> { + if (response.getHits().getTotalHits().value() != numDocs) { + // if we did not search all shards but had no serious failures that is potentially fine + // if only the hit-count is wrong. this can happen if the cluster-state is behind when the + // request comes in. It's a small window but a known limitation. + if (response.getTotalShards() != response.getSuccessfulShards() + && Stream.of(response.getShardFailures()) + .allMatch(ssf -> ssf.getCause() instanceof NoShardAvailableActionException)) { + nonCriticalExceptions.add( + "Count is " + + response.getHits().getTotalHits().value() + + " but " + + numDocs + + " was expected. " + + formatShardStatus(response) + ); + } else { + assertHitCount(response, numDocs); + } } - } - final SearchHits sh = response.getHits(); - assertThat( - "Expected hits to be the same size the actual hits array", - sh.getTotalHits().value(), - equalTo((long) (sh.getHits().length)) - ); - }); + final SearchHits sh = response.getHits(); + assertThat( + "Expected hits to be the same size the actual hits array", + sh.getTotalHits().value(), + equalTo((long) (sh.getHits().length)) + ); + } + ); // this is the more critical but that we hit the actual hit array has a different size than the // actual number of hits. }