Skip to content

Commit

Permalink
Fix flaky DecommissionControllerTests.testTimesOut (#4683) (#4688) (#…
Browse files Browse the repository at this point in the history
…5206)

This test fails pretty reliably if I run it on repeat. I believe the
problem is that the test assumes the function will take longer than 2ms,
which is likely not a valid assumption in all cases. Fortunately, I can
pass in a zero duration which is guaranteed to timeout even if the
system clock does not advance at all.

Also moved the assertions out of the callback into the main test method,
otherwise the assertion error messages would get buried and the test
report would just show a timeout error.

Signed-off-by: Andrew Ross <[email protected]>
  • Loading branch information
andrross authored Nov 10, 2022
1 parent 506d6e3 commit 737536e
Showing 1 changed file with 11 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

package org.opensearch.cluster.decommission;

import org.hamcrest.MatcherAssert;
import org.junit.After;
import org.junit.Before;
import org.opensearch.OpenSearchTimeoutException;
Expand Down Expand Up @@ -45,6 +46,7 @@
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

Expand All @@ -53,6 +55,7 @@
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.empty;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.sameInstance;
import static org.opensearch.cluster.ClusterState.builder;
import static org.opensearch.cluster.OpenSearchAllocationTestCase.createAllocationService;
Expand Down Expand Up @@ -213,25 +216,28 @@ public void testTimesOut() throws InterruptedException {
nodesToBeRemoved.add(clusterService.state().nodes().get("node13"));
nodesToBeRemoved.add(clusterService.state().nodes().get("node14"));
nodesToBeRemoved.add(clusterService.state().nodes().get("node15"));
final AtomicReference<Exception> exceptionReference = new AtomicReference<>();
decommissionController.removeDecommissionedNodes(
nodesToBeRemoved,
"unit-test-timeout",
TimeValue.timeValueMillis(2),
new ActionListener<Void>() {
TimeValue.timeValueMillis(0),
new ActionListener<>() {
@Override
public void onResponse(Void unused) {
fail("response shouldn't have been called");
countDownLatch.countDown();
}

@Override
public void onFailure(Exception e) {
assertThat(e, instanceOf(OpenSearchTimeoutException.class));
assertThat(e.getMessage(), containsString("waiting for removal of decommissioned nodes"));
exceptionReference.set(e);
countDownLatch.countDown();
}
}
);
assertTrue(countDownLatch.await(30, TimeUnit.SECONDS));
MatcherAssert.assertThat("Expected onFailure to be called", exceptionReference.get(), notNullValue());
MatcherAssert.assertThat(exceptionReference.get(), instanceOf(OpenSearchTimeoutException.class));
MatcherAssert.assertThat(exceptionReference.get().getMessage(), containsString("waiting for removal of decommissioned nodes"));
}

public void testSuccessfulDecommissionStatusMetadataUpdate() throws InterruptedException {
Expand Down

0 comments on commit 737536e

Please sign in to comment.