CA: fix flakiness in actuation.TestStartDeletion
Part of the test verifies that all taint updates happened as expected.
The taints are verified asynchronously, and the test waits for exactly
as many taint updates as defined in the test case. A couple of test
cases were missing some expected updates (clearing the taint if
drain/deletion fails). The test could randomly fail if one of the
missing updates happened to appear before one of the expected updates.
This commit adds the missing expected updates, so all of them should be
accounted for now.

This commit also adds a sync point to wait for all expected node
deletion results before asserting them. Without it, the test would
sometimes move to the assertion before the results were actually
reported.
towca committed Jun 6, 2022
1 parent 82bc463 commit c707c53
Showing 1 changed file with 26 additions and 0 deletions.
cluster-autoscaler/core/scaledown/actuation/actuator_test.go
@@ -538,12 +538,14 @@ func TestStartDeletion(t *testing.T) {
wantTaintUpdates: map[string][][]apiv1.Taint{
"drain-node-0": {
{toBeDeletedTaint},
{},
},
"drain-node-1": {
{toBeDeletedTaint},
},
"drain-node-2": {
{toBeDeletedTaint},
{},
},
"drain-node-3": {
{toBeDeletedTaint},
@@ -623,12 +625,14 @@ func TestStartDeletion(t *testing.T) {
},
"empty-node-1": {
{toBeDeletedTaint},
{},
},
"drain-node-0": {
{toBeDeletedTaint},
},
"drain-node-1": {
{toBeDeletedTaint},
{},
},
},
wantNodeDeleteResults: map[string]status.NodeDeleteResult{
@@ -911,6 +915,15 @@ func TestStartDeletion(t *testing.T) {
t.Errorf("taintUpdates diff (-want +got):\n%s", diff)
}

// Wait for all expected deletions to be reported in NodeDeletionTracker. Reporting happens shortly after the
// cloud provider deletion that we sync to above, so this will usually not wait at all. However, there can still
// be a delay between the cloud provider deletion and the reporting, in which case the results are not there yet
// and we need to wait for them before asserting.
err = waitForDeletionResultsCount(actuator.nodeDeletionTracker, len(tc.wantNodeDeleteResults), 3*time.Second, 200*time.Millisecond)
if err != nil {
t.Errorf("Timeout while waiting for node deletion results")
}

// Run StartDeletion again to gather node deletion results for deletions started in the previous call, and verify
// that they look as expected.
gotNextStatus, gotNextErr := actuator.StartDeletion(nil, nil, time.Now())
@@ -1002,3 +1015,16 @@ func generateUtilInfo(cpuUtil, memUtil float64) utilization.Info {
Utilization: higherUtilVal,
}
}

func waitForDeletionResultsCount(ndt *deletiontracker.NodeDeletionTracker, resultsCount int, timeout, retryTime time.Duration) error {
// This is quite ugly, but shouldn't matter much since in most cases there shouldn't be a need to wait at all, and
// the function should return quickly after the first if check.
// An alternative could be to turn NodeDeletionTracker into an interface, and use an implementation which allows
// synchronizing calls to EndDeletion in the test code.
for retryUntil := time.Now().Add(timeout); time.Now().Before(retryUntil); time.Sleep(retryTime) {
if results, _ := ndt.DeletionResults(); len(results) == resultsCount {
return nil
}
}
return fmt.Errorf("timed out while waiting for node deletion results")
}
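
The comment inside waitForDeletionResultsCount mentions a possible alternative to polling: turning NodeDeletionTracker into an interface and synchronizing on calls to EndDeletion in the test code. A minimal sketch of that idea follows, purely for illustration; the interface, wrapper type, result type, and method signature are hypothetical and do not reflect the actual cluster-autoscaler API.

// Hypothetical sketch, not part of this commit. Names, types, and signatures
// are illustrative only; the real NodeDeletionTracker API may differ.
package actuation // assumed placement alongside actuator_test.go

// nodeDeleteResult stands in for the real result type reported by the tracker.
type nodeDeleteResult struct {
	Err error
}

// deletionResultRecorder is the narrow interface the code under test would depend on.
type deletionResultRecorder interface {
	EndDeletion(nodeGroupID, nodeName string, result nodeDeleteResult)
}

// syncedDeletionTracker wraps a real recorder and signals on a channel every
// time a deletion result is reported, so the test can wait deterministically
// instead of polling. The channel should be buffered (or drained concurrently)
// so that EndDeletion never blocks.
type syncedDeletionTracker struct {
	inner    deletionResultRecorder
	reported chan string
}

func (s *syncedDeletionTracker) EndDeletion(nodeGroupID, nodeName string, result nodeDeleteResult) {
	s.inner.EndDeletion(nodeGroupID, nodeName, result)
	s.reported <- nodeName
}

With such a wrapper, the test could receive from reported once per expected result (guarded by a timeout, e.g. with time.After in a select) instead of repeatedly calling DeletionResults in a loop.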
