CA: fix flakiness in actuation.TestStartDeletion
Part of the test verifies that all taint updates happened as expected.
The taints are verified asynchronously, and the test waits for exactly
as many taint updates as defined in the test case. A couple of test
cases were missing some expected updates (clearing the taint if
drain/deletion fails). The test could randomly fail if one of the
missing updates happened to appear before one of the expected updates.
This commit adds the missing expected updates, so all of them should be
accounted for now.

This commit also adds a sync point to wait for all expected node
deletion results before asserting them. Without it, the test would
sometimes move to the assertion before the results were actually
reported.
towca committed Jun 6, 2022
1 parent 82bc463 commit c707c53
Showing 1 changed file with 26 additions and 0 deletions.
cluster-autoscaler/core/scaledown/actuation/actuator_test.go
@@ -538,12 +538,14 @@ func TestStartDeletion(t *testing.T) {
wantTaintUpdates: map[string][][]apiv1.Taint{
"drain-node-0": {
{toBeDeletedTaint},
{},
},
"drain-node-1": {
{toBeDeletedTaint},
},
"drain-node-2": {
{toBeDeletedTaint},
{},
},
"drain-node-3": {
{toBeDeletedTaint},
@@ -623,12 +625,14 @@ func TestStartDeletion(t *testing.T) {
},
"empty-node-1": {
{toBeDeletedTaint},
{},
},
"drain-node-0": {
{toBeDeletedTaint},
},
"drain-node-1": {
{toBeDeletedTaint},
{},
},
},
wantNodeDeleteResults: map[string]status.NodeDeleteResult{
@@ -911,6 +915,15 @@ func TestStartDeletion(t *testing.T) {
t.Errorf("taintUpdates diff (-want +got):\n%s", diff)
}

// Wait for all expected deletions to be reported in NodeDeletionTracker. Reporting happens shortly after the
// cloud provider deletion that we sync to above, so this will usually not wait at all. However, there can still
// be a delay between the cloud provider deletion and the reporting, in which case the results are not there yet
// and we need to wait for them before asserting.
err = waitForDeletionResultsCount(actuator.nodeDeletionTracker, len(tc.wantNodeDeleteResults), 3*time.Second, 200*time.Millisecond)
if err != nil {
t.Errorf("Timeout while waiting for node deletion results")
}

// Run StartDeletion again to gather node deletion results for deletions started in the previous call, and verify
// that they look as expected.
gotNextStatus, gotNextErr := actuator.StartDeletion(nil, nil, time.Now())
@@ -1002,3 +1015,16 @@ func generateUtilInfo(cpuUtil, memUtil float64) utilization.Info {
Utilization: higherUtilVal,
}
}

func waitForDeletionResultsCount(ndt *deletiontracker.NodeDeletionTracker, resultsCount int, timeout, retryTime time.Duration) error {
// This is quite ugly, but shouldn't matter much since in most cases there shouldn't be a need to wait at all, and
// the function should return quickly after the first if check.
// An alternative could be to turn NodeDeletionTracker into an interface, and use an implementation which allows
// synchronizing calls to EndDeletion in the test code.
for retryUntil := time.Now().Add(timeout); time.Now().Before(retryUntil); time.Sleep(retryTime) {
if results, _ := ndt.DeletionResults(); len(results) == resultsCount {
return nil
}
}
return fmt.Errorf("timed out while waiting for node deletion results")
}
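
The comment inside waitForDeletionResultsCount mentions a possible alternative to polling: turning NodeDeletionTracker into an interface and synchronizing on calls to EndDeletion in the test code. A minimal sketch of that idea follows, purely for illustration; the interface, wrapper type, result type, and method signature are hypothetical and do not reflect the actual cluster-autoscaler API.

// Hypothetical sketch, not part of this commit. Names, types, and signatures
// are illustrative only; the real NodeDeletionTracker API may differ.
package actuation // assumed placement alongside actuator_test.go

// nodeDeleteResult stands in for the real result type reported by the tracker.
type nodeDeleteResult struct {
	Err error
}

// deletionResultRecorder is the narrow interface the code under test would depend on.
type deletionResultRecorder interface {
	EndDeletion(nodeGroupID, nodeName string, result nodeDeleteResult)
}

// syncedDeletionTracker wraps a real recorder and signals on a channel every
// time a deletion result is reported, so the test can wait deterministically
// instead of polling. The channel should be buffered (or drained concurrently)
// so that EndDeletion never blocks.
type syncedDeletionTracker struct {
	inner    deletionResultRecorder
	reported chan string
}

func (s *syncedDeletionTracker) EndDeletion(nodeGroupID, nodeName string, result nodeDeleteResult) {
	s.inner.EndDeletion(nodeGroupID, nodeName, result)
	s.reported <- nodeName
}

With such a wrapper, the test could receive from reported once per expected result (guarded by a timeout, e.g. with time.After in a select) instead of repeatedly calling DeletionResults in a loop.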
