From c707c53160d772b8c0a9f71ca7c19ecd6ab8fec9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kuba=20Tu=C5=BCnik?=
Date: Mon, 6 Jun 2022 16:24:52 +0200
Subject: [PATCH] CA: fix flakiness in actuation.TestStartDeletion

Part of the test verifies whether all taint updates happened as expected.
The taints are verified asynchronously, and the test waits for exactly as
many taint updates as defined in the test case. A couple of test cases
were missing some expected updates (clearing the taint if drain/deletion
fails). The test could randomly fail if one of the missing updates
happened to appear before one of the expected updates. This commit adds
the missing expected updates; all should be accounted for now.

This commit also adds a sync point to wait for all expected node deletion
results before asserting them. Without it, the test would sometimes move
to the assertion before the results were actually reported.
---
 .../core/scaledown/actuation/actuator_test.go | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go
index 587ce02b9535..1c7b7bc0f50f 100644
--- a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go
+++ b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go
@@ -538,12 +538,14 @@ func TestStartDeletion(t *testing.T) {
 			wantTaintUpdates: map[string][][]apiv1.Taint{
 				"drain-node-0": {
 					{toBeDeletedTaint},
+					{},
 				},
 				"drain-node-1": {
 					{toBeDeletedTaint},
 				},
 				"drain-node-2": {
 					{toBeDeletedTaint},
+					{},
 				},
 				"drain-node-3": {
 					{toBeDeletedTaint},
@@ -623,12 +625,14 @@ func TestStartDeletion(t *testing.T) {
 				},
 				"empty-node-1": {
 					{toBeDeletedTaint},
+					{},
 				},
 				"drain-node-0": {
 					{toBeDeletedTaint},
 				},
 				"drain-node-1": {
 					{toBeDeletedTaint},
+					{},
 				},
 			},
 			wantNodeDeleteResults: map[string]status.NodeDeleteResult{
@@ -911,6 +915,15 @@ func TestStartDeletion(t *testing.T) {
 				t.Errorf("taintUpdates diff (-want +got):\n%s", diff)
 			}
 
+			// Wait for all expected deletions to be reported in NodeDeletionTracker. Reporting happens shortly after
+			// the deletion in the cloud provider that we sync to above, so this will usually not wait at all. However,
+			// there can still be a delay between cloud provider deletion and reporting, in which case the results are
+			// not there yet and we need to wait for them before asserting.
+			err = waitForDeletionResultsCount(actuator.nodeDeletionTracker, len(tc.wantNodeDeleteResults), 3*time.Second, 200*time.Millisecond)
+			if err != nil {
+				t.Errorf("Timeout while waiting for node deletion results")
+			}
+
 			// Run StartDeletion again to gather node deletion results for deletions started in the previous call, and verify
 			// that they look as expected.
 			gotNextStatus, gotNextErr := actuator.StartDeletion(nil, nil, time.Now())
@@ -1002,3 +1015,16 @@ func generateUtilInfo(cpuUtil, memUtil float64) utilization.Info {
 		Utilization: higherUtilVal,
 	}
 }
+
+func waitForDeletionResultsCount(ndt *deletiontracker.NodeDeletionTracker, resultsCount int, timeout, retryTime time.Duration) error {
+	// This is quite ugly, but shouldn't matter much since in most cases there shouldn't be a need to wait at all, and
+	// the function should return quickly after the first if check.
+	// An alternative could be to turn NodeDeletionTracker into an interface, and use an implementation which allows
+	// synchronizing calls to EndDeletion in the test code.
+	for retryUntil := time.Now().Add(timeout); time.Now().Before(retryUntil); time.Sleep(retryTime) {
+		if results, _ := ndt.DeletionResults(); len(results) == resultsCount {
+			return nil
+		}
+	}
+	return fmt.Errorf("timed out while waiting for node deletion results")
+}
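
The in-code comment above suggests an alternative to polling: turn NodeDeletionTracker into an interface and let the test synchronize on EndDeletion calls directly. Below is a minimal sketch of that idea; the interface, method signatures and types are hypothetical stand-ins chosen for illustration, not the real cluster-autoscaler API.

package actuation_sketch

import "sync"

// deletionTracker is a hypothetical, trimmed-down view of the tracker; the real
// NodeDeletionTracker exposes a different, richer set of methods.
type deletionTracker interface {
	EndDeletion(nodeGroupID, nodeName string, err error)
}

// syncTracker wraps a tracker and counts EndDeletion calls, so a test can block
// until a given number of deletions have been reported instead of polling.
type syncTracker struct {
	inner    deletionTracker
	mu       sync.Mutex
	cond     *sync.Cond
	reported int
}

func newSyncTracker(inner deletionTracker) *syncTracker {
	s := &syncTracker{inner: inner}
	s.cond = sync.NewCond(&s.mu)
	return s
}

// EndDeletion forwards the call to the wrapped tracker and wakes up any goroutine
// blocked in WaitForResults.
func (s *syncTracker) EndDeletion(nodeGroupID, nodeName string, err error) {
	s.inner.EndDeletion(nodeGroupID, nodeName, err)
	s.mu.Lock()
	s.reported++
	s.mu.Unlock()
	s.cond.Broadcast()
}

// WaitForResults blocks until at least n deletion results have been reported.
func (s *syncTracker) WaitForResults(n int) {
	s.mu.Lock()
	defer s.mu.Unlock()
	for s.reported < n {
		s.cond.Wait()
	}
}

A test using such a wrapper would still want to bound the wait (for example by running WaitForResults in a goroutine and selecting on a timeout), otherwise a missing EndDeletion call would hang the test instead of failing it.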