From 7407d94cef9652dd6efdca767a5cf63e2afda55d Mon Sep 17 00:00:00 2001
From: Vivek Bagade
Date: Thu, 11 Mar 2021 16:07:00 +0100
Subject: [PATCH] Fix bug where a node that becomes ready after 2 minutes can
 be treated as unready. Deprecate LongNotStarted

In a case where node n1 would:
1) be created at t=0min,
2) have its Ready condition become true at t=2.5min,
3) have its not-ready taint removed at t=3min,
the ready node was counted as unready.

Cases tested after the fix:
1) the case described above,
2) nodes that have not started even after 15 minutes are still treated
   as unready,
3) nodes created long ago that suddenly become unready are counted
   as unready.
---
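Note: for reference, the per-node classification performed by
updateReadinessStats after this patch boils down to the sketch below
(illustrative Go only, not part of the diff that follows; the wrapper
name classifyNode is hypothetical, and `ready` is assumed to come from
the readiness check exercised by the new ready_test.go):

    // classifyNode mirrors the updated branch order in updateReadinessStats:
    // a ready node counts as Ready no matter how long startup took, and an
    // unready node only counts as NotStarted while it is still within
    // MaxNodeStartupTime of its CreationTimestamp; otherwise it is Unready.
    func classifyNode(node *apiv1.Node, ready bool, currentTime time.Time) string {
        switch {
        case deletetaint.HasToBeDeletedTaint(node):
            return "Deleted"
        case ready:
            return "Ready"
        case node.CreationTimestamp.Time.Add(MaxNodeStartupTime).After(currentTime):
            return "NotStarted"
        default:
            return "Unready"
        }
    }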
 .../clusterstate/clusterstate.go              |  59 ++-------
 .../clusterstate/clusterstate_test.go         | 124 +++++++++++-------
 cluster-autoscaler/core/utils/utils.go        |   2 +-
 .../utils/kubernetes/ready_test.go            |  75 +++++++++++
 cluster-autoscaler/utils/test/test_utils.go   |  17 +++
 5 files changed, 174 insertions(+), 103 deletions(-)
 create mode 100644 cluster-autoscaler/utils/kubernetes/ready_test.go

diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go
index 86103e445084..08126c2a9d74 100644
--- a/cluster-autoscaler/clusterstate/clusterstate.go
+++ b/cluster-autoscaler/clusterstate/clusterstate.go
@@ -43,10 +43,6 @@ const (
 	// MaxNodeStartupTime is the maximum time from the moment the node is registered to the time the node is ready.
 	MaxNodeStartupTime = 15 * time.Minute
 
-	// MaxStatusSettingDelayAfterCreation is the maximum time for node to set its initial status after the
-	// node is registered.
-	MaxStatusSettingDelayAfterCreation = 2 * time.Minute
-
 	// MaxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.
 	MaxNodeGroupBackoffDuration = 30 * time.Minute
 
@@ -399,7 +395,7 @@ func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool {
 
 	if unjustifiedUnready > csr.config.OkTotalUnreadyCount &&
 		float64(unjustifiedUnready) > csr.config.MaxTotalUnreadyPercentage/100.0*
-			float64(readiness.Ready+readiness.Unready+readiness.NotStarted+readiness.LongNotStarted) {
+			float64(readiness.Ready+readiness.Unready+readiness.NotStarted) {
 		return false
 	}
 
@@ -452,7 +448,7 @@ func (csr *ClusterStateRegistry) getProvisionedAndTargetSizesForNodeGroup(nodeGr
 		}
 		return 0, target, true
 	}
-	provisioned = readiness.Registered - readiness.NotStarted - readiness.LongNotStarted
+	provisioned = readiness.Registered - readiness.NotStarted
 
 	return provisioned, target, true
 }
@@ -531,8 +527,6 @@ type Readiness struct {
 	// Number of nodes that are being currently deleted. They exist in K8S but
 	// are not included in NodeGroup.TargetSize().
 	Deleted int
-	// Number of nodes that failed to start within a reasonable limit.
-	LongNotStarted int
 	// Number of nodes that are not yet fully started.
 	NotStarted int
 	// Number of all registered nodes in the group (ready/unready/deleted/etc).
@@ -554,12 +548,10 @@ func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
 		current.Registered++
 		if deletetaint.HasToBeDeletedTaint(node) {
 			current.Deleted++
-		} else if stillStarting := isNodeStillStarting(node); stillStarting && node.CreationTimestamp.Time.Add(MaxNodeStartupTime).Before(currentTime) {
-			current.LongNotStarted++
-		} else if stillStarting {
-			current.NotStarted++
 		} else if ready {
 			current.Ready++
+		} else if node.CreationTimestamp.Time.Add(MaxNodeStartupTime).After(currentTime) {
+			current.NotStarted++
 		} else {
 			current.Unready++
 		}
@@ -743,11 +735,10 @@ func (csr *ClusterStateRegistry) GetClusterReadiness() Readiness {
 func buildHealthStatusNodeGroup(isReady bool, readiness Readiness, acceptable AcceptableRange, minSize, maxSize int) api.ClusterAutoscalerCondition {
 	condition := api.ClusterAutoscalerCondition{
 		Type: api.ClusterAutoscalerHealth,
-		Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=%d registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
+		Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
 			readiness.Ready,
 			readiness.Unready,
 			readiness.NotStarted,
-			readiness.LongNotStarted,
 			readiness.Registered,
 			readiness.LongUnregistered,
 			acceptable.CurrentTarget,
@@ -798,11 +789,10 @@ func buildScaleDownStatusNodeGroup(candidates []string, lastProbed time.Time) ap
 func buildHealthStatusClusterwide(isReady bool, readiness Readiness) api.ClusterAutoscalerCondition {
 	condition := api.ClusterAutoscalerCondition{
 		Type: api.ClusterAutoscalerHealth,
-		Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=%d registered=%d longUnregistered=%d",
+		Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d",
 			readiness.Ready,
 			readiness.Unready,
 			readiness.NotStarted,
-			readiness.LongNotStarted,
 			readiness.Registered,
 			readiness.LongUnregistered,
 		),
@@ -860,39 +850,6 @@ func buildScaleDownStatusClusterwide(candidates map[string][]string, lastProbed
 	return condition
 }
 
-func hasTaint(node *apiv1.Node, taintKey string) bool {
-	for _, taint := range node.Spec.Taints {
-		if taint.Key == taintKey {
-			return true
-		}
-	}
-	return false
-}
-
-func isNodeStillStarting(node *apiv1.Node) bool {
-	for _, condition := range node.Status.Conditions {
-		if condition.Type == apiv1.NodeReady {
-			notReady := condition.Status != apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNotReady)
-			if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
-				return true
-			}
-		}
-		if condition.Type == apiv1.NodeDiskPressure {
-			notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeDiskPressure)
-			if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
-				return true
-			}
-		}
-		if condition.Type == apiv1.NodeNetworkUnavailable {
-			notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNetworkUnavailable)
-			if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
-				return true
-			}
-		}
-	}
-	return false
-}
-
 func updateLastTransition(oldStatus, newStatus *api.ClusterAutoscalerStatus) {
 	newStatus.ClusterwideConditions = updateLastTransitionSingleList(
 		oldStatus.ClusterwideConditions, newStatus.ClusterwideConditions)
@@ -955,7 +912,7 @@ func (csr *ClusterStateRegistry) GetUpcomingNodes() map[string]int {
 		readiness := csr.perNodeGroupReadiness[id]
 		ar := csr.acceptableRanges[id]
 		// newNodes is the number of nodes that
-		newNodes := ar.CurrentTarget - (readiness.Ready + readiness.Unready + readiness.LongNotStarted + readiness.LongUnregistered)
+		newNodes := ar.CurrentTarget - (readiness.Ready + readiness.Unready + readiness.LongUnregistered)
 		if newNodes <= 0 {
 			// Negative value is unlikely but theoretically possible.
 			continue
@@ -1006,7 +963,7 @@ func (csr *ClusterStateRegistry) GetAutoscaledNodesCount() (currentSize, targetS
 		targetSize += accRange.CurrentTarget
 	}
 	for _, readiness := range csr.perNodeGroupReadiness {
-		currentSize += readiness.Registered - readiness.NotStarted - readiness.LongNotStarted
+		currentSize += readiness.Registered - readiness.NotStarted
 	}
 	return currentSize, targetSize
 }
diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go
index 02cc844d0ec9..1006654b2007 100644
--- a/cluster-autoscaler/clusterstate/clusterstate_test.go
+++ b/cluster-autoscaler/clusterstate/clusterstate_test.go
@@ -319,6 +319,79 @@ func TestTooManyUnready(t *testing.T) {
 	assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
 }
 
+func TestUnreadyLongAfterCreation(t *testing.T) {
+	now := time.Now()
+
+	ng1_1 := BuildTestNode("ng1-1", 1000, 1000)
+	SetNodeReadyState(ng1_1, true, now.Add(-time.Minute))
+	ng2_1 := BuildTestNode("ng2-1", 1000, 1000)
+	SetNodeReadyState(ng2_1, false, now.Add(-time.Minute))
+	ng2_1.CreationTimestamp = metav1.Time{Time: now.Add(-30 * time.Minute)}
+
+	provider := testprovider.NewTestCloudProvider(nil, nil)
+	provider.AddNodeGroup("ng1", 1, 10, 1)
+	provider.AddNodeGroup("ng2", 1, 10, 1)
+	provider.AddNode("ng1", ng1_1)
+	provider.AddNode("ng2", ng2_1)
+
+	assert.NotNil(t, provider)
+	fakeClient := &fake.Clientset{}
+	fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "some-map")
+	clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{
+		MaxTotalUnreadyPercentage: 10,
+		OkTotalUnreadyCount:       1,
+	}, fakeLogRecorder, newBackoff())
+	err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
+	assert.NoError(t, err)
+	assert.Equal(t, 1, clusterstate.GetClusterReadiness().Unready)
+	assert.Equal(t, 0, clusterstate.GetClusterReadiness().NotStarted)
+	upcoming := clusterstate.GetUpcomingNodes()
+	assert.Equal(t, 0, upcoming["ng1"])
+}
+
+func TestNotStarted(t *testing.T) {
+	now := time.Now()
+
+	ng1_1 := BuildTestNode("ng1-1", 1000, 1000)
+	SetNodeReadyState(ng1_1, true, now.Add(-time.Minute))
+	ng2_1 := BuildTestNode("ng2-1", 1000, 1000)
+	SetNodeReadyState(ng2_1, false, now.Add(-4*time.Minute))
+	SetNodeNotReadyTaint(ng2_1)
+	ng2_1.CreationTimestamp = metav1.Time{Time: now.Add(-10 * time.Minute)}
+
+	provider := testprovider.NewTestCloudProvider(nil, nil)
+	provider.AddNodeGroup("ng1", 1, 10, 1)
+	provider.AddNodeGroup("ng2", 1, 10, 1)
+	provider.AddNode("ng1", ng1_1)
+	provider.AddNode("ng2", ng2_1)
+
+	assert.NotNil(t, provider)
+	fakeClient := &fake.Clientset{}
+	fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "some-map")
+	clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{
+		MaxTotalUnreadyPercentage: 10,
+		OkTotalUnreadyCount:       1,
+	}, fakeLogRecorder, newBackoff())
+	err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
+	assert.NoError(t, err)
+	assert.Equal(t, 1, clusterstate.GetClusterReadiness().NotStarted)
+	assert.Equal(t, 1, clusterstate.GetClusterReadiness().Ready)
+
+	// node ng2_1 moves condition to ready
+	SetNodeReadyState(ng2_1, true, now.Add(-4*time.Minute))
+	err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
+	assert.NoError(t, err)
+	assert.Equal(t, 1, clusterstate.GetClusterReadiness().NotStarted)
+	assert.Equal(t, 1, clusterstate.GetClusterReadiness().Ready)
+
+	// node ng2_1 no longer has the taint
+	RemoveNodeNotReadyTaint(ng2_1)
+	err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
+	assert.NoError(t, err)
+	assert.Equal(t, 0, clusterstate.GetClusterReadiness().NotStarted)
+	assert.Equal(t, 2, clusterstate.GetClusterReadiness().Ready)
+}
+
 func TestExpiredScaleUp(t *testing.T) {
 	now := time.Now()
 
@@ -768,57 +841,6 @@ func TestUpdateScaleUp(t *testing.T) {
 	assert.Nil(t, clusterstate.scaleUpRequests["ng1"])
 }
 
-func TestIsNodeStillStarting(t *testing.T) {
-	testCases := []struct {
-		desc           string
-		condition      apiv1.NodeConditionType
-		status         apiv1.ConditionStatus
-		taintKey       string
-		expectedResult bool
-	}{
-		{"unready", apiv1.NodeReady, apiv1.ConditionFalse, "", true},
-		{"readiness unknown", apiv1.NodeReady, apiv1.ConditionUnknown, "", true},
-		{"out of disk", apiv1.NodeDiskPressure, apiv1.ConditionTrue, "", true},
-		{"network unavailable", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, "", true},
-		{"started", apiv1.NodeReady, apiv1.ConditionTrue, "", false},
-		{"unready and unready taint", apiv1.NodeReady, apiv1.ConditionFalse, apiv1.TaintNodeNotReady, true},
-		{"readiness unknown and unready taint", apiv1.NodeReady, apiv1.ConditionUnknown, apiv1.TaintNodeNotReady, true},
-		{"disk pressure and disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionTrue, apiv1.TaintNodeDiskPressure, true},
-		{"network unavailable and network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, apiv1.TaintNodeNetworkUnavailable, true},
-		{"ready but unready taint", apiv1.NodeReady, apiv1.ConditionTrue, apiv1.TaintNodeNotReady, true},
-		{"no disk pressure but disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionFalse, apiv1.TaintNodeDiskPressure, true},
-		{"network available but network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionFalse, apiv1.TaintNodeNetworkUnavailable, true},
-	}
-	for _, tc := range testCases {
-		createTestNode := func(timeSinceCreation time.Duration) *apiv1.Node {
-			node := BuildTestNode("n1", 1000, 1000)
-			node.CreationTimestamp.Time = time.Time{}
-			testedTime := node.CreationTimestamp.Time.Add(timeSinceCreation)
-
-			SetNodeCondition(node, tc.condition, tc.status, testedTime)
-
-			if tc.taintKey != "" {
-				node.Spec.Taints = []apiv1.Taint{{
-					Key:       tc.taintKey,
-					Effect:    apiv1.TaintEffectNoSchedule,
-					TimeAdded: &metav1.Time{Time: testedTime},
-				}}
-			}
-
-			return node
-		}
-		t.Run("recent "+tc.desc, func(t *testing.T) {
-			node := createTestNode(1 * time.Minute)
-			assert.Equal(t, tc.expectedResult, isNodeStillStarting(node))
-		})
-		t.Run("long "+tc.desc, func(t *testing.T) {
-			node := createTestNode(30 * time.Minute)
-			// No matter what are the node's conditions, stop considering it not started after long enough.
-			assert.False(t, isNodeStillStarting(node))
-		})
-	}
-}
-
 func TestScaleUpFailures(t *testing.T) {
 	now := time.Now()
 
diff --git a/cluster-autoscaler/core/utils/utils.go b/cluster-autoscaler/core/utils/utils.go
index 6fd8354c23c4..c60b050c2c4d 100644
--- a/cluster-autoscaler/core/utils/utils.go
+++ b/cluster-autoscaler/core/utils/utils.go
@@ -312,7 +312,7 @@ func UpdateClusterStateMetrics(csr *clusterstate.ClusterStateRegistry) {
 	}
 	metrics.UpdateClusterSafeToAutoscale(csr.IsClusterHealthy())
 	readiness := csr.GetClusterReadiness()
-	metrics.UpdateNodesCount(readiness.Ready, readiness.Unready+readiness.LongNotStarted, readiness.NotStarted, readiness.LongUnregistered, readiness.Unregistered)
+	metrics.UpdateNodesCount(readiness.Ready, readiness.Unready, readiness.NotStarted, readiness.LongUnregistered, readiness.Unregistered)
 }
 
 // GetOldestCreateTime returns oldest creation time out of the pods in the set
diff --git a/cluster-autoscaler/utils/kubernetes/ready_test.go b/cluster-autoscaler/utils/kubernetes/ready_test.go
new file mode 100644
index 000000000000..bbb1157b125f
--- /dev/null
+++ b/cluster-autoscaler/utils/kubernetes/ready_test.go
@@ -0,0 +1,75 @@
+/*
+Copyright 2021 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package kubernetes
+
+import (
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	apiv1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
"k8s.io/autoscaler/cluster-autoscaler/utils/test" +) + +func TestGetReadiness(t *testing.T) { + testCases := []struct { + desc string + condition apiv1.NodeConditionType + status apiv1.ConditionStatus + taintKey string + expectedResult bool + }{ + {"ready", apiv1.NodeReady, apiv1.ConditionTrue, "", true}, + {"unready and unready taint", apiv1.NodeReady, apiv1.ConditionFalse, apiv1.TaintNodeNotReady, false}, + {"readiness unknown and unready taint", apiv1.NodeReady, apiv1.ConditionUnknown, apiv1.TaintNodeNotReady, false}, + {"disk pressure and disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionTrue, apiv1.TaintNodeDiskPressure, false}, + {"network unavailable and network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, apiv1.TaintNodeNetworkUnavailable, false}, + {"ready but unready taint", apiv1.NodeReady, apiv1.ConditionTrue, apiv1.TaintNodeNotReady, false}, + {"no disk pressure but disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionFalse, apiv1.TaintNodeDiskPressure, false}, + {"network available but network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionFalse, apiv1.TaintNodeNetworkUnavailable, false}, + } + for _, tc := range testCases { + createTestNode := func(timeSinceCreation time.Duration) *apiv1.Node { + node := BuildTestNode("n1", 1000, 1000) + node.CreationTimestamp.Time = time.Time{} + testedTime := node.CreationTimestamp.Time.Add(timeSinceCreation) + + SetNodeCondition(node, tc.condition, tc.status, testedTime) + if tc.condition != apiv1.NodeReady { + SetNodeCondition(node, apiv1.NodeReady, apiv1.ConditionTrue, testedTime) + } + + if tc.taintKey != "" { + node.Spec.Taints = []apiv1.Taint{{ + Key: tc.taintKey, + Effect: apiv1.TaintEffectNoSchedule, + TimeAdded: &metav1.Time{Time: testedTime}, + }} + } + + return node + } + t.Run(tc.desc, func(t *testing.T) { + node := createTestNode(1 * time.Minute) + isReady, _, err := GetReadinessState(node) + assert.NoError(t, err) + assert.Equal(t, tc.expectedResult, isReady) + }) + } +} diff --git a/cluster-autoscaler/utils/test/test_utils.go b/cluster-autoscaler/utils/test/test_utils.go index 8f4e6cef4e0a..b161bf23ef24 100644 --- a/cluster-autoscaler/utils/test/test_utils.go +++ b/cluster-autoscaler/utils/test/test_utils.go @@ -162,6 +162,23 @@ func SetNodeReadyState(node *apiv1.Node, ready bool, lastTransition time.Time) { } } +// SetNodeNotReadyTaint sets the not ready taint on node. +func SetNodeNotReadyTaint(node *apiv1.Node) { + node.Spec.Taints = append(node.Spec.Taints, apiv1.Taint{Key: apiv1.TaintNodeNotReady, Effect: apiv1.TaintEffectNoSchedule}) +} + +// RemoveNodeNotReadyTaint removes the not ready taint. +func RemoveNodeNotReadyTaint(node *apiv1.Node) { + var final []apiv1.Taint + for i := range node.Spec.Taints { + if node.Spec.Taints[i].Key == apiv1.TaintNodeNotReady { + continue + } + final = append(final, node.Spec.Taints[i]) + } + node.Spec.Taints = final +} + // SetNodeCondition sets node condition. func SetNodeCondition(node *apiv1.Node, conditionType apiv1.NodeConditionType, status apiv1.ConditionStatus, lastTransition time.Time) { for i := range node.Status.Conditions {