From 8c592f0c04d954c475228599884049d31a79f5ba Mon Sep 17 00:00:00 2001 From: Vivek Bagade Date: Thu, 11 Mar 2021 16:07:00 +0100 Subject: [PATCH] Fix bug where a node that becomes ready after 2 mins can be treated as unready. Deprecated LongNotStarted In cases where node n1 would: 1) Be created at t=0min 2) Ready condition is true at t=2.5min 3) Not ready taint is removed at t=3min the ready node is counted as unready Tested cases after fix: 1) Case described above 2) Nodes not starting even after 15mins still treated as unready 3) Nodes created long ago that suddenly become unready are counted as unready. --- .../clusterstate/clusterstate.go | 59 ++------- .../clusterstate/clusterstate_test.go | 124 +++++++++++------- cluster-autoscaler/core/utils/utils.go | 2 +- .../utils/kubernetes/ready_test.go | 75 +++++++++++ cluster-autoscaler/utils/test/test_utils.go | 17 +++ 5 files changed, 174 insertions(+), 103 deletions(-) create mode 100644 cluster-autoscaler/utils/kubernetes/ready_test.go diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index 86103e445084..08126c2a9d74 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -43,10 +43,6 @@ const ( // MaxNodeStartupTime is the maximum time from the moment the node is registered to the time the node is ready. MaxNodeStartupTime = 15 * time.Minute - // MaxStatusSettingDelayAfterCreation is the maximum time for node to set its initial status after the - // node is registered. - MaxStatusSettingDelayAfterCreation = 2 * time.Minute - // MaxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start. MaxNodeGroupBackoffDuration = 30 * time.Minute @@ -399,7 +395,7 @@ func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool { if unjustifiedUnready > csr.config.OkTotalUnreadyCount && float64(unjustifiedUnready) > csr.config.MaxTotalUnreadyPercentage/100.0* - float64(readiness.Ready+readiness.Unready+readiness.NotStarted+readiness.LongNotStarted) { + float64(readiness.Ready+readiness.Unready+readiness.NotStarted) { return false } @@ -452,7 +448,7 @@ func (csr *ClusterStateRegistry) getProvisionedAndTargetSizesForNodeGroup(nodeGr } return 0, target, true } - provisioned = readiness.Registered - readiness.NotStarted - readiness.LongNotStarted + provisioned = readiness.Registered - readiness.NotStarted return provisioned, target, true } @@ -531,8 +527,6 @@ type Readiness struct { // Number of nodes that are being currently deleted. They exist in K8S but // are not included in NodeGroup.TargetSize(). Deleted int - // Number of nodes that failed to start within a reasonable limit. - LongNotStarted int // Number of nodes that are not yet fully started. NotStarted int // Number of all registered nodes in the group (ready/unready/deleted/etc). @@ -554,12 +548,10 @@ func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) { current.Registered++ if deletetaint.HasToBeDeletedTaint(node) { current.Deleted++ - } else if stillStarting := isNodeStillStarting(node); stillStarting && node.CreationTimestamp.Time.Add(MaxNodeStartupTime).Before(currentTime) { - current.LongNotStarted++ - } else if stillStarting { - current.NotStarted++ } else if ready { current.Ready++ + } else if node.CreationTimestamp.Time.Add(MaxNodeStartupTime).After(currentTime) { + current.NotStarted++ } else { current.Unready++ } @@ -743,11 +735,10 @@ func (csr *ClusterStateRegistry) GetClusterReadiness() Readiness { func buildHealthStatusNodeGroup(isReady bool, readiness Readiness, acceptable AcceptableRange, minSize, maxSize int) api.ClusterAutoscalerCondition { condition := api.ClusterAutoscalerCondition{ Type: api.ClusterAutoscalerHealth, - Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=%d registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)", + Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)", readiness.Ready, readiness.Unready, readiness.NotStarted, - readiness.LongNotStarted, readiness.Registered, readiness.LongUnregistered, acceptable.CurrentTarget, @@ -798,11 +789,10 @@ func buildScaleDownStatusNodeGroup(candidates []string, lastProbed time.Time) ap func buildHealthStatusClusterwide(isReady bool, readiness Readiness) api.ClusterAutoscalerCondition { condition := api.ClusterAutoscalerCondition{ Type: api.ClusterAutoscalerHealth, - Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=%d registered=%d longUnregistered=%d", + Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d", readiness.Ready, readiness.Unready, readiness.NotStarted, - readiness.LongNotStarted, readiness.Registered, readiness.LongUnregistered, ), @@ -860,39 +850,6 @@ func buildScaleDownStatusClusterwide(candidates map[string][]string, lastProbed return condition } -func hasTaint(node *apiv1.Node, taintKey string) bool { - for _, taint := range node.Spec.Taints { - if taint.Key == taintKey { - return true - } - } - return false -} - -func isNodeStillStarting(node *apiv1.Node) bool { - for _, condition := range node.Status.Conditions { - if condition.Type == apiv1.NodeReady { - notReady := condition.Status != apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNotReady) - if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation { - return true - } - } - if condition.Type == apiv1.NodeDiskPressure { - notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeDiskPressure) - if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation { - return true - } - } - if condition.Type == apiv1.NodeNetworkUnavailable { - notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNetworkUnavailable) - if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation { - return true - } - } - } - return false -} - func updateLastTransition(oldStatus, newStatus *api.ClusterAutoscalerStatus) { newStatus.ClusterwideConditions = updateLastTransitionSingleList( oldStatus.ClusterwideConditions, newStatus.ClusterwideConditions) @@ -955,7 +912,7 @@ func (csr *ClusterStateRegistry) GetUpcomingNodes() map[string]int { readiness := csr.perNodeGroupReadiness[id] ar := csr.acceptableRanges[id] // newNodes is the number of nodes that - newNodes := ar.CurrentTarget - (readiness.Ready + readiness.Unready + readiness.LongNotStarted + readiness.LongUnregistered) + newNodes := ar.CurrentTarget - (readiness.Ready + readiness.Unready + readiness.LongUnregistered) if newNodes <= 0 { // Negative value is unlikely but theoretically possible. continue @@ -1006,7 +963,7 @@ func (csr *ClusterStateRegistry) GetAutoscaledNodesCount() (currentSize, targetS targetSize += accRange.CurrentTarget } for _, readiness := range csr.perNodeGroupReadiness { - currentSize += readiness.Registered - readiness.NotStarted - readiness.LongNotStarted + currentSize += readiness.Registered - readiness.NotStarted } return currentSize, targetSize } diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go index 7e4035af44dd..fc710081a66f 100644 --- a/cluster-autoscaler/clusterstate/clusterstate_test.go +++ b/cluster-autoscaler/clusterstate/clusterstate_test.go @@ -319,6 +319,79 @@ func TestTooManyUnready(t *testing.T) { assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) } +func TestUnreadyLongAfterCreation(t *testing.T) { + now := time.Now() + + ng1_1 := BuildTestNode("ng1-1", 1000, 1000) + SetNodeReadyState(ng1_1, true, now.Add(-time.Minute)) + ng2_1 := BuildTestNode("ng2-1", 1000, 1000) + SetNodeReadyState(ng2_1, false, now.Add(-time.Minute)) + ng2_1.CreationTimestamp = metav1.Time{Time: now.Add(-30 * time.Minute)} + + provider := testprovider.NewTestCloudProvider(nil, nil) + provider.AddNodeGroup("ng1", 1, 10, 1) + provider.AddNodeGroup("ng2", 1, 10, 1) + provider.AddNode("ng1", ng1_1) + provider.AddNode("ng2", ng2_1) + + assert.NotNil(t, provider) + fakeClient := &fake.Clientset{} + fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "some-map") + clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ + MaxTotalUnreadyPercentage: 10, + OkTotalUnreadyCount: 1, + }, fakeLogRecorder, newBackoff()) + err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now) + assert.NoError(t, err) + assert.Equal(t, 1, clusterstate.GetClusterReadiness().Unready) + assert.Equal(t, 0, clusterstate.GetClusterReadiness().NotStarted) + upcoming := clusterstate.GetUpcomingNodes() + assert.Equal(t, 0, upcoming["ng1"]) +} + +func TestNotStarted(t *testing.T) { + now := time.Now() + + ng1_1 := BuildTestNode("ng1-1", 1000, 1000) + SetNodeReadyState(ng1_1, true, now.Add(-time.Minute)) + ng2_1 := BuildTestNode("ng2-1", 1000, 1000) + SetNodeReadyState(ng2_1, false, now.Add(-4*time.Minute)) + SetNodeNotReadyTaint(ng2_1) + ng2_1.CreationTimestamp = metav1.Time{Time: now.Add(-10 * time.Minute)} + + provider := testprovider.NewTestCloudProvider(nil, nil) + provider.AddNodeGroup("ng1", 1, 10, 1) + provider.AddNodeGroup("ng2", 1, 10, 1) + provider.AddNode("ng1", ng1_1) + provider.AddNode("ng2", ng2_1) + + assert.NotNil(t, provider) + fakeClient := &fake.Clientset{} + fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "some-map") + clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ + MaxTotalUnreadyPercentage: 10, + OkTotalUnreadyCount: 1, + }, fakeLogRecorder, newBackoff()) + err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now) + assert.NoError(t, err) + assert.Equal(t, 1, clusterstate.GetClusterReadiness().NotStarted) + assert.Equal(t, 1, clusterstate.GetClusterReadiness().Ready) + + // node ng2_1 moves condition to ready + SetNodeReadyState(ng2_1, true, now.Add(-4*time.Minute)) + err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now) + assert.NoError(t, err) + assert.Equal(t, 1, clusterstate.GetClusterReadiness().NotStarted) + assert.Equal(t, 1, clusterstate.GetClusterReadiness().Ready) + + // node ng2_1 no longer has the taint + RemoveNodeNotReadyTaint(ng2_1) + err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now) + assert.NoError(t, err) + assert.Equal(t, 0, clusterstate.GetClusterReadiness().NotStarted) + assert.Equal(t, 2, clusterstate.GetClusterReadiness().Ready) +} + func TestExpiredScaleUp(t *testing.T) { now := time.Now() @@ -768,57 +841,6 @@ func TestUpdateScaleUp(t *testing.T) { assert.Nil(t, clusterstate.scaleUpRequests["ng1"]) } -func TestIsNodeStillStarting(t *testing.T) { - testCases := []struct { - desc string - condition apiv1.NodeConditionType - status apiv1.ConditionStatus - taintKey string - expectedResult bool - }{ - {"unready", apiv1.NodeReady, apiv1.ConditionFalse, "", true}, - {"readiness unknown", apiv1.NodeReady, apiv1.ConditionUnknown, "", true}, - {"out of disk", apiv1.NodeDiskPressure, apiv1.ConditionTrue, "", true}, - {"network unavailable", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, "", true}, - {"started", apiv1.NodeReady, apiv1.ConditionTrue, "", false}, - {"unready and unready taint", apiv1.NodeReady, apiv1.ConditionFalse, apiv1.TaintNodeNotReady, true}, - {"readiness unknown and unready taint", apiv1.NodeReady, apiv1.ConditionUnknown, apiv1.TaintNodeNotReady, true}, - {"disk pressure and disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionTrue, apiv1.TaintNodeDiskPressure, true}, - {"network unavailable and network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, apiv1.TaintNodeNetworkUnavailable, true}, - {"ready but unready taint", apiv1.NodeReady, apiv1.ConditionTrue, apiv1.TaintNodeNotReady, true}, - {"no disk pressure but disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionFalse, apiv1.TaintNodeDiskPressure, true}, - {"network available but network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionFalse, apiv1.TaintNodeNetworkUnavailable, true}, - } - for _, tc := range testCases { - createTestNode := func(timeSinceCreation time.Duration) *apiv1.Node { - node := BuildTestNode("n1", 1000, 1000) - node.CreationTimestamp.Time = time.Time{} - testedTime := node.CreationTimestamp.Time.Add(timeSinceCreation) - - SetNodeCondition(node, tc.condition, tc.status, testedTime) - - if tc.taintKey != "" { - node.Spec.Taints = []apiv1.Taint{{ - Key: tc.taintKey, - Effect: apiv1.TaintEffectNoSchedule, - TimeAdded: &metav1.Time{Time: testedTime}, - }} - } - - return node - } - t.Run("recent "+tc.desc, func(t *testing.T) { - node := createTestNode(1 * time.Minute) - assert.Equal(t, tc.expectedResult, isNodeStillStarting(node)) - }) - t.Run("long "+tc.desc, func(t *testing.T) { - node := createTestNode(30 * time.Minute) - // No matter what are the node's conditions, stop considering it not started after long enough. - assert.False(t, isNodeStillStarting(node)) - }) - } -} - func TestScaleUpFailures(t *testing.T) { now := time.Now() diff --git a/cluster-autoscaler/core/utils/utils.go b/cluster-autoscaler/core/utils/utils.go index 6fd8354c23c4..c60b050c2c4d 100644 --- a/cluster-autoscaler/core/utils/utils.go +++ b/cluster-autoscaler/core/utils/utils.go @@ -312,7 +312,7 @@ func UpdateClusterStateMetrics(csr *clusterstate.ClusterStateRegistry) { } metrics.UpdateClusterSafeToAutoscale(csr.IsClusterHealthy()) readiness := csr.GetClusterReadiness() - metrics.UpdateNodesCount(readiness.Ready, readiness.Unready+readiness.LongNotStarted, readiness.NotStarted, readiness.LongUnregistered, readiness.Unregistered) + metrics.UpdateNodesCount(readiness.Ready, readiness.Unready, readiness.NotStarted, readiness.LongUnregistered, readiness.Unregistered) } // GetOldestCreateTime returns oldest creation time out of the pods in the set diff --git a/cluster-autoscaler/utils/kubernetes/ready_test.go b/cluster-autoscaler/utils/kubernetes/ready_test.go new file mode 100644 index 000000000000..bbb1157b125f --- /dev/null +++ b/cluster-autoscaler/utils/kubernetes/ready_test.go @@ -0,0 +1,75 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + apiv1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + . "k8s.io/autoscaler/cluster-autoscaler/utils/test" +) + +func TestGetReadiness(t *testing.T) { + testCases := []struct { + desc string + condition apiv1.NodeConditionType + status apiv1.ConditionStatus + taintKey string + expectedResult bool + }{ + {"ready", apiv1.NodeReady, apiv1.ConditionTrue, "", true}, + {"unready and unready taint", apiv1.NodeReady, apiv1.ConditionFalse, apiv1.TaintNodeNotReady, false}, + {"readiness unknown and unready taint", apiv1.NodeReady, apiv1.ConditionUnknown, apiv1.TaintNodeNotReady, false}, + {"disk pressure and disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionTrue, apiv1.TaintNodeDiskPressure, false}, + {"network unavailable and network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, apiv1.TaintNodeNetworkUnavailable, false}, + {"ready but unready taint", apiv1.NodeReady, apiv1.ConditionTrue, apiv1.TaintNodeNotReady, false}, + {"no disk pressure but disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionFalse, apiv1.TaintNodeDiskPressure, false}, + {"network available but network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionFalse, apiv1.TaintNodeNetworkUnavailable, false}, + } + for _, tc := range testCases { + createTestNode := func(timeSinceCreation time.Duration) *apiv1.Node { + node := BuildTestNode("n1", 1000, 1000) + node.CreationTimestamp.Time = time.Time{} + testedTime := node.CreationTimestamp.Time.Add(timeSinceCreation) + + SetNodeCondition(node, tc.condition, tc.status, testedTime) + if tc.condition != apiv1.NodeReady { + SetNodeCondition(node, apiv1.NodeReady, apiv1.ConditionTrue, testedTime) + } + + if tc.taintKey != "" { + node.Spec.Taints = []apiv1.Taint{{ + Key: tc.taintKey, + Effect: apiv1.TaintEffectNoSchedule, + TimeAdded: &metav1.Time{Time: testedTime}, + }} + } + + return node + } + t.Run(tc.desc, func(t *testing.T) { + node := createTestNode(1 * time.Minute) + isReady, _, err := GetReadinessState(node) + assert.NoError(t, err) + assert.Equal(t, tc.expectedResult, isReady) + }) + } +} diff --git a/cluster-autoscaler/utils/test/test_utils.go b/cluster-autoscaler/utils/test/test_utils.go index ee1ff1082ccd..54edf9dc9e54 100644 --- a/cluster-autoscaler/utils/test/test_utils.go +++ b/cluster-autoscaler/utils/test/test_utils.go @@ -162,6 +162,23 @@ func SetNodeReadyState(node *apiv1.Node, ready bool, lastTransition time.Time) { } } +// SetNodeNotReadyTaint sets the not ready taint on node. +func SetNodeNotReadyTaint(node *apiv1.Node) { + node.Spec.Taints = append(node.Spec.Taints, apiv1.Taint{Key: apiv1.TaintNodeNotReady, Effect: apiv1.TaintEffectNoSchedule}) +} + +// RemoveNodeNotReadyTaint removes the not ready taint. +func RemoveNodeNotReadyTaint(node *apiv1.Node) { + var final []apiv1.Taint + for i := range node.Spec.Taints { + if node.Spec.Taints[i].Key == apiv1.TaintNodeNotReady { + continue + } + final = append(final, node.Spec.Taints[i]) + } + node.Spec.Taints = final +} + // SetNodeCondition sets node condition. func SetNodeCondition(node *apiv1.Node, conditionType apiv1.NodeConditionType, status apiv1.ConditionStatus, lastTransition time.Time) { for i := range node.Status.Conditions {