Fix bug where a node that becomes ready after 2 mins can be treated as unready. Deprecated LongNotStarted

 In cases where node n1 would:
 1) Be created at t=0min
 2) Have its Ready condition become true at t=2.5min
 3) Have its not-ready taint removed at t=3min
 the ready node was counted as unready.

 Tested cases after the fix:
 1) The case described above
 2) Nodes that have not started even after 15 minutes are still
 treated as unready
 3) Nodes created long ago that suddenly become unready are
 counted as unready.
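
 As a rough illustration of the behavior the cases above exercise, here is a minimal, self-contained Go sketch (illustrative names, not the committed code) of the reordered classification in updateReadinessStats: readiness is now checked before the startup window, so a node that becomes ready at any point counts as Ready; an unready node still inside MaxNodeStartupTime counts as NotStarted, and only an unready node past that window counts as Unready. The real code also handles the to-be-deleted taint first and derives readiness from GetReadinessState (conditions plus taints); classify below collapses that into a single boolean.

 package main

 import (
 	"fmt"
 	"time"
 )

 // MaxNodeStartupTime mirrors the constant kept by the commit: a node has this
 // long after creation to become ready before it is counted as unready.
 const MaxNodeStartupTime = 15 * time.Minute

 // nodeState is a simplified stand-in for the buckets tracked by Readiness.
 type nodeState string

 const (
 	ready      nodeState = "Ready"
 	notStarted nodeState = "NotStarted"
 	unready    nodeState = "Unready"
 )

 // classify mirrors the new branch order: ready wins immediately; otherwise the
 // node is NotStarted only while it is within MaxNodeStartupTime of creation.
 func classify(isReady bool, created, now time.Time) nodeState {
 	switch {
 	case isReady:
 		return ready
 	case created.Add(MaxNodeStartupTime).After(now):
 		return notStarted
 	default:
 		return unready
 	}
 }

 func main() {
 	now := time.Now()
 	created := now.Add(-3 * time.Minute)
 	// Node n1 from the message above: created at t=0, Ready at t=2.5min,
 	// taint removed at t=3min -> counted as Ready, not Unready.
 	fmt.Println(classify(true, created, now)) // Ready
 	// A node that is still not ready 3 minutes after creation is NotStarted...
 	fmt.Println(classify(false, created, now)) // NotStarted
 	// ...but past MaxNodeStartupTime it is treated as Unready.
 	fmt.Println(classify(false, now.Add(-16*time.Minute), now)) // Unready
 }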
vivekbagade authored and matthias50 committed Sep 9, 2021
1 parent 20b1808 commit 7407d94
Showing 5 changed files with 174 additions and 103 deletions.
59 changes: 8 additions & 51 deletions cluster-autoscaler/clusterstate/clusterstate.go
@@ -43,10 +43,6 @@ const (
// MaxNodeStartupTime is the maximum time from the moment the node is registered to the time the node is ready.
MaxNodeStartupTime = 15 * time.Minute

// MaxStatusSettingDelayAfterCreation is the maximum time for node to set its initial status after the
// node is registered.
MaxStatusSettingDelayAfterCreation = 2 * time.Minute

// MaxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.
MaxNodeGroupBackoffDuration = 30 * time.Minute

@@ -399,7 +395,7 @@ func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool {

if unjustifiedUnready > csr.config.OkTotalUnreadyCount &&
float64(unjustifiedUnready) > csr.config.MaxTotalUnreadyPercentage/100.0*
float64(readiness.Ready+readiness.Unready+readiness.NotStarted+readiness.LongNotStarted) {
float64(readiness.Ready+readiness.Unready+readiness.NotStarted) {
return false
}

@@ -452,7 +448,7 @@ func (csr *ClusterStateRegistry) getProvisionedAndTargetSizesForNodeGroup(nodeGr
}
return 0, target, true
}
provisioned = readiness.Registered - readiness.NotStarted - readiness.LongNotStarted
provisioned = readiness.Registered - readiness.NotStarted

return provisioned, target, true
}
@@ -531,8 +527,6 @@ type Readiness struct {
// Number of nodes that are being currently deleted. They exist in K8S but
// are not included in NodeGroup.TargetSize().
Deleted int
// Number of nodes that failed to start within a reasonable limit.
LongNotStarted int
// Number of nodes that are not yet fully started.
NotStarted int
// Number of all registered nodes in the group (ready/unready/deleted/etc).
@@ -554,12 +548,10 @@ func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
current.Registered++
if deletetaint.HasToBeDeletedTaint(node) {
current.Deleted++
} else if stillStarting := isNodeStillStarting(node); stillStarting && node.CreationTimestamp.Time.Add(MaxNodeStartupTime).Before(currentTime) {
current.LongNotStarted++
} else if stillStarting {
current.NotStarted++
} else if ready {
current.Ready++
} else if node.CreationTimestamp.Time.Add(MaxNodeStartupTime).After(currentTime) {
current.NotStarted++
} else {
current.Unready++
}
@@ -743,11 +735,10 @@ func (csr *ClusterStateRegistry) GetClusterReadiness() Readiness {
func buildHealthStatusNodeGroup(isReady bool, readiness Readiness, acceptable AcceptableRange, minSize, maxSize int) api.ClusterAutoscalerCondition {
condition := api.ClusterAutoscalerCondition{
Type: api.ClusterAutoscalerHealth,
Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=%d registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
readiness.Ready,
readiness.Unready,
readiness.NotStarted,
readiness.LongNotStarted,
readiness.Registered,
readiness.LongUnregistered,
acceptable.CurrentTarget,
@@ -798,11 +789,10 @@ func buildScaleDownStatusNodeGroup(candidates []string, lastProbed time.Time) ap
func buildHealthStatusClusterwide(isReady bool, readiness Readiness) api.ClusterAutoscalerCondition {
condition := api.ClusterAutoscalerCondition{
Type: api.ClusterAutoscalerHealth,
Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=%d registered=%d longUnregistered=%d",
Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d",
readiness.Ready,
readiness.Unready,
readiness.NotStarted,
readiness.LongNotStarted,
readiness.Registered,
readiness.LongUnregistered,
),
@@ -860,39 +850,6 @@ func buildScaleDownStatusClusterwide(candidates map[string][]string, lastProbed
return condition
}

func hasTaint(node *apiv1.Node, taintKey string) bool {
for _, taint := range node.Spec.Taints {
if taint.Key == taintKey {
return true
}
}
return false
}

func isNodeStillStarting(node *apiv1.Node) bool {
for _, condition := range node.Status.Conditions {
if condition.Type == apiv1.NodeReady {
notReady := condition.Status != apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNotReady)
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
return true
}
}
if condition.Type == apiv1.NodeDiskPressure {
notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeDiskPressure)
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
return true
}
}
if condition.Type == apiv1.NodeNetworkUnavailable {
notReady := condition.Status == apiv1.ConditionTrue || hasTaint(node, apiv1.TaintNodeNetworkUnavailable)
if notReady && condition.LastTransitionTime.Time.Sub(node.CreationTimestamp.Time) < MaxStatusSettingDelayAfterCreation {
return true
}
}
}
return false
}

func updateLastTransition(oldStatus, newStatus *api.ClusterAutoscalerStatus) {
newStatus.ClusterwideConditions = updateLastTransitionSingleList(
oldStatus.ClusterwideConditions, newStatus.ClusterwideConditions)
@@ -955,7 +912,7 @@ func (csr *ClusterStateRegistry) GetUpcomingNodes() map[string]int {
readiness := csr.perNodeGroupReadiness[id]
ar := csr.acceptableRanges[id]
// newNodes is the number of nodes that
newNodes := ar.CurrentTarget - (readiness.Ready + readiness.Unready + readiness.LongNotStarted + readiness.LongUnregistered)
newNodes := ar.CurrentTarget - (readiness.Ready + readiness.Unready + readiness.LongUnregistered)
if newNodes <= 0 {
// Negative value is unlikely but theoretically possible.
continue
@@ -1006,7 +963,7 @@ func (csr *ClusterStateRegistry) GetAutoscaledNodesCount() (currentSize, targetS
targetSize += accRange.CurrentTarget
}
for _, readiness := range csr.perNodeGroupReadiness {
currentSize += readiness.Registered - readiness.NotStarted - readiness.LongNotStarted
currentSize += readiness.Registered - readiness.NotStarted
}
return currentSize, targetSize
}
124 changes: 73 additions & 51 deletions cluster-autoscaler/clusterstate/clusterstate_test.go
@@ -319,6 +319,79 @@ func TestTooManyUnready(t *testing.T) {
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
}

func TestUnreadyLongAfterCreation(t *testing.T) {
now := time.Now()

ng1_1 := BuildTestNode("ng1-1", 1000, 1000)
SetNodeReadyState(ng1_1, true, now.Add(-time.Minute))
ng2_1 := BuildTestNode("ng2-1", 1000, 1000)
SetNodeReadyState(ng2_1, false, now.Add(-time.Minute))
ng2_1.CreationTimestamp = metav1.Time{Time: now.Add(-30 * time.Minute)}

provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroup("ng1", 1, 10, 1)
provider.AddNodeGroup("ng2", 1, 10, 1)
provider.AddNode("ng1", ng1_1)
provider.AddNode("ng2", ng2_1)

assert.NotNil(t, provider)
fakeClient := &fake.Clientset{}
fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "some-map")
clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{
MaxTotalUnreadyPercentage: 10,
OkTotalUnreadyCount: 1,
}, fakeLogRecorder, newBackoff())
err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
assert.NoError(t, err)
assert.Equal(t, 1, clusterstate.GetClusterReadiness().Unready)
assert.Equal(t, 0, clusterstate.GetClusterReadiness().NotStarted)
upcoming := clusterstate.GetUpcomingNodes()
assert.Equal(t, 0, upcoming["ng1"])
}

func TestNotStarted(t *testing.T) {
now := time.Now()

ng1_1 := BuildTestNode("ng1-1", 1000, 1000)
SetNodeReadyState(ng1_1, true, now.Add(-time.Minute))
ng2_1 := BuildTestNode("ng2-1", 1000, 1000)
SetNodeReadyState(ng2_1, false, now.Add(-4*time.Minute))
SetNodeNotReadyTaint(ng2_1)
ng2_1.CreationTimestamp = metav1.Time{Time: now.Add(-10 * time.Minute)}

provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroup("ng1", 1, 10, 1)
provider.AddNodeGroup("ng2", 1, 10, 1)
provider.AddNode("ng1", ng1_1)
provider.AddNode("ng2", ng2_1)

assert.NotNil(t, provider)
fakeClient := &fake.Clientset{}
fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "some-map")
clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{
MaxTotalUnreadyPercentage: 10,
OkTotalUnreadyCount: 1,
}, fakeLogRecorder, newBackoff())
err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
assert.NoError(t, err)
assert.Equal(t, 1, clusterstate.GetClusterReadiness().NotStarted)
assert.Equal(t, 1, clusterstate.GetClusterReadiness().Ready)

// node ng2_1 moves condition to ready
SetNodeReadyState(ng2_1, true, now.Add(-4*time.Minute))
err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
assert.NoError(t, err)
assert.Equal(t, 1, clusterstate.GetClusterReadiness().NotStarted)
assert.Equal(t, 1, clusterstate.GetClusterReadiness().Ready)

// node ng2_1 no longer has the taint
RemoveNodeNotReadyTaint(ng2_1)
err = clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng2_1}, nil, now)
assert.NoError(t, err)
assert.Equal(t, 0, clusterstate.GetClusterReadiness().NotStarted)
assert.Equal(t, 2, clusterstate.GetClusterReadiness().Ready)
}

func TestExpiredScaleUp(t *testing.T) {
now := time.Now()

@@ -768,57 +841,6 @@ func TestUpdateScaleUp(t *testing.T) {
assert.Nil(t, clusterstate.scaleUpRequests["ng1"])
}

func TestIsNodeStillStarting(t *testing.T) {
testCases := []struct {
desc string
condition apiv1.NodeConditionType
status apiv1.ConditionStatus
taintKey string
expectedResult bool
}{
{"unready", apiv1.NodeReady, apiv1.ConditionFalse, "", true},
{"readiness unknown", apiv1.NodeReady, apiv1.ConditionUnknown, "", true},
{"out of disk", apiv1.NodeDiskPressure, apiv1.ConditionTrue, "", true},
{"network unavailable", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, "", true},
{"started", apiv1.NodeReady, apiv1.ConditionTrue, "", false},
{"unready and unready taint", apiv1.NodeReady, apiv1.ConditionFalse, apiv1.TaintNodeNotReady, true},
{"readiness unknown and unready taint", apiv1.NodeReady, apiv1.ConditionUnknown, apiv1.TaintNodeNotReady, true},
{"disk pressure and disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionTrue, apiv1.TaintNodeDiskPressure, true},
{"network unavailable and network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, apiv1.TaintNodeNetworkUnavailable, true},
{"ready but unready taint", apiv1.NodeReady, apiv1.ConditionTrue, apiv1.TaintNodeNotReady, true},
{"no disk pressure but disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionFalse, apiv1.TaintNodeDiskPressure, true},
{"network available but network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionFalse, apiv1.TaintNodeNetworkUnavailable, true},
}
for _, tc := range testCases {
createTestNode := func(timeSinceCreation time.Duration) *apiv1.Node {
node := BuildTestNode("n1", 1000, 1000)
node.CreationTimestamp.Time = time.Time{}
testedTime := node.CreationTimestamp.Time.Add(timeSinceCreation)

SetNodeCondition(node, tc.condition, tc.status, testedTime)

if tc.taintKey != "" {
node.Spec.Taints = []apiv1.Taint{{
Key: tc.taintKey,
Effect: apiv1.TaintEffectNoSchedule,
TimeAdded: &metav1.Time{Time: testedTime},
}}
}

return node
}
t.Run("recent "+tc.desc, func(t *testing.T) {
node := createTestNode(1 * time.Minute)
assert.Equal(t, tc.expectedResult, isNodeStillStarting(node))
})
t.Run("long "+tc.desc, func(t *testing.T) {
node := createTestNode(30 * time.Minute)
// No matter what are the node's conditions, stop considering it not started after long enough.
assert.False(t, isNodeStillStarting(node))
})
}
}

func TestScaleUpFailures(t *testing.T) {
now := time.Now()

2 changes: 1 addition & 1 deletion cluster-autoscaler/core/utils/utils.go
@@ -312,7 +312,7 @@ func UpdateClusterStateMetrics(csr *clusterstate.ClusterStateRegistry) {
}
metrics.UpdateClusterSafeToAutoscale(csr.IsClusterHealthy())
readiness := csr.GetClusterReadiness()
metrics.UpdateNodesCount(readiness.Ready, readiness.Unready+readiness.LongNotStarted, readiness.NotStarted, readiness.LongUnregistered, readiness.Unregistered)
metrics.UpdateNodesCount(readiness.Ready, readiness.Unready, readiness.NotStarted, readiness.LongUnregistered, readiness.Unregistered)
}

// GetOldestCreateTime returns oldest creation time out of the pods in the set
75 changes: 75 additions & 0 deletions cluster-autoscaler/utils/kubernetes/ready_test.go
@@ -0,0 +1,75 @@
/*
Copyright 2021 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package kubernetes

import (
"testing"
"time"

"github.com/stretchr/testify/assert"
apiv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

. "k8s.io/autoscaler/cluster-autoscaler/utils/test"
)

func TestGetReadiness(t *testing.T) {
testCases := []struct {
desc string
condition apiv1.NodeConditionType
status apiv1.ConditionStatus
taintKey string
expectedResult bool
}{
{"ready", apiv1.NodeReady, apiv1.ConditionTrue, "", true},
{"unready and unready taint", apiv1.NodeReady, apiv1.ConditionFalse, apiv1.TaintNodeNotReady, false},
{"readiness unknown and unready taint", apiv1.NodeReady, apiv1.ConditionUnknown, apiv1.TaintNodeNotReady, false},
{"disk pressure and disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionTrue, apiv1.TaintNodeDiskPressure, false},
{"network unavailable and network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionTrue, apiv1.TaintNodeNetworkUnavailable, false},
{"ready but unready taint", apiv1.NodeReady, apiv1.ConditionTrue, apiv1.TaintNodeNotReady, false},
{"no disk pressure but disk pressure taint", apiv1.NodeDiskPressure, apiv1.ConditionFalse, apiv1.TaintNodeDiskPressure, false},
{"network available but network unavailable taint", apiv1.NodeNetworkUnavailable, apiv1.ConditionFalse, apiv1.TaintNodeNetworkUnavailable, false},
}
for _, tc := range testCases {
createTestNode := func(timeSinceCreation time.Duration) *apiv1.Node {
node := BuildTestNode("n1", 1000, 1000)
node.CreationTimestamp.Time = time.Time{}
testedTime := node.CreationTimestamp.Time.Add(timeSinceCreation)

SetNodeCondition(node, tc.condition, tc.status, testedTime)
if tc.condition != apiv1.NodeReady {
SetNodeCondition(node, apiv1.NodeReady, apiv1.ConditionTrue, testedTime)
}

if tc.taintKey != "" {
node.Spec.Taints = []apiv1.Taint{{
Key: tc.taintKey,
Effect: apiv1.TaintEffectNoSchedule,
TimeAdded: &metav1.Time{Time: testedTime},
}}
}

return node
}
t.Run(tc.desc, func(t *testing.T) {
node := createTestNode(1 * time.Minute)
isReady, _, err := GetReadinessState(node)
assert.NoError(t, err)
assert.Equal(t, tc.expectedResult, isReady)
})
}
}
17 changes: 17 additions & 0 deletions cluster-autoscaler/utils/test/test_utils.go
@@ -162,6 +162,23 @@ func SetNodeReadyState(node *apiv1.Node, ready bool, lastTransition time.Time) {
}
}

// SetNodeNotReadyTaint sets the not ready taint on node.
func SetNodeNotReadyTaint(node *apiv1.Node) {
node.Spec.Taints = append(node.Spec.Taints, apiv1.Taint{Key: apiv1.TaintNodeNotReady, Effect: apiv1.TaintEffectNoSchedule})
}

// RemoveNodeNotReadyTaint removes the not ready taint.
func RemoveNodeNotReadyTaint(node *apiv1.Node) {
var final []apiv1.Taint
for i := range node.Spec.Taints {
if node.Spec.Taints[i].Key == apiv1.TaintNodeNotReady {
continue
}
final = append(final, node.Spec.Taints[i])
}
node.Spec.Taints = final
}

// SetNodeCondition sets node condition.
func SetNodeCondition(node *apiv1.Node, conditionType apiv1.NodeConditionType, status apiv1.ConditionStatus, lastTransition time.Time) {
for i := range node.Status.Conditions {
