Skip to content

Commit

Permalink
fix(clusterstate): invalidate instance cache when scaling down
Browse files Browse the repository at this point in the history
  • Loading branch information
qianlei90 committed Dec 2, 2023
1 parent 85b6058 commit 2166ddd
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 6 deletions.
15 changes: 14 additions & 1 deletion cluster-autoscaler/cloudprovider/kwok/kwok_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,20 @@ func (kwok *KwokCloudProvider) NodeGroupForNode(node *apiv1.Node) (cloudprovider
// HasInstance returns whether a given node has a corresponding instance in this
// cloud provider. Since kwok has no real backing instances, a node is considered
// to have an instance as long as any node group's lister still returns it.
func (kwok *KwokCloudProvider) HasInstance(node *apiv1.Node) (bool, error) {
	for _, nodeGroup := range kwok.nodeGroups {
		nodeList, err := nodeGroup.lister.List()
		if err != nil {
			// Propagate lister errors; callers treat (false, err) as "unknown".
			return false, err
		}

		for _, listed := range nodeList {
			if listed.GetName() == node.GetName() {
				return true, nil
			}
		}
	}

	// Not found in any node group: the node no longer has an instance.
	return false, nil
}

// Pricing returns pricing model for this cloud provider or error if not available.
Expand Down
22 changes: 19 additions & 3 deletions cluster-autoscaler/clusterstate/clusterstate.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ type ScaleUpRequest struct {

// ScaleDownRequest contains information about the requested node deletion.
type ScaleDownRequest struct {
// NodeName is the name of the node to be deleted.
NodeName string
// Node is the node to be deleted.
Node *apiv1.Node
// NodeGroup is the node group of the deleted node.
NodeGroup cloudprovider.NodeGroup
// Time is the time when the node deletion was requested.
Expand Down Expand Up @@ -283,6 +283,12 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {

newScaleDownRequests := make([]*ScaleDownRequest, 0)
for _, scaleDownRequest := range csr.scaleDownRequests {
// Drop the scaleDownRequest if the cloud provider no longer reports an
// instance for the node; otherwise keep it until its expected delete time.
hasInstance, err := csr.cloudProvider.HasInstance(scaleDownRequest.Node)
if err == nil && !hasInstance {
continue
}
if scaleDownRequest.ExpectedDeleteTime.After(currentTime) {
newScaleDownRequests = append(newScaleDownRequests, scaleDownRequest)
}
Expand Down Expand Up @@ -501,6 +507,16 @@ func (csr *ClusterStateRegistry) IsNodeGroupScalingUp(nodeGroupName string) bool
return found
}

// IsNodeGroupScalingDown reports whether there is at least one in-flight
// scale-down request targeting the named node group.
func (csr *ClusterStateRegistry) IsNodeGroupScalingDown(nodeGroupName string) bool {
	for _, req := range csr.scaleDownRequests {
		if req.NodeGroup.Id() != nodeGroupName {
			continue
		}
		return true
	}
	return false
}

// HasNodeGroupStartedScaleUp returns true if the node group has started scale up regardless
// of whether there are any upcoming nodes. This is useful in the case when the node group's
// size reverts back to its previous size before the next UpdatesCall and we want to know
Expand Down Expand Up @@ -991,7 +1007,7 @@ func (csr *ClusterStateRegistry) GetUpcomingNodes() (upcomingCounts map[string]i
// as returned by NodeGroup.Nodes().
func (csr *ClusterStateRegistry) getCloudProviderNodeInstances() (map[string][]cloudprovider.Instance, error) {
for _, nodeGroup := range csr.cloudProvider.NodeGroups() {
if csr.IsNodeGroupScalingUp(nodeGroup.Id()) {
if csr.IsNodeGroupScalingUp(nodeGroup.Id()) || csr.IsNodeGroupScalingDown(nodeGroup.Id()) {
csr.cloudProviderNodeInstancesCache.InvalidateCacheEntry(nodeGroup)
}
}
Expand Down
2 changes: 1 addition & 1 deletion cluster-autoscaler/clusterstate/clusterstate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,7 @@ func TestRegisterScaleDown(t *testing.T) {

clusterstate.RegisterScaleDown(&ScaleDownRequest{
NodeGroup: provider.GetNodeGroup("ng1"),
NodeName: "ng1-1",
Node: ng1_1,
ExpectedDeleteTime: now.Add(time.Minute),
Time: now,
})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ func RegisterAndRecordSuccessfulScaleDownEvent(ctx *context.AutoscalingContext,
ctx.Recorder.Eventf(node, apiv1.EventTypeNormal, "ScaleDown", "nodes removed by cluster autoscaler")
csr.RegisterScaleDown(&clusterstate.ScaleDownRequest{
NodeGroup: nodeGroup,
NodeName: node.Name,
Node: node,
Time: time.Now(),
ExpectedDeleteTime: time.Now().Add(MaxCloudProviderNodeDeletionTime),
})
Expand Down

0 comments on commit 2166ddd

Please sign in to comment.