simplify DeleteNode logic by removing an extra Mutex #3573

Merged
@@ -185,7 +185,6 @@ func getAutoscaleNodePools(manager *huaweicloudCloudManager, opts config.Autosca
    }

    clusterUpdateLock := sync.Mutex{}
-   deleteMux := sync.Mutex{}

    // Given our current implementation just support single node pool,
Member:
No problem here, but I need to point out that the clusterUpdateLock and deleteMux should be defined in the loop, so that we can ensure the same mutex can't be shared between different node groups.

Contributor Author:
Yeah, you're right. I'll move the clusterUpdateLock inside the for loop below. Thank you for the comment!
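
For illustration, a self-contained sketch of the per-iteration pattern being discussed; the nodeGroup type, its fields, and the pool names below are simplified stand-ins rather than the provider's real ones:

    package main

    import (
        "fmt"
        "sync"
    )

    // nodeGroup is a stand-in for the provider's NodeGroup, reduced to the fields
    // relevant to the locking discussion.
    type nodeGroup struct {
        nodePoolName       string
        clusterUpdateMutex *sync.Mutex
    }

    func main() {
        poolNames := []string{"pool-a", "pool-b"} // stand-in for the autoscaling-enabled node pools

        var groups []nodeGroup
        for _, name := range poolNames {
            // Declared inside the loop: each iteration creates a new mutex, so every
            // node group ends up holding a pointer to its own lock.
            clusterUpdateLock := sync.Mutex{}
            groups = append(groups, nodeGroup{
                nodePoolName:       name,
                clusterUpdateMutex: &clusterUpdateLock,
            })
        }

        fmt.Println(groups[0].clusterUpdateMutex != groups[1].clusterUpdateMutex) // true: distinct locks
    }

Because clusterUpdateLock is declared inside the loop body, &clusterUpdateLock points at a fresh variable on every iteration, which is what guarantees that no two node groups share the same mutex.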

    // please make sure there is only one node pool with Autoscaling flag turned on in CCE cluster
@@ -200,7 +199,6 @@ func getAutoscaleNodePools(manager *huaweicloudCloudManager, opts config.Autosca

        nodePoolsWithAutoscalingEnabled = append(nodePoolsWithAutoscalingEnabled, NodeGroup{
            huaweiCloudManager: manager,
-           deleteMutex:        &deleteMux,
            clusterUpdateMutex: &clusterUpdateLock,
            nodePoolName:       nodePool.Metadata.Name,
            nodePoolId:         nodePool.Metadata.Uid,

@@ -46,7 +46,6 @@ type NodeGroup struct {
    nodePoolSizeTmp     int
    getNodePoolSizeTime time.Time

-   deleteMutex        *sync.Mutex
    clusterUpdateMutex *sync.Mutex
}

@@ -134,20 +133,19 @@ func (ng *NodeGroup) IncreaseSize(delta int) error {
// DeleteNodes deletes nodes from this node group. This function
// should wait until node group size is updated.
func (ng *NodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
-   // use ng.deleteMutex to update ng.nodePoolSizeTmp, ng.getNodePoolSizeTime and ng.nodesToDelete
-   ng.deleteMutex.Lock()
+   // start the process of deleting nodes
+   ng.clusterUpdateMutex.Lock()
+   defer ng.clusterUpdateMutex.Unlock()

    // check whether deleting the nodes will cause the size of the node pool below minimum size
    // and update ng.nodesToDelete (as well as ng.nodePoolSizeTmp and ng.getNodePoolSizeTime if necessary)
    currentSize, err := checkAndUpdate(ng, nodes)
-   ng.deleteMutex.Unlock()
    if err != nil {
        return err
    }

-   // start the process of deleting nodes
-   ng.clusterUpdateMutex.Lock()
-   defer ng.clusterUpdateMutex.Unlock()

    // get the unique ids of the nodes to be deleted
-   nodeIDs, finished, err := getNodeIDsToDelete(ng, nodes, currentSize)
+   nodeIDs, finished, err := getNodeIDsToDelete(ng)
    if finished { // nodes have been deleted by other goroutine
        return nil
    }
@@ -215,26 +213,13 @@ func checkAndUpdate(ng *NodeGroup, nodes []*apiv1.Node) (int, error) {
// getNodeIDsToDelete checks whether there're still nodes waiting for being deleted. If there're no nodes
// to delete, it will return true, representing that the process of deleting the nodes has been finished;
// otherwise, it will return a slice of node ids to be deleted.
-func getNodeIDsToDelete(ng *NodeGroup, nodes []*apiv1.Node, currentSize int) ([]string, bool, error) {
+func getNodeIDsToDelete(ng *NodeGroup) ([]string, bool, error) {
    // check whether the nodes has already been deleted by other goroutine
-   ng.deleteMutex.Lock()
+   // If current goroutine is not the first one to acquire the ng.clusterUpdateMutex,
+   // it's possible that the nodes have already been deleted, which makes ng.nodesToDelete to be empty.
    if len(ng.nodesToDelete) == 0 {
-       ng.deleteMutex.Unlock()
        return nil, true, nil
    }
-   ng.deleteMutex.Unlock()

-   // wait for more nodes to be added to ng.nodesToDelete
-   time.Sleep(ng.deleteWaitTime)

-   // get a copy of the nodes to be deleted and release the lock
-   ng.deleteMutex.Lock()
-   nodes = make([]*apiv1.Node, len(ng.nodesToDelete))
-   copy(nodes, ng.nodesToDelete)
-   ng.nodesToDelete = nil
-   ng.deleteMutex.Unlock()

    // check whether the cluster is available for deleting nodes
    canUpdate, status, err := ng.huaweiCloudManager.canUpdate()
@@ -245,17 +230,15 @@ func getNodeIDsToDelete(ng *NodeGroup, nodes []*apiv1.Node, currentSize int) ([]
        return nil, false, fmt.Errorf("cluster is in %s status, cannot perform node deletion now", status)
    }

-   // check again whether deleting the nodes is valid
-   if currentSize-len(nodes) < ng.MinSize() {
-       return nil, false, fmt.Errorf("the size of the node pool is not sufficient for deleting the nodes, target size:%d, minimum size:%d", currentSize-len(nodes), ng.MinSize())
-   }

    nodeIDs := make([]string, 0)
-   for _, node := range nodes {
+   for _, node := range ng.nodesToDelete {
        // the node.Spec.ProviderID is the node's uid
        klog.Infof("Delete node with node id: %s", node.Spec.ProviderID)
        nodeIDs = append(nodeIDs, node.Spec.ProviderID)
    }

+   ng.nodesToDelete = nil

    return nodeIDs, false, nil
}

@@ -43,7 +43,6 @@ func createTestNodeGroup(manager *huaweicloudCloudManager) *NodeGroup {
    size := nodePoolNodeCount
    return &NodeGroup{
        huaweiCloudManager: manager,
-       deleteMutex:        &sync.Mutex{},
        clusterUpdateMutex: &sync.Mutex{},
        nodePoolName:       nodePoolName,
        nodePoolId:         nodePoolUID,
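
Taken together, the change collapses the whole delete path onto the single clusterUpdateMutex and drops both the extra deleteMutex and the deleteWaitTime batching sleep. Below is a hedged, self-contained sketch of the resulting flow with simplified stand-in types; the diff above is the authoritative version, and details such as the minimum-size check and the cluster-status check are only hinted at in comments:

    package main

    import (
        "fmt"
        "sync"
    )

    // Simplified stand-ins for the provider's node and NodeGroup types.
    type node struct{ providerID string }

    type nodeGroup struct {
        clusterUpdateMutex *sync.Mutex
        nodesToDelete      []*node
    }

    // deleteNodes sketches the post-change locking structure: clusterUpdateMutex is
    // taken once, up front, and everything that touches nodesToDelete runs under it.
    func (ng *nodeGroup) deleteNodes(nodes []*node) []string {
        ng.clusterUpdateMutex.Lock()
        defer ng.clusterUpdateMutex.Unlock()

        // Queue the requested nodes. In the real code, checkAndUpdate does this and
        // also verifies that the deletion will not shrink the pool below its minimum size.
        ng.nodesToDelete = append(ng.nodesToDelete, nodes...)

        // In the real code, a goroutine that acquired the lock earlier may already have
        // drained the queue, in which case there is nothing left to do.
        if len(ng.nodesToDelete) == 0 {
            return nil
        }

        // Collect the provider IDs of the queued nodes and clear the queue,
        // mirroring the simplified getNodeIDsToDelete above.
        ids := make([]string, 0, len(ng.nodesToDelete))
        for _, n := range ng.nodesToDelete {
            ids = append(ids, n.providerID)
        }
        ng.nodesToDelete = nil
        return ids
    }

    func main() {
        ng := &nodeGroup{clusterUpdateMutex: &sync.Mutex{}}
        fmt.Println(ng.deleteNodes([]*node{{providerID: "node-1"}, {providerID: "node-2"}}))
    }

The premise of the simplification is that every read and write of nodesToDelete now happens while clusterUpdateMutex is held, so the second mutex no longer protects anything the first one does not already cover.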