Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SKS-1903: Fix deleting a PG might create multiple deletion tasks at the same time #151

Merged
merged 3 commits into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions controllers/elfcluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,10 @@ func (r *ElfClusterReconciler) reconcileDelete(ctx *context.ClusterContext) (rec

// if cluster need to force delete, skipping infra resource deletion and remove the finalizer.
if !ctx.ElfCluster.HasForceDeleteCluster() {
if err := r.reconcileDeleteVMPlacementGroups(ctx); err != nil {
if ok, err := r.reconcileDeleteVMPlacementGroups(ctx); err != nil {
return reconcile.Result{}, errors.Wrapf(err, "failed to delete vm placement groups")
} else if !ok {
return reconcile.Result{RequeueAfter: config.DefaultRequeueTimeout}, nil
}

if err := r.reconcileDeleteLabels(ctx); err != nil {
Expand All @@ -227,15 +229,19 @@ func (r *ElfClusterReconciler) reconcileDelete(ctx *context.ClusterContext) (rec
return reconcile.Result{}, nil
}

func (r *ElfClusterReconciler) reconcileDeleteVMPlacementGroups(ctx *context.ClusterContext) error {
func (r *ElfClusterReconciler) reconcileDeleteVMPlacementGroups(ctx *context.ClusterContext) (bool, error) {
placementGroupPrefix := towerresources.GetVMPlacementGroupNamePrefix(ctx.Cluster)
if err := ctx.VMService.DeleteVMPlacementGroupsByName(ctx, placementGroupPrefix); err != nil {
return err
if pgNames, err := ctx.VMService.DeleteVMPlacementGroupsByName(ctx, placementGroupPrefix); err != nil {
return false, err
} else if len(pgNames) > 0 {
ctx.Logger.Info("Waiting for placement groups to be deleted", "placementGroupCount", len(pgNames))
haijianyang marked this conversation as resolved.
Show resolved Hide resolved

return false, nil
} else {
ctx.Logger.Info(fmt.Sprintf("Placement groups with name prefix %s deleted", placementGroupPrefix))
haijianyang marked this conversation as resolved.
Show resolved Hide resolved
}

return nil
return true, nil
}

func (r *ElfClusterReconciler) reconcileDeleteLabels(ctx *context.ClusterContext) error {
Expand Down
15 changes: 10 additions & 5 deletions controllers/elfcluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,21 +207,26 @@ var _ = Describe("ElfClusterReconciler", func() {
reconciler := &ElfClusterReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
elfClusterKey := capiutil.ObjectKey(elfCluster)

mockVMService.EXPECT().DeleteVMPlacementGroupsByName(gomock.Any(), towerresources.GetVMPlacementGroupNamePrefix(cluster)).Return(errors.New("some error"))
mockVMService.EXPECT().DeleteVMPlacementGroupsByName(gomock.Any(), towerresources.GetVMPlacementGroupNamePrefix(cluster)).Return(nil, errors.New("some error"))

result, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfClusterKey})
Expect(result).To(BeZero())
Expect(err).To(HaveOccurred())

logBuffer = new(bytes.Buffer)
klog.SetOutput(logBuffer)
task.Status = models.NewTaskStatus(models.TaskStatusSUCCESSED)
mockVMService.EXPECT().DeleteVMPlacementGroupsByName(gomock.Any(), towerresources.GetVMPlacementGroupNamePrefix(cluster)).Return(nil)
logBuffer.Reset()
mockVMService.EXPECT().DeleteVMPlacementGroupsByName(gomock.Any(), towerresources.GetVMPlacementGroupNamePrefix(cluster)).Return([]string{"pgName"}, nil)
result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfClusterKey})
Expect(result).NotTo(BeZero())
Expect(err).NotTo(HaveOccurred())
Expect(logBuffer.String()).To(ContainSubstring("Waiting for placement groups to be deleted"))

logBuffer.Reset()
mockVMService.EXPECT().DeleteVMPlacementGroupsByName(gomock.Any(), towerresources.GetVMPlacementGroupNamePrefix(cluster)).Return(nil, nil)
mockVMService.EXPECT().DeleteLabel(towerresources.GetVMLabelClusterName(), elfCluster.Name, true).Return("labelid", nil)
mockVMService.EXPECT().DeleteLabel(towerresources.GetVMLabelVIP(), elfCluster.Spec.ControlPlaneEndpoint.Host, false).Return("labelid", nil)
mockVMService.EXPECT().DeleteLabel(towerresources.GetVMLabelNamespace(), elfCluster.Namespace, true).Return("", nil)
mockVMService.EXPECT().DeleteLabel(towerresources.GetVMLabelManaged(), "true", true).Return("", nil)

result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfClusterKey})
Expect(result).To(BeZero())
Expect(err).NotTo(HaveOccurred())
Expand Down
6 changes: 5 additions & 1 deletion controllers/elfmachine_controller_placement_group.go
Original file line number Diff line number Diff line change
Expand Up @@ -630,8 +630,12 @@ func (r *ElfMachineReconciler) deletePlacementGroup(ctx *context.MachineContext)
return false, nil
}

if err := ctx.VMService.DeleteVMPlacementGroupsByName(ctx, *placementGroup.Name); err != nil {
if pgNames, err := ctx.VMService.DeleteVMPlacementGroupsByName(ctx, *placementGroup.Name); err != nil {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里加一个func DeleteVMPlacementGroupsByNamePrefix()吧,以区别于DeleteVMPlacementGroupByName

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

删除单个 PG 和多个 PG 使用不同的 func?删除的逻辑应该可以复用,DeleteVMPlacementGroupByName 调用 DeleteVMPlacementGroupsByNamePrefix ?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

单个 PG 是by Name,多个 PG是by NamePrefix,用一个func会混淆

return false, err
} else if len(pgNames) > 0 {
ctx.Logger.Info("Waiting for placement group to be deleted", "placementGroup", *placementGroup.Name)
haijianyang marked this conversation as resolved.
Show resolved Hide resolved

return false, nil
} else {
ctx.Logger.Info(fmt.Sprintf("Placement group %s deleted", *placementGroup.Name))
}
Expand Down
13 changes: 11 additions & 2 deletions controllers/elfmachine_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2572,7 +2572,7 @@ var _ = Describe("ElfMachineReconciler", func() {
placementGroup := fake.NewVMPlacementGroup([]string{})
placementGroup.Name = service.TowerString(placementGroupName)
mockVMService.EXPECT().GetVMPlacementGroup(placementGroupName).Return(placementGroup, nil)
mockVMService.EXPECT().DeleteVMPlacementGroupsByName(gomock.Any(), placementGroupName).Return(nil)
mockVMService.EXPECT().DeleteVMPlacementGroupsByName(gomock.Any(), placementGroupName).Return(nil, nil)

reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
ok, err = reconciler.deletePlacementGroup(machineContext)
Expand All @@ -2594,11 +2594,20 @@ var _ = Describe("ElfMachineReconciler", func() {
Expect(err).To(HaveOccurred())

mockVMService.EXPECT().GetVMPlacementGroup(placementGroupName).Return(placementGroup, nil)
mockVMService.EXPECT().DeleteVMPlacementGroupsByName(gomock.Any(), placementGroupName).Return(errors.New("error"))
mockVMService.EXPECT().DeleteVMPlacementGroupsByName(gomock.Any(), placementGroupName).Return(nil, errors.New("error"))
reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
ok, err = reconciler.deletePlacementGroup(machineContext)
Expect(ok).To(BeFalse())
Expect(err).To(HaveOccurred())

logBuffer.Reset()
mockVMService.EXPECT().GetVMPlacementGroup(placementGroupName).Return(placementGroup, nil)
mockVMService.EXPECT().DeleteVMPlacementGroupsByName(gomock.Any(), placementGroupName).Return([]string{*placementGroup.Name}, nil)
reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
ok, err = reconciler.deletePlacementGroup(machineContext)
Expect(ok).To(BeFalse())
Expect(err).NotTo(HaveOccurred())
Expect(logBuffer.String()).To(ContainSubstring("Waiting for placement group to be deleted"))
})

It("should delete k8s node before destroying VM.", func() {
Expand Down
7 changes: 4 additions & 3 deletions pkg/service/mock_services/vm_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

57 changes: 41 additions & 16 deletions pkg/service/vm.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ type VMService interface {
CreateVMPlacementGroup(name, clusterID string, vmPolicy models.VMVMPolicy) (*models.WithTaskVMPlacementGroup, error)
GetVMPlacementGroup(name string) (*models.VMPlacementGroup, error)
AddVMsToPlacementGroup(placementGroup *models.VMPlacementGroup, vmIDs []string) (*models.Task, error)
DeleteVMPlacementGroupsByName(ctx goctx.Context, placementGroupName string) error
DeleteVMPlacementGroupsByName(ctx goctx.Context, placementGroupName string) ([]string, error)
FindGPUDevicesByHostIDs(hostIDs []string) ([]*models.GpuDevice, error)
FindGPUDevicesByIDs(gpuIDs []string) ([]*models.GpuDevice, error)
}
Expand Down Expand Up @@ -848,35 +848,60 @@ func (svr *TowerVMService) AddVMsToPlacementGroup(placementGroup *models.VMPlace
return &models.Task{ID: updateVMPlacementGroupResp.Payload[0].TaskID}, nil
}

// DeleteVMPlacementGroupsByName deletes placement groups by name synchronously.
func (svr *TowerVMService) DeleteVMPlacementGroupsByName(ctx goctx.Context, placementGroupName string) error {
deleteVMPlacementGroupParams := clientvmplacementgroup.NewDeleteVMPlacementGroupParams()
deleteVMPlacementGroupParams.RequestBody = &models.VMPlacementGroupDeletionParams{
// DeleteVMPlacementGroupsByName deletes placement groups by name.
//
// The return value:
// 1. A empty slice indicates that all specified placements have been deleted.
// 2. A non-empty slice indicates that the names of the placement groups being deleted.
func (svr *TowerVMService) DeleteVMPlacementGroupsByName(ctx goctx.Context, placementGroupName string) ([]string, error) {
// Deleting placement groups in batches, Tower will create a deletion task
// for each placement group.
// Some tasks may fail, and failed tasks need to be deleted again.
// Therefore, need to query and confirm whether all placement groups have been deleted.
getVMPlacementGroupsParams := clientvmplacementgroup.NewGetVMPlacementGroupsParams()
getVMPlacementGroupsParams.RequestBody = &models.GetVMPlacementGroupsRequestBody{
Where: &models.VMPlacementGroupWhereInput{
NameStartsWith: TowerString(placementGroupName),
},
}

deleteVMPlacementGroupResp, err := svr.Session.VMPlacementGroup.DeleteVMPlacementGroup(deleteVMPlacementGroupParams)
getVMPlacementGroupsResp, err := svr.Session.VMPlacementGroup.GetVMPlacementGroups(getVMPlacementGroupsParams)
if err != nil {
return err
return nil, err
} else if len(getVMPlacementGroupsResp.Payload) == 0 {
return nil, nil
}

placementGroups := getVMPlacementGroupsResp.Payload
var deletedPGNames, performingOperationPGNames []string
for i := 0; i < len(placementGroups); i++ {
// EntityAsyncStatus != nil means placement group is performing an operation(deletion etc.).
// Cannot delete a placement group that is being deleted,
// otherwise multiple deletion tasks will occur at the same time,
// and the redundant tasks will eventually fail.
if placementGroups[i].EntityAsyncStatus != nil {
performingOperationPGNames = append(performingOperationPGNames, *placementGroups[i].Name)
} else {
deletedPGNames = append(deletedPGNames, *placementGroups[i].Name)
}
}

if len(deleteVMPlacementGroupResp.Payload) == 0 {
return nil
if len(deletedPGNames) == 0 {
return performingOperationPGNames, nil
}

taskID := *deleteVMPlacementGroupResp.Payload[0].TaskID
withLatestStatusTask, err := svr.WaitTask(ctx, taskID, config.WaitTaskTimeout, config.WaitTaskInterval)
if err != nil {
return errors.Wrapf(err, "failed to wait for placement group deleting task to complete in %s: pgNamePrefix %s, taskID %s", config.WaitTaskTimeoutForPlacementGroupOperation, placementGroupName, taskID)
deleteVMPlacementGroupParams := clientvmplacementgroup.NewDeleteVMPlacementGroupParams()
deleteVMPlacementGroupParams.RequestBody = &models.VMPlacementGroupDeletionParams{
Where: &models.VMPlacementGroupWhereInput{
NameIn: deletedPGNames,
haijianyang marked this conversation as resolved.
Show resolved Hide resolved
},
}

if *withLatestStatusTask.Status == models.TaskStatusFAILED {
return errors.Errorf("failed to delete placement group %s in task %s", placementGroupName, *withLatestStatusTask.ID)
if _, err := svr.Session.VMPlacementGroup.DeleteVMPlacementGroup(deleteVMPlacementGroupParams); err != nil {
return nil, err
}

return nil
return append(deletedPGNames, performingOperationPGNames...), nil
}

func (svr *TowerVMService) FindGPUDevicesByHostIDs(hostIDs []string) ([]*models.GpuDevice, error) {
Expand Down