From 8037ba18f69144d81c6cd2364e8da995c481166d Mon Sep 17 00:00:00 2001 From: haijianyang Date: Thu, 20 Jul 2023 03:07:17 -0400 Subject: [PATCH 1/8] Refactor available hosts and vm migration code --- controllers/elfmachine_controller.go | 15 +- .../elfmachine_controller_placement_group.go | 187 ++++++++---------- controllers/elfmachine_controller_test.go | 121 +++++------- controllers/vm_limiter.go | 28 --- controllers/vm_limiter_test.go | 21 -- pkg/service/collections.go | 147 ++++++++++++++ pkg/service/collections_test.go | 69 +++++++ pkg/service/mock_services/vm_mock.go | 15 +- pkg/service/util.go | 86 +------- pkg/service/util_test.go | 18 -- pkg/service/vm.go | 6 +- pkg/util/kcp/kcp.go | 15 +- 12 files changed, 381 insertions(+), 347 deletions(-) create mode 100644 pkg/service/collections.go create mode 100644 pkg/service/collections_test.go diff --git a/controllers/elfmachine_controller.go b/controllers/elfmachine_controller.go index a83ae456..4373b613 100644 --- a/controllers/elfmachine_controller.go +++ b/controllers/elfmachine_controller.go @@ -842,12 +842,6 @@ func (r *ElfMachineReconciler) reconcileVMTask(ctx *context.MachineContext, vm * switch { case service.IsCloneVMTask(task): releaseTicketForCreateVM(ctx.ElfMachine.Name) - case service.IsVMMigrationTask(task): - placementGroupName, err := towerresources.GetVMPlacementGroupName(ctx, ctx.Client, ctx.Machine, ctx.Cluster) - if err != nil { - return false, err - } - releaseTicketForPlacementGroupVMMigration(placementGroupName) case service.IsMemoryInsufficientError(errorMessage): setElfClusterMemoryInsufficient(ctx.ElfCluster.Spec.Cluster, true) message := fmt.Sprintf("Insufficient memory detected for ELF cluster %s", ctx.ElfCluster.Spec.Cluster) @@ -858,16 +852,9 @@ func (r *ElfMachineReconciler) reconcileVMTask(ctx *context.MachineContext, vm * case models.TaskStatusSUCCESSED: ctx.Logger.Info("VM task succeeded", "vmRef", vmRef, "taskRef", taskRef, "taskDescription", service.GetTowerString(task.Description)) - switch { - case service.IsCloneVMTask(task) || service.IsPowerOnVMTask(task): + if service.IsCloneVMTask(task) || service.IsPowerOnVMTask(task) { setElfClusterMemoryInsufficient(ctx.ElfCluster.Spec.Cluster, false) releaseTicketForCreateVM(ctx.ElfMachine.Name) - case service.IsVMMigrationTask(task): - placementGroupName, err := towerresources.GetVMPlacementGroupName(ctx, ctx.Client, ctx.Machine, ctx.Cluster) - if err != nil { - return false, err - } - releaseTicketForPlacementGroupVMMigration(placementGroupName) } default: ctx.Logger.Info("Waiting for VM task done", "vmRef", vmRef, "taskRef", taskRef, "taskStatus", service.GetTowerTaskStatus(task.Status), "taskDescription", service.GetTowerString(task.Description)) diff --git a/controllers/elfmachine_controller_placement_group.go b/controllers/elfmachine_controller_placement_group.go index 2e27f4ae..5f249a3b 100644 --- a/controllers/elfmachine_controller_placement_group.go +++ b/controllers/elfmachine_controller_placement_group.go @@ -145,7 +145,7 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex return nil, err } - usedHostsByPG, err := r.getHostsInPlacementGroup(ctx, placementGroup) + usedHostSetByPG, err := r.getHostsInPlacementGroup(ctx, placementGroup) if err != nil { return nil, err } @@ -155,10 +155,10 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex return nil, err } + usedHostsByPG := hosts.Find(usedHostSetByPG) availableHosts := r.getAvailableHostsForVM(ctx, hosts, usedHostsByPG, nil) - 
availableHostSet := service.HostsToSet(availableHosts) - if availableHostSet.Len() != 0 { - ctx.Logger.V(1).Info("The placement group still has capacity", "placementGroup", *placementGroup.Name, "availableHosts", availableHostSet.UnsortedList()) + if !availableHosts.IsEmpty() { + ctx.Logger.V(1).Info("The placement group still has capacity", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String()) return pointer.String(""), nil } @@ -170,7 +170,7 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex // KCP is not in scaling down/rolling update. if !(kcputil.IsKCPRollingUpdateFirstMachine(kcp) || kcputil.IsKCPInScalingDown(kcp)) { - ctx.Logger.V(1).Info("The placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHostSet.UnsortedList(), "usedHostsByPG", usedHostsByPG.UnsortedList()) + ctx.Logger.V(1).Info("The placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) return nil, nil } @@ -192,7 +192,7 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex return nil, errors.Wrapf(err, "failed to init patch helper for %s %s/%s", ctx.Machine.GroupVersionKind(), ctx.Machine.Namespace, ctx.Machine.Name) } - ctx.Logger.Info("Add the delete machine annotation on KCP Machine in order to delete it, because KCP is being scaled down after a failed scaling up", "placementGroup", *placementGroup.Name, "availableHosts", availableHostSet.UnsortedList()) + ctx.Logger.Info("Add the delete machine annotation on KCP Machine in order to delete it, because KCP is being scaled down after a failed scaling up", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String()) // Allow scaling down of KCP with the possibility of marking specific control plane machine(s) to be deleted with delete annotation key. // The presence of the annotation will affect the rollout strategy in a way that, it implements the following prioritization logic in descending order, @@ -213,7 +213,7 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex // KCP is in rolling update. - if !service.ContainsUnavailableHost(hosts, usedHostsByPG.UnsortedList(), *service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)) && + if usedHostsByPG.Len() == usedHostsByPG.Available(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)).Len() && int(*kcp.Spec.Replicas) == usedHostsByPG.Len() { // Only when KCP is in rolling update and the placement group is full // does it need to get the latest created machine. 
@@ -225,7 +225,7 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex return pointer.String(hostID), err } - ctx.Logger.V(1).Info("The placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHostSet.UnsortedList(), "usedHostsByPG", usedHostsByPG.UnsortedList()) + ctx.Logger.V(1).Info("The placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) return nil, nil } @@ -235,7 +235,7 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex // Find the latest created machine in the placement group, // and set the host where the machine is located to the first machine created by KCP rolling update. // This prevents migration of virtual machine during KCP rolling update when using a placement group. -func (r *ElfMachineReconciler) getVMHostForRollingUpdate(ctx *context.MachineContext, placementGroup *models.VMPlacementGroup, hosts []*models.Host) (string, error) { +func (r *ElfMachineReconciler) getVMHostForRollingUpdate(ctx *context.MachineContext, placementGroup *models.VMPlacementGroup, hosts service.Hosts) (string, error) { elfMachines, err := machineutil.GetControlPlaneElfMachinesInCluster(ctx, ctx.Client, ctx.Cluster.Namespace, ctx.Cluster.Name) if err != nil { return "", err @@ -267,7 +267,7 @@ func (r *ElfMachineReconciler) getVMHostForRollingUpdate(ctx *context.MachineCon if vm, err := ctx.VMService.Get(vmMap[machine.Name]); err != nil { return "", err } else { - host := service.GetHostFromList(*vm.Host.ID, hosts) + host := hosts.Get(*vm.Host.ID) if host == nil { ctx.Logger.Info("Host not found, skip selecting host for VM", "hostID", *vm.Host.ID, "vmRef", ctx.ElfMachine.Status.VMRef) } else { @@ -307,33 +307,22 @@ func (r *ElfMachineReconciler) getHostsInPlacementGroup(ctx *context.MachineCont // The 'Available' means that the specified VM can run on these hosts. // It returns hosts that are not in faulted state, not in the given 'usedHostsByPG', // and have sufficient memory for running this VM. -func (r *ElfMachineReconciler) getAvailableHostsForVM(ctx *context.MachineContext, hosts []*models.Host, usedHostsByPG sets.Set[string], vm *models.VM) []*models.Host { +func (r *ElfMachineReconciler) getAvailableHostsForVM(ctx *context.MachineContext, hosts service.Hosts, usedHostsByPG service.Hosts, vm *models.VM) service.Hosts { + availableHosts := hosts.Available(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)).Difference(usedHostsByPG) + // If the VM is running, and the host where the VM is located // is not used by the placement group, then it is not necessary to // check the memory is sufficient to determine whether the host is available. - // Otherwise, the VM may need to be migrated to another host, - // and need to check whether the memory is sufficient. 
- if vm != nil && (*vm.Status == models.VMStatusRUNNING || *vm.Status == models.VMStatusSUSPENDED) { - availableHosts := service.GetAvailableHosts(hosts, 0) - unusedHostsByPG := service.HostsToSet(availableHosts).Difference(usedHostsByPG) - - if unusedHostsByPG.Has(*vm.Host.ID) { - return service.FilterHosts(availableHosts, usedHostsByPG) + if vm != nil && + (*vm.Status == models.VMStatusRUNNING || *vm.Status == models.VMStatusSUSPENDED) && + !availableHosts.Contains(*vm.Host.ID) { + host := hosts.Get(*vm.Host.ID) + available, _ := service.IsAvailableHost(host, 0) + if available && !usedHostsByPG.Contains(*host.ID) { + availableHosts.Insert(host) } - - // The virtual machine is not on an unused host and may need to be migrated, - // so need to check whether the host memory is sufficient. - unusedHosts := service.FilterHosts(availableHosts, usedHostsByPG) - availableHosts = service.GetAvailableHosts(unusedHosts, *service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)) - - return availableHosts } - // The virtual machine has not been created or is not powered on. - // Need to check whether the memory of the host is sufficient. - availableHosts := service.GetAvailableHosts(hosts, *service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)) - availableHosts = service.FilterHosts(availableHosts, usedHostsByPG) - return availableHosts } @@ -360,6 +349,8 @@ func (r *ElfMachineReconciler) getPlacementGroup(ctx *context.MachineContext, pl // For example, the virtual machine has joined the placement group. // 2. false and error is nil means the virtual machine has not joined the placement group. // For example, the placement group is full or the virtual machine is being migrated. +// +//nolint:gocyclo func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, vm *models.VM) (ret bool, reterr error) { if !version.IsCompatibleWithPlacementGroup(ctx.ElfMachine) { ctx.Logger.V(1).Info(fmt.Sprintf("The capeVersion of ElfMachine is lower than %s, skip adding VM to the placement group", version.CAPEVersion1_2_0), "capeVersion", version.GetCAPEVersion(ctx.ElfMachine)) @@ -401,19 +392,20 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v } if machineutil.IsControlPlaneMachine(ctx.Machine) { - usedHostsByPG, err := r.getHostsInPlacementGroup(ctx, placementGroup) + hosts, err := ctx.VMService.GetHostsByCluster(ctx.ElfCluster.Spec.Cluster) if err != nil { return false, err } - hosts, err := ctx.VMService.GetHostsByCluster(ctx.ElfCluster.Spec.Cluster) + usedHostSetByPG, err := r.getHostsInPlacementGroup(ctx, placementGroup) if err != nil { return false, err } + usedHostsByPG := hosts.Find(usedHostSetByPG) + availableHosts := r.getAvailableHostsForVM(ctx, hosts, usedHostsByPG, vm) - availableHostSet := service.HostsToSet(availableHosts) - if availableHostSet.Len() == 0 { + if availableHosts.IsEmpty() { kcp, err := machineutil.GetKCPByMachine(ctx, ctx.Client, ctx.Machine) if err != nil { return false, err @@ -424,21 +416,21 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v // In this case first machine created by KCP rolling update can be powered on without being added to the placement group. 
if kcputil.IsKCPRollingUpdateFirstMachine(kcp) && *vm.Status == models.VMStatusSTOPPED && - !service.ContainsUnavailableHost(hosts, usedHostsByPG.UnsortedList(), *service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)) && + usedHostsByPG.Len() == usedHostsByPG.Available(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)).Len() && int(*kcp.Spec.Replicas) == usedHostsByPG.Len() { - ctx.Logger.Info("The placement group is full and KCP is in rolling update, skip adding VM to the placement group", "placementGroup", *placementGroup.Name, "availableHosts", availableHostSet.UnsortedList(), "usedHostsByPG", usedHostsByPG.UnsortedList(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) + ctx.Logger.Info("The placement group is full and KCP is in rolling update, skip adding VM to the placement group", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) return true, nil } if *vm.Status == models.VMStatusRUNNING || *vm.Status == models.VMStatusSUSPENDED { - ctx.Logger.V(1).Info(fmt.Sprintf("The placement group is full and VM is in %s status, skip adding VM to the placement group", *vm.Status), "placementGroup", *placementGroup.Name, "availableHosts", availableHostSet.UnsortedList(), "usedHostsByPG", usedHostsByPG.UnsortedList(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) + ctx.Logger.V(1).Info(fmt.Sprintf("The placement group is full and VM is in %s status, skip adding VM to the placement group", *vm.Status), "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) return true, nil } // KCP is scaling out or being created. - ctx.Logger.V(1).Info("The placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHostSet.UnsortedList(), "usedHostsByPG", usedHostsByPG.UnsortedList(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) + ctx.Logger.V(1).Info("The placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) return false, nil } @@ -447,9 +439,56 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v // and the virtual machine is not STOPPED, we need to migrate the virtual machine to a host that // is not used by the placement group before adding the virtual machine to the placement group. // Otherwise, just add the virtual machine to the placement group directly. 
- ctx.Logger.V(1).Info("The availableHosts for migrating the VM", "hosts", availableHostSet, "vmHost", *vm.Host.ID) - if !availableHostSet.Has(*vm.Host.ID) && *vm.Status != models.VMStatusSTOPPED { - return r.migrateVMForJoiningPlacementGroup(ctx, vm, placementGroup, availableHostSet.UnsortedList()[0]) + ctx.Logger.V(1).Info("The availableHosts for migrating the VM", "hosts", availableHosts.String(), "vmHost", *vm.Host.ID) + if !availableHosts.Contains(*vm.Host.ID) && *vm.Status != models.VMStatusSTOPPED { + ctx.Logger.V(1).Info("Try to migrate the virtual machine to the specified target host if needed") + + kcp, err := machineutil.GetKCPByMachine(ctx, ctx.Client, ctx.Machine) + if err != nil { + return false, err + } + + if kcputil.IsKCPRollingUpdate(kcp) { + ctx.Logger.Info("KCP rolling update in progress, skip migrating VM", "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) + return true, nil + } + + // The 1st new CP ElfMachine should wait for other new CP ElfMachines to join the target PlacementGroup. + // The code below double checks the recommended target host for migration is valid. + cpElfMachines, err := machineutil.GetControlPlaneElfMachinesInCluster(ctx, ctx.Client, ctx.Cluster.Namespace, ctx.Cluster.Name) + if err != nil { + return false, err + } + + targetHost := availableHosts.UnsortedList()[0] + usedHostsByPG := sets.Set[string]{} + for i := 0; i < len(cpElfMachines); i++ { + if ctx.ElfMachine.Name != cpElfMachines[i].Name && + cpElfMachines[i].Status.PlacementGroupRef == *placementGroup.ID && + cpElfMachines[i].CreationTimestamp.After(ctx.ElfMachine.CreationTimestamp.Time) { + usedHostsByPG.Insert(cpElfMachines[i].Status.HostServerRef) + } + } + + usedHostsCount := usedHostsByPG.Len() + ctx.Logger.V(1).Info("The hosts used by the PlacementGroup", "usedHosts", usedHostsByPG, "count", usedHostsCount, "targetHost", targetHost) + if usedHostsCount < int(*kcp.Spec.Replicas-1) { + ctx.Logger.V(1).Info("Not all other CPs joined the PlacementGroup, skip migrating VM") + return true, nil + } + + if usedHostsByPG.Has(*targetHost.ID) { + ctx.Logger.V(1).Info("The recommended target host for VM migration is used by the PlacementGroup, skip migrating VM") + return true, nil + } + + // KCP is not in rolling update process. + // This is the last CP ElfMachine (i.e. the 1st new CP ElfMachine) which has not been added into the target PlacementGroup. + // Migrate this VM to the target host, then it will be added into the target PlacementGroup. + + ctx.Logger.V(1).Info("Start migrateVM since KCP is not in rolling update process", "targetHost", targetHost) + + return r.migrateVM(ctx, vm, *targetHost.ID) } } @@ -463,79 +502,17 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v return true, nil } -// migrateVMForJoiningPlacementGroup migrates the virtual machine to the specified target host -// for joining placement group. -// -// The return value: -// 1. true means that the virtual machine does not need to be migrated. -// 2. false and error is nil means the virtual machine is being migrated. 
-func (r *ElfMachineReconciler) migrateVMForJoiningPlacementGroup(ctx *context.MachineContext, vm *models.VM, placementGroup *models.VMPlacementGroup, targetHost string) (bool, error) { - ctx.Logger.V(1).Info("Try to migrate the virtual machine to the specified target host if needed") - kcp, err := machineutil.GetKCPByMachine(ctx, ctx.Client, ctx.Machine) - if err != nil { - return false, err - } - - // When *kcp.Spec.Replicas != kcp.Status.UpdatedReplicas, it must be in a KCP rolling update process. - // When *kcp.Spec.Replicas == kcp.Status.UpdatedReplicas, it could be in one of the following cases: - // 1) It's not in a KCP rolling update process. So kcp.Spec.Replicas == kcp.Status.Replicas. - // 2) It's at the end of a KCP rolling update process, and the last KCP replica (i.e the last KCP ElfMachine) is created just now. - // There is still an old KCP ElfMachine, so kcp.Spec.Replicas + 1 == kcp.Status.Replicas. - - if *kcp.Spec.Replicas != kcp.Status.UpdatedReplicas || *kcp.Spec.Replicas != kcp.Status.Replicas { - ctx.Logger.Info("KCP rolling update in progress, skip migrating VM", "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) - return true, nil - } - - // The 1st new CP ElfMachine should wait for other new CP ElfMachines to join the target PlacementGroup. - // The code below double checks the recommended target host for migration is valid. - cpElfMachines, err := machineutil.GetControlPlaneElfMachinesInCluster(ctx, ctx.Client, ctx.Cluster.Namespace, ctx.Cluster.Name) - if err != nil { - return false, err - } - usedHostsByPG := sets.Set[string]{} - for i := 0; i < len(cpElfMachines); i++ { - if ctx.ElfMachine.Name != cpElfMachines[i].Name && - cpElfMachines[i].Status.PlacementGroupRef == *placementGroup.ID && - cpElfMachines[i].CreationTimestamp.After(ctx.ElfMachine.CreationTimestamp.Time) { - usedHostsByPG.Insert(cpElfMachines[i].Status.HostServerRef) - } - } - usedHostsCount := usedHostsByPG.Len() - ctx.Logger.V(1).Info("The hosts used by the PlacementGroup", "usedHosts", usedHostsByPG, "count", usedHostsCount, "targetHost", targetHost) - if usedHostsCount < int(*kcp.Spec.Replicas-1) { - ctx.Logger.V(1).Info("Not all other CPs joined the PlacementGroup, skip migrating VM") - return true, nil - } - if usedHostsByPG.Has(targetHost) { - ctx.Logger.V(1).Info("The recommended target host for VM migration is used by the PlacementGroup, skip migrating VM") - return true, nil - } - - // KCP is not in rolling update process. - // This is the last CP ElfMachine (i.e. the 1st new CP ElfMachine) which has not been added into the target PlacementGroup. - // Migrate this VM to the target host, then it will be added into the target PlacementGroup. - ctx.Logger.V(1).Info("Start migrateVM since KCP is not in rolling update process", "targetHost", targetHost) - return r.migrateVM(ctx, vm, placementGroup, targetHost) -} - // migrateVM migrates the virtual machine to the specified target host. // // The return value: // 1. true means that the virtual machine does not need to be migrated. // 2. false and error is nil means the virtual machine is being migrated. 
-func (r *ElfMachineReconciler) migrateVM(ctx *context.MachineContext, vm *models.VM, placementGroup *models.VMPlacementGroup, targetHost string) (bool, error) { +func (r *ElfMachineReconciler) migrateVM(ctx *context.MachineContext, vm *models.VM, targetHost string) (bool, error) { if *vm.Host.ID == targetHost { ctx.Logger.V(1).Info(fmt.Sprintf("The VM is already on the recommended target host %s, skip migrating VM", targetHost)) return true, nil } - if ok := acquireTicketForPlacementGroupVMMigration(*placementGroup.Name); !ok { - ctx.Logger.V(1).Info("The placement group is performing another VM migration, skip migrating VM", "placementGroup", service.GetTowerString(placementGroup.Name), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) - - return false, nil - } - withTaskVM, err := ctx.VMService.Migrate(service.GetTowerString(vm.ID), targetHost) if err != nil { return false, err diff --git a/controllers/elfmachine_controller_test.go b/controllers/elfmachine_controller_test.go index 5bd4867d..c74c0ca1 100644 --- a/controllers/elfmachine_controller_test.go +++ b/controllers/elfmachine_controller_test.go @@ -35,7 +35,6 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog/v2" "k8s.io/utils/pointer" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -955,7 +954,7 @@ var _ = Describe("ElfMachineReconciler", func() { fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) mockVMService.EXPECT().GetVMPlacementGroup(gomock.Any()).Return(placementGroup, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host), nil) mockVMService.EXPECT().FindByIDs([]string{}).Return([]*models.VM{}, nil) mockVMService.EXPECT().AddVMsToPlacementGroup(placementGroup, []string{*vm.ID}).Return(task, nil) mockVMService.EXPECT().WaitTask(*task.ID, config.WaitTaskTimeout, config.WaitTaskInterval).Return(task, nil) @@ -980,12 +979,13 @@ var _ = Describe("ElfMachineReconciler", func() { kcp.Spec.Replicas = pointer.Int32(1) kcp.Status.Replicas = 2 kcp.Status.UpdatedReplicas = 1 + conditions.MarkFalse(kcp, controlplanev1.MachinesSpecUpToDateCondition, controlplanev1.RollingUpdateInProgressReason, clusterv1.ConditionSeverityWarning, "") ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md, kcp) machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) mockVMService.EXPECT().GetVMPlacementGroup(gomock.Any()).Return(placementGroup, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host), nil) mockVMService.EXPECT().FindByIDs([]string{*placementGroup.Vms[0].ID}).Return([]*models.VM{vm2}, nil) reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} @@ -998,7 +998,7 @@ var _ = Describe("ElfMachineReconciler", func() { klog.SetOutput(logBuffer) host.HostState = &models.NestedMaintenanceHostState{State: models.NewMaintenanceModeEnum(models.MaintenanceModeEnumMAINTENANCEMODE)} mockVMService.EXPECT().GetVMPlacementGroup(gomock.Any()).Return(placementGroup, nil) - 
mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host), nil) mockVMService.EXPECT().FindByIDs([]string{*placementGroup.Vms[0].ID}).Return([]*models.VM{}, nil) reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} @@ -1012,7 +1012,7 @@ var _ = Describe("ElfMachineReconciler", func() { vm.Status = models.NewVMStatus(models.VMStatusRUNNING) host.HostState = &models.NestedMaintenanceHostState{State: models.NewMaintenanceModeEnum(models.MaintenanceModeEnumMAINTENANCEMODE)} mockVMService.EXPECT().GetVMPlacementGroup(gomock.Any()).Return(placementGroup, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host), nil) mockVMService.EXPECT().FindByIDs([]string{*placementGroup.Vms[0].ID}).Return([]*models.VM{}, nil) reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} @@ -1042,7 +1042,7 @@ var _ = Describe("ElfMachineReconciler", func() { fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) mockVMService.EXPECT().GetVMPlacementGroup(gomock.Any()).Return(placementGroup, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host1, host2, host3}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host1, host2, host3), nil) mockVMService.EXPECT().FindByIDs([]string{*vm2.ID}).Return([]*models.VM{vm2}, nil) mockVMService.EXPECT().AddVMsToPlacementGroup(placementGroup, gomock.Any()).Return(task, nil) mockVMService.EXPECT().WaitTask(*task.ID, config.WaitTaskTimeout, config.WaitTaskInterval).Return(task, nil) @@ -1073,12 +1073,13 @@ var _ = Describe("ElfMachineReconciler", func() { kcp.Spec.Replicas = pointer.Int32(3) kcp.Status.UpdatedReplicas = 3 kcp.Status.Replicas = 4 + conditions.MarkFalse(kcp, controlplanev1.MachinesSpecUpToDateCondition, controlplanev1.RollingUpdateInProgressReason, clusterv1.ConditionSeverityWarning, "") ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md, kcp) machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) mockVMService.EXPECT().GetVMPlacementGroup(gomock.Any()).Return(placementGroup, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host1, host2, host3}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host1, host2, host3), nil) mockVMService.EXPECT().FindByIDs(gomock.InAnyOrder([]string{*newCP2.ID, *oldCP3.ID})).Return([]*models.VM{newCP2, oldCP3}, nil) reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} @@ -1138,7 +1139,7 @@ var _ = Describe("ElfMachineReconciler", func() { mockVMService.EXPECT().GetVMPlacementGroup(gomock.Any()).Return(placementGroup, nil) mockVMService.EXPECT().FindByIDs(gomock.InAnyOrder([]string{*vm1.ID, *vm2.ID})).Return([]*models.VM{vm1, vm2}, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host0, host1, host2}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host0, host1, 
host2), nil) mockVMService.EXPECT().Migrate(*vm0.ID, *host0.ID).Return(withTaskVM, nil) reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} @@ -1155,7 +1156,7 @@ var _ = Describe("ElfMachineReconciler", func() { elfMachine1.Status.HostServerRef = *host0.ID mockVMService.EXPECT().GetVMPlacementGroup(gomock.Any()).Return(placementGroup, nil) mockVMService.EXPECT().FindByIDs(gomock.InAnyOrder([]string{*vm1.ID, *vm2.ID})).Return([]*models.VM{vm1, vm2}, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host0, host1, host2}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host0, host1, host2), nil) ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, kcp, elfMachine1, machine1, elfMachine2, machine2) machineContext = newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) @@ -1172,13 +1173,12 @@ var _ = Describe("ElfMachineReconciler", func() { host := fake.NewTowerHost() vm := fake.NewTowerVMFromElfMachine(elfMachine) vm.Host = &models.NestedHost{ID: service.TowerString(*host.ID)} - placementGroup := fake.NewVMPlacementGroup([]string{}) ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, kcp) machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} - ok, err := reconciler.migrateVM(machineContext, vm, placementGroup, *host.ID) + ok, err := reconciler.migrateVM(machineContext, vm, *host.ID) Expect(ok).To(BeTrue()) Expect(err).To(BeZero()) Expect(logBuffer.String()).To(ContainSubstring("The VM is already on the recommended target host")) @@ -1241,7 +1241,7 @@ var _ = Describe("ElfMachineReconciler", func() { logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) mockVMService.EXPECT().GetVMPlacementGroup(placementGroupName).Return(placementGroup, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host), nil) mockVMService.EXPECT().FindByIDs(gomock.InAnyOrder([]string{})).Return([]*models.VM{}, nil) reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} @@ -1253,7 +1253,7 @@ var _ = Describe("ElfMachineReconciler", func() { klog.SetOutput(logBuffer) placementGroup.Vms = []*models.NestedVM{{ID: vm1.ID, Name: vm1.Name}} mockVMService.EXPECT().GetVMPlacementGroup(placementGroupName).Return(placementGroup, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host), nil) mockVMService.EXPECT().FindByIDs(gomock.InAnyOrder([]string{*vm1.ID})).Return([]*models.VM{vm1}, nil) reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} @@ -1263,7 +1263,7 @@ var _ = Describe("ElfMachineReconciler", func() { Expect(logBuffer.String()).To(ContainSubstring("The placement group is full, wait for enough available hosts")) }) - It("when placement group is full", func() { + It("when placement group is full and KCP 
rolling update in progress", func() { host1 := fake.NewTowerHost() host2 := fake.NewTowerHost() host3 := fake.NewTowerHost() @@ -1299,6 +1299,7 @@ var _ = Describe("ElfMachineReconciler", func() { kcp.Spec.Replicas = pointer.Int32(3) kcp.Status.Replicas = 4 kcp.Status.UpdatedReplicas = 1 + conditions.MarkFalse(kcp, controlplanev1.MachinesSpecUpToDateCondition, controlplanev1.RollingUpdateInProgressReason, clusterv1.ConditionSeverityWarning, "") ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, kcp, elfMachine1, machine1, elfMachine2, machine2, elfMachine3, machine3) machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) @@ -1312,7 +1313,7 @@ var _ = Describe("ElfMachineReconciler", func() { klog.SetOutput(logBuffer) mockVMService.EXPECT().Get(*vm3.ID).Return(vm3, nil) mockVMService.EXPECT().GetVMPlacementGroup(placementGroupName).Return(placementGroup, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host1, host2, host3}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host1, host2, host3), nil) mockVMService.EXPECT().FindByIDs(gomock.InAnyOrder([]string{*vm1.ID, *vm2.ID, *vm3.ID})).Return([]*models.VM{vm1, vm2, vm3}, nil) reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} @@ -1326,7 +1327,7 @@ var _ = Describe("ElfMachineReconciler", func() { klog.SetOutput(logBuffer) host1.Status = models.NewHostStatus(models.HostStatusCONNECTEDERROR) mockVMService.EXPECT().GetVMPlacementGroup(placementGroupName).Return(placementGroup, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host1, host2, host3}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host1, host2, host3), nil) mockVMService.EXPECT().FindByIDs(gomock.InAnyOrder([]string{*vm1.ID, *vm2.ID, *vm3.ID})).Return([]*models.VM{vm1, vm2, vm3}, nil) reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} @@ -1340,7 +1341,7 @@ var _ = Describe("ElfMachineReconciler", func() { host1.Status = models.NewHostStatus(models.HostStatusCONNECTEDHEALTHY) host2.Status = models.NewHostStatus(models.HostStatusCONNECTEDERROR) mockVMService.EXPECT().GetVMPlacementGroup(placementGroupName).Return(placementGroup, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host1, host2, host3}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host1, host2, host3), nil) mockVMService.EXPECT().FindByIDs(gomock.InAnyOrder([]string{*vm1.ID, *vm2.ID, *vm3.ID})).Return([]*models.VM{vm1, vm2, vm3}, nil) reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} @@ -1354,7 +1355,7 @@ var _ = Describe("ElfMachineReconciler", func() { host2.Status = models.NewHostStatus(models.HostStatusCONNECTEDHEALTHY) host3.Status = models.NewHostStatus(models.HostStatusCONNECTEDERROR) mockVMService.EXPECT().GetVMPlacementGroup(placementGroupName).Return(placementGroup, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host1, host2, host3}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host1, host2, host3), nil) 
mockVMService.EXPECT().FindByIDs(gomock.InAnyOrder([]string{*vm1.ID, *vm2.ID, *vm3.ID})).Return([]*models.VM{vm1, vm2, vm3}, nil) reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} @@ -1370,7 +1371,7 @@ var _ = Describe("ElfMachineReconciler", func() { mockVMService.EXPECT().Get(*vm3.ID).Return(vm3, nil) reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} - hostID, err := reconciler.getVMHostForRollingUpdate(machineContext, placementGroup, hosts) + hostID, err := reconciler.getVMHostForRollingUpdate(machineContext, placementGroup, service.NewHostsFromList(hosts)) Expect(err).To(BeZero()) Expect(hostID).To(Equal("")) Expect(logBuffer.String()).To(ContainSubstring("Host is unavailable: host is in CONNECTED_ERROR status, skip selecting host for VM")) @@ -1382,7 +1383,7 @@ var _ = Describe("ElfMachineReconciler", func() { mockVMService.EXPECT().Get(*vm3.ID).Return(vm3, nil) reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} - hostID, err = reconciler.getVMHostForRollingUpdate(machineContext, placementGroup, hosts) + hostID, err = reconciler.getVMHostForRollingUpdate(machineContext, placementGroup, service.NewHostsFromList(hosts)) Expect(err).To(BeZero()) Expect(hostID).To(Equal("")) Expect(logBuffer.String()).To(ContainSubstring("Host not found, skip selecting host for VM")) @@ -1399,7 +1400,7 @@ var _ = Describe("ElfMachineReconciler", func() { mockVMService.EXPECT().Get(*vm3.ID).Return(vm3, nil) reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} - hostID, err = reconciler.getVMHostForRollingUpdate(machineContext, placementGroup, hosts) + hostID, err = reconciler.getVMHostForRollingUpdate(machineContext, placementGroup, service.NewHostsFromList(hosts)) Expect(err).To(BeZero()) Expect(hostID).To(Equal(*vm3.Host.ID)) Expect(logBuffer.String()).To(ContainSubstring("Selected the host server for VM since the placement group is full")) @@ -1411,7 +1412,7 @@ var _ = Describe("ElfMachineReconciler", func() { mockVMService.EXPECT().Get(*vm3.ID).Return(vm3, nil) reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} - hostID, err = reconciler.getVMHostForRollingUpdate(machineContext, placementGroup, hosts) + hostID, err = reconciler.getVMHostForRollingUpdate(machineContext, placementGroup, service.NewHostsFromList(hosts)) Expect(err).To(BeZero()) Expect(hostID).To(Equal(*vm3.Host.ID)) Expect(logBuffer.String()).To(ContainSubstring("Selected the host server for VM since the placement group is full")) @@ -1450,7 +1451,7 @@ var _ = Describe("ElfMachineReconciler", func() { placementGroupName, err := towerresources.GetVMPlacementGroupName(ctx, ctrlContext.Client, machine, cluster) Expect(err).NotTo(HaveOccurred()) mockVMService.EXPECT().GetVMPlacementGroup(placementGroupName).Return(placementGroup, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host1}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host1), nil) mockVMService.EXPECT().FindByIDs(gomock.InAnyOrder([]string{*vm1.ID})).Return([]*models.VM{vm1}, nil) reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} @@ -1462,7 +1463,7 @@ var _ = Describe("ElfMachineReconciler", func() { elfMachine.Status.Conditions = nil 
mockVMService.EXPECT().GetVMPlacementGroup(placementGroupName).Return(placementGroup, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host1, host2}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host1, host2), nil) mockVMService.EXPECT().FindByIDs(gomock.InAnyOrder([]string{*vm1.ID})).Return([]*models.VM{vm1}, nil) reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} @@ -1473,7 +1474,7 @@ var _ = Describe("ElfMachineReconciler", func() { placementGroup.Vms = []*models.NestedVM{} mockVMService.EXPECT().GetVMPlacementGroup(placementGroupName).Return(placementGroup, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host1}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host1), nil) mockVMService.EXPECT().FindByIDs(gomock.InAnyOrder([]string{})).Return([]*models.VM{}, nil) reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} @@ -1505,7 +1506,7 @@ var _ = Describe("ElfMachineReconciler", func() { placementGroupName, err := towerresources.GetVMPlacementGroupName(ctx, ctrlContext.Client, machine, cluster) Expect(err).NotTo(HaveOccurred()) mockVMService.EXPECT().GetVMPlacementGroup(placementGroupName).Return(placementGroup, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return([]*models.Host{host}, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host), nil) mockVMService.EXPECT().FindByIDs(gomock.InAnyOrder([]string{*vm1.ID})).Return([]*models.VM{vm1}, nil) reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} @@ -1520,9 +1521,7 @@ var _ = Describe("ElfMachineReconciler", func() { }) Context("Get Available Hosts For VM", func() { - It("should return the available hosts when the virtual machine has not been created or is not powered on", func() { - vm := fake.NewTowerVMFromElfMachine(elfMachine) - vm.Status = models.NewVMStatus(models.VMStatusSTOPPED) + It("should return the available hosts", func() { host1 := fake.NewTowerHost() host1.AllocatableMemoryBytes = service.TowerMemory(0) host2 := fake.NewTowerHost() @@ -1531,69 +1530,47 @@ var _ = Describe("ElfMachineReconciler", func() { ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, kcp) machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + // virtual machine has not been created + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} - availableHosts := reconciler.getAvailableHostsForVM(machineContext, nil, sets.Set[string]{}, nil) + availableHosts := reconciler.getAvailableHostsForVM(machineContext, nil, service.NewHosts(), nil) Expect(availableHosts).To(BeEmpty()) - availableHosts = reconciler.getAvailableHostsForVM(machineContext, nil, sets.Set[string]{}.Insert(*host2.ID), nil) + availableHosts = reconciler.getAvailableHostsForVM(machineContext, nil, service.NewHosts(host2), nil) Expect(availableHosts).To(BeEmpty()) - availableHosts = reconciler.getAvailableHostsForVM(machineContext, []*models.Host{host1, host2, host3}, sets.Set[string]{}.Insert(*host3.ID), vm) + availableHosts = reconciler.getAvailableHostsForVM(machineContext, service.NewHosts(host1, host2, host3), service.NewHosts(host3), nil) 
Expect(availableHosts).To(ContainElements(host2)) - availableHosts = reconciler.getAvailableHostsForVM(machineContext, []*models.Host{host1, host2, host3}, sets.Set[string]{}.Insert(*host1.ID, *host2.ID, *host3.ID), vm) + availableHosts = reconciler.getAvailableHostsForVM(machineContext, service.NewHosts(host1, host2, host3), service.NewHosts(host1, host2, host3), nil) Expect(availableHosts).To(BeEmpty()) - }) - It("should return the available hosts when the virtual machine is running", func() { - vm := fake.NewTowerVMFromElfMachine(elfMachine) - vm.Status = models.NewVMStatus(models.VMStatusRUNNING) - vm.Host = &models.NestedHost{ID: service.TowerString(fake.ID())} - host1 := fake.NewTowerHost() - host1.Status = models.NewHostStatus(models.HostStatusSESSIONEXPIRED) - host2 := fake.NewTowerHost() - host3 := fake.NewTowerHost() + // virtual machine is not powered on - ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, kcp) - machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) - - vm.Host.ID = host2.ID - reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} - availableHosts := reconciler.getAvailableHostsForVM(machineContext, []*models.Host{host1, host2, host3}, sets.Set[string]{}, vm) - Expect(availableHosts).To(ContainElements(host2, host3)) - - availableHosts = reconciler.getAvailableHostsForVM(machineContext, []*models.Host{host1, host2, host3}, sets.Set[string]{}.Insert(*host1.ID), vm) - Expect(availableHosts).To(ContainElements(host2, host3)) - - availableHosts = reconciler.getAvailableHostsForVM(machineContext, []*models.Host{host1, host2, host3}, sets.Set[string]{}.Insert(*host2.ID), vm) - Expect(availableHosts).To(ContainElements(host3)) - - availableHosts = reconciler.getAvailableHostsForVM(machineContext, []*models.Host{host1, host2, host3}, sets.Set[string]{}.Insert(*host3.ID), vm) - Expect(availableHosts).To(ContainElements(host2)) - - availableHosts = reconciler.getAvailableHostsForVM(machineContext, []*models.Host{host1, host2, host3}, sets.Set[string]{}.Insert(*host1.ID, *host2.ID), vm) - Expect(availableHosts).To(ContainElements(host3)) - - availableHosts = reconciler.getAvailableHostsForVM(machineContext, []*models.Host{host1, host2, host3}, sets.Set[string]{}.Insert(*host1.ID, *host3.ID), vm) - Expect(availableHosts).To(ContainElements(host2)) + vm := fake.NewTowerVMFromElfMachine(elfMachine) + vm.Status = models.NewVMStatus(models.VMStatusSTOPPED) - availableHosts = reconciler.getAvailableHostsForVM(machineContext, []*models.Host{host1, host2, host3}, sets.Set[string]{}.Insert(*host2.ID, *host3.ID), vm) + availableHosts = reconciler.getAvailableHostsForVM(machineContext, nil, service.NewHosts(), vm) Expect(availableHosts).To(BeEmpty()) - availableHosts = reconciler.getAvailableHostsForVM(machineContext, []*models.Host{host1, host2, host3}, sets.Set[string]{}.Insert(*host1.ID, *host2.ID, *host3.ID), vm) + availableHosts = reconciler.getAvailableHostsForVM(machineContext, nil, service.NewHosts(host2), vm) Expect(availableHosts).To(BeEmpty()) - availableHosts = reconciler.getAvailableHostsForVM(machineContext, []*models.Host{host1, host2}, sets.Set[string]{}.Insert(*host2.ID, *host3.ID), vm) - Expect(availableHosts).To(BeEmpty()) + availableHosts = reconciler.getAvailableHostsForVM(machineContext, service.NewHosts(host1, host2, host3), service.NewHosts(host3), vm) + Expect(availableHosts).To(ContainElements(host2)) - availableHosts = 
reconciler.getAvailableHostsForVM(machineContext, []*models.Host{host2, host3}, sets.Set[string]{}.Insert(*host2.ID, *host3.ID), vm) + availableHosts = reconciler.getAvailableHostsForVM(machineContext, service.NewHosts(host1, host2, host3), service.NewHosts(host1, host2, host3), vm) Expect(availableHosts).To(BeEmpty()) - availableHosts = reconciler.getAvailableHostsForVM(machineContext, nil, sets.Set[string]{}, vm) - Expect(availableHosts).To(BeEmpty()) + // virtual machine is powered on + vm.Status = models.NewVMStatus(models.VMStatusRUNNING) + vm.Host = &models.NestedHost{ID: host1.ID} - availableHosts = reconciler.getAvailableHostsForVM(machineContext, nil, sets.Set[string]{}.Insert(*host3.ID), vm) + availableHosts = reconciler.getAvailableHostsForVM(machineContext, service.NewHosts(host1, host2, host3), service.NewHosts(host1, host2, host3), vm) Expect(availableHosts).To(BeEmpty()) + + availableHosts = reconciler.getAvailableHostsForVM(machineContext, service.NewHosts(host1, host2, host3), service.NewHosts(host2, host3), vm) + Expect(availableHosts).To(ContainElements(host1)) }) }) diff --git a/controllers/vm_limiter.go b/controllers/vm_limiter.go index a8c593d1..e0efcc99 100644 --- a/controllers/vm_limiter.go +++ b/controllers/vm_limiter.go @@ -27,7 +27,6 @@ import ( const ( creationTimeout = time.Minute * 6 vmOperationRateLimit = time.Second * 6 - vmMigrationTimeout = time.Minute * 20 placementGroupSilenceTime = time.Minute * 30 placementGroupCreationLockKey = "%s:creation" ) @@ -35,7 +34,6 @@ const ( var vmStatusMap = make(map[string]time.Time) var limiterLock sync.Mutex var vmOperationMap = make(map[string]time.Time) -var placementGroupVMMigrationMap = make(map[string]time.Time) var vmOperationLock sync.Mutex var placementGroupOperationMap = make(map[string]time.Time) @@ -91,32 +89,6 @@ func acquireTicketForUpdatingVM(vmName string) bool { return true } -// acquireTicketForPlacementGroupVMMigration returns whether virtual machine migration -// of placement group operation can be performed. -func acquireTicketForPlacementGroupVMMigration(groupName string) bool { - vmOperationLock.Lock() - defer vmOperationLock.Unlock() - - if status, ok := placementGroupVMMigrationMap[groupName]; ok { - if !time.Now().After(status.Add(vmMigrationTimeout)) { - return false - } - } - - placementGroupVMMigrationMap[groupName] = time.Now() - - return true -} - -// releaseTicketForPlacementGroupVMMigration releases the virtual machine migration -// of placement group being operated. -func releaseTicketForPlacementGroupVMMigration(groupName string) { - vmOperationLock.Lock() - defer vmOperationLock.Unlock() - - delete(placementGroupVMMigrationMap, groupName) -} - // acquireTicketForPlacementGroupOperation returns whether placement group operation // can be performed. 
func acquireTicketForPlacementGroupOperation(groupName string) bool { diff --git a/controllers/vm_limiter_test.go b/controllers/vm_limiter_test.go index 81badabd..37b259c5 100644 --- a/controllers/vm_limiter_test.go +++ b/controllers/vm_limiter_test.go @@ -85,27 +85,6 @@ var _ = Describe("VM Operation Limiter", func() { }) }) -var _ = Describe("Placement Group VM Migration Limiter", func() { - var groupName string - - BeforeEach(func() { - groupName = fake.UUID() - }) - - It("acquireTicketForPlacementGroupVMMigration", func() { - Expect(acquireTicketForPlacementGroupVMMigration(groupName)).To(BeTrue()) - Expect(placementGroupVMMigrationMap).To(HaveKey(groupName)) - - Expect(acquireTicketForPlacementGroupVMMigration(groupName)).To(BeFalse()) - releaseTicketForPlacementGroupVMMigration(groupName) - Expect(placementGroupVMMigrationMap).NotTo(HaveKey(groupName)) - - placementGroupVMMigrationMap[groupName] = time.Now().Add(-vmMigrationTimeout) - Expect(acquireTicketForPlacementGroupVMMigration(groupName)).To(BeTrue()) - Expect(placementGroupVMMigrationMap).To(HaveKey(groupName)) - }) -}) - var _ = Describe("Placement Group Operation Limiter", func() { var groupName string diff --git a/pkg/service/collections.go b/pkg/service/collections.go new file mode 100644 index 00000000..05c78d8a --- /dev/null +++ b/pkg/service/collections.go @@ -0,0 +1,147 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package service + +import ( + "fmt" + + "github.com/smartxworks/cloudtower-go-sdk/v2/models" + "k8s.io/apimachinery/pkg/util/sets" +) + +// Hosts is a set of hosts. +type Hosts map[string]*models.Host + +// NewHosts creates a Hosts. from a list of values. +func NewHosts(hosts ...*models.Host) Hosts { + ss := make(Hosts, len(hosts)) + ss.Insert(hosts...) + return ss +} + +// NewHostsFromList creates a Hosts from the given host slice. +func NewHostsFromList(hosts []*models.Host) Hosts { + ss := make(Hosts, len(hosts)) + for i := range hosts { + ss.Insert(hosts[i]) + } + return ss +} + +func (s Hosts) Insert(hosts ...*models.Host) { + for i := range hosts { + if hosts[i] != nil { + h := hosts[i] + s[*h.ID] = h + } + } +} + +func (s Hosts) Contains(hostID string) bool { + _, ok := s[hostID] + return ok +} + +// Len returns the size of the set. +func (s Hosts) Len() int { + return len(s) +} + +func (s Hosts) IsEmpty() bool { + return len(s) == 0 +} + +func (s Hosts) String() string { + str := "" + for _, host := range s { + str += fmt.Sprintf("{id: %s,name: %s},", GetTowerString(host.ID), GetTowerString(host.Name)) + } + + return fmt.Sprintf("[%s]", str) +} + +// Available returns a Hosts with available hosts. +func (s Hosts) Available(memory int64) Hosts { + return s.Filter(func(h *models.Host) bool { + ok, _ := IsAvailableHost(h, memory) + return ok + }) +} + +// Get returns a Host of the specified host. +func (s Hosts) Get(hostID string) *models.Host { + if host, ok := s[hostID]; ok { + return host + } + return nil +} + +// Find returns a Hosts of the specified hosts. 
+func (s Hosts) Find(targetHosts sets.Set[string]) Hosts {
+	return s.Filter(func(h *models.Host) bool {
+		return targetHosts.Has(*h.ID)
+	})
+}
+
+// UnsortedList returns the slice with contents in random order.
+func (s Hosts) UnsortedList() []*models.Host {
+	res := make([]*models.Host, 0, len(s))
+	for _, value := range s {
+		res = append(res, value)
+	}
+	return res
+}
+
+// Difference returns a copy without hosts that are in the given collection.
+func (s Hosts) Difference(hosts Hosts) Hosts {
+	return s.Filter(func(h *models.Host) bool {
+		_, found := hosts[*h.ID]
+		return !found
+	})
+}
+
+// newFilteredHostCollection creates a Hosts from a filtered list of values.
+func newFilteredHostCollection(filter Func, hosts ...*models.Host) Hosts {
+	ss := make(Hosts, len(hosts))
+	for i := range hosts {
+		h := hosts[i]
+		if filter(h) {
+			ss.Insert(h)
+		}
+	}
+	return ss
+}
+
+// Filter returns a Hosts containing only the Hosts that match all of the given HostFilters.
+func (s Hosts) Filter(filters ...Func) Hosts {
+	return newFilteredHostCollection(And(filters...), s.UnsortedList()...)
+}
+
+// Func is the function definition for a filter.
+type Func func(host *models.Host) bool
+
+// And returns a filter that returns true if all of the given filters return true.
+func And(filters ...Func) Func {
+	return func(host *models.Host) bool {
+		for _, f := range filters {
+			if !f(host) {
+				return false
+			}
+		}
+		return true
+	}
+}
diff --git a/pkg/service/collections_test.go b/pkg/service/collections_test.go
new file mode 100644
index 00000000..cf74e046
--- /dev/null
+++ b/pkg/service/collections_test.go
@@ -0,0 +1,69 @@
+/*
+Copyright 2023.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package service + +import ( + "testing" + + "github.com/onsi/gomega" + "github.com/smartxworks/cloudtower-go-sdk/v2/models" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/utils/pointer" +) + +func TestHostCollection(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + t.Run("Find", func(t *testing.T) { + host1 := &models.Host{ID: TowerString("1"), Name: TowerString("host1")} + host2 := &models.Host{ID: TowerString("2"), Name: TowerString("host2")} + + hosts := NewHosts() + g.Expect(hosts.Find(sets.Set[string]{}.Insert(*host1.ID)).Len()).To(gomega.Equal(0)) + + hosts = NewHostsFromList([]*models.Host{host1, host2}) + g.Expect(hosts.Get(*host1.ID)).To(gomega.Equal(host1)) + g.Expect(hosts.Get(*TowerString("404"))).To(gomega.BeNil()) + g.Expect(hosts.Find(sets.Set[string]{}.Insert(*host1.ID)).Contains(*host1.ID)).To(gomega.BeTrue()) + g.Expect(hosts.Find(sets.Set[string]{}.Insert(*host1.ID)).Len()).To(gomega.Equal(1)) + }) + + t.Run("Available", func(t *testing.T) { + host1 := &models.Host{ID: TowerString("1"), Name: TowerString("host1"), AllocatableMemoryBytes: pointer.Int64(1), Status: models.NewHostStatus(models.HostStatusCONNECTEDHEALTHY)} + host2 := &models.Host{ID: TowerString("2"), Name: TowerString("host2"), AllocatableMemoryBytes: pointer.Int64(2), Status: models.NewHostStatus(models.HostStatusCONNECTEDHEALTHY)} + + hosts := NewHosts() + g.Expect(hosts.Available(0).Len()).To(gomega.Equal(0)) + + hosts = NewHostsFromList([]*models.Host{host1, host2}) + availableHosts := hosts.Available(2) + g.Expect(availableHosts.Len()).To(gomega.Equal(1)) + g.Expect(availableHosts.Contains(*host2.ID)).To(gomega.BeTrue()) + }) + + t.Run("Difference", func(t *testing.T) { + host1 := &models.Host{ID: TowerString("1"), Name: TowerString("host1")} + host2 := &models.Host{ID: TowerString("2"), Name: TowerString("host2")} + + g.Expect(NewHosts().Difference(NewHosts()).Len()).To(gomega.Equal(0)) + g.Expect(NewHosts().Difference(NewHosts(host1)).Len()).To(gomega.Equal(0)) + g.Expect(NewHosts(host1).Difference(NewHosts(host1)).Len()).To(gomega.Equal(0)) + g.Expect(NewHosts(host1).Difference(NewHosts()).Contains(*host1.ID)).To(gomega.BeTrue()) + g.Expect(NewHosts(host1).Difference(NewHosts(host2)).Contains(*host1.ID)).To(gomega.BeTrue()) + g.Expect(NewHosts(host1, host2).Difference(NewHosts(host2)).Contains(*host1.ID)).To(gomega.BeTrue()) + }) +} diff --git a/pkg/service/mock_services/vm_mock.go b/pkg/service/mock_services/vm_mock.go index ffed9efd..968f2293 100644 --- a/pkg/service/mock_services/vm_mock.go +++ b/pkg/service/mock_services/vm_mock.go @@ -1,5 +1,5 @@ // Code generated by MockGen. DO NOT EDIT. -// Source: github.com/smartxworks/cluster-api-provider-elf/pkg/service/vm.go +// Source: /root/github/cluster-api-provider-elf/pkg/service/vm.go // Package mock_services is a generated GoMock package. package mock_services @@ -11,6 +11,7 @@ import ( gomock "github.com/golang/mock/gomock" models "github.com/smartxworks/cloudtower-go-sdk/v2/models" v1beta1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" + service "github.com/smartxworks/cluster-api-provider-elf/pkg/service" v1beta10 "sigs.k8s.io/cluster-api/api/v1beta1" ) @@ -217,10 +218,10 @@ func (mr *MockVMServiceMockRecorder) GetHost(id interface{}) *gomock.Call { } // GetHostsByCluster mocks base method. 
-func (m *MockVMService) GetHostsByCluster(clusterID string) ([]*models.Host, error) { +func (m *MockVMService) GetHostsByCluster(clusterID string) (service.Hosts, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "GetHostsByCluster", clusterID) - ret0, _ := ret[0].([]*models.Host) + ret0, _ := ret[0].(service.Hosts) ret1, _ := ret[1].(error) return ret0, ret1 } @@ -277,18 +278,18 @@ func (mr *MockVMServiceMockRecorder) GetVMPlacementGroup(name interface{}) *gomo } // GetVMTemplate mocks base method. -func (m *MockVMService) GetVMTemplate(id string) (*models.ContentLibraryVMTemplate, error) { +func (m *MockVMService) GetVMTemplate(template string) (*models.ContentLibraryVMTemplate, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetVMTemplate", id) + ret := m.ctrl.Call(m, "GetVMTemplate", template) ret0, _ := ret[0].(*models.ContentLibraryVMTemplate) ret1, _ := ret[1].(error) return ret0, ret1 } // GetVMTemplate indicates an expected call of GetVMTemplate. -func (mr *MockVMServiceMockRecorder) GetVMTemplate(id interface{}) *gomock.Call { +func (mr *MockVMServiceMockRecorder) GetVMTemplate(template interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetVMTemplate", reflect.TypeOf((*MockVMService)(nil).GetVMTemplate), id) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetVMTemplate", reflect.TypeOf((*MockVMService)(nil).GetVMTemplate), template) } // GetVlan mocks base method. diff --git a/pkg/service/util.go b/pkg/service/util.go index 4aa78163..0190deab 100644 --- a/pkg/service/util.go +++ b/pkg/service/util.go @@ -77,84 +77,6 @@ func IsAvailableHost(host *models.Host, memory int64) (bool, string) { return true, "" } -// GetAvailableHosts returns the available hosts. -func GetAvailableHosts(hosts []*models.Host, memory int64) []*models.Host { - var availableHosts []*models.Host - for i := 0; i < len(hosts); i++ { - if ok, _ := IsAvailableHost(hosts[i], memory); ok { - availableHosts = append(availableHosts, hosts[i]) - } - } - - return availableHosts -} - -func GetUnavailableHostInfo(hosts []*models.Host, memory int64) map[string]string { - info := make(map[string]string) - for i := 0; i < len(hosts); i++ { - ok, message := IsAvailableHost(hosts[i], memory) - if !ok { - info[*hosts[i].Name] = message - } - } - - return info -} - -func ContainsUnavailableHost(hosts []*models.Host, hostIDs []string, memory int64) bool { - if len(hosts) == 0 || len(hostIDs) == 0 { - return true - } - - hostMap := make(map[string]*models.Host) - for i := 0; i < len(hosts); i++ { - hostMap[*hosts[i].ID] = hosts[i] - } - - for i := 0; i < len(hostIDs); i++ { - host, ok := hostMap[hostIDs[i]] - if !ok { - return true - } - - if ok, _ := IsAvailableHost(host, memory); !ok { - return true - } - } - - return false -} - -func GetHostFromList(hostID string, hosts []*models.Host) *models.Host { - for i := 0; i < len(hosts); i++ { - if *hosts[i].ID == hostID { - return hosts[i] - } - } - - return nil -} - -func HostsToSet(hosts []*models.Host) sets.Set[string] { - hostSet := sets.Set[string]{} - for i := 0; i < len(hosts); i++ { - hostSet.Insert(*hosts[i].ID) - } - - return hostSet -} - -func FilterHosts(hosts []*models.Host, needFilteredHosts sets.Set[string]) []*models.Host { - var filteredHosts []*models.Host - for i := 0; i < len(hosts); i++ { - if !needFilteredHosts.Has(*hosts[i].ID) { - filteredHosts = append(filteredHosts, hosts[i]) - } - } - - return filteredHosts -} - // GetVMsInPlacementGroup returns a Set of IDs of the virtual machines in 
the placement group. func GetVMsInPlacementGroup(placementGroup *models.VMPlacementGroup) sets.Set[string] { placementGroupVMSet := sets.Set[string]{} @@ -243,6 +165,14 @@ func GetTowerInt32(ptr *int32) int32 { return *ptr } +func GetTowerInt64(ptr *int64) int64 { + if ptr == nil { + return 0 + } + + return *ptr +} + func GetTowerTaskStatus(ptr *models.TaskStatus) string { if ptr == nil { return "" diff --git a/pkg/service/util_test.go b/pkg/service/util_test.go index d743cc2f..28253a6b 100644 --- a/pkg/service/util_test.go +++ b/pkg/service/util_test.go @@ -100,21 +100,3 @@ func TestIsAvailableHost(t *testing.T) { g.Expect(message).To(gomega.ContainSubstring("3")) }) } - -func TestContainsUnavailableHost(t *testing.T) { - g := gomega.NewGomegaWithT(t) - - t.Run("should return false when has unavailable host", func(t *testing.T) { - hosts := []*models.Host{{ID: pointer.String("1"), AllocatableMemoryBytes: pointer.Int64(1), Status: models.NewHostStatus(models.HostStatusCONNECTEDHEALTHY)}} - - g.Expect(ContainsUnavailableHost(nil, []string{"0"}, 2)).To(gomega.BeTrue()) - - g.Expect(ContainsUnavailableHost(hosts, nil, 2)).To(gomega.BeTrue()) - - g.Expect(ContainsUnavailableHost(hosts, []string{"0"}, 2)).To(gomega.BeTrue()) - - g.Expect(ContainsUnavailableHost(hosts, []string{"1"}, 2)).To(gomega.BeTrue()) - - g.Expect(ContainsUnavailableHost(hosts, []string{"1"}, 1)).To(gomega.BeFalse()) - }) -} diff --git a/pkg/service/vm.go b/pkg/service/vm.go index 2421259f..dffd738e 100644 --- a/pkg/service/vm.go +++ b/pkg/service/vm.go @@ -60,7 +60,7 @@ type VMService interface { WaitTask(id string, timeout, interval time.Duration) (*models.Task, error) GetCluster(id string) (*models.Cluster, error) GetHost(id string) (*models.Host, error) - GetHostsByCluster(clusterID string) ([]*models.Host, error) + GetHostsByCluster(clusterID string) (Hosts, error) GetVlan(id string) (*models.Vlan, error) UpsertLabel(key, value string) (*models.Label, error) DeleteLabel(key, value string, strict bool) (string, error) @@ -488,7 +488,7 @@ func (svr *TowerVMService) GetHost(id string) (*models.Host, error) { return getHostsResp.Payload[0], nil } -func (svr *TowerVMService) GetHostsByCluster(clusterID string) ([]*models.Host, error) { +func (svr *TowerVMService) GetHostsByCluster(clusterID string) (Hosts, error) { getHostsParams := clienthost.NewGetHostsParams() getHostsParams.RequestBody = &models.GetHostsRequestBody{ Where: &models.HostWhereInput{ @@ -507,7 +507,7 @@ func (svr *TowerVMService) GetHostsByCluster(clusterID string) ([]*models.Host, return nil, errors.New(HostNotFound) } - return getHostsResp.Payload, nil + return NewHostsFromList(getHostsResp.Payload), nil } // GetVlan searches for a vlan. diff --git a/pkg/util/kcp/kcp.go b/pkg/util/kcp/kcp.go index 770297fe..ae0654c2 100644 --- a/pkg/util/kcp/kcp.go +++ b/pkg/util/kcp/kcp.go @@ -22,6 +22,19 @@ import ( "sigs.k8s.io/cluster-api/util/conditions" ) +// IsKCPRollingUpdate returns whether KCP is in scaling down. +// +// When KCP is in rolling update, KCP controller marks +// MachinesSpecUpToDateCondition to false and RollingUpdateInProgressReason as Reason. +// +// When all machines are up to date, KCP controller marks MachinesSpecUpToDateCondition to true. 
+// +// For more information about KCP MachinesSpecUpToDateCondition and RollingUpdateInProgressReason, refer to https://github.com/kubernetes-sigs/cluster-api/blob/main/api/v1beta1/condition_consts.go +func IsKCPRollingUpdate(kcp *controlplanev1.KubeadmControlPlane) bool { + return conditions.IsFalse(kcp, controlplanev1.MachinesSpecUpToDateCondition) && + conditions.GetReason(kcp, controlplanev1.MachinesSpecUpToDateCondition) == controlplanev1.RollingUpdateInProgressReason +} + // IsKCPRollingUpdateFirstMachine returns true if KCP is in rolling update and creating the first CP Machine. // // KCP rollout algorithm is as follows: @@ -38,7 +51,7 @@ import ( // For more information about KCP replicas, refer to https://github.com/kubernetes-sigs/cluster-api/blob/main/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go // For more information about KCP rollout, refer to https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20191017-kubeadm-based-control-plane.md#kubeadmcontrolplane-rollout func IsKCPRollingUpdateFirstMachine(kcp *controlplanev1.KubeadmControlPlane) bool { - return *kcp.Spec.Replicas < kcp.Status.Replicas && kcp.Status.UpdatedReplicas == 1 + return IsKCPRollingUpdate(kcp) && kcp.Status.UpdatedReplicas == 1 } // IsKCPInScalingDown returns whether KCP is in scaling down. From d881627f99bf42da8849e9279028db7b4a2a5932 Mon Sep 17 00:00:00 2001 From: haijianyang Date: Mon, 31 Jul 2023 23:28:34 -0400 Subject: [PATCH 2/8] Fix comment --- controllers/elfmachine_controller_placement_group.go | 12 +++++++----- controllers/elfmachine_controller_test.go | 10 +++++----- pkg/service/mock_services/vm_mock.go | 2 +- pkg/util/kcp/kcp.go | 6 +++--- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/controllers/elfmachine_controller_placement_group.go b/controllers/elfmachine_controller_placement_group.go index 5f249a3b..6a13b896 100644 --- a/controllers/elfmachine_controller_placement_group.go +++ b/controllers/elfmachine_controller_placement_group.go @@ -168,9 +168,9 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex return nil, err } - // KCP is not in scaling down/rolling update. + // When KCP is not in scaling down and it's not rolling update the 1st CP Machine, just return since the placement group is full. if !(kcputil.IsKCPRollingUpdateFirstMachine(kcp) || kcputil.IsKCPInScalingDown(kcp)) { - ctx.Logger.V(1).Info("The placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) + ctx.Logger.V(1).Info("KCP is not in scaling down and it's not rolling update the 1st CP Machine, the placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) return nil, nil } @@ -211,7 +211,9 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex return nil, nil } - // KCP is in rolling update. + // When the PlacementGroup is full and it's rolling update the 1st CP Machine and, + // place the VM on the target host without joining the PlacementGroup and power it on. + // After other CP Machine are rolled out successfully, add this 1st CP VM into the PlacementGroup. 
if usedHostsByPG.Len() == usedHostsByPG.Available(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)).Len() && int(*kcp.Spec.Replicas) == usedHostsByPG.Len() { @@ -225,7 +227,7 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex return pointer.String(hostID), err } - ctx.Logger.V(1).Info("The placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) + ctx.Logger.V(1).Info("KCP is rolling update the 1st CP Machine, the placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) return nil, nil } @@ -448,7 +450,7 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v return false, err } - if kcputil.IsKCPRollingUpdate(kcp) { + if kcputil.IsKCPInRollingUpdate(kcp) { ctx.Logger.Info("KCP rolling update in progress, skip migrating VM", "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) return true, nil } diff --git a/controllers/elfmachine_controller_test.go b/controllers/elfmachine_controller_test.go index c74c0ca1..61da905c 100644 --- a/controllers/elfmachine_controller_test.go +++ b/controllers/elfmachine_controller_test.go @@ -1260,7 +1260,7 @@ var _ = Describe("ElfMachineReconciler", func() { hostID, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(hostID).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("The placement group is full, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is not in scaling down and it's not rolling update the 1st CP Machine, the placement group is full, wait for enough available hosts")) }) It("when placement group is full and KCP rolling update in progress", func() { @@ -1334,7 +1334,7 @@ var _ = Describe("ElfMachineReconciler", func() { host, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(host).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("The placement group is full, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is rolling update the 1st CP Machine, the placement group is full, wait for enough available hosts")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1348,7 +1348,7 @@ var _ = Describe("ElfMachineReconciler", func() { host, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(host).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("The placement group is full, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is rolling update the 1st CP Machine, the placement group is full, wait for enough available hosts")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1362,7 +1362,7 @@ var _ = Describe("ElfMachineReconciler", func() { host, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(host).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("The placement group is full, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is rolling update the 1st CP Machine, the placement group is full, wait for enough available hosts")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1458,7 +1458,7 @@ var _ = Describe("ElfMachineReconciler", func() { hostID, err := 
reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(hostID).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("The placement group is full, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is not in scaling down and it's not rolling update the 1st CP Machine, the placement group is full, wait for enough available hosts")) expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForAvailableHostRequiredByPlacementGroupReason}}) elfMachine.Status.Conditions = nil diff --git a/pkg/service/mock_services/vm_mock.go b/pkg/service/mock_services/vm_mock.go index 968f2293..a0567d38 100644 --- a/pkg/service/mock_services/vm_mock.go +++ b/pkg/service/mock_services/vm_mock.go @@ -1,5 +1,5 @@ // Code generated by MockGen. DO NOT EDIT. -// Source: /root/github/cluster-api-provider-elf/pkg/service/vm.go +// Source: github.com/smartxworks/cluster-api-provider-elf/pkg/service/vm.go // Package mock_services is a generated GoMock package. package mock_services diff --git a/pkg/util/kcp/kcp.go b/pkg/util/kcp/kcp.go index ae0654c2..0c9a33e8 100644 --- a/pkg/util/kcp/kcp.go +++ b/pkg/util/kcp/kcp.go @@ -22,7 +22,7 @@ import ( "sigs.k8s.io/cluster-api/util/conditions" ) -// IsKCPRollingUpdate returns whether KCP is in scaling down. +// IsKCPInRollingUpdate returns whether KCP is in scaling down. // // When KCP is in rolling update, KCP controller marks // MachinesSpecUpToDateCondition to false and RollingUpdateInProgressReason as Reason. @@ -30,7 +30,7 @@ import ( // When all machines are up to date, KCP controller marks MachinesSpecUpToDateCondition to true. // // For more information about KCP MachinesSpecUpToDateCondition and RollingUpdateInProgressReason, refer to https://github.com/kubernetes-sigs/cluster-api/blob/main/api/v1beta1/condition_consts.go -func IsKCPRollingUpdate(kcp *controlplanev1.KubeadmControlPlane) bool { +func IsKCPInRollingUpdate(kcp *controlplanev1.KubeadmControlPlane) bool { return conditions.IsFalse(kcp, controlplanev1.MachinesSpecUpToDateCondition) && conditions.GetReason(kcp, controlplanev1.MachinesSpecUpToDateCondition) == controlplanev1.RollingUpdateInProgressReason } @@ -51,7 +51,7 @@ func IsKCPRollingUpdate(kcp *controlplanev1.KubeadmControlPlane) bool { // For more information about KCP replicas, refer to https://github.com/kubernetes-sigs/cluster-api/blob/main/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go // For more information about KCP rollout, refer to https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20191017-kubeadm-based-control-plane.md#kubeadmcontrolplane-rollout func IsKCPRollingUpdateFirstMachine(kcp *controlplanev1.KubeadmControlPlane) bool { - return IsKCPRollingUpdate(kcp) && kcp.Status.UpdatedReplicas == 1 + return IsKCPInRollingUpdate(kcp) && kcp.Status.UpdatedReplicas == 1 } // IsKCPInScalingDown returns whether KCP is in scaling down. 
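The Hosts collection added in the first patch is what the rest of the series builds on, so a short usage sketch may help while reviewing it. The sketch is illustrative only and is not part of any patch: the host fixtures and memory figures are invented, and it uses the Available/Difference names as they stand after patch 2 (patch 3 later renames Available to FilterAvailableHostsWithEnoughMemory).

package main

import (
	"fmt"

	"github.com/smartxworks/cloudtower-go-sdk/v2/models"
	"k8s.io/utils/pointer"

	"github.com/smartxworks/cluster-api-provider-elf/pkg/service"
)

func main() {
	// Two made-up hosts: host1 is short on memory, host2 has plenty.
	host1 := &models.Host{ID: pointer.String("1"), Name: pointer.String("host1"),
		AllocatableMemoryBytes: pointer.Int64(1 << 30), Status: models.NewHostStatus(models.HostStatusCONNECTEDHEALTHY)}
	host2 := &models.Host{ID: pointer.String("2"), Name: pointer.String("host2"),
		AllocatableMemoryBytes: pointer.Int64(8 << 30), Status: models.NewHostStatus(models.HostStatusCONNECTEDHEALTHY)}

	hosts := service.NewHostsFromList([]*models.Host{host1, host2})

	// Pretend the placement group already occupies host1.
	usedHostsByPG := service.NewHosts(host1)

	// Same shape as getAvailableHostsForVM: keep healthy hosts with enough
	// allocatable memory, then drop the hosts the placement group already uses.
	requiredMemory := int64(4 << 30)
	availableHosts := hosts.Available(requiredMemory).Difference(usedHostsByPG)

	fmt.Println(availableHosts.Contains(*host2.ID)) // true: only host2 remains
	fmt.Println(availableHosts.String())
}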
From b2d4621639a3f5681de15652e9dd4d7cd182788d Mon Sep 17 00:00:00 2001 From: haijianyang Date: Wed, 2 Aug 2023 23:47:05 -0400 Subject: [PATCH 3/8] Update --- .../elfmachine_controller_placement_group.go | 94 ++++++++++--------- controllers/elfmachine_controller_test.go | 26 +++-- pkg/service/collections.go | 20 +++- pkg/service/collections_test.go | 17 +++- pkg/util/kcp/kcp.go | 39 +++----- 5 files changed, 113 insertions(+), 83 deletions(-) diff --git a/controllers/elfmachine_controller_placement_group.go b/controllers/elfmachine_controller_placement_group.go index 6a13b896..f17a9803 100644 --- a/controllers/elfmachine_controller_placement_group.go +++ b/controllers/elfmachine_controller_placement_group.go @@ -168,9 +168,9 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex return nil, err } - // When KCP is not in scaling down and it's not rolling update the 1st CP Machine, just return since the placement group is full. - if !(kcputil.IsKCPRollingUpdateFirstMachine(kcp) || kcputil.IsKCPInScalingDown(kcp)) { - ctx.Logger.V(1).Info("KCP is not in scaling down and it's not rolling update the 1st CP Machine, the placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) + // When KCP is not in rolling update and it's not in scaling down, just return since the placement group is full. + if !kcputil.IsKCPInRollingUpdate(kcp) && !kcputil.IsKCPInScalingDown(kcp) { + ctx.Logger.V(1).Info("KCP is not in rolling update and it's not in scaling down, the placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) return nil, nil } @@ -211,31 +211,30 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex return nil, nil } - // When the PlacementGroup is full and it's rolling update the 1st CP Machine and, - // place the VM on the target host without joining the PlacementGroup and power it on. - // After other CP Machine are rolled out successfully, add this 1st CP VM into the PlacementGroup. + // Now since the PlacementGroup is full and KCP is in rolling update, + // we will place the target CP VM on a selected host without joining the PlacementGroup and power it on. + // After other CP VMs are rolled out successfully, these VMs must already join the PlacementGroup + // since there is always an empty seat in the PlacementGroup, we will add the target CP VM into the PlacementGroup. - if usedHostsByPG.Len() == usedHostsByPG.Available(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)).Len() && - int(*kcp.Spec.Replicas) == usedHostsByPG.Len() { - // Only when KCP is in rolling update and the placement group is full - // does it need to get the latest created machine. 
- hostID, err := r.getVMHostForRollingUpdate(ctx, placementGroup, hosts) - if err != nil || hostID == "" { - return nil, err - } + usedAndUnavailableHostsByPG := usedHostsByPG.FilterUnavailableHostsWithoutEnoughMemory(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)) + if !usedAndUnavailableHostsByPG.IsEmpty() { + ctx.Logger.V(1).Info("KCP is in rolling update, the placement group is full and has unavailable hosts, wait for enough available hosts", "placementGroup", *placementGroup.Name, "usedAndUnavailableHostsByPG", usedAndUnavailableHostsByPG.String(), "usedHostsByPG", usedHostsByPG.String()) - return pointer.String(hostID), err + return nil, nil } - ctx.Logger.V(1).Info("KCP is rolling update the 1st CP Machine, the placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) + hostID, err := r.getVMHostForRollingUpdate(ctx, placementGroup, hosts) + if err != nil || hostID == "" { + return nil, err + } - return nil, nil + return pointer.String(hostID), err } // getVMHostForRollingUpdate returns the target host server id for a virtual machine during rolling update. // During KCP rolling update, machines will be deleted in the order of creation. // Find the latest created machine in the placement group, -// and set the host where the machine is located to the first machine created by KCP rolling update. +// and set the host where the machine is located to the machine created by KCP rolling update. // This prevents migration of virtual machine during KCP rolling update when using a placement group. func (r *ElfMachineReconciler) getVMHostForRollingUpdate(ctx *context.MachineContext, placementGroup *models.VMPlacementGroup, hosts service.Hosts) (string, error) { elfMachines, err := machineutil.GetControlPlaneElfMachinesInCluster(ctx, ctx.Client, ctx.Cluster.Namespace, ctx.Cluster.Name) @@ -265,27 +264,31 @@ func (r *ElfMachineReconciler) getVMHostForRollingUpdate(ctx *context.MachineCon } machines := collections.FromMachines(placementGroupMachines...) 
- if machine := machines.Newest(); machine != nil { - if vm, err := ctx.VMService.Get(vmMap[machine.Name]); err != nil { - return "", err - } else { - host := hosts.Get(*vm.Host.ID) - if host == nil { - ctx.Logger.Info("Host not found, skip selecting host for VM", "hostID", *vm.Host.ID, "vmRef", ctx.ElfMachine.Status.VMRef) - } else { - ok, message := service.IsAvailableHost(host, *service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)) - if ok { - ctx.Logger.Info("Selected the host server for VM since the placement group is full", "hostID", *vm.Host.ID, "vmRef", ctx.ElfMachine.Status.VMRef) - - return *vm.Host.ID, nil - } + newestMachine := machines.Newest() + if newestMachine == nil { + ctx.Logger.Info("Newest machine not found, skip selecting host for VM", "vmRef", ctx.ElfMachine.Status.VMRef) + return "", nil + } - ctx.Logger.Info(fmt.Sprintf("Host is unavailable: %s, skip selecting host for VM", message), "hostID", *vm.Host.ID, "vmRef", ctx.ElfMachine.Status.VMRef) - } - } + vm, err := ctx.VMService.Get(vmMap[newestMachine.Name]) + if err != nil { + return "", err } - return "", nil + host := hosts.Get(*vm.Host.ID) + if host == nil { + ctx.Logger.Info("Host not found, skip selecting host for VM", "hostID", *vm.Host.ID, "vmRef", ctx.ElfMachine.Status.VMRef) + return "", err + } + + ok, message := service.IsAvailableHost(host, *service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)) + if ok { + ctx.Logger.Info("Selected the host server for VM since the placement group is full", "hostID", *vm.Host.ID, "vmRef", ctx.ElfMachine.Status.VMRef) + return *vm.Host.ID, nil + } + + ctx.Logger.Info(fmt.Sprintf("Host is unavailable: %s, skip selecting host for VM", message), "hostID", *vm.Host.ID, "vmRef", ctx.ElfMachine.Status.VMRef) + return "", err } // getHostsInPlacementGroup returns the hosts where all virtual machines of placement group located. @@ -310,7 +313,7 @@ func (r *ElfMachineReconciler) getHostsInPlacementGroup(ctx *context.MachineCont // It returns hosts that are not in faulted state, not in the given 'usedHostsByPG', // and have sufficient memory for running this VM. func (r *ElfMachineReconciler) getAvailableHostsForVM(ctx *context.MachineContext, hosts service.Hosts, usedHostsByPG service.Hosts, vm *models.VM) service.Hosts { - availableHosts := hosts.Available(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)).Difference(usedHostsByPG) + availableHosts := hosts.FilterAvailableHostsWithEnoughMemory(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)).Difference(usedHostsByPG) // If the VM is running, and the host where the VM is located // is not used by the placement group, then it is not necessary to @@ -415,14 +418,17 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v // Only when the KCP is in rolling update, the VM is stopped, and all the hosts used by the placement group are available, // will the upgrade be allowed with the same number of hosts and CP nodes. - // In this case first machine created by KCP rolling update can be powered on without being added to the placement group. 
- if kcputil.IsKCPRollingUpdateFirstMachine(kcp) && - *vm.Status == models.VMStatusSTOPPED && - usedHostsByPG.Len() == usedHostsByPG.Available(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)).Len() && - int(*kcp.Spec.Replicas) == usedHostsByPG.Len() { - ctx.Logger.Info("The placement group is full and KCP is in rolling update, skip adding VM to the placement group", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) + // In this case machine created by KCP rolling update can be powered on without being added to the placement group. + if kcputil.IsKCPInRollingUpdate(kcp) && *vm.Status == models.VMStatusSTOPPED { + usedAndUnavailableHostsByPG := usedHostsByPG.FilterUnavailableHostsWithoutEnoughMemory(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)) + if usedAndUnavailableHostsByPG.IsEmpty() { + ctx.Logger.Info("KCP is in rolling update, the placement group is full and has no unavailable hosts, skip adding VM to the placement group", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) + return true, nil + } - return true, nil + ctx.Logger.Info("KCP is in rolling update, the placement group is full and has unavailable hosts, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) + + return false, nil } if *vm.Status == models.VMStatusRUNNING || *vm.Status == models.VMStatusSUSPENDED { diff --git a/controllers/elfmachine_controller_test.go b/controllers/elfmachine_controller_test.go index 61da905c..72343811 100644 --- a/controllers/elfmachine_controller_test.go +++ b/controllers/elfmachine_controller_test.go @@ -992,20 +992,20 @@ var _ = Describe("ElfMachineReconciler", func() { ok, err := reconciler.joinPlacementGroup(machineContext, vm) Expect(ok).To(BeTrue()) Expect(err).To(BeZero()) - Expect(logBuffer.String()).To(ContainSubstring("The placement group is full and KCP is in rolling update, skip adding VM to the placement group")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has no unavailable hosts, skip adding VM to the placement group")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) host.HostState = &models.NestedMaintenanceHostState{State: models.NewMaintenanceModeEnum(models.MaintenanceModeEnumMAINTENANCEMODE)} mockVMService.EXPECT().GetVMPlacementGroup(gomock.Any()).Return(placementGroup, nil) mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host), nil) - mockVMService.EXPECT().FindByIDs([]string{*placementGroup.Vms[0].ID}).Return([]*models.VM{}, nil) + mockVMService.EXPECT().FindByIDs([]string{*placementGroup.Vms[0].ID}).Return([]*models.VM{vm2}, nil) reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} ok, err = reconciler.joinPlacementGroup(machineContext, vm) Expect(ok).To(BeFalse()) Expect(err).To(BeZero()) - Expect(logBuffer.String()).To(ContainSubstring("The placement group is full, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unavailable hosts, wait for enough available hosts")) logBuffer = new(bytes.Buffer) 
klog.SetOutput(logBuffer) @@ -1229,6 +1229,7 @@ var _ = Describe("ElfMachineReconciler", func() { placementGroup.Vms = []*models.NestedVM{} kcp.Spec.Replicas = pointer.Int32(3) kcp.Status.Replicas = 3 + kcp.Status.UpdatedReplicas = 3 ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, kcp, elfMachine1, machine1, elfMachine2, machine2, elfMachine3, machine3) machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) @@ -1248,6 +1249,7 @@ var _ = Describe("ElfMachineReconciler", func() { hostID, err := reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(*hostID).To(Equal("")) + Expect(logBuffer.String()).To(ContainSubstring("The placement group still has capacity")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1260,7 +1262,7 @@ var _ = Describe("ElfMachineReconciler", func() { hostID, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(hostID).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is not in scaling down and it's not rolling update the 1st CP Machine, the placement group is full, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is not in rolling update and it's not in scaling down, the placement group is full, wait for enough available hosts")) }) It("when placement group is full and KCP rolling update in progress", func() { @@ -1334,7 +1336,7 @@ var _ = Describe("ElfMachineReconciler", func() { host, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(host).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is rolling update the 1st CP Machine, the placement group is full, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unavailable hosts")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1348,7 +1350,7 @@ var _ = Describe("ElfMachineReconciler", func() { host, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(host).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is rolling update the 1st CP Machine, the placement group is full, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unavailable hosts, wait for enough available hosts")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1362,7 +1364,7 @@ var _ = Describe("ElfMachineReconciler", func() { host, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(host).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is rolling update the 1st CP Machine, the placement group is full, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unavailable hosts, wait for enough available hosts")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1416,6 +1418,14 @@ var _ = Describe("ElfMachineReconciler", func() { Expect(err).To(BeZero()) Expect(hostID).To(Equal(*vm3.Host.ID)) Expect(logBuffer.String()).To(ContainSubstring("Selected the host server for VM since the placement group is full")) + + ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, kcp) + 
machineContext = newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + hostID, err = reconciler.getVMHostForRollingUpdate(machineContext, placementGroup, service.NewHostsFromList(hosts)) + Expect(err).To(BeZero()) + Expect(hostID).To(Equal("")) + Expect(logBuffer.String()).To(ContainSubstring("Newest machine not found, skip selecting host for VM")) }) }) @@ -1458,7 +1468,7 @@ var _ = Describe("ElfMachineReconciler", func() { hostID, err := reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(hostID).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is not in scaling down and it's not rolling update the 1st CP Machine, the placement group is full, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is not in rolling update and it's not in scaling down, the placement group is full, wait for enough available hosts")) expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForAvailableHostRequiredByPlacementGroupReason}}) elfMachine.Status.Conditions = nil diff --git a/pkg/service/collections.go b/pkg/service/collections.go index 05c78d8a..98680733 100644 --- a/pkg/service/collections.go +++ b/pkg/service/collections.go @@ -67,21 +67,35 @@ func (s Hosts) IsEmpty() bool { func (s Hosts) String() string { str := "" + for _, host := range s { - str += fmt.Sprintf("{id: %s,name: %s},", GetTowerString(host.ID), GetTowerString(host.Name)) + state := "" + if host.HostState != nil { + state = string(*host.HostState.State) + } + + str += fmt.Sprintf("{id: %s,name: %s,memory: %d,status: %s,state: %s},", GetTowerString(host.ID), GetTowerString(host.Name), GetTowerInt64(host.AllocatableMemoryBytes), string(*host.Status), state) } return fmt.Sprintf("[%s]", str) } -// Available returns a Hosts with available hosts. -func (s Hosts) Available(memory int64) Hosts { +// FilterAvailableHostsWithEnoughMemory returns a Hosts containing the available host which has allocatable memory no less than the specified memory. +func (s Hosts) FilterAvailableHostsWithEnoughMemory(memory int64) Hosts { return s.Filter(func(h *models.Host) bool { ok, _ := IsAvailableHost(h, memory) return ok }) } +// FilterUnavailableHostsWithoutEnoughMemory returns a Hosts containing the unavailable host which has allocatable memory less than the specified memory. +func (s Hosts) FilterUnavailableHostsWithoutEnoughMemory(memory int64) Hosts { + return s.Filter(func(h *models.Host) bool { + ok, _ := IsAvailableHost(h, memory) + return !ok + }) +} + // Get returns a Host of the specified host. func (s Hosts) Get(hostID string) *models.Host { if host, ok := s[hostID]; ok { diff --git a/pkg/service/collections_test.go b/pkg/service/collections_test.go index cf74e046..dabb92a9 100644 --- a/pkg/service/collections_test.go +++ b/pkg/service/collections_test.go @@ -17,6 +17,7 @@ limitations under the License. 
package service import ( + "fmt" "testing" "github.com/onsi/gomega" @@ -47,12 +48,24 @@ func TestHostCollection(t *testing.T) { host2 := &models.Host{ID: TowerString("2"), Name: TowerString("host2"), AllocatableMemoryBytes: pointer.Int64(2), Status: models.NewHostStatus(models.HostStatusCONNECTEDHEALTHY)} hosts := NewHosts() - g.Expect(hosts.Available(0).Len()).To(gomega.Equal(0)) + g.Expect(hosts.FilterAvailableHostsWithEnoughMemory(0).Len()).To(gomega.Equal(0)) hosts = NewHostsFromList([]*models.Host{host1, host2}) - availableHosts := hosts.Available(2) + availableHosts := hosts.FilterAvailableHostsWithEnoughMemory(2) g.Expect(availableHosts.Len()).To(gomega.Equal(1)) g.Expect(availableHosts.Contains(*host2.ID)).To(gomega.BeTrue()) + + hosts = NewHosts() + unavailableHosts := hosts.FilterUnavailableHostsWithoutEnoughMemory(0) + g.Expect(unavailableHosts.IsEmpty()).To(gomega.BeTrue()) + g.Expect(unavailableHosts.Len()).To(gomega.Equal(0)) + g.Expect(unavailableHosts.String()).To(gomega.Equal("[]")) + + hosts = NewHostsFromList([]*models.Host{host1, host2}) + unavailableHosts = hosts.FilterUnavailableHostsWithoutEnoughMemory(2) + g.Expect(unavailableHosts.Len()).To(gomega.Equal(1)) + g.Expect(unavailableHosts.Contains(*host1.ID)).To(gomega.BeTrue()) + g.Expect(unavailableHosts.String()).To(gomega.Equal(fmt.Sprintf("[{id: %s,name: %s,memory: %d,status: %s,state: %s},]", *host1.ID, *host1.Name, *host1.AllocatableMemoryBytes, string(*host1.Status), ""))) }) t.Run("Difference", func(t *testing.T) { diff --git a/pkg/util/kcp/kcp.go b/pkg/util/kcp/kcp.go index 0c9a33e8..afbaffd3 100644 --- a/pkg/util/kcp/kcp.go +++ b/pkg/util/kcp/kcp.go @@ -22,36 +22,23 @@ import ( "sigs.k8s.io/cluster-api/util/conditions" ) -// IsKCPInRollingUpdate returns whether KCP is in scaling down. +// IsKCPInRollingUpdate returns whether KCP is in rolling update. // -// When KCP is in rolling update, KCP controller marks -// MachinesSpecUpToDateCondition to false and RollingUpdateInProgressReason as Reason. -// -// When all machines are up to date, KCP controller marks MachinesSpecUpToDateCondition to true. -// -// For more information about KCP MachinesSpecUpToDateCondition and RollingUpdateInProgressReason, refer to https://github.com/kubernetes-sigs/cluster-api/blob/main/api/v1beta1/condition_consts.go -func IsKCPInRollingUpdate(kcp *controlplanev1.KubeadmControlPlane) bool { - return conditions.IsFalse(kcp, controlplanev1.MachinesSpecUpToDateCondition) && - conditions.GetReason(kcp, controlplanev1.MachinesSpecUpToDateCondition) == controlplanev1.RollingUpdateInProgressReason -} - -// IsKCPRollingUpdateFirstMachine returns true if KCP is in rolling update and creating the first CP Machine. -// -// KCP rollout algorithm is as follows: -// Find Machines that have an outdated spec, If there is a machine requiring rollout -// 1.Scale up control plane creating a machine with the new spec -// 2.Scale down control plane by removing one of the machine that needs rollout (the oldest out-of date machine in the failure domain that has the most control-plane machines on it) -// -// kcp.Status.UpdatedReplicas is the total number of machines that are up to date with the control -// plane's configuration and therefore do not require rollout. -// -// So when KCP is in rolling update and creating the first CP Machine, -// kcp.Status.Replicas is greater than kcp.Spec.Replicas and kcp.Status.UpdatedReplicas equals 1. +// When *kcp.Spec.Replicas > kcp.Status.UpdatedReplicas, it must be in a KCP rolling update process. 
+// When *kcp.Spec.Replicas == kcp.Status.UpdatedReplicas, it could be in one of the following cases: +// 1. It's not in a KCP rolling update process. So kcp.Spec.Replicas == kcp.Status.Replicas. +// 2. It's at the end of a KCP rolling update process, and the last KCP replica (i.e the last KCP ElfMachine) is created just now. +// There is still an old KCP ElfMachine, so kcp.Spec.Replicas + 1 == kcp.Status.Replicas. // // For more information about KCP replicas, refer to https://github.com/kubernetes-sigs/cluster-api/blob/main/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go // For more information about KCP rollout, refer to https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20191017-kubeadm-based-control-plane.md#kubeadmcontrolplane-rollout -func IsKCPRollingUpdateFirstMachine(kcp *controlplanev1.KubeadmControlPlane) bool { - return IsKCPInRollingUpdate(kcp) && kcp.Status.UpdatedReplicas == 1 +func IsKCPInRollingUpdate(kcp *controlplanev1.KubeadmControlPlane) bool { + if (*kcp.Spec.Replicas > kcp.Status.UpdatedReplicas && *kcp.Spec.Replicas <= kcp.Status.Replicas) || + (*kcp.Spec.Replicas == kcp.Status.UpdatedReplicas && *kcp.Spec.Replicas < kcp.Status.Replicas) { + return true + } + + return false } // IsKCPInScalingDown returns whether KCP is in scaling down. From 8c664ed476a04d391ce3a5469cbcbc594e170c83 Mon Sep 17 00:00:00 2001 From: haijianyang Date: Thu, 3 Aug 2023 05:21:57 -0400 Subject: [PATCH 4/8] Fix comment --- .../elfmachine_controller_placement_group.go | 44 ++++++++++++------- controllers/elfmachine_controller_test.go | 38 ++++++++++++---- pkg/service/collections.go | 4 +- pkg/service/collections_test.go | 4 +- 4 files changed, 60 insertions(+), 30 deletions(-) diff --git a/controllers/elfmachine_controller_placement_group.go b/controllers/elfmachine_controller_placement_group.go index f17a9803..cb8e9018 100644 --- a/controllers/elfmachine_controller_placement_group.go +++ b/controllers/elfmachine_controller_placement_group.go @@ -168,9 +168,9 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex return nil, err } - // When KCP is not in rolling update and it's not in scaling down, just return since the placement group is full. + // When KCP is not in rolling update and not in scaling down, just return since the placement group is full. if !kcputil.IsKCPInRollingUpdate(kcp) && !kcputil.IsKCPInScalingDown(kcp) { - ctx.Logger.V(1).Info("KCP is not in rolling update and it's not in scaling down, the placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) + ctx.Logger.V(1).Info("KCP is not in rolling update and not in scaling down, the placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) return nil, nil } @@ -216,9 +216,9 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex // After other CP VMs are rolled out successfully, these VMs must already join the PlacementGroup // since there is always an empty seat in the PlacementGroup, we will add the target CP VM into the PlacementGroup. 
- usedAndUnavailableHostsByPG := usedHostsByPG.FilterUnavailableHostsWithoutEnoughMemory(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)) - if !usedAndUnavailableHostsByPG.IsEmpty() { - ctx.Logger.V(1).Info("KCP is in rolling update, the placement group is full and has unavailable hosts, wait for enough available hosts", "placementGroup", *placementGroup.Name, "usedAndUnavailableHostsByPG", usedAndUnavailableHostsByPG.String(), "usedHostsByPG", usedHostsByPG.String()) + unusableHosts := usedHostsByPG.FilterUnavailableHostsOrWithoutEnoughMemory(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)) + if !unusableHosts.IsEmpty() { + ctx.Logger.V(1).Info("KCP is in rolling update, the placement group is full and has unusable hosts, will wait for enough available hosts", "placementGroup", *placementGroup.Name, "unusableHosts", unusableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) return nil, nil } @@ -277,17 +277,17 @@ func (r *ElfMachineReconciler) getVMHostForRollingUpdate(ctx *context.MachineCon host := hosts.Get(*vm.Host.ID) if host == nil { - ctx.Logger.Info("Host not found, skip selecting host for VM", "hostID", *vm.Host.ID, "vmRef", ctx.ElfMachine.Status.VMRef) + ctx.Logger.Info("Host not found, skip selecting host for VM", "host", formatNestedHost(vm.Host), "vmRef", ctx.ElfMachine.Status.VMRef) return "", err } ok, message := service.IsAvailableHost(host, *service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)) if ok { - ctx.Logger.Info("Selected the host server for VM since the placement group is full", "hostID", *vm.Host.ID, "vmRef", ctx.ElfMachine.Status.VMRef) + ctx.Logger.Info("Select a host to power on the VM since the placement group is full", "host", formatNestedHost(vm.Host), "vmRef", ctx.ElfMachine.Status.VMRef) return *vm.Host.ID, nil } - ctx.Logger.Info(fmt.Sprintf("Host is unavailable: %s, skip selecting host for VM", message), "hostID", *vm.Host.ID, "vmRef", ctx.ElfMachine.Status.VMRef) + ctx.Logger.Info(fmt.Sprintf("Host is unavailable: %s, skip selecting host for VM", message), "host", formatNestedHost(vm.Host), "vmRef", ctx.ElfMachine.Status.VMRef) return "", err } @@ -417,16 +417,17 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v } // Only when the KCP is in rolling update, the VM is stopped, and all the hosts used by the placement group are available, - // will the upgrade be allowed with the same number of hosts and CP nodes. - // In this case machine created by KCP rolling update can be powered on without being added to the placement group. + // will the upgrade be allowed. + // In this case the machine created by KCP rolling update can be powered on without being added to the placement group, + // so return true and nil to let reconcileVMStatus() power it on. 
if kcputil.IsKCPInRollingUpdate(kcp) && *vm.Status == models.VMStatusSTOPPED { - usedAndUnavailableHostsByPG := usedHostsByPG.FilterUnavailableHostsWithoutEnoughMemory(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)) - if usedAndUnavailableHostsByPG.IsEmpty() { - ctx.Logger.Info("KCP is in rolling update, the placement group is full and has no unavailable hosts, skip adding VM to the placement group", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) + unusablehosts := usedHostsByPG.FilterUnavailableHostsOrWithoutEnoughMemory(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)) + if unusablehosts.IsEmpty() { + ctx.Logger.Info("KCP is in rolling update, the placement group is full and has no unusable hosts, so skip adding VM to the placement group and power it on", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) return true, nil } - ctx.Logger.Info("KCP is in rolling update, the placement group is full and has unavailable hosts, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) + ctx.Logger.Info("KCP is in rolling update, the placement group is full and has unusable hosts, so wait for enough available hosts", "placementGroup", *placementGroup.Name, "unusablehosts", unusablehosts.String(), "usedHostsByPG", usedHostsByPG.String(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) return false, nil } @@ -438,7 +439,7 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v } // KCP is scaling out or being created. - ctx.Logger.V(1).Info("The placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) + ctx.Logger.V(1).Info("KCP is in scaling up or being created, the placement group is full, so wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) return false, nil } @@ -447,7 +448,7 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v // and the virtual machine is not STOPPED, we need to migrate the virtual machine to a host that // is not used by the placement group before adding the virtual machine to the placement group. // Otherwise, just add the virtual machine to the placement group directly. 
- ctx.Logger.V(1).Info("The availableHosts for migrating the VM", "hosts", availableHosts.String(), "vmHost", *vm.Host.ID) + ctx.Logger.V(1).Info("The availableHosts for migrating the VM", "hosts", availableHosts.String(), "vmHost", formatNestedHost(vm.Host)) if !availableHosts.Contains(*vm.Host.ID) && *vm.Status != models.VMStatusSTOPPED { ctx.Logger.V(1).Info("Try to migrate the virtual machine to the specified target host if needed") @@ -528,7 +529,7 @@ func (r *ElfMachineReconciler) migrateVM(ctx *context.MachineContext, vm *models ctx.ElfMachine.SetTask(*withTaskVM.TaskID) - ctx.Logger.Info(fmt.Sprintf("Waiting for the VM to be migrated from %s to %s", *vm.Host.ID, targetHost), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID, "taskRef", ctx.ElfMachine.Status.TaskRef) + ctx.Logger.Info(fmt.Sprintf("Waiting for the VM to be migrated from %s to %s", formatNestedHost(vm.Host), targetHost), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID, "taskRef", ctx.ElfMachine.Status.TaskRef) return false, nil } @@ -614,3 +615,12 @@ func (r *ElfMachineReconciler) deletePlacementGroup(ctx *context.MachineContext) return true, nil } + +// formatNestedHost returns the basic information of the host (ID, name). +func formatNestedHost(host *models.NestedHost) string { + if host == nil { + return "{}" + } + + return fmt.Sprintf("{id: %s,name: %s}", service.GetTowerString(host.ID), service.GetTowerString(host.Name)) +} diff --git a/controllers/elfmachine_controller_test.go b/controllers/elfmachine_controller_test.go index 72343811..9f6e83e8 100644 --- a/controllers/elfmachine_controller_test.go +++ b/controllers/elfmachine_controller_test.go @@ -992,7 +992,7 @@ var _ = Describe("ElfMachineReconciler", func() { ok, err := reconciler.joinPlacementGroup(machineContext, vm) Expect(ok).To(BeTrue()) Expect(err).To(BeZero()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has no unavailable hosts, skip adding VM to the placement group")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has no unusable hosts, so skip adding VM to the placement group and power it on")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1005,7 +1005,7 @@ var _ = Describe("ElfMachineReconciler", func() { ok, err = reconciler.joinPlacementGroup(machineContext, vm) Expect(ok).To(BeFalse()) Expect(err).To(BeZero()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unavailable hosts, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unusable hosts, so wait for enough available hosts")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1020,6 +1020,26 @@ var _ = Describe("ElfMachineReconciler", func() { Expect(ok).To(BeTrue()) Expect(err).To(BeZero()) Expect(logBuffer.String()).To(ContainSubstring(fmt.Sprintf("The placement group is full and VM is in %s status, skip adding VM to the placement group", *vm.Status))) + + logBuffer = new(bytes.Buffer) + klog.SetOutput(logBuffer) + kcp.Spec.Replicas = pointer.Int32(1) + kcp.Status.Replicas = 1 + kcp.Status.UpdatedReplicas = 1 + ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md, kcp) + machineContext = newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, 
machine) + vm.Status = models.NewVMStatus(models.VMStatusSTOPPED) + host.HostState = &models.NestedMaintenanceHostState{State: models.NewMaintenanceModeEnum(models.MaintenanceModeEnumMAINTENANCEMODE)} + mockVMService.EXPECT().GetVMPlacementGroup(gomock.Any()).Return(placementGroup, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host), nil) + mockVMService.EXPECT().FindByIDs([]string{*placementGroup.Vms[0].ID}).Return([]*models.VM{}, nil) + + reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + ok, err = reconciler.joinPlacementGroup(machineContext, vm) + Expect(ok).To(BeFalse()) + Expect(err).To(BeZero()) + Expect(logBuffer.String()).To(ContainSubstring("KCP is in scaling up or being created, the placement group is full, so wait for enough available hosts")) }) It("should add VM to placement group when VM is not in placement group and the host where VM in is not in placement group", func() { @@ -1262,7 +1282,7 @@ var _ = Describe("ElfMachineReconciler", func() { hostID, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(hostID).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is not in rolling update and it's not in scaling down, the placement group is full, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is not in rolling update and not in scaling down, the placement group is full, wait for enough available hosts")) }) It("when placement group is full and KCP rolling update in progress", func() { @@ -1336,7 +1356,7 @@ var _ = Describe("ElfMachineReconciler", func() { host, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(host).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unavailable hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unusable hosts, will wait for enough available hosts")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1350,7 +1370,7 @@ var _ = Describe("ElfMachineReconciler", func() { host, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(host).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unavailable hosts, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unusable hosts, will wait for enough available hosts")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1364,7 +1384,7 @@ var _ = Describe("ElfMachineReconciler", func() { host, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(host).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unavailable hosts, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unusable hosts, will wait for enough available hosts")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1405,7 +1425,7 @@ var _ = Describe("ElfMachineReconciler", func() { hostID, err = reconciler.getVMHostForRollingUpdate(machineContext, placementGroup, service.NewHostsFromList(hosts)) Expect(err).To(BeZero()) Expect(hostID).To(Equal(*vm3.Host.ID)) 
- Expect(logBuffer.String()).To(ContainSubstring("Selected the host server for VM since the placement group is full")) + Expect(logBuffer.String()).To(ContainSubstring("Select a host to power on the VM since the placement group is full")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1417,7 +1437,7 @@ var _ = Describe("ElfMachineReconciler", func() { hostID, err = reconciler.getVMHostForRollingUpdate(machineContext, placementGroup, service.NewHostsFromList(hosts)) Expect(err).To(BeZero()) Expect(hostID).To(Equal(*vm3.Host.ID)) - Expect(logBuffer.String()).To(ContainSubstring("Selected the host server for VM since the placement group is full")) + Expect(logBuffer.String()).To(ContainSubstring("Select a host to power on the VM since the placement group is full")) ctrlContext = newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, kcp) machineContext = newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) @@ -1468,7 +1488,7 @@ var _ = Describe("ElfMachineReconciler", func() { hostID, err := reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(hostID).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is not in rolling update and it's not in scaling down, the placement group is full, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is not in rolling update and not in scaling down, the placement group is full, wait for enough available hosts")) expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForAvailableHostRequiredByPlacementGroupReason}}) elfMachine.Status.Conditions = nil diff --git a/pkg/service/collections.go b/pkg/service/collections.go index 98680733..ff35b4fc 100644 --- a/pkg/service/collections.go +++ b/pkg/service/collections.go @@ -88,8 +88,8 @@ func (s Hosts) FilterAvailableHostsWithEnoughMemory(memory int64) Hosts { }) } -// FilterUnavailableHostsWithoutEnoughMemory returns a Hosts containing the unavailable host which has allocatable memory less than the specified memory. -func (s Hosts) FilterUnavailableHostsWithoutEnoughMemory(memory int64) Hosts { +// FilterUnavailableHostsOrWithoutEnoughMemory returns a Hosts containing the unavailable hosts or available hosts whose available memory is less than the specified memory. 
+func (s Hosts) FilterUnavailableHostsOrWithoutEnoughMemory(memory int64) Hosts { return s.Filter(func(h *models.Host) bool { ok, _ := IsAvailableHost(h, memory) return !ok diff --git a/pkg/service/collections_test.go b/pkg/service/collections_test.go index dabb92a9..3dc8ebc8 100644 --- a/pkg/service/collections_test.go +++ b/pkg/service/collections_test.go @@ -56,13 +56,13 @@ func TestHostCollection(t *testing.T) { g.Expect(availableHosts.Contains(*host2.ID)).To(gomega.BeTrue()) hosts = NewHosts() - unavailableHosts := hosts.FilterUnavailableHostsWithoutEnoughMemory(0) + unavailableHosts := hosts.FilterUnavailableHostsOrWithoutEnoughMemory(0) g.Expect(unavailableHosts.IsEmpty()).To(gomega.BeTrue()) g.Expect(unavailableHosts.Len()).To(gomega.Equal(0)) g.Expect(unavailableHosts.String()).To(gomega.Equal("[]")) hosts = NewHostsFromList([]*models.Host{host1, host2}) - unavailableHosts = hosts.FilterUnavailableHostsWithoutEnoughMemory(2) + unavailableHosts = hosts.FilterUnavailableHostsOrWithoutEnoughMemory(2) g.Expect(unavailableHosts.Len()).To(gomega.Equal(1)) g.Expect(unavailableHosts.Contains(*host1.ID)).To(gomega.BeTrue()) g.Expect(unavailableHosts.String()).To(gomega.Equal(fmt.Sprintf("[{id: %s,name: %s,memory: %d,status: %s,state: %s},]", *host1.ID, *host1.Name, *host1.AllocatableMemoryBytes, string(*host1.Status), ""))) From 7b337922d4b327b925616b8822e4ac147ce96a66 Mon Sep 17 00:00:00 2001 From: haijianyang Date: Thu, 3 Aug 2023 07:40:28 -0400 Subject: [PATCH 5/8] Fix comment --- controllers/elfmachine_controller_placement_group.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/controllers/elfmachine_controller_placement_group.go b/controllers/elfmachine_controller_placement_group.go index cb8e9018..d01d0d56 100644 --- a/controllers/elfmachine_controller_placement_group.go +++ b/controllers/elfmachine_controller_placement_group.go @@ -354,8 +354,6 @@ func (r *ElfMachineReconciler) getPlacementGroup(ctx *context.MachineContext, pl // For example, the virtual machine has joined the placement group. // 2. false and error is nil means the virtual machine has not joined the placement group. // For example, the placement group is full or the virtual machine is being migrated. 
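(Aside, not part of the patch: FilterUnavailableHostsOrWithoutEnoughMemory, renamed above, composes Hosts.Filter with the IsAvailableHost predicate. A simplified, self-contained sketch of that collection-filter style follows; the types and field names are stand-ins, not the Tower models.)

package main

import "fmt"

type host struct {
	name              string
	available         bool  // e.g. connected and not in maintenance mode
	allocatableMemory int64 // bytes
}

type hosts []host

// filter returns the hosts matching the predicate, mirroring Hosts.Filter.
func (s hosts) filter(pred func(host) bool) hosts {
	out := make(hosts, 0, len(s))
	for _, h := range s {
		if pred(h) {
			out = append(out, h)
		}
	}
	return out
}

// unusable mirrors FilterUnavailableHostsOrWithoutEnoughMemory: hosts that are
// unavailable, or available but without enough allocatable memory.
func (s hosts) unusable(memory int64) hosts {
	return s.filter(func(h host) bool {
		return !h.available || h.allocatableMemory < memory
	})
}

func main() {
	all := hosts{
		{name: "host1", available: true, allocatableMemory: 1},
		{name: "host2", available: true, allocatableMemory: 4},
		{name: "host3", available: false, allocatableMemory: 8},
	}
	// host1 (too little memory) and host3 (unavailable) are unusable.
	fmt.Println(len(all.unusable(2))) // 2
}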
-// -//nolint:gocyclo func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, vm *models.VM) (ret bool, reterr error) { if !version.IsCompatibleWithPlacementGroup(ctx.ElfMachine) { ctx.Logger.V(1).Info(fmt.Sprintf("The capeVersion of ElfMachine is lower than %s, skip adding VM to the placement group", version.CAPEVersion1_2_0), "capeVersion", version.GetCAPEVersion(ctx.ElfMachine)) @@ -432,7 +430,7 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v return false, nil } - if *vm.Status == models.VMStatusRUNNING || *vm.Status == models.VMStatusSUSPENDED { + if *vm.Status != models.VMStatusSTOPPED { ctx.Logger.V(1).Info(fmt.Sprintf("The placement group is full and VM is in %s status, skip adding VM to the placement group", *vm.Status), "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String(), "vmRef", ctx.ElfMachine.Status.VMRef, "vmId", *vm.ID) return true, nil @@ -473,8 +471,7 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v usedHostsByPG := sets.Set[string]{} for i := 0; i < len(cpElfMachines); i++ { if ctx.ElfMachine.Name != cpElfMachines[i].Name && - cpElfMachines[i].Status.PlacementGroupRef == *placementGroup.ID && - cpElfMachines[i].CreationTimestamp.After(ctx.ElfMachine.CreationTimestamp.Time) { + cpElfMachines[i].Status.PlacementGroupRef == *placementGroup.ID { usedHostsByPG.Insert(cpElfMachines[i].Status.HostServerRef) } } From 52dd4ad1027ac496f4796786810259c6d5123ece Mon Sep 17 00:00:00 2001 From: haijianyang Date: Thu, 3 Aug 2023 08:33:00 -0400 Subject: [PATCH 6/8] Add formatHost --- .../elfmachine_controller_placement_group.go | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/controllers/elfmachine_controller_placement_group.go b/controllers/elfmachine_controller_placement_group.go index d01d0d56..85f34efc 100644 --- a/controllers/elfmachine_controller_placement_group.go +++ b/controllers/elfmachine_controller_placement_group.go @@ -477,7 +477,7 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v } usedHostsCount := usedHostsByPG.Len() - ctx.Logger.V(1).Info("The hosts used by the PlacementGroup", "usedHosts", usedHostsByPG, "count", usedHostsCount, "targetHost", targetHost) + ctx.Logger.V(1).Info("The hosts used by the PlacementGroup", "usedHosts", usedHostsByPG, "count", usedHostsCount, "targetHost", formatHost(targetHost)) if usedHostsCount < int(*kcp.Spec.Replicas-1) { ctx.Logger.V(1).Info("Not all other CPs joined the PlacementGroup, skip migrating VM") return true, nil @@ -492,7 +492,7 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v // This is the last CP ElfMachine (i.e. the 1st new CP ElfMachine) which has not been added into the target PlacementGroup. // Migrate this VM to the target host, then it will be added into the target PlacementGroup. - ctx.Logger.V(1).Info("Start migrateVM since KCP is not in rolling update process", "targetHost", targetHost) + ctx.Logger.V(1).Info("Start migrateVM since KCP is not in rolling update process", "targetHost", formatHost(targetHost)) return r.migrateVM(ctx, vm, *targetHost.ID) } @@ -613,7 +613,7 @@ func (r *ElfMachineReconciler) deletePlacementGroup(ctx *context.MachineContext) return true, nil } -// formatNestedHost returns the basic information of the host (ID, name). +// formatNestedHost returns the basic information of the NestedHost (ID, name). 
func formatNestedHost(host *models.NestedHost) string { if host == nil { return "{}" @@ -621,3 +621,12 @@ func formatNestedHost(host *models.NestedHost) string { return fmt.Sprintf("{id: %s,name: %s}", service.GetTowerString(host.ID), service.GetTowerString(host.Name)) } + +// formatHost returns the basic information of the Host (ID, name). +func formatHost(host *models.Host) string { + if host == nil { + return "{}" + } + + return fmt.Sprintf("{id: %s,name: %s}", service.GetTowerString(host.ID), service.GetTowerString(host.Name)) +} From 9cd49cb28261fa7aaded9bdc7d2b56943a4dc9a5 Mon Sep 17 00:00:00 2001 From: haijianyang Date: Fri, 4 Aug 2023 03:49:51 -0400 Subject: [PATCH 7/8] Fix migration --- .../elfmachine_controller_placement_group.go | 33 ++++++++++++++++--- controllers/elfmachine_controller_test.go | 10 +++--- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/controllers/elfmachine_controller_placement_group.go b/controllers/elfmachine_controller_placement_group.go index 85f34efc..a5db0459 100644 --- a/controllers/elfmachine_controller_placement_group.go +++ b/controllers/elfmachine_controller_placement_group.go @@ -26,6 +26,7 @@ import ( "k8s.io/apimachinery/pkg/util/sets" "k8s.io/utils/pointer" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" capiutil "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/collections" @@ -170,7 +171,7 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex // When KCP is not in rolling update and not in scaling down, just return since the placement group is full. if !kcputil.IsKCPInRollingUpdate(kcp) && !kcputil.IsKCPInScalingDown(kcp) { - ctx.Logger.V(1).Info("KCP is not in rolling update and not in scaling down, the placement group is full, wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) + ctx.Logger.V(1).Info("KCP is not in rolling update and not in scaling down, the placement group is full, so wait for enough available hosts", "placementGroup", *placementGroup.Name, "availableHosts", availableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) return nil, nil } @@ -218,7 +219,7 @@ func (r *ElfMachineReconciler) preCheckPlacementGroup(ctx *context.MachineContex unusableHosts := usedHostsByPG.FilterUnavailableHostsOrWithoutEnoughMemory(*service.TowerMemory(ctx.ElfMachine.Spec.MemoryMiB)) if !unusableHosts.IsEmpty() { - ctx.Logger.V(1).Info("KCP is in rolling update, the placement group is full and has unusable hosts, will wait for enough available hosts", "placementGroup", *placementGroup.Name, "unusableHosts", unusableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) + ctx.Logger.V(1).Info("KCP is in rolling update, the placement group is full and has unusable hosts, so wait for enough available hosts", "placementGroup", *placementGroup.Name, "unusableHosts", unusableHosts.String(), "usedHostsByPG", usedHostsByPG.String()) return nil, nil } @@ -354,6 +355,8 @@ func (r *ElfMachineReconciler) getPlacementGroup(ctx *context.MachineContext, pl // For example, the virtual machine has joined the placement group. // 2. false and error is nil means the virtual machine has not joined the placement group. // For example, the placement group is full or the virtual machine is being migrated. 
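(Aside, not part of the patch: formatHost, added above alongside formatNestedHost, exists so that log lines carry a compact "{id, name}" summary instead of a full Tower model dump. Below is a self-contained sketch of the same nil-safe pattern, using a stand-in Host type rather than the generated models package.)

package main

import "fmt"

// Host is a stand-in for the generated Tower model, whose fields are pointers.
type Host struct {
	ID   *string
	Name *string
}

// deref tolerates nil pointers, in the spirit of service.GetTowerString.
func deref(s *string) string {
	if s == nil {
		return ""
	}
	return *s
}

// formatHost returns a compact, nil-safe summary suitable as a log value.
func formatHost(h *Host) string {
	if h == nil {
		return "{}"
	}
	return fmt.Sprintf("{id: %s,name: %s}", deref(h.ID), deref(h.Name))
}

func main() {
	id, name := "host-123", "node-1"
	fmt.Println(formatHost(&Host{ID: &id, Name: &name})) // {id: host-123,name: node-1}
	fmt.Println(formatHost(nil))                         // {}
}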
+// +//nolint:gocyclo func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, vm *models.VM) (ret bool, reterr error) { if !version.IsCompatibleWithPlacementGroup(ctx.ElfMachine) { ctx.Logger.V(1).Info(fmt.Sprintf("The capeVersion of ElfMachine is lower than %s, skip adding VM to the placement group", version.CAPEVersion1_2_0), "capeVersion", version.GetCAPEVersion(ctx.ElfMachine)) @@ -460,24 +463,35 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v return true, nil } - // The 1st new CP ElfMachine should wait for other new CP ElfMachines to join the target PlacementGroup. + // The new CP ElfMachine should wait for other new CP ElfMachines to join the target PlacementGroup. // The code below double checks the recommended target host for migration is valid. cpElfMachines, err := machineutil.GetControlPlaneElfMachinesInCluster(ctx, ctx.Client, ctx.Cluster.Namespace, ctx.Cluster.Name) if err != nil { return false, err } - targetHost := availableHosts.UnsortedList()[0] usedHostsByPG := sets.Set[string]{} + cpElfMachineNames := make([]string, 0, len(cpElfMachines)) for i := 0; i < len(cpElfMachines); i++ { + cpElfMachineNames = append(cpElfMachineNames, cpElfMachines[i].Name) if ctx.ElfMachine.Name != cpElfMachines[i].Name && cpElfMachines[i].Status.PlacementGroupRef == *placementGroup.ID { usedHostsByPG.Insert(cpElfMachines[i].Status.HostServerRef) } } + // During KCP rolling update, when the last new CP is just created, + // it may happen that kcp.Spec.Replicas == kcp.Status.Replicas == kcp.Status.UpdatedReplicas + // and kcp.Status.UnavailableReplicas == 0. + // So we need to check if the number of CP ElfMachine is equal to kcp.Spec.Replicas. + if len(cpElfMachines) != int(*kcp.Spec.Replicas) { + ctx.Logger.Info("The number of CP ElfMachine does not match the expected", "kcp", formatKCP(kcp), "cpElfMachines", cpElfMachineNames) + return true, nil + } + + targetHost := availableHosts.UnsortedList()[0] usedHostsCount := usedHostsByPG.Len() - ctx.Logger.V(1).Info("The hosts used by the PlacementGroup", "usedHosts", usedHostsByPG, "count", usedHostsCount, "targetHost", formatHost(targetHost)) + ctx.Logger.V(1).Info("The hosts used by the PlacementGroup", "usedHosts", usedHostsByPG, "count", usedHostsCount, "targetHost", formatHost(targetHost), "kcp", formatKCP(kcp), "cpElfMachines", cpElfMachineNames) if usedHostsCount < int(*kcp.Spec.Replicas-1) { ctx.Logger.V(1).Info("Not all other CPs joined the PlacementGroup, skip migrating VM") return true, nil @@ -630,3 +644,12 @@ func formatHost(host *models.Host) string { return fmt.Sprintf("{id: %s,name: %s}", service.GetTowerString(host.ID), service.GetTowerString(host.Name)) } + +// formatKCP returns the basic information of the KCP (name, namespace, replicas). 
+func formatKCP(kcp *controlplanev1.KubeadmControlPlane) string { + if kcp == nil { + return "{}" + } + + return fmt.Sprintf("{name:%s,namespace:%s,spec:{replicas:%d},status:{replicas:%d,readyReplicas:%d,updatedReplicas:%d,unavailableReplicas:%d}}", kcp.Name, kcp.Namespace, *kcp.Spec.Replicas, kcp.Status.Replicas, kcp.Status.ReadyReplicas, kcp.Status.UpdatedReplicas, kcp.Status.UnavailableReplicas) +} diff --git a/controllers/elfmachine_controller_test.go b/controllers/elfmachine_controller_test.go index 9f6e83e8..f34e640f 100644 --- a/controllers/elfmachine_controller_test.go +++ b/controllers/elfmachine_controller_test.go @@ -1282,7 +1282,7 @@ var _ = Describe("ElfMachineReconciler", func() { hostID, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(hostID).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is not in rolling update and not in scaling down, the placement group is full, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is not in rolling update and not in scaling down, the placement group is full, so wait for enough available hosts")) }) It("when placement group is full and KCP rolling update in progress", func() { @@ -1356,7 +1356,7 @@ var _ = Describe("ElfMachineReconciler", func() { host, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(host).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unusable hosts, will wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unusable hosts, so wait for enough available hosts")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1370,7 +1370,7 @@ var _ = Describe("ElfMachineReconciler", func() { host, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(host).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unusable hosts, will wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unusable hosts, so wait for enough available hosts")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1384,7 +1384,7 @@ var _ = Describe("ElfMachineReconciler", func() { host, err = reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(host).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unusable hosts, will wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is in rolling update, the placement group is full and has unusable hosts, so wait for enough available hosts")) logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -1488,7 +1488,7 @@ var _ = Describe("ElfMachineReconciler", func() { hostID, err := reconciler.preCheckPlacementGroup(machineContext) Expect(err).To(BeZero()) Expect(hostID).To(BeNil()) - Expect(logBuffer.String()).To(ContainSubstring("KCP is not in rolling update and not in scaling down, the placement group is full, wait for enough available hosts")) + Expect(logBuffer.String()).To(ContainSubstring("KCP is not in rolling update and not in scaling down, the placement group is full, so wait for enough available hosts")) expectConditions(elfMachine, 
[]conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForAvailableHostRequiredByPlacementGroupReason}}) elfMachine.Status.Conditions = nil From a18cc5405b7da6ecaac245dc4197ce2ad4164f54 Mon Sep 17 00:00:00 2001 From: haijianyang Date: Fri, 4 Aug 2023 04:55:35 -0400 Subject: [PATCH 8/8] Fix comment --- controllers/elfmachine_controller_placement_group.go | 6 +++--- controllers/elfmachine_controller_test.go | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/controllers/elfmachine_controller_placement_group.go b/controllers/elfmachine_controller_placement_group.go index a5db0459..accb4652 100644 --- a/controllers/elfmachine_controller_placement_group.go +++ b/controllers/elfmachine_controller_placement_group.go @@ -463,7 +463,7 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v return true, nil } - // The new CP ElfMachine should wait for other new CP ElfMachines to join the target PlacementGroup. + // The powered on CP ElfMachine which is not in the PlacementGroup should wait for other new CP ElfMachines to join the target PlacementGroup. // The code below double checks the recommended target host for migration is valid. cpElfMachines, err := machineutil.GetControlPlaneElfMachinesInCluster(ctx, ctx.Client, ctx.Cluster.Namespace, ctx.Cluster.Name) if err != nil { @@ -485,7 +485,7 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v // and kcp.Status.UnavailableReplicas == 0. // So we need to check if the number of CP ElfMachine is equal to kcp.Spec.Replicas. if len(cpElfMachines) != int(*kcp.Spec.Replicas) { - ctx.Logger.Info("The number of CP ElfMachine does not match the expected", "kcp", formatKCP(kcp), "cpElfMachines", cpElfMachineNames) + ctx.Logger.V(1).Info("The number of CP ElfMachine does not match the expected", "kcp", formatKCP(kcp), "cpElfMachines", cpElfMachineNames) return true, nil } @@ -506,7 +506,7 @@ func (r *ElfMachineReconciler) joinPlacementGroup(ctx *context.MachineContext, v // This is the last CP ElfMachine (i.e. the 1st new CP ElfMachine) which has not been added into the target PlacementGroup. // Migrate this VM to the target host, then it will be added into the target PlacementGroup. - ctx.Logger.V(1).Info("Start migrateVM since KCP is not in rolling update process", "targetHost", formatHost(targetHost)) + ctx.Logger.V(1).Info("Start migrating VM since KCP is not in rolling update process", "targetHost", formatHost(targetHost)) return r.migrateVM(ctx, vm, *targetHost.ID) } diff --git a/controllers/elfmachine_controller_test.go b/controllers/elfmachine_controller_test.go index f34e640f..e1ad0198 100644 --- a/controllers/elfmachine_controller_test.go +++ b/controllers/elfmachine_controller_test.go @@ -1167,7 +1167,7 @@ var _ = Describe("ElfMachineReconciler", func() { Expect(ok).To(BeFalse()) Expect(err).To(BeZero()) Expect(elfMachine.Status.TaskRef).To(Equal(*task.ID)) - Expect(logBuffer.String()).To(ContainSubstring("Start migrateVM since KCP is not in rolling update process")) + Expect(logBuffer.String()).To(ContainSubstring("Start migrating VM since KCP is not in rolling update process")) Expect(logBuffer.String()).To(ContainSubstring("Waiting for the VM to be migrated from")) expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.JoiningPlacementGroupReason}})
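(Aside, not part of the patch: patches 7 and 8 above gate the VM migration on two counts: the number of control-plane ElfMachines must equal kcp.Spec.Replicas, and all other CPs must already be placed in the placement group. A condensed, hypothetical version of that decision, using simplified stand-in types rather than the CAPI/CAPE objects:)

package main

import "fmt"

type cpMachine struct {
	name              string
	placementGroupRef string
	hostServerRef     string
}

// shouldMigrate reports whether the VM of the current control-plane machine
// should be migrated to the host recommended by the placement group pre-check.
func shouldMigrate(current string, cps []cpMachine, pgID string, replicas int) bool {
	// Guard added by the patch: during a rolling update the ElfMachine count can
	// differ from spec.replicas even when KCP status looks settled, so skip
	// migration until they match.
	if len(cps) != replicas {
		return false
	}
	usedHosts := map[string]struct{}{}
	for _, m := range cps {
		if m.name != current && m.placementGroupRef == pgID {
			usedHosts[m.hostServerRef] = struct{}{}
		}
	}
	// Only the last CP still outside the placement group gets migrated.
	return len(usedHosts) >= replicas-1
}

func main() {
	cps := []cpMachine{
		{name: "cp-1", placementGroupRef: "pg-1", hostServerRef: "h1"},
		{name: "cp-2", placementGroupRef: "pg-1", hostServerRef: "h2"},
		{name: "cp-3"}, // the machine being reconciled; not yet in the group
	}
	fmt.Println(shouldMigrate("cp-3", cps, "pg-1", 3)) // true
}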