diff --git a/api/v1beta1/elfmachine_types.go b/api/v1beta1/elfmachine_types.go index 600e6f5a..ee50e679 100644 --- a/api/v1beta1/elfmachine_types.go +++ b/api/v1beta1/elfmachine_types.go @@ -318,8 +318,16 @@ func (m *ElfMachine) GetVMDisconnectionTimestamp() *metav1.Time { return nil } +func (m *ElfMachine) RequiresGPUOrVGPUDevices() bool { + return m.RequiresGPUDevices() || m.RequiresVGPUDevices() +} + func (m *ElfMachine) RequiresGPUDevices() bool { - return len(m.Spec.GPUDevices) > 0 || len(m.Spec.VGPUDevices) > 0 + return len(m.Spec.GPUDevices) > 0 +} + +func (m *ElfMachine) RequiresVGPUDevices() bool { + return len(m.Spec.VGPUDevices) > 0 } //+kubebuilder:object:root=true diff --git a/controllers/elfmachine_controller.go b/controllers/elfmachine_controller.go index dc64c8fb..096bfd64 100644 --- a/controllers/elfmachine_controller.go +++ b/controllers/elfmachine_controller.go @@ -308,7 +308,7 @@ func (r *ElfMachineReconciler) reconcileDelete(ctx *context.MachineContext) (rec // locked by the virtual machine may not be unlocked. // For example, the Cluster or ElfMachine was deleted during a pause. if !ctrlutil.ContainsFinalizer(ctx.ElfMachine, infrav1.MachineFinalizer) && - ctx.ElfMachine.RequiresGPUDevices() { + ctx.ElfMachine.RequiresGPUOrVGPUDevices() { unlockGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name) } }() @@ -532,7 +532,7 @@ func (r *ElfMachineReconciler) reconcileVM(ctx *context.MachineContext) (*models } var hostID *string - var gpuDevices []*models.GpuDevice + var gpuDeviceInfos []*service.GPUDeviceInfo // The virtual machine of the Control Plane does not support GPU Devices. if machineutil.IsControlPlaneMachine(ctx.Machine) { hostID, err = r.preCheckPlacementGroup(ctx) @@ -540,7 +540,7 @@ func (r *ElfMachineReconciler) reconcileVM(ctx *context.MachineContext) (*models return nil, false, err } } else { - hostID, gpuDevices, err = r.selectHostAndGPUsForVM(ctx, "") + hostID, gpuDeviceInfos, err = r.selectHostAndGPUsForVM(ctx, "") if err != nil || hostID == nil { return nil, false, err } @@ -548,7 +548,7 @@ func (r *ElfMachineReconciler) reconcileVM(ctx *context.MachineContext) (*models ctx.Logger.Info("Create VM for ElfMachine") - withTaskVM, err := ctx.VMService.Clone(ctx.ElfCluster, ctx.ElfMachine, bootstrapData, *hostID, gpuDevices) + withTaskVM, err := ctx.VMService.Clone(ctx.ElfCluster, ctx.ElfMachine, bootstrapData, *hostID, gpuDeviceInfos) if err != nil { releaseTicketForCreateVM(ctx.ElfMachine.Name) @@ -561,7 +561,7 @@ func (r *ElfMachineReconciler) reconcileVM(ctx *context.MachineContext) (*models ctx.ElfMachine.SetVM(util.GetVMRef(vm)) } else { // Duplicate VM error does not require unlocking GPU devices. 
- if ctx.ElfMachine.RequiresGPUDevices() { + if ctx.ElfMachine.RequiresGPUOrVGPUDevices() { unlockGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name) } @@ -907,11 +907,11 @@ func (r *ElfMachineReconciler) reconcileVMTask(ctx *context.MachineContext, vm * setVMDuplicate(ctx.ElfMachine.Name) } - if ctx.ElfMachine.RequiresGPUDevices() { + if ctx.ElfMachine.RequiresGPUOrVGPUDevices() { unlockGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name) } case service.IsPowerOnVMTask(task) || service.IsUpdateVMTask(task): - if ctx.ElfMachine.RequiresGPUDevices() { + if ctx.ElfMachine.RequiresGPUOrVGPUDevices() { unlockGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name) } case service.IsMemoryInsufficientError(errorMessage): @@ -933,7 +933,7 @@ func (r *ElfMachineReconciler) reconcileVMTask(ctx *context.MachineContext, vm * ctx.Logger.Info("VM task succeeded", "vmRef", vmRef, "taskRef", taskRef, "taskDescription", service.GetTowerString(task.Description)) if service.IsCloneVMTask(task) || service.IsUpdateVMTask(task) { - if ctx.ElfMachine.RequiresGPUDevices() { + if ctx.ElfMachine.RequiresGPUOrVGPUDevices() { unlockGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name) } } diff --git a/controllers/elfmachine_controller_gpu.go b/controllers/elfmachine_controller_gpu.go index cc264fc0..17042f03 100644 --- a/controllers/elfmachine_controller_gpu.go +++ b/controllers/elfmachine_controller_gpu.go @@ -43,8 +43,8 @@ import ( // 3. A non-empty string indicates that the specified host ID was returned. // // The return gpudevices: the GPU devices for virtual machine. -func (r *ElfMachineReconciler) selectHostAndGPUsForVM(ctx *context.MachineContext, preferredHostID string) (rethost *string, gpudevices []*models.GpuDevice, reterr error) { - if !ctx.ElfMachine.RequiresGPUDevices() { +func (r *ElfMachineReconciler) selectHostAndGPUsForVM(ctx *context.MachineContext, preferredHostID string) (rethost *string, gpudevices []*service.GPUDeviceInfo, reterr error) { + if !ctx.ElfMachine.RequiresGPUOrVGPUDevices() { return pointer.String(""), nil, nil } @@ -58,12 +58,12 @@ func (r *ElfMachineReconciler) selectHostAndGPUsForVM(ctx *context.MachineContex // If the GPU devices locked by the virtual machine still exist, use them directly. if lockedVMGPUs := getGPUDevicesLockedByVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name); lockedVMGPUs != nil { - if ok, gpuDevices, err := r.checkGPUsCanBeUsedForVM(ctx, lockedVMGPUs.GPUDeviceIDs, ctx.ElfMachine.Name); err != nil { + if ok, err := r.checkGPUsCanBeUsedForVM(ctx, lockedVMGPUs.GetGPUIDs()); err != nil { return nil, nil, err } else if ok { ctx.Logger.V(1).Info("Found locked VM GPU devices, so skip allocation", "lockedVMGPUs", lockedVMGPUs) - return &lockedVMGPUs.HostID, gpuDevices, nil + return &lockedVMGPUs.HostID, lockedVMGPUs.GetGPUDeviceInfos(), nil } // If the GPU devices returned by Tower is inconsistent with the locked GPU, @@ -84,30 +84,46 @@ func (r *ElfMachineReconciler) selectHostAndGPUsForVM(ctx *context.MachineContex } // Get all GPU devices of available hosts. 
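// The replacement below swaps the single FindGPUDevicesByHostIDs call for a
// multi-step lookup, roughly:
//  1. FindGPUDevicesByHostIDs(hostIDs, usage): list devices of the required
//     usage (PASSTHROUGH for GPU passthrough requests, VGPU for vGPU requests).
//  2. FindGPUDeviceInfos(gpuDeviceIDs): fetch allocation details (VMs,
//     allocated/available counts) for those devices.
//  3. AggregateUnusedGPUDevicesToGPUDeviceInfos: add the devices not yet used
//     by any VM, since step 2 only reports devices already referenced by VMs.
// The resulting infos are then filtered by available count and by the
// cluster-level GPU locks before being grouped by host.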
- gpuDevices, err := ctx.VMService.FindGPUDevicesByHostIDs(availableHosts.IDs()) + gpuDeviceUsage := models.GpuDeviceUsagePASSTHROUGH + if ctx.ElfMachine.RequiresVGPUDevices() { + gpuDeviceUsage = models.GpuDeviceUsageVGPU + } + gpuDevices, err := ctx.VMService.FindGPUDevicesByHostIDs(availableHosts.IDs(), gpuDeviceUsage) + if err != nil || len(gpuDevices) == 0 { + return nil, nil, err + } + + gpuDeviceIDs := make([]string, len(gpuDevices)) + for i := 0; i < len(gpuDevices); i++ { + gpuDeviceIDs[i] = *gpuDevices[i].ID + } + // Get GPU devices with VMs and allocation details. + gpuDeviceInfos, err := ctx.VMService.FindGPUDeviceInfos(gpuDeviceIDs) if err != nil { return nil, nil, err } - lockedClusterGPUIDs := getLockedClusterGPUIDs(ctx.ElfCluster.Spec.Cluster) + service.AggregateUnusedGPUDevicesToGPUDeviceInfos(gpuDeviceInfos, gpuDevices) - // Group GPU devices by host. - hostGPUDeviceMap := make(map[string][]*models.GpuDevice) - hostIDSet := sets.NewString() - for i := 0; i < len(gpuDevices); i++ { - // Filter already used or locked GPU devices. - if !service.GPUCanBeUsedForVM(gpuDevices[i], ctx.ElfMachine.Name) || - lockedClusterGPUIDs.Has(*gpuDevices[i].ID) { - continue - } + // Filter already used GPU devices. + gpuDeviceInfos = gpuDeviceInfos.Filter(func(g *service.GPUDeviceInfo) bool { + return g.AvailableCount > 0 + }) - hostIDSet.Insert(*gpuDevices[i].Host.ID) - if gpus, ok := hostGPUDeviceMap[*gpuDevices[i].Host.ID]; !ok { - hostGPUDeviceMap[*gpuDevices[i].Host.ID] = []*models.GpuDevice{gpuDevices[i]} + // Filter locked GPU devices. + gpuDeviceInfos = filterGPUDeviceInfosByLockGPUDevices(ctx.ElfCluster.Spec.Cluster, gpuDeviceInfos) + + // Group GPU deviceInfos by host. + hostGPUDeviceInfoMap := make(map[string]service.GPUDeviceInfos) + hostIDSet := sets.NewString() + gpuDeviceInfos.Iterate(func(gpuDeviceInfo *service.GPUDeviceInfo) { + hostIDSet.Insert(gpuDeviceInfo.HostID) + if gpuInfos, ok := hostGPUDeviceInfoMap[gpuDeviceInfo.HostID]; !ok { + hostGPUDeviceInfoMap[gpuDeviceInfo.HostID] = service.NewGPUDeviceInfos(gpuDeviceInfo) } else { - hostGPUDeviceMap[*gpuDevices[i].Host.ID] = append(gpus, gpuDevices[i]) + gpuInfos.Insert(gpuDeviceInfo) } - } + }) // Choose a host that meets ElfMachine GPU needs. // Use a random host list to reduce the probability of the same host being selected at the same time. @@ -122,25 +138,29 @@ func (r *ElfMachineReconciler) selectHostAndGPUsForVM(ctx *context.MachineContex } for i := 0; i < len(unsortedHostIDs); i++ { - if hostGPUDevices, ok := hostGPUDeviceMap[unsortedHostIDs[i]]; ok { - selectedGPUDevices := selectGPUDevicesForVM(hostGPUDevices, ctx.ElfMachine.Spec.GPUDevices) - if len(selectedGPUDevices) > 0 { - gpuDeviceIDs := make([]string, len(selectedGPUDevices)) - for i := 0; i < len(selectedGPUDevices); i++ { - gpuDeviceIDs[i] = *selectedGPUDevices[i].ID - } - - // Lock the selected GPU devices to prevent it from being allocated to multiple virtual machines. - if !lockGPUDevicesForVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name, unsortedHostIDs[i], gpuDeviceIDs) { - // Lock failure indicates that the GPU devices are locked by another virtual machine. - // Just trying other hosts. 
- continue - } - - ctx.Logger.Info("Selected host and GPU devices for VM", "hostId", unsortedHostIDs[i], "gpuDeviceIds", gpuDeviceIDs) - - return &unsortedHostIDs[i], selectedGPUDevices, nil + hostGPUDeviceInfos, ok := hostGPUDeviceInfoMap[unsortedHostIDs[i]] + if !ok { + continue + } + + var selectedGPUDeviceInfos []*service.GPUDeviceInfo + if ctx.ElfMachine.RequiresGPUDevices() { + selectedGPUDeviceInfos = selectGPUDevicesForVM(hostGPUDeviceInfos, ctx.ElfMachine.Spec.GPUDevices) + } else { + selectedGPUDeviceInfos = selectVGPUDevicesForVM(hostGPUDeviceInfos, ctx.ElfMachine.Spec.VGPUDevices) + } + + if len(selectedGPUDeviceInfos) > 0 { + // Lock the selected GPU devices to prevent it from being allocated to multiple virtual machines. + if !lockGPUDevicesForVM(ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Name, unsortedHostIDs[i], selectedGPUDeviceInfos) { + // Lock failure indicates that the GPU devices are locked by another virtual machine. + // Just trying other hosts. + continue } + + ctx.Logger.Info("Selected host and GPU devices for VM", "hostId", unsortedHostIDs[i], "gpuDevices", selectedGPUDeviceInfos) + + return &unsortedHostIDs[i], selectedGPUDeviceInfos, nil } } @@ -149,38 +169,86 @@ func (r *ElfMachineReconciler) selectHostAndGPUsForVM(ctx *context.MachineContex // selectGPUDevicesForVM selects the GPU devices required by the virtual machine from the host's GPU devices. // Empty GPU devices indicates that the host's GPU devices cannot meet the GPU requirements of the virtual machine. -func selectGPUDevicesForVM(hostGPUDevices []*models.GpuDevice, requiredGPUDevices []infrav1.GPUPassthroughDeviceSpec) []*models.GpuDevice { +func selectGPUDevicesForVM(hostGPUDeviceInfos service.GPUDeviceInfos, requiredGPUDevices []infrav1.GPUPassthroughDeviceSpec) []*service.GPUDeviceInfo { // Group GPU devices by model. - modelGPUDeviceMap := make(map[string][]*models.GpuDevice) - for i := 0; i < len(hostGPUDevices); i++ { - if gpus, ok := modelGPUDeviceMap[*hostGPUDevices[i].Model]; !ok { - modelGPUDeviceMap[*hostGPUDevices[i].Model] = []*models.GpuDevice{hostGPUDevices[i]} + modelGPUDeviceMap := make(map[string][]*service.GPUDeviceInfo) + hostGPUDeviceInfos.Iterate(func(gpuDeviceInfo *service.GPUDeviceInfo) { + if gpuInfos, ok := modelGPUDeviceMap[gpuDeviceInfo.Model]; !ok { + modelGPUDeviceMap[gpuDeviceInfo.Model] = []*service.GPUDeviceInfo{gpuDeviceInfo} } else { - modelGPUDeviceMap[*hostGPUDevices[i].Model] = append(gpus, hostGPUDevices[i]) + modelGPUDeviceMap[gpuDeviceInfo.Model] = append(gpuInfos, gpuDeviceInfo) } - } + }) - var selectedGPUDevices []*models.GpuDevice + var selectedGPUDeviceInfos []*service.GPUDeviceInfo for i := 0; i < len(requiredGPUDevices); i++ { - if gpus, ok := modelGPUDeviceMap[requiredGPUDevices[i].Model]; !ok { + gpuDevices, ok := modelGPUDeviceMap[requiredGPUDevices[i].Model] + if !ok || len(gpuDevices) < int(requiredGPUDevices[i].Count) { return nil + } + + gpuInfos := gpuDevices[:int(requiredGPUDevices[i].Count)] + for j := 0; j < len(gpuInfos); j++ { + selectedGPUDeviceInfos = append(selectedGPUDeviceInfos, &service.GPUDeviceInfo{ID: gpuInfos[j].ID, AllocatedCount: 1, AvailableCount: 1}) + } + } + + return selectedGPUDeviceInfos +} + +// selectVGPUDevicesForVM selects the vGPU devices required by the virtual machine from the host's vGPU devices. +// Empty vGPU devices indicates that the host's vGPU devices cannot meet the vGPU requirements of the virtual machine. 
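// For example (hypothetical numbers): a request of {Type: "V100", Count: 3}
// against two devices of that type with AvailableCount 1 and 3 is satisfied
// greedily, allocating 1 instance from the first device and 2 from the second;
// if the devices' combined available count cannot cover the request, nil is
// returned.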
+func selectVGPUDevicesForVM(hostGPUDeviceInfos service.GPUDeviceInfos, requiredVGPUDevices []infrav1.VGPUDeviceSpec) []*service.GPUDeviceInfo { + // Group vGPU devices by vGPU type. + typeVGPUDeviceInfoMap := make(map[string][]*service.GPUDeviceInfo) + hostGPUDeviceInfos.Iterate(func(gpuDeviceInfo *service.GPUDeviceInfo) { + if gpuInfos, ok := typeVGPUDeviceInfoMap[gpuDeviceInfo.VGPUType]; !ok { + typeVGPUDeviceInfoMap[gpuDeviceInfo.VGPUType] = []*service.GPUDeviceInfo{gpuDeviceInfo} } else { - if len(gpus) < int(requiredGPUDevices[i].Count) { - return nil + typeVGPUDeviceInfoMap[gpuDeviceInfo.VGPUType] = append(gpuInfos, gpuDeviceInfo) + } + }) + + var selectedGPUDeviceInfos []*service.GPUDeviceInfo + for i := 0; i < len(requiredVGPUDevices); i++ { + gpuDeviceInfos, ok := typeVGPUDeviceInfoMap[requiredVGPUDevices[i].Type] + if !ok { + return nil + } + + var gpuInfos []*service.GPUDeviceInfo + requiredCount := requiredVGPUDevices[i].Count + for j := 0; j < len(gpuDeviceInfos); j++ { + if gpuDeviceInfos[j].AvailableCount <= 0 { + continue + } + + if gpuDeviceInfos[j].AvailableCount >= requiredCount { + gpuInfos = append(gpuInfos, &service.GPUDeviceInfo{ID: gpuDeviceInfos[j].ID, AllocatedCount: requiredCount, AvailableCount: gpuDeviceInfos[j].AvailableCount}) + requiredCount = 0 + + break + } else { + gpuInfos = append(gpuInfos, &service.GPUDeviceInfo{ID: gpuDeviceInfos[j].ID, AllocatedCount: gpuDeviceInfos[j].AvailableCount, AvailableCount: gpuDeviceInfos[j].AvailableCount}) + requiredCount -= gpuDeviceInfos[j].AvailableCount } + } - selectedGPUDevices = append(selectedGPUDevices, gpus[:int(requiredGPUDevices[i].Count)]...) - // Remove selected GPU devices. - modelGPUDeviceMap[requiredGPUDevices[i].Model] = gpus[int(requiredGPUDevices[i].Count):] + // If requiredCount is greater than 0, it means there are not enough vGPUs, + // just return directly. + if requiredCount > 0 { + return nil } + + selectedGPUDeviceInfos = append(selectedGPUDeviceInfos, gpuInfos...) } - return selectedGPUDevices + return selectedGPUDeviceInfos } // reconcileGPUDevices ensures that the virtual machine has the expected GPU devices. func (r *ElfMachineReconciler) reconcileGPUDevices(ctx *context.MachineContext, vm *models.VM) (bool, error) { - if !ctx.ElfMachine.RequiresGPUDevices() { + if !ctx.ElfMachine.RequiresGPUOrVGPUDevices() { return true, nil } @@ -213,7 +281,7 @@ func (r *ElfMachineReconciler) reconcileGPUDevices(ctx *context.MachineContext, gpuIDs[i] = *vm.GpuDevices[i].ID } - if ok, _, err := r.checkGPUsCanBeUsedForVM(ctx, gpuIDs, ctx.ElfMachine.Name); err != nil { + if ok, err := r.checkGPUsCanBeUsedForVM(ctx, gpuIDs); err != nil { return false, err } else if !ok { // If the GPU devices are already in use, @@ -228,7 +296,7 @@ func (r *ElfMachineReconciler) reconcileGPUDevices(ctx *context.MachineContext, // addGPUDevicesForVM adds expected GPU devices to the virtual machine. 
func (r *ElfMachineReconciler) addGPUDevicesForVM(ctx *context.MachineContext, vm *models.VM) (bool, error) { - hostID, gpuDevices, err := r.selectHostAndGPUsForVM(ctx, *vm.Host.ID) + hostID, gpuDeviceInfos, err := r.selectHostAndGPUsForVM(ctx, *vm.Host.ID) if err != nil || hostID == nil { return false, err } @@ -244,15 +312,7 @@ func (r *ElfMachineReconciler) addGPUDevicesForVM(ctx *context.MachineContext, v return ok, err } - gpus := make([]*models.VMGpuOperationParams, len(gpuDevices)) - for i := 0; i < len(gpuDevices); i++ { - gpus[i] = &models.VMGpuOperationParams{ - GpuID: gpuDevices[i].ID, - Amount: service.TowerInt32(1), - } - } - - task, err := ctx.VMService.AddGPUDevices(ctx.ElfMachine.Status.VMRef, gpus) + task, err := ctx.VMService.AddGPUDevices(ctx.ElfMachine.Status.VMRef, gpuDeviceInfos) if err != nil { conditions.MarkFalse(ctx.ElfMachine, infrav1.VMProvisionedCondition, infrav1.AttachingGPUFailedReason, clusterv1.ConditionSeverityWarning, err.Error()) @@ -298,19 +358,22 @@ func (r *ElfMachineReconciler) removeVMGPUDevices(ctx *context.MachineContext, v // checkGPUsCanBeUsedForVM checks whether GPU devices can be used by the specified virtual machine. // The return true means the GPU devices can be used for the virtual machine. -func (r *ElfMachineReconciler) checkGPUsCanBeUsedForVM(ctx *context.MachineContext, gpuDeviceIDs []string, vm string) (bool, []*models.GpuDevice, error) { +func (r *ElfMachineReconciler) checkGPUsCanBeUsedForVM(ctx *context.MachineContext, gpuDeviceIDs []string) (bool, error) { gpuDevices, err := ctx.VMService.FindGPUDevicesByIDs(gpuDeviceIDs) - if err != nil { - return false, nil, err + if err != nil || len(gpuDevices) != len(gpuDeviceIDs) { + return false, err } - if len(gpuDevices) != len(gpuDeviceIDs) { - return false, nil, nil + gpuDeviceInfos, err := ctx.VMService.FindGPUDeviceInfos(gpuDeviceIDs) + if err != nil { + return false, err } - if len(service.FilterOutGPUsCanNotBeUsedForVM(gpuDevices, vm)) != len(gpuDeviceIDs) { - return false, nil, nil + service.AggregateUnusedGPUDevicesToGPUDeviceInfos(gpuDeviceInfos, gpuDevices) + + if service.HasGPUsCanNotBeUsedForVM(gpuDeviceInfos, ctx.ElfMachine) { + return false, nil } - return true, gpuDevices, nil + return true, nil } diff --git a/controllers/elfmachine_controller_gpu_test.go b/controllers/elfmachine_controller_gpu_test.go index 720a32f0..af976658 100644 --- a/controllers/elfmachine_controller_gpu_test.go +++ b/controllers/elfmachine_controller_gpu_test.go @@ -110,30 +110,47 @@ var _ = Describe("ElfMachineReconciler-GPU", func() { gpu.Model = service.TowerString(gpuModel) gpuIDs := []string{*gpu.ID} gpusDevices := []*models.GpuDevice{gpu} + gpusDeviceInfos := service.NewGPUDeviceInfos(&service.GPUDeviceInfo{ + ID: *gpu.ID, + HostID: *host.ID, + Model: *gpu.Model, + AllocatedCount: 0, + AvailableCount: 1, + }) ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host), nil) - mockVMService.EXPECT().FindGPUDevicesByHostIDs([]string{*host.ID}).Return(gpusDevices, nil) + mockVMService.EXPECT().FindGPUDevicesByHostIDs([]string{*host.ID}, models.GpuDeviceUsagePASSTHROUGH).Return(gpusDevices, nil) + mockVMService.EXPECT().FindGPUDeviceInfos(gpuIDs).Return(gpusDeviceInfos, nil) machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) reconciler := 
&ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} hostID, gpus, err := reconciler.selectHostAndGPUsForVM(machineContext, "") Expect(err).NotTo(HaveOccurred()) Expect(*hostID).To(Equal(*host.ID)) - Expect(gpus).To(Equal(gpusDevices)) + Expect(gpus).To(HaveLen(1)) + Expect(gpus[0].ID).To(Equal(*gpu.ID)) + Expect(gpus[0].AllocatedCount).To(Equal(int32(1))) mockVMService.EXPECT().FindGPUDevicesByIDs(gpuIDs).Return(gpusDevices, nil) + mockVMService.EXPECT().FindGPUDeviceInfos(gpuIDs).Return(gpusDeviceInfos, nil) hostID, gpus, err = reconciler.selectHostAndGPUsForVM(machineContext, "") Expect(err).NotTo(HaveOccurred()) Expect(*hostID).To(Equal(*host.ID)) - Expect(gpus).To(Equal(gpusDevices)) + Expect(gpus).To(HaveLen(1)) + Expect(gpus[0].ID).To(Equal(*gpu.ID)) + Expect(gpus[0].AllocatedCount).To(Equal(int32(1))) Expect(logBuffer.String()).To(ContainSubstring("Found locked VM GPU devices")) logBuffer.Reset() gpu.Vms = []*models.NestedVM{{ID: service.TowerString("id"), Name: service.TowerString("vm")}} + gpusDeviceInfo := gpusDeviceInfos.Get(*gpu.ID) + gpusDeviceInfo.AllocatedCount = 1 + gpusDeviceInfo.AvailableCount = 0 + gpusDeviceInfo.VMs = []service.GPUDeviceVM{{ID: "id", Name: "vm"}} mockVMService.EXPECT().FindGPUDevicesByIDs(gpuIDs).Return(gpusDevices, nil) - mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host), nil) - mockVMService.EXPECT().FindGPUDevicesByHostIDs([]string{*host.ID}).Return(gpusDevices, nil) + mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(nil, nil) + mockVMService.EXPECT().FindGPUDeviceInfos(gpuIDs).Return(gpusDeviceInfos, nil) hostID, gpus, err = reconciler.selectHostAndGPUsForVM(machineContext, "") Expect(err).NotTo(HaveOccurred()) Expect(hostID).To(BeNil()) @@ -154,17 +171,33 @@ var _ = Describe("ElfMachineReconciler-GPU", func() { preferredGPU.Host = &models.NestedHost{ID: preferredHost.ID} preferredGPU.Model = service.TowerString(gpuModel) gpusDevices := []*models.GpuDevice{gpu, preferredGPU} + gpusDeviceInfos := service.NewGPUDeviceInfos(&service.GPUDeviceInfo{ + ID: *gpu.ID, + HostID: *host.ID, + Model: *gpu.Model, + AllocatedCount: 0, + AvailableCount: 1, + }, &service.GPUDeviceInfo{ + ID: *preferredGPU.ID, + HostID: *preferredHost.ID, + Model: *preferredGPU.Model, + AllocatedCount: 0, + AvailableCount: 1, + }) ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Return(service.NewHosts(host, preferredHost), nil) - mockVMService.EXPECT().FindGPUDevicesByHostIDs(gomock.InAnyOrder([]string{*host.ID, *preferredHost.ID})).Return(gpusDevices, nil) + mockVMService.EXPECT().FindGPUDevicesByHostIDs(gomock.InAnyOrder([]string{*host.ID, *preferredHost.ID}), models.GpuDeviceUsagePASSTHROUGH).Return(gpusDevices, nil) + mockVMService.EXPECT().FindGPUDeviceInfos(gomock.InAnyOrder([]string{*gpu.ID, *preferredGPU.ID})).Return(gpusDeviceInfos, nil) machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} hostID, gpus, err := reconciler.selectHostAndGPUsForVM(machineContext, *preferredHost.ID) Expect(err).NotTo(HaveOccurred()) Expect(*hostID).To(Equal(*preferredHost.ID)) - Expect(gpus).To(Equal([]*models.GpuDevice{preferredGPU})) + Expect(gpus).To(HaveLen(1)) 
+ Expect(gpus[0].ID).To(Equal(*preferredGPU.ID)) + Expect(gpus[0].AllocatedCount).To(Equal(int32(1))) }) }) @@ -244,6 +277,14 @@ var _ = Describe("ElfMachineReconciler-GPU", func() { gpu.Host = &models.NestedHost{ID: host.ID} gpu.Model = service.TowerString(gpuModel) gpu.Vms = []*models.NestedVM{{ID: service.TowerString("id"), Name: service.TowerString("vm")}} + gpusDeviceInfos := service.NewGPUDeviceInfos(&service.GPUDeviceInfo{ + ID: *gpu.ID, + HostID: *host.ID, + Model: *gpu.Model, + AllocatedCount: 0, + AvailableCount: 1, + VMs: []service.GPUDeviceVM{{Name: "name", AllocatedCount: 1}}, + }) vm := fake.NewTowerVMFromElfMachine(elfMachine) vm.Host = &models.NestedHost{ID: host.ID} vm.Status = models.NewVMStatus(models.VMStatusSTOPPED) @@ -251,6 +292,7 @@ var _ = Describe("ElfMachineReconciler-GPU", func() { ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) mockVMService.EXPECT().FindGPUDevicesByIDs([]string{*gpu.ID}).Times(2).Return([]*models.GpuDevice{gpu}, nil) + mockVMService.EXPECT().FindGPUDeviceInfos([]string{*gpu.ID}).Return(gpusDeviceInfos, nil) mockVMService.EXPECT().RemoveGPUDevices(elfMachine.Status.VMRef, gomock.Len(1)).Return(nil, unexpectedError) machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) @@ -261,7 +303,8 @@ var _ = Describe("ElfMachineReconciler-GPU", func() { Expect(ok).To(BeFalse()) Expect(logBuffer.String()).To(ContainSubstring("GPU devices of VM are already in use, so remove and reallocate")) - gpu.Vms = []*models.NestedVM{{ID: vm.ID, Name: vm.Name}} + gpusDeviceInfos.Get(*gpu.ID).VMs = []service.GPUDeviceVM{{Name: *vm.Name, AllocatedCount: 1}} + mockVMService.EXPECT().FindGPUDeviceInfos([]string{*gpu.ID}).Return(gpusDeviceInfos, nil) ok, err = reconciler.reconcileGPUDevices(machineContext, vm) Expect(err).NotTo(HaveOccurred()) Expect(ok).To(BeTrue()) @@ -281,12 +324,20 @@ var _ = Describe("ElfMachineReconciler-GPU", func() { gpu := fake.NewTowerGPU() gpu.Host = &models.NestedHost{ID: host.ID} gpu.Model = service.TowerString(gpuModel) + gpusDeviceInfos := service.NewGPUDeviceInfos(&service.GPUDeviceInfo{ + ID: *gpu.ID, + HostID: *host.ID, + Model: *gpu.Model, + AllocatedCount: 0, + AvailableCount: 1, + }) task := fake.NewTowerTask() withTaskVM := fake.NewWithTaskVM(vm, task) ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Times(2).Return(service.NewHosts(host), nil) - mockVMService.EXPECT().FindGPUDevicesByHostIDs([]string{*host.ID}).Times(2).Return([]*models.GpuDevice{gpu}, nil) + mockVMService.EXPECT().FindGPUDevicesByHostIDs([]string{*host.ID}, models.GpuDeviceUsagePASSTHROUGH).Times(2).Return([]*models.GpuDevice{gpu}, nil) + mockVMService.EXPECT().FindGPUDeviceInfos([]string{*gpu.ID}).Times(2).Return(gpusDeviceInfos, nil) mockVMService.EXPECT().Migrate(*vm.ID, *host.ID).Return(withTaskVM, nil) machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) @@ -315,11 +366,19 @@ var _ = Describe("ElfMachineReconciler-GPU", func() { gpu := fake.NewTowerGPU() gpu.Host = &models.NestedHost{ID: host.ID} gpu.Model = service.TowerString(gpuModel) + gpusDeviceInfos := service.NewGPUDeviceInfos(&service.GPUDeviceInfo{ + ID: *gpu.ID, + HostID: *host.ID, + Model: *gpu.Model, + 
AllocatedCount: 0, + AvailableCount: 1, + }) task := fake.NewTowerTask() ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) mockVMService.EXPECT().GetHostsByCluster(elfCluster.Spec.Cluster).Times(2).Return(service.NewHosts(host), nil) - mockVMService.EXPECT().FindGPUDevicesByHostIDs([]string{*host.ID}).Times(2).Return([]*models.GpuDevice{gpu}, nil) + mockVMService.EXPECT().FindGPUDevicesByHostIDs([]string{*host.ID}, models.GpuDeviceUsagePASSTHROUGH).Times(2).Return([]*models.GpuDevice{gpu}, nil) + mockVMService.EXPECT().FindGPUDeviceInfos([]string{*gpu.ID}).Times(2).Return(gpusDeviceInfos, nil) mockVMService.EXPECT().AddGPUDevices(elfMachine.Status.VMRef, gomock.Any()).Return(task, nil) machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) @@ -431,4 +490,90 @@ var _ = Describe("ElfMachineReconciler-GPU", func() { }, timeout).Should(BeTrue()) }) }) + + It("checkGPUsCanBeUsedForVM", func() { + host := fake.NewTowerGPU() + gpu := fake.NewTowerGPU() + gpu.Host = &models.NestedHost{ID: host.ID} + gpuIDs := []string{*gpu.ID} + gpusDevices := []*models.GpuDevice{gpu} + gpusDeviceInfos := service.NewGPUDeviceInfos() + elfMachine.Spec.GPUDevices = append(elfMachine.Spec.GPUDevices, infrav1.GPUPassthroughDeviceSpec{Model: "A16", Count: 1}) + ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + mockVMService.EXPECT().FindGPUDevicesByIDs(gpuIDs).Return(nil, nil) + + machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + ok, err := reconciler.checkGPUsCanBeUsedForVM(machineContext, gpuIDs) + Expect(err).NotTo(HaveOccurred()) + Expect(ok).To(BeFalse()) + + mockVMService.EXPECT().FindGPUDevicesByIDs(gpuIDs).Return(gpusDevices, nil) + mockVMService.EXPECT().FindGPUDeviceInfos(gpuIDs).Return(gpusDeviceInfos, nil) + ok, err = reconciler.checkGPUsCanBeUsedForVM(machineContext, gpuIDs) + Expect(err).NotTo(HaveOccurred()) + Expect(ok).To(BeTrue()) + + gpusDeviceInfos.Insert(&service.GPUDeviceInfo{ + VMs: []service.GPUDeviceVM{{ID: "vm1", Name: "vm1"}}, + }) + mockVMService.EXPECT().FindGPUDevicesByIDs(gpuIDs).Return(gpusDevices, nil) + mockVMService.EXPECT().FindGPUDeviceInfos(gpuIDs).Return(gpusDeviceInfos, nil) + ok, err = reconciler.checkGPUsCanBeUsedForVM(machineContext, gpuIDs) + Expect(err).NotTo(HaveOccurred()) + Expect(ok).To(BeFalse()) + }) + + It("selectVGPUDevicesForVM", func() { + host := &models.NestedHost{ID: service.TowerString("host")} + vGPU1 := fake.NewTowerVGPU(1) + vGPU1.Host = host + vGPU2 := fake.NewTowerVGPU(2) + vGPU2.Host = host + vGPUType := "V100" + requiredVGPUDevice := infrav1.VGPUDeviceSpec{Type: vGPUType, Count: 1} + requiredVGPUDevices := []infrav1.VGPUDeviceSpec{requiredVGPUDevice} + gpuDeviceInfos := service.NewGPUDeviceInfos() + gpus := selectVGPUDevicesForVM(gpuDeviceInfos, requiredVGPUDevices) + Expect(gpus).To(BeEmpty()) + + gpuDeviceInfo1 := &service.GPUDeviceInfo{ + ID: *vGPU1.ID, + HostID: *vGPU1.Host.ID, + Model: *vGPU1.Model, + VGPUType: vGPUType, + AllocatedCount: 1, + AvailableCount: 0, + } + gpuDeviceInfos = service.NewGPUDeviceInfos(gpuDeviceInfo1) + gpus = selectVGPUDevicesForVM(gpuDeviceInfos, requiredVGPUDevices) + 
Expect(gpus).To(BeEmpty()) + + gpuDeviceInfo1.AvailableCount = 1 + gpuDeviceInfos = service.NewGPUDeviceInfos(gpuDeviceInfo1) + gpus = selectVGPUDevicesForVM(gpuDeviceInfos, requiredVGPUDevices) + Expect(gpus).To(Equal([]*service.GPUDeviceInfo{{ID: gpuDeviceInfo1.ID, AllocatedCount: requiredVGPUDevice.Count, AvailableCount: gpuDeviceInfo1.AvailableCount}})) + + requiredVGPUDevice.Count = 3 + requiredVGPUDevices[0] = requiredVGPUDevice + gpus = selectVGPUDevicesForVM(gpuDeviceInfos, requiredVGPUDevices) + Expect(gpus).To(BeEmpty()) + + gpuDeviceInfo2 := &service.GPUDeviceInfo{ + ID: *vGPU2.ID, + HostID: *vGPU2.Host.ID, + Model: *vGPU2.Model, + VGPUType: vGPUType, + AllocatedCount: 1, + AvailableCount: 3, + } + gpuDeviceInfos.Insert(gpuDeviceInfo2) + gpus = selectVGPUDevicesForVM(gpuDeviceInfos, requiredVGPUDevices) + Expect(gpus).To(Equal([]*service.GPUDeviceInfo{ + {ID: gpuDeviceInfo1.ID, AllocatedCount: 1, AvailableCount: gpuDeviceInfo1.AvailableCount}, + {ID: gpuDeviceInfo2.ID, AllocatedCount: 2, AvailableCount: gpuDeviceInfo2.AvailableCount}, + })) + Expect(gpus[0].AllocatedCount + gpus[1].AllocatedCount).To(Equal(requiredVGPUDevice.Count)) + }) }) diff --git a/controllers/vm_limiter.go b/controllers/vm_limiter.go index 2aa67df5..db59b22d 100644 --- a/controllers/vm_limiter.go +++ b/controllers/vm_limiter.go @@ -22,9 +22,9 @@ import ( "time" "github.com/patrickmn/go-cache" - "k8s.io/apimachinery/pkg/util/sets" "github.com/smartxworks/cluster-api-provider-elf/pkg/config" + "github.com/smartxworks/cluster-api-provider-elf/pkg/service" ) const ( @@ -142,10 +142,33 @@ func getKeyForVMDuplicate(name string) string { /* GPU */ +type lockedGPUDevice struct { + ID string `json:"id"` + Count int32 `json:"count"` +} + type lockedVMGPUs struct { - HostID string `json:"hostId"` - GPUDeviceIDs []string `json:"gpuDeviceIds"` - LockedAt time.Time `json:"lockedAt"` + HostID string `json:"hostId"` + GPUDevices []lockedGPUDevice `json:"gpuDevices"` + LockedAt time.Time `json:"lockedAt"` +} + +func (g *lockedVMGPUs) GetGPUIDs() []string { + ids := make([]string, len(g.GPUDevices)) + for i := 0; i < len(g.GPUDevices); i++ { + ids[i] = g.GPUDevices[i].ID + } + + return ids +} + +func (g *lockedVMGPUs) GetGPUDeviceInfos() []*service.GPUDeviceInfo { + gpuDeviceInfos := make([]*service.GPUDeviceInfo, len(g.GPUDevices)) + for i := 0; i < len(g.GPUDevices); i++ { + gpuDeviceInfos[i] = &service.GPUDeviceInfo{ID: g.GPUDevices[i].ID, AllocatedCount: g.GPUDevices[i].Count} + } + + return gpuDeviceInfos } type lockedClusterGPUMap map[string]lockedVMGPUs @@ -158,42 +181,53 @@ var lockedGPUMap = make(map[string]lockedClusterGPUMap) // lockGPUDevicesForVM locks the GPU devices required to create or start a virtual machine. // The GPU devices will be unlocked when the task is completed or times out. // This prevents multiple virtual machines from being allocated the same GPU. 
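// With vGPU support the lock is count-based rather than ID-based: each VM
// records how many instances it locks per GPU device (lockedGPUDevice.Count),
// and a new lock request is rejected when the instances already locked for a
// device plus the requested allocation would exceed that device's available
// count. A passthrough GPU therefore remains an exclusive lock (1 of 1), while
// a vGPU device can be locked by several VMs until its capacity is exhausted.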
-func lockGPUDevicesForVM(clusterID, vmName, hostID string, gpuDeviceIDs []string) bool { +func lockGPUDevicesForVM(clusterID, vmName, hostID string, gpuDeviceInfos []*service.GPUDeviceInfo) bool { gpuLock.Lock() defer gpuLock.Unlock() - lockedClusterGPUIDs := getLockedClusterGPUIDsWithoutLock(clusterID) - for i := 0; i < len(gpuDeviceIDs); i++ { - if lockedClusterGPUIDs.Has(gpuDeviceIDs[i]) { - return false - } + availableCountMap := make(map[string]int32) + lockedGPUs := lockedVMGPUs{HostID: hostID, LockedAt: time.Now(), GPUDevices: make([]lockedGPUDevice, len(gpuDeviceInfos))} + for i := 0; i < len(gpuDeviceInfos); i++ { + availableCountMap[gpuDeviceInfos[i].ID] = gpuDeviceInfos[i].AvailableCount - gpuDeviceInfos[i].AllocatedCount + lockedGPUs.GPUDevices[i] = lockedGPUDevice{ID: gpuDeviceInfos[i].ID, Count: gpuDeviceInfos[i].AllocatedCount} } - lockedClusterGPUs := getLockedClusterGPUs(clusterID) - lockedClusterGPUs[vmName] = lockedVMGPUs{ - HostID: hostID, - GPUDeviceIDs: gpuDeviceIDs, - LockedAt: time.Now(), + lockedClusterGPUs := getLockedClusterGPUsWithoutLock(clusterID) + lockedCountMap := getLockedCountMapWithoutLock(lockedClusterGPUs) + + for gpuID, availableCount := range availableCountMap { + if lockedCount, ok := lockedCountMap[gpuID]; ok && lockedCount > availableCount { + return false + } } + lockedClusterGPUs[vmName] = lockedGPUs lockedGPUMap[clusterID] = lockedClusterGPUs return true } -// getLockedClusterGPUIDs returns the locked GPU devices of the specified cluster. -func getLockedClusterGPUIDs(clusterID string) sets.Set[string] { +func filterGPUDeviceInfosByLockGPUDevices(clusterID string, gpuDeviceInfos service.GPUDeviceInfos) service.GPUDeviceInfos { gpuLock.Lock() defer gpuLock.Unlock() - return getLockedClusterGPUIDsWithoutLock(clusterID) + lockedClusterGPUs := getLockedClusterGPUsWithoutLock(clusterID) + lockedCountMap := getLockedCountMapWithoutLock(lockedClusterGPUs) + + return gpuDeviceInfos.Filter(func(g *service.GPUDeviceInfo) bool { + if lockedCount, ok := lockedCountMap[g.ID]; ok && lockedCount >= g.AvailableCount { + return false + } + + return true + }) } func getGPUDevicesLockedByVM(clusterID, vmName string) *lockedVMGPUs { gpuLock.Lock() defer gpuLock.Unlock() - lockedClusterGPUs := getLockedClusterGPUs(clusterID) + lockedClusterGPUs := getLockedClusterGPUsWithoutLock(clusterID) if vmGPUs, ok := lockedClusterGPUs[vmName]; ok { if time.Now().Before(vmGPUs.LockedAt.Add(gpuLockTimeout)) { return &vmGPUs @@ -210,7 +244,7 @@ func unlockGPUDevicesLockedByVM(clusterID, vmName string) { gpuLock.Lock() defer gpuLock.Unlock() - lockedClusterGPUs := getLockedClusterGPUs(clusterID) + lockedClusterGPUs := getLockedClusterGPUsWithoutLock(clusterID) delete(lockedClusterGPUs, vmName) if len(lockedClusterGPUs) == 0 { @@ -220,25 +254,34 @@ func unlockGPUDevicesLockedByVM(clusterID, vmName string) { } } -func getLockedClusterGPUs(clusterID string) lockedClusterGPUMap { - if _, ok := lockedGPUMap[clusterID]; ok { - return lockedGPUMap[clusterID] +func getLockedClusterGPUsWithoutLock(clusterID string) lockedClusterGPUMap { + if _, ok := lockedGPUMap[clusterID]; !ok { + return make(map[string]lockedVMGPUs) } - return make(map[string]lockedVMGPUs) -} - -func getLockedClusterGPUIDsWithoutLock(clusterID string) sets.Set[string] { - gpuIDs := sets.Set[string]{} - - lockedClusterGPUs := getLockedClusterGPUs(clusterID) + lockedClusterGPUs := lockedGPUMap[clusterID] for vmName, lockedGPUs := range lockedClusterGPUs { - if time.Now().Before(lockedGPUs.LockedAt.Add(gpuLockTimeout)) { 
- gpuIDs.Insert(lockedGPUs.GPUDeviceIDs...) - } else { + if !time.Now().Before(lockedGPUs.LockedAt.Add(gpuLockTimeout)) { + // Delete expired data delete(lockedClusterGPUs, vmName) } } - return gpuIDs + return lockedClusterGPUs +} + +// getLockedCountMapWithoutLock counts and returns the number of locks for each GPU. +func getLockedCountMapWithoutLock(lockedClusterGPUs lockedClusterGPUMap) map[string]int32 { + lockedCountMap := make(map[string]int32) + for _, lockedGPUs := range lockedClusterGPUs { + for i := 0; i < len(lockedGPUs.GPUDevices); i++ { + if count, ok := lockedCountMap[lockedGPUs.GPUDevices[i].ID]; ok { + lockedCountMap[lockedGPUs.GPUDevices[i].ID] = count + lockedGPUs.GPUDevices[i].Count + } else { + lockedCountMap[lockedGPUs.GPUDevices[i].ID] = lockedGPUs.GPUDevices[i].Count + } + } + } + + return lockedCountMap } diff --git a/controllers/vm_limiter_test.go b/controllers/vm_limiter_test.go index 53e529db..6df3c1bf 100644 --- a/controllers/vm_limiter_test.go +++ b/controllers/vm_limiter_test.go @@ -24,6 +24,7 @@ import ( . "github.com/onsi/gomega" "github.com/smartxworks/cluster-api-provider-elf/pkg/config" + "github.com/smartxworks/cluster-api-provider-elf/pkg/service" "github.com/smartxworks/cluster-api-provider-elf/test/fake" ) @@ -137,47 +138,55 @@ var _ = Describe("Lock GPU devices for VM", func() { }) It("lockGPUDevicesForVM", func() { - gpuIDs := []string{gpuID} + lockedGPUID := fake.UUID() + gpuID = fake.UUID() + lockedGPUDeviceInfo := &service.GPUDeviceInfo{ID: lockedGPUID, AllocatedCount: 1, AvailableCount: 1} + gpuDeviceInfo := &service.GPUDeviceInfo{ID: gpuID, AllocatedCount: 0, AvailableCount: 1} + lockedGPUDeviceInfos := []*service.GPUDeviceInfo{lockedGPUDeviceInfo} + filteredGPUDeviceInfos := []*service.GPUDeviceInfo{gpuDeviceInfo, lockedGPUDeviceInfo} lockedVMGPUs := getGPUDevicesLockedByVM(clusterID, vmName) Expect(lockedVMGPUs).To(BeNil()) - lockedClusterGPUIDs := getLockedClusterGPUIDs(clusterID) - Expect(lockedClusterGPUIDs.Len()).To(Equal(0)) + filteredGPUs := filterGPUDeviceInfosByLockGPUDevices(clusterID, service.NewGPUDeviceInfos(filteredGPUDeviceInfos...)) + Expect(filteredGPUs).To(HaveLen(2)) - Expect(lockGPUDevicesForVM(clusterID, vmName, hostID, gpuIDs)).To(BeTrue()) + Expect(lockGPUDevicesForVM(clusterID, vmName, hostID, lockedGPUDeviceInfos)).To(BeTrue()) lockedVMGPUs = getGPUDevicesLockedByVM(clusterID, vmName) Expect(lockedVMGPUs.HostID).To(Equal(hostID)) - Expect(lockedVMGPUs.GPUDeviceIDs).To(Equal(gpuIDs)) + Expect(lockedVMGPUs.GPUDevices).To(HaveLen(1)) + Expect(lockedVMGPUs.GPUDevices[0].ID).To(Equal(lockedGPUID)) + Expect(lockedVMGPUs.GPUDevices[0].Count).To(Equal(int32(1))) Expect(lockedVMGPUs.LockedAt.Unix()).To(Equal(time.Now().Unix())) - lockedClusterGPUIDs = getLockedClusterGPUIDs(clusterID) - Expect(lockedClusterGPUIDs.Len()).To(Equal(1)) - Expect(lockedClusterGPUIDs.Has(gpuID)).To(BeTrue()) + filteredGPUs = filterGPUDeviceInfosByLockGPUDevices(clusterID, service.NewGPUDeviceInfos(filteredGPUDeviceInfos...)) + Expect(filteredGPUs).To(HaveLen(1)) + Expect(filteredGPUs.Contains(gpuDeviceInfo.ID)).To(BeTrue()) - Expect(lockGPUDevicesForVM(clusterID, vmName, hostID, gpuIDs)).To(BeFalse()) + Expect(lockGPUDevicesForVM(clusterID, vmName, hostID, lockedGPUDeviceInfos)).To(BeFalse()) + lockedVMGPUs = getGPUDevicesLockedByVM(clusterID, vmName) + Expect(lockedVMGPUs.GPUDevices).To(HaveLen(1)) unlockGPUDevicesLockedByVM(clusterID, vmName) lockedVMGPUs = getGPUDevicesLockedByVM(clusterID, vmName) Expect(lockedVMGPUs).To(BeNil()) - 
lockedClusterGPUIDs = getLockedClusterGPUIDs(clusterID) - Expect(lockedClusterGPUIDs.Len()).To(Equal(0)) + filteredGPUs = filterGPUDeviceInfosByLockGPUDevices(clusterID, service.NewGPUDeviceInfos(filteredGPUDeviceInfos...)) + Expect(filteredGPUs).To(HaveLen(2)) - Expect(lockGPUDevicesForVM(clusterID, vmName, hostID, gpuIDs)).To(BeTrue()) + Expect(lockGPUDevicesForVM(clusterID, vmName, hostID, lockedGPUDeviceInfos)).To(BeTrue()) vmGPUs := lockedGPUMap[clusterID][vmName] vmGPUs.LockedAt = vmGPUs.LockedAt.Add(-gpuLockTimeout) lockedGPUMap[clusterID][vmName] = vmGPUs lockedVMGPUs = getGPUDevicesLockedByVM(clusterID, vmName) Expect(lockedVMGPUs).To(BeNil()) - lockedClusterGPUIDs = getLockedClusterGPUIDs(clusterID) - Expect(lockedClusterGPUIDs.Len()).To(Equal(0)) - - Expect(lockGPUDevicesForVM(clusterID, vmName, hostID, gpuIDs)).To(BeTrue()) - vmGPUs = lockedGPUMap[clusterID][vmName] - vmGPUs.LockedAt = vmGPUs.LockedAt.Add(-gpuLockTimeout) - lockedGPUMap[clusterID][vmName] = vmGPUs - lockedClusterGPUIDs = getLockedClusterGPUIDs(clusterID) - Expect(lockedClusterGPUIDs.Len()).To(Equal(0)) - lockedVMGPUs = getGPUDevicesLockedByVM(clusterID, vmName) - Expect(lockedVMGPUs).To(BeNil()) + filteredGPUs = filterGPUDeviceInfosByLockGPUDevices(clusterID, service.NewGPUDeviceInfos(filteredGPUDeviceInfos...)) + Expect(filteredGPUs).To(HaveLen(2)) + + lockedGPUDeviceInfo.AvailableCount = 2 + Expect(lockGPUDevicesForVM(clusterID, vmName, hostID, lockedGPUDeviceInfos)).To(BeTrue()) + Expect(lockGPUDevicesForVM(clusterID, fake.UUID(), hostID, lockedGPUDeviceInfos)).To(BeTrue()) + Expect(lockGPUDevicesForVM(clusterID, fake.UUID(), hostID, lockedGPUDeviceInfos)).To(BeFalse()) + Expect(lockedGPUMap[clusterID]).To(HaveLen(2)) + filteredGPUs = filterGPUDeviceInfosByLockGPUDevices(clusterID, service.NewGPUDeviceInfos(filteredGPUDeviceInfos...)) + Expect(filteredGPUs).To(HaveLen(1)) }) }) diff --git a/go.mod b/go.mod index 53023d2d..a52131c3 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,7 @@ require ( github.com/onsi/gomega v1.27.10 github.com/patrickmn/go-cache v2.1.0+incompatible github.com/pkg/errors v0.9.1 - github.com/smartxworks/cloudtower-go-sdk/v2 v2.11.1-rc-2023-09-14 + github.com/smartxworks/cloudtower-go-sdk/v2 v2.12.1-0.20231102021857-ae16239443e2 golang.org/x/mod v0.12.0 k8s.io/api v0.27.2 k8s.io/apiextensions-apiserver v0.27.2 diff --git a/go.sum b/go.sum index d21438c3..ae9e834e 100644 --- a/go.sum +++ b/go.sum @@ -547,8 +547,8 @@ github.com/sirupsen/logrus v1.4.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPx github.com/sirupsen/logrus v1.4.1/go.mod h1:ni0Sbl8bgC9z8RoU9G6nDWqqs/fq4eDPysMBDgk/93Q= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= -github.com/smartxworks/cloudtower-go-sdk/v2 v2.11.1-rc-2023-09-14 h1:CHJLqIwjPHMKpnlR7wXmKUr9n2Ba7KhR5LG63S4TqQY= -github.com/smartxworks/cloudtower-go-sdk/v2 v2.11.1-rc-2023-09-14/go.mod h1:X6R9+L438SMnLJXykSCV3fJ+AZul0hlyjITsZgrSRtM= +github.com/smartxworks/cloudtower-go-sdk/v2 v2.12.1-0.20231102021857-ae16239443e2 h1:UXS2xA1dmSdR5B9BPmArlKHDAmpjGytY7XVPbVadBqU= +github.com/smartxworks/cloudtower-go-sdk/v2 v2.12.1-0.20231102021857-ae16239443e2/go.mod h1:X6R9+L438SMnLJXykSCV3fJ+AZul0hlyjITsZgrSRtM= github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= 
github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= diff --git a/pkg/service/collections.go b/pkg/service/collections.go index bf9d7cef..7d25ec7b 100644 --- a/pkg/service/collections.go +++ b/pkg/service/collections.go @@ -168,3 +168,86 @@ func (s Hosts) IDs() []string { } return res } + +// GPUDeviceInfos is a set of GPUDeviceInfos. +type GPUDeviceInfos map[string]*GPUDeviceInfo + +// NewGPUDeviceInfos creates a GPUDeviceInfos. from a list of values. +func NewGPUDeviceInfos(gpuDeviceInfo ...*GPUDeviceInfo) GPUDeviceInfos { + ss := make(GPUDeviceInfos, len(gpuDeviceInfo)) + ss.Insert(gpuDeviceInfo...) + return ss +} + +func (s GPUDeviceInfos) Insert(gpuDeviceInfos ...*GPUDeviceInfo) { + for i := range gpuDeviceInfos { + if gpuDeviceInfos[i] != nil { + g := gpuDeviceInfos[i] + s[g.ID] = g + } + } +} + +// UnsortedList returns the slice with contents in random order. +func (s GPUDeviceInfos) UnsortedList() []*GPUDeviceInfo { + res := make([]*GPUDeviceInfo, 0, len(s)) + for _, value := range s { + res = append(res, value) + } + return res +} + +// Get returns a GPUDeviceInfo of the specified gpuID. +func (s GPUDeviceInfos) Get(gpuID string) *GPUDeviceInfo { + if gpuDeviceInfo, ok := s[gpuID]; ok { + return gpuDeviceInfo + } + return nil +} + +func (s GPUDeviceInfos) Contains(gpuID string) bool { + _, ok := s[gpuID] + return ok +} + +func (s GPUDeviceInfos) Len() int { + return len(s) +} + +func (s GPUDeviceInfos) Iterate(fn func(*GPUDeviceInfo)) { + for _, g := range s { + fn(g) + } +} + +// Filter returns a GPUDeviceInfos containing only the GPUDeviceInfos that match all of the given GPUDeviceInfoFilters. +func (s GPUDeviceInfos) Filter(filters ...GPUDeviceInfoFilterFunc) GPUDeviceInfos { + return newFilteredGPUDeviceInfoCollection(GPUDeviceInfoFilterAnd(filters...), s.UnsortedList()...) +} + +// newFilteredGPUDeviceInfoCollection creates a GPUDeviceInfos from a filtered list of values. +func newFilteredGPUDeviceInfoCollection(filter GPUDeviceInfoFilterFunc, gpuDeviceInfos ...*GPUDeviceInfo) GPUDeviceInfos { + ss := make(GPUDeviceInfos, len(gpuDeviceInfos)) + for i := range gpuDeviceInfos { + g := gpuDeviceInfos[i] + if filter(g) { + ss.Insert(g) + } + } + return ss +} + +// GPUDeviceInfoFilterFunc is the functon definition for a filter. +type GPUDeviceInfoFilterFunc func(*GPUDeviceInfo) bool + +// GPUDeviceInfoFilterAnd returns a filter that returns true if all of the given filters returns true. 
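// Combined with GPUDeviceInfos.Filter above, a hypothetical composite filter
//
//	infos.Filter(
//		func(g *GPUDeviceInfo) bool { return g.AvailableCount > 0 },
//		func(g *GPUDeviceInfo) bool { return g.Model == "A16" },
//	)
//
// keeps only the entries for which every predicate returns true.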
+func GPUDeviceInfoFilterAnd(filters ...GPUDeviceInfoFilterFunc) GPUDeviceInfoFilterFunc { + return func(g *GPUDeviceInfo) bool { + for _, f := range filters { + if !f(g) { + return false + } + } + return true + } +} diff --git a/pkg/service/collections_test.go b/pkg/service/collections_test.go index 7fe4fa0f..ab6c55d4 100644 --- a/pkg/service/collections_test.go +++ b/pkg/service/collections_test.go @@ -81,3 +81,36 @@ func TestHostCollection(t *testing.T) { g.Expect(NewHosts(host1, host2).Difference(NewHosts(host2)).Contains(*host1.ID)).To(gomega.BeTrue()) }) } + +func TestGPUDeviceInfoCollection(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + t.Run("Find", func(t *testing.T) { + gpuDeviceInfo1 := &GPUDeviceInfo{ID: "gpu1"} + gpuDeviceInfo2 := &GPUDeviceInfo{ID: "gpu2"} + + gpuDeviceInfos := NewGPUDeviceInfos() + g.Expect(gpuDeviceInfos.Get("404")).To(gomega.BeNil()) + g.Expect(gpuDeviceInfos.Len()).To(gomega.Equal(0)) + + gpuDeviceInfos.Insert(gpuDeviceInfo1) + g.Expect(gpuDeviceInfos.Contains("gpu1")).To(gomega.BeTrue()) + g.Expect(gpuDeviceInfos.Get("gpu1")).To(gomega.Equal(gpuDeviceInfo1)) + g.Expect(gpuDeviceInfos.UnsortedList()).To(gomega.Equal([]*GPUDeviceInfo{gpuDeviceInfo1})) + count := 0 + gpuID := gpuDeviceInfo1.ID + gpuDeviceInfos.Iterate(func(g *GPUDeviceInfo) { + count += 1 + gpuID = g.ID + }) + g.Expect(count).To(gomega.Equal(1)) + g.Expect(gpuID).To(gomega.Equal(gpuDeviceInfo1.ID)) + + gpuDeviceInfos = NewGPUDeviceInfos(gpuDeviceInfo1, gpuDeviceInfo2) + filteredGPUDeviceInfos := gpuDeviceInfos.Filter(func(g *GPUDeviceInfo) bool { + return g.ID != gpuDeviceInfo1.ID + }) + g.Expect(filteredGPUDeviceInfos.Len()).To(gomega.Equal(1)) + g.Expect(filteredGPUDeviceInfos.Contains(gpuDeviceInfo2.ID)).To(gomega.BeTrue()) + }) +} diff --git a/pkg/service/mock_services/vm_mock.go b/pkg/service/mock_services/vm_mock.go index 8e30db92..fa7eedb8 100644 --- a/pkg/service/mock_services/vm_mock.go +++ b/pkg/service/mock_services/vm_mock.go @@ -39,18 +39,18 @@ func (m *MockVMService) EXPECT() *MockVMServiceMockRecorder { } // AddGPUDevices mocks base method. -func (m *MockVMService) AddGPUDevices(id string, gpus []*models.VMGpuOperationParams) (*models.Task, error) { +func (m *MockVMService) AddGPUDevices(id string, gpuDeviceInfo []*service.GPUDeviceInfo) (*models.Task, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "AddGPUDevices", id, gpus) + ret := m.ctrl.Call(m, "AddGPUDevices", id, gpuDeviceInfo) ret0, _ := ret[0].(*models.Task) ret1, _ := ret[1].(error) return ret0, ret1 } // AddGPUDevices indicates an expected call of AddGPUDevices. -func (mr *MockVMServiceMockRecorder) AddGPUDevices(id, gpus interface{}) *gomock.Call { +func (mr *MockVMServiceMockRecorder) AddGPUDevices(id, gpuDeviceInfo interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddGPUDevices", reflect.TypeOf((*MockVMService)(nil).AddGPUDevices), id, gpus) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddGPUDevices", reflect.TypeOf((*MockVMService)(nil).AddGPUDevices), id, gpuDeviceInfo) } // AddLabelsToVM mocks base method. @@ -84,7 +84,7 @@ func (mr *MockVMServiceMockRecorder) AddVMsToPlacementGroup(placementGroup, vmID } // Clone mocks base method. 
-func (m *MockVMService) Clone(elfCluster *v1beta1.ElfCluster, elfMachine *v1beta1.ElfMachine, bootstrapData, host string, machineGPUDevices []*models.GpuDevice) (*models.WithTaskVM, error) { +func (m *MockVMService) Clone(elfCluster *v1beta1.ElfCluster, elfMachine *v1beta1.ElfMachine, bootstrapData, host string, machineGPUDevices []*service.GPUDeviceInfo) (*models.WithTaskVM, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "Clone", elfCluster, elfMachine, bootstrapData, host, machineGPUDevices) ret0, _ := ret[0].(*models.WithTaskVM) @@ -188,19 +188,34 @@ func (mr *MockVMServiceMockRecorder) FindByIDs(ids interface{}) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FindByIDs", reflect.TypeOf((*MockVMService)(nil).FindByIDs), ids) } +// FindGPUDeviceInfos mocks base method. +func (m *MockVMService) FindGPUDeviceInfos(gpuIDs []string) (service.GPUDeviceInfos, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "FindGPUDeviceInfos", gpuIDs) + ret0, _ := ret[0].(service.GPUDeviceInfos) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// FindGPUDeviceInfos indicates an expected call of FindGPUDeviceInfos. +func (mr *MockVMServiceMockRecorder) FindGPUDeviceInfos(gpuIDs interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FindGPUDeviceInfos", reflect.TypeOf((*MockVMService)(nil).FindGPUDeviceInfos), gpuIDs) +} + // FindGPUDevicesByHostIDs mocks base method. -func (m *MockVMService) FindGPUDevicesByHostIDs(hostIDs []string) ([]*models.GpuDevice, error) { +func (m *MockVMService) FindGPUDevicesByHostIDs(hostIDs []string, gpuDeviceUsage models.GpuDeviceUsage) ([]*models.GpuDevice, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "FindGPUDevicesByHostIDs", hostIDs) + ret := m.ctrl.Call(m, "FindGPUDevicesByHostIDs", hostIDs, gpuDeviceUsage) ret0, _ := ret[0].([]*models.GpuDevice) ret1, _ := ret[1].(error) return ret0, ret1 } // FindGPUDevicesByHostIDs indicates an expected call of FindGPUDevicesByHostIDs. -func (mr *MockVMServiceMockRecorder) FindGPUDevicesByHostIDs(hostIDs interface{}) *gomock.Call { +func (mr *MockVMServiceMockRecorder) FindGPUDevicesByHostIDs(hostIDs, gpuDeviceUsage interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FindGPUDevicesByHostIDs", reflect.TypeOf((*MockVMService)(nil).FindGPUDevicesByHostIDs), hostIDs) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FindGPUDevicesByHostIDs", reflect.TypeOf((*MockVMService)(nil).FindGPUDevicesByHostIDs), hostIDs, gpuDeviceUsage) } // FindGPUDevicesByIDs mocks base method. diff --git a/pkg/service/types.go b/pkg/service/types.go new file mode 100644 index 00000000..906abf14 --- /dev/null +++ b/pkg/service/types.go @@ -0,0 +1,60 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package service + +import "fmt" + +type GPUDeviceVM struct { + ID string `json:"id"` + Name string `json:"name"` + AllocatedCount int32 `json:"allocatedCount"` +} + +type GPUDeviceInfo struct { + ID string `json:"id"` + HostID string `json:"hostId"` + Model string `json:"model"` + VGPUType string `json:"vGpuType"` + // AllocatedCount the number that has been allocated. + // For GPU devices, can be 0 or larger than 0. + // For vGPU devices, can larger than vgpuInstanceNum. + AllocatedCount int32 `json:"allocatedCount"` + // AvailableCount is the number of GPU that can be allocated. + // For GPU devices, can be 0 or 1. + // For vGPU devices, can be 0 - vgpuInstanceNum. + AvailableCount int32 `json:"availableCount"` + // VMs(including STOPPED) allocated to the current GPU. + VMs []GPUDeviceVM `json:"vms"` +} + +func (g *GPUDeviceInfo) GetVMCount() int { + return len(g.VMs) +} + +func (g *GPUDeviceInfo) ContainsVM(vm string) bool { + for i := 0; i < len(g.VMs); i++ { + if g.VMs[i].ID == vm || g.VMs[i].Name == vm { + return true + } + } + + return false +} + +func (g *GPUDeviceInfo) String() string { + return fmt.Sprintf("{id:%s, hostId:%s, model:%s, vGPUType:%s, allocatedCount:%d, availableCount:%d}", g.ID, g.HostID, g.Model, g.VGPUType, g.AllocatedCount, g.AvailableCount) +} diff --git a/pkg/service/util.go b/pkg/service/util.go index f6794562..7b1e598e 100644 --- a/pkg/service/util.go +++ b/pkg/service/util.go @@ -201,24 +201,134 @@ func IsPlacementGroupTask(task *models.Task) bool { return strings.Contains(GetTowerString(task.Description), "VM placement group") // Update VM placement group } -// GPUCanBeUsedForVM returns whether the virtual machine can use the specified GPU. -func GPUCanBeUsedForVM(gpuDevice *models.GpuDevice, vm string) bool { - if len(gpuDevice.Vms) == 0 || - *gpuDevice.Vms[0].ID == vm || - *gpuDevice.Vms[0].Name == vm { - return true +// HasGPUsCanNotBeUsedForVM returns whether the specified GPUs contains GPU +// that cannot be used by the specified VM. +func HasGPUsCanNotBeUsedForVM(gpuDeviceInfos GPUDeviceInfos, elfMachine *infrav1.ElfMachine) bool { + if elfMachine.RequiresGPUDevices() { + for gpuID := range gpuDeviceInfos { + gpuInfo := gpuDeviceInfos[gpuID] + if gpuInfo.GetVMCount() > 1 || (gpuInfo.GetVMCount() == 1 && !gpuInfo.ContainsVM(elfMachine.Name)) { + return true + } + } + + return false + } + + gpuCountUsedByVM := 0 + availableCountMap := make(map[string]int32) + for gpuID := range gpuDeviceInfos { + gpuInfo := gpuDeviceInfos[gpuID] + + if gpuInfo.ContainsVM(elfMachine.Name) { + gpuCountUsedByVM += 1 + } + + if count, ok := availableCountMap[gpuInfo.ID]; ok { + availableCountMap[gpuInfo.VGPUType] = count + gpuInfo.AvailableCount + } else { + availableCountMap[gpuInfo.VGPUType] = gpuInfo.AvailableCount + } + } + + if gpuCountUsedByVM > 0 { + return gpuCountUsedByVM != gpuDeviceInfos.Len() + } + + vGPUDevices := elfMachine.Spec.VGPUDevices + for i := 0; i < len(vGPUDevices); i++ { + if count, ok := availableCountMap[vGPUDevices[i].Type]; !ok || vGPUDevices[i].Count > count { + return true + } } return false } -func FilterOutGPUsCanNotBeUsedForVM(gpuDevices []*models.GpuDevice, vm string) []*models.GpuDevice { - var gpus []*models.GpuDevice +// AggregateUnusedGPUDevicesToGPUDeviceInfos selects the GPU device +// that gpuDeviceInfos does not have from the specified GPU devices and add to it. +// It should be used in conjunction with FindGPUDeviceInfos. 
+//
+// FindGPUDeviceInfos only returns the GPUs that have been used by virtual machines,
+// so we need to aggregate the unused GPUs as well.
+func AggregateUnusedGPUDevicesToGPUDeviceInfos(gpuDeviceInfos GPUDeviceInfos, gpuDevices []*models.GpuDevice) {
 	for i := 0; i < len(gpuDevices); i++ {
-		if GPUCanBeUsedForVM(gpuDevices[i], vm) {
-			gpus = append(gpus, gpuDevices[i])
+		if !gpuDeviceInfos.Contains(*gpuDevices[i].ID) {
+			gpuDeviceInfo := &GPUDeviceInfo{
+				ID:       *gpuDevices[i].ID,
+				HostID:   *gpuDevices[i].Host.ID,
+				Model:    *gpuDevices[i].Model,
+				VGPUType: *gpuDevices[i].UserVgpuTypeName,
+				// Not yet allocated to a VM, so the value is 0.
+				AllocatedCount: 0,
+				// Not yet allocated to a VM: the value is 1 for GPU passthrough
+				// and availableVgpusNum for vGPU.
+				AvailableCount: 1,
+			}
+			if *gpuDevices[i].UserUsage == models.GpuDeviceUsageVGPU {
+				gpuDeviceInfo.AvailableCount = *gpuDevices[i].AvailableVgpusNum
+			}
+
+			gpuDeviceInfos.Insert(gpuDeviceInfo)
+		}
 	}
+}
+
+// ConvertVMGpuInfosToGPUDeviceInfos converts Tower's VMGpuInfo type to GPUDeviceInfos.
+// It should be used in conjunction with FindGPUDeviceInfos.
+//
+// Tower does not provide an API to obtain the details of the VMs that a GPU device
+// has been allocated to, so we need to derive GPUDeviceInfos in reverse from VMGpuInfo.
+func ConvertVMGpuInfosToGPUDeviceInfos(vmGPUInfos []*models.VMGpuInfo) GPUDeviceInfos {
+	gpuDeviceInfos := NewGPUDeviceInfos()
+	for i := 0; i < len(vmGPUInfos); i++ {
+		gpuDevices := vmGPUInfos[i].GpuDevices
+		for j := 0; j < len(gpuDevices); j++ {
+			allocatedCount := int32(1)
+			if *gpuDevices[j].UserUsage == models.GpuDeviceUsageVGPU {
+				allocatedCount = *gpuDevices[j].VgpuInstanceOnVMNum
+			}
+
+			gpuDeviceVM := GPUDeviceVM{
+				ID:             *vmGPUInfos[i].ID,
+				Name:           *vmGPUInfos[i].Name,
+				AllocatedCount: allocatedCount,
+			}
+
+			if gpuDeviceInfos.Contains(*gpuDevices[j].ID) {
+				gpuDeviceInfo := gpuDeviceInfos.Get(*gpuDevices[j].ID)
+				gpuDeviceInfo.VMs = append(gpuDeviceInfo.VMs, gpuDeviceVM)
+				gpuDeviceInfo.AllocatedCount += gpuDeviceVM.AllocatedCount
+				if *gpuDevices[j].UserUsage == models.GpuDeviceUsageVGPU {
+					gpuDeviceInfo.AvailableCount = calGPUAvailableCount(gpuDeviceInfo.AvailableCount, gpuDeviceVM.AllocatedCount)
+				}
+			} else {
+				availableCount := int32(0)
+				if *gpuDevices[j].UserUsage == models.GpuDeviceUsageVGPU {
+					availableCount = calGPUAvailableCount(*gpuDevices[j].VgpuInstanceNum, gpuDeviceVM.AllocatedCount)
+				}
+
+				gpuDeviceInfos.Insert(&GPUDeviceInfo{
+					ID:             *gpuDevices[j].ID,
+					HostID:         *gpuDevices[j].Host.ID,
+					Model:          *gpuDevices[j].Model,
+					VGPUType:       *gpuDevices[j].UserVgpuTypeName,
+					AllocatedCount: gpuDeviceVM.AllocatedCount,
+					AvailableCount: availableCount,
+					VMs:            []GPUDeviceVM{gpuDeviceVM},
+				})
+			}
+		}
+	}
+
+	return gpuDeviceInfos
+}
+
+func calGPUAvailableCount(availableCount, allocatedCount int32) int32 {
+	count := availableCount - allocatedCount
+	if count < 0 {
+		count = 0
+	}
 
-	return gpus
+	return count
 }
diff --git a/pkg/service/util_test.go b/pkg/service/util_test.go
index 98e167e8..9c2412fc 100644
--- a/pkg/service/util_test.go
+++ b/pkg/service/util_test.go
@@ -22,6 +22,8 @@ import (
 	"github.com/onsi/gomega"
 	"github.com/smartxworks/cloudtower-go-sdk/v2/models"
 	"k8s.io/utils/pointer"
+
+	infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1"
 )
 
 func TestIsAvailableHost(t *testing.T) {
@@ -101,26 +103,190 @@ func TestIsAvailableHost(t *testing.T) {
 	})
 }
 
-func TestGPUCanBeUsedForVM(t *testing.T) {
+// func TestGPUCanBeUsedForVM(t *testing.T) {
+// 	g := gomega.NewGomegaWithT(t)
+
+// 	t.Run("should return false when GPU can not be used for VM", func(t *testing.T) {
+// 		g.Expect(GPUCanBeUsedForVM(&models.GpuDevice{Vms: []*models.NestedVM{{ID: TowerString("id2"), Name: TowerString("vm2")}, {ID: TowerString("id"), Name: TowerString("vm")}}}, "vm")).To(gomega.BeFalse())
+// 	})
+
+// 	t.Run("should return false when GPU can not be used for VM", func(t *testing.T) {
+// 		g.Expect(GPUCanBeUsedForVM(&models.GpuDevice{}, "vm")).To(gomega.BeTrue())
+// 		g.Expect(GPUCanBeUsedForVM(&models.GpuDevice{Vms: []*models.NestedVM{{ID: TowerString("vm")}}}, "vm")).To(gomega.BeTrue())
+// 		g.Expect(GPUCanBeUsedForVM(&models.GpuDevice{Vms: []*models.NestedVM{{ID: TowerString("id"), Name: TowerString("vm")}}}, "vm")).To(gomega.BeTrue())
+// 	})
+// }
+
+// func TestFilterOutGPUsCanNotBeUsedForVM(t *testing.T) {
+// 	g := gomega.NewGomegaWithT(t)
+
+// 	t.Run("should filter GPUs", func(t *testing.T) {
+// 		g.Expect(FilterOutGPUsCanNotBeUsedForVM([]*models.GpuDevice{}, "vm")).To(gomega.BeEmpty())
+// 		g.Expect(FilterOutGPUsCanNotBeUsedForVM([]*models.GpuDevice{{Vms: []*models.NestedVM{{ID: TowerString("id2"), Name: TowerString("vm2")}}}}, "vm")).To(gomega.BeEmpty())
+// 		g.Expect(FilterOutGPUsCanNotBeUsedForVM([]*models.GpuDevice{{Vms: []*models.NestedVM{{ID: TowerString("id"), Name: TowerString("vm")}}}}, "vm")).To(gomega.HaveLen(1))
+// 	})
+// }
+
+func TestHasGPUsCanNotBeUsedForVM(t *testing.T) {
 	g := gomega.NewGomegaWithT(t)
+	elfMachine := &infrav1.ElfMachine{}
+	elfMachine.Name = "test"
+	elfMachine.Spec.GPUDevices = append(elfMachine.Spec.GPUDevices, infrav1.GPUPassthroughDeviceSpec{Model: "A16", Count: 1})
 
-	t.Run("should return false when GPU can not be used for VM", func(t *testing.T) {
-		g.Expect(GPUCanBeUsedForVM(&models.GpuDevice{Vms: []*models.NestedVM{{ID: TowerString("id2"), Name: TowerString("vm2")}, {ID: TowerString("id"), Name: TowerString("vm")}}}, "vm")).To(gomega.BeFalse())
+	t.Run("GPU", func(t *testing.T) {
+		g.Expect(HasGPUsCanNotBeUsedForVM(NewGPUDeviceInfos(), elfMachine)).To(gomega.BeFalse())
+		g.Expect(HasGPUsCanNotBeUsedForVM(NewGPUDeviceInfos(&GPUDeviceInfo{
+			VMs: []GPUDeviceVM{{ID: "vm1", Name: elfMachine.Name}},
+		}), elfMachine)).To(gomega.BeFalse())
+		g.Expect(HasGPUsCanNotBeUsedForVM(NewGPUDeviceInfos(&GPUDeviceInfo{
+			VMs: []GPUDeviceVM{{ID: "vm1", Name: "vm1"}},
+		}), elfMachine)).To(gomega.BeTrue())
+		g.Expect(HasGPUsCanNotBeUsedForVM(NewGPUDeviceInfos(&GPUDeviceInfo{
+			VMs: []GPUDeviceVM{
+				{ID: "vm1", Name: "vm1"},
+				{ID: "vm2", Name: elfMachine.Name},
+			},
+		}), elfMachine)).To(gomega.BeTrue())
 	})
+}
+
+func TestAggregateUnusedGPUDevicesToGPUDeviceInfos(t *testing.T) {
+	g := gomega.NewGomegaWithT(t)
+	host := &models.NestedHost{ID: TowerString("host1")}
 
-	t.Run("should return false when GPU can not be used for VM", func(t *testing.T) {
-		g.Expect(GPUCanBeUsedForVM(&models.GpuDevice{}, "vm")).To(gomega.BeTrue())
-		g.Expect(GPUCanBeUsedForVM(&models.GpuDevice{Vms: []*models.NestedVM{{ID: TowerString("vm")}}}, "vm")).To(gomega.BeTrue())
-		g.Expect(GPUCanBeUsedForVM(&models.GpuDevice{Vms: []*models.NestedVM{{ID: TowerString("id"), Name: TowerString("vm")}}}, "vm")).To(gomega.BeTrue())
+	t.Run("GPU", func(t *testing.T) {
+		gpuDevice := &models.GpuDevice{ID: TowerString("gpu1"), Host: host, Model: TowerString("A16"), UserUsage: models.NewGpuDeviceUsage(models.GpuDeviceUsagePASSTHROUGH), UserVgpuTypeName: TowerString("")}
+		gpuDevices := []*models.GpuDevice{gpuDevice}
+		gpuDeviceInfos := NewGPUDeviceInfos()
+
+		AggregateUnusedGPUDevicesToGPUDeviceInfos(gpuDeviceInfos, gpuDevices)
+		g.Expect(gpuDeviceInfos.Len()).To(gomega.Equal(1))
+		g.Expect(*gpuDeviceInfos.Get(*gpuDevice.ID)).To(gomega.Equal(GPUDeviceInfo{
+			ID:             *gpuDevice.ID,
+			HostID:         *gpuDevice.Host.ID,
+			Model:          *gpuDevice.Model,
+			VGPUType:       *gpuDevice.UserVgpuTypeName,
+			AllocatedCount: 0,
+			AvailableCount: 1,
+		}))
+
+		AggregateUnusedGPUDevicesToGPUDeviceInfos(gpuDeviceInfos, gpuDevices)
+		g.Expect(gpuDeviceInfos.Len()).To(gomega.Equal(1))
+		g.Expect(gpuDeviceInfos.Contains(*gpuDevice.ID)).To(gomega.BeTrue())
+	})
+
+	t.Run("vGPU", func(t *testing.T) {
+		gpuDevice := &models.GpuDevice{ID: TowerString("gpu1"), Host: host, Model: TowerString("V100"), UserUsage: models.NewGpuDeviceUsage(models.GpuDeviceUsageVGPU), UserVgpuTypeName: TowerString(""), AvailableVgpusNum: TowerInt32(6)}
+		gpuDevices := []*models.GpuDevice{gpuDevice}
+		gpuDeviceInfos := NewGPUDeviceInfos()
+
+		AggregateUnusedGPUDevicesToGPUDeviceInfos(gpuDeviceInfos, gpuDevices)
+		g.Expect(gpuDeviceInfos.Len()).To(gomega.Equal(1))
+		g.Expect(*gpuDeviceInfos.Get(*gpuDevice.ID)).To(gomega.Equal(GPUDeviceInfo{
+			ID:             *gpuDevice.ID,
+			HostID:         *gpuDevice.Host.ID,
+			Model:          *gpuDevice.Model,
+			VGPUType:       *gpuDevice.UserVgpuTypeName,
+			AllocatedCount: 0,
+			AvailableCount: *gpuDevice.AvailableVgpusNum,
+		}))
+
+		AggregateUnusedGPUDevicesToGPUDeviceInfos(gpuDeviceInfos, gpuDevices)
+		g.Expect(gpuDeviceInfos.Len()).To(gomega.Equal(1))
+		g.Expect(gpuDeviceInfos.Contains(*gpuDevice.ID)).To(gomega.BeTrue())
 	})
 }
 
-func TestFilterOutGPUsCanNotBeUsedForVM(t *testing.T) {
+func TestConvertVMGpuInfosToGPUDeviceInfos(t *testing.T) {
 	g := gomega.NewGomegaWithT(t)
+	host := &models.NestedHost{ID: TowerString("host1")}
+
+	t.Run("GPU", func(t *testing.T) {
+		vmGpuDetail := &models.VMGpuDetail{ID: TowerString("gpu1"), Host: host, Model: TowerString("A16"), UserUsage: models.NewGpuDeviceUsage(models.GpuDeviceUsagePASSTHROUGH), UserVgpuTypeName: TowerString("")}
+		vmGpuInfo1 := &models.VMGpuInfo{ID: TowerString("1"), Name: TowerString("vm1"), GpuDevices: []*models.VMGpuDetail{vmGpuDetail}}
+		vmGpuInfo2 := &models.VMGpuInfo{ID: TowerString("2"), Name: TowerString("vm2"), GpuDevices: []*models.VMGpuDetail{vmGpuDetail}}
+
+		g.Expect(ConvertVMGpuInfosToGPUDeviceInfos(
+			[]*models.VMGpuInfo{},
+		)).To(gomega.BeEmpty())
+
+		g.Expect(ConvertVMGpuInfosToGPUDeviceInfos(
+			[]*models.VMGpuInfo{vmGpuInfo1},
+		)).To(gomega.Equal(NewGPUDeviceInfos(&GPUDeviceInfo{
+			ID:             *vmGpuDetail.ID,
+			HostID:         *vmGpuDetail.Host.ID,
+			Model:          *vmGpuDetail.Model,
+			VGPUType:       *vmGpuDetail.UserVgpuTypeName,
+			AllocatedCount: 1,
+			AvailableCount: 0,
+			VMs:            []GPUDeviceVM{{ID: *vmGpuInfo1.ID, Name: *vmGpuInfo1.Name, AllocatedCount: 1}},
+		})))
+
+		g.Expect(ConvertVMGpuInfosToGPUDeviceInfos(
+			[]*models.VMGpuInfo{vmGpuInfo1, vmGpuInfo2},
+		)).To(gomega.Equal(NewGPUDeviceInfos(&GPUDeviceInfo{
+			ID:             *vmGpuDetail.ID,
+			HostID:         *vmGpuDetail.Host.ID,
+			Model:          *vmGpuDetail.Model,
+			VGPUType:       *vmGpuDetail.UserVgpuTypeName,
+			AllocatedCount: 2,
+			AvailableCount: 0,
+			VMs: []GPUDeviceVM{
+				{ID: *vmGpuInfo1.ID, Name: *vmGpuInfo1.Name, AllocatedCount: 1},
+				{ID: *vmGpuInfo2.ID, Name: *vmGpuInfo2.Name, AllocatedCount: 1},
+			},
+		})))
+	})
+
+	t.Run("vGPU", func(t *testing.T) {
+		vmGpuDetail1 := &models.VMGpuDetail{ID: TowerString("gpu1"), Host: host, Model: TowerString("V100"), UserUsage: models.NewGpuDeviceUsage(models.GpuDeviceUsageVGPU), UserVgpuTypeName: TowerString("GRID V100-4C"), VgpuInstanceNum: TowerInt32(3), VgpuInstanceOnVMNum: TowerInt32(1)}
+		vmGpuDetail2 := &models.VMGpuDetail{ID: TowerString("gpu1"), Host: host, Model: TowerString("V100"), UserUsage: models.NewGpuDeviceUsage(models.GpuDeviceUsageVGPU), UserVgpuTypeName: TowerString("GRID V100-4C"), VgpuInstanceNum: TowerInt32(3), VgpuInstanceOnVMNum: TowerInt32(2)}
+		vmGpuDetail3 := &models.VMGpuDetail{ID: TowerString("gpu1"), Host: host, Model: TowerString("V100"), UserUsage: models.NewGpuDeviceUsage(models.GpuDeviceUsageVGPU), UserVgpuTypeName: TowerString("GRID V100-4C"), VgpuInstanceNum: TowerInt32(3), VgpuInstanceOnVMNum: TowerInt32(1)}
+		vmGpuInfo1 := &models.VMGpuInfo{ID: TowerString("1"), Name: TowerString("vm1"), GpuDevices: []*models.VMGpuDetail{vmGpuDetail1}}
+		vmGpuInfo2 := &models.VMGpuInfo{ID: TowerString("2"), Name: TowerString("vm2"), GpuDevices: []*models.VMGpuDetail{vmGpuDetail2}}
+		vmGpuInfo3 := &models.VMGpuInfo{ID: TowerString("1"), Name: TowerString("vm2"), GpuDevices: []*models.VMGpuDetail{vmGpuDetail3}}
+
+		g.Expect(ConvertVMGpuInfosToGPUDeviceInfos(
+			[]*models.VMGpuInfo{vmGpuInfo1},
+		)).To(gomega.Equal(NewGPUDeviceInfos(&GPUDeviceInfo{
+			ID:             *vmGpuDetail1.ID,
+			HostID:         *vmGpuDetail1.Host.ID,
+			Model:          *vmGpuDetail1.Model,
+			VGPUType:       *vmGpuDetail1.UserVgpuTypeName,
+			AllocatedCount: 1,
+			AvailableCount: 2,
+			VMs:            []GPUDeviceVM{{ID: *vmGpuInfo1.ID, Name: *vmGpuInfo1.Name, AllocatedCount: 1}},
+		})))
+
+		g.Expect(ConvertVMGpuInfosToGPUDeviceInfos(
+			[]*models.VMGpuInfo{vmGpuInfo1, vmGpuInfo2},
+		)).To(gomega.Equal(NewGPUDeviceInfos(&GPUDeviceInfo{
+			ID:             *vmGpuDetail1.ID,
+			HostID:         *vmGpuDetail1.Host.ID,
+			Model:          *vmGpuDetail1.Model,
+			VGPUType:       *vmGpuDetail1.UserVgpuTypeName,
+			AllocatedCount: 3,
+			AvailableCount: 0,
+			VMs: []GPUDeviceVM{
+				{ID: *vmGpuInfo1.ID, Name: *vmGpuInfo1.Name, AllocatedCount: 1},
+				{ID: *vmGpuInfo2.ID, Name: *vmGpuInfo2.Name, AllocatedCount: 2},
+			},
+		})))
 
-	t.Run("should filter GPUs", func(t *testing.T) {
-		g.Expect(FilterOutGPUsCanNotBeUsedForVM([]*models.GpuDevice{}, "vm")).To(gomega.BeEmpty())
-		g.Expect(FilterOutGPUsCanNotBeUsedForVM([]*models.GpuDevice{{Vms: []*models.NestedVM{{ID: TowerString("id2"), Name: TowerString("vm2")}}}}, "vm")).To(gomega.BeEmpty())
-		g.Expect(FilterOutGPUsCanNotBeUsedForVM([]*models.GpuDevice{{Vms: []*models.NestedVM{{ID: TowerString("id"), Name: TowerString("vm")}}}}, "vm")).To(gomega.HaveLen(1))
+		g.Expect(ConvertVMGpuInfosToGPUDeviceInfos(
+			[]*models.VMGpuInfo{vmGpuInfo1, vmGpuInfo2, vmGpuInfo3},
+		)).To(gomega.Equal(NewGPUDeviceInfos(&GPUDeviceInfo{
+			ID:             *vmGpuDetail1.ID,
+			HostID:         *vmGpuDetail1.Host.ID,
+			Model:          *vmGpuDetail1.Model,
+			VGPUType:       *vmGpuDetail1.UserVgpuTypeName,
+			AllocatedCount: 4,
+			AvailableCount: 0,
+			VMs: []GPUDeviceVM{
+				{ID: *vmGpuInfo1.ID, Name: *vmGpuInfo1.Name, AllocatedCount: 1},
+				{ID: *vmGpuInfo2.ID, Name: *vmGpuInfo2.Name, AllocatedCount: 2},
+				{ID: *vmGpuInfo3.ID, Name: *vmGpuInfo3.Name, AllocatedCount: 1},
+			},
+		})))
 	})
 }
diff --git a/pkg/service/vm.go b/pkg/service/vm.go
index 1e0e69be..33dbf3c7 100644
--- a/pkg/service/vm.go
+++ b/pkg/service/vm.go
@@ -43,7 +43,7 @@ import (
 type VMService interface {
 	Clone(
 		elfCluster *infrav1.ElfCluster, elfMachine *infrav1.ElfMachine, bootstrapData,
-		host string, machineGPUDevices []*models.GpuDevice) (*models.WithTaskVM, error)
+		host string, machineGPUDevices []*GPUDeviceInfo) (*models.WithTaskVM, error)
 	UpdateVM(vm *models.VM, elfMachine *infrav1.ElfMachine) (*models.WithTaskVM, error)
 	Migrate(vmID, hostID string) (*models.WithTaskVM, error)
 	Delete(uuid string) (*models.Task, error)
@@ -51,7 +51,7 @@ type VMService interface {
 	PowerOn(uuid string) (*models.Task, error)
 	ShutDown(uuid string) (*models.Task, error)
 	RemoveGPUDevices(id string, gpus []*models.VMGpuOperationParams) (*models.Task, error)
-	AddGPUDevices(id string, gpus []*models.VMGpuOperationParams) (*models.Task, error)
+	AddGPUDevices(id string, gpuDeviceInfo []*GPUDeviceInfo) (*models.Task, error)
 	Get(id string) (*models.VM, error)
 	GetByName(name string) (*models.VM, error)
 	FindByIDs(ids []string) ([]*models.VM, error)
@@ -72,8 +72,9 @@ type VMService interface {
 	AddVMsToPlacementGroup(placementGroup *models.VMPlacementGroup, vmIDs []string) (*models.Task, error)
 	DeleteVMPlacementGroupByID(ctx goctx.Context, id string) (bool, error)
 	DeleteVMPlacementGroupsByNamePrefix(ctx goctx.Context, placementGroupName string) (int, error)
-	FindGPUDevicesByHostIDs(hostIDs []string) ([]*models.GpuDevice, error)
+	FindGPUDevicesByHostIDs(hostIDs []string, gpuDeviceUsage models.GpuDeviceUsage) ([]*models.GpuDevice, error)
 	FindGPUDevicesByIDs(gpuIDs []string) ([]*models.GpuDevice, error)
+	FindGPUDeviceInfos(gpuIDs []string) (GPUDeviceInfos, error)
 }
 
 type NewVMServiceFunc func(ctx goctx.Context, auth infrav1.Tower, logger logr.Logger) (VMService, error)
@@ -118,7 +119,7 @@ func (svr *TowerVMService) UpdateVM(vm *models.VM, elfMachine *infrav1.ElfMachin
 // Clone kicks off a clone operation on Elf to create a new virtual machine using VM template.
 func (svr *TowerVMService) Clone(
 	elfCluster *infrav1.ElfCluster, elfMachine *infrav1.ElfMachine, bootstrapData,
-	host string, machineGPUDevices []*models.GpuDevice) (*models.WithTaskVM, error) {
+	host string, gpuDeviceInfos []*GPUDeviceInfo) (*models.WithTaskVM, error) {
 	cluster, err := svr.GetCluster(elfCluster.Spec.Cluster)
 	if err != nil {
 		return nil, err
@@ -133,11 +134,11 @@ func (svr *TowerVMService) Clone(
 	cpuCores := TowerCPUCores(*vCPU, elfMachine.Spec.NumCoresPerSocket)
 	cpuSockets := TowerCPUSockets(*vCPU, *cpuCores)
 
-	gpuDevices := make([]*models.VMGpuOperationParams, len(machineGPUDevices))
-	for i := 0; i < len(machineGPUDevices); i++ {
+	gpuDevices := make([]*models.VMGpuOperationParams, len(gpuDeviceInfos))
+	for i := 0; i < len(gpuDeviceInfos); i++ {
 		gpuDevices[i] = &models.VMGpuOperationParams{
-			GpuID:  machineGPUDevices[i].ID,
-			Amount: TowerInt32(1),
+			GpuID:  TowerString(gpuDeviceInfos[i].ID),
+			Amount: TowerInt32(int(gpuDeviceInfos[i].AllocatedCount)),
 		}
 	}
 
@@ -403,7 +404,15 @@ func (svr *TowerVMService) RemoveGPUDevices(id string, gpus []*models.VMGpuOpera
 	return &models.Task{ID: temoveVMGPUDeviceResp.Payload[0].TaskID}, nil
 }
 
-func (svr *TowerVMService) AddGPUDevices(id string, gpus []*models.VMGpuOperationParams) (*models.Task, error) {
+func (svr *TowerVMService) AddGPUDevices(id string, gpuDeviceInfos []*GPUDeviceInfo) (*models.Task, error) {
+	gpus := make([]*models.VMGpuOperationParams, len(gpuDeviceInfos))
+	for i := 0; i < len(gpuDeviceInfos); i++ {
+		gpus[i] = &models.VMGpuOperationParams{
+			GpuID:  TowerString(gpuDeviceInfos[i].ID),
+			Amount: TowerInt32(int(gpuDeviceInfos[i].AllocatedCount)),
+		}
+	}
+
 	addVMGpuDeviceParams := clientvm.NewAddVMGpuDeviceParams()
 	addVMGpuDeviceParams.RequestBody = &models.VMAddGpuDeviceParams{
 		Data: gpus,
@@ -927,21 +936,26 @@ func (svr *TowerVMService) DeleteVMPlacementGroupsByNamePrefix(ctx goctx.Context
 	return len(getVMPlacementGroupsResp.Payload), nil
 }
 
-func (svr *TowerVMService) FindGPUDevicesByHostIDs(hostIDs []string) ([]*models.GpuDevice, error) {
+func (svr *TowerVMService) FindGPUDevicesByHostIDs(hostIDs []string, gpuDeviceUsage models.GpuDeviceUsage) ([]*models.GpuDevice, error) {
 	if len(hostIDs) == 0 {
 		return nil, nil
 	}
 
-	getGpuDevicesParams := clientgpu.NewGetGpuDevicesParams()
-	getGpuDevicesParams.RequestBody = &models.GetGpuDevicesRequestBody{
-		Where: &models.GpuDeviceWhereInput{
-			UserUsage: models.NewGpuDeviceUsage(models.GpuDeviceUsagePASSTHROUGH),
-			Host: &models.HostWhereInput{
-				IDIn: hostIDs,
-			},
+	where := &models.GpuDeviceWhereInput{
+		UserUsage: models.NewGpuDeviceUsage(gpuDeviceUsage),
+		Host: &models.HostWhereInput{
+			IDIn: hostIDs,
 		},
 	}
+	// Filter out GPU devices whose vGPUs have been fully used.
+	if gpuDeviceUsage == models.GpuDeviceUsageVGPU {
+		where.AvailableVgpusNumGt = TowerInt32(0)
+	}
+
+	getGpuDevicesParams := clientgpu.NewGetGpuDevicesParams()
+	getGpuDevicesParams.RequestBody = &models.GetGpuDevicesRequestBody{Where: where}
+
 	getGpuDevicesResp, err := svr.Session.GpuDevice.GetGpuDevices(getGpuDevicesParams)
 	if err != nil {
 		return nil, err
@@ -958,8 +972,7 @@ func (svr *TowerVMService) FindGPUDevicesByIDs(gpuIDs []string) ([]*models.GpuDe
 	getGpuDevicesParams := clientgpu.NewGetGpuDevicesParams()
 	getGpuDevicesParams.RequestBody = &models.GetGpuDevicesRequestBody{
 		Where: &models.GpuDeviceWhereInput{
-			UserUsage: models.NewGpuDeviceUsage(models.GpuDeviceUsagePASSTHROUGH),
-			IDIn:      gpuIDs,
+			IDIn: gpuIDs,
 		},
 	}
 
@@ -970,3 +983,21 @@ func (svr *TowerVMService) FindGPUDevicesByIDs(gpuIDs []string) ([]*models.GpuDe
 
 	return getGpuDevicesResp.Payload, nil
 }
+
+func (svr *TowerVMService) FindGPUDeviceInfos(gpuIDs []string) (GPUDeviceInfos, error) {
+	getVMGpuDeviceInfoParams := clientvm.NewGetVMGpuDeviceInfoParams()
+	getVMGpuDeviceInfoParams.RequestBody = &models.GetVmsRequestBody{
+		Where: &models.VMWhereInput{
+			GpuDevicesSome: &models.GpuDeviceWhereInput{
+				IDIn: gpuIDs,
+			},
+		},
+	}
+
+	getVMGpuDeviceInfoResp, err := svr.Session.VM.GetVMGpuDeviceInfo(getVMGpuDeviceInfoParams)
+	if err != nil {
+		return nil, err
+	}
+
+	return ConvertVMGpuInfosToGPUDeviceInfos(getVMGpuDeviceInfoResp.Payload), nil
+}
diff --git a/test/fake/tower.go b/test/fake/tower.go
index 3b6329f5..c6a2953c 100644
--- a/test/fake/tower.go
+++ b/test/fake/tower.go
@@ -153,9 +153,24 @@ func NewWithTaskVMPlacementGroup(placementGroup *models.VMPlacementGroup, task *
 
 func NewTowerGPU() *models.GpuDevice {
 	return &models.GpuDevice{
-		ID:      pointer.String(ID()),
-		LocalID: pointer.String(UUID()),
-		Name:    pointer.String(ID()),
-		Model:   pointer.String("A16"),
+		ID:               pointer.String(ID()),
+		LocalID:          pointer.String(UUID()),
+		Name:             pointer.String(ID()),
+		Model:            pointer.String("A16"),
+		UserUsage:        models.NewGpuDeviceUsage(models.GpuDeviceUsagePASSTHROUGH),
+		UserVgpuTypeName: pointer.String(""),
+	}
+}
+
+func NewTowerVGPU(vGPUCount int32) *models.GpuDevice {
+	return &models.GpuDevice{
+		ID:                pointer.String(ID()),
+		LocalID:           pointer.String(UUID()),
+		Name:              pointer.String(ID()),
+		UserVgpuTypeName:  pointer.String("V100"),
+		UserUsage:         models.NewGpuDeviceUsage(models.GpuDeviceUsageVGPU),
+		AvailableVgpusNum: pointer.Int32(vGPUCount),
+		AssignedVgpusNum:  pointer.Int32(0),
+		Model:             pointer.String(""),
 	}
 }
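Reviewer note: the following is a minimal, illustrative sketch (not part of this diff) of how the new service helpers are intended to compose when building a per-host view of GPU availability. The function name buildGPUDeviceInfoView and the example package are assumptions; error handling, device locking, and the ElfMachine-specific filtering done by the controller are omitted.

// Hypothetical example (not in this PR): combine the new VMService calls into a
// single GPUDeviceInfos view. FindGPUDeviceInfos only covers GPUs already held by
// VMs, so the still-unused devices are folded in afterwards.
package example

import (
	"github.com/smartxworks/cloudtower-go-sdk/v2/models"

	"github.com/smartxworks/cluster-api-provider-elf/pkg/service"
)

func buildGPUDeviceInfoView(vmSvc service.VMService, hostIDs []string, usage models.GpuDeviceUsage) (service.GPUDeviceInfos, error) {
	// 1. Candidate GPU devices on the available hosts; for vGPU, Tower already
	//    filters out devices whose vGPU instances are fully used.
	gpuDevices, err := vmSvc.FindGPUDevicesByHostIDs(hostIDs, usage)
	if err != nil {
		return nil, err
	}

	gpuIDs := make([]string, 0, len(gpuDevices))
	for i := range gpuDevices {
		gpuIDs = append(gpuIDs, *gpuDevices[i].ID)
	}

	// 2. Allocation details of GPUs that are already attached to VMs,
	//    derived in reverse from Tower's VMGpuInfo.
	gpuDeviceInfos, err := vmSvc.FindGPUDeviceInfos(gpuIDs)
	if err != nil {
		return nil, err
	}

	// 3. Aggregate the GPUs that no VM uses yet into the same view.
	service.AggregateUnusedGPUDevicesToGPUDeviceInfos(gpuDeviceInfos, gpuDevices)

	return gpuDeviceInfos, nil
}

A caller such as selectHostAndGPUsForVM could then pick devices whose AvailableCount still satisfies the ElfMachine's requested GPU or vGPU counts.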
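For reviewers unfamiliar with the vGPU accounting, here is a self-contained toy walk-through of the clamp-at-zero rule applied by calGPUAvailableCount. The helper name availableAfter is local to this example, but the numbers mirror the vGPU test case above (VgpuInstanceNum = 3, with VMs consuming 1, 2, and 1 instances).

// Toy demonstration of the availability bookkeeping: allocated counts keep
// accumulating, while the remaining availability is clamped at zero once the
// device is over-committed.
package main

import "fmt"

// availableAfter mirrors the clamp-at-zero behaviour of calGPUAvailableCount.
func availableAfter(available, allocated int32) int32 {
	count := available - allocated
	if count < 0 {
		count = 0
	}
	return count
}

func main() {
	var available int32 = 3 // VgpuInstanceNum of the V100 in the test above
	var allocated int32

	for _, onVM := range []int32{1, 2, 1} { // VgpuInstanceOnVMNum per VM
		allocated += onVM
		available = availableAfter(available, onVM)
	}

	// Prints "allocated=4 available=0", matching AllocatedCount: 4 and
	// AvailableCount: 0 expected by TestConvertVMGpuInfosToGPUDeviceInfos.
	fmt.Printf("allocated=%d available=%d\n", allocated, available)
}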