From fc8d617328975c724e3f57f5ca87e24dd2c13fbc Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Sat, 7 Sep 2024 02:58:26 +0200 Subject: [PATCH] Refactor how opaque device configs are handled Previously, each config was being applied independently to each request that referenced it. However, some configs may need to operate collectively on all of the requests they are associated with it. The code has been refactored to handle this situation. Additionally, the code to define the ContainerEdits for any custom config has been moved into the config code itself to better encapsulate it. Signed-off-by: Kevin Klues --- README.md | 74 ++++----- api/example.com/resource/gpu/v1alpha1/cdi.go | 64 ++++++++ cmd/dra-example-kubeletplugin/cdi.go | 37 ++--- cmd/dra-example-kubeletplugin/discovery.go | 5 +- cmd/dra-example-kubeletplugin/state.go | 156 +++++++++++-------- 5 files changed, 204 insertions(+), 132 deletions(-) create mode 100644 api/example.com/resource/gpu/v1alpha1/cdi.go diff --git a/README.md b/README.md index 1831cc2c..9d2c77a5 100644 --- a/README.md +++ b/README.md @@ -127,7 +127,7 @@ items: string: gpu-18db0e85-99e9-c746-8531-ffeb86328b39 capacity: memory: 80Gi - name: gpu-18db0e85-99e9-c746-8531-ffeb86328b39 + name: gpu-0 - basic: attributes: driverVersion: @@ -140,7 +140,7 @@ items: string: gpu-93d37703-997c-c46f-a531-755e3e0dc2ac capacity: memory: 80Gi - name: gpu-93d37703-997c-c46f-a531-755e3e0dc2ac + name: gpu-1 - basic: attributes: driverVersion: @@ -153,7 +153,7 @@ items: string: gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744 capacity: memory: 80Gi - name: gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744 + name: gpu-2 - basic: attributes: driverVersion: @@ -166,7 +166,7 @@ items: string: gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243 capacity: memory: 80Gi - name: gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243 + name: gpu-3 - basic: attributes: driverVersion: @@ -179,7 +179,7 @@ items: string: gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747 capacity: memory: 80Gi - name: gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747 + name: gpu-4 - basic: attributes: driverVersion: @@ -192,7 +192,7 @@ items: string: gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e capacity: memory: 80Gi - name: gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e + name: gpu-5 - basic: attributes: driverVersion: @@ -205,7 +205,7 @@ items: string: gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac capacity: memory: 80Gi - name: gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac + name: gpu-6 - basic: attributes: driverVersion: @@ -218,7 +218,7 @@ items: string: gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b capacity: memory: 80Gi - name: gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b + name: gpu-7 kind: List metadata: resourceVersion: "" @@ -275,52 +275,52 @@ This should produce output similar to the following: ```bash gpu-test1: pod0 ctr0: -declare -x GPU_DEVICE_0="gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744" +declare -x GPU_DEVICE_6="gpu-6" pod1 ctr0: -declare -x GPU_DEVICE_0="gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243" +declare -x GPU_DEVICE_7="gpu-7" gpu-test2: pod0 ctr0: -declare -x GPU_DEVICE_0="gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747" -declare -x GPU_DEVICE_1="gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e" +declare -x GPU_DEVICE_0="gpu-0" +declare -x GPU_DEVICE_1="gpu-1" gpu-test3: pod0 ctr0: -declare -x GPU_DEVICE_0="gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac" -declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing" -declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default" +declare -x GPU_DEVICE_2="gpu-2" +declare -x GPU_DEVICE_2_SHARING_STRATEGY="TimeSlicing" +declare -x GPU_DEVICE_2_TIMESLICE_INTERVAL="Default" pod0 ctr1: -declare -x GPU_DEVICE_0="gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac" -declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing" -declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default" +declare -x GPU_DEVICE_2="gpu-2" +declare -x GPU_DEVICE_2_SHARING_STRATEGY="TimeSlicing" +declare -x GPU_DEVICE_2_TIMESLICE_INTERVAL="Default" gpu-test4: pod0 ctr0: -declare -x GPU_DEVICE_0="gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b" -declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing" -declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default" +declare -x GPU_DEVICE_3="gpu-3" +declare -x GPU_DEVICE_3_SHARING_STRATEGY="TimeSlicing" +declare -x GPU_DEVICE_3_TIMESLICE_INTERVAL="Default" pod1 ctr0: -declare -x GPU_DEVICE_0="gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b" -declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing" -declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default" +declare -x GPU_DEVICE_3="gpu-3" +declare -x GPU_DEVICE_3_SHARING_STRATEGY="TimeSlicing" +declare -x GPU_DEVICE_3_TIMESLICE_INTERVAL="Default" gpu-test5: pod0 ts-ctr0: -declare -x GPU_DEVICE_0="gpu-18db0e85-99e9-c746-8531-ffeb86328b39" -declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing" -declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Long" +declare -x GPU_DEVICE_4="gpu-4" +declare -x GPU_DEVICE_4_SHARING_STRATEGY="TimeSlicing" +declare -x GPU_DEVICE_4_TIMESLICE_INTERVAL="Long" pod0 ts-ctr1: -declare -x GPU_DEVICE_0="gpu-18db0e85-99e9-c746-8531-ffeb86328b39" -declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing" -declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Long" +declare -x GPU_DEVICE_4="gpu-4" +declare -x GPU_DEVICE_4_SHARING_STRATEGY="TimeSlicing" +declare -x GPU_DEVICE_4_TIMESLICE_INTERVAL="Long" pod0 sp-ctr0: -declare -x GPU_DEVICE_1="gpu-93d37703-997c-c46f-a531-755e3e0dc2ac" -declare -x GPU_DEVICE_1_PARTITION_COUNT="10" -declare -x GPU_DEVICE_1_SHARING_STRATEGY="SpacePartitioning" +declare -x GPU_DEVICE_5="gpu-5" +declare -x GPU_DEVICE_5_PARTITION_COUNT="10" +declare -x GPU_DEVICE_5_SHARING_STRATEGY="SpacePartitioning" pod0 sp-ctr1: -declare -x GPU_DEVICE_1="gpu-93d37703-997c-c46f-a531-755e3e0dc2ac" -declare -x GPU_DEVICE_1_PARTITION_COUNT="10" -declare -x GPU_DEVICE_1_SHARING_STRATEGY="SpacePartitioning" +declare -x GPU_DEVICE_5="gpu-5" +declare -x GPU_DEVICE_5_PARTITION_COUNT="10" +declare -x GPU_DEVICE_5_SHARING_STRATEGY="SpacePartitioning" ``` In this example resource driver, no "actual" GPUs are made available to any @@ -328,7 +328,7 @@ containers. Instead, a set of environment variables are set in each container to indicate which GPUs *would* have been injected into them by a real resource driver and how they *would* have been configured. -You can use the UUIDs of the GPUs as well as the GPU sharing settings set in +You can use the IDs of the GPUs as well as the GPU sharing settings set in these environment variables to verify that they were handed out in a way consistent with the semantics shown in the figure above. diff --git a/api/example.com/resource/gpu/v1alpha1/cdi.go b/api/example.com/resource/gpu/v1alpha1/cdi.go new file mode 100644 index 00000000..080eea94 --- /dev/null +++ b/api/example.com/resource/gpu/v1alpha1/cdi.go @@ -0,0 +1,64 @@ +/* + * Copyright 2024 The Kubernetes Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package v1alpha1 + +import ( + "fmt" + + resourceapi "k8s.io/api/resource/v1alpha3" + + cdiapi "tags.cncf.io/container-device-interface/pkg/cdi" + cdispec "tags.cncf.io/container-device-interface/specs-go" +) + +// +k8s:deepcopy-gen=false +type PerDeviceCDIContainerEdits map[string]*cdiapi.ContainerEdits + +func (c *GpuConfig) Apply(results []*resourceapi.DeviceRequestAllocationResult) (PerDeviceCDIContainerEdits, error) { + perDeviceEdits := make(PerDeviceCDIContainerEdits) + + for _, result := range results { + envs := []string{} + + if c.Sharing != nil { + envs = append(envs, fmt.Sprintf("GPU_DEVICE_%s_SHARING_STRATEGY=%s", result.Device[4:], c.Sharing.Strategy)) + } + + switch { + case c.Sharing.IsTimeSlicing(): + tsconfig, err := c.Sharing.GetTimeSlicingConfig() + if err != nil { + return nil, fmt.Errorf("unable to get time slicing config for device %v: %w", result.Device, err) + } + envs = append(envs, fmt.Sprintf("GPU_DEVICE_%s_TIMESLICE_INTERVAL=%v", result.Device[4:], tsconfig.Interval)) + case c.Sharing.IsSpacePartitioning(): + spconfig, err := c.Sharing.GetSpacePartitioningConfig() + if err != nil { + return nil, fmt.Errorf("unable to get space partitioning config for device %v: %w", result.Device, err) + } + envs = append(envs, fmt.Sprintf("GPU_DEVICE_%s_PARTITION_COUNT=%v", result.Device[4:], spconfig.PartitionCount)) + } + + edits := &cdispec.ContainerEdits{ + Env: envs, + } + + perDeviceEdits[result.Device] = &cdiapi.ContainerEdits{ContainerEdits: edits} + } + + return perDeviceEdits, nil +} diff --git a/cmd/dra-example-kubeletplugin/cdi.go b/cmd/dra-example-kubeletplugin/cdi.go index 5a1913fd..161df1a7 100644 --- a/cmd/dra-example-kubeletplugin/cdi.go +++ b/cmd/dra-example-kubeletplugin/cdi.go @@ -89,36 +89,19 @@ func (cdi *CDIHandler) CreateClaimSpecFile(claimUID string, devices PreparedDevi Devices: []cdispec.Device{}, } - for i, device := range devices { - envs := []string{ - fmt.Sprintf("GPU_DEVICE_%d=%s", i, device.DeviceName), - } - - if device.Config.Sharing != nil { - envs = append(envs, fmt.Sprintf("GPU_DEVICE_%d_SHARING_STRATEGY=%s", i, device.Config.Sharing.Strategy)) - } - - switch { - case device.Config.Sharing.IsTimeSlicing(): - tsconfig, err := device.Config.Sharing.GetTimeSlicingConfig() - if err != nil { - return fmt.Errorf("unable to get time slicing config for device %v: %v", device.DeviceName, err) - } - envs = append(envs, fmt.Sprintf("GPU_DEVICE_%d_TIMESLICE_INTERVAL=%v", i, tsconfig.Interval)) - - case device.Config.Sharing.IsSpacePartitioning(): - spconfig, err := device.Config.Sharing.GetSpacePartitioningConfig() - if err != nil { - return fmt.Errorf("unable to get space partitioning config for device %v: %v", device.DeviceName, err) - } - envs = append(envs, fmt.Sprintf("GPU_DEVICE_%d_PARTITION_COUNT=%v", i, spconfig.PartitionCount)) + for _, device := range devices { + claimEdits := cdiapi.ContainerEdits{ + ContainerEdits: &cdispec.ContainerEdits{ + Env: []string{ + fmt.Sprintf("GPU_DEVICE_%s=%s", device.DeviceName[4:], device.DeviceName), + }, + }, } + claimEdits.Append(device.ContainerEdits) cdiDevice := cdispec.Device{ - Name: device.DeviceName, - ContainerEdits: cdispec.ContainerEdits{ - Env: envs, - }, + Name: device.DeviceName, + ContainerEdits: *claimEdits.ContainerEdits, } spec.Devices = append(spec.Devices, cdiDevice) diff --git a/cmd/dra-example-kubeletplugin/discovery.go b/cmd/dra-example-kubeletplugin/discovery.go index 47a57596..0792b846 100644 --- a/cmd/dra-example-kubeletplugin/discovery.go +++ b/cmd/dra-example-kubeletplugin/discovery.go @@ -17,6 +17,7 @@ package main import ( + "fmt" "math/rand" "os" @@ -35,7 +36,7 @@ func enumerateAllPossibleDevices() (AllocatableDevices, error) { alldevices := make(AllocatableDevices) for i, uuid := range uuids { device := resourceapi.Device{ - Name: uuid, + Name: fmt.Sprintf("gpu-%d", i), Basic: &resourceapi.BasicDevice{ Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{ "index": { @@ -56,7 +57,7 @@ func enumerateAllPossibleDevices() (AllocatableDevices, error) { }, }, } - alldevices[uuid] = device + alldevices[device.Name] = device } return alldevices, nil } diff --git a/cmd/dra-example-kubeletplugin/state.go b/cmd/dra-example-kubeletplugin/state.go index 6b95c57f..e8777ae7 100644 --- a/cmd/dra-example-kubeletplugin/state.go +++ b/cmd/dra-example-kubeletplugin/state.go @@ -27,6 +27,8 @@ import ( "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" configapi "sigs.k8s.io/dra-example-driver/api/example.com/resource/gpu/v1alpha1" + + cdiapi "tags.cncf.io/container-device-interface/pkg/cdi" ) type AllocatableDevices map[string]resourceapi.Device @@ -35,7 +37,7 @@ type PreparedClaims map[string]PreparedDevices type PreparedDevice struct { drapbv1.Device - Config configapi.GpuConfig + ContainerEdits *cdiapi.ContainerEdits } type DeviceState struct { @@ -160,34 +162,48 @@ func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (Prepared return nil, fmt.Errorf("claim not yet allocated") } - // Walk through each device allocation and prepare it. - var preparedDevices PreparedDevices + // Retrieve the full set of device configs for the driver. + configs, err := GetOpaqueDeviceConfigs( + configapi.Decoder, + DriverName, + claim.Status.Allocation.Devices.Config, + ) + if err != nil { + return nil, fmt.Errorf("error getting opaque device configs: %v", err) + } + + // Look through the configs and figure out which one will be applied to + // each device allocation result based on their order of precedence. Use a + // default config if none present for a given result. + defaultGpuConfig := configapi.DefaultGpuConfig() + configResultsMap := make(map[runtime.Object][]*resourceapi.DeviceRequestAllocationResult) for _, result := range claim.Status.Allocation.Devices.Results { if _, exists := s.allocatable[result.Device]; !exists { return nil, fmt.Errorf("requested GPU is not allocatable: %v", result.Device) } - - // Retrieve the set of device configs for the current allocation. - configs, err := GetOpaqueDeviceConfigs( - configapi.Decoder, - DriverName, - result.Request, - claim.Status.Allocation.Devices.Config, - ) - if err != nil { - return nil, fmt.Errorf("error getting GPU configs: %v", err) - } - - // Select the GPU config with the highest precedence from those retrieved. - // Use the default GPU config if no GPU configs have been returned. - config := configapi.DefaultGpuConfig() - for _, c := range configs { - switch castConfig := c.(type) { - case *configapi.GpuConfig: - config = castConfig - default: - return nil, fmt.Errorf("runtime object is not a regognized configuration") + c := func() runtime.Object { + for _, c := range slices.Backward(configs) { + if len(c.Requests) == 0 || slices.Contains(c.Requests, result.Request) { + return c.Config + } } + return defaultGpuConfig + }() + configResultsMap[c] = append(configResultsMap[c], &result) + } + + // Normalize, validate, and apply all configs associated with devices that + // need to be prepared. Track container edits generated from applying the + // config to the set of device allocation results. + perDeviceCDIContainerEdits := make(configapi.PerDeviceCDIContainerEdits) + for c, results := range configResultsMap { + // Cast the opaque config to a GpuConfig + var config *configapi.GpuConfig + switch castConfig := c.(type) { + case *configapi.GpuConfig: + config = castConfig + default: + return nil, fmt.Errorf("runtime object is not a regognized configuration") } // Normalize the config to set any implied defaults. @@ -200,26 +216,34 @@ func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (Prepared return nil, fmt.Errorf("error validating GPU config: %w", err) } - device := &PreparedDevice{ - Device: drapbv1.Device{ - RequestNames: []string{result.Request}, - PoolName: result.Pool, - DeviceName: result.Device, - CDIDeviceIDs: s.cdi.GetClaimDevices([]string{result.Device}), - }, - Config: *config, + // Apply the config to the list of results associated with it. + containerEdits, err := config.Apply(results) + if err != nil { + return nil, fmt.Errorf("error applying GPU config: %w", err) } - // Apply any requested configuration here. - // - // In this example driver there is nothing to do at this point, but a - // real driver would likely need to do some sort of hardware - // configuration , based on the config that has been passed in. - if err := device.applyConfig(); err != nil { - return nil, fmt.Errorf("error applying GPU config: %v", err) + // Merge any new container edits with the overall per device map. + for k, v := range containerEdits { + perDeviceCDIContainerEdits[k] = v } + } - preparedDevices = append(preparedDevices, device) + // Walk through each config and its associated device allocation results + // and construct the list of prepared devices to return. + var preparedDevices PreparedDevices + for _, results := range configResultsMap { + for _, result := range results { + device := &PreparedDevice{ + Device: drapbv1.Device{ + RequestNames: []string{result.Request}, + PoolName: result.Pool, + DeviceName: result.Device, + CDIDeviceIDs: s.cdi.GetClaimDevices([]string{result.Device}), + }, + ContainerEdits: perDeviceCDIContainerEdits[result.Device], + } + preparedDevices = append(preparedDevices, device) + } } return preparedDevices, nil @@ -229,10 +253,6 @@ func (s *DeviceState) unprepareDevices(claimUID string, devices PreparedDevices) return nil } -func (p *PreparedDevice) applyConfig() error { - return nil -} - // GetDevices extracts the list of drapbv1.Devices from PreparedDevices. func (pds PreparedDevices) GetDevices() []*drapbv1.Device { var devices []*drapbv1.Device @@ -242,7 +262,12 @@ func (pds PreparedDevices) GetDevices() []*drapbv1.Device { return devices } -// GetOpaqueDeviceConfigs returns an ordered list of configs specified for a device request in a resource claim. +type DeviceConfig struct { + Requests []string + Config runtime.Object +} + +// GetOpaqueDeviceConfigs returns an ordered list of the configs contained in possibleConfigs for this driver. // // Configs can either come from the resource claim itself or from the device // class associated with the request. Configs coming directly from the resource @@ -250,30 +275,24 @@ func (pds PreparedDevices) GetDevices() []*drapbv1.Device { // configs found later in the list of configs attached to its source take // precedence over configs found earlier in the list for that source. // -// All of the configs relevant to the specified request for this driver will be -// returned in order of precedence (from lowest to highest). If no config is -// found, nil is returned. +// All of the configs relevant to the driver from the list of possibleConfigs +// will be returned in order of precedence (from lowest to highest). If no +// configs are found, nil is returned. func GetOpaqueDeviceConfigs( decoder runtime.Decoder, - driverName, - request string, + driverName string, possibleConfigs []resourceapi.DeviceAllocationConfiguration, -) ([]runtime.Object, error) { +) ([]*DeviceConfig, error) { // Collect all configs in order of reverse precedence. - var classConfigs []resourceapi.DeviceConfiguration - var claimConfigs []resourceapi.DeviceConfiguration - var candidateConfigs []resourceapi.DeviceConfiguration + var classConfigs []resourceapi.DeviceAllocationConfiguration + var claimConfigs []resourceapi.DeviceAllocationConfiguration + var candidateConfigs []resourceapi.DeviceAllocationConfiguration for _, config := range possibleConfigs { - // If the config is for specific requests and the current request isn't - // one of those, the config can be ignored. - if len(config.Requests) != 0 && !slices.Contains(config.Requests, request) { - continue - } switch config.Source { case resourceapi.AllocationConfigSourceClass: - classConfigs = append(classConfigs, config.DeviceConfiguration) + classConfigs = append(classConfigs, config) case resourceapi.AllocationConfigSourceClaim: - claimConfigs = append(claimConfigs, config.DeviceConfiguration) + claimConfigs = append(claimConfigs, config) default: return nil, fmt.Errorf("invalid config source: %v", config.Source) } @@ -282,11 +301,11 @@ func GetOpaqueDeviceConfigs( candidateConfigs = append(candidateConfigs, claimConfigs...) // Decode all configs that are relevant for the driver. - var resultConfigs []runtime.Object + var resultConfigs []*DeviceConfig for _, config := range candidateConfigs { // If this is nil, the driver doesn't support some future API extension // and needs to be updated. - if config.Opaque == nil { + if config.DeviceConfiguration.Opaque == nil { return nil, fmt.Errorf("only opaque parameters are supported by this driver") } @@ -294,16 +313,21 @@ func GetOpaqueDeviceConfigs( // single request can be satisfied by different drivers. This is not // an error -- drivers must skip over other driver's configs in order // to support this. - if config.Opaque.Driver != driverName { + if config.DeviceConfiguration.Opaque.Driver != driverName { continue } - c, err := runtime.Decode(decoder, config.Opaque.Parameters.Raw) + decodedConfig, err := runtime.Decode(decoder, config.DeviceConfiguration.Opaque.Parameters.Raw) if err != nil { return nil, fmt.Errorf("error decoding config parameters: %w", err) } - resultConfigs = append(resultConfigs, c) + resultConfig := &DeviceConfig{ + Requests: config.Requests, + Config: decodedConfig, + } + + resultConfigs = append(resultConfigs, resultConfig) } return resultConfigs, nil