Skip to content

Commit

Permalink
Refactor how opaque device configs are handled
Browse files Browse the repository at this point in the history
Previously, each config was being applied independently to each request
that referenced it. However, some configs may need to operate
collectively on all of the requests they are associated with.

The code has been refactored to handle this situation. Additionally, the
code to define the ContainerEdits for any custom config has been moved
into the config code itself to better encapsulate it.

Signed-off-by: Kevin Klues <[email protected]>
  • Loading branch information
klueska committed Sep 7, 2024
1 parent 3d48269 commit fc8d617
Show file tree
Hide file tree
Showing 5 changed files with 204 additions and 132 deletions.
74 changes: 37 additions & 37 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ items:
string: gpu-18db0e85-99e9-c746-8531-ffeb86328b39
capacity:
memory: 80Gi
name: gpu-18db0e85-99e9-c746-8531-ffeb86328b39
name: gpu-0
- basic:
attributes:
driverVersion:
Expand All @@ -140,7 +140,7 @@ items:
string: gpu-93d37703-997c-c46f-a531-755e3e0dc2ac
capacity:
memory: 80Gi
name: gpu-93d37703-997c-c46f-a531-755e3e0dc2ac
name: gpu-1
- basic:
attributes:
driverVersion:
Expand All @@ -153,7 +153,7 @@ items:
string: gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744
capacity:
memory: 80Gi
name: gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744
name: gpu-2
- basic:
attributes:
driverVersion:
Expand All @@ -166,7 +166,7 @@ items:
string: gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243
capacity:
memory: 80Gi
name: gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243
name: gpu-3
- basic:
attributes:
driverVersion:
Expand All @@ -179,7 +179,7 @@ items:
string: gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747
capacity:
memory: 80Gi
name: gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747
name: gpu-4
- basic:
attributes:
driverVersion:
Expand All @@ -192,7 +192,7 @@ items:
string: gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e
capacity:
memory: 80Gi
name: gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e
name: gpu-5
- basic:
attributes:
driverVersion:
Expand All @@ -205,7 +205,7 @@ items:
string: gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac
capacity:
memory: 80Gi
name: gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac
name: gpu-6
- basic:
attributes:
driverVersion:
Expand All @@ -218,7 +218,7 @@ items:
string: gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b
capacity:
memory: 80Gi
name: gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b
name: gpu-7
kind: List
metadata:
resourceVersion: ""
Expand Down Expand Up @@ -275,60 +275,60 @@ This should produce output similar to the following:
```bash
gpu-test1:
pod0 ctr0:
declare -x GPU_DEVICE_0="gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744"
declare -x GPU_DEVICE_6="gpu-6"
pod1 ctr0:
declare -x GPU_DEVICE_0="gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243"
declare -x GPU_DEVICE_7="gpu-7"

gpu-test2:
pod0 ctr0:
declare -x GPU_DEVICE_0="gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747"
declare -x GPU_DEVICE_1="gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e"
declare -x GPU_DEVICE_0="gpu-0"
declare -x GPU_DEVICE_1="gpu-1"

gpu-test3:
pod0 ctr0:
declare -x GPU_DEVICE_0="gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
declare -x GPU_DEVICE_2="gpu-2"
declare -x GPU_DEVICE_2_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_2_TIMESLICE_INTERVAL="Default"
pod0 ctr1:
declare -x GPU_DEVICE_0="gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
declare -x GPU_DEVICE_2="gpu-2"
declare -x GPU_DEVICE_2_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_2_TIMESLICE_INTERVAL="Default"

gpu-test4:
pod0 ctr0:
declare -x GPU_DEVICE_0="gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
declare -x GPU_DEVICE_3="gpu-3"
declare -x GPU_DEVICE_3_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_3_TIMESLICE_INTERVAL="Default"
pod1 ctr0:
declare -x GPU_DEVICE_0="gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
declare -x GPU_DEVICE_3="gpu-3"
declare -x GPU_DEVICE_3_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_3_TIMESLICE_INTERVAL="Default"

gpu-test5:
pod0 ts-ctr0:
declare -x GPU_DEVICE_0="gpu-18db0e85-99e9-c746-8531-ffeb86328b39"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Long"
declare -x GPU_DEVICE_4="gpu-4"
declare -x GPU_DEVICE_4_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_4_TIMESLICE_INTERVAL="Long"
pod0 ts-ctr1:
declare -x GPU_DEVICE_0="gpu-18db0e85-99e9-c746-8531-ffeb86328b39"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Long"
declare -x GPU_DEVICE_4="gpu-4"
declare -x GPU_DEVICE_4_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_4_TIMESLICE_INTERVAL="Long"
pod0 sp-ctr0:
declare -x GPU_DEVICE_1="gpu-93d37703-997c-c46f-a531-755e3e0dc2ac"
declare -x GPU_DEVICE_1_PARTITION_COUNT="10"
declare -x GPU_DEVICE_1_SHARING_STRATEGY="SpacePartitioning"
declare -x GPU_DEVICE_5="gpu-5"
declare -x GPU_DEVICE_5_PARTITION_COUNT="10"
declare -x GPU_DEVICE_5_SHARING_STRATEGY="SpacePartitioning"
pod0 sp-ctr1:
declare -x GPU_DEVICE_1="gpu-93d37703-997c-c46f-a531-755e3e0dc2ac"
declare -x GPU_DEVICE_1_PARTITION_COUNT="10"
declare -x GPU_DEVICE_1_SHARING_STRATEGY="SpacePartitioning"
declare -x GPU_DEVICE_5="gpu-5"
declare -x GPU_DEVICE_5_PARTITION_COUNT="10"
declare -x GPU_DEVICE_5_SHARING_STRATEGY="SpacePartitioning"
```

In this example resource driver, no "actual" GPUs are made available to any
containers. Instead, a set of environment variables are set in each container
to indicate which GPUs *would* have been injected into them by a real resource
driver and how they *would* have been configured.

You can use the UUIDs of the GPUs as well as the GPU sharing settings set in
You can use the IDs of the GPUs as well as the GPU sharing settings set in
these environment variables to verify that they were handed out in a way
consistent with the semantics shown in the figure above.

Expand Down
64 changes: 64 additions & 0 deletions api/example.com/resource/gpu/v1alpha1/cdi.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Copyright 2024 The Kubernetes Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package v1alpha1

import (
"fmt"

resourceapi "k8s.io/api/resource/v1alpha3"

cdiapi "tags.cncf.io/container-device-interface/pkg/cdi"
cdispec "tags.cncf.io/container-device-interface/specs-go"
)

// PerDeviceCDIContainerEdits holds a set of CDI container edits per device,
// keyed by the device name of the allocation result they apply to.
//
// +k8s:deepcopy-gen=false
type PerDeviceCDIContainerEdits map[string]*cdiapi.ContainerEdits

// Apply computes the CDI container edits that this GpuConfig contributes to
// each of the allocated devices in results.
//
// For every result it builds a list of environment variables whose names embed
// a per-device suffix, obtained by dropping the first four bytes of the device
// name (presumably the "gpu-" prefix of names such as "gpu-0" — confirm
// against the device naming scheme):
//   - GPU_DEVICE_<suffix>_SHARING_STRATEGY, when c.Sharing is set;
//   - GPU_DEVICE_<suffix>_TIMESLICE_INTERVAL, when the strategy is time slicing;
//   - GPU_DEVICE_<suffix>_PARTITION_COUNT, when the strategy is space partitioning.
//
// The returned map is keyed by device name. A non-nil error (and a nil map) is
// returned when a device name is too short to derive a suffix from, or when
// the time-slicing / space-partitioning config cannot be retrieved.
func (c *GpuConfig) Apply(results []*resourceapi.DeviceRequestAllocationResult) (PerDeviceCDIContainerEdits, error) {
	// Number of leading bytes stripped from a device name to form the suffix
	// used in the environment variable names.
	const devicePrefixLen = 4

	perDeviceEdits := make(PerDeviceCDIContainerEdits, len(results))

	for _, result := range results {
		// Slicing a name shorter than the prefix would panic; report it as an
		// error instead so the caller can fail the request cleanly.
		if len(result.Device) < devicePrefixLen {
			return nil, fmt.Errorf("device name %q is too short to derive an environment variable suffix", result.Device)
		}
		suffix := result.Device[devicePrefixLen:]

		envs := make([]string, 0, 2)

		if c.Sharing != nil {
			envs = append(envs, fmt.Sprintf("GPU_DEVICE_%s_SHARING_STRATEGY=%s", suffix, c.Sharing.Strategy))
		}

		// NOTE(review): these predicates are also invoked when c.Sharing is
		// nil; presumably they are nil-receiver safe — confirm.
		switch {
		case c.Sharing.IsTimeSlicing():
			tsconfig, err := c.Sharing.GetTimeSlicingConfig()
			if err != nil {
				return nil, fmt.Errorf("unable to get time slicing config for device %v: %w", result.Device, err)
			}
			envs = append(envs, fmt.Sprintf("GPU_DEVICE_%s_TIMESLICE_INTERVAL=%v", suffix, tsconfig.Interval))
		case c.Sharing.IsSpacePartitioning():
			spconfig, err := c.Sharing.GetSpacePartitioningConfig()
			if err != nil {
				return nil, fmt.Errorf("unable to get space partitioning config for device %v: %w", result.Device, err)
			}
			envs = append(envs, fmt.Sprintf("GPU_DEVICE_%s_PARTITION_COUNT=%v", suffix, spconfig.PartitionCount))
		}

		edits := &cdispec.ContainerEdits{
			Env: envs,
		}

		perDeviceEdits[result.Device] = &cdiapi.ContainerEdits{ContainerEdits: edits}
	}

	return perDeviceEdits, nil
}
37 changes: 10 additions & 27 deletions cmd/dra-example-kubeletplugin/cdi.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,36 +89,19 @@ func (cdi *CDIHandler) CreateClaimSpecFile(claimUID string, devices PreparedDevi
Devices: []cdispec.Device{},
}

for i, device := range devices {
envs := []string{
fmt.Sprintf("GPU_DEVICE_%d=%s", i, device.DeviceName),
}

if device.Config.Sharing != nil {
envs = append(envs, fmt.Sprintf("GPU_DEVICE_%d_SHARING_STRATEGY=%s", i, device.Config.Sharing.Strategy))
}

switch {
case device.Config.Sharing.IsTimeSlicing():
tsconfig, err := device.Config.Sharing.GetTimeSlicingConfig()
if err != nil {
return fmt.Errorf("unable to get time slicing config for device %v: %v", device.DeviceName, err)
}
envs = append(envs, fmt.Sprintf("GPU_DEVICE_%d_TIMESLICE_INTERVAL=%v", i, tsconfig.Interval))

case device.Config.Sharing.IsSpacePartitioning():
spconfig, err := device.Config.Sharing.GetSpacePartitioningConfig()
if err != nil {
return fmt.Errorf("unable to get space partitioning config for device %v: %v", device.DeviceName, err)
}
envs = append(envs, fmt.Sprintf("GPU_DEVICE_%d_PARTITION_COUNT=%v", i, spconfig.PartitionCount))
for _, device := range devices {
claimEdits := cdiapi.ContainerEdits{
ContainerEdits: &cdispec.ContainerEdits{
Env: []string{
fmt.Sprintf("GPU_DEVICE_%s=%s", device.DeviceName[4:], device.DeviceName),
},
},
}
claimEdits.Append(device.ContainerEdits)

cdiDevice := cdispec.Device{
Name: device.DeviceName,
ContainerEdits: cdispec.ContainerEdits{
Env: envs,
},
Name: device.DeviceName,
ContainerEdits: *claimEdits.ContainerEdits,
}

spec.Devices = append(spec.Devices, cdiDevice)
Expand Down
5 changes: 3 additions & 2 deletions cmd/dra-example-kubeletplugin/discovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package main

import (
"fmt"
"math/rand"
"os"

Expand All @@ -35,7 +36,7 @@ func enumerateAllPossibleDevices() (AllocatableDevices, error) {
alldevices := make(AllocatableDevices)
for i, uuid := range uuids {
device := resourceapi.Device{
Name: uuid,
Name: fmt.Sprintf("gpu-%d", i),
Basic: &resourceapi.BasicDevice{
Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{
"index": {
Expand All @@ -56,7 +57,7 @@ func enumerateAllPossibleDevices() (AllocatableDevices, error) {
},
},
}
alldevices[uuid] = device
alldevices[device.Name] = device
}
return alldevices, nil
}
Expand Down
Loading

0 comments on commit fc8d617

Please sign in to comment.