Refactor how opaque device configs are handled
Previously, each config was applied independently to each request that
referenced it. However, some configs may need to operate collectively on
all of the requests they are associated with.

The code has been refactored to handle this situation. Additionally, the
code to define the ContainerEdits for any custom config has been moved
into the config code itself to better encapsulate it.

Signed-off-by: Kevin Klues <[email protected]>
klueska committed Sep 7, 2024
1 parent 5b1228f commit 53be214
Showing 4 changed files with 195 additions and 140 deletions.
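
As a rough illustration of the approach described in the commit message, the sketch below shows a config being applied collectively to every device associated with it and returning its own container edits, rather than being applied once per request. All of the type and field names here (`ContainerEdits`, `RequestedDevice`, `GpuConfig`, `ApplyTo`) are simplified assumptions for illustration, not the driver's actual API.

```go
package main

import "fmt"

// Hypothetical, simplified types for illustration only; the real driver
// works with the CDI ContainerEdits type and its own config structs.
type ContainerEdits struct {
    Env []string
}

type RequestedDevice struct {
    Request    string // name of the claim request that selected this device
    DeviceName string // e.g. "gpu-0"
}

// GpuConfig stands in for one opaque device config. Instead of being applied
// to each request separately, it receives every device associated with it and
// returns a single set of edits covering all of them.
type GpuConfig struct {
    SharingStrategy string
}

func (c *GpuConfig) ApplyTo(devices []RequestedDevice) (*ContainerEdits, error) {
    edits := &ContainerEdits{}
    for _, d := range devices {
        index := d.DeviceName[4:] // strip the "gpu-" prefix, e.g. "gpu-0" -> "0"
        edits.Env = append(edits.Env,
            fmt.Sprintf("GPU_DEVICE_%s=%s", index, d.DeviceName))
        if c.SharingStrategy != "" {
            edits.Env = append(edits.Env,
                fmt.Sprintf("GPU_DEVICE_%s_SHARING_STRATEGY=%s", index, c.SharingStrategy))
        }
    }
    return edits, nil
}

func main() {
    config := &GpuConfig{SharingStrategy: "TimeSlicing"}
    devices := []RequestedDevice{
        {Request: "req0", DeviceName: "gpu-0"},
        {Request: "req1", DeviceName: "gpu-1"},
    }
    edits, err := config.ApplyTo(devices)
    if err != nil {
        panic(err)
    }
    fmt.Println(edits.Env)
}
```

The key point is that `ApplyTo` sees the full list of devices at once, so a config can make decisions that span all of them before emitting its edits.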
78 changes: 39 additions & 39 deletions README.md
@@ -127,7 +127,7 @@ items:
string: gpu-18db0e85-99e9-c746-8531-ffeb86328b39
capacity:
memory: 80Gi
name: gpu-18db0e85-99e9-c746-8531-ffeb86328b39
name: gpu-0
- basic:
attributes:
driverVersion:
@@ -140,7 +140,7 @@ items:
string: gpu-93d37703-997c-c46f-a531-755e3e0dc2ac
capacity:
memory: 80Gi
name: gpu-93d37703-997c-c46f-a531-755e3e0dc2ac
name: gpu-1
- basic:
attributes:
driverVersion:
@@ -153,7 +153,7 @@ items:
string: gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744
capacity:
memory: 80Gi
name: gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744
name: gpu-2
- basic:
attributes:
driverVersion:
@@ -166,7 +166,7 @@ items:
string: gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243
capacity:
memory: 80Gi
name: gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243
name: gpu-3
- basic:
attributes:
driverVersion:
@@ -179,7 +179,7 @@ items:
string: gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747
capacity:
memory: 80Gi
name: gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747
name: gpu-4
- basic:
attributes:
driverVersion:
@@ -192,7 +192,7 @@ items:
string: gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e
capacity:
memory: 80Gi
name: gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e
name: gpu-5
- basic:
attributes:
driverVersion:
@@ -205,7 +205,7 @@ items:
string: gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac
capacity:
memory: 80Gi
name: gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac
name: gpu-6
- basic:
attributes:
driverVersion:
@@ -218,7 +218,7 @@ items:
string: gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b
capacity:
memory: 80Gi
name: gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b
name: gpu-7
kind: List
metadata:
resourceVersion: ""
@@ -261,9 +261,9 @@ for example in $(seq 1 5); do \
for ctr in $(kubectl get pod -n gpu-test${example} ${pod} -o jsonpath='{.spec.containers[*].name}'); do \
echo "${pod} ${ctr}:"
if [ "${example}" -lt 3 ]; then
kubectl logs -n gpu-test${example} ${pod} -c ${ctr}| grep -E "GPU_DEVICE_[0-9]+="
kubectl logs -n gpu-test${example} ${pod} -c ${ctr}| grep -E "GPU_DEVICE_[0-9]+=" | grep -v "RESOURCE_CLAIM"
else
kubectl logs -n gpu-test${example} ${pod} -c ${ctr}| grep -E "GPU_DEVICE_[0-9]+"
kubectl logs -n gpu-test${example} ${pod} -c ${ctr}| grep -E "GPU_DEVICE_[0-9]+" | grep -v "RESOURCE_CLAIM"
fi
done
done
@@ -275,60 +275,60 @@ This should produce output similar to the following:
```bash
gpu-test1:
pod0 ctr0:
declare -x GPU_DEVICE_0="gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744"
declare -x GPU_DEVICE_6="gpu-6"
pod1 ctr0:
declare -x GPU_DEVICE_0="gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243"
declare -x GPU_DEVICE_7="gpu-7"

gpu-test2:
pod0 ctr0:
declare -x GPU_DEVICE_0="gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747"
declare -x GPU_DEVICE_1="gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e"
declare -x GPU_DEVICE_0="gpu-0"
declare -x GPU_DEVICE_1="gpu-1"

gpu-test3:
pod0 ctr0:
declare -x GPU_DEVICE_0="gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
declare -x GPU_DEVICE_2="gpu-2"
declare -x GPU_DEVICE_2_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_2_TIMESLICE_INTERVAL="Default"
pod0 ctr1:
declare -x GPU_DEVICE_0="gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
declare -x GPU_DEVICE_2="gpu-2"
declare -x GPU_DEVICE_2_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_2_TIMESLICE_INTERVAL="Default"

gpu-test4:
pod0 ctr0:
declare -x GPU_DEVICE_0="gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
declare -x GPU_DEVICE_3="gpu-3"
declare -x GPU_DEVICE_3_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_3_TIMESLICE_INTERVAL="Default"
pod1 ctr0:
declare -x GPU_DEVICE_0="gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
declare -x GPU_DEVICE_3="gpu-3"
declare -x GPU_DEVICE_3_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_3_TIMESLICE_INTERVAL="Default"

gpu-test5:
pod0 ts-ctr0:
declare -x GPU_DEVICE_0="gpu-18db0e85-99e9-c746-8531-ffeb86328b39"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Long"
declare -x GPU_DEVICE_4="gpu-4"
declare -x GPU_DEVICE_4_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_4_TIMESLICE_INTERVAL="Long"
pod0 ts-ctr1:
declare -x GPU_DEVICE_0="gpu-18db0e85-99e9-c746-8531-ffeb86328b39"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Long"
declare -x GPU_DEVICE_4="gpu-4"
declare -x GPU_DEVICE_4_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_4_TIMESLICE_INTERVAL="Long"
pod0 sp-ctr0:
declare -x GPU_DEVICE_1="gpu-93d37703-997c-c46f-a531-755e3e0dc2ac"
declare -x GPU_DEVICE_1_PARTITION_COUNT="10"
declare -x GPU_DEVICE_1_SHARING_STRATEGY="SpacePartitioning"
declare -x GPU_DEVICE_5="gpu-5"
declare -x GPU_DEVICE_5_PARTITION_COUNT="10"
declare -x GPU_DEVICE_5_SHARING_STRATEGY="SpacePartitioning"
pod0 sp-ctr1:
declare -x GPU_DEVICE_1="gpu-93d37703-997c-c46f-a531-755e3e0dc2ac"
declare -x GPU_DEVICE_1_PARTITION_COUNT="10"
declare -x GPU_DEVICE_1_SHARING_STRATEGY="SpacePartitioning"
declare -x GPU_DEVICE_5="gpu-5"
declare -x GPU_DEVICE_5_PARTITION_COUNT="10"
declare -x GPU_DEVICE_5_SHARING_STRATEGY="SpacePartitioning"
```

In this example resource driver, no "actual" GPUs are made available to any
containers. Instead, a set of environment variables are set in each container
to indicate which GPUs *would* have been injected into them by a real resource
driver and how they *would* have been configured.

You can use the UUIDs of the GPUs as well as the GPU sharing settings set in
You can use the IDs of the GPUs as well as the GPU sharing settings set in
these environment variables to verify that they were handed out in a way
consistent with the semantics shown in the figure above.
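
For instance, a workload could discover its (simulated) allocation by scanning these variables at startup. The following standalone sketch is illustrative only and not part of this repository:

```go
package main

import (
    "fmt"
    "os"
    "regexp"
)

func main() {
    // Match variables of the form GPU_DEVICE_<index>=<device-id>.
    re := regexp.MustCompile(`^GPU_DEVICE_([0-9]+)=(.+)$`)
    for _, kv := range os.Environ() {
        m := re.FindStringSubmatch(kv)
        if m == nil {
            continue
        }
        index, id := m[1], m[2]
        strategy := os.Getenv(fmt.Sprintf("GPU_DEVICE_%s_SHARING_STRATEGY", index))
        fmt.Printf("GPU %s (index %s), sharing strategy: %q\n", id, index, strategy)
    }
}
```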

41 changes: 12 additions & 29 deletions cmd/dra-example-kubeletplugin/cdi.go
@@ -89,36 +89,19 @@ func (cdi *CDIHandler) CreateClaimSpecFile(claimUID string, devices PreparedDevi
Devices: []cdispec.Device{},
}

for i, device := range devices {
envs := []string{
fmt.Sprintf("GPU_DEVICE_%d=%s", i, device.DeviceName),
}

if device.Config.Sharing != nil {
envs = append(envs, fmt.Sprintf("GPU_DEVICE_%d_SHARING_STRATEGY=%s", i, device.Config.Sharing.Strategy))
}

switch {
case device.Config.Sharing.IsTimeSlicing():
tsconfig, err := device.Config.Sharing.GetTimeSlicingConfig()
if err != nil {
return fmt.Errorf("unable to get time slicing config for device %v: %v", device.DeviceName, err)
}
envs = append(envs, fmt.Sprintf("GPU_DEVICE_%d_TIMESLICE_INTERVAL=%v", i, tsconfig.Interval))

case device.Config.Sharing.IsSpacePartitioning():
spconfig, err := device.Config.Sharing.GetSpacePartitioningConfig()
if err != nil {
return fmt.Errorf("unable to get space partitioning config for device %v: %v", device.DeviceName, err)
}
envs = append(envs, fmt.Sprintf("GPU_DEVICE_%d_PARTITION_COUNT=%v", i, spconfig.PartitionCount))
for _, device := range devices {
claimEdits := cdiapi.ContainerEdits{
ContainerEdits: &cdispec.ContainerEdits{
Env: []string{
fmt.Sprintf("GPU_DEVICE_%s_RESOURCE_CLAIM=%s", device.DeviceName[4:], claimUID),
},
},
}
claimEdits.Append(device.ContainerEdits)

cdiDevice := cdispec.Device{
Name: device.DeviceName,
ContainerEdits: cdispec.ContainerEdits{
Env: envs,
},
Name: fmt.Sprintf("%s-%s", claimUID, device.DeviceName),
ContainerEdits: *claimEdits.ContainerEdits,
}

spec.Devices = append(spec.Devices, cdiDevice)
@@ -138,13 +138,13 @@ func (cdi *CDIHandler) DeleteClaimSpecFile(claimUID string) error {
return cdi.cache.RemoveSpec(specName)
}

func (cdi *CDIHandler) GetClaimDevices(devices []string) []string {
func (cdi *CDIHandler) GetClaimDevices(claimUID string, devices []string) []string {
cdiDevices := []string{
cdiparser.QualifiedName(cdiVendor, cdiClass, cdiCommonDeviceName),
}

for _, device := range devices {
cdiDevice := cdiparser.QualifiedName(cdiVendor, cdiClass, device)
cdiDevice := cdiparser.QualifiedName(cdiVendor, cdiClass, fmt.Sprintf("%s-%s", claimUID, device))
cdiDevices = append(cdiDevices, cdiDevice)
}

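Since `GetClaimDevices` now prefixes each device name with the claim UID, the fully qualified CDI device names it returns are scoped per claim. A rough sketch of the resulting format is below; the vendor, class, and claim UID values are placeholders rather than the driver's actual constants:

```go
package main

import "fmt"

// Placeholder values for illustration only; the real cdiVendor and cdiClass
// constants are defined elsewhere in the plugin.
const (
    cdiVendor = "gpu.example.com"
    cdiClass  = "gpu"
)

func main() {
    claimUID := "example-claim-uid" // hypothetical claim UID
    deviceName := "gpu-0"           // allocatable device name after this commit
    // Mirrors cdiparser.QualifiedName(cdiVendor, cdiClass, fmt.Sprintf("%s-%s", claimUID, deviceName)).
    qualified := fmt.Sprintf("%s/%s=%s-%s", cdiVendor, cdiClass, claimUID, deviceName)
    fmt.Println(qualified) // gpu.example.com/gpu=example-claim-uid-gpu-0
}
```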
5 changes: 3 additions & 2 deletions cmd/dra-example-kubeletplugin/discovery.go
@@ -17,6 +17,7 @@
package main

import (
"fmt"
"math/rand"
"os"

@@ -35,7 +36,7 @@ func enumerateAllPossibleDevices() (AllocatableDevices, error) {
alldevices := make(AllocatableDevices)
for i, uuid := range uuids {
device := resourceapi.Device{
Name: uuid,
Name: fmt.Sprintf("gpu-%d", i),
Basic: &resourceapi.BasicDevice{
Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{
"index": {
@@ -56,7 +57,7 @@ },
},
},
}
alldevices[uuid] = device
alldevices[device.Name] = device
}
return alldevices, nil
}