Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor how opaque device configs are handled #58

Merged
merged 2 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
- name: Install Go
uses: actions/setup-go@v4
with:
go-version: 1.22.6
go-version: 1.23.1
- name: Checkout code
uses: actions/checkout@v3
- name: Build
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
test:
strategy:
matrix:
version: ['1.22.2' ]
version: ['1.23.1' ]
platform: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.platform }}
steps:
Expand Down
78 changes: 39 additions & 39 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ items:
string: gpu-18db0e85-99e9-c746-8531-ffeb86328b39
capacity:
memory: 80Gi
name: gpu-18db0e85-99e9-c746-8531-ffeb86328b39
name: gpu-0
- basic:
attributes:
driverVersion:
Expand All @@ -140,7 +140,7 @@ items:
string: gpu-93d37703-997c-c46f-a531-755e3e0dc2ac
capacity:
memory: 80Gi
name: gpu-93d37703-997c-c46f-a531-755e3e0dc2ac
name: gpu-1
- basic:
attributes:
driverVersion:
Expand All @@ -153,7 +153,7 @@ items:
string: gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744
capacity:
memory: 80Gi
name: gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744
name: gpu-2
- basic:
attributes:
driverVersion:
Expand All @@ -166,7 +166,7 @@ items:
string: gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243
capacity:
memory: 80Gi
name: gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243
name: gpu-3
- basic:
attributes:
driverVersion:
Expand All @@ -179,7 +179,7 @@ items:
string: gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747
capacity:
memory: 80Gi
name: gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747
name: gpu-4
- basic:
attributes:
driverVersion:
Expand All @@ -192,7 +192,7 @@ items:
string: gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e
capacity:
memory: 80Gi
name: gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e
name: gpu-5
- basic:
attributes:
driverVersion:
Expand All @@ -205,7 +205,7 @@ items:
string: gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac
capacity:
memory: 80Gi
name: gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac
name: gpu-6
- basic:
attributes:
driverVersion:
Expand All @@ -218,7 +218,7 @@ items:
string: gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b
capacity:
memory: 80Gi
name: gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b
name: gpu-7
kind: List
metadata:
resourceVersion: ""
Expand Down Expand Up @@ -261,9 +261,9 @@ for example in $(seq 1 5); do \
for ctr in $(kubectl get pod -n gpu-test${example} ${pod} -o jsonpath='{.spec.containers[*].name}'); do \
echo "${pod} ${ctr}:"
if [ "${example}" -lt 3 ]; then
kubectl logs -n gpu-test${example} ${pod} -c ${ctr}| grep -E "GPU_DEVICE_[0-9]+="
kubectl logs -n gpu-test${example} ${pod} -c ${ctr}| grep -E "GPU_DEVICE_[0-9]+=" | grep -v "RESOURCE_CLAIM"
else
kubectl logs -n gpu-test${example} ${pod} -c ${ctr}| grep -E "GPU_DEVICE_[0-9]+"
kubectl logs -n gpu-test${example} ${pod} -c ${ctr}| grep -E "GPU_DEVICE_[0-9]+" | grep -v "RESOURCE_CLAIM"
fi
done
done
Expand All @@ -275,60 +275,60 @@ This should produce output similar to the following:
```bash
gpu-test1:
pod0 ctr0:
declare -x GPU_DEVICE_0="gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744"
declare -x GPU_DEVICE_6="gpu-6"
pod1 ctr0:
declare -x GPU_DEVICE_0="gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243"
declare -x GPU_DEVICE_7="gpu-7"

gpu-test2:
pod0 ctr0:
declare -x GPU_DEVICE_0="gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747"
declare -x GPU_DEVICE_1="gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e"
declare -x GPU_DEVICE_0="gpu-0"
declare -x GPU_DEVICE_1="gpu-1"

gpu-test3:
pod0 ctr0:
declare -x GPU_DEVICE_0="gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
declare -x GPU_DEVICE_2="gpu-2"
declare -x GPU_DEVICE_2_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_2_TIMESLICE_INTERVAL="Default"
pod0 ctr1:
declare -x GPU_DEVICE_0="gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
declare -x GPU_DEVICE_2="gpu-2"
declare -x GPU_DEVICE_2_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_2_TIMESLICE_INTERVAL="Default"

gpu-test4:
pod0 ctr0:
declare -x GPU_DEVICE_0="gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
declare -x GPU_DEVICE_3="gpu-3"
declare -x GPU_DEVICE_3_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_3_TIMESLICE_INTERVAL="Default"
pod1 ctr0:
declare -x GPU_DEVICE_0="gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Default"
declare -x GPU_DEVICE_3="gpu-3"
declare -x GPU_DEVICE_3_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_3_TIMESLICE_INTERVAL="Default"

gpu-test5:
pod0 ts-ctr0:
declare -x GPU_DEVICE_0="gpu-18db0e85-99e9-c746-8531-ffeb86328b39"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Long"
declare -x GPU_DEVICE_4="gpu-4"
declare -x GPU_DEVICE_4_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_4_TIMESLICE_INTERVAL="Long"
pod0 ts-ctr1:
declare -x GPU_DEVICE_0="gpu-18db0e85-99e9-c746-8531-ffeb86328b39"
declare -x GPU_DEVICE_0_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_0_TIMESLICE_INTERVAL="Long"
declare -x GPU_DEVICE_4="gpu-4"
declare -x GPU_DEVICE_4_SHARING_STRATEGY="TimeSlicing"
declare -x GPU_DEVICE_4_TIMESLICE_INTERVAL="Long"
pod0 sp-ctr0:
declare -x GPU_DEVICE_1="gpu-93d37703-997c-c46f-a531-755e3e0dc2ac"
declare -x GPU_DEVICE_1_PARTITION_COUNT="10"
declare -x GPU_DEVICE_1_SHARING_STRATEGY="SpacePartitioning"
declare -x GPU_DEVICE_5="gpu-5"
declare -x GPU_DEVICE_5_PARTITION_COUNT="10"
declare -x GPU_DEVICE_5_SHARING_STRATEGY="SpacePartitioning"
pod0 sp-ctr1:
declare -x GPU_DEVICE_1="gpu-93d37703-997c-c46f-a531-755e3e0dc2ac"
declare -x GPU_DEVICE_1_PARTITION_COUNT="10"
declare -x GPU_DEVICE_1_SHARING_STRATEGY="SpacePartitioning"
declare -x GPU_DEVICE_5="gpu-5"
declare -x GPU_DEVICE_5_PARTITION_COUNT="10"
declare -x GPU_DEVICE_5_SHARING_STRATEGY="SpacePartitioning"
```

In this example resource driver, no "actual" GPUs are made available to any
containers. Instead, a set of environment variables is set in each container
to indicate which GPUs *would* have been injected into them by a real resource
driver and how they *would* have been configured.

You can use the UUIDs of the GPUs as well as the GPU sharing settings set in
You can use the IDs of the GPUs as well as the GPU sharing settings set in
these environment variables to verify that they were handed out in a way
consistent with the semantics shown in the figure above.

Expand Down
41 changes: 12 additions & 29 deletions cmd/dra-example-kubeletplugin/cdi.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,36 +89,19 @@ func (cdi *CDIHandler) CreateClaimSpecFile(claimUID string, devices PreparedDevi
Devices: []cdispec.Device{},
}

for i, device := range devices {
envs := []string{
fmt.Sprintf("GPU_DEVICE_%d=%s", i, device.DeviceName),
}

if device.Config.Sharing != nil {
envs = append(envs, fmt.Sprintf("GPU_DEVICE_%d_SHARING_STRATEGY=%s", i, device.Config.Sharing.Strategy))
}

switch {
case device.Config.Sharing.IsTimeSlicing():
tsconfig, err := device.Config.Sharing.GetTimeSlicingConfig()
if err != nil {
return fmt.Errorf("unable to get time slicing config for device %v: %v", device.DeviceName, err)
}
envs = append(envs, fmt.Sprintf("GPU_DEVICE_%d_TIMESLICE_INTERVAL=%v", i, tsconfig.Interval))

case device.Config.Sharing.IsSpacePartitioning():
spconfig, err := device.Config.Sharing.GetSpacePartitioningConfig()
if err != nil {
return fmt.Errorf("unable to get space partitioning config for device %v: %v", device.DeviceName, err)
}
envs = append(envs, fmt.Sprintf("GPU_DEVICE_%d_PARTITION_COUNT=%v", i, spconfig.PartitionCount))
for _, device := range devices {
claimEdits := cdiapi.ContainerEdits{
ContainerEdits: &cdispec.ContainerEdits{
Env: []string{
fmt.Sprintf("GPU_DEVICE_%s_RESOURCE_CLAIM=%s", device.DeviceName[4:], claimUID),
},
},
}
claimEdits.Append(device.ContainerEdits)

cdiDevice := cdispec.Device{
Name: device.DeviceName,
ContainerEdits: cdispec.ContainerEdits{
Env: envs,
},
Name: fmt.Sprintf("%s-%s", claimUID, device.DeviceName),
ContainerEdits: *claimEdits.ContainerEdits,
}

spec.Devices = append(spec.Devices, cdiDevice)
Expand All @@ -138,13 +121,13 @@ func (cdi *CDIHandler) DeleteClaimSpecFile(claimUID string) error {
return cdi.cache.RemoveSpec(specName)
}

func (cdi *CDIHandler) GetClaimDevices(devices []string) []string {
func (cdi *CDIHandler) GetClaimDevices(claimUID string, devices []string) []string {
cdiDevices := []string{
cdiparser.QualifiedName(cdiVendor, cdiClass, cdiCommonDeviceName),
}

for _, device := range devices {
cdiDevice := cdiparser.QualifiedName(cdiVendor, cdiClass, device)
cdiDevice := cdiparser.QualifiedName(cdiVendor, cdiClass, fmt.Sprintf("%s-%s", claimUID, device))
cdiDevices = append(cdiDevices, cdiDevice)
}

Expand Down
5 changes: 3 additions & 2 deletions cmd/dra-example-kubeletplugin/discovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package main

import (
"fmt"
"math/rand"
"os"

Expand All @@ -35,7 +36,7 @@ func enumerateAllPossibleDevices() (AllocatableDevices, error) {
alldevices := make(AllocatableDevices)
for i, uuid := range uuids {
device := resourceapi.Device{
Name: uuid,
Name: fmt.Sprintf("gpu-%d", i),
Basic: &resourceapi.BasicDevice{
Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{
"index": {
Expand All @@ -56,7 +57,7 @@ func enumerateAllPossibleDevices() (AllocatableDevices, error) {
},
},
}
alldevices[uuid] = device
alldevices[device.Name] = device
}
return alldevices, nil
}
Expand Down
Loading
Loading