Skip to content

Commit

Permalink
gpu: add support for CDI devices
Browse files Browse the repository at this point in the history
Signed-off-by: Tuomas Katila <[email protected]>
  • Loading branch information
tkatila committed Sep 11, 2024
1 parent 13e00f0 commit 52a6dc7
Show file tree
Hide file tree
Showing 5 changed files with 379 additions and 23 deletions.
14 changes: 14 additions & 0 deletions cmd/gpu_plugin/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Table of Contents
* [Running GPU plugin as non-root](#running-gpu-plugin-as-non-root)
* [Labels created by GPU plugin](#labels-created-by-gpu-plugin)
* [SR-IOV use with the plugin](#sr-iov-use-with-the-plugin)
* [CDI support](#cdi-support)
* [KMD and UMD](#kmd-and-umd)
* [Issues with media workloads on multi-GPU setups](#issues-with-media-workloads-on-multi-gpu-setups)
* [Workaround for QSV and VA-API](#workaround-for-qsv-and-va-api)
Expand Down Expand Up @@ -218,6 +219,19 @@ GPU plugin does __not__ setup SR-IOV. It has to be configured by the cluster adm
GPU plugin does, however, support provisioning Virtual Functions (VFs) to containers for an SR-IOV enabled GPU. When the plugin detects a GPU with SR-IOV VFs configured, it will only provision the VFs and leave the PF device on the host.
### CDI support
GPU plugin supports [CDI](https://github.com/container-orchestrated-devices/container-device-interface) to provide device details to the container. It does not yet provide any benefits compared to the traditional Kubernetes Device Plugin API. The CDI device specs will improve in the future with features that are not possible with the Device Plugin API.
To enable CDI support, the container runtime has to support it. The support varies depending on the version:
* CRI-O supports CDI by default from v1.24.0 onwards.
* Containerd supports CDI from 1.7.0 onwards. The 2.0.0 release will enable it by default.
* Docker supports CDI from v25 onwards.
Kubernetes has included CDI support since the 1.28 release. In 1.28 it needs to be enabled via the `DevicePluginCDIDevices` feature gate. From 1.29 onwards the feature is enabled by default.
> *NOTE*: To use CDI outside of Kubernetes, for example with Docker or Podman, CDI specs can be generated with the [Intel CDI specs generator](https://github.com/intel/intel-resource-drivers-for-kubernetes/releases/tag/specs-generator-v0.1.0).
### KMD and UMD
There are 3 different Kernel Mode Drivers (KMD) available: `i915 upstream`, `i915 backport` and `xe`:
Expand Down
74 changes: 63 additions & 11 deletions cmd/gpu_plugin/gpu_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm"
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler"
dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
cdispec "tags.cncf.io/container-device-interface/specs-go"
)

const (
Expand Down Expand Up @@ -202,13 +203,10 @@ func packedPolicy(req *pluginapi.ContainerPreferredAllocationRequest) []string {
return deviceIds
}

// Returns a slice of by-path Mounts for a cardPath&Name.
// by-path files are searched from the given bypathDir.
// In the by-path dir, any files that start with "pci-<pci addr>" will be added to mounts.
func (dp *devicePlugin) bypathMountsForPci(cardPath, cardName, bypathDir string) []pluginapi.Mount {
func (dp *devicePlugin) pciAddressForCard(cardPath, cardName string) (string, error) {
linkPath, err := os.Readlink(cardPath)
if err != nil {
return nil
return "", err
}

// Fetches the pci address for a drm card by reading the
Expand All @@ -220,9 +218,27 @@ func (dp *devicePlugin) bypathMountsForPci(cardPath, cardName, bypathDir string)
if !dp.pciAddressReg.MatchString(pciAddress) {
klog.Warningf("Invalid pci address for %s: %s", cardPath, pciAddress)

return nil
return "", os.ErrInvalid
}

return pciAddress, nil
}

// pciDeviceIDForCard reads the file "device/device" located under cardPath and
// returns its first line, i.e. everything that precedes the first newline.
// If the file contains no newline, the whole content is returned.
//
// NOTE(review): presumably this is the sysfs PCI device ID attribute of the
// card; confirm against the callers.
//
// A read failure is returned as-is together with an empty string.
func pciDeviceIDForCard(cardPath string) (string, error) {
	raw, readErr := os.ReadFile(filepath.Join(cardPath, "device", "device"))
	if readErr != nil {
		return "", readErr
	}

	firstLine := string(raw)
	// Keep only the text in front of the first line break, if there is one.
	if nl := strings.IndexByte(firstLine, '\n'); nl >= 0 {
		firstLine = firstLine[:nl]
	}

	return firstLine, nil
}

// Returns a slice of by-path Mounts for a pciAddress.
// by-path files are searched from the given bypathDir.
// In the by-path dir, any files that start with "pci-<pci addr>" will be added to mounts.
func (dp *devicePlugin) bypathMountsForPci(pciAddress, bypathDir string) []pluginapi.Mount {
files, err := os.ReadDir(bypathDir)
if err != nil {
klog.Warningf("Failed to read by-path directory: %+v", err)
Expand Down Expand Up @@ -481,6 +497,45 @@ func (dp *devicePlugin) createDeviceSpecsFromDrmFiles(cardPath string) []plugina
return specs
}

// createMountsAndCDIDevices returns the by-path mounts for a card together
// with a CDI spec that describes the card as a single device called name.
//
// Mounts are looked up only when dp.bypathFound is set and the PCI address of
// the card resolves without error; in every other case the mount list stays
// empty. The CDI device carries one device node per entry in devSpecs and one
// read-only bind mount per discovered by-path mount.
func (dp *devicePlugin) createMountsAndCDIDevices(cardPath, name string, devSpecs []pluginapi.DeviceSpec) ([]pluginapi.Mount, *cdispec.Spec) {
	mounts := []pluginapi.Mount{}

	if dp.bypathFound {
		// Failing to resolve the PCI address is tolerated: the card is then
		// simply exposed without by-path mounts.
		pciAddr, err := dp.pciAddressForCard(cardPath, name)
		if err == nil {
			mounts = dp.bypathMountsForPci(pciAddr, dp.bypathDir)
		}
	}

	var edits cdispec.ContainerEdits

	// Mirror every device plugin device spec as a CDI device node.
	for i := range devSpecs {
		node := &cdispec.DeviceNode{
			HostPath:    devSpecs[i].HostPath,
			Path:        devSpecs[i].ContainerPath,
			Permissions: devSpecs[i].Permissions,
		}
		edits.DeviceNodes = append(edits.DeviceNodes, node)
	}

	// Mirror every by-path mount as a read-only bind mount.
	for i := range mounts {
		bind := &cdispec.Mount{
			HostPath:      mounts[i].HostPath,
			ContainerPath: mounts[i].ContainerPath,
			Type:          "none",
			Options:       []string{"bind", "ro"},
		}
		edits.Mounts = append(edits.Mounts, bind)
	}

	cdiSpec := &cdispec.Spec{
		Version: dpapi.CDIVersion,
		Kind:    dpapi.CDIVendor + "/gpu",
		Devices: []cdispec.Device{
			{
				Name:           name,
				ContainerEdits: edits,
			},
		},
	}

	return mounts, cdiSpec
}

func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
files, err := os.ReadDir(dp.sysfsDir)
if err != nil {
Expand Down Expand Up @@ -509,12 +564,9 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
continue
}

mounts := []pluginapi.Mount{}
if dp.bypathFound {
mounts = dp.bypathMountsForPci(cardPath, name, dp.bypathDir)
}
mounts, cdiDevices := dp.createMountsAndCDIDevices(cardPath, name, devSpecs)

deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, nil, nil, nil)
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, nil, nil, cdiDevices)

for i := 0; i < dp.options.sharedDevNum; i++ {
devID := fmt.Sprintf("%s-%d", name, i)
Expand Down
Loading

0 comments on commit 52a6dc7

Please sign in to comment.