diff --git a/.github/workflows/lib-e2e.yaml b/.github/workflows/lib-e2e.yaml index 057c03687..1678243be 100644 --- a/.github/workflows/lib-e2e.yaml +++ b/.github/workflows/lib-e2e.yaml @@ -25,6 +25,7 @@ jobs: - name: e2e-gpu runner: gpu images: intel-gpu-plugin intel-gpu-initcontainer + targetJob: e2e-gpu SKIP=Resource:xe - name: e2e-iaa-spr targetjob: e2e-iaa runner: simics-spr diff --git a/README.md b/README.md index 8417a91ed..bc81fad78 100644 --- a/README.md +++ b/README.md @@ -229,7 +229,7 @@ The summary of resources available via plugins in this repository is given in th * [dsa-accel-config-demo-pod.yaml](demo/dsa-accel-config-demo-pod.yaml) * `fpga.intel.com` : custom, see [mappings](cmd/fpga_admissionwebhook/README.md#mappings) * [intelfpga-job.yaml](demo/intelfpga-job.yaml) - * `gpu.intel.com` : `i915` + * `gpu.intel.com` : `i915`, `i915_monitoring`, `xe` or `xe_monitoring` * [intelgpu-job.yaml](demo/intelgpu-job.yaml) * `iaa.intel.com` : `wq-user-[shared or dedicated]` * [iaa-accel-config-demo-pod.yaml](demo/iaa-accel-config-demo-pod.yaml) diff --git a/cmd/gpu_plugin/README.md b/cmd/gpu_plugin/README.md index 7019de164..e706bb362 100644 --- a/cmd/gpu_plugin/README.md +++ b/cmd/gpu_plugin/README.md @@ -16,6 +16,7 @@ Table of Contents * [Running GPU plugin as non-root](#running-gpu-plugin-as-non-root) * [Labels created by GPU plugin](#labels-created-by-gpu-plugin) * [SR-IOV use with the plugin](#sr-iov-use-with-the-plugin) + * [KMD and UMD](#kmd-and-umd) * [Issues with media workloads on multi-GPU setups](#issues-with-media-workloads-on-multi-gpu-setups) * [Workaround for QSV and VA-API](#workaround-for-qsv-and-va-api) @@ -36,11 +37,23 @@ For example containers with Intel media driver (and components using that), can video transcoding operations, and containers with the Intel OpenCL / oneAPI Level Zero backend libraries can offload compute operations to GPU. +Intel GPU plugin may register four node resources to the Kubernetes cluster: +| Resource | Description | +|:---- |:-------- | +| gpu.intel.com/i915 | GPU instance running legacy `i915` KMD | +| gpu.intel.com/i915_monitoring | Monitoring resource for the legacy `i915` KMD devices | +| gpu.intel.com/xe | GPU instance running new `xe` KMD | +| gpu.intel.com/xe_monitoring | Monitoring resource for the new `xe` KMD devices | + +While GPU plugin basic operations support nodes having both (`i915` and `xe`) KMDs on the same node, its resource management (=GAS) does not, for that node needs to have only one of the KMDs present. + +For workloads on different KMDs, see [KMD and UMD](#kmd-and-umd). + ## Modes and Configuration Options | Flag | Argument | Default | Meaning | |:---- |:-------- |:------- |:------- | -| -enable-monitoring | - | disabled | Enable 'i915_monitoring' resource that provides access to all Intel GPU devices on the node | +| -enable-monitoring | - | disabled | Enable '*_monitoring' resource that provides access to all Intel GPU devices on the node, [see use](./monitoring.md) | | -resource-manager | - | disabled | Enable fractional resource management, [see use](./fractional.md) | | -shared-dev-num | int | 1 | Number of containers that can share the same GPU device | | -allocation-policy | string | none | 3 possible values: balanced, packed, none. For shared-dev-num > 1: _balanced_ mode spreads workloads among GPU devices, _packed_ mode fills one GPU fully before moving to next, and _none_ selects first available device from kubelet. Default is _none_. 
Allocation policy does not have an effect when resource manager is enabled. | @@ -205,6 +218,31 @@ GPU plugin does __not__ setup SR-IOV. It has to be configured by the cluster adm GPU plugin does however support provisioning Virtual Functions (VFs) to containers for a SR-IOV enabled GPU. When the plugin detects a GPU with SR-IOV VFs configured, it will only provision the VFs and leaves the PF device on the host. +### KMD and UMD + +There are 3 different Kernel Mode Drivers (KMD) available: `i915 upstream`, `i915 backport` and `xe`: +* `i915 upstream` is a vanilla driver that comes from the upstream kernel and is included in the common Linux distributions, like Ubuntu. +* `i915 backport` is an [out-of-tree driver](https://github.com/intel-gpu/intel-gpu-i915-backports/) for older enterprise / LTS kernel versions, having better support for new HW before upstream kernel does. API it provides to user-space can differ from the eventual upstream version. +* `xe` is a new KMD that is intended to support future GPUs. While it has [experimental support for latest current GPUs](https://docs.kernel.org/gpu/rfc/xe.html) (starting from Tigerlake), it will not support them officially. + +For optimal performance, the KMD should be paired with the same UMD variant. When creating a workload container, depending on the target hardware, the UMD packages should be selected approriately. + +| KMD | UMD packages | Support notes | +|:---- |:-------- |:------- | +| `i915 upstream` | Distro Repository | For Integrated GPUs. Newer Linux kernels will introduce support for Arc, Flex or Max series. | +| `i915 backport` | [Intel Repository](https://dgpu-docs.intel.com/driver/installation.html#install-steps) | Best for Arc, Flex and Max series. Untested for Integrated GPUs. | +| `xe` | Source code only | Experimental support for Arc, Flex and Max series. | + +> *NOTE*: Xe UMD is in active development and should be considered as experimental. + +Creating a workload that would support all the different KMDs is not currently possible. Below is a table that clarifies how each domain supports different KMDs. + +| Domain | i915 upstream | i915 backport | xe | Notes | +|:---- |:-------- |:------- |:------- |:------- | +| Compute | Default | [NEO_ENABLE_i915_PRELIM_DETECTION](https://github.com/intel/compute-runtime/blob/3341de7a0d5fddd2ea5f505b5d2ef5c13faa0681/CMakeLists.txt#L496-L502) | [NEO_ENABLE_XE_DRM_DETECTION](https://github.com/intel/compute-runtime/blob/3341de7a0d5fddd2ea5f505b5d2ef5c13faa0681/CMakeLists.txt#L504-L510) | All three KMDs can be supported at the same time. | +| Media | Default | [ENABLE_PRODUCTION_KMD](https://github.com/intel/media-driver/blob/a66b076e83876fbfa9c9ab633ad9c5517f8d74fd/CMakeLists.txt#L58) | [ENABLE_XE_KMD](https://github.com/intel/media-driver/blob/a66b076e83876fbfa9c9ab633ad9c5517f8d74fd/media_driver/cmake/linux/media_feature_flags_linux.cmake#L187-L190) | Xe with upstream or backport i915, not all three. | +| Graphics | Default | Unknown | [intel-xe-kmd](https://gitlab.freedesktop.org/mesa/mesa/-/blob/e9169881dbd1f72eab65a68c2b8e7643f74489b7/meson_options.txt#L708) | i915 and xe KMDs can be supported at the same time. | + ### Issues with media workloads on multi-GPU setups OneVPL media API, 3D and compute APIs provide device discovery diff --git a/cmd/gpu_plugin/device_props.go b/cmd/gpu_plugin/device_props.go new file mode 100644 index 000000000..e6daf2f28 --- /dev/null +++ b/cmd/gpu_plugin/device_props.go @@ -0,0 +1,85 @@ +// Copyright 2024 Intel Corporation. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "slices" + + "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler" + "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils" + "k8s.io/klog/v2" +) + +type DeviceProperties struct { + currentDriver string + drmDrivers map[string]bool + tileCounts []uint64 + isPfWithVfs bool +} + +type invalidTileCountErr struct { + error +} + +func newDeviceProperties() *DeviceProperties { + return &DeviceProperties{ + drmDrivers: make(map[string]bool), + } +} + +func (d *DeviceProperties) fetch(cardPath string) { + d.isPfWithVfs = pluginutils.IsSriovPFwithVFs(cardPath) + + d.tileCounts = append(d.tileCounts, labeler.GetTileCount(cardPath)) + + driverName, err := pluginutils.ReadDeviceDriver(cardPath) + if err != nil { + klog.Warningf("card (%s) doesn't have driver, using default: %s", cardPath, deviceTypeDefault) + + driverName = deviceTypeDefault + } + + d.currentDriver = driverName + d.drmDrivers[d.currentDriver] = true +} + +func (d *DeviceProperties) drmDriverCount() int { + return len(d.drmDrivers) +} + +func (d *DeviceProperties) driver() string { + return d.currentDriver +} + +func (d *DeviceProperties) monitorResource() string { + return d.currentDriver + monitorSuffix +} + +func (d *DeviceProperties) maxTileCount() (uint64, error) { + if len(d.tileCounts) == 0 { + return 0, invalidTileCountErr{} + } + + minCount := slices.Min(d.tileCounts) + maxCount := slices.Max(d.tileCounts) + + if minCount != maxCount { + klog.Warningf("Node's GPUs are heterogenous (min: %d, max: %d tiles)", minCount, maxCount) + + return 0, invalidTileCountErr{} + } + + return maxCount, nil +} diff --git a/cmd/gpu_plugin/gpu_plugin.go b/cmd/gpu_plugin/gpu_plugin.go index 6f1ad4018..44c504263 100644 --- a/cmd/gpu_plugin/gpu_plugin.go +++ b/cmd/gpu_plugin/gpu_plugin.go @@ -17,6 +17,7 @@ package main import ( "flag" "fmt" + "io/fs" "os" "path" "path/filepath" @@ -32,7 +33,6 @@ import ( "github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm" "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler" - "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils" dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin" ) @@ -47,12 +47,14 @@ const ( vendorString = "0x8086" // Device plugin settings. - namespace = "gpu.intel.com" - deviceType = "i915" + namespace = "gpu.intel.com" + deviceTypeI915 = "i915" + deviceTypeXe = "xe" + deviceTypeDefault = deviceTypeI915 // telemetry resource settings. - monitorType = "i915_monitoring" - monitorID = "all" + monitorSuffix = "_monitoring" + monitorID = "all" // Period of device scans. 
scanPeriod = 5 * time.Second @@ -68,6 +70,10 @@ type cliOptions struct { resourceManagement bool } +type rmWithMultipleDriversErr struct { + error +} + type preferredAllocationPolicyFunc func(*pluginapi.ContainerPreferredAllocationRequest) []string // nonePolicy is used for allocating GPU devices randomly, while trying @@ -283,7 +289,11 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi if options.resourceManagement { var err error - dp.resMan, err = rm.NewResourceManager(monitorID, namespace+"/"+deviceType) + dp.resMan, err = rm.NewResourceManager(monitorID, + []string{ + namespace + "/" + deviceTypeI915, + namespace + "/" + deviceTypeXe, + }) if err != nil { klog.Errorf("Failed to create resource manager: %+v", err) return nil @@ -345,13 +355,20 @@ func (dp *devicePlugin) GetPreferredAllocation(rqt *pluginapi.PreferredAllocatio func (dp *devicePlugin) Scan(notifier dpapi.Notifier) error { defer dp.scanTicker.Stop() - klog.V(1).Infof("GPU '%s' resource share count = %d", deviceType, dp.options.sharedDevNum) + klog.V(1).Infof("GPU (%s/%s) resource share count = %d", deviceTypeI915, deviceTypeXe, dp.options.sharedDevNum) - previousCount := map[string]int{deviceType: 0, monitorType: 0} + previousCount := map[string]int{ + deviceTypeI915: 0, deviceTypeXe: 0, + deviceTypeXe + monitorSuffix: 0, + deviceTypeI915 + monitorSuffix: 0} for { devTree, err := dp.scan() if err != nil { + if errors.Is(err, rmWithMultipleDriversErr{}) { + return err + } + klog.Warning("Failed to scan: ", err) } @@ -426,81 +443,116 @@ func (dp *devicePlugin) devSpecForDrmFile(drmFile string) (devSpec pluginapi.Dev return } +func (dp *devicePlugin) filterOutInvalidCards(files []fs.DirEntry) []fs.DirEntry { + filtered := []fs.DirEntry{} + + for _, f := range files { + if !dp.isCompatibleDevice(f.Name()) { + continue + } + + _, err := os.Stat(path.Join(dp.sysfsDir, f.Name(), "device/drm")) + if err != nil { + continue + } + + filtered = append(filtered, f) + } + + return filtered +} + +func (dp *devicePlugin) createDeviceSpecsFromDrmFiles(cardPath string) []pluginapi.DeviceSpec { + specs := []pluginapi.DeviceSpec{} + + drmFiles, _ := os.ReadDir(path.Join(cardPath, "device/drm")) + + for _, drmFile := range drmFiles { + devSpec, devPath, devSpecErr := dp.devSpecForDrmFile(drmFile.Name()) + if devSpecErr != nil { + continue + } + + klog.V(4).Infof("Adding %s to GPU %s", devPath, filepath.Base(cardPath)) + + specs = append(specs, devSpec) + } + + return specs +} + func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) { files, err := os.ReadDir(dp.sysfsDir) if err != nil { return nil, errors.Wrap(err, "Can't read sysfs folder") } - var monitor []pluginapi.DeviceSpec + monitor := make(map[string][]pluginapi.DeviceSpec, 0) devTree := dpapi.NewDeviceTree() rmDevInfos := rm.NewDeviceInfoMap() - tileCounts := []uint64{} + devProps := newDeviceProperties() - for _, f := range files { - var nodes []pluginapi.DeviceSpec + for _, f := range dp.filterOutInvalidCards(files) { + name := f.Name() + cardPath := path.Join(dp.sysfsDir, name) - if !dp.isCompatibleDevice(f.Name()) { + devProps.fetch(cardPath) + + if devProps.isPfWithVfs { continue } - cardPath := path.Join(dp.sysfsDir, f.Name()) + devSpecs := dp.createDeviceSpecsFromDrmFiles(cardPath) - drmFiles, err := os.ReadDir(path.Join(cardPath, "device/drm")) - if err != nil { - return nil, errors.Wrap(err, "Can't read device folder") + if len(devSpecs) == 0 { + continue } - isPFwithVFs := pluginutils.IsSriovPFwithVFs(path.Join(dp.sysfsDir, f.Name())) - 
tileCounts = append(tileCounts, labeler.GetTileCount(dp.sysfsDir, f.Name())) - - for _, drmFile := range drmFiles { - devSpec, devPath, devSpecErr := dp.devSpecForDrmFile(drmFile.Name()) - if devSpecErr != nil { - continue - } - - if !isPFwithVFs { - klog.V(4).Infof("Adding %s to GPU %s", devPath, f.Name()) + mounts := []pluginapi.Mount{} + if dp.bypathFound { + mounts = dp.bypathMountsForPci(cardPath, name, dp.bypathDir) + } - nodes = append(nodes, devSpec) - } + deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, nil, nil) - if dp.options.enableMonitoring { - klog.V(4).Infof("Adding %s to GPU %s/%s", devPath, monitorType, monitorID) + for i := 0; i < dp.options.sharedDevNum; i++ { + devID := fmt.Sprintf("%s-%d", name, i) + devTree.AddDevice(devProps.driver(), devID, deviceInfo) - monitor = append(monitor, devSpec) - } + rmDevInfos[devID] = rm.NewDeviceInfo(devSpecs, mounts, nil) } - if len(nodes) > 0 { - mounts := []pluginapi.Mount{} - if dp.bypathFound { - mounts = dp.bypathMountsForPci(cardPath, f.Name(), dp.bypathDir) - } - - deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, nodes, mounts, nil, nil) - - for i := 0; i < dp.options.sharedDevNum; i++ { - devID := fmt.Sprintf("%s-%d", f.Name(), i) - // Currently only one device type (i915) is supported. - // TODO: check model ID to differentiate device models. - devTree.AddDevice(deviceType, devID, deviceInfo) + if dp.options.enableMonitoring { + res := devProps.monitorResource() + klog.V(4).Infof("For %s/%s, adding nodes: %+v", res, monitorID, devSpecs) - rmDevInfos[devID] = rm.NewDeviceInfo(nodes, mounts, nil) - } + monitor[res] = append(monitor[res], devSpecs...) } } - // all Intel GPUs are under single monitoring resource + + // all Intel GPUs are under single monitoring resource per KMD if len(monitor) > 0 { - deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, monitor, nil, nil, nil) - devTree.AddDevice(monitorType, monitorID, deviceInfo) + for resourceName, devices := range monitor { + deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devices, nil, nil, nil) + devTree.AddDevice(resourceName, monitorID, deviceInfo) + } } if dp.resMan != nil { - dp.resMan.SetDevInfos(rmDevInfos) - dp.resMan.SetTileCountPerCard(tileCounts) + if devProps.drmDriverCount() <= 1 { + dp.resMan.SetDevInfos(rmDevInfos) + + if tileCount, err := devProps.maxTileCount(); err == nil { + dp.resMan.SetTileCountPerCard(tileCount) + } + } else { + klog.Warning("Plugin with RM doesn't support multiple DRM drivers:", devProps.drmDrivers) + + err := rmWithMultipleDriversErr{} + + return nil, err + } } return devTree, nil @@ -521,7 +573,7 @@ func main() { ) flag.StringVar(&prefix, "prefix", "", "Prefix for devfs & sysfs paths") - flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable 'i915_monitoring' (= all GPUs) resource") + flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable '*_monitoring' (= all GPUs) resource") flag.BoolVar(&opts.resourceManagement, "resource-manager", false, "fractional GPU resource management") flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device") flag.StringVar(&opts.preferredAllocationPolicy, "allocation-policy", "none", "modes of allocating GPU devices: balanced, packed and none") diff --git a/cmd/gpu_plugin/gpu_plugin_test.go b/cmd/gpu_plugin/gpu_plugin_test.go index 0277a089f..e0ecd6b24 100644 --- a/cmd/gpu_plugin/gpu_plugin_test.go +++ b/cmd/gpu_plugin/gpu_plugin_test.go @@ -37,20 +37,26 @@ func init() { 
// mockNotifier implements Notifier interface. type mockNotifier struct { - scanDone chan bool - devCount int - monitorCount int + scanDone chan bool + i915Count int + xeCount int + i915monitorCount int + xeMonitorCount int } // Notify stops plugin Scan. func (n *mockNotifier) Notify(newDeviceTree dpapi.DeviceTree) { - n.monitorCount = len(newDeviceTree[monitorType]) - n.devCount = len(newDeviceTree[deviceType]) + n.xeCount = len(newDeviceTree[deviceTypeXe]) + n.xeMonitorCount = len(newDeviceTree[deviceTypeXe+monitorSuffix]) + n.i915Count = len(newDeviceTree[deviceTypeI915]) + n.i915monitorCount = len(newDeviceTree[deviceTypeDefault+monitorSuffix]) n.scanDone <- true } -type mockResourceManager struct{} +type mockResourceManager struct { + tileCount uint64 +} func (m *mockResourceManager) CreateFractionalResourceResponse(*v1beta1.AllocateRequest) (*v1beta1.AllocateResponse, error) { return &v1beta1.AllocateResponse{}, &dpapi.UseDefaultMethodError{} @@ -61,31 +67,62 @@ func (m *mockResourceManager) GetPreferredFractionalAllocation(*v1beta1.Preferre return &v1beta1.PreferredAllocationResponse{}, &dpapi.UseDefaultMethodError{} } -func (m *mockResourceManager) SetTileCountPerCard(counts []uint64) { +func (m *mockResourceManager) SetTileCountPerCard(count uint64) { + m.tileCount = count +} + +type TestCaseDetails struct { + name string + // test-case environment + sysfsdirs []string + sysfsfiles map[string][]byte + symlinkfiles map[string]string + devfsdirs []string + // how plugin should interpret it + options cliOptions + // what the result should be (i915) + expectedI915Devs int + expectedI915Monitors int + // what the result should be (xe) + expectedXeDevs int + expectedXeMonitors int } -func createTestFiles(root string, devfsdirs, sysfsdirs []string, sysfsfiles map[string][]byte) (string, string, error) { +func createTestFiles(root string, tc TestCaseDetails) (string, string, error) { sysfs := path.Join(root, "sys") devfs := path.Join(root, "dev") - for _, devfsdir := range devfsdirs { + for _, devfsdir := range tc.devfsdirs { if err := os.MkdirAll(path.Join(devfs, devfsdir), 0750); err != nil { return "", "", errors.Wrap(err, "Failed to create fake device directory") } } - for _, sysfsdir := range sysfsdirs { + for _, sysfsdir := range tc.sysfsdirs { if err := os.MkdirAll(path.Join(sysfs, sysfsdir), 0750); err != nil { return "", "", errors.Wrap(err, "Failed to create fake device directory") } } - for filename, body := range sysfsfiles { + for filename, body := range tc.sysfsfiles { if err := os.WriteFile(path.Join(sysfs, filename), body, 0600); err != nil { return "", "", errors.Wrap(err, "Failed to create fake vendor file") } } + for source, target := range tc.symlinkfiles { + driverPath := path.Join(sysfs, target) + symlinkPath := path.Join(sysfs, source) + + if err := os.MkdirAll(driverPath, 0750); err != nil { + return "", "", errors.Wrap(err, "Failed to create fake driver file.") + } + + if err := os.Symlink(driverPath, symlinkPath); err != nil { + return "", "", errors.Wrap(err, "Failed to create fake driver symlink file.") + } + } + return sysfs, devfs, nil } @@ -186,18 +223,7 @@ func TestAllocate(t *testing.T) { } func TestScan(t *testing.T) { - tcases := []struct { - name string - // test-case environment - sysfsdirs []string - sysfsfiles map[string][]byte - devfsdirs []string - // how plugin should interpret it - options cliOptions - // what the result should be - expectedDevs int - expectedMonitors int - }{ + tcases := []TestCaseDetails{ { name: "no sysfs mounted", }, @@ 
-223,7 +249,71 @@ func TestScan(t *testing.T) { "by-path/pci-0000:00:00.0-card", "by-path/pci-0000:00:00.0-render", }, - expectedDevs: 1, + expectedI915Devs: 1, + }, + { + name: "one device with xe driver", + sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"}, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + }, + symlinkfiles: map[string]string{ + "card0/device/driver": "drivers/xe", + }, + devfsdirs: []string{ + "card0", + "by-path/pci-0000:00:00.0-card", + "by-path/pci-0000:00:00.0-render", + }, + expectedXeDevs: 1, + }, + { + name: "two devices with xe driver and monitoring", + sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64", "card1/device/drm/card1"}, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + "card1/device/vendor": []byte("0x8086"), + }, + symlinkfiles: map[string]string{ + "card0/device/driver": "drivers/xe", + "card1/device/driver": "drivers/xe", + }, + devfsdirs: []string{ + "card0", + "by-path/pci-0000:00:00.0-card", + "by-path/pci-0000:00:00.0-render", + "card1", + "by-path/pci-0000:00:01.0-card", + "by-path/pci-0000:00:01.0-render", + }, + options: cliOptions{enableMonitoring: true}, + expectedXeDevs: 2, + expectedXeMonitors: 1, + }, + { + name: "two devices with xe and i915 drivers", + sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64", "card1/device/drm/card1"}, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + "card1/device/vendor": []byte("0x8086"), + }, + symlinkfiles: map[string]string{ + "card0/device/driver": "drivers/xe", + "card1/device/driver": "drivers/i915", + }, + devfsdirs: []string{ + "card0", + "by-path/pci-0000:00:00.0-card", + "by-path/pci-0000:00:00.0-render", + "card1", + "by-path/pci-0000:00:01.0-card", + "by-path/pci-0000:00:01.0-render", + }, + options: cliOptions{enableMonitoring: true}, + expectedXeDevs: 1, + expectedXeMonitors: 1, + expectedI915Devs: 1, + expectedI915Monitors: 1, }, { name: "sriov-1-pf-no-vfs + monitoring", @@ -232,10 +322,10 @@ func TestScan(t *testing.T) { "card0/device/vendor": []byte("0x8086"), "card0/device/sriov_numvfs": []byte("0"), }, - devfsdirs: []string{"card0"}, - options: cliOptions{enableMonitoring: true}, - expectedDevs: 1, - expectedMonitors: 1, + devfsdirs: []string{"card0"}, + options: cliOptions{enableMonitoring: true}, + expectedI915Devs: 1, + expectedI915Monitors: 1, }, { name: "two sysfs records but one dev node", @@ -247,8 +337,8 @@ func TestScan(t *testing.T) { "card0/device/vendor": []byte("0x8086"), "card1/device/vendor": []byte("0x8086"), }, - devfsdirs: []string{"card0"}, - expectedDevs: 1, + devfsdirs: []string{"card0"}, + expectedI915Devs: 1, }, { name: "sriov-1-pf-and-2-vfs", @@ -263,8 +353,8 @@ func TestScan(t *testing.T) { "card1/device/vendor": []byte("0x8086"), "card2/device/vendor": []byte("0x8086"), }, - devfsdirs: []string{"card0", "card1", "card2"}, - expectedDevs: 2, + devfsdirs: []string{"card0", "card1", "card2"}, + expectedI915Devs: 2, }, { name: "two devices with 13 shares + monitoring", @@ -276,10 +366,10 @@ func TestScan(t *testing.T) { "card0/device/vendor": []byte("0x8086"), "card1/device/vendor": []byte("0x8086"), }, - devfsdirs: []string{"card0", "card1"}, - options: cliOptions{sharedDevNum: 13, enableMonitoring: true}, - expectedDevs: 26, - expectedMonitors: 1, + devfsdirs: []string{"card0", "card1"}, + options: cliOptions{sharedDevNum: 13, enableMonitoring: true}, + expectedI915Devs: 26, + expectedI915Monitors: 1, }, 
{ name: "wrong vendor", @@ -317,7 +407,7 @@ func TestScan(t *testing.T) { // dirs/files need to be removed for the next test defer os.RemoveAll(root) - sysfs, devfs, err := createTestFiles(root, tc.devfsdirs, tc.sysfsdirs, tc.sysfsfiles) + sysfs, devfs, err := createTestFiles(root, tc) if err != nil { t.Errorf("unexpected error: %+v", err) } @@ -328,20 +418,157 @@ func TestScan(t *testing.T) { scanDone: plugin.scanDone, } - plugin.resMan = &mockResourceManager{} - err = plugin.Scan(notifier) // Scans in GPU plugin never fail if err != nil { t.Errorf("unexpected error: %+v", err) } - if tc.expectedDevs != notifier.devCount { - t.Errorf("Expected %d, discovered %d devices", - tc.expectedDevs, notifier.devCount) + if tc.expectedI915Devs != notifier.i915Count { + t.Errorf("Expected %d, discovered %d devices (i915)", + tc.expectedI915Devs, notifier.i915Count) + } + if tc.expectedI915Monitors != notifier.i915monitorCount { + t.Errorf("Expected %d, discovered %d monitors (i915)", + tc.expectedI915Monitors, notifier.i915monitorCount) + } + if tc.expectedXeDevs != notifier.xeCount { + t.Errorf("Expected %d, discovered %d devices (XE)", + tc.expectedXeDevs, notifier.xeCount) + } + if tc.expectedXeMonitors != notifier.xeMonitorCount { + t.Errorf("Expected %d, discovered %d monitors (XE)", + tc.expectedXeMonitors, notifier.xeMonitorCount) + } + }) + } +} + +func TestScanFails(t *testing.T) { + tc := TestCaseDetails{ + name: "xe and i915 devices with rm will fail", + sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64", "card1/device/drm/card1"}, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + "card1/device/vendor": []byte("0x8086"), + }, + symlinkfiles: map[string]string{ + "card0/device/driver": "drivers/xe", + "card1/device/driver": "drivers/i915", + }, + devfsdirs: []string{ + "card0", + "card1", + }, + } + + t.Run(tc.name, func(t *testing.T) { + root, err := os.MkdirTemp("", "test_new_device_plugin") + if err != nil { + t.Fatalf("can't create temporary directory: %+v", err) + } + // dirs/files need to be removed for the next test + defer os.RemoveAll(root) + + sysfs, devfs, err := createTestFiles(root, tc) + if err != nil { + t.Errorf("unexpected error: %+v", err) + } + + plugin := newDevicePlugin(sysfs, devfs, tc.options) + + plugin.resMan = &mockResourceManager{} + + notifier := &mockNotifier{ + scanDone: plugin.scanDone, + } + + err = plugin.Scan(notifier) + if err == nil { + t.Error("unexpected nil error") + } + }) +} + +func TestScanWithRmAndTiles(t *testing.T) { + tcs := []TestCaseDetails{ + { + name: "two tile xe devices with rm enabled - homogeneous", + sysfsdirs: []string{ + "card0/device/drm/card0", + "card1/device/drm/card1", + "card0/device/tile0/gt0", + "card0/device/tile1/gt1", + "card1/device/tile0/gt0", + "card1/device/tile1/gt1", + }, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + "card1/device/vendor": []byte("0x8086"), + }, + symlinkfiles: map[string]string{ + "card0/device/driver": "drivers/xe", + "card1/device/driver": "drivers/xe", + }, + devfsdirs: []string{ + "card0", + "card1", + }, + }, + { + name: "2 & 1 tile xe devices with rm enabled - heterogeneous", + sysfsdirs: []string{ + "card0/device/drm/card0", + "card1/device/drm/card1", + "card0/device/tile0/gt0", + "card0/device/tile1/gt1", + "card1/device/tile0/gt0", + }, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + "card1/device/vendor": []byte("0x8086"), + }, + symlinkfiles: map[string]string{ + 
"card0/device/driver": "drivers/xe", + "card1/device/driver": "drivers/xe", + }, + devfsdirs: []string{ + "card0", + "card1", + }, + }, + } + + expectedTileCounts := []uint64{2, 0} + + for i, tc := range tcs { + t.Run(tc.name, func(t *testing.T) { + root, err := os.MkdirTemp("", "test_new_device_plugin") + if err != nil { + t.Fatalf("can't create temporary directory: %+v", err) + } + // dirs/files need to be removed for the next test + defer os.RemoveAll(root) + + sysfs, devfs, err := createTestFiles(root, tc) + if err != nil { + t.Errorf("unexpected error: %+v", err) + } + + plugin := newDevicePlugin(sysfs, devfs, tc.options) + + rm := &mockResourceManager{} + plugin.resMan = rm + + notifier := &mockNotifier{ + scanDone: plugin.scanDone, + } + + err = plugin.Scan(notifier) + if err != nil { + t.Error("unexpected error") } - if tc.expectedMonitors != notifier.monitorCount { - t.Errorf("Expected %d, discovered %d monitors", - tc.expectedMonitors, notifier.monitorCount) + if rm.tileCount != expectedTileCounts[i] { + t.Error("unexpected tilecount for RM") } }) } diff --git a/cmd/gpu_plugin/monitoring.md b/cmd/gpu_plugin/monitoring.md new file mode 100644 index 000000000..3b3050aeb --- /dev/null +++ b/cmd/gpu_plugin/monitoring.md @@ -0,0 +1,32 @@ +# Monitoring GPUs + +## i915_monitoring resource + +GPU plugin can be configured to register a monitoring resource for the nodes that have Intel GPUs on them. `gpu.intel.com/i915_monitoring` (or `gpu.intel.com/xe_monitoring`) is a singular resource on the nodes. A container requesting it, will get access to _all_ the Intel GPUs (`i915` or `xe` KMD device files) on the node. The idea behind this resource is to allow the container to _monitor_ the GPUs. A container requesting the `i915_monitoring` resource would typically export data to some metrics consumer. An example for such a consumer is [Prometheus](https://prometheus.io/). + +
+<figure>
+  <img src="monitoring.png"/>
+  <figcaption>Monitoring Pod listening to all GPUs while one Pod is using a GPU.</figcaption>
+</figure>
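+
+As an illustrative sketch only (not a manifest or program shipped in this repository), such a request can be expressed with the same Kubernetes Go types that this repository's e2e tests use to build their pod specs; the container name and image below are placeholders, only the resource name comes from the GPU plugin:
+
+```go
+package main
+
+import (
+	"fmt"
+
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+)
+
+// monitoringContainer sketches a container that requests the node-wide
+// monitoring resource; a container with this limit sees every Intel GPU
+// (i915 KMD) device file on the node.
+func monitoringContainer() v1.Container {
+	return v1.Container{
+		Name:  "gpu-metrics-exporter",            // placeholder name
+		Image: "example.com/metrics-exporter:v1", // placeholder image
+		Resources: v1.ResourceRequirements{
+			Limits: v1.ResourceList{
+				"gpu.intel.com/i915_monitoring": resource.MustParse("1"),
+			},
+		},
+	}
+}
+
+func main() {
+	fmt.Printf("%+v\n", monitoringContainer())
+}
+```
+
+In a plain YAML manifest the equivalent request goes under the monitoring container's `resources.limits`.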
+ +For the monitoring applications, there are two possibilities: [Intel XPU Manager](https://github.com/intel/xpumanager/) and [collectd](https://github.com/collectd/collectd/tree/collectd-6.0). Intel XPU Manager is readily available as a container and with a deployment yaml. collectd has Intel GPU support in its 6.0 branch, but there are no public containers available for it. + +To deploy XPU Manager to a cluster, one has to run the following kubectl: +``` +$ kubectl apply -k https://github.com/intel/xpumanager/deployment/kubernetes/daemonset/base +``` + +This will deploy an XPU Manager daemonset to run on all the nodes having the `i915_monitoring` resource. + +## Prometheus integration with XPU Manager + +For deploying Prometheus to a cluster, see [this page](https://prometheus-operator.dev/docs/user-guides/getting-started/). One can also use Prometheus' [helm chart](https://github.com/prometheus-community/helm-charts). + +Prometheus requires additional Kubernetes configuration so it can fetch GPU metrics. The following steps will add a Kubernetes Service and a ServiceMonitor components. The components instruct Prometheus how and where from to retrieve the metrics. + +``` +$ kubectl apply -f https://raw.githubusercontent.com/intel/xpumanager/master/deployment/kubernetes/monitoring/service-intel-xpum.yaml +$ kubectl apply -f https://raw.githubusercontent.com/intel/xpumanager/master/deployment/kubernetes/monitoring/servicemonitor-intel-xpum.yaml +``` + +With those components in place, one can query Intel GPU metrics from Prometheus with `xpum_` prefix. diff --git a/cmd/gpu_plugin/monitoring.png b/cmd/gpu_plugin/monitoring.png new file mode 100644 index 000000000..c56fc5057 Binary files /dev/null and b/cmd/gpu_plugin/monitoring.png differ diff --git a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go b/cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go index 491d27fe1..4a5046da0 100644 --- a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go +++ b/cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go @@ -25,7 +25,6 @@ import ( "net" "net/http" "os" - "slices" "sort" "strconv" "strings" @@ -105,7 +104,7 @@ type ResourceManager interface { CreateFractionalResourceResponse(*pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) GetPreferredFractionalAllocation(*pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) SetDevInfos(DeviceInfoMap) - SetTileCountPerCard(counts []uint64) + SetTileCountPerCard(count uint64) } type containerAssignments struct { @@ -118,20 +117,20 @@ type podAssignmentDetails struct { } type resourceManager struct { - clientset kubernetes.Interface - deviceInfos DeviceInfoMap - prGetClientFunc getClientFunc - assignments map[string]podAssignmentDetails // pod name -> assignment details - nodeName string - hostIP string - skipID string - fullResourceName string - retryTimeout time.Duration - cleanupInterval time.Duration - mutex sync.RWMutex // for devTree updates during scan - cleanupMutex sync.RWMutex // for assignment details during cleanup - useKubelet bool - tileCountPerCard uint64 + clientset kubernetes.Interface + deviceInfos DeviceInfoMap + prGetClientFunc getClientFunc + assignments map[string]podAssignmentDetails // pod name -> assignment details + nodeName string + hostIP string + skipID string + fullResourceNames []string + retryTimeout time.Duration + cleanupInterval time.Duration + mutex sync.RWMutex // for devTree updates during scan + cleanupMutex sync.RWMutex // for assignment details during cleanup + useKubelet bool + 
tileCountPerCard uint64 } // NewDeviceInfo creates a new DeviceInfo. @@ -152,7 +151,7 @@ func NewDeviceInfoMap() DeviceInfoMap { } // NewResourceManager creates a new resource manager. -func NewResourceManager(skipID, fullResourceName string) (ResourceManager, error) { +func NewResourceManager(skipID string, fullResourceNames []string) (ResourceManager, error) { clientset, err := getClientset() if err != nil { @@ -160,16 +159,16 @@ func NewResourceManager(skipID, fullResourceName string) (ResourceManager, error } rm := resourceManager{ - nodeName: os.Getenv("NODE_NAME"), - hostIP: os.Getenv("HOST_IP"), - clientset: clientset, - skipID: skipID, - fullResourceName: fullResourceName, - prGetClientFunc: podresources.GetV1Client, - assignments: make(map[string]podAssignmentDetails), - retryTimeout: 1 * time.Second, - cleanupInterval: 20 * time.Minute, - useKubelet: true, + nodeName: os.Getenv("NODE_NAME"), + hostIP: os.Getenv("HOST_IP"), + clientset: clientset, + skipID: skipID, + fullResourceNames: fullResourceNames, + prGetClientFunc: podresources.GetV1Client, + assignments: make(map[string]podAssignmentDetails), + retryTimeout: 1 * time.Second, + cleanupInterval: 20 * time.Minute, + useKubelet: true, } klog.Info("GPU device plugin resource manager enabled") @@ -684,7 +683,7 @@ func (rm *resourceManager) getNodePendingGPUPods() (map[string]*v1.Pod, error) { pendingPods := rm.listPodsOnNodeWithStates([]string{string(v1.PodPending)}) for podName, pod := range pendingPods { - if numGPUUsingContainers(pod, rm.fullResourceName) == 0 { + if numGPUUsingContainers(pod, rm.fullResourceNames) == 0 { delete(pendingPods, podName) } } @@ -719,7 +718,7 @@ func (rm *resourceManager) findAllocationPodCandidates(pendingPods map[string]*v for _, cont := range podRes.Containers { for _, dev := range cont.Devices { - if dev.ResourceName == rm.fullResourceName { + if sslices.Contains(rm.fullResourceNames, dev.ResourceName) { numContainersAllocated++ break } @@ -729,7 +728,7 @@ func (rm *resourceManager) findAllocationPodCandidates(pendingPods map[string]*v key := getPodResourceKey(podRes) if pod, pending := pendingPods[key]; pending { - allocationTargetNum := numGPUUsingContainers(pod, rm.fullResourceName) + allocationTargetNum := numGPUUsingContainers(pod, rm.fullResourceNames) if numContainersAllocated < allocationTargetNum { candidate := podCandidate{ pod: pod, @@ -751,23 +750,10 @@ func (rm *resourceManager) SetDevInfos(deviceInfos DeviceInfoMap) { rm.deviceInfos = deviceInfos } -func (rm *resourceManager) SetTileCountPerCard(counts []uint64) { - if len(counts) == 0 { - return - } - - minCount := slices.Min(counts) - maxCount := slices.Max(counts) - - if minCount != maxCount { - klog.Warningf("Node's GPUs are heterogenous (min: %d, max: %d tiles)", minCount, maxCount) - - return - } - +func (rm *resourceManager) SetTileCountPerCard(count uint64) { rm.mutex.Lock() defer rm.mutex.Unlock() - rm.tileCountPerCard = maxCount + rm.tileCountPerCard = count } func (rm *resourceManager) createAllocateResponse(deviceIds []string, tileAffinityMask string) (*pluginapi.AllocateResponse, error) { @@ -818,13 +804,13 @@ func (rm *resourceManager) createAllocateResponse(deviceIds []string, tileAffini return &allocateResponse, nil } -func numGPUUsingContainers(pod *v1.Pod, fullResourceName string) int { +func numGPUUsingContainers(pod *v1.Pod, fullResourceNames []string) int { num := 0 for _, container := range pod.Spec.Containers { for reqName, quantity := range container.Resources.Requests { resourceName := reqName.String() - 
if resourceName == fullResourceName { + if sslices.Contains(fullResourceNames, resourceName) { value, _ := quantity.AsInt64() if value > 0 { num++ diff --git a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go b/cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go index ae8038da3..09a5c68b2 100644 --- a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go +++ b/cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go @@ -107,11 +107,11 @@ func newMockResourceManager(pods []v1.Pod) ResourceManager { prGetClientFunc: func(string, time.Duration, int) (podresourcesv1.PodResourcesListerClient, *grpc.ClientConn, error) { return &mockPodResources{pods: pods}, client, nil }, - skipID: "all", - fullResourceName: "gpu.intel.com/i915", - assignments: make(map[string]podAssignmentDetails), - retryTimeout: 1 * time.Millisecond, - useKubelet: false, + skipID: "all", + fullResourceNames: []string{"gpu.intel.com/i915", "gpu.intel.com/xe"}, + assignments: make(map[string]podAssignmentDetails), + retryTimeout: 1 * time.Millisecond, + useKubelet: false, } deviceInfoMap := NewDeviceInfoMap() @@ -150,7 +150,7 @@ type testCase struct { func TestNewResourceManager(t *testing.T) { // normal clientset is unavailable inside the unit tests - _, err := NewResourceManager("foo", "bar") + _, err := NewResourceManager("foo", []string{"bar"}) if err == nil { t.Errorf("unexpected success") @@ -419,7 +419,7 @@ func TestCreateFractionalResourceResponse(t *testing.T) { for _, tCase := range testCases { rm := newMockResourceManager(tCase.pods) - rm.SetTileCountPerCard([]uint64{1}) + rm.SetTileCountPerCard(uint64(1)) _, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ ContainerRequests: tCase.prefContainerRequests, @@ -501,7 +501,7 @@ func TestCreateFractionalResourceResponseWithOneCardTwoTiles(t *testing.T) { } rm := newMockResourceManager(tCase.pods) - rm.SetTileCountPerCard([]uint64{2}) + rm.SetTileCountPerCard(uint64(2)) _, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ ContainerRequests: tCase.prefContainerRequests, @@ -574,7 +574,7 @@ func TestCreateFractionalResourceResponseWithTwoCardsOneTile(t *testing.T) { } rm := newMockResourceManager(tCase.pods) - rm.SetTileCountPerCard([]uint64{5}) + rm.SetTileCountPerCard(uint64(5)) _, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ ContainerRequests: tCase.prefContainerRequests, @@ -652,7 +652,7 @@ func TestCreateFractionalResourceResponseWithThreeCardsTwoTiles(t *testing.T) { } rm := newMockResourceManager(tCase.pods) - rm.SetTileCountPerCard([]uint64{5}) + rm.SetTileCountPerCard(uint64(5)) _, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ ContainerRequests: tCase.prefContainerRequests, @@ -747,7 +747,7 @@ func TestCreateFractionalResourceResponseWithMultipleContainersTileEach(t *testi } rm := newMockResourceManager(tCase.pods) - rm.SetTileCountPerCard([]uint64{2}) + rm.SetTileCountPerCard(uint64(2)) _, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ ContainerRequests: properPrefContainerRequests, diff --git a/cmd/internal/labeler/labeler.go b/cmd/internal/labeler/labeler.go index 869bd8da4..0d2fdc19f 100644 --- a/cmd/internal/labeler/labeler.go +++ b/cmd/internal/labeler/labeler.go @@ -184,10 +184,16 @@ func GetMemoryAmount(sysfsDrmDir, gpuName string, numTiles uint64) uint64 { } // GetTileCount reads the tile count. 
-func GetTileCount(sysfsDrmDir, gpuName string) (numTiles uint64) { - filePath := filepath.Join(sysfsDrmDir, gpuName, "gt/gt*") +func GetTileCount(cardPath string) (numTiles uint64) { + files := []string{} - files, _ := filepath.Glob(filePath) + paths, _ := filepath.Glob(filepath.Join(cardPath, "gt/gt*")) // i915 driver + files = append(files, paths...) + + paths, _ = filepath.Glob(filepath.Join(cardPath, "device/tile?")) // Xe driver + files = append(files, paths...) + + klog.V(4).Info("tile files found:", files) if len(files) == 0 { return 1 @@ -232,6 +238,19 @@ func (lm labelMap) addNumericLabel(labelName string, valueToAdd int64) { lm[labelName] = strconv.FormatInt(value, 10) } +// Stores a long string to labels so that it's possibly split into multiple +// keys: foobar="", foobar2="", foobar3="The end." +func (lm labelMap) addSplittableString(labelBase, fullValue string) { + splitList := pluginutils.SplitAtLastAlphaNum(fullValue, labelMaxLength, labelControlChar) + + lm[labelBase] = splitList[0] + + for i := 1; i < len(splitList); i++ { + nextLabel := labelBase + strconv.FormatInt(int64(i+1), 10) + lm[nextLabel] = splitList[i] + } +} + // this returns pci groups label value, groups separated by "_", gpus separated by ".". // Example for two groups with 4 gpus: "0.1.2.3_4.5.6.7". func (l *labeler) createPCIGroupLabel(gpuNumList []string) string { @@ -295,7 +314,7 @@ func (l *labeler) createLabels() error { return errors.Wrap(err, "gpu name parsing error") } - numTiles := GetTileCount(l.sysfsDRMDir, gpuName) + numTiles := GetTileCount(filepath.Join(l.sysfsDRMDir, gpuName)) tileCount += int(numTiles) memoryAmount := GetMemoryAmount(l.sysfsDRMDir, gpuName, numTiles) @@ -327,24 +346,13 @@ func (l *labeler) createLabels() error { strings.Join(gpuNameList, "."), labelMaxLength, labelControlChar)[0] // add gpu num list label(s) (example: "0.1.2", which is short form of "card0.card1.card2") - allGPUs := strings.Join(gpuNumList, ".") - gpuNumLists := pluginutils.SplitAtLastAlphaNum(allGPUs, labelMaxLength, labelControlChar) - - l.labels[labelNamespace+gpuNumListLabelName] = gpuNumLists[0] - for i := 1; i < len(gpuNumLists); i++ { - l.labels[labelNamespace+gpuNumListLabelName+strconv.FormatInt(int64(i+1), 10)] = gpuNumLists[i] - } + l.labels.addSplittableString(labelNamespace+gpuNumListLabelName, strings.Join(gpuNumList, ".")) if len(numaMapping) > 0 { // add numa node mapping to labels: gpu.intel.com/numa-gpu-map="0-0.1.2.3_1-4.5.6.7" numaMappingLabel := createNumaNodeMappingLabel(numaMapping) - numaMappingLabelList := pluginutils.SplitAtLastAlphaNum(numaMappingLabel, labelMaxLength, labelControlChar) - - l.labels[labelNamespace+numaMappingName] = numaMappingLabelList[0] - for i := 1; i < len(numaMappingLabelList); i++ { - l.labels[labelNamespace+numaMappingName+strconv.FormatInt(int64(i+1), 10)] = numaMappingLabelList[i] - } + l.labels.addSplittableString(labelNamespace+numaMappingName, numaMappingLabel) } // all GPUs get default number of millicores (1000) @@ -353,12 +361,7 @@ func (l *labeler) createLabels() error { // aa pci-group label(s), (two group example: "1.2.3.4_5.6.7.8") allPCIGroups := l.createPCIGroupLabel(gpuNumList) if allPCIGroups != "" { - pciGroups := pluginutils.SplitAtLastAlphaNum(allPCIGroups, labelMaxLength, labelControlChar) - - l.labels[labelNamespace+pciGroupLabelName] = pciGroups[0] - for i := 1; i < len(gpuNumLists); i++ { - l.labels[labelNamespace+pciGroupLabelName+strconv.FormatInt(int64(i+1), 10)] = pciGroups[i] - } + 
l.labels.addSplittableString(labelNamespace+pciGroupLabelName, allPCIGroups) } } diff --git a/cmd/internal/labeler/labeler_test.go b/cmd/internal/labeler/labeler_test.go index 31186e224..e3dd50bef 100644 --- a/cmd/internal/labeler/labeler_test.go +++ b/cmd/internal/labeler/labeler_test.go @@ -137,60 +137,6 @@ func getTestCases() []testcase { "gpu.intel.com/tiles": "1", }, }, - { - sysfsdirs: []string{ - "card0/device/drm/card0", - }, - sysfsfiles: map[string][]byte{ - "card0/device/vendor": []byte("0x8086"), - }, - name: "when gen:capability info is missing", - memoryOverride: 16000000000, - expectedRetval: nil, - expectedLabels: labelMap{ - "gpu.intel.com/millicores": "1000", - "gpu.intel.com/memory.max": "16000000000", - "gpu.intel.com/cards": "card0", - "gpu.intel.com/gpu-numbers": "0", - "gpu.intel.com/tiles": "1", - }, - }, - { - sysfsdirs: []string{ - "card0/device/drm/card0", - }, - sysfsfiles: map[string][]byte{ - "card0/device/vendor": []byte("0x8086"), - }, - name: "gen version missing, but media & graphics versions present", - memoryOverride: 16000000000, - expectedRetval: nil, - expectedLabels: labelMap{ - "gpu.intel.com/millicores": "1000", - "gpu.intel.com/memory.max": "16000000000", - "gpu.intel.com/cards": "card0", - "gpu.intel.com/gpu-numbers": "0", - "gpu.intel.com/tiles": "1", - }, - }, - { - sysfsdirs: []string{ - "card0/device/drm/card0", - }, - sysfsfiles: map[string][]byte{ - "card0/device/vendor": []byte("0x8086"), - }, - name: "only media version present", - memoryOverride: 16000000000, - expectedRetval: nil, - expectedLabels: labelMap{ - "gpu.intel.com/millicores": "1000", - "gpu.intel.com/memory.max": "16000000000", - "gpu.intel.com/cards": "card0", - "gpu.intel.com/gpu-numbers": "0", - "gpu.intel.com/tiles": "1", - }, - }, { sysfsdirs: []string{ "card0/device/drm/card0", @@ -562,6 +508,74 @@ func getTestCases() []testcase { "gpu.intel.com/tiles": "1", }, }, + { + sysfsdirs: []string{ + "card0/device/drm/card0", + "card0/device/tile0/gt0", + "card0/device/tile1/gt0", + "card1/device/drm/card1", + "card1/device/tile0/gt0", + "card1/device/tile1/gt0", + "card2/device/drm/card2", + "card2/device/tile0/gt0", + "card2/device/tile1/gt0", + }, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + "card0/lmem_total_bytes": []byte("8000"), + "card0/device/numa_node": []byte("1"), + "card1/device/vendor": []byte("0x8086"), + "card1/lmem_total_bytes": []byte("8000"), + "card1/device/numa_node": []byte("1"), + "card2/device/vendor": []byte("0x8086"), + "card2/lmem_total_bytes": []byte("8000"), + "card2/device/numa_node": []byte("1"), + }, + name: "successful labeling with three cards and with xe driver", + expectedRetval: nil, + expectedLabels: labelMap{ + "gpu.intel.com/millicores": "3000", + "gpu.intel.com/memory.max": "48000", + "gpu.intel.com/gpu-numbers": "0.1.2", + "gpu.intel.com/cards": "card0.card1.card2", + "gpu.intel.com/tiles": "6", + "gpu.intel.com/numa-gpu-map": "1-0.1.2", + }, + }, + { + sysfsdirs: []string{ + "card0/device/drm/card0", + "card0/device/tile0/gt0", + "card0/device/tile0/gt1", + "card0/device/tile1/gt2", + "card0/device/tile1/gt3", + "card0/device/tile1/gt4", + "card0/device/tile1/gt5", + "card1/device/drm/card1", + "card1/device/tile0/gt0", + "card1/device/tile0/gt1", + "card1/device/tile1/gt2", + "card1/device/tile1/gt4", + }, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + "card0/lmem_total_bytes": []byte("8000"), + "card0/device/numa_node": []byte("1"), + "card1/device/vendor": 
[]byte("0x8086"), + "card1/lmem_total_bytes": []byte("8000"), + "card1/device/numa_node": []byte("1"), + }, + name: "successful labeling with two cards, two tiles per card and multiple gts per tile", + expectedRetval: nil, + expectedLabels: labelMap{ + "gpu.intel.com/millicores": "2000", + "gpu.intel.com/memory.max": "32000", + "gpu.intel.com/gpu-numbers": "0.1", + "gpu.intel.com/cards": "card0.card1", + "gpu.intel.com/tiles": "4", + "gpu.intel.com/numa-gpu-map": "1-0.1", + }, + }, } } diff --git a/cmd/internal/pluginutils/devicedriver.go b/cmd/internal/pluginutils/devicedriver.go new file mode 100644 index 000000000..0c7cda3fa --- /dev/null +++ b/cmd/internal/pluginutils/devicedriver.go @@ -0,0 +1,30 @@ +// Copyright 2024 Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pluginutils + +import ( + "os" + "path/filepath" +) + +// Read driver for a device. +func ReadDeviceDriver(path string) (string, error) { + linkpath, err := os.Readlink(filepath.Join(path, "device/driver")) + if err != nil { + return "", err + } + + return filepath.Base(linkpath), nil +} diff --git a/cmd/internal/pluginutils/devicedriver_test.go b/cmd/internal/pluginutils/devicedriver_test.go new file mode 100644 index 000000000..b72a1bdbc --- /dev/null +++ b/cmd/internal/pluginutils/devicedriver_test.go @@ -0,0 +1,80 @@ +// Copyright 2024 Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package pluginutils + +import ( + "os" + "path/filepath" + "testing" +) + +func TestDeviceDriverSymlink(t *testing.T) { + root, err := os.MkdirTemp("", "test_devicedriver") + if err != nil { + t.Fatalf("can't create temporary directory: %+v", err) + } + + defer os.RemoveAll(root) + + err = os.Mkdir(filepath.Join(root, "i915"), 0777) + if err != nil { + t.Errorf("Failed to create required directory structure: %+v", err) + } + + err = os.Mkdir(filepath.Join(root, "device"), 0777) + if err != nil { + t.Errorf("Failed to create required directory structure: %+v", err) + } + + err = os.Symlink(filepath.Join(root, "i915"), filepath.Join(root, "device", "driver")) + if err != nil { + t.Errorf("Failed to create required directory structure: %+v", err) + } + + driver, err := ReadDeviceDriver(root) + + if err != nil { + t.Errorf("Got error when there shouldn't be any: %+v", err) + } + + if driver != "i915" { + t.Errorf("Got invalid driver: %s", driver) + } +} + +func TestDeviceDriverSymlinkError(t *testing.T) { + root, err := os.MkdirTemp("", "test_devicedriver") + if err != nil { + t.Fatalf("can't create temporary directory: %+v", err) + } + + defer os.RemoveAll(root) + + err = os.Mkdir(filepath.Join(root, "i915"), 0777) + if err != nil { + t.Errorf("Failed to create required directory structure: %+v", err) + } + + err = os.MkdirAll(filepath.Join(root, "device", "driver"), 0777) + if err != nil { + t.Errorf("Failed to create required directory structure: %+v", err) + } + + _, err = ReadDeviceDriver(root) + + if err == nil { + t.Errorf("Got no error when there should be one") + } +} diff --git a/deployments/nfd/overlays/node-feature-rules/node-feature-rules.yaml b/deployments/nfd/overlays/node-feature-rules/node-feature-rules.yaml index 7e32d4c2e..1ccc85ab5 100644 --- a/deployments/nfd/overlays/node-feature-rules/node-feature-rules.yaml +++ b/deployments/nfd/overlays/node-feature-rules/node-feature-rules.yaml @@ -57,9 +57,23 @@ spec: matchExpressions: vendor: {op: In, value: ["8086"]} class: {op: In, value: ["0300", "0380"]} - - feature: kernel.loadedmodule - matchExpressions: - i915: {op: Exists} + matchAny: + - matchFeatures: + - feature: kernel.loadedmodule + matchExpressions: + i915: {op: Exists} + - matchFeatures: + - feature: kernel.enabledmodule + matchExpressions: + i915: {op: Exists} + - matchFeatures: + - feature: kernel.loadedmodule + matchExpressions: + xe: {op: Exists} + - matchFeatures: + - feature: kernel.enabledmodule + matchExpressions: + xe: {op: Exists} - name: "intel.iaa" labels: diff --git a/deployments/xpumanager_sidecar/kustom/kustom_xpumanager.yaml b/deployments/xpumanager_sidecar/kustom/kustom_xpumanager.yaml index 69acf5898..3ce726271 100644 --- a/deployments/xpumanager_sidecar/kustom/kustom_xpumanager.yaml +++ b/deployments/xpumanager_sidecar/kustom/kustom_xpumanager.yaml @@ -27,8 +27,3 @@ spec: - ALL readOnlyRootFilesystem: true runAsUser: 0 - - name: xpumd - resources: - limits: - $patch: replace - gpu.intel.com/i915_monitoring: 1 diff --git a/deployments/xpumanager_sidecar/kustomization.yaml b/deployments/xpumanager_sidecar/kustomization.yaml index 728397536..a72b9631c 100644 --- a/deployments/xpumanager_sidecar/kustomization.yaml +++ b/deployments/xpumanager_sidecar/kustomization.yaml @@ -1,5 +1,5 @@ resources: -- https://raw.githubusercontent.com/intel/xpumanager/V1.2.18/deployment/kubernetes/daemonset-intel-xpum.yaml +- https://github.com/intel/xpumanager/deployment/kubernetes/daemonset/base/?ref=V1.2.29 namespace: monitoring apiVersion: 
kustomize.config.k8s.io/v1beta1 kind: Kustomization diff --git a/test/e2e/gpu/gpu.go b/test/e2e/gpu/gpu.go index 52747673a..783d556cb 100644 --- a/test/e2e/gpu/gpu.go +++ b/test/e2e/gpu/gpu.go @@ -144,4 +144,57 @@ func describe() { ginkgo.It("does nothing", func() {}) }) }) + + ginkgo.Context("When GPU resources are available [Resource:xe]", func() { + ginkgo.BeforeEach(func(ctx context.Context) { + ginkgo.By("checking if the resource is allocatable") + if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/xe", 30*time.Second); err != nil { + framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err) + } + }) + ginkgo.It("checks availability of GPU resources [App:busybox]", func(ctx context.Context) { + ginkgo.By("submitting a pod requesting GPU resources") + podSpec := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "gpuplugin-tester"}, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Args: []string{"-c", "ls /dev/dri"}, + Name: containerName, + Image: imageutils.GetE2EImage(imageutils.BusyBox), + Command: []string{"/bin/sh"}, + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{"gpu.intel.com/xe": resource.MustParse("1")}, + Limits: v1.ResourceList{"gpu.intel.com/xe": resource.MustParse("1")}, + }, + }, + }, + RestartPolicy: v1.RestartPolicyNever, + }, + } + pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Create(ctx, podSpec, metav1.CreateOptions{}) + framework.ExpectNoError(err, "pod Create API error") + + ginkgo.By("waiting the pod to finish successfully") + e2epod.NewPodClient(f).WaitForSuccess(ctx, pod.ObjectMeta.Name, 60*time.Second) + + ginkgo.By("checking log output") + log, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, pod.Name, containerName) + + if err != nil { + framework.Failf("unable to get log from pod: %v", err) + } + + if !strings.Contains(log, "card") || !strings.Contains(log, "renderD") { + framework.Logf("log output: %s", log) + framework.Failf("device mounts not found from log") + } + + framework.Logf("found card and renderD from the log") + }) + + ginkgo.When("there is no app to run [App:noapp]", func() { + ginkgo.It("does nothing", func() {}) + }) + }) }