From af401eef705ed3220ff34b92e8738d037945e963 Mon Sep 17 00:00:00 2001 From: Gabriel Dos Santos Date: Tue, 26 Nov 2024 03:52:36 +0000 Subject: [PATCH] Parsing GPU tags on kubeapiserver collector --- .../collectors/internal/kubelet/kubelet.go | 36 +++----------- .../util/kubernetes_resource_parsers/pod.go | 16 +++++++ go.mod | 1 + pkg/util/gpu/common.go | 48 +++++++++++++++++++ pkg/util/gpu/common_test.go | 41 ++++++++++++++++ 5 files changed, 112 insertions(+), 30 deletions(-) create mode 100644 pkg/util/gpu/common.go create mode 100644 pkg/util/gpu/common_test.go diff --git a/comp/core/workloadmeta/collectors/internal/kubelet/kubelet.go b/comp/core/workloadmeta/collectors/internal/kubelet/kubelet.go index c9e4960d805729..eac4131ebcedef 100644 --- a/comp/core/workloadmeta/collectors/internal/kubelet/kubelet.go +++ b/comp/core/workloadmeta/collectors/internal/kubelet/kubelet.go @@ -22,6 +22,7 @@ import ( "github.com/DataDog/datadog-agent/pkg/errors" "github.com/DataDog/datadog-agent/pkg/util/containers" pkgcontainersimage "github.com/DataDog/datadog-agent/pkg/util/containers/image" + "github.com/DataDog/datadog-agent/pkg/util/gpu" "github.com/DataDog/datadog-agent/pkg/util/kubernetes" "github.com/DataDog/datadog-agent/pkg/util/kubernetes/kubelet" "github.com/DataDog/datadog-agent/pkg/util/log" @@ -418,21 +419,6 @@ func extractEnvFromSpec(envSpec []kubelet.EnvVar) map[string]string { return env } -func extractGPUVendor(gpuNamePrefix kubelet.ResourceName) string { - gpuVendor := "" - switch gpuNamePrefix { - case kubelet.ResourcePrefixNvidiaMIG, kubelet.ResourceGenericNvidiaGPU: - gpuVendor = "nvidia" - case kubelet.ResourcePrefixAMDGPU: - gpuVendor = "amd" - case kubelet.ResourcePrefixIntelGPU: - gpuVendor = "intel" - default: - gpuVendor = string(gpuNamePrefix) - } - return gpuVendor -} - func extractResources(spec *kubelet.ContainerSpec) workloadmeta.ContainerResources { resources := workloadmeta.ContainerResources{} if cpuReq, found := spec.Resources.Requests[kubelet.ResourceCPU]; found { @@ -444,24 +430,14 @@ func extractResources(spec *kubelet.ContainerSpec) workloadmeta.ContainerResourc } // extract GPU resource info from the possible GPU sources - uniqueGPUVendor := make(map[string]bool) - - resourceKeys := make([]kubelet.ResourceName, 0, len(spec.Resources.Requests)) + uniqueGPUVendor := make(map[string]struct{}) for resourceName := range spec.Resources.Requests { - resourceKeys = append(resourceKeys, resourceName) - } - - for _, gpuResourceName := range kubelet.GetGPUResourceNames() { - for _, resourceKey := range resourceKeys { - if strings.HasPrefix(string(resourceKey), string(gpuResourceName)) { - if gpuReq, found := spec.Resources.Requests[resourceKey]; found { - resources.GPURequest = pointer.Ptr(uint64(gpuReq.Value())) - uniqueGPUVendor[extractGPUVendor(gpuResourceName)] = true - break - } - } + gpuName, found := gpu.ExtractSimpleGPUName(gpu.ResourceGPU(resourceName)) + if found { + uniqueGPUVendor[gpuName] = struct{}{} } } + gpuVendorList := make([]string, 0, len(uniqueGPUVendor)) for GPUVendor := range uniqueGPUVendor { gpuVendorList = append(gpuVendorList, GPUVendor) diff --git a/comp/core/workloadmeta/collectors/util/kubernetes_resource_parsers/pod.go b/comp/core/workloadmeta/collectors/util/kubernetes_resource_parsers/pod.go index 1ae7d26db13f2c..f776081150bb81 100644 --- a/comp/core/workloadmeta/collectors/util/kubernetes_resource_parsers/pod.go +++ b/comp/core/workloadmeta/collectors/util/kubernetes_resource_parsers/pod.go @@ -13,6 +13,7 @@ import ( corev1 "k8s.io/api/core/v1" workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def" + "github.com/DataDog/datadog-agent/pkg/util/gpu" ) type podParser struct { @@ -62,6 +63,20 @@ func (p podParser) Parse(obj interface{}) workloadmeta.Entity { rtcName = *pod.Spec.RuntimeClassName } + var gpuVendorList []string + uniqueGPUVendor := make(map[string]struct{}) + for _, container := range pod.Spec.Containers { + for resourceName := range container.Resources.Limits { + gpuName, found := gpu.ExtractSimpleGPUName(gpu.ResourceGPU(resourceName)) + if found { + uniqueGPUVendor[gpuName] = struct{}{} + } + } + } + for gpuVendor := range uniqueGPUVendor { + gpuVendorList = append(gpuVendorList, gpuVendor) + } + return &workloadmeta.KubernetesPod{ EntityID: workloadmeta.EntityID{ Kind: workloadmeta.KindKubernetesPod, @@ -81,6 +96,7 @@ func (p podParser) Parse(obj interface{}) workloadmeta.Entity { PriorityClass: pod.Spec.PriorityClassName, QOSClass: string(pod.Status.QOSClass), RuntimeClass: rtcName, + GPUVendorList: gpuVendorList, // Containers could be generated by this collector, but // currently it's not to save on memory, since this is supposed diff --git a/go.mod b/go.mod index d39bc25dbbadd7..2b510c208605ad 100644 --- a/go.mod +++ b/go.mod @@ -123,6 +123,7 @@ replace ( github.com/DataDog/datadog-agent/pkg/util/filesystem => ./pkg/util/filesystem github.com/DataDog/datadog-agent/pkg/util/flavor => ./pkg/util/flavor github.com/DataDog/datadog-agent/pkg/util/fxutil => ./pkg/util/fxutil/ + github.com/DataDog/datadog-agent/pkg/util/gpu => ./pkg/util/gpu github.com/DataDog/datadog-agent/pkg/util/grpc => ./pkg/util/grpc/ github.com/DataDog/datadog-agent/pkg/util/hostname/validate => ./pkg/util/hostname/validate/ github.com/DataDog/datadog-agent/pkg/util/http => ./pkg/util/http/ diff --git a/pkg/util/gpu/common.go b/pkg/util/gpu/common.go new file mode 100644 index 00000000000000..7d5dc6a9825b23 --- /dev/null +++ b/pkg/util/gpu/common.go @@ -0,0 +1,48 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +// Package gpu provides utilities for interacting with GPU resources. +package gpu + +import "strings" + +// ResourceGPU represents a GPU resource +type ResourceGPU string + +// Resource name prefixes +const ( + gpuNvidiaGeneric ResourceGPU = "nvidia.com/gpu" + gpuAMD ResourceGPU = "amd.com/gpu" + gpuIntelXe ResourceGPU = "gpu.intel.com/xe" + gpuInteli915 ResourceGPU = "gpu.intel.com/i915" + + gpuNvidiaMigPrefix ResourceGPU = "nvidia.com/mig" +) + +// longToShortGPUName maps a GPU resource to a simplified name +var longToShortGPUName = map[ResourceGPU]string{ + gpuNvidiaGeneric: "nvidia", + gpuAMD: "amd", + gpuIntelXe: "intel", + gpuInteli915: "intel", +} + +// ExtractSimpleGPUName returns a simplified GPU name. +// If the resource is not recognized, the second return value is false. +func ExtractSimpleGPUName(gpuName ResourceGPU) (string, bool) { + val, ok := longToShortGPUName[gpuName] + if ok { + return val, true + } + + // More complex cases (eg. nvidia.com/mig-3g.20gb => nvidia) + switch { + case strings.HasPrefix(string(gpuName), string(gpuNvidiaMigPrefix)): + return "nvidia", true + } + + // Not a GPU resource (or not recognized) + return "", false +} diff --git a/pkg/util/gpu/common_test.go b/pkg/util/gpu/common_test.go new file mode 100644 index 00000000000000..f5e9fa4946c8f6 --- /dev/null +++ b/pkg/util/gpu/common_test.go @@ -0,0 +1,41 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package gpu + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestExtractSimpleGPUName(t *testing.T) { + tests := []struct { + name string + gpuName ResourceGPU + found bool + expected string + }{ + { + name: "known gpu resource", + gpuName: gpuNvidiaGeneric, + found: true, + expected: "nvidia", + }, + { + name: "unknown gpu resource", + gpuName: ResourceGPU("cpu"), + found: false, + expected: "", + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + actual, found := ExtractSimpleGPUName(test.gpuName) + assert.Equal(t, test.found, found) + assert.Equal(t, test.expected, actual) + }) + } +}