diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 42466f04064aae..c3d8aed6105503 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -454,6 +454,7 @@
 /pkg/util/docker/                      @DataDog/container-integrations
 /pkg/util/ecs/                         @DataDog/container-integrations
 /pkg/util/funcs/                       @DataDog/ebpf-platform
+/pkg/util/gpu/                         @DataDog/container-platform
 /pkg/util/kernel/                      @DataDog/ebpf-platform
 /pkg/util/safeelf/                     @DataDog/ebpf-platform
 /pkg/util/ktime                        @DataDog/agent-security
diff --git a/comp/core/workloadmeta/collectors/internal/kubelet/kubelet.go b/comp/core/workloadmeta/collectors/internal/kubelet/kubelet.go
index c9e4960d805729..b875899ef9c145 100644
--- a/comp/core/workloadmeta/collectors/internal/kubelet/kubelet.go
+++ b/comp/core/workloadmeta/collectors/internal/kubelet/kubelet.go
@@ -22,6 +22,7 @@ import (
 	"github.com/DataDog/datadog-agent/pkg/errors"
 	"github.com/DataDog/datadog-agent/pkg/util/containers"
 	pkgcontainersimage "github.com/DataDog/datadog-agent/pkg/util/containers/image"
+	"github.com/DataDog/datadog-agent/pkg/util/gpu"
 	"github.com/DataDog/datadog-agent/pkg/util/kubernetes"
 	"github.com/DataDog/datadog-agent/pkg/util/kubernetes/kubelet"
 	"github.com/DataDog/datadog-agent/pkg/util/log"
@@ -83,13 +84,13 @@ func (c *collector) Pull(ctx context.Context) error {
 		return err
 	}
 
-	events := c.parsePods(updatedPods)
+	events := parsePods(updatedPods)
 
 	if time.Since(c.lastExpire) >= c.expireFreq {
 		var expiredIDs []string
 		expiredIDs, err = c.watcher.Expire()
 		if err == nil {
-			events = append(events, c.parseExpires(expiredIDs)...)
+			events = append(events, parseExpires(expiredIDs)...)
 			c.lastExpire = time.Now()
 		}
 	}
@@ -107,7 +108,7 @@ func (c *collector) GetTargetCatalog() workloadmeta.AgentType {
 	return c.catalog
 }
 
-func (c *collector) parsePods(pods []*kubelet.Pod) []workloadmeta.CollectorEvent {
+func parsePods(pods []*kubelet.Pod) []workloadmeta.CollectorEvent {
 	events := []workloadmeta.CollectorEvent{}
 
 	for _, pod := range pods {
@@ -131,14 +132,14 @@ func (c *collector) parsePods(pods []*kubelet.Pod) []workloadmeta.CollectorEvent
 			ID:   podMeta.UID,
 		}
 
-		podInitContainers, initContainerEvents := c.parsePodContainers(
+		podInitContainers, initContainerEvents := parsePodContainers(
 			pod,
 			pod.Spec.InitContainers,
 			pod.Status.InitContainers,
 			&podID,
 		)
 
-		podContainers, containerEvents := c.parsePodContainers(
+		podContainers, containerEvents := parsePodContainers(
 			pod,
 			pod.Spec.Containers,
 			pod.Status.Containers,
@@ -194,7 +195,7 @@ func (c *collector) parsePods(pods []*kubelet.Pod) []workloadmeta.CollectorEvent
 	return events
 }
 
-func (c *collector) parsePodContainers(
+func parsePodContainers(
 	pod *kubelet.Pod,
 	containerSpecs []kubelet.ContainerSpec,
 	containerStatuses []kubelet.ContainerStatus,
@@ -418,21 +419,6 @@ func extractEnvFromSpec(envSpec []kubelet.EnvVar) map[string]string {
 	return env
 }
 
-func extractGPUVendor(gpuNamePrefix kubelet.ResourceName) string {
-	gpuVendor := ""
-	switch gpuNamePrefix {
-	case kubelet.ResourcePrefixNvidiaMIG, kubelet.ResourceGenericNvidiaGPU:
-		gpuVendor = "nvidia"
-	case kubelet.ResourcePrefixAMDGPU:
-		gpuVendor = "amd"
-	case kubelet.ResourcePrefixIntelGPU:
-		gpuVendor = "intel"
-	default:
-		gpuVendor = string(gpuNamePrefix)
-	}
-	return gpuVendor
-}
-
 func extractResources(spec *kubelet.ContainerSpec) workloadmeta.ContainerResources {
 	resources := workloadmeta.ContainerResources{}
 	if cpuReq, found := spec.Resources.Requests[kubelet.ResourceCPU]; found {
@@ -444,24 +430,14 @@ func extractResources(spec *kubelet.ContainerSpec) workloadmeta.ContainerResourc
 	}
 
 	// extract GPU resource info from the possible GPU sources
-	uniqueGPUVendor := make(map[string]bool)
-
-	resourceKeys := make([]kubelet.ResourceName, 0, len(spec.Resources.Requests))
+	uniqueGPUVendor := make(map[string]struct{})
 	for resourceName := range spec.Resources.Requests {
-		resourceKeys = append(resourceKeys, resourceName)
-	}
-
-	for _, gpuResourceName := range kubelet.GetGPUResourceNames() {
-		for _, resourceKey := range resourceKeys {
-			if strings.HasPrefix(string(resourceKey), string(gpuResourceName)) {
-				if gpuReq, found := spec.Resources.Requests[resourceKey]; found {
-					resources.GPURequest = pointer.Ptr(uint64(gpuReq.Value()))
-					uniqueGPUVendor[extractGPUVendor(gpuResourceName)] = true
-					break
-				}
-			}
+		gpuName, found := gpu.ExtractSimpleGPUName(gpu.ResourceGPU(resourceName))
+		if found {
+			uniqueGPUVendor[gpuName] = struct{}{}
 		}
 	}
+
 	gpuVendorList := make([]string, 0, len(uniqueGPUVendor))
 	for GPUVendor := range uniqueGPUVendor {
 		gpuVendorList = append(gpuVendorList, GPUVendor)
 	}
@@ -481,7 +457,7 @@ func findContainerSpec(name string, specs []kubelet.ContainerSpec) *kubelet.Cont
 	return nil
 }
 
-func (c *collector) parseExpires(expiredIDs []string) []workloadmeta.CollectorEvent {
+func parseExpires(expiredIDs []string) []workloadmeta.CollectorEvent {
 	events := make([]workloadmeta.CollectorEvent, 0, len(expiredIDs))
 	podTerminatedTime := time.Now()
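A note on the collector change above: `parsePods`, `parsePodContainers`, and `parseExpires` no longer read any collector state, so they become free functions, and the hand-rolled GPU resource-name matching is delegated to the shared `pkg/util/gpu` helper introduced later in this patch. Two behavioral details stand out in the rewritten loop: the vendor set switches from `map[string]bool` to the zero-byte `map[string]struct{}`, and `resources.GPURequest` is no longer populated (the removed code set it from the matched request quantity). Below is a minimal standalone sketch of the new dedup flow, not part of the patch; the resource-name list is illustrative.

```go
package main

import (
	"fmt"

	"github.com/DataDog/datadog-agent/pkg/util/gpu"
)

func main() {
	// Hypothetical resource-request keys as they might appear on a container spec.
	requests := []string{"cpu", "memory", "nvidia.com/gpu", "nvidia.com/mig-3g.20gb"}

	// An empty-struct map acts as a set, so duplicate vendors collapse to one entry.
	uniqueGPUVendor := make(map[string]struct{})
	for _, resourceName := range requests {
		if vendor, found := gpu.ExtractSimpleGPUName(gpu.ResourceGPU(resourceName)); found {
			uniqueGPUVendor[vendor] = struct{}{}
		}
	}

	vendors := make([]string, 0, len(uniqueGPUVendor))
	for vendor := range uniqueGPUVendor {
		vendors = append(vendors, vendor)
	}
	fmt.Println(vendors) // prints [nvidia]: both NVIDIA entries map to the same vendor
}
```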
diff --git a/comp/core/workloadmeta/collectors/internal/kubelet/kubelet_test.go b/comp/core/workloadmeta/collectors/internal/kubelet/kubelet_test.go
new file mode 100644
index 00000000000000..a8a1ba1bdda202
--- /dev/null
+++ b/comp/core/workloadmeta/collectors/internal/kubelet/kubelet_test.go
@@ -0,0 +1,175 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0.
+// This product includes software developed at Datadog (https://www.datadoghq.com/).
+// Copyright 2016-present Datadog, Inc.
+
+//go:build kubelet && test
+
+package kubelet
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+
+	"github.com/DataDog/datadog-agent/pkg/util/kubernetes"
+	"github.com/DataDog/datadog-agent/pkg/util/kubernetes/kubelet"
+
+	workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def"
+)
+
+func TestPodParser(t *testing.T) {
+
+	referencePod := []*kubelet.Pod{
+		{
+			Metadata: kubelet.PodMetadata{
+				Name:      "TestPod",
+				UID:       "uniqueIdentifier",
+				Namespace: "namespace",
+				Owners: []kubelet.PodOwner{
+					{
+						Kind: "ReplicaSet",
+						Name: "deployment-hashrs",
+						ID:   "ownerUID",
+					},
+				},
+				Annotations: map[string]string{
+					"annotationKey": "annotationValue",
+				},
+				Labels: map[string]string{
+					"labelKey": "labelValue",
+				},
+			},
+			Spec: kubelet.Spec{
+				PriorityClassName: "priorityClass",
+				Volumes: []kubelet.VolumeSpec{
+					{
+						Name: "pvcVol",
+						PersistentVolumeClaim: &kubelet.PersistentVolumeClaimSpec{
+							ClaimName: "pvcName",
+						},
+					},
+				},
+				Containers: []kubelet.ContainerSpec{
+					{
+						Name:  "nginx-container",
+						Image: "nginx:1.25.2",
+						Resources: &kubelet.ContainerResourcesSpec{
+							Requests: kubelet.ResourceList{
+								"nvidia.com/gpu": resource.Quantity{
+									Format: "1",
+								},
+							},
+						},
+					},
+				},
+			},
+			Status: kubelet.Status{
+				Phase: string(corev1.PodRunning),
+				Conditions: []kubelet.Conditions{
+					{
+						Type:   string(corev1.PodReady),
+						Status: string(corev1.ConditionTrue),
+					},
+				},
+				PodIP:    "127.0.0.1",
+				QOSClass: string(corev1.PodQOSGuaranteed),
+				Containers: []kubelet.ContainerStatus{
+					{
+						Name:    "nginx-container",
+						ImageID: "5dbe7e1b6b9c",
+						Image:   "nginx:1.25.2",
+						ID:      "docker://containerID",
+						Ready:   true,
+					},
+				},
+			},
+		},
+	}
+
+	events := parsePods(referencePod)
+	containerEvent, podEvent := events[0], events[1]
+
+	expectedContainer := &workloadmeta.Container{
+		EntityID: workloadmeta.EntityID{
+			Kind: workloadmeta.KindContainer,
+			ID:   "containerID",
+		},
+		EntityMeta: workloadmeta.EntityMeta{
+			Name: "nginx-container",
+			Labels: map[string]string{
+				kubernetes.CriContainerNamespaceLabel: "namespace",
+			},
+		},
+		Image: workloadmeta.ContainerImage{
+			ID:        "5dbe7e1b6b9c",
+			Name:      "nginx",
+			ShortName: "nginx",
+			Tag:       "1.25.2",
+			RawName:   "nginx:1.25.2",
+		},
+		Runtime: "docker",
+		Resources: workloadmeta.ContainerResources{
+			GPUVendorList: []string{"nvidia"},
+		},
+		Owner: &workloadmeta.EntityID{
+			Kind: "kubernetes_pod",
+			ID:   "uniqueIdentifier",
+		},
+		Ports:   []workloadmeta.ContainerPort{},
+		EnvVars: map[string]string{},
+		State: workloadmeta.ContainerState{
+			Health: "healthy",
+		},
+	}
+	expectedPod := &workloadmeta.KubernetesPod{
+		EntityID: workloadmeta.EntityID{
+			Kind: workloadmeta.KindKubernetesPod,
+			ID:   "uniqueIdentifier",
+		},
+		EntityMeta: workloadmeta.EntityMeta{
+			Name:      "TestPod",
+			Namespace: "namespace",
+			Annotations: map[string]string{
+				"annotationKey": "annotationValue",
+			},
+			Labels: map[string]string{
+				"labelKey": "labelValue",
+			},
+		},
+		Phase: "Running",
+		Owners: []workloadmeta.KubernetesPodOwner{
+			{
+				Kind: "ReplicaSet",
+				Name: "deployment-hashrs",
+				ID:   "ownerUID",
+			},
+		},
+		Containers: []workloadmeta.OrchestratorContainer{
+			{
+				Name: "nginx-container",
+				ID:   "containerID",
+				Image: workloadmeta.ContainerImage{
+					ID:        "5dbe7e1b6b9c",
+					Name:      "nginx",
+					ShortName: "nginx",
+					Tag:       "1.25.2",
+					RawName:   "nginx:1.25.2",
+				},
+			},
+		},
+		InitContainers:             []workloadmeta.OrchestratorContainer{},
+		PersistentVolumeClaimNames: []string{"pvcName"},
+		Ready:                      true,
+		IP:                         "127.0.0.1",
+		PriorityClass:              "priorityClass",
+		GPUVendorList:              []string{"nvidia"},
+		QOSClass:                   "Guaranteed",
+	}
+
+	assert.Equal(t, expectedPod, podEvent.Entity)
+
+	assert.Equal(t, expectedContainer, containerEvent.Entity)
+}
diff --git a/comp/core/workloadmeta/collectors/util/kubernetes_resource_parsers/pod.go b/comp/core/workloadmeta/collectors/util/kubernetes_resource_parsers/pod.go
index 1ae7d26db13f2c..f776081150bb81 100644
--- a/comp/core/workloadmeta/collectors/util/kubernetes_resource_parsers/pod.go
+++ b/comp/core/workloadmeta/collectors/util/kubernetes_resource_parsers/pod.go
@@ -13,6 +13,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 
 	workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def"
+	"github.com/DataDog/datadog-agent/pkg/util/gpu"
 )
 
 type podParser struct {
@@ -62,6 +63,20 @@ func (p podParser) Parse(obj interface{}) workloadmeta.Entity {
 		rtcName = *pod.Spec.RuntimeClassName
 	}
 
+	var gpuVendorList []string
+	uniqueGPUVendor := make(map[string]struct{})
+	for _, container := range pod.Spec.Containers {
+		for resourceName := range container.Resources.Limits {
+			gpuName, found := gpu.ExtractSimpleGPUName(gpu.ResourceGPU(resourceName))
+			if found {
+				uniqueGPUVendor[gpuName] = struct{}{}
+			}
+		}
+	}
+	for gpuVendor := range uniqueGPUVendor {
+		gpuVendorList = append(gpuVendorList, gpuVendor)
+	}
+
 	return &workloadmeta.KubernetesPod{
 		EntityID: workloadmeta.EntityID{
 			Kind: workloadmeta.KindKubernetesPod,
@@ -81,6 +96,7 @@ func (p podParser) Parse(obj interface{}) workloadmeta.Entity {
 		PriorityClass: pod.Spec.PriorityClassName,
 		QOSClass:      string(pod.Status.QOSClass),
 		RuntimeClass:  rtcName,
+		GPUVendorList: gpuVendorList,
 
 		// Containers could be generated by this collector, but
 		// currently it's not to save on memory, since this is supposed
diff --git a/comp/core/workloadmeta/collectors/util/kubernetes_resource_parsers/pod_test.go b/comp/core/workloadmeta/collectors/util/kubernetes_resource_parsers/pod_test.go
index 2815e270e4530c..a90d0a2028357b 100644
--- a/comp/core/workloadmeta/collectors/util/kubernetes_resource_parsers/pod_test.go
+++ b/comp/core/workloadmeta/collectors/util/kubernetes_resource_parsers/pod_test.go
@@ -12,6 +12,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
 	workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def"
@@ -54,6 +55,28 @@ func TestPodParser_Parse(t *testing.T) {
 					},
 				},
 			},
+			Containers: []corev1.Container{
+				{
+					Name: "gpuContainer1",
+					Resources: corev1.ResourceRequirements{
+						Limits: corev1.ResourceList{
+							"nvidia.com/gpu": resource.Quantity{
+								Format: "1",
+							},
+						},
+					},
+				},
+				{
+					Name: "gpuContainer2",
+					Resources: corev1.ResourceRequirements{
+						Limits: corev1.ResourceList{
+							"gpu.intel.com/xe": resource.Quantity{
+								Format: "2",
+							},
+						},
+					},
+				},
+			},
 		},
 		Status: corev1.PodStatus{
 			Phase: corev1.PodRunning,
@@ -97,6 +120,7 @@ func TestPodParser_Parse(t *testing.T) {
 		Ready:         true,
 		IP:            "127.0.0.1",
 		PriorityClass: "priorityClass",
+		GPUVendorList: []string{"nvidia", "intel"},
 		QOSClass:      "Guaranteed",
 	}
diff --git a/pkg/util/gpu/common.go b/pkg/util/gpu/common.go
new file mode 100644
index 00000000000000..7d5dc6a9825b23
--- /dev/null
+++ b/pkg/util/gpu/common.go
@@ -0,0 +1,48 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0.
+// This product includes software developed at Datadog (https://www.datadoghq.com/).
+// Copyright 2016-present Datadog, Inc.
+
+// Package gpu provides utilities for interacting with GPU resources.
+package gpu
+
+import "strings"
+
+// ResourceGPU represents a GPU resource
+type ResourceGPU string
+
+// Resource name prefixes
+const (
+	gpuNvidiaGeneric ResourceGPU = "nvidia.com/gpu"
+	gpuAMD           ResourceGPU = "amd.com/gpu"
+	gpuIntelXe       ResourceGPU = "gpu.intel.com/xe"
+	gpuInteli915     ResourceGPU = "gpu.intel.com/i915"
+
+	gpuNvidiaMigPrefix ResourceGPU = "nvidia.com/mig"
+)
+
+// longToShortGPUName maps a GPU resource to a simplified name
+var longToShortGPUName = map[ResourceGPU]string{
+	gpuNvidiaGeneric: "nvidia",
+	gpuAMD:           "amd",
+	gpuIntelXe:       "intel",
+	gpuInteli915:     "intel",
+}
+
+// ExtractSimpleGPUName returns a simplified GPU name.
+// If the resource is not recognized, the second return value is false.
+func ExtractSimpleGPUName(gpuName ResourceGPU) (string, bool) {
+	val, ok := longToShortGPUName[gpuName]
+	if ok {
+		return val, true
+	}
+
+	// More complex cases (eg. nvidia.com/mig-3g.20gb => nvidia)
+	switch {
+	case strings.HasPrefix(string(gpuName), string(gpuNvidiaMigPrefix)):
+		return "nvidia", true
+	}
+
+	// Not a GPU resource (or not recognized)
+	return "", false
+}
diff --git a/pkg/util/gpu/common_test.go b/pkg/util/gpu/common_test.go
new file mode 100644
index 00000000000000..f5e9fa4946c8f6
--- /dev/null
+++ b/pkg/util/gpu/common_test.go
@@ -0,0 +1,41 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0.
+// This product includes software developed at Datadog (https://www.datadoghq.com/).
+// Copyright 2016-present Datadog, Inc.
+
+package gpu
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestExtractSimpleGPUName(t *testing.T) {
+	tests := []struct {
+		name     string
+		gpuName  ResourceGPU
+		found    bool
+		expected string
+	}{
+		{
+			name:     "known gpu resource",
+			gpuName:  gpuNvidiaGeneric,
+			found:    true,
+			expected: "nvidia",
+		},
+		{
+			name:     "unknown gpu resource",
+			gpuName:  ResourceGPU("cpu"),
+			found:    false,
+			expected: "",
+		},
+	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			actual, found := ExtractSimpleGPUName(test.gpuName)
+			assert.Equal(t, test.found, found)
+			assert.Equal(t, test.expected, actual)
+		})
+	}
+}
diff --git a/releasenotes/notes/kubeapiserver-gpu-tagging-e6202bc782982e5d.yaml b/releasenotes/notes/kubeapiserver-gpu-tagging-e6202bc782982e5d.yaml
new file mode 100644
index 00000000000000..a09ed5f24515e7
--- /dev/null
+++ b/releasenotes/notes/kubeapiserver-gpu-tagging-e6202bc782982e5d.yaml
@@ -0,0 +1,12 @@
+# Each section from every release note are combined when the
+# CHANGELOG.rst is rendered. So the text needs to be worded so that
+# it does not depend on any information only available in another
+# section. This may mean repeating some details, but each section
+# must be readable independently of the other.
+#
+# Each section note must be formatted as reStructuredText.
+---
+fixes:
+  - |
+    Include `gpu_vendor` pod tags on the Datadog Cluster Agent when
+    enabling datadog.clusterTagger.collectKubernetesTags.
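For reference, a usage sketch of the new `pkg/util/gpu` helper (illustrative, not part of the patch): exact resource names resolve through the lookup table, MIG-style names are caught by the `nvidia.com/mig` prefix fallback, and anything else reports `false`.

```go
package main

import (
	"fmt"

	"github.com/DataDog/datadog-agent/pkg/util/gpu"
)

func main() {
	for _, name := range []string{
		"nvidia.com/gpu",         // exact match => "nvidia", true
		"gpu.intel.com/i915",     // exact match => "intel", true
		"nvidia.com/mig-3g.20gb", // MIG prefix fallback => "nvidia", true
		"cpu",                    // not a GPU resource => "", false
	} {
		vendor, ok := gpu.ExtractSimpleGPUName(gpu.ResourceGPU(name))
		fmt.Printf("%s => %q, %v\n", name, vendor, ok)
	}
}
```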