Skip to content

Commit

Permalink
Parsing GPU tags on kubeapiserver collector
Browse files Browse the repository at this point in the history
  • Loading branch information
gabedos committed Nov 26, 2024
1 parent 924a150 commit af401ee
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 30 deletions.
36 changes: 6 additions & 30 deletions comp/core/workloadmeta/collectors/internal/kubelet/kubelet.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/DataDog/datadog-agent/pkg/errors"
"github.com/DataDog/datadog-agent/pkg/util/containers"
pkgcontainersimage "github.com/DataDog/datadog-agent/pkg/util/containers/image"
"github.com/DataDog/datadog-agent/pkg/util/gpu"
"github.com/DataDog/datadog-agent/pkg/util/kubernetes"
"github.com/DataDog/datadog-agent/pkg/util/kubernetes/kubelet"
"github.com/DataDog/datadog-agent/pkg/util/log"
Expand Down Expand Up @@ -418,21 +419,6 @@ func extractEnvFromSpec(envSpec []kubelet.EnvVar) map[string]string {
return env
}

// extractGPUVendor translates a GPU resource-name prefix into a short vendor
// name ("nvidia", "amd" or "intel"). A prefix that matches none of the known
// cases is returned unchanged, converted to a string.
func extractGPUVendor(gpuNamePrefix kubelet.ResourceName) string {
	switch gpuNamePrefix {
	case kubelet.ResourcePrefixNvidiaMIG, kubelet.ResourceGenericNvidiaGPU:
		return "nvidia"
	case kubelet.ResourcePrefixAMDGPU:
		return "amd"
	case kubelet.ResourcePrefixIntelGPU:
		return "intel"
	}
	// Unknown prefix: fall back to the raw prefix itself.
	return string(gpuNamePrefix)
}

func extractResources(spec *kubelet.ContainerSpec) workloadmeta.ContainerResources {
resources := workloadmeta.ContainerResources{}
if cpuReq, found := spec.Resources.Requests[kubelet.ResourceCPU]; found {
Expand All @@ -444,24 +430,14 @@ func extractResources(spec *kubelet.ContainerSpec) workloadmeta.ContainerResourc
}

// extract GPU resource info from the possible GPU sources
uniqueGPUVendor := make(map[string]bool)

resourceKeys := make([]kubelet.ResourceName, 0, len(spec.Resources.Requests))
uniqueGPUVendor := make(map[string]struct{})
for resourceName := range spec.Resources.Requests {
resourceKeys = append(resourceKeys, resourceName)
}

for _, gpuResourceName := range kubelet.GetGPUResourceNames() {
for _, resourceKey := range resourceKeys {
if strings.HasPrefix(string(resourceKey), string(gpuResourceName)) {
if gpuReq, found := spec.Resources.Requests[resourceKey]; found {
resources.GPURequest = pointer.Ptr(uint64(gpuReq.Value()))
uniqueGPUVendor[extractGPUVendor(gpuResourceName)] = true
break
}
}
gpuName, found := gpu.ExtractSimpleGPUName(gpu.ResourceGPU(resourceName))
if found {
uniqueGPUVendor[gpuName] = struct{}{}
}
}

gpuVendorList := make([]string, 0, len(uniqueGPUVendor))
for GPUVendor := range uniqueGPUVendor {
gpuVendorList = append(gpuVendorList, GPUVendor)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
corev1 "k8s.io/api/core/v1"

workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def"
"github.com/DataDog/datadog-agent/pkg/util/gpu"
)

type podParser struct {
Expand Down Expand Up @@ -62,6 +63,20 @@ func (p podParser) Parse(obj interface{}) workloadmeta.Entity {
rtcName = *pod.Spec.RuntimeClassName
}

var gpuVendorList []string
uniqueGPUVendor := make(map[string]struct{})
for _, container := range pod.Spec.Containers {
for resourceName := range container.Resources.Limits {
gpuName, found := gpu.ExtractSimpleGPUName(gpu.ResourceGPU(resourceName))
if found {
uniqueGPUVendor[gpuName] = struct{}{}
}
}
}
for gpuVendor := range uniqueGPUVendor {
gpuVendorList = append(gpuVendorList, gpuVendor)
}

return &workloadmeta.KubernetesPod{
EntityID: workloadmeta.EntityID{
Kind: workloadmeta.KindKubernetesPod,
Expand All @@ -81,6 +96,7 @@ func (p podParser) Parse(obj interface{}) workloadmeta.Entity {
PriorityClass: pod.Spec.PriorityClassName,
QOSClass: string(pod.Status.QOSClass),
RuntimeClass: rtcName,
GPUVendorList: gpuVendorList,

// Containers could be generated by this collector, but
// currently it's not to save on memory, since this is supposed
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ replace (
github.com/DataDog/datadog-agent/pkg/util/filesystem => ./pkg/util/filesystem
github.com/DataDog/datadog-agent/pkg/util/flavor => ./pkg/util/flavor
github.com/DataDog/datadog-agent/pkg/util/fxutil => ./pkg/util/fxutil/
github.com/DataDog/datadog-agent/pkg/util/gpu => ./pkg/util/gpu
github.com/DataDog/datadog-agent/pkg/util/grpc => ./pkg/util/grpc/
github.com/DataDog/datadog-agent/pkg/util/hostname/validate => ./pkg/util/hostname/validate/
github.com/DataDog/datadog-agent/pkg/util/http => ./pkg/util/http/
Expand Down
48 changes: 48 additions & 0 deletions pkg/util/gpu/common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2016-present Datadog, Inc.

// Package gpu provides utilities for interacting with GPU resources.
package gpu

import "strings"

// ResourceGPU is the name of a GPU resource (for example "nvidia.com/gpu").
type ResourceGPU string

// Resource names that are matched exactly.
const (
	gpuNvidiaGeneric ResourceGPU = "nvidia.com/gpu"
	gpuAMD           ResourceGPU = "amd.com/gpu"
	gpuIntelXe       ResourceGPU = "gpu.intel.com/xe"
	gpuInteli915     ResourceGPU = "gpu.intel.com/i915"
)

// gpuNvidiaMigPrefix is matched as a prefix rather than exactly, so that
// suffixed names such as "nvidia.com/mig-3g.20gb" are recognized.
const gpuNvidiaMigPrefix ResourceGPU = "nvidia.com/mig"

// longToShortGPUName is the exact-match table from a full GPU resource name
// to its simplified name.
var longToShortGPUName = map[ResourceGPU]string{
	gpuNvidiaGeneric: "nvidia",
	gpuAMD:           "amd",
	gpuIntelXe:       "intel",
	gpuInteli915:     "intel",
}

// ExtractSimpleGPUName returns the simplified name for a GPU resource.
//
// The exact-match table is consulted first; if the name is not in it, the
// name is checked against the NVIDIA MIG prefix. The second return value is
// false, and the first is empty, when the resource is not recognized.
func ExtractSimpleGPUName(gpuName ResourceGPU) (string, bool) {
	if short, known := longToShortGPUName[gpuName]; known {
		return short, true
	}

	// Prefix-based names (e.g. nvidia.com/mig-3g.20gb => nvidia).
	if strings.HasPrefix(string(gpuName), string(gpuNvidiaMigPrefix)) {
		return "nvidia", true
	}

	// Not a GPU resource (or not a recognized one).
	return "", false
}
41 changes: 41 additions & 0 deletions pkg/util/gpu/common_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2016-present Datadog, Inc.

package gpu

import (
"testing"

"github.com/stretchr/testify/assert"
)

// TestExtractSimpleGPUName checks ExtractSimpleGPUName against a table of
// resource names covering the exact-match entries, the NVIDIA MIG prefix
// branch, and names that must not be recognized.
func TestExtractSimpleGPUName(t *testing.T) {
	tests := []struct {
		name     string
		gpuName  ResourceGPU
		found    bool
		expected string
	}{
		{
			name:     "known gpu resource",
			gpuName:  gpuNvidiaGeneric,
			found:    true,
			expected: "nvidia",
		},
		{
			name:     "amd gpu resource",
			gpuName:  ResourceGPU("amd.com/gpu"),
			found:    true,
			expected: "amd",
		},
		{
			name:     "intel xe gpu resource",
			gpuName:  ResourceGPU("gpu.intel.com/xe"),
			found:    true,
			expected: "intel",
		},
		{
			name:     "intel i915 gpu resource",
			gpuName:  ResourceGPU("gpu.intel.com/i915"),
			found:    true,
			expected: "intel",
		},
		{
			// Exercises the prefix branch, which the exact-match table does not cover.
			name:     "nvidia mig resource",
			gpuName:  ResourceGPU("nvidia.com/mig-3g.20gb"),
			found:    true,
			expected: "nvidia",
		},
		{
			name:     "unknown gpu resource",
			gpuName:  ResourceGPU("cpu"),
			found:    false,
			expected: "",
		},
		{
			name:     "empty resource name",
			gpuName:  ResourceGPU(""),
			found:    false,
			expected: "",
		},
		{
			// A truncated known name must not match.
			name:     "near miss of known resource",
			gpuName:  ResourceGPU("nvidia.com/gp"),
			found:    false,
			expected: "",
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			actual, found := ExtractSimpleGPUName(test.gpuName)
			assert.Equal(t, test.found, found)
			assert.Equal(t, test.expected, actual)
		})
	}
}

0 comments on commit af401ee

Please sign in to comment.