From c653ab67b5ce21d8197fa4681e76006d649c1c56 Mon Sep 17 00:00:00 2001
From: Martin Linkhorst
Date: Wed, 22 May 2024 16:36:16 +0200
Subject: [PATCH 1/2] add nvidia dcgm-exporter to export gpu metrics

---
Notes (review):
    Adds a second container, dcgm-exporter, to the pod template in
    nvidia-gpu-device-plugin.yaml. It is started with --kubernetes and
    --address=:9400 and exposes containerPort 9400 under the name "metrics";
    the pod gains prometheus.io/path, /port and /scrape annotations that
    point at /metrics on port 9400.

    CPU and memory requests and limits both read the two new config items
    nvidia_dcgm_exporter_cpu and nvidia_dcgm_exporter_memory, whose defaults
    ("10m", "200Mi") are added to config-defaults.yaml.

    The new container runs as UID 0 with the SYS_ADMIN capability added and
    mounts the new pod-gpu-resources hostPath volume read-only at
    /var/lib/kubelet/pod-resources.

    NOTE(review): the existing device-plugin volume uses a host path under
    /var/lib/kubelet, while the new volume uses /opt/podruntime/kubelet.
    Confirm which kubelet root directory the nodes actually use.

    NOTE(review): line breaks and leading whitespace in this series were
    restored from a copy whose whitespace had been collapsed. The per-hunk
    line counts match the @@ headers, but the indentation of the manifest
    lines is inferred and should be confirmed against the target files
    before applying.

 cluster/config-defaults.yaml                  |  4 +++
 .../nvidia/nvidia-gpu-device-plugin.yaml      | 30 +++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/cluster/config-defaults.yaml b/cluster/config-defaults.yaml
index 97d155f9de..3e5575e196 100644
--- a/cluster/config-defaults.yaml
+++ b/cluster/config-defaults.yaml
@@ -387,6 +387,10 @@ flannel_memory: "100Mi"
 nvidia_device_plugin_cpu: "10m"
 nvidia_device_plugin_memory: "50Mi"
 
+# nvidia dcgm exporter
+nvidia_dcgm_exporter_cpu: "10m"
+nvidia_dcgm_exporter_memory: "200Mi"
+
 # static egress controller settings
 static_egress_controller_enabled: "true"
 
diff --git a/cluster/manifests/nvidia/nvidia-gpu-device-plugin.yaml b/cluster/manifests/nvidia/nvidia-gpu-device-plugin.yaml
index 27fa0fa324..e030ec55ad 100644
--- a/cluster/manifests/nvidia/nvidia-gpu-device-plugin.yaml
+++ b/cluster/manifests/nvidia/nvidia-gpu-device-plugin.yaml
@@ -24,6 +24,9 @@ spec:
         component: nvidia-gpu-device-plugin
       annotations:
         logging/destination: "{{.Cluster.ConfigItems.log_destination_infra}}"
+        prometheus.io/path: /metrics
+        prometheus.io/port: "9400"
+        prometheus.io/scrape: "true"
     spec:
       serviceAccountName: nvidia
       tolerations:
@@ -48,6 +51,9 @@ spec:
       - name: device-plugin
         hostPath:
           path: /var/lib/kubelet/device-plugins
+      - name: pod-gpu-resources
+        hostPath:
+          path: /opt/podruntime/kubelet/pod-resources
       containers:
       - name: nvidia-gpu-device-plugin
         image: container-registry.zalando.net/teapot/nvidia-gpu-device-plugin:v0.14.5-master-10
@@ -68,3 +74,27 @@ spec:
         volumeMounts:
         - name: device-plugin
           mountPath: /var/lib/kubelet/device-plugins
+      - name: dcgm-exporter
+        image: container-registry.zalando.net/teapot/nvidia-dcgm-exporter:v3.3.6-3.4.2-ubuntu22.04-master-11
+        args:
+        - --kubernetes
+        - --address=:9400
+        resources:
+          requests:
+            cpu: "{{ .Cluster.ConfigItems.nvidia_dcgm_exporter_cpu }}"
+            memory: "{{ .Cluster.ConfigItems.nvidia_dcgm_exporter_memory }}"
+          limits:
+            cpu: "{{ .Cluster.ConfigItems.nvidia_dcgm_exporter_cpu }}"
+            memory: "{{ .Cluster.ConfigItems.nvidia_dcgm_exporter_memory }}"
+        ports:
+        - name: metrics
+          containerPort: 9400
+        securityContext:
+          runAsNonRoot: false
+          runAsUser: 0
+          capabilities:
+            add: ["SYS_ADMIN"]
+        volumeMounts:
+        - name: pod-gpu-resources
+          mountPath: /var/lib/kubelet/pod-resources
+          readOnly: true

From 8b9a9680824c839d441a1a7d1a506bafa4ad3727 Mon Sep 17 00:00:00 2001
From: Roman Zavodskikh
Date: Fri, 24 May 2024 14:52:25 +0200
Subject: [PATCH 2/2] Update skipper version, step 1/2

Signed-off-by: Roman Zavodskikh
---
Notes (review):
    Only $canary_internal_version changes (v0.21.91-921 -> v0.21.99-931);
    $internal_version stays at v0.21.91-921.

    NOTE(review): the subject says "step 1/2"; presumably a follow-up change
    bumps $internal_version to the same release. That change is not part of
    this series - confirm.

 cluster/manifests/skipper/deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cluster/manifests/skipper/deployment.yaml b/cluster/manifests/skipper/deployment.yaml
index d30f2a60a4..7c22eb633c 100644
--- a/cluster/manifests/skipper/deployment.yaml
+++ b/cluster/manifests/skipper/deployment.yaml
@@ -1,5 +1,5 @@
 {{ $internal_version := "v0.21.91-921" }}
-{{ $canary_internal_version := "v0.21.91-921" }}
+{{ $canary_internal_version := "v0.21.99-931" }}
 
 {{/* Optional canary arguments separated by "[cf724afc]" to allow whitespaces, e.g. "-foo=has a whitespace[cf724afc]-baz=qux" */}}
 {{ $canary_args := "" }}