From c4937ba3a4a124016a3d7cc72ad35e720ce6c4fe Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Sat, 27 Jan 2024 17:39:36 +0100 Subject: [PATCH 1/2] Document NFD for GPU Labeling Signed-off-by: Carlos Eduardo Arango Gutierrez --- .../docs/tasks/manage-gpus/scheduling-gpus.md | 50 ++++++++++++++++--- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/content/en/docs/tasks/manage-gpus/scheduling-gpus.md b/content/en/docs/tasks/manage-gpus/scheduling-gpus.md index d02a20577e7f4..71a8f486bf6e8 100644 --- a/content/en/docs/tasks/manage-gpus/scheduling-gpus.md +++ b/content/en/docs/tasks/manage-gpus/scheduling-gpus.md @@ -64,7 +64,7 @@ spec: gpu-vendor.example/example-gpu: 1 # requesting 1 GPU ``` -## Clusters containing different types of GPUs +## Manage clusters with different types of GPUs If different nodes in your cluster have different types of GPUs, then you can use [Node Labels and Node Selectors](/docs/tasks/configure-pod-container/assign-pods-nodes/) @@ -83,10 +83,46 @@ a different label key if you prefer. ## Automatic node labelling {#node-labeller} -If you're using AMD GPU devices, you can deploy -[Node Labeller](https://github.com/RadeonOpenCompute/k8s-device-plugin/tree/master/cmd/k8s-node-labeller). -Node Labeller is a {{< glossary_tooltip text="controller" term_id="controller" >}} that automatically -labels your nodes with GPU device properties. +As an administrator, you can automatically discover and label all your GPU-enabled nodes +by deploying Kubernetes [Node Feature Discovery](https://github.com/kubernetes-sigs/node-feature-discovery) (NFD). +NFD detects the hardware features that are available on each node in a Kubernetes cluster. +Typically, NFD is configured to advertise those features as node labels, but NFD can also add extended resources, annotations, and node taints. +NFD is compatible with all [supported versions](/releases/version-skew-policy/#supported-versions) of Kubernetes. 
+By default NFD creates the [feature labels](https://kubernetes-sigs.github.io/node-feature-discovery/master/usage/features.html) for the detected features. +Administrators can leverage NFD to also taint nodes with specific features, so that only pods that request those features can be scheduled on those nodes. -Similar functionality for NVIDIA is provided by -[GPU feature discovery](https://github.com/NVIDIA/gpu-feature-discovery/blob/main/README.md). +You also need a plugin for NFD that adds appropriate labels to your nodes; these might be generic +labels or they could be vendor specific. Your GPU vendor may provide a third party +plugin for NFD; check their documentation for more details. + +{{< highlight yaml "linenos=false,hl_lines=6-20" >}} +apiVersion: v1 +kind: Pod +metadata: + name: example-vector-add +spec: + # You can use Kubernetes node affinity to schedule this Pod onto a node + # that provides the kind of GPU that its container needs in order to work + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "gpu.gpu-vendor.example/installed-memory" + operator: Gt # (greater than) + values: ["40535"] + - key: "feature.node.kubernetes.io/pci-10.present" # NFD Feature label + values: ["true"] # (optional) only schedule on nodes with PCI device 10 + restartPolicy: OnFailure + containers: + - name: example-vector-add + image: "registry.example/example-vector-add:v42" + resources: + limits: + gpu-vendor.example/example-gpu: 1 # requesting 1 GPU +{{< /highlight >}} + +#### GPU vendor implementations + +- [Intel](https://intel.github.io/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/README.html) +- [NVIDIA](https://github.com/NVIDIA/gpu-feature-discovery/#readme) From 07b14de0273fd0b560faeebdab773294a3d77bdc Mon Sep 17 00:00:00 2001 From: Tim Bannister Date: Tue, 30 Jan 2024 17:02:25 +0000 Subject: [PATCH 2/2] Fix highlighting --- content/en/docs/tasks/manage-gpus/scheduling-gpus.md | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/en/docs/tasks/manage-gpus/scheduling-gpus.md b/content/en/docs/tasks/manage-gpus/scheduling-gpus.md index 71a8f486bf6e8..b4337c3615861 100644 --- a/content/en/docs/tasks/manage-gpus/scheduling-gpus.md +++ b/content/en/docs/tasks/manage-gpus/scheduling-gpus.md @@ -95,7 +95,7 @@ You also need a plugin for NFD that adds appropriate labels to your nodes; these labels or they could be vendor specific. Your GPU vendor may provide a third party plugin for NFD; check their documentation for more details. -{{< highlight yaml "linenos=false,hl_lines=6-20" >}} +{{< highlight yaml "linenos=false,hl_lines=6-18" >}} apiVersion: v1 kind: Pod metadata: