From 4032470d34d17ba46f33db99668d0d7a431e8fcc Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Sat, 27 Jan 2024 17:39:36 +0100 Subject: [PATCH] Document NFD for GPU Labeling Signed-off-by: Carlos Eduardo Arango Gutierrez --- .../docs/tasks/manage-gpus/scheduling-gpus.md | 42 +++++++++++++++---- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/content/en/docs/tasks/manage-gpus/scheduling-gpus.md b/content/en/docs/tasks/manage-gpus/scheduling-gpus.md index d02a20577e7f4..4a967860fb4fa 100644 --- a/content/en/docs/tasks/manage-gpus/scheduling-gpus.md +++ b/content/en/docs/tasks/manage-gpus/scheduling-gpus.md @@ -64,7 +64,7 @@ spec: gpu-vendor.example/example-gpu: 1 # requesting 1 GPU ``` -## Clusters containing different types of GPUs +## Manage clusters with different types of GPUs If different nodes in your cluster have different types of GPUs, then you can use [Node Labels and Node Selectors](/docs/tasks/configure-pod-container/assign-pods-nodes/) @@ -81,12 +81,40 @@ kubectl label nodes node2 accelerator=other-gpu-k915 That label key `accelerator` is just an example; you can use a different label key if you prefer. -## Automatic node labelling {#node-labeller} +### Automatically labeling nodes with Node Feature Discovery {#node-feature-discovery} -If you're using AMD GPU devices, you can deploy +As an administrator, you can automatically discover and label all your GPU-enabled nodes +by deploying the Kubernetes SIG project [Node Feature Discovery](https://github.com/kubernetes-sigs/node-feature-discovery) (NFD). +NFD detects the hardware features that are available on each node in a Kubernetes cluster and advertises those features. +Typically, NFD adds node labels to advertise the features, but NFD can also add extended resources, annotations, and node taints. +NFD is compatible with any recent version of Kubernetes (v1.21+). 
+ +Administrators can also use NFD to taint nodes that have specific features, so that only Pods that request those features can be scheduled on those nodes. +After the nodes in a cluster are labeled with the GPU feature, you can schedule Pods on GPU nodes by adding a matching `nodeSelector` to your Pod spec, as in the following example: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: example-vector-add +spec: + restartPolicy: Never + containers: + - name: example-vector-add + image: "registry.example/example-vector-add:v42" + resources: + limits: + gpu-vendor.example/example-gpu: 1 # requesting 1 GPU + nodeSelector: + gpu-vendor.example/example-gpu: "true" +``` + +NFD exposes an API that allows vendors to build on the automatic labeling functionality. +NVIDIA has implemented this API in its [GPU feature discovery](https://github.com/NVIDIA/gpu-feature-discovery/blob/main/README.md) project. + +### Using custom labellers + +For AMD GPUs, you can use the [Node Labeller](https://github.com/RadeonOpenCompute/k8s-device-plugin/tree/master/cmd/k8s-node-labeller). Node Labeller is a {{< glossary_tooltip text="controller" term_id="controller" >}} that automatically -labels your nodes with GPU device properties. - -Similar functionality for NVIDIA is provided by -[GPU feature discovery](https://github.com/NVIDIA/gpu-feature-discovery/blob/main/README.md). +labels nodes in a Kubernetes cluster with AMD GPU device properties.