Merge pull request #1922 from mboersma/fix-gpu-e2e

Update NVIDIA GPU operator componentry
kubernetes-sigs · Dec 15, 2021 · fff5371 · fff5371
2 parents 28fc000 + 330ab44
commit fff5371
Show file tree

Hide file tree

Showing 6 changed files with 51 additions and 15 deletions.
diff --git a/templates/cluster-template-nvidia-gpu.yaml b/templates/cluster-template-nvidia-gpu.yaml
@@ -6036,7 +6036,7 @@ metadata:
 ---
 apiVersion: v1
 data:
-  gpu-operator-components.yaml: |-
+  gpu-operator-components.yaml: |
     ---
     # Source: gpu-operator/templates/resources-namespace.yaml
     apiVersion: v1
@@ -6383,7 +6383,7 @@ data:
             - name: node-feature-discovery-master
               securityContext:
                 {}
-              image: "quay.io/kubernetes_incubator/node-feature-discovery:v0.6.0"
+              image: "k8s.gcr.io/nfd/node-feature-discovery:v0.9.0"
               imagePullPolicy: IfNotPresent
               ports:
                 - name: api
@@ -6509,7 +6509,7 @@ data:
       driver:
         repository: nvcr.io/nvidia
         image: driver
-        version: 460.32.03
+        version: 470.82.01
         imagePullPolicy: IfNotPresent
         repoConfig:
           configMapName: ""
@@ -6529,7 +6529,7 @@ data:
       toolkit:
         repository: nvcr.io/nvidia/k8s
         image: container-toolkit
-        version: 1.4.7-ubuntu18.04
+        version: 1.7.2
         imagePullPolicy: IfNotPresent
         tolerations:
           - key: CriticalAddonsOnly

diff --git a/templates/flavors/nvidia-gpu/gpu-operator-components.yaml b/templates/flavors/nvidia-gpu/gpu-operator-components.yaml
@@ -344,7 +344,7 @@ spec:
         - name: node-feature-discovery-master
           securityContext:
             {}
-          image: "quay.io/kubernetes_incubator/node-feature-discovery:v0.6.0"
+          image: "k8s.gcr.io/nfd/node-feature-discovery:v0.9.0"
           imagePullPolicy: IfNotPresent
           ports:
             - name: api
@@ -470,7 +470,7 @@ spec:
   driver:
     repository: nvcr.io/nvidia
     image: driver
-    version: 460.32.03
+    version: 470.82.01
     imagePullPolicy: IfNotPresent
     repoConfig:
       configMapName: ""
@@ -490,7 +490,7 @@ spec:
   toolkit:
     repository: nvcr.io/nvidia/k8s
     image: container-toolkit
-    version: 1.4.7-ubuntu18.04
+    version: 1.7.2
     imagePullPolicy: IfNotPresent
     tolerations:
       - key: CriticalAddonsOnly
@@ -535,4 +535,4 @@ spec:
     nodeSelector:
       nvidia.com/gpu.present: "true"
     migStrategy: single
-    discoveryIntervalSeconds: 60
\ No newline at end of file
+    discoveryIntervalSeconds: 60
diff --git a/templates/flavors/nvidia-gpu/patches/cluster.yaml b/templates/flavors/nvidia-gpu/patches/cluster.yaml
@@ -3,4 +3,4 @@ kind: Cluster
 metadata:
   name: ${CLUSTER_NAME}
   labels:
-    gpu: "nvidia"
\ No newline at end of file
+    gpu: "nvidia"
diff --git a/templates/test/ci/cluster-template-prow-nvidia-gpu.yaml b/templates/test/ci/cluster-template-prow-nvidia-gpu.yaml
@@ -6041,7 +6041,7 @@ metadata:
 ---
 apiVersion: v1
 data:
-  gpu-operator-components.yaml: |-
+  gpu-operator-components.yaml: |
     ---
     # Source: gpu-operator/templates/resources-namespace.yaml
     apiVersion: v1
@@ -6388,7 +6388,7 @@ data:
             - name: node-feature-discovery-master
               securityContext:
                 {}
-              image: "quay.io/kubernetes_incubator/node-feature-discovery:v0.6.0"
+              image: "k8s.gcr.io/nfd/node-feature-discovery:v0.9.0"
               imagePullPolicy: IfNotPresent
               ports:
                 - name: api
@@ -6514,7 +6514,7 @@ data:
       driver:
         repository: nvcr.io/nvidia
         image: driver
-        version: 460.32.03
+        version: 470.82.01
         imagePullPolicy: IfNotPresent
         repoConfig:
           configMapName: ""
@@ -6534,7 +6534,7 @@ data:
       toolkit:
         repository: nvcr.io/nvidia/k8s
         image: container-toolkit
-        version: 1.4.7-ubuntu18.04
+        version: 1.7.2
         imagePullPolicy: IfNotPresent
         tolerations:
           - key: CriticalAddonsOnly

diff --git a/test/e2e/azure_gpu.go b/test/e2e/azure_gpu.go
@@ -20,14 +20,17 @@ package e2e
 
 import (
 	"context"
+	"fmt"
 	"os"
+	"strings"
 
 	. "github.com/onsi/ginkgo"
 	. "github.com/onsi/gomega"
 	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
 	"sigs.k8s.io/cluster-api/test/framework"
 )
 
@@ -73,7 +76,9 @@ func AzureGPUSpec(ctx context.Context, inputGetter func() AzureGPUSpecInput) {
 			}
 		}
 		return false
-	}, e2eConfig.GetIntervals(specName, "wait-worker-nodes")...).Should(BeTrue())
+	}, e2eConfig.GetIntervals(specName, "wait-worker-nodes")...).Should(BeTrue(), func() string {
+		return getGPUOperatorPodLogs(ctx, clientset)
+	})
 
 	By("running a CUDA vector calculation job")
 	jobsClient := clientset.BatchV1().Jobs(corev1.NamespaceDefault)
@@ -90,7 +95,7 @@ func AzureGPUSpec(ctx context.Context, inputGetter func() AzureGPUSpecInput) {
 					Containers: []corev1.Container{
 						{
 							Name:  jobName,
-							Image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.1-ubuntu18.04",
+							Image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.2.1",
 							Resources: corev1.ResourceRequirements{
 								Limits: corev1.ResourceList{
 									"nvidia.com/gpu": resource.MustParse("1"),
@@ -112,3 +117,18 @@ func AzureGPUSpec(ctx context.Context, inputGetter func() AzureGPUSpecInput) {
 	}
 	WaitForJobComplete(ctx, gpuJobInput, e2eConfig.GetIntervals(specName, "wait-job")...)
 }
+
+// getGPUOperatorPodLogs returns the logs of the Nvidia GPU operator pods.
+func getGPUOperatorPodLogs(ctx context.Context, clientset *kubernetes.Clientset) string {
+	podsClient := clientset.CoreV1().Pods(corev1.NamespaceAll)
+	pods, err := podsClient.List(ctx, metav1.ListOptions{LabelSelector: "app.kubernetes.io/instance=gpu-operator"})
+	if err != nil {
+		return err.Error()
+	}
+	b := strings.Builder{}
+	for _, pod := range pods.Items {
+		b.WriteString(fmt.Sprintf("\nLogs for pod %s:\n", pod.Name))
+		b.WriteString(getPodLogs(ctx, clientset, pod))
+	}
+	return b.String()
+}
diff --git a/test/e2e/helpers.go b/test/e2e/helpers.go
@@ -634,3 +634,19 @@ func resolveKubetestRepoListPath(version string, path string) (string, error) {
 
 	return filepath.Join(path, "repo-list.yaml"), nil
 }
+
+// getPodLogs returns the logs of a pod, or an error in string format.
+func getPodLogs(ctx context.Context, clientset *kubernetes.Clientset, pod corev1.Pod) string {
+	req := clientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &corev1.PodLogOptions{})
+	logs, err := req.Stream(ctx)
+	if err != nil {
+		return fmt.Sprintf("error streaming logs for pod %s: %v", pod.Name, err)
+	}
+	defer logs.Close()
+
+	b := new(bytes.Buffer)
+	if _, err = io.Copy(b, logs); err != nil {
+		return fmt.Sprintf("error copying logs for pod %s: %v", pod.Name, err)
+	}
+	return b.String()
+}