Skip to content

Commit

Permalink
Merge pull request #1922 from mboersma/fix-gpu-e2e
Browse files Browse the repository at this point in the history
Update NVIDIA GPU operator componentry
  • Loading branch information
k8s-ci-robot authored Dec 15, 2021
2 parents 28fc000 + 330ab44 commit fff5371
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 15 deletions.
8 changes: 4 additions & 4 deletions templates/cluster-template-nvidia-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6036,7 +6036,7 @@ metadata:
---
apiVersion: v1
data:
gpu-operator-components.yaml: |-
gpu-operator-components.yaml: |
---
# Source: gpu-operator/templates/resources-namespace.yaml
apiVersion: v1
Expand Down Expand Up @@ -6383,7 +6383,7 @@ data:
- name: node-feature-discovery-master
securityContext:
{}
image: "quay.io/kubernetes_incubator/node-feature-discovery:v0.6.0"
image: "k8s.gcr.io/nfd/node-feature-discovery:v0.9.0"
imagePullPolicy: IfNotPresent
ports:
- name: api
Expand Down Expand Up @@ -6509,7 +6509,7 @@ data:
driver:
repository: nvcr.io/nvidia
image: driver
version: 460.32.03
version: 470.82.01
imagePullPolicy: IfNotPresent
repoConfig:
configMapName: ""
Expand All @@ -6529,7 +6529,7 @@ data:
toolkit:
repository: nvcr.io/nvidia/k8s
image: container-toolkit
version: 1.4.7-ubuntu18.04
version: 1.7.2
imagePullPolicy: IfNotPresent
tolerations:
- key: CriticalAddonsOnly
Expand Down
8 changes: 4 additions & 4 deletions templates/flavors/nvidia-gpu/gpu-operator-components.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ spec:
- name: node-feature-discovery-master
securityContext:
{}
image: "quay.io/kubernetes_incubator/node-feature-discovery:v0.6.0"
image: "k8s.gcr.io/nfd/node-feature-discovery:v0.9.0"
imagePullPolicy: IfNotPresent
ports:
- name: api
Expand Down Expand Up @@ -470,7 +470,7 @@ spec:
driver:
repository: nvcr.io/nvidia
image: driver
version: 460.32.03
version: 470.82.01
imagePullPolicy: IfNotPresent
repoConfig:
configMapName: ""
Expand All @@ -490,7 +490,7 @@ spec:
toolkit:
repository: nvcr.io/nvidia/k8s
image: container-toolkit
version: 1.4.7-ubuntu18.04
version: 1.7.2
imagePullPolicy: IfNotPresent
tolerations:
- key: CriticalAddonsOnly
Expand Down Expand Up @@ -535,4 +535,4 @@ spec:
nodeSelector:
nvidia.com/gpu.present: "true"
migStrategy: single
discoveryIntervalSeconds: 60
discoveryIntervalSeconds: 60
2 changes: 1 addition & 1 deletion templates/flavors/nvidia-gpu/patches/cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ kind: Cluster
metadata:
name: ${CLUSTER_NAME}
labels:
gpu: "nvidia"
gpu: "nvidia"
8 changes: 4 additions & 4 deletions templates/test/ci/cluster-template-prow-nvidia-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6041,7 +6041,7 @@ metadata:
---
apiVersion: v1
data:
gpu-operator-components.yaml: |-
gpu-operator-components.yaml: |
---
# Source: gpu-operator/templates/resources-namespace.yaml
apiVersion: v1
Expand Down Expand Up @@ -6388,7 +6388,7 @@ data:
- name: node-feature-discovery-master
securityContext:
{}
image: "quay.io/kubernetes_incubator/node-feature-discovery:v0.6.0"
image: "k8s.gcr.io/nfd/node-feature-discovery:v0.9.0"
imagePullPolicy: IfNotPresent
ports:
- name: api
Expand Down Expand Up @@ -6514,7 +6514,7 @@ data:
driver:
repository: nvcr.io/nvidia
image: driver
version: 460.32.03
version: 470.82.01
imagePullPolicy: IfNotPresent
repoConfig:
configMapName: ""
Expand All @@ -6534,7 +6534,7 @@ data:
toolkit:
repository: nvcr.io/nvidia/k8s
image: container-toolkit
version: 1.4.7-ubuntu18.04
version: 1.7.2
imagePullPolicy: IfNotPresent
tolerations:
- key: CriticalAddonsOnly
Expand Down
24 changes: 22 additions & 2 deletions test/e2e/azure_gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,17 @@ package e2e

import (
"context"
"fmt"
"os"
"strings"

. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"sigs.k8s.io/cluster-api/test/framework"
)

Expand Down Expand Up @@ -73,7 +76,9 @@ func AzureGPUSpec(ctx context.Context, inputGetter func() AzureGPUSpecInput) {
}
}
return false
}, e2eConfig.GetIntervals(specName, "wait-worker-nodes")...).Should(BeTrue())
}, e2eConfig.GetIntervals(specName, "wait-worker-nodes")...).Should(BeTrue(), func() string {
return getGPUOperatorPodLogs(ctx, clientset)
})

By("running a CUDA vector calculation job")
jobsClient := clientset.BatchV1().Jobs(corev1.NamespaceDefault)
Expand All @@ -90,7 +95,7 @@ func AzureGPUSpec(ctx context.Context, inputGetter func() AzureGPUSpecInput) {
Containers: []corev1.Container{
{
Name: jobName,
Image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.1-ubuntu18.04",
Image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.2.1",
Resources: corev1.ResourceRequirements{
Limits: corev1.ResourceList{
"nvidia.com/gpu": resource.MustParse("1"),
Expand All @@ -112,3 +117,18 @@ func AzureGPUSpec(ctx context.Context, inputGetter func() AzureGPUSpecInput) {
}
WaitForJobComplete(ctx, gpuJobInput, e2eConfig.GetIntervals(specName, "wait-job")...)
}

// getGPUOperatorPodLogs gathers the logs of every pod carrying the label
// app.kubernetes.io/instance=gpu-operator, searching all namespaces, and
// concatenates them into one string. Each pod's logs are introduced by a
// "Logs for pod <name>:" header line. If the pods cannot be listed, the text
// of the list error is returned instead.
func getGPUOperatorPodLogs(ctx context.Context, clientset *kubernetes.Clientset) string {
	listOpts := metav1.ListOptions{LabelSelector: "app.kubernetes.io/instance=gpu-operator"}
	podList, err := clientset.CoreV1().Pods(corev1.NamespaceAll).List(ctx, listOpts)
	if err != nil {
		return err.Error()
	}

	var out strings.Builder
	for i := range podList.Items {
		operatorPod := podList.Items[i]
		// Header first, then whatever getPodLogs yields for this pod
		// (its logs, or a formatted error message).
		fmt.Fprintf(&out, "\nLogs for pod %s:\n", operatorPod.Name)
		out.WriteString(getPodLogs(ctx, clientset, operatorPod))
	}
	return out.String()
}
16 changes: 16 additions & 0 deletions test/e2e/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -634,3 +634,19 @@ func resolveKubetestRepoListPath(version string, path string) (string, error) {

return filepath.Join(path, "repo-list.yaml"), nil
}

// getPodLogs reads the full log stream of the given pod, requested with empty
// PodLogOptions, and returns it as a string. It never returns an error value:
// if the stream cannot be opened or copied, the returned string is a message
// describing the failure and naming the pod.
func getPodLogs(ctx context.Context, clientset *kubernetes.Clientset, pod corev1.Pod) string {
	stream, err := clientset.CoreV1().
		Pods(pod.Namespace).
		GetLogs(pod.Name, &corev1.PodLogOptions{}).
		Stream(ctx)
	if err != nil {
		return fmt.Sprintf("error streaming logs for pod %s: %v", pod.Name, err)
	}
	defer stream.Close()

	var buf bytes.Buffer
	if _, copyErr := io.Copy(&buf, stream); copyErr != nil {
		return fmt.Sprintf("error copying logs for pod %s: %v", pod.Name, copyErr)
	}
	return buf.String()
}

0 comments on commit fff5371

Please sign in to comment.