diff --git a/.golangci.yml b/.golangci.yml
index a0d05ef61..c094331ff 100644
--- a/.golangci.yml
+++ b/.golangci.yml
@@ -73,6 +73,8 @@ issues:
     - path: test/e2e/
       linters:
         - wsl
+        - gocognit
+        - gocyclo
     - path: cmd/gpu_fakedev/
       linters:
         - wsl
diff --git a/test/e2e/dlb/dlb.go b/test/e2e/dlb/dlb.go
index fa6304909..6ba6c6467 100644
--- a/test/e2e/dlb/dlb.go
+++ b/test/e2e/dlb/dlb.go
@@ -84,7 +84,7 @@ func describe() {
 	ginkgo.Context("When PF resources are available [Resource:pf]", func() {
 		ginkgo.BeforeEach(func(ctx context.Context) {
 			resource := v1.ResourceName("dlb.intel.com/pf")
-			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resource, 30*time.Second); err != nil {
+			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resource, 30*time.Second, utils.WaitOpGreater); err != nil {
 				framework.Failf("unable to wait for nodes to have positive allocatable resource %s: %v", resource, err)
 			}
 		})
@@ -101,7 +101,7 @@ func describe() {
 	ginkgo.Context("When VF resources are available [Resource:vf]", func() {
 		ginkgo.BeforeEach(func(ctx context.Context) {
 			resource := v1.ResourceName("dlb.intel.com/vf")
-			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resource, 30*time.Second); err != nil {
+			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resource, 30*time.Second, utils.WaitOpGreater); err != nil {
 				framework.Failf("unable to wait for nodes to have positive allocatable resource %s: %v", resource, err)
 			}
 		})
diff --git a/test/e2e/dsa/dsa.go b/test/e2e/dsa/dsa.go
index e2e871251..daea2e990 100644
--- a/test/e2e/dsa/dsa.go
+++ b/test/e2e/dsa/dsa.go
@@ -97,7 +97,7 @@ func describe() {
 	ginkgo.Context("When DSA resources are available [Resource:dedicated]", func() {
 		ginkgo.BeforeEach(func(ctx context.Context) {
 			ginkgo.By("checking if the resource is allocatable")
-			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "dsa.intel.com/wq-user-dedicated", 300*time.Second); err != nil {
+			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "dsa.intel.com/wq-user-dedicated", 300*time.Second, utils.WaitOpGreater); err != nil {
 				framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
 			}
 		})
diff --git a/test/e2e/fpga/fpga.go b/test/e2e/fpga/fpga.go
index acc224017..2a9bb5e95 100644
--- a/test/e2e/fpga/fpga.go
+++ b/test/e2e/fpga/fpga.go
@@ -129,7 +129,7 @@ func runDevicePlugin(ctx context.Context, fmw *framework.Framework, pluginKustom
 
 	ginkgo.By("checking if the resource is allocatable")
 
-	if err = utils.WaitForNodesWithResource(ctx, fmw.ClientSet, resource, 30*time.Second); err != nil {
+	if err = utils.WaitForNodesWithResource(ctx, fmw.ClientSet, resource, 30*time.Second, utils.WaitOpGreater); err != nil {
 		framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
 	}
 }
diff --git a/test/e2e/gpu/gpu.go b/test/e2e/gpu/gpu.go
index 783d556cb..2e858b94b 100644
--- a/test/e2e/gpu/gpu.go
+++ b/test/e2e/gpu/gpu.go
@@ -37,51 +37,88 @@ import (
 
 const (
 	kustomizationYaml   = "deployments/gpu_plugin/kustomization.yaml"
+	monitoringYaml      = "deployments/gpu_plugin/overlays/monitoring_shared-dev_nfd/kustomization.yaml"
+	cdiEnabledYaml      = "deployments/gpu_plugin/overlays/cdi-support/kustomization.yaml"
+	rmEnabledYaml       = "deployments/gpu_plugin/overlays/fractional_resources/kustomization.yaml"
+	nfdRulesYaml        = "deployments/nfd/overlays/node-feature-rules/kustomization.yaml"
 	containerName       = "testcontainer"
 	tfKustomizationYaml = "deployments/gpu_tensorflow_test/kustomization.yaml"
 	tfPodName           = "training-pod"
 )
 
 func init() {
- ginkgo.Describe("GPU plugin [Device:gpu]", describe) + // This needs to be Ordered because only one GPU plugin can function on the node at once. + ginkgo.Describe("GPU plugin [Device:gpu]", describe, ginkgo.Ordered) +} + +func createPluginAndVerifyExistence(f *framework.Framework, ctx context.Context, kustomizationPath, baseResource string) { + ginkgo.By("deploying GPU plugin") + e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomizationPath)) + + ginkgo.By("waiting for GPU plugin's availability") + podList, err := e2epod.WaitForPodsWithLabelRunningReady(ctx, f.ClientSet, f.Namespace.Name, + labels.Set{"app": "intel-gpu-plugin"}.AsSelector(), 1 /* one replica */, 100*time.Second) + if err != nil { + e2edebug.DumpAllNamespaceInfo(ctx, f.ClientSet, f.Namespace.Name) + e2ekubectl.LogFailedContainers(ctx, f.ClientSet, f.Namespace.Name, framework.Logf) + framework.Failf("unable to wait for all pods to be running and ready: %v", err) + } + + ginkgo.By("checking GPU plugin's securityContext") + if err = utils.TestPodsFileSystemInfo(podList.Items); err != nil { + framework.Failf("container filesystem info checks failed: %v", err) + } + + ginkgo.By("checking if the resource is allocatable") + if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, v1.ResourceName(baseResource), 30*time.Second, utils.WaitOpGreater); err != nil { + framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err) + } } func describe() { f := framework.NewDefaultFramework("gpuplugin") f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged - kustomizationPath, errFailedToLocateRepoFile := utils.LocateRepoFile(kustomizationYaml) + vanillaPath, errFailedToLocateRepoFile := utils.LocateRepoFile(kustomizationYaml) if errFailedToLocateRepoFile != nil { framework.Failf("unable to locate %q: %v", kustomizationYaml, errFailedToLocateRepoFile) } - ginkgo.BeforeEach(func(ctx context.Context) { - ginkgo.By("deploying GPU plugin") - e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomizationPath)) - - ginkgo.By("waiting for GPU plugin's availability") - podList, err := e2epod.WaitForPodsWithLabelRunningReady(ctx, f.ClientSet, f.Namespace.Name, - labels.Set{"app": "intel-gpu-plugin"}.AsSelector(), 1 /* one replica */, 100*time.Second) - if err != nil { - e2edebug.DumpAllNamespaceInfo(ctx, f.ClientSet, f.Namespace.Name) - e2ekubectl.LogFailedContainers(ctx, f.ClientSet, f.Namespace.Name, framework.Logf) - framework.Failf("unable to wait for all pods to be running and ready: %v", err) - } - - ginkgo.By("checking GPU plugin's securityContext") - if err = utils.TestPodsFileSystemInfo(podList.Items); err != nil { - framework.Failf("container filesystem info checks failed: %v", err) - } - }) + monitoringPath, errFailedToLocateRepoFile := utils.LocateRepoFile(monitoringYaml) + if errFailedToLocateRepoFile != nil { + framework.Failf("unable to locate %q: %v", monitoringYaml, errFailedToLocateRepoFile) + } - ginkgo.Context("When GPU resources are available [Resource:i915]", func() { - ginkgo.BeforeEach(func(ctx context.Context) { - ginkgo.By("checking if the resource is allocatable") - if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/i915", 30*time.Second); err != nil { - framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err) + cdiPath, errFailedToLocateRepoFile := utils.LocateRepoFile(cdiEnabledYaml) + if errFailedToLocateRepoFile != nil { + framework.Failf("unable to locate %q: %v", 
+	}
+
+	nfdRulesPath, errFailedToLocateRepoFile := utils.LocateRepoFile(nfdRulesYaml)
+	if errFailedToLocateRepoFile != nil {
+		framework.Failf("unable to locate %q: %v", nfdRulesYaml, errFailedToLocateRepoFile)
+	}
+
+	resourceManagerPath, errFailedToLocateRepoFile := utils.LocateRepoFile(rmEnabledYaml)
+	if errFailedToLocateRepoFile != nil {
+		framework.Failf("unable to locate %q: %v", rmEnabledYaml, errFailedToLocateRepoFile)
+	}
+
+	ginkgo.Context("When GPU plugin is deployed [Resource:i915]", func() {
+		ginkgo.AfterEach(func(ctx context.Context) {
+			framework.Logf("Removing gpu-plugin manually")
+
+			e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "delete", "-k", filepath.Dir(vanillaPath))
+
+			// Wait for the resource count to go back to zero.
+			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/i915", 30*time.Second, utils.WaitOpZero); err != nil {
+				framework.Failf("unable to wait for nodes to have no resources: %v", err)
 			}
 		})
+
 		ginkgo.It("checks availability of GPU resources [App:busybox]", func(ctx context.Context) {
+			createPluginAndVerifyExistence(f, ctx, vanillaPath, "gpu.intel.com/i915")
+
 			ginkgo.By("submitting a pod requesting GPU resources")
 			podSpec := &v1.Pod{
 				ObjectMeta: metav1.ObjectMeta{Name: "gpuplugin-tester"},
@@ -122,7 +159,75 @@ func describe() {
 		framework.Logf("found card and renderD from the log")
 	})
 
+	ginkgo.Context("When [Deployment:monitoring] deployment is applied [Resource:i915]", func() {
+		ginkgo.It("check if monitoring resource is available", func(ctx context.Context) {
+			createPluginAndVerifyExistence(f, ctx, monitoringPath, "gpu.intel.com/i915")
+
+			ginkgo.By("checking if the monitoring resource is allocatable")
+			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/i915_monitoring", 30*time.Second, utils.WaitOpGreater); err != nil {
+				framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
+			}
+		})
+	})
+
+	ginkgo.Context("When [Deployment:resourceManager] deployment is applied [Resource:i915]", func() {
+		ginkgo.It("check if i915 resource is available", func(ctx context.Context) {
+			e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(nfdRulesPath))
+
+			createPluginAndVerifyExistence(f, ctx, resourceManagerPath, "gpu.intel.com/i915")
+
+			// To speed up extended resource detection, restart the NFD worker.
+			e2ekubectl.RunKubectlOrDie("node-feature-discovery", "rollout", "restart", "daemonset", "nfd-worker")
+
+			ginkgo.By("checking if the millicores resource is allocatable")
+			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/millicores", 30*time.Second, utils.WaitOpGreater); err != nil {
+				framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
+			}
+
+			ginkgo.By("checking if the tiles resource is allocatable")
+			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/tiles", 30*time.Second, utils.WaitOpGreater); err != nil {
+				framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
+			}
+		})
+	})
+
+	ginkgo.Context("When [Deployment:cdi] deployment is applied [Resource:i915]", func() {
+		ginkgo.It("check if normal resource is available", func(ctx context.Context) {
+			createPluginAndVerifyExistence(f, ctx, cdiPath, "gpu.intel.com/i915")
+
+			podListFunc := framework.ListObjects(f.ClientSet.CoreV1().Pods(f.Namespace.Name).List, metav1.ListOptions{})
+
+			pods, err := podListFunc(ctx)
+			if err != nil {
framework.Failf("Couldn't list pods: %+v", err) + } + + if len(pods.Items) != 1 { + framework.Failf("Invalid amount of Pods listed %d", len(pods.Items)) + } + + pod := pods.Items[0] + + ginkgo.By("checking if CDI path is included in volumes") + found := false + for _, v := range pod.Spec.Volumes { + if v.HostPath != nil && v.HostPath.Path == "/var/run/cdi" { + framework.Logf("CDI volume found") + found = true + + break + } + } + + if !found { + framework.Fail("Couldn't find CDI volume in GPU plugin deployment") + } + }) + }) + ginkgo.It("run a small workload on the GPU [App:tensorflow]", func(ctx context.Context) { + createPluginAndVerifyExistence(f, ctx, vanillaPath, "gpu.intel.com/i915") + kustomYaml, err := utils.LocateRepoFile(tfKustomizationYaml) if err != nil { framework.Failf("unable to locate %q: %v", tfKustomizationYaml, err) @@ -146,13 +251,9 @@ func describe() { }) ginkgo.Context("When GPU resources are available [Resource:xe]", func() { - ginkgo.BeforeEach(func(ctx context.Context) { - ginkgo.By("checking if the resource is allocatable") - if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/xe", 30*time.Second); err != nil { - framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err) - } - }) ginkgo.It("checks availability of GPU resources [App:busybox]", func(ctx context.Context) { + createPluginAndVerifyExistence(f, ctx, vanillaPath, "gpu.intel.com/xe") + ginkgo.By("submitting a pod requesting GPU resources") podSpec := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{Name: "gpuplugin-tester"}, diff --git a/test/e2e/iaa/iaa.go b/test/e2e/iaa/iaa.go index 669fef1f7..5fa394289 100644 --- a/test/e2e/iaa/iaa.go +++ b/test/e2e/iaa/iaa.go @@ -97,7 +97,7 @@ func describe() { ginkgo.Context("When IAA resources are available [Resource:dedicated]", func() { ginkgo.BeforeEach(func(ctx context.Context) { ginkgo.By("checking if the resource is allocatable") - if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "iaa.intel.com/wq-user-dedicated", 300*time.Second); err != nil { + if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "iaa.intel.com/wq-user-dedicated", 300*time.Second, utils.WaitOpGreater); err != nil { framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err) } }) diff --git a/test/e2e/operator/operator.go b/test/e2e/operator/operator.go index 6eb3122de..5913ea97a 100644 --- a/test/e2e/operator/operator.go +++ b/test/e2e/operator/operator.go @@ -89,7 +89,7 @@ func testPluginWithOperator(deviceName string, resourceNames []v1.ResourceName, } for _, resourceName := range resourceNames { - if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, timeout); err != nil { + if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, timeout, utils.WaitOpGreater); err != nil { framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err) } } diff --git a/test/e2e/qat/qatplugin_dpdk.go b/test/e2e/qat/qatplugin_dpdk.go index 0852dc1e6..ba4814953 100644 --- a/test/e2e/qat/qatplugin_dpdk.go +++ b/test/e2e/qat/qatplugin_dpdk.go @@ -98,7 +98,7 @@ func describeQatDpdkPlugin() { } ginkgo.By("checking if the resource is allocatable") - if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, 30*time.Second); err != nil { + if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, 30*time.Second, utils.WaitOpGreater); err != nil { framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", 
 		}
 	})
diff --git a/test/e2e/qat/qatplugin_kernel.go b/test/e2e/qat/qatplugin_kernel.go
index 39ed28655..055ce33ff 100644
--- a/test/e2e/qat/qatplugin_kernel.go
+++ b/test/e2e/qat/qatplugin_kernel.go
@@ -82,7 +82,7 @@ func describeQatKernelPlugin() {
 	ginkgo.Context("When QAT resources are available [Resource:cy1_dc0]", func() {
 		ginkgo.BeforeEach(func(ctx context.Context) {
 			ginkgo.By("checking if the resource is allocatable")
-			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "qat.intel.com/cy1_dc0", 30*time.Second); err != nil {
+			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "qat.intel.com/cy1_dc0", 30*time.Second, utils.WaitOpGreater); err != nil {
 				framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
 			}
 		})
diff --git a/test/e2e/sgx/sgx.go b/test/e2e/sgx/sgx.go
index b8bd2cf3a..300aa08e2 100644
--- a/test/e2e/sgx/sgx.go
+++ b/test/e2e/sgx/sgx.go
@@ -82,13 +82,13 @@ func describe() {
 	ginkgo.Context("When SGX resources are available", func() {
 		ginkgo.BeforeEach(func(ctx context.Context) {
 			ginkgo.By("checking if the resource is allocatable")
-			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/epc", 150*time.Second); err != nil {
+			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/epc", 150*time.Second, utils.WaitOpGreater); err != nil {
 				framework.Failf("unable to wait for nodes to have positive allocatable epc resource: %v", err)
 			}
-			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/enclave", 30*time.Second); err != nil {
+			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/enclave", 30*time.Second, utils.WaitOpGreater); err != nil {
 				framework.Failf("unable to wait for nodes to have positive allocatable enclave resource: %v", err)
 			}
-			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/provision", 30*time.Second); err != nil {
+			if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/provision", 30*time.Second, utils.WaitOpGreater); err != nil {
 				framework.Failf("unable to wait for nodes to have positive allocatable provision resource: %v", err)
 			}
 		})
diff --git a/test/e2e/utils/utils.go b/test/e2e/utils/utils.go
index c44c36ce3..160d0fb13 100644
--- a/test/e2e/utils/utils.go
+++ b/test/e2e/utils/utils.go
@@ -40,6 +40,9 @@ import (
 
 const (
 	poll = time.Second
+
+	WaitOpGreater = 1
+	WaitOpZero    = 2
 )
 
 // GetPodLogs returns the log of the container. If not possible to get logs, it returns the error message.
@@ -52,9 +55,16 @@ func GetPodLogs(ctx context.Context, f *framework.Framework, podName, containerN
 	return fmt.Sprintf("log output of the container %s in the pod %s:%s", containerName, podName, log)
 }
 
-// WaitForNodesWithResource waits for nodes to have positive allocatable resource.
-func WaitForNodesWithResource(ctx context.Context, c clientset.Interface, res v1.ResourceName, timeout time.Duration) error {
-	framework.Logf("Waiting up to %s for any positive allocatable resource %q", timeout, res)
+// WaitForNodesWithResource waits for the nodes' allocatable resources to change.
+// Depending on waitOperation, it waits either for a positive resource count or for the count to drop to zero.
+func WaitForNodesWithResource(ctx context.Context, c clientset.Interface, res v1.ResourceName, timeout time.Duration, waitOperation int) error {
+	if waitOperation == WaitOpGreater {
+		framework.Logf("Waiting up to %s for any positive allocatable resource %q", timeout, res)
+	} else if waitOperation == WaitOpZero {
+		framework.Logf("Waiting up to %s for allocatable resource %q to go to zero", timeout, res)
+	} else {
+		framework.Failf("unknown wait operation given: %d", waitOperation)
+	}
 
 	start := time.Now()
 
@@ -73,7 +83,10 @@ func WaitForNodesWithResource(ctx context.Context, c clientset.Interface, res v1
 			}
 		}
 		framework.Logf("Found %d of %q. Elapsed: %s", resNum, res, time.Since(start))
-		if resNum > 0 {
+
+		if waitOperation == WaitOpGreater && resNum > 0 {
+			return true, nil
+		} else if waitOperation == WaitOpZero && resNum == 0 {
 			return true, nil
 		}
 	}
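
For reference, a minimal usage sketch of the new waitOperation parameter (not part of the patch; the import path, resource name, and timeouts below are illustrative assumptions):

package example

import (
	"context"
	"fmt"
	"time"

	clientset "k8s.io/client-go/kubernetes"

	// Assumed import path for the e2e utils package touched by this patch.
	"github.com/intel/intel-device-plugins-for-kubernetes/test/e2e/utils"
)

// waitForPluginCycle waits for a resource to become allocatable after a plugin
// is deployed, and for it to drain back to zero after the plugin is removed.
func waitForPluginCycle(ctx context.Context, cs clientset.Interface) error {
	// At least one node should advertise the resource once the plugin is up.
	if err := utils.WaitForNodesWithResource(ctx, cs, "gpu.intel.com/i915", 30*time.Second, utils.WaitOpGreater); err != nil {
		return fmt.Errorf("resource never became allocatable: %w", err)
	}

	// ... the plugin deployment would be deleted here ...

	// After removal, no node should advertise the resource anymore.
	if err := utils.WaitForNodesWithResource(ctx, cs, "gpu.intel.com/i915", 30*time.Second, utils.WaitOpZero); err != nil {
		return fmt.Errorf("resource did not drain to zero: %w", err)
	}

	return nil
}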