From 7616ff0a3730f7f56ee9da64ffa79fb6de099014 Mon Sep 17 00:00:00 2001
From: Tuomas Katila
Date: Thu, 6 Apr 2023 12:26:43 +0300
Subject: [PATCH] e2e: gpu: add a basic tensorflow test

Signed-off-by: Tuomas Katila
---
 .../gpu_tensorflow_test/deployment.yaml       | 24 +++++++++
 .../gpu_tensorflow_test/kustomization.yaml    | 11 ++++
 deployments/gpu_tensorflow_test/training.py   | 46 ++++++++++++++++
 test/e2e/gpu/gpu.go                           | 52 ++++++++++++++++++-
 4 files changed, 131 insertions(+), 2 deletions(-)
 create mode 100644 deployments/gpu_tensorflow_test/deployment.yaml
 create mode 100644 deployments/gpu_tensorflow_test/kustomization.yaml
 create mode 100644 deployments/gpu_tensorflow_test/training.py

diff --git a/deployments/gpu_tensorflow_test/deployment.yaml b/deployments/gpu_tensorflow_test/deployment.yaml
new file mode 100644
index 000000000..6aa4cd6e5
--- /dev/null
+++ b/deployments/gpu_tensorflow_test/deployment.yaml
@@ -0,0 +1,24 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: training-pod
+spec:
+  restartPolicy: Never
+  containers:
+  - name: testcontainer
+    image: intel/intel-extension-for-tensorflow:latest
+    imagePullPolicy: IfNotPresent
+    command: ["/bin/sh", "-c"]
+    args: ["python /code/training.py"]
+    resources:
+      limits:
+        gpu.intel.com/i915: 1
+      requests:
+        gpu.intel.com/i915: 1
+    volumeMounts:
+    - mountPath: /code
+      name: code
+  volumes:
+  - configMap:
+      name: training-code
+    name: code
diff --git a/deployments/gpu_tensorflow_test/kustomization.yaml b/deployments/gpu_tensorflow_test/kustomization.yaml
new file mode 100644
index 000000000..461848f14
--- /dev/null
+++ b/deployments/gpu_tensorflow_test/kustomization.yaml
@@ -0,0 +1,11 @@
+configMapGenerator:
+- name: training-code
+  files:
+  - training.py
+
+resources:
+  - deployment.yaml
+
+images:
+  - name: intel/intel-extension-for-tensorflow
+    newTag: 1.1.0-gpu-flex
diff --git a/deployments/gpu_tensorflow_test/training.py b/deployments/gpu_tensorflow_test/training.py
new file mode 100644
index 000000000..91879d23c
--- /dev/null
+++ b/deployments/gpu_tensorflow_test/training.py
@@ -0,0 +1,46 @@
+# original code from:
+# https://github.com/tensorflow/examples/blob/master/courses/udacity_intro_to_tensorflow_for_deep_learning/l02c01_celsius_to_fahrenheit.ipynb
+# this is slightly modified to run explicitly with XPU devices
+
+import tensorflow as tf
+import intel_extension_for_tensorflow as itex
+import numpy as np
+
+print("BACKENDS: ", str(itex.get_backend()))
+
+devs = tf.config.list_physical_devices('XPU')
+
+print(devs)
+
+if not devs:
+    raise Exception("No devices found")
+
+with tf.device("/xpu:0"):
+    celsius_q = np.array([-40, -10, 0, 8, 15, 22, 38], dtype=float)
+    fahrenheit_a = np.array([-40, 14, 32, 46, 59, 72, 100], dtype=float)
+
+    model = tf.keras.Sequential([
+        tf.keras.layers.Dense(units=1, input_shape=[1])
+    ])
+
+    model.compile(loss='mean_squared_error',
+                  optimizer=tf.keras.optimizers.Adam(0.1))
+
+    history = model.fit(celsius_q, fahrenheit_a, epochs=500, verbose=False)
+
+    print("model trained")
+
+    test = [100.0]
+    p = model.predict(test)
+
+    if len(p) != 1:
+        raise Exception("invalid result obj")
+
+    prediction = p[0]
+
+    if prediction >= 211 and prediction <= 213:
+        print("inference ok: %f" % prediction)
+    else:
+        raise Exception("bad prediction %f" % prediction)
+
+    print("SUCCESS")
diff --git a/test/e2e/gpu/gpu.go b/test/e2e/gpu/gpu.go
index f80b82733..a6f1193df 100644
--- a/test/e2e/gpu/gpu.go
+++ b/test/e2e/gpu/gpu.go
@@ -35,8 +35,10 @@ import (
 )
 
 const (
-	kustomizationYaml = "deployments/gpu_plugin/kustomization.yaml"
-	containerName     = "testcontainer"
+	kustomizationYaml   = "deployments/gpu_plugin/kustomization.yaml"
+	containerName       = "testcontainer"
+	tfKustomizationYaml = "deployments/gpu_tensorflow_test/kustomization.yaml"
+	tfPodName           = "training-pod"
 )
 
 func init() {
@@ -113,4 +115,50 @@ func describe() {
 
 		framework.Logf("found card and renderD from the log")
 	})
+
+	ginkgo.It("run some tensorflow code on GPU", func() {
+		ginkgo.By("deploying GPU plugin")
+		e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomizationPath))
+
+		ginkgo.By("waiting for GPU plugin's availability")
+		_, err := e2epod.WaitForPodsWithLabelRunningReady(f.ClientSet, f.Namespace.Name,
+			labels.Set{"app": "intel-gpu-plugin"}.AsSelector(), 1 /* one replica */, 100*time.Second)
+		if err != nil {
+			e2edebug.DumpAllNamespaceInfo(f.ClientSet, f.Namespace.Name)
+			e2ekubectl.LogFailedContainers(f.ClientSet, f.Namespace.Name, framework.Logf)
+			framework.Failf("unable to wait for all pods to be running and ready: %v", err)
+		}
+
+		ginkgo.By("checking if the resource is allocatable")
+		if err = utils.WaitForNodesWithResource(f.ClientSet, "gpu.intel.com/i915", 30*time.Second); err != nil {
+			framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
+		}
+
+		kustomYaml, err := utils.LocateRepoFile(tfKustomizationYaml)
+		if err != nil {
+			framework.Failf("unable to locate %q: %v", tfKustomizationYaml, err)
+		}
+
+		ginkgo.By("submitting demo deployment")
+
+		e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomYaml))
+
+		ginkgo.By("waiting the pod to finish")
+		e2epod.NewPodClient(f).WaitForFinish(tfPodName, 240*time.Second)
+
+		ginkgo.By("checking log output")
+		log, err := e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, tfPodName, containerName)
+
+		framework.Logf("logs: %s", log)
+
+		if err != nil {
+			framework.Failf("unable to get log from pod: %v", err)
+		}
+
+		if !strings.Contains(log, "SUCCESS") {
+			framework.Failf("tensorflow execution failed")
+		}
+
+		framework.Logf("tensorflow execution succeeded!")
+	})
 }