Skip to content

Commit

Permalink
e2e: gpu: add a basic tensorflow test
Browse files Browse the repository at this point in the history
Signed-off-by: Tuomas Katila <[email protected]>
  • Loading branch information
tkatila committed May 11, 2023
1 parent adbb6fe commit 7616ff0
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 2 deletions.
24 changes: 24 additions & 0 deletions deployments/gpu_tensorflow_test/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Pod that runs the TensorFlow celsius-to-fahrenheit smoke test on an
# Intel GPU. The training script is mounted from the "training-code"
# ConfigMap (generated by the sibling kustomization.yaml).
apiVersion: v1
kind: Pod
metadata:
  name: training-pod
spec:
  # One-shot job: never restart, the e2e test only waits for completion.
  restartPolicy: Never
  containers:
    - name: testcontainer
      image: intel/intel-extension-for-tensorflow:latest
      imagePullPolicy: IfNotPresent
      command: ["/bin/sh", "-c"]
      args: ["python /code/training.py"]
      resources:
        # Request exactly one Intel GPU device from the device plugin.
        limits:
          gpu.intel.com/i915: 1
        requests:
          gpu.intel.com/i915: 1
      volumeMounts:
        - mountPath: /code
          name: code
  volumes:
    - configMap:
        name: training-code
      name: code
11 changes: 11 additions & 0 deletions deployments/gpu_tensorflow_test/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Kustomization for the GPU TensorFlow e2e smoke test:
# packages training.py into a ConfigMap and pins the container image tag.
configMapGenerator:
  - name: training-code
    files:
      - training.py

resources:
  - deployment.yaml

images:
  # Pin to a known-good Flex-series GPU tag instead of :latest.
  - name: intel/intel-extension-for-tensorflow
    newTag: 1.1.0-gpu-flex
46 changes: 46 additions & 0 deletions deployments/gpu_tensorflow_test/training.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Original code from:
# https://github.com/tensorflow/examples/blob/master/courses/udacity_intro_to_tensorflow_for_deep_learning/l02c01_celsius_to_fahrenheit.ipynb
# Slightly modified to run explicitly on XPU (Intel GPU) devices.
#
# Trains a single-unit dense network to learn the celsius->fahrenheit
# conversion, then sanity-checks one prediction. Prints "SUCCESS" on the
# last line so the e2e test can grep the pod log for it.

import tensorflow as tf
import intel_extension_for_tensorflow as itex
import numpy as np

print("BACKENDS: ", str(itex.get_backend()))

# Fail fast if TensorFlow cannot see any Intel XPU device.
devs = tf.config.list_physical_devices('XPU')

print(devs)

if not devs:
    raise Exception("No devices found")

# Pin all graph building and training explicitly to the first XPU.
with tf.device("/xpu:0"):
    celsius_q = np.array([-40, -10, 0, 8, 15, 22, 38], dtype=float)
    fahrenheit_a = np.array([-40, 14, 32, 46, 59, 72, 100], dtype=float)

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(units=1, input_shape=[1])
    ])

    model.compile(loss='mean_squared_error',
                  optimizer=tf.keras.optimizers.Adam(0.1))

    history = model.fit(celsius_q, fahrenheit_a, epochs=500, verbose=False)

    print("model trained")

    test = [100.0]
    p = model.predict(test)

    if len(p) != 1:
        raise Exception("invalid result obj")

    prediction = p[0]

    # 100 C is exactly 212 F; accept +-1 degree of training noise.
    if prediction >= 211 and prediction <= 213:
        print("inference ok: %f" % prediction)
    else:
        raise Exception("bad prediction %f" % prediction)

print("SUCCESS")
52 changes: 50 additions & 2 deletions test/e2e/gpu/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,10 @@ import (
)

const (
	// kustomizationYaml locates the GPU plugin deployment kustomization.
	kustomizationYaml = "deployments/gpu_plugin/kustomization.yaml"
	// containerName is the container whose logs the tests inspect.
	containerName = "testcontainer"
	// tfKustomizationYaml locates the TensorFlow smoke-test deployment.
	tfKustomizationYaml = "deployments/gpu_tensorflow_test/kustomization.yaml"
	// tfPodName is the pod created by the TensorFlow test deployment.
	tfPodName = "training-pod"
)

func init() {
Expand Down Expand Up @@ -113,4 +115,50 @@ func describe() {

framework.Logf("found card and renderD from the log")
})

ginkgo.It("run some tensorflow code on GPU", func() {
ginkgo.By("deploying GPU plugin")
e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomizationPath))

ginkgo.By("waiting for GPU plugin's availability")
_, err := e2epod.WaitForPodsWithLabelRunningReady(f.ClientSet, f.Namespace.Name,
labels.Set{"app": "intel-gpu-plugin"}.AsSelector(), 1 /* one replica */, 100*time.Second)
if err != nil {
e2edebug.DumpAllNamespaceInfo(f.ClientSet, f.Namespace.Name)
e2ekubectl.LogFailedContainers(f.ClientSet, f.Namespace.Name, framework.Logf)
framework.Failf("unable to wait for all pods to be running and ready: %v", err)
}

ginkgo.By("checking if the resource is allocatable")
if err = utils.WaitForNodesWithResource(f.ClientSet, "gpu.intel.com/i915", 30*time.Second); err != nil {
framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
}

kustomYaml, err := utils.LocateRepoFile(tfKustomizationYaml)
if err != nil {
framework.Failf("unable to locate %q: %v", tfKustomizationYaml, err)
}

ginkgo.By("submitting demo deployment")

e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomYaml))

ginkgo.By("waiting the pod to finish")
e2epod.NewPodClient(f).WaitForFinish(tfPodName, 240*time.Second)

ginkgo.By("checking log output")
log, err := e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, tfPodName, containerName)

framework.Logf("logs: %s", log)

if err != nil {
framework.Failf("unable to get log from pod: %v", err)
}

if !strings.Contains(log, "SUCCESS") {
framework.Failf("tensorflow execution failed")
}

framework.Logf("tensorflow execution succeeded!")
})
}

0 comments on commit 7616ff0

Please sign in to comment.