Create a GPU model deployment to use for E2E testing of serving with GPUs.

* This is the first step toward creating an E2E test for GPU serving (kubeflow#291).
* This deployment is suitable for testing that we can deploy the GPU container
  and that it does not crash because of linking errors.

* This caught a bug in the Dockerfile.

* Fix the Dockerfile for the GPU image; we need to remove the CUDA symbolic links, in particular the link from /usr/local/nvidia to /usr/local/cuda.

* On GKE the device plugin makes the drivers available at /usr/local/nvidia, and we don't want that path to
  override /usr/local/cuda.
Related to kubeflow#291
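
For context, here is a minimal sketch (not part of this commit) of the container fragment that triggers this behaviour: it is the nvidia.com/gpu resource limit that makes GKE's device plugin schedule the pod onto a GPU node and mount the NVIDIA driver libraries at /usr/local/nvidia inside the container, which is why the image must not already ship that path as a symlink to /usr/local/cuda. The container name below is hypothetical; the image is the one referenced later in this commit.

// Sketch only: an illustrative container fragment, written in jsonnet like the
// component added in this commit.
{
  // hypothetical container name
  name: "model-server",
  image: "gcr.io/kubeflow-images-staging/tf-model-server-gpu:v20180305-pr362-7f250ae-5cc7",
  resources: {
    limits: {
      // GKE's NVIDIA device plugin acts on this request and exposes the driver
      // libraries under /usr/local/nvidia in the container.
      "nvidia.com/gpu": 1,
    },
  },
}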
jlewi committed Mar 6, 2018
1 parent a51c4af commit 2590180
Showing 11 changed files with 75,639 additions and 8 deletions.
4 changes: 1 addition & 3 deletions components/k8s-model-server/images/Dockerfile.gpu
@@ -89,9 +89,7 @@ RUN cd /root/serving && \
bazel build -c opt --copt=-mavx --copt=-mavx2 --copt=-mfma --copt=-mfpmath=both --copt=-msse4.2 --config=cuda -k --verbose_failures --crosstool_top=@local_config_cuda//crosstool:toolchain tensorflow_serving/model_servers:tensorflow_model_server

# Add some softlinks for tensorflow model server and CUDA
-RUN ln -s /root/serving/bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server /usr/bin/tensorflow_model_server && \
-    ln -s /usr/local/cuda /usr/local/nvidia && \
-    ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so.1
+RUN ln -s /root/serving/bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server /usr/bin/tensorflow_model_server

WORKDIR /root

8 changes: 7 additions & 1 deletion components/k8s-model-server/images/releaser/app.yaml
@@ -7,6 +7,12 @@ libraries:
      refSpec: master
    name: core
    registry: kubeflow
+  tf-serving:
+    gitVersion:
+      commitSha: d5580b6fec1df321d6a29a7b4d9fde55f98105a8
+      refSpec: master
+    name: tf-serving
+    registry: kubeflow
name: test-infra
registries:
  incubator:
@@ -17,7 +23,7 @@ registries:
    uri: github.com/ksonnet/parts/tree/master/incubator
  kubeflow:
    gitVersion:
-      commitSha: 5c35580d76092788b089cb447be3f3097cffe60b
+      commitSha: d5580b6fec1df321d6a29a7b4d9fde55f98105a8
      refSpec: master
    protocol: github
    uri: github.com/google/kubeflow/tree/master/kubeflow
@@ -0,0 +1,45 @@
local params = std.extVar("__ksonnet/params").components.gpu_model;
// TODO(https://github.com/ksonnet/ksonnet/issues/222): We have to add namespace as an explicit parameter
// because ksonnet doesn't support inheriting it from the environment yet.

local k = import "k.libsonnet";
local tfServing = import "kubeflow/tf-serving/tf-serving.libsonnet";

local name = params.name;
local namespace = params.namespace;
local modelPath = params.model_path;
local modelServerImage = params.model_server_image;
local httpProxyImage = params.http_proxy_image;
local serviceType = params.service_type;


// TODO(jlewi): This is awful. We need to find a better way to structure our configs to
// make it easy to override the resources.
local containers = tfServing.parts.deployment.modelServer(name, namespace, modelPath, modelServerImage, httpProxyImage).spec.template.spec.containers;

local tfServingContainer = containers[0] {
  resources+: {
    limits+: {
      "nvidia.com/gpu": 1,
    },
  },
};

local httpProxyContainer = containers[1];
local server = tfServing.parts.deployment.modelServer(name, namespace, modelPath, modelServerImage, httpProxyImage)
  + {
    spec+: {
      template+: {
        spec+: {
          containers: std.prune([tfServingContainer, httpProxyContainer]),
          // TODO(jlewi): For the CPU image we set the user and group to 1000 which are defined within the Docker container.
          // But we don't do the same for the GPU image.
          securityContext: null,
        },
      },
    },
  };
std.prune(k.core.v1.list.new([
  server,
  tfServing.parts.deployment.modelService(name, namespace, serviceType),
]))
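
The TODO above about overriding resources could arguably be addressed with the mapContainersWithName helper that this commit adds to the extended k8s library (shown further down). The following is a minimal sketch, not part of this commit, under two assumptions: that the extended library is what "k.libsonnet" resolves to here, and that the tf-serving container is named after params.name. It only shows the GPU limit; the securityContext override above would still be needed.

// Sketch only: the same GPU limit applied via mapContainersWithName, reusing
// the locals (k, tfServing, name, namespace, ...) defined earlier in this file.
local baseServer = tfServing.parts.deployment.modelServer(
  name, namespace, modelPath, modelServerImage, httpProxyImage);

// Use k.apps.v1beta1 instead if the deployment is in the apps/v1beta1 group.
local serverWithGpu = baseServer + k.extensions.v1beta1.deployment.mapContainersWithName(
  name,  // assumption: the model-server container is named after params.name
  function(c) c { resources+: { limits+: { "nvidia.com/gpu": 1 } } }
);

std.prune(k.core.v1.list.new([
  serverWithGpu,
  tfServing.parts.deployment.modelService(name, namespace, serviceType),
]))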
@@ -19,5 +19,14 @@
      cluster: "kubeflow-testing",
      zone: "us-east1-d",
    },
+    // Test deploying a GPU model.
+    gpu_model: {
+      http_proxy_image: "gcr.io/kubeflow/http-proxy:1.0",
+      model_path: "gs://some-bucket/some/model",
+      model_server_image: "gcr.io/kubeflow-images-staging/tf-model-server-gpu:v20180305-pr362-7f250ae-5cc7",
+      name: "gpu_model",
+      namespace: "default",
+      service_type: "ClusterIP",
+    },
  },
}
@@ -0,0 +1,80 @@
local k8s = import "k8s.libsonnet";

local apps = k8s.apps;
local core = k8s.core;
local extensions = k8s.extensions;

local hidden = {
  mapContainers(f):: {
    local podContainers = super.spec.template.spec.containers,
    spec+: {
      template+: {
        spec+: {
          // IMPORTANT: This overwrites the 'containers' field
          // for this deployment.
          containers: std.map(f, podContainers),
        },
      },
    },
  },

  mapContainersWithName(names, f)::
    local nameSet =
      if std.type(names) == "array"
      then std.set(names)
      else std.set([names]);
    local inNameSet(name) = std.length(std.setInter(nameSet, std.set([name]))) > 0;
    self.mapContainers(
      function(c)
        if std.objectHas(c, "name") && inNameSet(c.name)
        then f(c)
        else c
    ),
};

k8s + {
  apps:: apps + {
    v1beta1:: apps.v1beta1 + {
      local v1beta1 = apps.v1beta1,

      daemonSet:: v1beta1.daemonSet + {
        mapContainers(f):: hidden.mapContainers(f),
        mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
      },

      deployment:: v1beta1.deployment + {
        mapContainers(f):: hidden.mapContainers(f),
        mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
      },
    },
  },

  core:: core + {
    v1:: core.v1 + {
      list:: {
        new(items)::
          {apiVersion: "v1"} +
          {kind: "List"} +
          self.items(items),

        items(items):: if std.type(items) == "array" then {items+: items} else {items+: [items]},
      },
    },
  },

  extensions:: extensions + {
    v1beta1:: extensions.v1beta1 + {
      local v1beta1 = extensions.v1beta1,

      daemonSet:: v1beta1.daemonSet + {
        mapContainers(f):: hidden.mapContainers(f),
        mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
      },

      deployment:: v1beta1.deployment + {
        mapContainers(f):: hidden.mapContainers(f),
        mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
      },
    },
  },
}
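
Since the file above only defines helpers, a short usage sketch may help. Everything in it is illustrative rather than taken from this commit, and it assumes the file is importable as k.libsonnet.

// Usage sketch: apply a function to every container of a deployment-shaped
// object and wrap the result in a v1 List.
local k = import "k.libsonnet";

local deployment = {
  apiVersion: "extensions/v1beta1",
  kind: "Deployment",
  metadata: { name: "example" },
  spec: { template: { spec: { containers: [
    { name: "main", image: "busybox", env: [] },
  ] } } },
};

// Add an illustrative environment variable to each container.
local withEnv(c) = c { env+: [{ name: "EXAMPLE_FLAG", value: "1" }] };

k.core.v1.list.new([
  deployment + k.extensions.v1beta1.deployment.mapContainers(withEnv),
])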