Create a GPU model deployment to use for E2E testing of serving with GPUs.

* This is the first step toward creating an E2E test for GPU serving (kubeflow#291).
* This deployment is suitable for testing that we can deploy the GPU container
  and that it does not crash because of linking errors.

* This caught a bug in the Dockerfile.

* Fix the Dockerfile for the GPU image; we need to remove the CUDA symbolic links, in particular the link from /usr/local/nvidia to /usr/local/cuda.

* On GKE the device plugin makes the drivers available at /usr/local/nvidia, and we don't want that path to
  override /usr/local/cuda.
Related to kubeflow#291
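
For context, here is a minimal sketch (not part of this commit) of the container fragment that triggers this behaviour: it is the nvidia.com/gpu resource limit that makes GKE's device plugin schedule the pod onto a GPU node and mount the NVIDIA driver libraries at /usr/local/nvidia inside the container, which is why the image must not already ship that path as a symlink to /usr/local/cuda. The container name below is hypothetical; the image is the one referenced later in this commit.

// Sketch only: an illustrative container fragment, written in jsonnet like the
// component added in this commit.
{
  // hypothetical container name
  name: "model-server",
  image: "gcr.io/kubeflow-images-staging/tf-model-server-gpu:v20180305-pr362-7f250ae-5cc7",
  resources: {
    limits: {
      // GKE's NVIDIA device plugin acts on this request and exposes the driver
      // libraries under /usr/local/nvidia in the container.
      "nvidia.com/gpu": 1,
    },
  },
}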
jlewi committed Mar 6, 2018
1 parent a51c4af commit 2590180
Showing 11 changed files with 75,639 additions and 8 deletions.
4 changes: 1 addition & 3 deletions components/k8s-model-server/images/Dockerfile.gpu
@@ -89,9 +89,7 @@ RUN cd /root/serving && \
bazel build -c opt --copt=-mavx --copt=-mavx2 --copt=-mfma --copt=-mfpmath=both --copt=-msse4.2 --config=cuda -k --verbose_failures --crosstool_top=@local_config_cuda//crosstool:toolchain tensorflow_serving/model_servers:tensorflow_model_server

# Add some softlinks for tensorflow model server and CUDA
-RUN ln -s /root/serving/bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server /usr/bin/tensorflow_model_server && \
-    ln -s /usr/local/cuda /usr/local/nvidia && \
-    ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so.1
+RUN ln -s /root/serving/bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server /usr/bin/tensorflow_model_server

WORKDIR /root

8 changes: 7 additions & 1 deletion components/k8s-model-server/images/releaser/app.yaml
@@ -7,6 +7,12 @@ libraries:
      refSpec: master
    name: core
    registry: kubeflow
+  tf-serving:
+    gitVersion:
+      commitSha: d5580b6fec1df321d6a29a7b4d9fde55f98105a8
+      refSpec: master
+    name: tf-serving
+    registry: kubeflow
name: test-infra
registries:
  incubator:
@@ -17,7 +23,7 @@ registries:
    uri: github.com/ksonnet/parts/tree/master/incubator
  kubeflow:
    gitVersion:
-      commitSha: 5c35580d76092788b089cb447be3f3097cffe60b
+      commitSha: d5580b6fec1df321d6a29a7b4d9fde55f98105a8
      refSpec: master
    protocol: github
    uri: github.com/google/kubeflow/tree/master/kubeflow
@@ -0,0 +1,45 @@
local params = std.extVar("__ksonnet/params").components.gpu_model;
// TODO(https://github.com/ksonnet/ksonnet/issues/222): We have to add namespace as an explicit parameter
// because ksonnet doesn't support inheriting it from the environment yet.

local k = import "k.libsonnet";
local tfServing = import "kubeflow/tf-serving/tf-serving.libsonnet";

local name = params.name;
local namespace = params.namespace;
local modelPath = params.model_path;
local modelServerImage = params.model_server_image;
local httpProxyImage = params.http_proxy_image;
local serviceType = params.service_type;


// TODO(jlewi): This is awful. We need to find a better way to structure our configs to
// make it easy to override the resources.
local containers = tfServing.parts.deployment.modelServer(name, namespace, modelPath, modelServerImage, httpProxyImage).spec.template.spec.containers;

local tfServingContainer = containers[0] {
  resources+: {
    limits+: {
      "nvidia.com/gpu": 1,
    },
  },
};

local httpProxyContainer = containers[1];
local server = tfServing.parts.deployment.modelServer(name, namespace, modelPath, modelServerImage, httpProxyImage)
  + {
    spec+: {
      template+: {
        spec+: {
          containers: std.prune([tfServingContainer, httpProxyContainer]),
          // TODO(jlewi): For the CPU image we set the user and group to 1000 which are defined within the Docker container.
          // But we don't do the same for the GPU image.
          securityContext: null,
        },
      },
    },
  };
std.prune(k.core.v1.list.new([
  server,
  tfServing.parts.deployment.modelService(name, namespace, serviceType),
]))
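
The TODO above about overriding resources could arguably be addressed with the mapContainersWithName helper that this commit adds to the extended k8s library (shown further down). The following is a minimal sketch, not part of this commit, under two assumptions: that the extended library is what "k.libsonnet" resolves to here, and that the tf-serving container is named after params.name. It only shows the GPU limit; the securityContext override above would still be needed.

// Sketch only: the same GPU limit applied via mapContainersWithName, reusing
// the locals (k, tfServing, name, namespace, ...) defined earlier in this file.
local baseServer = tfServing.parts.deployment.modelServer(
  name, namespace, modelPath, modelServerImage, httpProxyImage);

// Use k.apps.v1beta1 instead if the deployment is in the apps/v1beta1 group.
local serverWithGpu = baseServer + k.extensions.v1beta1.deployment.mapContainersWithName(
  name,  // assumption: the model-server container is named after params.name
  function(c) c { resources+: { limits+: { "nvidia.com/gpu": 1 } } }
);

std.prune(k.core.v1.list.new([
  serverWithGpu,
  tfServing.parts.deployment.modelService(name, namespace, serviceType),
]))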
@@ -19,5 +19,14 @@
      cluster: "kubeflow-testing",
      zone: "us-east1-d",
    },
+    // Test deploying a GPU model.
+    gpu_model: {
+      http_proxy_image: "gcr.io/kubeflow/http-proxy:1.0",
+      model_path: "gs://some-bucket/some/model",
+      model_server_image: "gcr.io/kubeflow-images-staging/tf-model-server-gpu:v20180305-pr362-7f250ae-5cc7",
+      name: "gpu_model",
+      namespace: "default",
+      service_type: "ClusterIP",
+    },
  },
}
@@ -0,0 +1,80 @@
local k8s = import "k8s.libsonnet";

local apps = k8s.apps;
local core = k8s.core;
local extensions = k8s.extensions;

local hidden = {
  mapContainers(f):: {
    local podContainers = super.spec.template.spec.containers,
    spec+: {
      template+: {
        spec+: {
          // IMPORTANT: This overwrites the 'containers' field
          // for this deployment.
          containers: std.map(f, podContainers),
        },
      },
    },
  },

  mapContainersWithName(names, f)::
    local nameSet =
      if std.type(names) == "array"
      then std.set(names)
      else std.set([names]);
    local inNameSet(name) = std.length(std.setInter(nameSet, std.set([name]))) > 0;
    self.mapContainers(
      function(c)
        if std.objectHas(c, "name") && inNameSet(c.name)
        then f(c)
        else c
    ),
};

k8s + {
  apps:: apps + {
    v1beta1:: apps.v1beta1 + {
      local v1beta1 = apps.v1beta1,

      daemonSet:: v1beta1.daemonSet + {
        mapContainers(f):: hidden.mapContainers(f),
        mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
      },

      deployment:: v1beta1.deployment + {
        mapContainers(f):: hidden.mapContainers(f),
        mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
      },
    },
  },

  core:: core + {
    v1:: core.v1 + {
      list:: {
        new(items)::
          {apiVersion: "v1"} +
          {kind: "List"} +
          self.items(items),

        items(items):: if std.type(items) == "array" then {items+: items} else {items+: [items]},
      },
    },
  },

  extensions:: extensions + {
    v1beta1:: extensions.v1beta1 + {
      local v1beta1 = extensions.v1beta1,

      daemonSet:: v1beta1.daemonSet + {
        mapContainers(f):: hidden.mapContainers(f),
        mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
      },

      deployment:: v1beta1.deployment + {
        mapContainers(f):: hidden.mapContainers(f),
        mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
      },
    },
  },
}
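
Since the file above only defines helpers, a short usage sketch may help. Everything in it is illustrative rather than taken from this commit, and it assumes the file is importable as k.libsonnet.

// Usage sketch: apply a function to every container of a deployment-shaped
// object and wrap the result in a v1 List.
local k = import "k.libsonnet";

local deployment = {
  apiVersion: "extensions/v1beta1",
  kind: "Deployment",
  metadata: { name: "example" },
  spec: { template: { spec: { containers: [
    { name: "main", image: "busybox", env: [] },
  ] } } },
};

// Add an illustrative environment variable to each container.
local withEnv(c) = c { env+: [{ name: "EXAMPLE_FLAG", value: "1" }] };

k.core.v1.list.new([
  deployment + k.extensions.v1beta1.deployment.mapContainers(withEnv),
])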