Support using our E2E workflow to build a Docker image for releases. #403

Merged
merged 32 commits into from
Feb 27, 2018
Commits
32 commits
cbbb09c
Support using our E2E workflow to build a Docker image for releases.
jlewi Feb 23, 2018
51b9a1b
Fix some bugs.
jlewi Feb 24, 2018
3a6f14d
Fix some bugs.
jlewi Feb 24, 2018
c0bc0f7
Latest.
jlewi Feb 24, 2018
6fe3105
latest.
jlewi Feb 24, 2018
96b9c46
Fix bugs.
jlewi Feb 24, 2018
f3cd8ab
Fix bugs.
jlewi Feb 24, 2018
dc57fb1
latest.
jlewi Feb 24, 2018
ab2dfd7
Bug fix.
jlewi Feb 24, 2018
54aaa09
Fix a bunch of pylint issues now that we are using python3.
jlewi Feb 24, 2018
4db3410
Fix bug.
jlewi Feb 24, 2018
0bfdf5d
Get credentials.
jlewi Feb 24, 2018
bcfd8dc
Fix account.
jlewi Feb 24, 2018
2abe85c
Try to set account correctly.
jlewi Feb 24, 2018
591f563
Fix.
jlewi Feb 24, 2018
eed4eb9
Bug fix.
jlewi Feb 24, 2018
ba7e1cf
Fix.
jlewi Feb 24, 2018
4e04914
Leave cluster up for debugging.
jlewi Feb 24, 2018
c9886bc
Fix image.
jlewi Feb 24, 2018
bb93c00
Fix
jlewi Feb 24, 2018
b3ab2c3
Merge remote-tracking branch 'upstream/master' into releases
jlewi Feb 24, 2018
6303ec9
Fix bug.
jlewi Feb 24, 2018
187040e
Update the releasing.
jlewi Feb 25, 2018
7c57923
Latest.
jlewi Feb 26, 2018
6f1abfb
Latest.
jlewi Feb 26, 2018
de8b4d6
Add yapf file.
jlewi Feb 27, 2018
5ab429d
Apply YAPF
jlewi Feb 27, 2018
77ec7ab
Revert some changes.
jlewi Feb 27, 2018
62fba48
Revert examples changes.
jlewi Feb 27, 2018
7f6c336
Revert a bunch of files.
jlewi Feb 27, 2018
e022a64
Merge remote-tracking branch 'upstream/master' into releases
jlewi Feb 27, 2018
93c2234
Address some of Lunkai's comments.
jlewi Feb 27, 2018
172 changes: 114 additions & 58 deletions py/deploy.py
@@ -5,21 +5,89 @@
"""

import argparse
import datetime
import logging
import os
import subprocess
import tempfile
import time
import uuid

from kubernetes import client as k8s_client

from kubernetes.client import rest
from googleapiclient import discovery
from google.cloud import storage # pylint: disable=no-name-in-module

from py import test_util
from py import util


def _setup_namespace(api_client, name):
"""Create the namespace for the test.
"""

api = k8s_client.CoreV1Api(api_client)
namespace = k8s_client.V1Namespace()
namespace.api_version = "v1"
namespace.kind = "Namespace"
namespace.metadata = k8s_client.V1ObjectMeta(
name=name, labels={
"app": "tf-job-test",
})

try:
logging.info("Creating namespace %s", namespace.metadata.name)
namespace = api.create_namespace(namespace)
logging.info("Namespace %s created.", namespace.metadata.name)
except rest.ApiException as e:
if e.status == 409:
logging.info("Namespace %s already exists.", namespace.metadata.name)
else:
raise


# TODO(jlewi): We should probably make this a reusable function since a
# lot of test code could use it.
def ks_deploy(app_dir, component, params, env=None, account=None):
"""Deploy the specified ksonnet component.

Args:
app_dir: The ksonnet directory
component: Name of the component to be deployed.
params: A dictionary of parameters to set; can be empty but should not be
None.
env: (Optional) The environment to use; if none is specified, a new one
is created.
account: (Optional) The account to use.

Raises:
ValueError: If input arguments aren't valid.
"""
if not component:
raise ValueError("component can't be None.")

# TODO(jlewi): It might be better if the test creates the app and uses
# the latest stable release of the ksonnet configs. That however will cause
# problems when we make changes to the TFJob operator that require changes
# to the ksonnet configs. One advantage of checking in the app is that
# we can modify the files in vendor if needed so that changes to the code
# and config can be submitted in the same pr.
now = datetime.datetime.now()
if not env:
env = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

logging.info("Using app directory: %s", app_dir)

util.run(["ks", "env", "add", env], cwd=app_dir)

for k, v in params.items():
util.run(
["ks", "param", "set", "--env=" + env, component, k, v], cwd=app_dir)

apply_command = ["ks", "apply", env, "-c", component]
if account:
apply_command.append("--as=" + account)
util.run(apply_command, cwd=app_dir)


def setup(args):
"""Setup a GKE cluster for TensorFlow jobs.

@@ -31,7 +99,6 @@ def setup(args):
project = args.project
cluster_name = args.cluster
zone = args.zone
chart = args.chart
machine_type = "n1-standard-8"

cluster_request = {
@@ -72,66 +139,52 @@ def setup(args):
# Create an API client object to talk to the K8s master.
api_client = k8s_client.ApiClient()

util.setup_cluster(api_client)

# A None gcs_client should be passed to test_util.create_junit_xml_file
# unless chart.startswith("gs://"), e.g. https://storage.googleapis.com/...
gcs_client = None

if chart.startswith("gs://"):
remote = chart
chart = os.path.join(tempfile.gettempdir(), os.path.basename(chart))
gcs_client = storage.Client(project=project)
bucket_name, path = util.split_gcs_uri(remote)

bucket = gcs_client.get_bucket(bucket_name)
blob = bucket.blob(path)
logging.info("Downloading %s to %s", remote, chart)
blob.download_to_filename(chart)

t = test_util.TestCase()
try:
start = time.time()

params = {
"tfJobImage": args.image,
"name": "kubeflow-core",
"namespace": args.namespace,
}

component = "core"

account = util.run_and_output(
["gcloud", "config", "get-value", "account", "--quiet"]).strip()
logging.info("Using GCP account %s", account)
util.run([
"helm", "install", chart, "-n", "tf-job", "--namespace=default", "--wait",
"--replace", "--set", "rbac.install=true,cloud=gke"
"kubectl", "create", "clusterrolebinding", "default-admin",
"--clusterrole=cluster-admin", "--user=" + account
])
util.wait_for_deployment(api_client, "default", "tf-job-operator")
except subprocess.CalledProcessError as e:
t.failure = "helm install failed;\n" + (e.output or "")
except util.TimeoutError as e:
t.failure = e.message
finally:
t.time = time.time() - start
t.name = "helm-tfjob-install"
t.class_name = "GKE"
test_util.create_junit_xml_file([t], args.junit_path, gcs_client)

_setup_namespace(api_client, args.namespace)
ks_deploy(args.test_app_dir, component, params, account=account)

def test(args):
"""Run the tests."""
gcs_client = storage.Client(project=args.project)
project = args.project
cluster_name = args.cluster
zone = args.zone
util.configure_kubectl(project, zone, cluster_name)
# Setup GPUs.
util.setup_cluster(api_client)

t = test_util.TestCase()
try:
start = time.time()
util.run(["helm", "test", "tf-job"])
# Verify that the TfJob operator is actually deployed.
tf_job_deployment_name = "tf-job-operator"
logging.info("Verifying TfJob controller started.")

# TODO(jlewi): We should verify that the image of the operator is correct.
util.wait_for_deployment(api_client, args.namespace, tf_job_deployment_name)

# Reraise the exception so that the step fails because there's no point
# continuing the test.
except subprocess.CalledProcessError as e:
t.failure = "helm test failed;\n" + (e.output or "")
# Reraise the exception so that the prow job will fail and the test
# is marked as a failure.
# TODO(jlewi): It would be better to do this holistically; e.g. by
# processing all the junit xml files and checking for any failures. This
# should be more tractable when we migrate off Airflow to Argo.
t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
raise
except util.TimeoutError as e:
t.failure = e.message
raise
finally:
t.time = time.time() - start
t.name = "e2e-test"
t.name = "kubeflow-deploy"
t.class_name = "GKE"
gcs_client = storage.Client(project=args.project)
test_util.create_junit_xml_file([t], args.junit_path, gcs_client)


@@ -193,15 +246,18 @@ def main(): # pylint: disable=too-many-locals
add_common_args(parser_setup)

parser_setup.add_argument(
"--chart", type=str, required=True, help="The path for the helm chart.")
"--test_app_dir",
help="The directory containing the ksonnet app used for testing.",)

#############################################################################
# test
#
parser_test = subparsers.add_parser("test", help="Run the tests.")
now = datetime.datetime.now()
parser_setup.add_argument(
"--namespace",
default="kubeflow-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4],
help="The directory containing the ksonnet app used for testing.",)

parser_test.set_defaults(func=test)
add_common_args(parser_test)
parser_setup.add_argument(
"--image",
help="The image to use",)

#############################################################################
# teardown
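For context, below is a minimal usage sketch (not part of this PR) of the new ks_deploy helper, assuming the checked-in test app directory and the "core" component used in setup(); the image, account, and path values are placeholders.

import logging

from py import deploy  # assumes deploy.py is importable as py.deploy

logging.basicConfig(level=logging.INFO)

# Placeholder values; the real E2E workflow derives these from its arguments.
params = {
    "tfJobImage": "gcr.io/example-project/tf_operator:latest",
    "name": "kubeflow-core",
    "namespace": "kubeflow-test",
}

# Creates a fresh ksonnet environment, sets the parameters on the "core"
# component, and runs `ks apply` as the given account.
deploy.ks_deploy(
    app_dir="/path/to/test-infra/test-app",
    component="core",
    params=params,
    account="someone@example.com")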
58 changes: 9 additions & 49 deletions py/util.py
@@ -281,7 +281,7 @@ def wait_for_deployment(api_client, namespace, name):
Raises:
TimeoutError: If timeout waiting for deployment to be ready.
"""
# Wait for tiller to be ready
# Wait for deployment to be ready
end_time = datetime.datetime.now() + datetime.timedelta(minutes=2)

ext_client = k8s_client.ExtensionsV1beta1Api(api_client)
@@ -296,9 +296,9 @@ def wait_for_deployment(api_client, namespace, name):

logging.error("Timeout waiting for deployment %s in namespace %s to be "
"ready", name, namespace)
raise TimeoutError(
"Timeout waiting for deployment {0} in namespace {1}".format(
name, namespace))
raise TimeoutError("Timeout waiting for deployment {0} in namespace {1}".
format(name, namespace))



def wait_for_statefulset(api_client, namespace, name):
@@ -330,9 +330,9 @@ def wait_for_statefulset(api_client, namespace, name):

logging.error("Timeout waiting for statefulset %s in namespace %s to be "
"ready", name, namespace)
raise TimeoutError(
"Timeout waiting for statefulset {0} in namespace {1}".format(
name, namespace))
raise TimeoutError("Timeout waiting for statefulset {0} in namespace {1}".
format(name, namespace))



def install_gpu_drivers(api_client):
@@ -390,43 +390,6 @@ def cluster_has_gpu_nodes(api_client):
return False


def create_tiller_service_accounts(api_client):
logging.info("Creating service account for tiller.")
api = k8s_client.CoreV1Api(api_client)
body = yaml.load("""apiVersion: v1
kind: ServiceAccount
metadata:
name: tiller
namespace: kube-system""")
try:
api.create_namespaced_service_account("kube-system", body)
except rest.ApiException as e:
if e.status == 409:
logging.info("Service account tiller already exists.")
else:
raise
body = yaml.load("""apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
name: tiller
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cluster-admin
subjects:
- kind: ServiceAccount
name: tiller
namespace: kube-system
""")
rbac_api = k8s_client.RbacAuthorizationV1beta1Api(api_client)
try:
rbac_api.create_cluster_role_binding(body)
except rest.ApiException as e:
if e.status == 409:
logging.info("Role binding for service account tiller already exists.")
else:
raise


def setup_cluster(api_client):
"""Setup a cluster.
@@ -437,8 +400,6 @@
Args:
use_gpus
"""
create_tiller_service_accounts(api_client)
run(["helm", "init", "--service-account=tiller"])
use_gpus = cluster_has_gpu_nodes(api_client)
if use_gpus:
logging.info("GPUs detected in cluster.")
@@ -447,13 +408,12 @@

if use_gpus:
install_gpu_drivers(api_client)
wait_for_deployment(api_client, "kube-system", "tiller-deploy")
if use_gpus:
wait_for_gpu_driver_install(api_client)


# TODO(jlewi): In python3 TimeoutError is built in. So once we are using
# python3 fully we can delete this.
# TODO(jlewi): TimeoutError is a built in exception in python3 so we can
# delete this when we go to Python3.
class TimeoutError(Exception): # pylint: disable=redefined-builtin
"""An error indicating an operation timed out."""

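As a rough illustration (not from this PR), the helpers above might be combined as follows once kubectl is pointed at the test cluster; the namespace and deployment name are assumptions based on the deploy.py changes.

from kubernetes import client as k8s_client

from py import util

# Assumes util.configure_kubectl (or equivalent) has already pointed the
# kubeconfig at the GKE test cluster.
api_client = k8s_client.ApiClient()

# Installs GPU drivers if GPU nodes are present and waits for them.
util.setup_cluster(api_client)

# Blocks until the operator deployment becomes ready, or raises
# util.TimeoutError after the helper's internal timeout (about 2 minutes).
util.wait_for_deployment(api_client, "kubeflow-test", "tf-job-operator")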
@@ -0,0 +1,39 @@
apiVersion: "0.1"
gitVersion:
commitSha: 422d521c05aa905df949868143b26445f5e4eda5
refSpec: master
kind: ksonnet.io/registry
libraries:
apache:
path: apache
version: master
efk:
path: efk
version: master
mariadb:
path: mariadb
version: master
memcached:
path: memcached
version: master
mongodb:
path: mongodb
version: master
mysql:
path: mysql
version: master
nginx:
path: nginx
version: master
node:
path: node
version: master
postgres:
path: postgres
version: master
redis:
path: redis
version: master
tomcat:
path: tomcat
version: master
@@ -0,0 +1,18 @@
apiVersion: "0.1"
gitVersion:
commitSha: 845f2a02e6ef4e25cae8555a37924d3510d07b36
refSpec: master
kind: ksonnet.io/registry
libraries:
argo:
path: argo
version: master
core:
path: core
version: master
tf-job:
path: tf-job
version: master
tf-serving:
path: tf-serving
version: master
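For reference, a hedged sketch (not part of this PR) of how a script could point the test app at registries like the ones above and install a package from one of them, using the same util.run wrapper; the app directory and package spec are assumptions.

from py import util

app_dir = "/path/to/test-infra/test-app"  # assumed ksonnet app directory

# Show which registries (e.g. incubator, kubeflow) the app already knows about.
util.run(["ks", "registry", "list"], cwd=app_dir)

# Install the core package pinned to master, matching the registry file above.
util.run(["ks", "pkg", "install", "kubeflow/core@master"], cwd=app_dir)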