Subject: [PATCH] Add an e2e test to verify clean pod policy in TF job operator
to_selector(pod_labels) - # We don't wait for pods to be deleted in v1alpha2 because CleanPodPolicy - # means completed pods won't be deleted. - # TODO(jlewi): We should add a test to deal with deleted pods. + # In v1alpha1 all pods are deleted. In v1alpha2, this depends on the pod + # cleanup policy. if args.tfjob_version == "v1alpha1": wait_for_pods_to_be_deleted(api_client, namespace, pod_selector) + else: + # All pods are deleted. + if args.verify_clean_pod_policy == "All": + wait_for_pods_to_be_deleted(api_client, namespace, pod_selector) + # Only running pods (PS) are deleted, completed pods are not. + elif args.verify_clean_pod_policy == "Running": + wait_for_replica_type_in_phases(api_client, namespace, name, "Chief", ["Completed"]) + wait_for_replica_type_in_phases(api_client, namespace, name, "Worker", ["Completed"]) + ps_pod_labels = get_labels_v1alpha2(name, "PS") + ps_pod_selector = to_selector(ps_pod_labels) + wait_for_pods_to_be_deleted(api_client, namespace, ps_pod_selector) + # No pods are deleted. 
+ elif args.verify_clean_pod_policy == "None": + wait_for_replica_type_in_phases(api_client, namespace, name, "Chief", ["Completed"]) + wait_for_replica_type_in_phases(api_client, namespace, name, "Worker", ["Completed"]) + wait_for_replica_type_in_phases(api_client, namespace, name, "PS", ["Running"]) tf_job_client.delete_tf_job(api_client, namespace, name, version=args.tfjob_version) @@ -650,6 +673,13 @@ def add_common_args(parser): help="(Optional) the name for the ksonnet environment; if not specified " "a random one is created.") + parser.add_argument( + "--verify_clean_pod_policy", + default=None, + type=str, + help="(Optional) the clean pod policy (None, Running, or All).") + + def build_parser(): # create the top-level parser parser = argparse.ArgumentParser(description="Run a TFJob test.") diff --git a/test/workflows/components/clean_pod_all.jsonnet b/test/workflows/components/clean_pod_all.jsonnet new file mode 100644 index 0000000000..fc672a7cd9 --- /dev/null +++ b/test/workflows/components/clean_pod_all.jsonnet @@ -0,0 +1,78 @@ +// Tests that when cleanPodPolicy is set to "All", all of the pods are deleted +// when the TFJob completes. 
+local params = std.extVar("__ksonnet/params").components.clean_pod_all; + +local k = import "k.libsonnet"; + +local parts(namespace, name, image) = { + job:: { + apiVersion: "kubeflow.org/v1alpha2", + kind: "TFJob", + metadata: { + name: name, + namespace: namespace, + }, + spec: { + cleanPodPolicy: "All", + tfReplicaSpecs: { + Chief: { + replicas: 1, + restartPolicy: "Never", + template: { + spec: { + containers: [ + { + name: "tensorflow", + image: "ubuntu", + command: [ + "echo", + "Hello", + ], + }, + ], + }, + }, + }, + PS: { + replicas: 2, + restartPolicy: "Never", + template: { + spec: { + containers: [ + { + name: "tensorflow", + image: "ubuntu", + command: [ + "tail", + "-f", + "/dev/null", + ], + }, + ], + }, + }, + }, + Worker: { + replicas: 4, + restartPolicy: "Never", + template: { + spec: { + containers: [ + { + name: "tensorflow", + image: "ubuntu", + command: [ + "echo", + "Hello", + ], + }, + ], + }, + }, + }, + }, + }, + }, +}; + +std.prune(k.core.v1.list.new([parts(params.namespace, params.name, params.image).job])) diff --git a/test/workflows/components/clean_pod_none.jsonnet b/test/workflows/components/clean_pod_none.jsonnet new file mode 100644 index 0000000000..eb516989e7 --- /dev/null +++ b/test/workflows/components/clean_pod_none.jsonnet @@ -0,0 +1,79 @@ +// Tests that when cleanPodPolicy is set to "None", none of the pods are deleted +// when the TFJob completes. 
+ +local params = std.extVar("__ksonnet/params").components.clean_pod_none; + +local k = import "k.libsonnet"; + +local parts(namespace, name, image) = { + job:: { + apiVersion: "kubeflow.org/v1alpha2", + kind: "TFJob", + metadata: { + name: name, + namespace: namespace, + }, + spec: { + cleanPodPolicy: "None", + tfReplicaSpecs: { + Chief: { + replicas: 1, + restartPolicy: "Never", + template: { + spec: { + containers: [ + { + name: "tensorflow", + image: "ubuntu", + command: [ + "echo", + "Hello", + ], + }, + ], + }, + }, + }, + PS: { + replicas: 2, + restartPolicy: "Never", + template: { + spec: { + containers: [ + { + name: "tensorflow", + image: "ubuntu", + command: [ + "tail", + "-f", + "/dev/null", + ], + }, + ], + }, + }, + }, + Worker: { + replicas: 4, + restartPolicy: "Never", + template: { + spec: { + containers: [ + { + name: "tensorflow", + image: "ubuntu", + command: [ + "echo", + "Hello", + ], + }, + ], + }, + }, + }, + }, + }, + }, +}; + +std.prune(k.core.v1.list.new([parts(params.namespace, params.name, params.image).job])) diff --git a/test/workflows/components/clean_pod_running.jsonnet b/test/workflows/components/clean_pod_running.jsonnet new file mode 100644 index 0000000000..5003a58f0c --- /dev/null +++ b/test/workflows/components/clean_pod_running.jsonnet @@ -0,0 +1,78 @@ +// Tests that when cleanPodPolicy is set to "Running", only the Running pods are deleted +// when the TFJob completes. The completed pods will not be deleted. 
+local params = std.extVar("__ksonnet/params").components.clean_pod_running; + +local k = import "k.libsonnet"; + +local parts(namespace, name, image) = { + job:: { + apiVersion: "kubeflow.org/v1alpha2", + kind: "TFJob", + metadata: { + name: name, + namespace: namespace, + }, + spec: { + cleanPodPolicy: "Running", + tfReplicaSpecs: { + Chief: { + replicas: 1, + restartPolicy: "Never", + template: { + spec: { + containers: [ + { + name: "tensorflow", + image: "ubuntu", + command: [ + "echo", + "Hello", + ], + }, + ], + }, + }, + }, + PS: { + replicas: 2, + restartPolicy: "Never", + template: { + spec: { + containers: [ + { + name: "tensorflow", + image: "ubuntu", + command: [ + "tail", + "-f", + "/dev/null", + ], + }, + ], + }, + }, + }, + Worker: { + replicas: 4, + restartPolicy: "Never", + template: { + spec: { + containers: [ + { + name: "tensorflow", + image: "ubuntu", + command: [ + "echo", + "Hello", + ], + }, + ], + }, + }, + }, + }, + }, + }, +}; + +std.prune(k.core.v1.list.new([parts(params.namespace, params.name, params.image).job])) diff --git a/test/workflows/components/params.libsonnet b/test/workflows/components/params.libsonnet index 543582314d..547f72e5c8 100644 --- a/test/workflows/components/params.libsonnet +++ b/test/workflows/components/params.libsonnet @@ -46,5 +46,20 @@ namespace: "kubeflow-test-infra", image: "", }, + clean_pod_all: { + name: "clean_pod_all", + namespace: "kubeflow-test-infra", + image: "", + }, + clean_pod_running: { + name: "clean_pod_running", + namespace: "kubeflow-test-infra", + image: "", + }, + clean_pod_none: { + name: "clean_pod_none", + namespace: "kubeflow-test-infra", + image: "", + }, }, } diff --git a/test/workflows/components/workflows.libsonnet b/test/workflows/components/workflows.libsonnet index 6882b98371..34742af3a5 100644 --- a/test/workflows/components/workflows.libsonnet +++ b/test/workflows/components/workflows.libsonnet @@ -251,6 +251,31 @@ template: "run-gpu-tests", dependencies: ["setup-kubeflow"], 
}, + if params.tfJobVersion == "v1alpha2" then + { + name: "run-clean-pod-all", + template: "run-clean-pod-all", + dependencies: ["setup-kubeflow"], + } + else + {}, + if params.tfJobVersion == "v1alpha2" then + { + name: "run-clean-pod-running", + template: "run-clean-pod-running", + dependencies: ["setup-kubeflow"], + } + else + {}, + if params.tfJobVersion == "v1alpha2" then + { + name: "run-clean-pod-none", + template: "run-clean-pod-none", + dependencies: ["setup-kubeflow"], + } + else + {}, + ], //tasks }, }, @@ -407,6 +432,51 @@ "--tfjob_version=" + params.tfJobVersion, "--junit_path=" + artifactsDir + "/junit_gpu-tests.xml", ]), // run gpu_tests + $.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("run-clean-pod-all", [ + "python", + "-m", + "py.test_runner", + "test", + "--cluster=" + cluster, + "--zone=" + zone, + "--project=" + project, + "--app_dir=" + srcDir + "/test/workflows", + "--component=clean_pod_all", + "--params=name=clean-pod-all,namespace=default", + "--tfjob_version=" + params.tfJobVersion, + "--verify_clean_pod_policy=All", + "--junit_path=" + artifactsDir + "/junit_clean-pod-all-tests.xml", + ]), // run clean_pod_all + $.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("run-clean-pod-running", [ + "python", + "-m", + "py.test_runner", + "test", + "--cluster=" + cluster, + "--zone=" + zone, + "--project=" + project, + "--app_dir=" + srcDir + "/test/workflows", + "--component=clean_pod_running", + "--params=name=clean-pod-running,namespace=default", + "--tfjob_version=" + params.tfJobVersion, + "--verify_clean_pod_policy=Running", + "--junit_path=" + artifactsDir + "/junit_clean-pod-running-tests.xml", + ]), // run clean_pod_running + $.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("run-clean-pod-none", [ + "python", + "-m", + "py.test_runner", + "test", + "--cluster=" + cluster, + "--zone=" + zone, + "--project=" + project, + "--app_dir=" + srcDir + "/test/workflows", + "--component=clean_pod_none", + 
"--params=name=clean-pod-none,namespace=default", + "--tfjob_version=" + params.tfJobVersion, + "--verify_clean_pod_policy=None", + "--junit_path=" + artifactsDir + "/junit_clean-pod-none-tests.xml", + ]), // run clean_pod_none $.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("create-pr-symlink", [ "python", "-m",