Skip to content

Commit

Permalink
Add an e2e test to verify the clean pod policy in the TFJob operator (#795)
Browse files Browse the repository at this point in the history
* Add jsonnet

Fix syntax

Params.libsonnet

Add jsonnet

Fix import

Fix syntax

Debug: no teardown

Fix component name

Add cleanup policy check

Revert some changes

Fix a few things

Fix format

Fix format

* Modify images in test jsonnet

* Add tests for cleanPodPolicy = Running and None

* Fix tests; refactor code

* Fix tests

* Add comments to jsonnet files
  • Loading branch information
richardsliu authored and k8s-ci-robot committed Aug 30, 2018
1 parent e63ea46 commit 037e915
Show file tree
Hide file tree
Showing 6 changed files with 353 additions and 3 deletions.
36 changes: 33 additions & 3 deletions py/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,14 @@ def list_pods(client, namespace, label_selector):
message)
raise e

def wait_for_replica_type_in_phases(api_client, namespace, tfjob_name, replica_type, phases,
                                    timeout=None):
  """Wait for the pods of one replica type of a TFJob to reach the given phases.

  Builds the v1alpha2 pod labels for (tfjob_name, replica_type), converts them
  to a label selector and delegates the actual waiting to
  wait_for_pods_to_be_in_phases.

  Args:
    api_client: Client forwarded unchanged to wait_for_pods_to_be_in_phases.
    namespace: Namespace forwarded unchanged to wait_for_pods_to_be_in_phases.
    tfjob_name: Name of the TFJob whose pods should be selected.
    replica_type: Replica type used to build the pod labels, e.g. "Chief",
      "Worker" or "PS".
    phases: Phases to wait for, e.g. ["Running"].
    timeout: (Optional) datetime.timedelta bounding the wait. Defaults to
      4 minutes, which was previously hard-coded.

  Any exception raised by the helpers propagates to the caller.
  """
  if timeout is None:
    # Keep the historical default so existing callers behave exactly as before.
    timeout = datetime.timedelta(minutes=4)

  pod_labels = get_labels_v1alpha2(tfjob_name, replica_type)
  pod_selector = to_selector(pod_labels)
  wait_for_pods_to_be_in_phases(api_client, namespace,
                                pod_selector,
                                phases,
                                timeout=timeout)

def get_events(client, namespace, uid):
"""Get the events for the provided object."""
Expand Down Expand Up @@ -549,11 +557,26 @@ def run_test(args): # pylint: disable=too-many-branches,too-many-statements
pod_labels = get_labels_v1alpha2(name)
pod_selector = to_selector(pod_labels)

# We don't wait for pods to be deleted in v1alpha2 because CleanPodPolicy
# means completed pods won't be deleted.
# TODO(jlewi): We should add a test to deal with deleted pods.
# In v1alpha1 all pods are deleted. In v1alpha2, this depends on the pod
# cleanup policy.
if args.tfjob_version == "v1alpha1":
wait_for_pods_to_be_deleted(api_client, namespace, pod_selector)
else:
# All pods are deleted.
if args.verify_clean_pod_policy == "All":
wait_for_pods_to_be_deleted(api_client, namespace, pod_selector)
# Only running pods (PS) are deleted, completed pods are not.
elif args.verify_clean_pod_policy == "Running":
wait_for_replica_type_in_phases(api_client, namespace, name, "Chief", ["Completed"])
wait_for_replica_type_in_phases(api_client, namespace, name, "Worker", ["Completed"])
ps_pod_labels = get_labels_v1alpha2(name, "PS")
ps_pod_selector = to_selector(ps_pod_labels)
wait_for_pods_to_be_deleted(api_client, namespace, ps_pod_selector)
# No pods are deleted.
elif args.verify_clean_pod_policy == "None":
wait_for_replica_type_in_phases(api_client, namespace, name, "Chief", ["Completed"])
wait_for_replica_type_in_phases(api_client, namespace, name, "Worker", ["Completed"])
wait_for_replica_type_in_phases(api_client, namespace, name, "PS", ["Running"])

tf_job_client.delete_tf_job(api_client, namespace, name, version=args.tfjob_version)

Expand Down Expand Up @@ -650,6 +673,13 @@ def add_common_args(parser):
help="(Optional) the name for the ksonnet environment; if not specified "
"a random one is created.")

parser.add_argument(
"--verify_clean_pod_policy",
default=None,
type=str,
help="(Optional) the clean pod policy (None, Running, or All).")


def build_parser():
# create the top-level parser
parser = argparse.ArgumentParser(description="Run a TFJob test.")
Expand Down
78 changes: 78 additions & 0 deletions test/workflows/components/clean_pod_all.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Tests that when cleanPodPolicy is set to "All", all of the pods are deleted
// when the TFJob completes.
local params = std.extVar("__ksonnet/params").components.clean_pod_all;

local k = import "k.libsonnet";

// Commands run by the replicas. Chief and Worker print a greeting and exit;
// PS runs `tail -f /dev/null`, which never exits on its own.
local helloCommand = ["echo", "Hello"];
local blockingCommand = ["tail", "-f", "/dev/null"];

// Builds one tfReplicaSpecs entry: `count` replicas that are never restarted,
// each with a single "tensorflow" container running `command` on ubuntu.
local replicaSpec(count, command) = {
  replicas: count,
  restartPolicy: "Never",
  template: {
    spec: {
      containers: [
        {
          name: "tensorflow",
          image: "ubuntu",
          command: command,
        },
      ],
    },
  },
};

// NOTE(review): `image` is accepted but never referenced; every container
// uses the "ubuntu" image set in replicaSpec.
local parts(namespace, name, image) = {
  // Hidden field (`::`): only emitted when selected explicitly below.
  job:: {
    apiVersion: "kubeflow.org/v1alpha2",
    kind: "TFJob",
    metadata: {
      name: name,
      namespace: namespace,
    },
    spec: {
      cleanPodPolicy: "All",
      tfReplicaSpecs: {
        Chief: replicaSpec(1, helloCommand),
        PS: replicaSpec(2, blockingCommand),
        Worker: replicaSpec(4, helloCommand),
      },
    },
  },
};

std.prune(k.core.v1.list.new([parts(params.namespace, params.name, params.image).job]))
79 changes: 79 additions & 0 deletions test/workflows/components/clean_pod_none.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// Tests that when cleanPodPolicy is set to "None", none of the pods are deleted
// when the TFJob completes.

local params = std.extVar("__ksonnet/params").components.clean_pod_none;

local k = import "k.libsonnet";

// Commands run by the replicas. Chief and Worker print a greeting and exit;
// PS runs `tail -f /dev/null`, which never exits on its own.
local helloCommand = ["echo", "Hello"];
local blockingCommand = ["tail", "-f", "/dev/null"];

// Builds one tfReplicaSpecs entry: `count` replicas that are never restarted,
// each with a single "tensorflow" container running `command` on ubuntu.
local replicaSpec(count, command) = {
  replicas: count,
  restartPolicy: "Never",
  template: {
    spec: {
      containers: [
        {
          name: "tensorflow",
          image: "ubuntu",
          command: command,
        },
      ],
    },
  },
};

// NOTE(review): `image` is accepted but never referenced; every container
// uses the "ubuntu" image set in replicaSpec.
local parts(namespace, name, image) = {
  // Hidden field (`::`): only emitted when selected explicitly below.
  job:: {
    apiVersion: "kubeflow.org/v1alpha2",
    kind: "TFJob",
    metadata: {
      name: name,
      namespace: namespace,
    },
    spec: {
      cleanPodPolicy: "None",
      tfReplicaSpecs: {
        Chief: replicaSpec(1, helloCommand),
        PS: replicaSpec(2, blockingCommand),
        Worker: replicaSpec(4, helloCommand),
      },
    },
  },
};

std.prune(k.core.v1.list.new([parts(params.namespace, params.name, params.image).job]))
78 changes: 78 additions & 0 deletions test/workflows/components/clean_pod_running.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Tests that when cleanPodPolicy is set to "Running", only the Running pods are deleted
// when the TFJob completes. The completed pods will not be deleted.
local params = std.extVar("__ksonnet/params").components.clean_pod_running;

local k = import "k.libsonnet";

// Commands run by the replicas. Chief and Worker print a greeting and exit;
// PS runs `tail -f /dev/null`, which never exits on its own.
local helloCommand = ["echo", "Hello"];
local blockingCommand = ["tail", "-f", "/dev/null"];

// Builds one tfReplicaSpecs entry: `count` replicas that are never restarted,
// each with a single "tensorflow" container running `command` on ubuntu.
local replicaSpec(count, command) = {
  replicas: count,
  restartPolicy: "Never",
  template: {
    spec: {
      containers: [
        {
          name: "tensorflow",
          image: "ubuntu",
          command: command,
        },
      ],
    },
  },
};

// NOTE(review): `image` is accepted but never referenced; every container
// uses the "ubuntu" image set in replicaSpec.
local parts(namespace, name, image) = {
  // Hidden field (`::`): only emitted when selected explicitly below.
  job:: {
    apiVersion: "kubeflow.org/v1alpha2",
    kind: "TFJob",
    metadata: {
      name: name,
      namespace: namespace,
    },
    spec: {
      cleanPodPolicy: "Running",
      tfReplicaSpecs: {
        Chief: replicaSpec(1, helloCommand),
        PS: replicaSpec(2, blockingCommand),
        Worker: replicaSpec(4, helloCommand),
      },
    },
  },
};

std.prune(k.core.v1.list.new([parts(params.namespace, params.name, params.image).job]))
15 changes: 15 additions & 0 deletions test/workflows/components/params.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,20 @@
namespace: "kubeflow-test-infra",
image: "",
},
clean_pod_all: {
name: "clean_pod_all",
namespace: "kubeflow-test-infra",
image: "",
},
clean_pod_running: {
name: "clean_pod_running",
namespace: "kubeflow-test-infra",
image: "",
},
clean_pod_none: {
name: "clean_pod_none",
namespace: "kubeflow-test-infra",
image: "",
},
},
}
Loading

0 comments on commit 037e915

Please sign in to comment.