From c4a006c95fa4dfae90a5e1f27da0ce5b6e4f5cc3 Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Mon, 30 Oct 2017 13:52:07 -0700 Subject: [PATCH 1/5] Change group to tensorflow.org and version to v1alpha1. * We have abandoned the name mlkube.io and moved the code into tensorflow so it makes sense to use tensorflow.org as the group. * We downgrade the API version from v1beta1 to v1alpha1. We want to follow the guidelines used by K8s for alpha, beta, stable: https://github.com/kubernetes/community/blob/master/contributors/devel/api_changes.md#alpha-beta-and-stable-versions according to this definition we are closer to alpha since the API hasn't achieved stability. --- README.md | 14 +++++++------- examples/tf_job.yaml | 2 +- examples/tf_job/templates/tf_job.yaml | 2 +- examples/tf_job_defaults.yaml | 2 +- examples/tf_job_gpu.yaml | 2 +- examples/tf_job_tensorboard_azure.yaml | 2 +- pkg/spec/tf_job.go | 4 ++-- pkg/trainer/replicas.go | 2 +- pkg/trainer/replicas_test.go | 2 +- pkg/trainer/tensorboard.go | 2 +- pkg/trainer/tensorboard_test.go | 2 +- 11 files changed, 18 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index d00a9de6a5..68c8202fa7 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ To list jobs kubectl get tfjobs NAME KINDS -example-job TfJob.v1beta1.mlkube.io +example-job TfJob.v1alpha1.tensorflow.org ``` For additional information about motivation and design for the @@ -146,7 +146,7 @@ kubectl create -f https://raw.githubusercontent.com/tensorflow/k8s/master/exampl In this case the job spec looks like the following ``` -apiVersion: "mlkube.io/v1beta1" +apiVersion: "tensorflow.org/v1alpha1" kind: "TfJob" metadata: name: "example-job" @@ -220,7 +220,7 @@ Ensure your K8s cluster is properly configured to use GPUs To attach GPUs specify the GPU resource on the container e.g. 
``` -apiVersion: "mlkube.io/v1beta1" +apiVersion: "tensorflow.org/v1alpha1" kind: "TfJob" metadata: name: "tf-smoke-gpu" @@ -263,7 +263,7 @@ On Azure you can store your event files on an Azure Files and use volumes to make them available to TensorBoard. ``` -apiVersion: "mlkube.io/v1beta1" +apiVersion: "tensorflow.org/v1alpha1" kind: "TfJob" metadata: name: "tf-smoke-gpu" @@ -300,7 +300,7 @@ On GKE you can store your event files on GCS and TensorBoard/TensorFlow can read/write directly to GCS. ``` -apiVersion: "mlkube.io/v1beta1" +apiVersion: "tensorflow.org/v1alpha1" kind: "TfJob" metadata: name: "tf-smoke-gpu" @@ -356,7 +356,7 @@ kubectl get -o yaml tfjobs $JOB Here is sample output for an example job ``` -apiVersion: mlkube.io/v1beta1 +apiVersion: tensorflow.org/v1alpha1 kind: TfJob metadata: clusterName: "" @@ -365,7 +365,7 @@ metadata: name: example-job namespace: default resourceVersion: "1881" - selfLink: /apis/mlkube.io/v1beta1/namespaces/default/tfjobs/example-job + selfLink: /apis/tensorflow.org/v1alpha1/namespaces/default/tfjobs/example-job uid: e11f9577-b5e5-11e7-8522-42010a8e01a4 spec: RuntimeId: 76no diff --git a/examples/tf_job.yaml b/examples/tf_job.yaml index 30d26c4be4..7678eab109 100644 --- a/examples/tf_job.yaml +++ b/examples/tf_job.yaml @@ -1,4 +1,4 @@ -apiVersion: "mlkube.io/v1beta1" +apiVersion: "tensorflow.org/v1alpha1" kind: "TfJob" metadata: name: "example-job" diff --git a/examples/tf_job/templates/tf_job.yaml b/examples/tf_job/templates/tf_job.yaml index 4dfe5fa2cd..b5c8c59879 100644 --- a/examples/tf_job/templates/tf_job.yaml +++ b/examples/tf_job/templates/tf_job.yaml @@ -1,4 +1,4 @@ -apiVersion: "mlkube.io/v1beta1" +apiVersion: "tensorflow.org/v1alpha1" kind: "TfJob" metadata: name: {{ .Release.Name }} diff --git a/examples/tf_job_defaults.yaml b/examples/tf_job_defaults.yaml index 3c1132629d..0f67cfb3a5 100644 --- a/examples/tf_job_defaults.yaml +++ b/examples/tf_job_defaults.yaml @@ -1,6 +1,6 @@ # This template is used to verify 
that REPLICAS, TfPort, and TfReplicaType are properly set to default # values if unspecified by the user. -apiVersion: "mlkube.io/v1beta1" +apiVersion: "tensorflow.org/v1alpha1" kind: "TfJob" metadata: name: "example-job-defaults" diff --git a/examples/tf_job_gpu.yaml b/examples/tf_job_gpu.yaml index 829fccacc6..325820c882 100644 --- a/examples/tf_job_gpu.yaml +++ b/examples/tf_job_gpu.yaml @@ -1,4 +1,4 @@ -apiVersion: "mlkube.io/v1beta1" +apiVersion: "tensorflow.org/v1alpha1" kind: "TfJob" metadata: name: "tf-smoke-gpu" diff --git a/examples/tf_job_tensorboard_azure.yaml b/examples/tf_job_tensorboard_azure.yaml index bd11415617..c81109b2f6 100644 --- a/examples/tf_job_tensorboard_azure.yaml +++ b/examples/tf_job_tensorboard_azure.yaml @@ -1,4 +1,4 @@ -apiVersion: "mlkube.io/v1beta1" +apiVersion: "tensorflow.org/v1alpha1" kind: "TfJob" metadata: name: "example-job" diff --git a/pkg/spec/tf_job.go b/pkg/spec/tf_job.go index 2cd1247bbb..30e8cfa1f2 100644 --- a/pkg/spec/tf_job.go +++ b/pkg/spec/tf_job.go @@ -15,8 +15,8 @@ import ( const ( CRDKind = "TfJob" CRDKindPlural = "tfjobs" - CRDGroup = "mlkube.io" - CRDVersion = "v1beta1" + CRDGroup = "tensorflow.org" + CRDVersion = "v1alpha1" // Value of the APP label that gets applied to a lot of entities. AppLabel = "tensorflow-job" diff --git a/pkg/trainer/replicas.go b/pkg/trainer/replicas.go index 2a95562fde..a79dded298 100644 --- a/pkg/trainer/replicas.go +++ b/pkg/trainer/replicas.go @@ -90,7 +90,7 @@ func NewTFReplicaSet(clientSet kubernetes.Interface, tfReplicaSpec spec.TfReplic // Labels returns the labels for this replica set. func (s *TFReplicaSet) Labels() KubernetesLabels { return KubernetesLabels(map[string]string{ - "mlkube.io": "", + "tensorflow.org": "", "job_type": string(s.Spec.TfReplicaType), // runtime_id is set by Job.setup, which is called after the TfReplicaSet is created. // this is why labels aren't a member variable. 
diff --git a/pkg/trainer/replicas_test.go b/pkg/trainer/replicas_test.go index 2c311c98dd..04dded0e59 100644 --- a/pkg/trainer/replicas_test.go +++ b/pkg/trainer/replicas_test.go @@ -65,7 +65,7 @@ func TestTFReplicaSet(t *testing.T) { for index := 0; index < 2; index++ { // Expected labels expectedLabels := map[string]string{ - "mlkube.io": "", + "tensorflow.org": "", "task_index": fmt.Sprintf("%v", index), "job_type": "PS", "runtime_id": "some-runtime", diff --git a/pkg/trainer/tensorboard.go b/pkg/trainer/tensorboard.go index 958102252f..db438af506 100644 --- a/pkg/trainer/tensorboard.go +++ b/pkg/trainer/tensorboard.go @@ -170,7 +170,7 @@ func (s *TBReplicaSet) getDeploymentSpecTemplate(image string) v1.PodTemplateSpe func (s *TBReplicaSet) Labels() KubernetesLabels { return KubernetesLabels(map[string]string{ - "mlkube.io": "", + "tensorflow.org": "", "runtime_id": s.Job.job.Spec.RuntimeId, "app": "tensorboard", }) diff --git a/pkg/trainer/tensorboard_test.go b/pkg/trainer/tensorboard_test.go index 76cb60938b..668978e890 100644 --- a/pkg/trainer/tensorboard_test.go +++ b/pkg/trainer/tensorboard_test.go @@ -56,7 +56,7 @@ func TestTBReplicaSet(t *testing.T) { // Expected labels expectedLabels := map[string]string{ - "mlkube.io": "", + "tensorflow.org": "", "app": "tensorboard", "runtime_id": "some-runtime", } From c52bcf6ac9ed992a55b5540cbb629f05ba30a818 Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Mon, 30 Oct 2017 18:42:21 -0700 Subject: [PATCH 2/5] Add Name of the job to the labels. * This will make it easier to fetch resources belonging to a specific job. * Fixes #72. 
--- pkg/trainer/replicas.go | 3 ++- pkg/trainer/replicas_test.go | 4 ++++ pkg/trainer/tensorboard.go | 1 + pkg/trainer/tensorboard_test.go | 4 ++++ 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pkg/trainer/replicas.go b/pkg/trainer/replicas.go index a79dded298..ec2f5aa415 100644 --- a/pkg/trainer/replicas.go +++ b/pkg/trainer/replicas.go @@ -94,7 +94,8 @@ func (s *TFReplicaSet) Labels() KubernetesLabels { "job_type": string(s.Spec.TfReplicaType), // runtime_id is set by Job.setup, which is called after the TfReplicaSet is created. // this is why labels aren't a member variable. - "runtime_id": s.Job.job.Spec.RuntimeId}) + "runtime_id": s.Job.job.Spec.RuntimeId, + "tf_job_name": s.Job.job.Metadata.Name,}) } // Transforms the tfconfig to work with grpc_tensorflow_server diff --git a/pkg/trainer/replicas_test.go b/pkg/trainer/replicas_test.go index 04dded0e59..6483ac910c 100644 --- a/pkg/trainer/replicas_test.go +++ b/pkg/trainer/replicas_test.go @@ -22,6 +22,9 @@ func TestTFReplicaSet(t *testing.T) { clientSet := fake.NewSimpleClientset() jobSpec := &spec.TfJob{ + Metadata: meta_v1.ObjectMeta { + Name: "some-job", + }, Spec: spec.TfJobSpec{ RuntimeId: "some-runtime", ReplicaSpecs: []*spec.TfReplicaSpec{ @@ -69,6 +72,7 @@ func TestTFReplicaSet(t *testing.T) { "task_index": fmt.Sprintf("%v", index), "job_type": "PS", "runtime_id": "some-runtime", + "tf_job_name": "some-job", } // Check that a service was created. 
diff --git a/pkg/trainer/tensorboard.go b/pkg/trainer/tensorboard.go index db438af506..3347028803 100644 --- a/pkg/trainer/tensorboard.go +++ b/pkg/trainer/tensorboard.go @@ -173,6 +173,7 @@ func (s *TBReplicaSet) Labels() KubernetesLabels { "tensorflow.org": "", "runtime_id": s.Job.job.Spec.RuntimeId, "app": "tensorboard", + "tf_job_name": s.Job.job.Metadata.Name, }) } diff --git a/pkg/trainer/tensorboard_test.go b/pkg/trainer/tensorboard_test.go index 668978e890..f0b251e779 100644 --- a/pkg/trainer/tensorboard_test.go +++ b/pkg/trainer/tensorboard_test.go @@ -19,6 +19,9 @@ func TestTBReplicaSet(t *testing.T) { clientSet := fake.NewSimpleClientset() jobSpec := &spec.TfJob{ + Metadata: meta_v1.ObjectMeta { + Name: "some-job", + }, Spec: spec.TfJobSpec{ RuntimeId: "some-runtime", ReplicaSpecs: []*spec.TfReplicaSpec{ @@ -59,6 +62,7 @@ func TestTBReplicaSet(t *testing.T) { "tensorflow.org": "", "app": "tensorboard", "runtime_id": "some-runtime", + "tf_job_name": "some-job", } // Check that a service was created. From a54ab41752e0da53e470f8bbdf72b9440f67b945 Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Tue, 31 Oct 2017 18:08:43 -0700 Subject: [PATCH 3/5] Start updating the docs. --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 68c8202fa7..982969a896 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,10 @@ CRD please refer to Custom Resources require Kubernetes >= 1.7 +For GPUs we recommend using Kubernetes 1.8 + + * Kubernetes 1.7 had some support for GPUs + ## Installing the TfJob CRD and operator on your k8s cluster From fa982e979604d568379488097dd4b74082724c20 Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Tue, 31 Oct 2017 21:20:33 -0700 Subject: [PATCH 4/5] Add links describing how to configure service accounts for tiller if needed. 
--- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 982969a896..c4abba4194 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,15 @@ For GPUs we recommend using Kubernetes 1.8 ## Installing the TfJob CRD and operator on your k8s cluster +1. Ensure helm is running on your cluster + + * On GKE with K8s 1.8, follow these + [instructions](https://github.com/kubernetes/helm/blob/master/docs/service_accounts.md) + to set up appropriate service accounts for tiller. + + * Azure K8s clusters should have service accounts configured by + default for tiller. + 1. Deploy the operator For non-RBAC enabled clusters: From d1cebd8c538b2d335dc342d7c7f764c3a70ab153 Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Wed, 1 Nov 2017 21:19:09 -0700 Subject: [PATCH 5/5] Update the link to helm. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a3888541c..be2ccef1b2 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ For GPUs we recommend using Kubernetes 1.8 1. Ensure helm is running on your cluster * On GKE with K8s 1.8, follow these - [instructions](https://github.com/kubernetes/helm/blob/master/docs/service_accounts.md) + [instructions](https://docs.helm.sh/using_helm/#tiller-namespaces-and-rbac) to set up appropriate service accounts for tiller. * Azure K8s clusters should have service accounts configured by