From e6b13471a6f57a4f86ca33ad9d2e9661b25edd20 Mon Sep 17 00:00:00 2001
From: inc0
Date: Fri, 15 Jun 2018 13:08:23 -0700
Subject: [PATCH 1/2] Example CRD for katib operator

I think the operator pattern would fit very nicely into Katib.
---
 examples/CRD/example_study.yaml | 72 +++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 examples/CRD/example_study.yaml

diff --git a/examples/CRD/example_study.yaml b/examples/CRD/example_study.yaml
new file mode 100644
index 00000000000..9447954bd6c
--- /dev/null
+++ b/examples/CRD/example_study.yaml
@@ -0,0 +1,72 @@
+---
+apiVersion: "kubeflow.org/v1alpha1"
+kind: Study
+metadata:
+  name: mnist-demo
+  owner: katib
+spec:
+  # This is equivalent to studyconf.
+  # Each trial parameter will land in an env variable named after it.
+  # On top of that, each trial will also have a unique id built from these configs.
+  # This will be required to, for example, save the model at the end of the trial
+  # to S3 under a unique, but descriptive, name.
+  optimizationType: "max"
+  optimizationGoal: 0.9
+  parameters:
+    - name: optimizer
+      values:
+        - "sgd"
+        - "adam"
+    - name: learning_rate
+      min: 0.1
+      max: 0.9
+      step: 0.1
+  tfJobSpec:
+    # A set of tf-jobs will be created with this spec.
+    # This allows using all the features of tf-job.
+    # Names will be autogenerated from the study name and params.
+    replicaSpecs:
+      - replicas: 1
+        tfReplicaType: MASTER
+        template:
+          spec:
+            containers:
+              - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff
+                name: tensorflow
+            restartPolicy: OnFailure
+      - replicas: 1
+        tfReplicaType: WORKER
+        template:
+          spec:
+            containers:
+              - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff
+                name: tensorflow
+            restartPolicy: OnFailure
+      - replicas: 2
+        tfReplicaType: PS
+
+# The above yaml is enough to run the whole optimization, but each
+# optimization will create its own CRDs, which can be manually manipulated,
+# monitored, etc.
+# For example:
+#
+# kubectl get trial -> mnist-demo-ldwpw
+# kubectl get trial mnist-demo-ldwpw
+
+---
+apiVersion: "kubeflow.org/v1alpha1"
+# The reason I'd like this to be called Model, not Trial, is that, at the end,
+# we will want to use it with, for example, tf-serving to spawn a serving cluster.
+kind: Model
+metadata:
+  name: mnist-demo-ldwpw
+  tfJob: mnist-demo-ldwpw # filled in after the operator successfully creates the tfJob; can be used to get logs, pod status, etc.
+  optimizationScore: 0.2 # filled in by the operator after the study is complete
+  location: s3://mybucket/mnist-demo-sgd-lr01 # optional path telling where the model is saved. Ideally generated by us.
+spec:
+  parameters:
+    - name: optimizer
+      value: "sgd"
+    - name: learning_rate
+      value: 0.1
+

From 3a88119467c47175be1dab24a9af4adb74ebe6c6 Mon Sep 17 00:00:00 2001
From: inc0
Date: Fri, 15 Jun 2018 13:28:21 -0700
Subject: [PATCH 2/2] add note about storage

---
 examples/CRD/example_study.yaml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/examples/CRD/example_study.yaml b/examples/CRD/example_study.yaml
index 9447954bd6c..6204e76b2e2 100644
--- a/examples/CRD/example_study.yaml
+++ b/examples/CRD/example_study.yaml
@@ -21,6 +21,13 @@ spec:
       min: 0.1
       max: 0.9
       step: 0.1
+  # Prefix of the desired storage path. This way we can generate modelLocation and logsLocation
+  # and ask users to save their artifacts there. We'll expose these paths as env variables.
+  # This way the model will later be easily reusable by tf-serving etc.
+  # Also, tensorboard will be easy to spawn.
+  # For local storage (hopefully over a pvc) this can be just a root path, but it will
+  # require specifying the correct pvc mounts in tfJobSpec.
+  studyLocation: s3://mybucket
   tfJobSpec:
     # A set of tf-jobs will be created with this spec.
     # This allows using all the features of tf-job.
@@ -62,7 +69,8 @@ metadata:
   name: mnist-demo-ldwpw
   tfJob: mnist-demo-ldwpw # filled in after the operator successfully creates the tfJob; can be used to get logs, pod status, etc.
   optimizationScore: 0.2 # filled in by the operator after the study is complete
-  location: s3://mybucket/mnist-demo-sgd-lr01 # optional path telling where the model is saved. Ideally generated by us.
+  modelLocation: s3://mybucket/mnist-demo-sgd-lr01/model # optional path telling where the model is saved. Ideally generated by us.
+  logsLocation: s3://mybucket/mnist-demo-sgd-lr01/logs # tensorboard-style logs. Scalars there can be used for ModelDB, early stopping, etc.
 spec:
   parameters:
     - name: optimizer
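
Note (not part of the patches): a minimal sketch of what a trial container's entrypoint could look like under this proposal, assuming the operator injects each parameter as an env variable named after it plus the generated storage paths. The variable names OPTIMIZER, LEARNING_RATE, MODEL_LOCATION and LOGS_LOCATION are placeholders for illustration only, not defined by the spec above.

import os

def main():
    # Each parameter from spec.parameters is expected to land in an env variable
    # named after it (names and defaults here are placeholders, not part of the CRD).
    optimizer = os.environ.get("OPTIMIZER", "sgd")
    learning_rate = float(os.environ.get("LEARNING_RATE", "0.1"))

    # Paths the operator would derive from studyLocation plus the unique trial id,
    # e.g. s3://mybucket/mnist-demo-sgd-lr01/{model,logs}.
    model_location = os.environ.get("MODEL_LOCATION", "s3://mybucket/mnist-demo-sgd-lr01/model")
    logs_location = os.environ.get("LOGS_LOCATION", "s3://mybucket/mnist-demo-sgd-lr01/logs")

    # ... build and train the model with (optimizer, learning_rate), writing
    # tensorboard-style logs under logs_location as training progresses ...

    # Save the final model under model_location so tf-serving (or any other
    # consumer of the resulting Model object) can pick it up later.
    print("optimizer=%s lr=%s -> model: %s, logs: %s"
          % (optimizer, learning_rate, model_location, logs_location))

if __name__ == "__main__":
    main()

With the paths exposed this way, spawning tensorboard against logsLocation or pointing tf-serving at modelLocation after the study completes only requires reading the Model object.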
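
Similarly, a sketch of how the operator might build the unique-but-descriptive trial id and the model/logs locations from the study name, the parameter assignment, and studyLocation. The naming scheme is only an assumption inferred from the s3://mybucket/mnist-demo-sgd-lr01 example; the actual CRD object name (mnist-demo-ldwpw) would still carry a generated suffix.

def trial_id(study_name, params):
    # Build a unique but descriptive suffix from the parameter assignment,
    # e.g. {"optimizer": "sgd", "learning_rate": 0.1} -> "sgd-lr01".
    parts = []
    for name, value in params.items():
        if isinstance(value, float):
            abbrev = "".join(word[0] for word in name.split("_"))
            parts.append(abbrev + str(value).replace(".", ""))
        else:
            parts.append(str(value))
    return study_name + "-" + "-".join(parts)

def locations(study_location, trial):
    # studyLocation acts as a prefix; model and logs live under well-known subpaths.
    return study_location + "/" + trial + "/model", study_location + "/" + trial + "/logs"

name = trial_id("mnist-demo", {"optimizer": "sgd", "learning_rate": 0.1})
print(name)                              # mnist-demo-sgd-lr01
print(locations("s3://mybucket", name))  # (.../mnist-demo-sgd-lr01/model, .../mnist-demo-sgd-lr01/logs)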