# examples/CRD/example_study.yaml
#
# Example of the proposed Study custom resource, followed by one of the
# per-trial Model resources that a Study is expected to generate.
---
apiVersion: "kubeflow.org/v1alpha1"
kind: Study
metadata:
  name: mnist-demo
  labels:
    # `metadata` only accepts the standard Kubernetes object-metadata fields,
    # so free-form identifying data such as the owner is carried as a label.
    # NOTE(review): confirm a label is the intended home for `owner`.
    owner: katib
spec:
  # This section is the equivalent of studyconf.
  # Each trial's configuration is exposed to the trial as environment
  # variables named after the parameters below.
  # In addition, every trial gets a unique id built from these values. That id
  # is needed, for example, to save the model at the end of the trial to S3
  # under a unique but descriptive name.
  optimizationType: "max"
  optimizationGoal: 0.9
  parameters:
    # Categorical parameter: one of the listed values.
    - name: optimizer
      values:
        - "sgd"
        - "adam"
    # Numeric parameter: swept from min to max in increments of step.
    - name: learning_rate
      min: 0.1
      max: 0.9
      step: 0.1
  # Prefix of the desired storage path. modelLocation and logsLocation are
  # generated from it and handed to the trial as environment variables, and
  # users are asked to save their output there. This keeps the model easy to
  # reuse later (tf-serving etc.) and makes TensorBoard easy to spawn.
  # For local storage (hopefully backed by a PVC) this can simply be a root
  # path, but the matching PVC mounts must then be specified in tfJobSpec.
  studyLocation: s3://mybucket
  tfJobSpec:
    # A set of tf-jobs is created from this spec, so every tf-job feature
    # remains available. The name of each job is autogenerated from the study
    # name and the trial's parameters.
    replicaSpecs:
      - replicas: 1
        tfReplicaType: MASTER
        template:
          spec:
            containers:
              - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff
                name: tensorflow
            restartPolicy: OnFailure
      - replicas: 1
        tfReplicaType: WORKER
        template:
          spec:
            containers:
              - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff
                name: tensorflow
            restartPolicy: OnFailure
      # NOTE(review): no pod template is given for the PS replicas; presumably
      # the tf-job default is relied upon - confirm.
      - replicas: 2
        tfReplicaType: PS

# The manifest above is enough to run the whole set of optimizations, but each
# optimization creates its own custom resources, which can be inspected,
# monitored and manually manipulated.
# For example:
#
#   kubectl get model                   -> mnist-demo-ldwpw
#   kubectl get model mnist-demo-ldwpw  -> the resource shown below

---
apiVersion: "kubeflow.org/v1alpha1"
# This kind is called Model rather than Trial because, in the end, the result
# is meant to be consumed by other components - for example by tf-serving to
# spawn a serving cluster.
kind: Model
metadata:
  name: mnist-demo-ldwpw
spec:
  # The concrete parameter values chosen for this trial.
  parameters:
    - name: optimizer
      value: "sgd"
    - name: learning_rate
      value: 0.1
status:
  # Everything in this section is written by the operator, not by the user.
  # It lives under `status` because `metadata` is reserved for the standard
  # Kubernetes object-metadata fields.

  # Filled in once the operator has successfully created the tfJob; can be
  # used to fetch logs, pod status etc.
  tfJob: mnist-demo-ldwpw
  # Filled in by the operator after the study is complete.
  optimizationScore: 0.2
  # Optional path where the model is saved. Ideally generated by the operator.
  modelLocation: s3://mybucket/mnist-demo-sgd-lr01/model
  # TensorBoard-style logs. The scalars stored there can be used for ModelDB,
  # early stopping etc.
  logsLocation: s3://mybucket/mnist-demo-sgd-lr01/logs