-
Notifications
You must be signed in to change notification settings - Fork 449
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
I think operator-pattern would fit very nicely into Katib.
- Loading branch information
Showing
1 changed file
with
72 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
---
# Study resource: optimization settings plus the tf-job spec used for trials.
apiVersion: "kubeflow.org/v1alpha1"
kind: Study
metadata:
  name: mnist-demo
  owner: katib
spec:
  # This is equivalent to studyconf.
  # Each trial config value will land in an environment variable named after
  # the parameter.
  # On top of that, each trial will also have a unique ID built from these
  # configs. This is required to, for example, save the model at the end of
  # the trial to S3 under a unique but descriptive name.
  optimizationType: "max"
  optimizationGoal: 0.9
  parameters:
    # Categorical parameter: an explicit list of candidate values.
    - name: optimizer
      values:
        - "sgd"
        - "adam"
    # Numeric parameter: described by min / max / step.
    - name: learning_rate
      min: 0.1
      max: 0.9
      step: 0.1
  tfJobSpec:
    # A set of tf-jobs will be created from this spec, which makes all of the
    # tf-job features available.
    # Each job name will be autogenerated from the study name and params.
    replicaSpecs:
      - replicas: 1
        tfReplicaType: MASTER
        template:
          spec:
            containers:
              - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff
                name: tensorflow
            # restartPolicy is a pod-level field, so it sits beside
            # `containers` rather than inside a container entry.
            restartPolicy: OnFailure
      - replicas: 1
        tfReplicaType: WORKER
        template:
          spec:
            containers:
              - image: gcr.io/tf-on-k8s-dogfood/tf_sample:dc944ff
                name: tensorflow
            restartPolicy: OnFailure
      # NOTE(review): no template is given for the PS replicas; presumably
      # this relies on a default supplied by the tf-job operator - confirm.
      - replicas: 2
        tfReplicaType: PS
|
||
# The YAML above is enough to run the whole set of optimizations, but each
# optimization will create its own custom resources, which can be manually
# manipulated, monitored, etc.
# For example:
#
# kubectl get trial -> mnist-demo-ldwpw
# kubectl get trial mnist-demo-ldwpw
|
||
---
apiVersion: "kubeflow.org/v1alpha1"
# The reason I'd like this to be called Model rather than Trial is that, in
# the end, we will want it to be used by, for example, tf-serving to spawn a
# serving cluster.
kind: Model
metadata:
  name: mnist-demo-ldwpw
  # NOTE(review): the three fields below are not standard Kubernetes
  # ObjectMeta fields; confirm whether they belong under `metadata` or in a
  # separate section such as `status`.
  #
  # Filled in after the operator successfully creates the tfJob; can be used
  # to get logs, pod status, etc.
  tfJob: mnist-demo-ldwpw
  # Filled in by the operator after the study is complete.
  optimizationScore: 0.2
  # Optional path representing where the model is saved. Ideally generated
  # by us.
  location: s3://mybucket/mnist-demo-sgd-lr01
spec:
  # Concrete parameter values chosen for this model.
  parameters:
    - name: optimizer
      value: "sgd"
    - name: learning_rate
      value: 0.1
|