Skip to content

Commit

Permalink
Extend Katib API for NAS jobs (#327)
Browse files Browse the repository at this point in the history
* Add fields to studyjob structure

* Change nasjob yaml file

* Change parameter type

* Add Parameter Type=range

* Change API

* Change input size

* Reset API structure

* Change StudyJob API structure

* Remove Range parameter

* Fix api.proto

* Fix gopkg.toml

* Remove old nasjob file

* Fix nasjob.yaml

* Add custom suggestion

* Add blank NAS suggestion
Change Katib API to process yaml file for NAS

* Add correct YAML file for NAS example

* Fix newline

* Change StudyID to 1

* Add jobType parameter in Parsing

* Remove changes in manager

* Add NasConfig inside Yaml file

* Fix name in nasConfig

* Fix get StudyConfig in NAS

* Add JobType in all services

* Add job_type in bayesian_service

* Add pointers in NasConfig structure

* Fix Pointer in API

* Add consts for jobType
Remove return from populateCommonConfigFields

* Move const jobType to const file

* Remove Range parameter

* Modify YAML file for NAS jobs

* Add getStudyJobType function in GRPC server

* Add blank GetStudyJobType func in manager

* Fix metrics collector

* Remove jobType from getStudy

* Remove getStudyJobType from manager

* Add NAS RL yaml deployment

* Change worker to GPU

* Clean nasrl suggestion

* Add -u inside training-container

* Fix namespace in worker template
  • Loading branch information
andreyvelich authored and k8s-ci-robot committed Jan 29, 2019
1 parent f4026e4 commit f11c13e
Show file tree
Hide file tree
Showing 22 changed files with 1,604 additions and 451 deletions.
4 changes: 2 additions & 2 deletions cmd/manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,8 @@ func (s *server) GetTrials(ctx context.Context, in *api_pb.GetTrialsRequest) (*a
}

// GetTrial handles the GetTrial RPC: it passes the request's TrialId to
// dbIf.GetTrial and wraps the result in a GetTrialReply. Any error from the
// lookup is returned unchanged alongside the reply.
func (s *server) GetTrial(ctx context.Context, in *api_pb.GetTrialRequest) (*api_pb.GetTrialReply, error) {
	t, err := dbIf.GetTrial(in.TrialId)
	return &api_pb.GetTrialReply{Trial: t}, err
}

func (s *server) GetSuggestions(ctx context.Context, in *api_pb.GetSuggestionsRequest) (*api_pb.GetSuggestionsReply, error) {
Expand Down
8 changes: 8 additions & 0 deletions cmd/suggestion/nasrl/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Image for the NAS RL suggestion service (cmd/suggestion/nasrl).
FROM python:3

# Copy the build context into the image. COPY is used instead of ADD because
# this is a plain local copy; ADD's URL fetching and archive extraction are
# not wanted here.
# NOTE(review): the paths below assume the build context is the repository
# root - confirm against the build script.
COPY . /usr/src/app/github.com/kubeflow/katib
WORKDIR /usr/src/app/github.com/kubeflow/katib/cmd/suggestion/nasrl
RUN pip install --no-cache-dir -r requirements.txt
# Put the repository root and pkg/api/python on the module search path so
# that main.py's `pkg.*` imports resolve.
ENV PYTHONPATH=/usr/src/app/github.com/kubeflow/katib:/usr/src/app/github.com/kubeflow/katib/pkg/api/python

# -u runs Python with unbuffered stdout/stderr.
ENTRYPOINT ["python", "-u", "main.py"]
Empty file.
29 changes: 29 additions & 0 deletions cmd/suggestion/nasrl/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import grpc
from concurrent import futures

import time

from pkg.api.python import api_pb2_grpc
from pkg.suggestion.nasrl_service import NasrlService
from pkg.suggestion.types import DEFAULT_PORT
from logging import getLogger, StreamHandler, INFO, DEBUG


# Sleep interval (24 hours, in seconds) used by the keep-alive loop in serve().
_ONE_DAY_IN_SECONDS = 60 * 60 * 24


def serve():
    """Start the NAS RL suggestion gRPC server and block until interrupted.

    Creates a gRPC server backed by a 10-worker thread pool, registers a
    NasrlService instance as the Suggestion servicer, binds an insecure port
    at DEFAULT_PORT and starts serving. The calling thread then sleeps in a
    loop; on KeyboardInterrupt the server is stopped with a grace period of 0.
    """
    print("NAS RL Suggestion Service")
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    api_pb2_grpc.add_SuggestionServicer_to_server(NasrlService(), server)
    server.add_insecure_port(DEFAULT_PORT)
    print("Listening...")
    server.start()
    try:
        # Keep the main thread alive after start(); requests are handled by
        # the thread pool passed to grpc.server above.
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)


if __name__ == "__main__":
    serve()
9 changes: 9 additions & 0 deletions cmd/suggestion/nasrl/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Python dependencies for the nasrl suggestion image. The Dockerfile in this
# directory installs them with `pip install --no-cache-dir -r requirements.txt`.
# NOTE(review): only cloudpickle is pinned to an exact version; the remaining
# entries are unpinned or lower-bounded - confirm this is intentional.
grpcio
duecredit
cloudpickle==0.5.6
numpy>=1.13.3
scikit-learn>=0.19.0
scipy>=0.19.1
forestci
protobuf
googleapis-common-protos
120 changes: 120 additions & 0 deletions examples/nasjob-example-RL.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Example StudyJob that runs a neural architecture search with the "nasrl"
# suggestion algorithm.
apiVersion: "kubeflow.org/v1alpha1"
kind: StudyJob
metadata:
  namespace: kubeflow
  labels:
    controller-tools.k8s.io: "1.0"
  name: nas-rl-example
spec:
  studyName: nas-rl-example
  owner: crd
  optimizationtype: maximize
  objectivevaluename: Validation-Accuracy
  optimizationgoal: 0.99
  requestcount: 3
  metricsnames:
    - accuracy
  # Search space for the architecture: graph dimensions plus the candidate
  # operations and their parameter ranges.
  nasConfig:
    graphConfig:
      numLayers: 8
      inputSize:
        - 32
        - 32
        - 3
      outputSize:
        - 10
    operations:
      - operationType: convolution
        parameterconfigs:
          - name: filter_size
            parametertype: categorical
            feasible:
              list:
                - "3"
                - "5"
                - "7"
          - name: num_filter
            parametertype: categorical
            feasible:
              list:
                - "32"
                - "48"
                - "64"
                - "96"
                - "128"
          - name: stride
            parametertype: categorical
            feasible:
              list:
                - "1"
                - "2"
      - operationType: reduction
        parameterconfigs:
          - name: reduction_type
            parametertype: categorical
            feasible:
              list:
                - max_pooling
                - avg_pooling
          - name: pool_size
            parametertype: int
            feasible:
              min: "2"
              max: "3"
              step: "1"
  workerSpec:
    goTemplate:
      # Go template for the worker Job manifest. The template body is a
      # literal block scalar, so its inner indentation is part of the value.
      rawTemplate: |-
        apiVersion: batch/v1
        kind: Job
        metadata:
          name: {{.WorkerID}}
          namespace: {{.NameSpace}}
        spec:
          template:
            spec:
              containers:
                - name: {{.WorkerID}}
                  image: docker.io/deepermind/training-container-nas
                  command:
                    - "python3.5"
                    - "-u"
                    - "RunTrial.py"
                    {{- with .HyperParameters}}
                    {{- range .}}
                    - "--{{.Name}}={{.Value}}"
                    {{- end}}
                    {{- end}}
                  resources:
                    limits:
                      nvidia.com/gpu: 1
              restartPolicy: Never
  suggestionSpec:
    suggestionAlgorithm: "nasrl"
    suggestionParameters:
      - name: "lstm_num_cells"
        value: "64"
      - name: "lstm_num_layers"
        value: "1"
      - name: "lstm_keep_prob"
        value: "1.0"
      - name: "optimizer"
        value: "adam"
      - name: "init_learning_rate"
        value: "1e-3"
      - name: "lr_decay_start"
        value: "0"
      - name: "lr_decay_every"
        value: "1000"
      - name: "lr_decay_rate"
        value: "0.9"
      - name: "skip-target"
        value: "0.4"
      - name: "skip-weight"
        value: "0.8"
      - name: "l2_reg"
        value: "0"
      - name: "entropy_weight"
        value: "1e-4"
      - name: "baseline_decay"
        value: "0.9999"
23 changes: 23 additions & 0 deletions manifests/vizier/suggestion/reinforcementlearning/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Deployment for the NAS RL suggestion service.
# Uses apps/v1 because the extensions/v1beta1 Deployment API is no longer
# served as of Kubernetes 1.16. apps/v1 requires an explicit selector, which
# is set to the pod template labels.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vizier-suggestion-nasrl
  namespace: kubeflow
  labels:
    app: vizier
    component: suggestion-nasrl
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vizier
      component: suggestion-nasrl
  template:
    metadata:
      name: vizier-suggestion-nasrl
      labels:
        app: vizier
        component: suggestion-nasrl
    spec:
      containers:
        - name: vizier-suggestion-nasrl
          image: docker.io/deepermind/katib-nasrl-suggestion
          ports:
            - name: api
              containerPort: 6789
17 changes: 17 additions & 0 deletions manifests/vizier/suggestion/reinforcementlearning/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# ClusterIP Service for the NAS RL suggestion pods, exposing TCP port 6789
# under the port name "api".
apiVersion: v1
kind: Service
metadata:
  name: vizier-suggestion-nasrl
  namespace: kubeflow
  labels:
    app: vizier
    component: suggestion-nasrl
spec:
  type: ClusterIP
  ports:
    - port: 6789
      protocol: TCP
      name: api
  # Routes to pods carrying both of these labels.
  selector:
    app: vizier
    component: suggestion-nasrl
Loading

0 comments on commit f11c13e

Please sign in to comment.