Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecate IsDefaultPS in TFJob CRD API #343

Merged
merged 2 commits into from
Jan 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,6 @@ metadata:
data:
controller-config-file.yaml: |
accelerators:
grpcServerFilePath: /opt/mlkube/grpc_tensorflow_server/grpc_tensorflow_server.py
alpha.kubernetes.io/nvidia-gpu:
volumes:
- name: <volume-name> # Desired name of the volume, ex: nvidia-libs
Expand Down Expand Up @@ -405,8 +404,7 @@ metadata:
spec:
RuntimeId: 76no
replicaSpecs:
- IsDefaultPS: false
replicas: 1
- replicas: 1
template:
metadata:
creationTimestamp: null
Expand All @@ -418,8 +416,7 @@ spec:
restartPolicy: OnFailure
tfPort: 2222
tfReplicaType: MASTER
- IsDefaultPS: false
replicas: 1
- replicas: 1
template:
metadata:
creationTimestamp: null
Expand All @@ -431,8 +428,7 @@ spec:
restartPolicy: OnFailure
tfPort: 2222
tfReplicaType: WORKER
- IsDefaultPS: true
replicas: 2
- replicas: 2
template:
metadata:
creationTimestamp: null
Expand Down
3 changes: 1 addition & 2 deletions build/images/tf_operator/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,11 @@ RUN mkdir -p /opt/mlkube/test
RUN mkdir -p /opt/tensorflow_k8s/dashboard/
COPY tf_operator /opt/mlkube
COPY e2e /opt/mlkube/test
COPY grpc_tensorflow_server.py /opt/mlkube/grpc_tensorflow_server/grpc_tensorflow_server.py
COPY backend /opt/tensorflow_k8s/dashboard/
COPY build /opt/tensorflow_k8s/dashboard/frontend/build
RUN chmod a+x /opt/mlkube/tf_operator
RUN chmod a+x /opt/mlkube/test/e2e
RUN chmod a+x /opt/tensorflow_k8s/dashboard/backend

# TODO(jlewi): Reduce log level.
ENTRYPOINT ["/opt/mlkube/tf_operator", "-alsologtostderr"]
ENTRYPOINT ["/opt/mlkube/tf_operator", "-alsologtostderr"]
3 changes: 1 addition & 2 deletions build/images/tf_operator/build_and_push.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,7 @@ def main(): # pylint: disable=too-many-locals, too-many-statements
os.path.join(go_path, "bin/tf_operator"),
os.path.join(go_path, "bin/e2e"),
os.path.join(go_path, "bin/backend"),
"dashboard/frontend/build",
"hack/grpc_tensorflow_server/grpc_tensorflow_server.py"
"dashboard/frontend/build"
]

for s in sources:
Expand Down
12 changes: 1 addition & 11 deletions developer_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,20 +121,10 @@ export MY_POD_NAME=my-pod
set the corresponding namespace for the resource.
* TODO(jlewi): Do we still need to set MY_POD_NAME? Why?

Make a copy of `grpc_tensorflow_server.py` and create a config file named `controller-config-file.yaml`:

```sh
cp hack/grpc_tensorflow_server/grpc_tensorflow_server.py /tmp/grpc_tensorflow_server.py

cat > /tmp/controller-config-file.yaml << EOL
grpcServerFilePath: /tmp/grpc_tensorflow_server.py
EOL
```

Now we are ready to run operator locally:

```sh
tf_operator -controller_config_file=/tmp/controller_config_file.yaml
tf_operator --logtostderr
```

The command creates a CRD `tfjobs` and block watching for creation of the resource kind. To verify local
Expand Down
160 changes: 0 additions & 160 deletions hack/grpc_tensorflow_server/grpc_tensorflow_server.py

This file was deleted.

27 changes: 0 additions & 27 deletions pkg/apis/tensorflow/v1alpha1/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package v1alpha1

import (
"github.com/golang/protobuf/proto"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
)

Expand Down Expand Up @@ -32,11 +31,6 @@ func SetDefaults_TFJob(obj *TFJob) {
if r.Replicas == nil {
r.Replicas = proto.Int32(Replicas)
}

//Set the default configuration for a PS server if the user didn't specify a PodTemplateSpec
if r.Template == nil && r.TFReplicaType == PS {
setDefault_PSPodTemplateSpec(r, c.TFImage)
}
}
if c.TerminationPolicy == nil {
c.TerminationPolicy = &TerminationPolicySpec{
Expand All @@ -48,24 +42,3 @@ func SetDefaults_TFJob(obj *TFJob) {
}

}

func setDefault_PSPodTemplateSpec(r *TFReplicaSpec, tfImage string) {
r.IsDefaultPS = true
r.Template = &v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{
v1.Container{
Image: tfImage,
Name: "tensorflow",
VolumeMounts: []v1.VolumeMount{
v1.VolumeMount{
Name: "ps-config-volume",
MountPath: "/ps-server",
},
},
},
},
RestartPolicy: v1.RestartPolicyOnFailure,
},
}
}
22 changes: 2 additions & 20 deletions pkg/apis/tensorflow/v1alpha1/defaults_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,27 +78,9 @@ func TestSetDefaults_TFJob(t *testing.T) {
Spec: TFJobSpec{
ReplicaSpecs: []*TFReplicaSpec{
{
Replicas: proto.Int32(1),
TFPort: proto.Int32(2222),
Template: &v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{
v1.Container{
Image: "tensorflow/tensorflow:1.3.0",
Name: "tensorflow",
VolumeMounts: []v1.VolumeMount{
v1.VolumeMount{
Name: "ps-config-volume",
MountPath: "/ps-server",
},
},
},
},
RestartPolicy: v1.RestartPolicyOnFailure,
},
},
Replicas: proto.Int32(1),
TFPort: proto.Int32(2222),
TFReplicaType: PS,
IsDefaultPS: true,
},
},
TFImage: "tensorflow/tensorflow:1.3.0",
Expand Down
2 changes: 0 additions & 2 deletions pkg/apis/tensorflow/v1alpha1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,6 @@ type TFReplicaSpec struct {
// TFPort is the port to use for TF services.
TFPort *int32 `json:"tfPort,omitempty" protobuf:"varint,1,opt,name=tfPort"`
TFReplicaType `json:"tfReplicaType"`
// IsDefaultPS denotes if the parameter server should use the default grpc_tensorflow_server
IsDefaultPS bool
}

type TensorBoardSpec struct {
Expand Down
2 changes: 1 addition & 1 deletion pkg/apis/tensorflow/validation/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ func ValidateTFJobSpec(c *tfv1.TFJobSpec) error {
// Check that each replica has a TensorFlow container and a chief.
for _, r := range c.ReplicaSpecs {
found := false
if r.Template == nil && r.TFReplicaType != tfv1.PS {
if r.Template == nil {
return fmt.Errorf("Replica is missing Template; %v", util.Pformat(r))
}

Expand Down
Loading