Skip to content

Commit

Permalink
fix: Address comments
Browse files Browse the repository at this point in the history
Signed-off-by: Ce Gao <[email protected]>
  • Loading branch information
gaocegege committed Nov 26, 2021
1 parent 877110d commit 2940450
Show file tree
Hide file tree
Showing 9 changed files with 36 additions and 41 deletions.
2 changes: 1 addition & 1 deletion examples/pytorch/elastic/echo/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM python:3.8-buster
WORKDIR /workspace
RUN pip install -i https://mirror.sjtu.edu.cn/pypi/web/simple torch==1.10.0 numpy
RUN pip install torch==1.10.0 numpy
# TODO Replace this with the PIP version when available
ADD echo.py echo.py
ENV PYTHONPATH /workspace
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/elastic/echo/echo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ spec:
spec:
containers:
- name: pytorch
image: gaocegege/pytorch-elastic-example-echo:1.0.0
image: kubeflow/pytorch-elastic-example-echo:1.0.0
imagePullPolicy: IfNotPresent
env:
- name: LOGLEVEL
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/elastic/imagenet/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ ARG BASE_IMAGE=pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
FROM $BASE_IMAGE

# install utilities and dependencies
RUN pip install -i https://mirror.sjtu.edu.cn/pypi/web/simple classy-vision
RUN pip install classy-vision

WORKDIR /workspace

Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/elastic/imagenet/imagenet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ spec:
spec:
containers:
- name: pytorch
image: gaocegege/pytorch-elastic-example-imagenet:1.0.0-sigterm
image: kubeflow/pytorch-elastic-example-imagenet:1.0.0-sigterm
imagePullPolicy: IfNotPresent
env:
- name: LOGLEVEL
Expand Down
1 change: 0 additions & 1 deletion hack/update-codegen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ cd - >/dev/null
# Notice: The code in kube-openapi does not generate defaulter by default.
# We need to build binary from pkg cmd folder.
echo "Building openapi-gen"
echo ${OPENAPI_PKG}
go build -o openapi-gen ${OPENAPI_PKG}/cmd/openapi-gen

echo "Generating OpenAPI specification for tensorflow/v1"
Expand Down
10 changes: 5 additions & 5 deletions manifests/base/crds/kubeflow.org_pytorchjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,15 @@ spec:
format: int32
type: integer
metrics:
description: metrics contains the specifications for which to
use to calculate the desired replica count (the maximum replica
description: Metrics contains the specifications which are used
to calculate the desired replica count (the maximum replica
count across all metrics will be used). The desired replica
count is calculated multiplying the ratio between the target
value and the current value by the current number of pods. Ergo,
count is calculated with multiplying the ratio between the target
value and the current value by the current number of pods. Ergo,
metrics used must decrease as the pod count is increased, and
vice-versa. See the individual metric source types for more
information about how each type of metric must respond. If not
set, the default metric will be set to 80% average CPU utilization.
set, the HPA will not be created.
items:
description: MetricSpec specifies how to scale based on a single
metric (only `type` and one other matching field should be
Expand Down
14 changes: 7 additions & 7 deletions pkg/apis/pytorch/v1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,14 @@ type ElasticPolicy struct {

MaxRestarts *int32 `json:"maxRestarts,omitempty"`

// metrics contains the specifications for which to use to calculate the
// Metrics contains the specifications which are used to calculate the
// desired replica count (the maximum replica count across all metrics will
// be used). The desired replica count is calculated multiplying the
// be used). The desired replica count is calculated with multiplying the
// ratio between the target value and the current value by the current
// number of pods. Ergo, metrics used must decrease as the pod count is
// number of pods. Ergo, metrics used must decrease as the pod count is
// increased, and vice-versa. See the individual metric source types for
// more information about how each type of metric must respond.
// If not set, the default metric will be set to 80% average CPU utilization.
// If not set, the HPA will not be created.
// +optional
Metrics []autoscalingv2beta2.MetricSpec `json:"metrics,omitempty"`
}
Expand All @@ -111,14 +111,14 @@ const (
// BackendC10D is the rendezvous backend type for C10d.
BackendC10D RDZVBackend = "c10d"
// BackendETCD is the rendezvous backend type for ETCD.
BackendETCD = "etcd"
BackendETCD RDZVBackend = "etcd"
// BackendETCDV2 is the rendezvous backend type for ETCD v2.
BackendETCDV2 = "etcd-v2"
BackendETCDV2 RDZVBackend = "etcd-v2"

// PyTorchReplicaTypeMaster is the type of Master of distributed PyTorch
PyTorchReplicaTypeMaster common.ReplicaType = "Master"
// PyTorchReplicaTypeWorker is the type for workers of distributed PyTorch.
PyTorchReplicaTypeWorker = "Worker"
PyTorchReplicaTypeWorker common.ReplicaType = "Worker"
)

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
Expand Down
38 changes: 19 additions & 19 deletions pkg/controller.v1/pytorch/hpa.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,40 +31,40 @@ import (
func (r *PyTorchJobReconciler) ReconcileHPA(pytorhcjob *pytorchv1.PyTorchJob) error {
logger := r.Log.WithValues(pytorchv1.Singular, pytorhcjob.Name)

if pytorhcjob.Spec.ElasticPolicy == nil {
logger.V(1).Info("No ElasicPolicy is specified, skipping HPA reconciling process")
if pytorhcjob.Spec.ElasticPolicy == nil || pytorhcjob.Spec.ElasticPolicy.Metrics == nil {
logger.V(1).Info(
"No ElasicPolicy or Metric is specified, skipping HPA reconciling process")
return nil
}

// Create or update HPA
hpa := &autoscalingv2beta2.HorizontalPodAutoscaler{}
err := r.Get(context.TODO(), types.NamespacedName{Name: pytorhcjob.Name, Namespace: pytorhcjob.Namespace}, hpa)
current := &autoscalingv2beta2.HorizontalPodAutoscaler{}

// Get the exepected HPA.
expected, err := desiredHPA(pytorhcjob, r.Scheme)
if err != nil {
return err
}

if err := r.Get(context.TODO(), types.NamespacedName{
Name: pytorhcjob.Name,
Namespace: pytorhcjob.Namespace,
}, current); err != nil {
if !errors.IsNotFound(err) {
return err
}

// Create the new HPA.
hpa, err = desiredHPA(pytorhcjob, r.Scheme)
if err != nil {
return err
}
logger.V(1).Info("Creating HPA", "HPA.Namespace", hpa.Namespace, "HPA.Name", hpa.Name)
err = r.Create(context.TODO(), hpa)
logger.V(1).Info("Creating HPA", "namespace", expected.Namespace, "name", expected.Name)
err = r.Create(context.TODO(), expected)
if err != nil {
return err
}
return nil
}

// Update HPA
expected, err := desiredHPA(pytorhcjob, r.Scheme)
if err != nil {
return err
}
if !equality.Semantic.DeepEqual(expected.Spec, hpa.Spec) {
logger.V(1).Info("Updating HPA", "HPA.Namespace", hpa.Namespace, "HPA.Name", hpa.Name)
expected.ResourceVersion = hpa.ResourceVersion
if !equality.Semantic.DeepEqual(expected.Spec, current.Spec) {
logger.V(1).Info("Updating HPA", "namespace", current.Namespace, "name", current.Name)
expected.ResourceVersion = current.ResourceVersion
err = r.Update(context.TODO(), expected)
if err != nil {
return err
Expand Down
6 changes: 1 addition & 5 deletions pkg/controller.v1/pytorch/pytorch.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,7 @@ func SetPodEnv(obj interface{}, podTemplateSpec *corev1.PodTemplateSpec, rtype,
if err != nil {
return err
}
if rtype == strings.ToLower(string(pytorchv1.PyTorchReplicaTypeMaster)) {
if rank != 0 {
return fmt.Errorf("invalid config: There should be only a single master with index=0")
}
} else {
if rtype == strings.ToLower(string(pytorchv1.PyTorchReplicaTypeWorker)) {
rank = rank + 1
}

Expand Down

0 comments on commit 2940450

Please sign in to comment.