Skip to content

Commit

Permalink
Merge pull request #21 from kuizhiqing/main
Browse files Browse the repository at this point in the history
first milestone
  • Loading branch information
kuizhiqing authored Mar 3, 2021
2 parents 8357573 + f7ca61b commit 548d729
Show file tree
Hide file tree
Showing 11 changed files with 75 additions and 21 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

# Image URL to use all building/pushing image targets
IMG ?= registry.baidubce.com/kuizhiqing/paddle-operator:v1
IMG ?= registry.baidubce.com/paddle-operator/controller:v1
# Produce CRDs that work back to Kubernetes 1.11 (no version conversion)
CRD_OPTIONS ?= "crd:maxDescLen=0,trivialVersions=true,preserveUnknownFields=false"

Expand Down
2 changes: 1 addition & 1 deletion charts/paddle-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

image: registry.baidubce.com/kuizhiqing/paddle-operator:v1
image: registry.baidubce.com/paddle-operator/controller:v1

controllernamespace: paddle-system
jobnamespace: paddle-system
2 changes: 1 addition & 1 deletion config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,5 @@ configMapGenerator:

images:
- name: controller
newName: registry.baidubce.com/kuizhiqing/paddle-operator
newName: registry.baidubce.com/paddle-operator/controller
newTag: v1
24 changes: 24 additions & 0 deletions controllers/paddlejob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,30 @@ func (r *PaddleJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
}
}

if len(pdj.Status.PS.Refs) == pdj.Spec.PS.Replicas && len(pdj.Status.Worker.Refs) == pdj.Spec.Worker.Replicas {
if pdj.Spec.Intranet == pdv1.Service {
if len(pdj.Status.PS.Refs)+len(pdj.Status.Worker.Refs) != len(svcs.Items) {
return ctrl.Result{}, nil
}
}
if err := r.Get(ctx, types.NamespacedName{Name: pdj.Name, Namespace: pdj.Namespace}, &corev1.ConfigMap{}); err == nil || !apierrors.IsNotFound(err) {
return ctrl.Result{}, err
}
cm := constructConfigMap(&pdj, childPods)
if cm == nil {
return ctrl.Result{Requeue: true}, nil
}
if err := ctrl.SetControllerReference(&pdj, cm, r.Scheme); err != nil {
log.Error(err, "make reference failed")
return ctrl.Result{Requeue: true}, nil
}
err := r.createResource(ctx, &pdj, cm)
if apierrors.IsConflict(err) {
return ctrl.Result{Requeue: true}, nil
}
return ctrl.Result{}, err
}

if len(pdj.Status.PS.Refs) == pdj.Spec.PS.Replicas && len(pdj.Status.Worker.Refs) == pdj.Spec.Worker.Replicas {
if err := r.Get(ctx, types.NamespacedName{Name: pdj.Name, Namespace: pdj.Namespace}, &corev1.ConfigMap{}); err == nil || !apierrors.IsNotFound(err) {
return ctrl.Result{}, err
Expand Down
9 changes: 8 additions & 1 deletion controllers/paddlejob_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,14 @@ func constructPod(pdj *pdv1.PaddleJob, resType string, idx int) (pod *corev1.Pod
if pdj.Spec.Intranet == pdv1.Service {
pod.Spec.Containers[0].Ports = append(pod.Spec.Containers[0].Ports, corev1.ContainerPort{ContainerPort: pdv1.PADDLE_PORT})
}
pod.Spec.RestartPolicy = "Never"

if pod.Spec.RestartPolicy == "" {
if resType == pdv1.ResourceWorker && pdj.Spec.Intranet == pdv1.Service {
pod.Spec.RestartPolicy = "OnFailure"
} else {
pod.Spec.RestartPolicy = "Never"
}
}

return pod
}
Expand Down
29 changes: 29 additions & 0 deletions deploy/examples/resnet.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Example PaddleJob: runs two worker replicas of the demo-resnet image.
# Each worker starts train_fleet.py through paddle.distributed.launch,
# requests one NVIDIA GPU, and mounts a memory-backed emptyDir at /dev/shm.
apiVersion: batch.paddlepaddle.org/v1
kind: PaddleJob
metadata:
  name: resnet
spec:
  cleanPodPolicy: Never
  worker:
    replicas: 2
    template:
      spec:
        containers:
          - name: resnet
            image: registry.baidubce.com/paddle-operator/demo-resnet:v1
            command:
              - python
            args:
              - "-m"
              - "paddle.distributed.launch"
              - "train_fleet.py"
            volumeMounts:
              # Mounts the memory-backed volume declared under `volumes` below.
              - mountPath: /dev/shm
                name: dshm
            resources:
              limits:
                nvidia.com/gpu: 1
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
7 changes: 2 additions & 5 deletions deploy/examples/wide_and_deep.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,13 @@ spec:
replicas: 2
template:
spec:
restartPolicy: "Never"
schedulerName: volcano
containers:
- name: paddle
image: registry.baidubce.com/kuizhiqing/demo-wide-and-deep:v1
image: registry.baidubce.com/paddle-operator/demo-wide-and-deep:v1
ps:
replicas: 2
template:
spec:
restartPolicy: "Never"
containers:
- name: paddle
image: registry.baidubce.com/kuizhiqing/demo-wide-and-deep:v1
image: registry.baidubce.com/paddle-operator/demo-wide-and-deep:v1
8 changes: 3 additions & 5 deletions deploy/examples/wide_and_deep_service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,19 @@ kind: PaddleJob
metadata:
name: wide-ande-deep-service
spec:
intranet: Service
intranet: Service # default: PodIP
cleanPodPolicy: Nerver
worker:
replicas: 2
template:
spec:
restartPolicy: "Never"
containers:
- name: paddle
image: registry.baidubce.com/kuizhiqing/demo-wide-and-deep:v1
image: registry.baidubce.com/paddle-operator/demo-wide-and-deep:v1
ps:
replicas: 2
template:
spec:
restartPolicy: "Never"
containers:
- name: paddle
image: registry.baidubce.com/kuizhiqing/demo-wide-and-deep:v1
image: registry.baidubce.com/paddle-operator/demo-wide-and-deep:v1
9 changes: 4 additions & 5 deletions deploy/examples/wide_and_deep_volcano.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,23 @@ spec:
replicas: 2
template:
spec:
restartPolicy: "Never"
schedulerName: volcano
containers:
- name: paddle
image: registry.baidubce.com/kuizhiqing/demo-wide-and-deep:v1
image: registry.baidubce.com/paddle-operator/demo-wide-and-deep:v1
ps:
replicas: 2
template:
spec:
restartPolicy: "Never"
containers:
- name: paddle
image: registry.baidubce.com/kuizhiqing/demo-wide-and-deep:v1
image: registry.baidubce.com/paddle-operator/demo-wide-and-deep:v1


---
apiVersion: scheduling.volcano.sh/v1beta1
kind: PodGroup
metadata:
name: wide-ande-deep
name: wide-ande-deep # name must be the same as the PaddleJob's name
spec:
minMember: 4
2 changes: 1 addition & 1 deletion deploy/v1/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ spec:
- --namespace=paddle-system
command:
- /manager
image: registry.baidubce.com/kuizhiqing/paddle-operator:v1
image: registry.baidubce.com/paddle-operator/controller:v1
imagePullPolicy: Always
livenessProbe:
httpGet:
Expand Down
2 changes: 1 addition & 1 deletion deploy/v1beta1/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ spec:
- --namespace=paddle-system
command:
- /manager
image: registry.baidubce.com/kuizhiqing/paddle-operator:v1
image: registry.baidubce.com/paddle-operator/controller:v1
imagePullPolicy: Always
livenessProbe:
httpGet:
Expand Down

0 comments on commit 548d729

Please sign in to comment.