Skip to content

Commit

Permalink
add OwnerReferences to pdb (#565)
Browse files Browse the repository at this point in the history
* add OwnerReferences to pdb

* refactor the syncPdb code

* fix the bug when accessing the pdb name.

* add fake ObjectMeta uid in the testcase of TestPDBForGangScheduling

* fix error when no pdb should be created

* remove the fake uid in the testcase
  • Loading branch information
ChanYiLin authored and k8s-ci-robot committed May 5, 2018
1 parent bab9d9e commit f317193
Showing 1 changed file with 43 additions and 20 deletions.
63 changes: 43 additions & 20 deletions pkg/trainer/training.go
Original file line number Diff line number Diff line change
Expand Up @@ -442,22 +442,21 @@ func (j *TrainingJob) SchedulerName() string {
return j.job.Spec.SchedulerName
}

// SyncPdb will create a PDB for gang scheduling by kube-arbitrator.
func (j *TrainingJob) syncPdb() error {
nrReplicas := int32(0)
for _, r := range j.Replicas {
nrReplicas += *r.Spec.Replicas
}
// genPdbName returns the name used for this job's PodDisruptionBudget: the
// fixed prefix "tf-job-pdb-" followed by the job's own name.
func (j *TrainingJob) genPdbName() string {
	const pdbNamePrefix = "tf-job-pdb-"
	jobName := j.job.ObjectMeta.Name
	return pdbNamePrefix + jobName
}

if nrReplicas == 1 {
// gang scheduling isn't required by a non distributed training process
return nil
}
func (j *TrainingJob) CreatePdb(nrReplicas int32) (*v1beta1.PodDisruptionBudget, error) {

// Create the pdb.
minAvailable := intstr.FromInt(int(nrReplicas))
pdb := &v1beta1.PodDisruptionBudget{
ObjectMeta: meta_v1.ObjectMeta{
Name: "tf-job-pdb-" + j.job.ObjectMeta.Name,
Name: j.genPdbName(),
OwnerReferences: []meta_v1.OwnerReference{
helper.AsOwner(j.job),
},
},
Spec: v1beta1.PodDisruptionBudgetSpec{
MinAvailable: &minAvailable,
Expand All @@ -469,20 +468,44 @@ func (j *TrainingJob) syncPdb() error {
},
},
}
j.contextLogger.Infof("Creating PDB: %v", pdb.ObjectMeta.Name)
return j.KubeCli.PolicyV1beta1().PodDisruptionBudgets(j.job.ObjectMeta.Namespace).Create(pdb)
}

createdPdb, err := j.KubeCli.PolicyV1beta1().PodDisruptionBudgets(j.job.ObjectMeta.Namespace).Create(pdb)
if err != nil {
if k8s_errors.IsAlreadyExists(err) {
j.contextLogger.Infof("PDB: %v already exists.", "tf-job-pdb-"+j.job.ObjectMeta.Name)
return nil
// syncPdb makes sure a PodDisruptionBudget exists for this job so that it can
// be gang scheduled by kube-arbitrator.
//
// The replica counts of all replica sets are summed; when the total is exactly
// 1 no PDB is required and nothing is done. Otherwise the PDB named by
// genPdbName is looked up in the job's namespace and, if it is not found,
// created through CreatePdb. On success j.pdb holds the fetched or the newly
// created PDB.
//
// It returns a non-nil error when the lookup fails for any reason other than
// NotFound, or when the creation fails for any reason other than
// AlreadyExists (in which case a warning event is recorded on the job).
func (j *TrainingJob) syncPdb() error {
	nrReplicas := int32(0)
	for _, r := range j.Replicas {
		nrReplicas += *r.Spec.Replicas
	}

	if nrReplicas == 1 {
		// gang scheduling isn't required by a non distributed training process
		return nil
	}

	pdbName := j.genPdbName()
	pdb, err := j.KubeCli.PolicyV1beta1().PodDisruptionBudgets(j.job.ObjectMeta.Namespace).Get(pdbName, meta_v1.GetOptions{})
	if err == nil {
		// The PDB already exists; just remember it.
		j.pdb = pdb
		return nil
	}
	if !k8s_errors.IsNotFound(err) {
		// Any lookup failure other than NotFound is surfaced to the caller
		// instead of being silently dropped.
		return err
	}

	j.contextLogger.Infof("PDB: %v not found, create new one.", pdbName)

	// Plain assignment (not :=) so the created PDB is the one stored in j.pdb
	// below rather than a shadowed copy that goes out of scope.
	pdb, err = j.CreatePdb(nrReplicas)
	if err != nil {
		// If the pdb already exists do nothing.
		if k8s_errors.IsAlreadyExists(err) {
			j.contextLogger.Infof("PDB: %v already exists.", pdbName)
			return nil
		}
		j.recorder.Eventf(j.job, v1.EventTypeWarning, FailedCreateReason, "Error creating: %v", err)
		return err
	}

	j.recorder.Eventf(j.job, v1.EventTypeNormal, SuccessfulCreateReason, "Created PDB: %v", pdb.Name)
	j.pdb = pdb
	return nil
}

0 comments on commit f317193

Please sign in to comment.