Skip to content

Commit

Permalink
delete pdb when tfjob is terminated (#721)
Browse files Browse the repository at this point in the history
* delete pdb when tfjob is terminated

* fix some bus

* reorder the import package

* fix not go import warning
  • Loading branch information
ChanYiLin authored and k8s-ci-robot committed Jul 11, 2018
1 parent 7ebe995 commit 3fba4db
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 0 deletions.
7 changes: 7 additions & 0 deletions pkg/controller.v2/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,13 @@ func (tc *TFJobController) reconcileTFJobs(tfjob *tfv1alpha2.TFJob) error {
if err := tc.deletePodsAndServices(tfjob, pods); err != nil {
return err
}

if tc.config.enableGangScheduling {
if err := tc.deletePdb(tfjob); err != nil {
return err
}
}

// Initialize the status.
initializeTFReplicaStatuses(tfjob, tfv1alpha2.TFReplicaTypeWorker)
initializeTFReplicaStatuses(tfjob, tfv1alpha2.TFReplicaTypePS)
Expand Down
26 changes: 26 additions & 0 deletions pkg/controller.v2/controller_tfjob.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (

log "github.com/sirupsen/logrus"
"k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/scheme"

tfv1alpha2 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha2"
Expand Down Expand Up @@ -62,6 +64,30 @@ func (tc *TFJobController) updateTFJob(old, cur interface{}) {
tc.enqueueTFJob(cur)
}

func (tc *TFJobController) deletePdb(tfJob *tfv1alpha2.TFJob) error {

// Check the pdb exist or not
_, err := tc.kubeClientSet.PolicyV1beta1().PodDisruptionBudgets(tfJob.Namespace).Get(tfJob.Name, metav1.GetOptions{})
if err != nil && k8serrors.IsNotFound(err) {
return nil
}

tc.recorder.Event(tfJob, v1.EventTypeNormal, terminatedTFJobReason,
"TFJob is terminated, deleting pdb")

msg := fmt.Sprintf("Deleting pdb %s", tfJob.Name)
log.Info(msg)

if err := tc.kubeClientSet.PolicyV1beta1().PodDisruptionBudgets(tfJob.Namespace).Delete(tfJob.Name, &metav1.DeleteOptions{}); err != nil {
tc.recorder.Eventf(tfJob, v1.EventTypeWarning, "FailedDeletePdb", "Error deleting: %v", err)
return fmt.Errorf("unable to delete pdb: %v", err)
} else {
tc.recorder.Eventf(tfJob, v1.EventTypeNormal, "SuccessfulDeletePdb", "Deleted pdb: %v", tfJob.Name)
}

return nil
}

func (tc *TFJobController) deletePodsAndServices(tfJob *tfv1alpha2.TFJob, pods []*v1.Pod) error {
if len(pods) == 0 {
return nil
Expand Down

0 comments on commit 3fba4db

Please sign in to comment.