Skip to content

Commit

Permalink
Set a Running condition when the XGBoostJob is completed and doesn't …
Browse files Browse the repository at this point in the history
…have a Running condition

Signed-off-by: Yuki Iwai <[email protected]>
  • Loading branch information
tenzen-y committed Mar 28, 2023
1 parent b2ee1cb commit 1d45349
Show file tree
Hide file tree
Showing 4 changed files with 163 additions and 5 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.19

require (
github.com/go-logr/logr v1.2.3
github.com/google/go-cmp v0.5.8
github.com/kubeflow/common v0.4.6
github.com/onsi/ginkgo/v2 v2.1.6
github.com/onsi/gomega v1.20.1
Expand Down Expand Up @@ -49,7 +50,6 @@ require (
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.2 // indirect
github.com/google/gnostic v0.5.7-v3refs // indirect
github.com/google/go-cmp v0.5.8 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/imdario/mergo v0.3.13 // indirect
Expand Down
31 changes: 31 additions & 0 deletions pkg/controller.v1/xgboost/status.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package xgboost

import (
"fmt"

"github.com/sirupsen/logrus"

commonv1 "github.com/kubeflow/common/pkg/apis/common/v1"
commonutil "github.com/kubeflow/common/pkg/util"
)

func setRunningCondition(logger *logrus.Entry, jobName string, jobStatus *commonv1.JobStatus) error {
msg := fmt.Sprintf("XGBoostJob %s is running.", jobName)
if condition := findStatusCondition(jobStatus.Conditions, commonv1.JobRunning); condition == nil {
err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, xgboostJobRunningReason, msg)
if err != nil {
logger.Infof("Append job condition error: %v", err)
return err
}
}
return nil
}

func findStatusCondition(conditions []commonv1.JobCondition, conditionType commonv1.JobConditionType) *commonv1.JobCondition {
for i := range conditions {
if conditions[i].Type == conditionType {
return &conditions[i]
}
}
return nil
}
124 changes: 124 additions & 0 deletions pkg/controller.v1/xgboost/status_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package xgboost

import (
"testing"

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/sirupsen/logrus"
corev1 "k8s.io/api/core/v1"

commonv1 "github.com/kubeflow/common/pkg/apis/common/v1"
)

var ignoreJobConditionsTimeOpts = cmpopts.IgnoreFields(commonv1.JobCondition{}, "LastUpdateTime", "LastTransitionTime")

func TestSetRunningCondition(t *testing.T) {
jobName := "test-xbgoostjob"
logger := logrus.NewEntry(logrus.New())
tests := map[string]struct {
input []commonv1.JobCondition
want []commonv1.JobCondition
}{
"input doesn't have a running condition": {
input: []commonv1.JobCondition{
{
Type: commonv1.JobSucceeded,
Reason: "XGBoostJobSucceeded",
Message: "XGBoostJob test-xbgoostjob is successfully completed.",
Status: corev1.ConditionTrue,
},
},
want: []commonv1.JobCondition{
{
Type: commonv1.JobSucceeded,
Reason: "XGBoostJobSucceeded",
Message: "XGBoostJob test-xbgoostjob is successfully completed.",
Status: corev1.ConditionTrue,
},
{
Type: commonv1.JobRunning,
Reason: "XGBoostJobRunning",
Message: "XGBoostJob test-xbgoostjob is running.",
Status: corev1.ConditionTrue,
},
},
},
"input has a running condition": {
input: []commonv1.JobCondition{
{
Type: commonv1.JobFailed,
Reason: "XGBoostJobFailed",
Message: "XGBoostJob test-sgboostjob is failed because 2 Worker replica(s) failed.",
Status: corev1.ConditionTrue,
},
{
Type: commonv1.JobRunning,
Reason: "XGBoostJobRunning",
Message: "XGBoostJob test-xbgoostjob is running.",
Status: corev1.ConditionTrue,
},
},
want: []commonv1.JobCondition{
{
Type: commonv1.JobFailed,
Reason: "XGBoostJobFailed",
Message: "XGBoostJob test-sgboostjob is failed because 2 Worker replica(s) failed.",
Status: corev1.ConditionTrue,
},
{
Type: commonv1.JobRunning,
Reason: "XGBoostJobRunning",
Message: "XGBoostJob test-xbgoostjob is running.",
Status: corev1.ConditionTrue,
},
},
},
}
for name, tc := range tests {
t.Run(name, func(t *testing.T) {
jobStatus := &commonv1.JobStatus{Conditions: tc.input}
err := setRunningCondition(logger, jobName, jobStatus)
if err != nil {
t.Fatalf("failed to update job condition: %v", err)
}
if diff := cmp.Diff(tc.want, jobStatus.Conditions, ignoreJobConditionsTimeOpts); len(diff) != 0 {
t.Fatalf("Unexpected conditions from setRunningCondition (-want,+got):\n%s", diff)
}
})
}
}

func TestFindStatusCondition(t *testing.T) {
tests := map[string]struct {
conditions []commonv1.JobCondition
want *commonv1.JobCondition
}{
"conditions have a running condition": {
conditions: []commonv1.JobCondition{
{
Type: commonv1.JobRunning,
},
},
want: &commonv1.JobCondition{
Type: commonv1.JobRunning,
},
},
"condition doesn't have a running condition": {
conditions: []commonv1.JobCondition{
{
Type: commonv1.JobSucceeded,
},
},
want: nil,
},
}
for name, tc := range tests {
t.Run(name, func(t *testing.T) {
got := findStatusCondition(tc.conditions, commonv1.JobRunning)
if diff := cmp.Diff(tc.want, got, ignoreJobConditionsTimeOpts); len(diff) != 0 {
t.Fatalf("Unexpected jobConditions from findStatusCondition (-want,got):\n%s", diff)
}
})
}
}
11 changes: 7 additions & 4 deletions pkg/controller.v1/xgboost/xgboostjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -403,15 +403,15 @@ func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[com

if rtype == commonv1.ReplicaType(kubeflowv1.XGBoostJobReplicaTypeMaster) {
if running > 0 {
msg := fmt.Sprintf("XGBoostJob %s is running.", xgboostJob.Name)
err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, xgboostJobRunningReason, msg)
if err != nil {
logger.Infof("Append job condition error: %v", err)
if err := setRunningCondition(logger, xgboostJob.Name, jobStatus); err != nil {
return err
}
}
// when master is succeed, the job is finished.
if expected == 0 {
if err := setRunningCondition(logger, xgboostJob.Name, jobStatus); err != nil {
return err
}
msg := fmt.Sprintf("XGBoostJob %s is successfully completed.", xgboostJob.Name)
logrus.Info(msg)
r.Recorder.Event(xgboostJob, corev1.EventTypeNormal, xgboostJobSucceededReason, msg)
Expand All @@ -429,6 +429,9 @@ func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[com
}
}
if failed > 0 {
if err := setRunningCondition(logger, xgboostJob.Name, jobStatus); err != nil {
return err
}
if spec.RestartPolicy == commonv1.RestartPolicyExitCode {
msg := fmt.Sprintf("XGBoostJob %s is restarting because %d %s replica(s) failed.", xgboostJob.Name, failed, rtype)
r.Recorder.Event(xgboostJob, corev1.EventTypeWarning, xgboostJobRestartingReason, msg)
Expand Down

0 comments on commit 1d45349

Please sign in to comment.