diff --git a/go.mod b/go.mod index 320a284563..fd90ff5fa6 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.19 require ( github.com/go-logr/logr v1.2.3 + github.com/google/go-cmp v0.5.8 github.com/kubeflow/common v0.4.6 github.com/onsi/ginkgo/v2 v2.1.6 github.com/onsi/gomega v1.20.1 @@ -49,7 +50,6 @@ require ( github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/protobuf v1.5.2 // indirect github.com/google/gnostic v0.5.7-v3refs // indirect - github.com/google/go-cmp v0.5.8 // indirect github.com/google/gofuzz v1.2.0 // indirect github.com/google/uuid v1.3.0 // indirect github.com/imdario/mergo v0.3.13 // indirect diff --git a/pkg/controller.v1/xgboost/status.go b/pkg/controller.v1/xgboost/status.go new file mode 100644 index 0000000000..f9f818c277 --- /dev/null +++ b/pkg/controller.v1/xgboost/status.go @@ -0,0 +1,31 @@ +package xgboost + +import ( + "fmt" + + "github.com/sirupsen/logrus" + + commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" + commonutil "github.com/kubeflow/common/pkg/util" +) + +func setRunningCondition(logger *logrus.Entry, jobName string, jobStatus *commonv1.JobStatus) error { + msg := fmt.Sprintf("XGBoostJob %s is running.", jobName) + if condition := findStatusCondition(jobStatus.Conditions, commonv1.JobRunning); condition == nil { + err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, xgboostJobRunningReason, msg) + if err != nil { + logger.Infof("Append job condition error: %v", err) + return err + } + } + return nil +} + +func findStatusCondition(conditions []commonv1.JobCondition, conditionType commonv1.JobConditionType) *commonv1.JobCondition { + for i := range conditions { + if conditions[i].Type == conditionType { + return &conditions[i] + } + } + return nil +} diff --git a/pkg/controller.v1/xgboost/status_test.go b/pkg/controller.v1/xgboost/status_test.go new file mode 100644 index 0000000000..672f194865 --- /dev/null +++ b/pkg/controller.v1/xgboost/status_test.go @@ -0,0 +1,124 @@ +package xgboost + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/sirupsen/logrus" + corev1 "k8s.io/api/core/v1" + + commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" +) + +var ignoreJobConditionsTimeOpts = cmpopts.IgnoreFields(commonv1.JobCondition{}, "LastUpdateTime", "LastTransitionTime") + +func TestSetRunningCondition(t *testing.T) { + jobName := "test-xbgoostjob" + logger := logrus.NewEntry(logrus.New()) + tests := map[string]struct { + input []commonv1.JobCondition + want []commonv1.JobCondition + }{ + "input doesn't have a running condition": { + input: []commonv1.JobCondition{ + { + Type: commonv1.JobSucceeded, + Reason: "XGBoostJobSucceeded", + Message: "XGBoostJob test-xbgoostjob is successfully completed.", + Status: corev1.ConditionTrue, + }, + }, + want: []commonv1.JobCondition{ + { + Type: commonv1.JobSucceeded, + Reason: "XGBoostJobSucceeded", + Message: "XGBoostJob test-xbgoostjob is successfully completed.", + Status: corev1.ConditionTrue, + }, + { + Type: commonv1.JobRunning, + Reason: "XGBoostJobRunning", + Message: "XGBoostJob test-xbgoostjob is running.", + Status: corev1.ConditionTrue, + }, + }, + }, + "input has a running condition": { + input: []commonv1.JobCondition{ + { + Type: commonv1.JobFailed, + Reason: "XGBoostJobFailed", + Message: "XGBoostJob test-sgboostjob is failed because 2 Worker replica(s) failed.", + Status: corev1.ConditionTrue, + }, + { + Type: commonv1.JobRunning, + Reason: "XGBoostJobRunning", + Message: "XGBoostJob test-xbgoostjob is running.", + Status: corev1.ConditionTrue, + }, + }, + want: []commonv1.JobCondition{ + { + Type: commonv1.JobFailed, + Reason: "XGBoostJobFailed", + Message: "XGBoostJob test-sgboostjob is failed because 2 Worker replica(s) failed.", + Status: corev1.ConditionTrue, + }, + { + Type: commonv1.JobRunning, + Reason: "XGBoostJobRunning", + Message: "XGBoostJob test-xbgoostjob is running.", + Status: corev1.ConditionTrue, + }, + }, + }, + } + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + jobStatus := &commonv1.JobStatus{Conditions: tc.input} + err := setRunningCondition(logger, jobName, jobStatus) + if err != nil { + t.Fatalf("failed to update job condition: %v", err) + } + if diff := cmp.Diff(tc.want, jobStatus.Conditions, ignoreJobConditionsTimeOpts); len(diff) != 0 { + t.Fatalf("Unexpected conditions from setRunningCondition (-want,+got):\n%s", diff) + } + }) + } +} + +func TestFindStatusCondition(t *testing.T) { + tests := map[string]struct { + conditions []commonv1.JobCondition + want *commonv1.JobCondition + }{ + "conditions have a running condition": { + conditions: []commonv1.JobCondition{ + { + Type: commonv1.JobRunning, + }, + }, + want: &commonv1.JobCondition{ + Type: commonv1.JobRunning, + }, + }, + "condition doesn't have a running condition": { + conditions: []commonv1.JobCondition{ + { + Type: commonv1.JobSucceeded, + }, + }, + want: nil, + }, + } + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + got := findStatusCondition(tc.conditions, commonv1.JobRunning) + if diff := cmp.Diff(tc.want, got, ignoreJobConditionsTimeOpts); len(diff) != 0 { + t.Fatalf("Unexpected jobConditions from findStatusCondition (-want,got):\n%s", diff) + } + }) + } +} diff --git a/pkg/controller.v1/xgboost/xgboostjob_controller.go b/pkg/controller.v1/xgboost/xgboostjob_controller.go index b78c91e9b1..e850e2f30b 100644 --- a/pkg/controller.v1/xgboost/xgboostjob_controller.go +++ b/pkg/controller.v1/xgboost/xgboostjob_controller.go @@ -403,15 +403,15 @@ func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[com if rtype == commonv1.ReplicaType(kubeflowv1.XGBoostJobReplicaTypeMaster) { if running > 0 { - msg := fmt.Sprintf("XGBoostJob %s is running.", xgboostJob.Name) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, xgboostJobRunningReason, msg) - if err != nil { - logger.Infof("Append job condition error: %v", err) + if err := setRunningCondition(logger, xgboostJob.Name, jobStatus); err != nil { return err } } // when master is succeed, the job is finished. if expected == 0 { + if err := setRunningCondition(logger, xgboostJob.Name, jobStatus); err != nil { + return err + } msg := fmt.Sprintf("XGBoostJob %s is successfully completed.", xgboostJob.Name) logrus.Info(msg) r.Recorder.Event(xgboostJob, corev1.EventTypeNormal, xgboostJobSucceededReason, msg) @@ -429,6 +429,9 @@ func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[com } } if failed > 0 { + if err := setRunningCondition(logger, xgboostJob.Name, jobStatus); err != nil { + return err + } if spec.RestartPolicy == commonv1.RestartPolicyExitCode { msg := fmt.Sprintf("XGBoostJob %s is restarting because %d %s replica(s) failed.", xgboostJob.Name, failed, rtype) r.Recorder.Event(xgboostJob, corev1.EventTypeWarning, xgboostJobRestartingReason, msg)