diff --git a/pkg/scheduler/api/job_info.go b/pkg/scheduler/api/job_info.go
index efb0316e96..859fc03d35 100644
--- a/pkg/scheduler/api/job_info.go
+++ b/pkg/scheduler/api/job_info.go
@@ -610,7 +610,7 @@ func (ji JobInfo) String() string {
 
 // FitError returns detailed information on why a job's task failed to fit on
 // each available node
-func (ji *JobInfo) FitError() string {
+func (ji *JobInfo) FitError(nodeMap map[string]*NodeInfo) string {
 	sortReasonsHistogram := func(reasons map[string]int) []string {
 		reasonStrings := []string{}
 		for k, v := range reasons {
@@ -620,6 +620,31 @@ func (ji *JobInfo) FitError() string {
 		return reasonStrings
 	}
 
+	getUnavailableResources := func(nodeMap map[string]*NodeInfo, tasks tasksMap) string {
+		if nodeMap == nil {
+			return ""
+		}
+		var unavailableResources []string
+		seenResources := make(map[string]bool)
+		for uid := range tasks {
+			nodeName := tasks[uid].NodeName
+			if nodeName != "" {
+				taskResource := tasks[uid].InitResreq
+				nodeResource := nodeMap[nodeName].Allocatable
+				for _, rn := range taskResource.ResourceNames() {
+					if nodeResource.Get(rn) < taskResource.Get(rn) {
+						msg := fmt.Sprintf("Insufficient resource %s in %s: required %f available %f", rn.String(), nodeName, taskResource.Get(rn), nodeResource.Get(rn))
+						if !seenResources[msg] {
+							unavailableResources = append(unavailableResources, msg)
+							seenResources[msg] = true
+						}
+					}
+				}
+			}
+		}
+		return strings.Join(unavailableResources, ", ")
+	}
+
 	// Stat histogram for all tasks of the job
 	reasons := make(map[string]int)
 	for status, taskMap := range ji.TaskStatusIndex {
@@ -630,12 +655,13 @@
 
 	// Stat histogram for pending tasks only
 	reasons = make(map[string]int)
+	resourceError := getUnavailableResources(nodeMap, ji.Tasks)
 	for uid := range ji.TaskStatusIndex[Pending] {
 		reason, _ := ji.TaskSchedulingReason(uid)
 		reasons[reason]++
 	}
 	if len(reasons) > 0 {
-		reasonMsg += "; " + fmt.Sprintf("%s: %s", Pending.String(), strings.Join(sortReasonsHistogram(reasons), ", "))
+		reasonMsg += "; " + fmt.Sprintf("%s: %s", Pending.String(), strings.Join(sortReasonsHistogram(reasons), ", "+resourceError))
 	}
 	return reasonMsg
 }
diff --git a/pkg/scheduler/api/job_info_test.go b/pkg/scheduler/api/job_info_test.go
index e4d0953283..7df264091d 100644
--- a/pkg/scheduler/api/job_info_test.go
+++ b/pkg/scheduler/api/job_info_test.go
@@ -273,7 +273,7 @@ func TestTaskSchedulingReason(t *testing.T) {
 			task.Status = Pending
 			job.TaskStatusIndex[Pending][task.UID] = task
 		}
-		job.JobFitErrors = job.FitError()
+		job.JobFitErrors = job.FitError(nil)
 
 		// assert
 		for uid, exp := range test.expected {
diff --git a/pkg/scheduler/cache/cache.go b/pkg/scheduler/cache/cache.go
index 1eb1558287..89294cd939 100644
--- a/pkg/scheduler/cache/cache.go
+++ b/pkg/scheduler/cache/cache.go
@@ -1212,7 +1212,7 @@ func (sc *SchedulerCache) RecordJobStatusEvent(job *schedulingapi.JobInfo) {
 		msg := fmt.Sprintf("%v/%v tasks in gang unschedulable: %v",
 			len(job.TaskStatusIndex[schedulingapi.Pending]),
 			len(job.Tasks),
-			job.FitError())
+			job.FitError(sc.Nodes))
 		sc.recordPodGroupEvent(job.PodGroup, v1.EventTypeWarning, string(scheduling.PodGroupUnschedulableType), msg)
 	} else {
 		sc.recordPodGroupEvent(job.PodGroup, v1.EventTypeNormal, string(scheduling.PodGroupScheduled), string(scheduling.PodGroupReady))
diff --git a/pkg/scheduler/plugins/gang/gang.go b/pkg/scheduler/plugins/gang/gang.go
index 5ceb3ace40..c5cce0a54d 100644
--- a/pkg/scheduler/plugins/gang/gang.go
+++ b/pkg/scheduler/plugins/gang/gang.go
@@ -180,7 +180,7 @@ func (gp *gangPlugin) OnSessionClose(ssn *framework.Session) {
 			}
 			unreadyTaskCount = job.MinAvailable - schedulableTaskNum()
 			msg := fmt.Sprintf("%v/%v tasks in gang unschedulable: %v",
-				unreadyTaskCount, len(job.Tasks), job.FitError())
+				unreadyTaskCount, len(job.Tasks), job.FitError(ssn.Nodes))
 			job.JobFitErrors = msg
 			unScheduleJobCount++
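
For reference, here is a minimal, self-contained sketch of the shortfall check that the new `getUnavailableResources` closure performs. The `resources`, `node`, and `task` types below are simplified stand-ins for illustration only, not the scheduler's actual `Resource`/`NodeInfo`/`TaskInfo` API; the real closure operates on `tasksMap` and `map[string]*NodeInfo` inside `FitError`.

```go
package main

import (
	"fmt"
	"strings"
)

// Simplified stand-ins for the scheduler types; the real implementation reads
// api.TaskInfo.InitResreq and api.NodeInfo.Allocatable.
type resources map[string]float64

type node struct{ allocatable resources }

type task struct {
	nodeName string
	request  resources
}

// unavailableResources mirrors the getUnavailableResources closure: for every
// task already bound to a node, report each resource whose request exceeds the
// node's allocatable amount, de-duplicating identical messages.
func unavailableResources(nodes map[string]*node, tasks []task) string {
	var msgs []string
	seen := map[string]bool{}
	for _, t := range tasks {
		if t.nodeName == "" {
			continue // task has no node assigned yet
		}
		n, ok := nodes[t.nodeName]
		if !ok {
			continue
		}
		for rn, req := range t.request {
			if avail := n.allocatable[rn]; avail < req {
				msg := fmt.Sprintf("Insufficient resource %s in %s: required %f available %f", rn, t.nodeName, req, avail)
				if !seen[msg] {
					msgs = append(msgs, msg)
					seen[msg] = true
				}
			}
		}
	}
	return strings.Join(msgs, ", ")
}

func main() {
	nodes := map[string]*node{
		"node-1": {allocatable: resources{"cpu": 4, "nvidia.com/gpu": 0}},
	}
	tasks := []task{{nodeName: "node-1", request: resources{"cpu": 2, "nvidia.com/gpu": 1}}}

	// Prints: Insufficient resource nvidia.com/gpu in node-1: required 1.000000 available 0.000000
	fmt.Println(unavailableResources(nodes, tasks))
}
```

Note that the patched closure compares the task request against the node's `Allocatable`, not its currently idle resources, so the appended message describes a capacity shortfall rather than transient contention.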