Skip to content

Commit

Permalink
fix exhausted node metrics reporting in preemption (#20346)
Browse files Browse the repository at this point in the history
  • Loading branch information
gabivlj authored and philrenaud committed Apr 18, 2024
1 parent 0d925b3 commit e0d6ede
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .changelog/20346.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
scheduler: Record exhausted node metrics for devices when preemption fails to find an allocation to evict
```
2 changes: 2 additions & 0 deletions scheduler/rank.go
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,7 @@ OUTER:

if devicePreemptions == nil {
iter.ctx.Logger().Named("binpack").Debug("preemption not possible", "requested_device", req)
iter.ctx.Metrics().ExhaustedNode(option.Node, fmt.Sprintf("devices: %s", err))
netIdx.Release()
continue OUTER
}
Expand All @@ -460,6 +461,7 @@ OUTER:
offer, sumAffinities, err = devAllocator.AssignDevice(req)
if offer == nil {
iter.ctx.Logger().Named("binpack").Debug("unexpected error, unable to create device offer after considering preemption", "error", err)
iter.ctx.Metrics().ExhaustedNode(option.Node, fmt.Sprintf("devices: %s", err))
continue OUTER
}
}
Expand Down
110 changes: 110 additions & 0 deletions scheduler/rank_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1965,6 +1965,116 @@ func TestBinPackIterator_Devices(t *testing.T) {
}
}

// Tests that the bin pack iterator records an exhausted-node metric when a
// task-level device request cannot be satisfied, even after considering
// preemption, because the node's only matching device is already claimed.
func TestBinPackIterator_Device_Failure_With_Eviction(t *testing.T) {
	_, ctx := testContext(t)

	// One node exposing a single healthy GPU instance.
	gpuNode := &structs.Node{
		NodeResources: &structs.NodeResources{
			Processors: processorResources4096,
			Cpu:        legacyCpuResources4096,
			Memory: structs.NodeMemoryResources{
				MemoryMB: 4096,
			},
			Networks: []*structs.NetworkResource{},
			Devices: []*structs.NodeDeviceResource{
				{
					Vendor: "nvidia",
					Type:   "gpu",
					Name:   "SOME-GPU",
					Instances: []*structs.NodeDevice{
						{
							ID:                "1",
							Healthy:           true,
							HealthDescription: "healthy",
							Locality:          &structs.NodeDeviceLocality{},
						},
					},
				},
			},
		},
		ReservedResources: &structs.NodeReservedResources{
			Cpu: structs.NodeReservedCpuResources{
				CpuShares: 1024,
			},
			Memory: structs.NodeReservedMemoryResources{
				MemoryMB: 1024,
			},
		},
	}
	rankedNodes := []*RankedNode{{Node: gpuNode}}

	// Plan an allocation that already claims the node's only GPU.
	plannedAlloc := &structs.Allocation{
		AllocatedResources: &structs.AllocatedResources{
			Tasks: map[string]*structs.AllocatedTaskResources{
				"web": {
					Cpu: structs.AllocatedCpuResources{
						CpuShares: 2048,
					},
					Memory: structs.AllocatedMemoryResources{
						MemoryMB: 2048,
					},
					Networks: []*structs.NetworkResource{},
					Devices: []*structs.AllocatedDeviceResource{
						{
							Vendor:    "nvidia",
							Type:      "gpu",
							Name:      "SOME-GPU",
							DeviceIDs: []string{"1"},
						},
					},
				},
			},
			Shared: structs.AllocatedSharedResources{},
		},
	}
	ctx.Plan().NodeAllocation[gpuNode.ID] = []*structs.Allocation{plannedAlloc}

	// Task group whose single task asks for one nvidia/gpu device.
	tg := &structs.TaskGroup{
		EphemeralDisk: &structs.EphemeralDisk{},
		Networks:      []*structs.NetworkResource{},
		Tasks: []*structs.Task{
			{
				Name: "web",
				Resources: &structs.Resources{
					CPU:      1024,
					MemoryMB: 1024,
					Networks: []*structs.NetworkResource{},
					Devices: structs.ResourceDevices{
						{
							Name:  "nvidia/gpu",
							Count: 1,
						},
					},
				},
			},
		},
	}

	source := NewStaticRankIterator(ctx, rankedNodes)
	binPack := NewBinPackIterator(ctx, source, true, 0)
	binPack.SetTaskGroup(tg)
	binPack.SetSchedulerConfiguration(testSchedulerConfig)

	out := collectRanked(NewScoreNormalizationIterator(ctx, binPack))

	// We expect no placement: the one GPU is taken and preemption cannot
	// free it, so the node must be reported as exhausted on "devices".
	must := require.New(t)
	must.Len(out, 0)
	must.Equal(1, ctx.metrics.DimensionExhausted["devices: no devices match request"])
}

func TestJobAntiAffinity_PlannedAlloc(t *testing.T) {
_, ctx := testContext(t)
nodes := []*RankedNode{
Expand Down

0 comments on commit e0d6ede

Please sign in to comment.