Skip to content

Commit

Permalink
permit custom device requests
Browse files Browse the repository at this point in the history
Karpenter is negative towards custom device requests it is unaware of, assuming those cannot be scheduled.

fixes #1900

This changes the request handling to be scoped only to resource request that Karpenter is aware of and actively manages. The reasoning here is that it cannot influence those resource requests anyways, they come into existance by means of other concepts such as the device-plugin manager that even might be late bound and thus it is out of scope.
  • Loading branch information
universam1 committed Jul 21, 2022
1 parent 28c2e3d commit f12142f
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 24 deletions.
35 changes: 33 additions & 2 deletions pkg/controllers/provisioning/scheduling/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,12 @@ import (

"github.com/samber/lo"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"knative.dev/pkg/logging"

"github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5"
"github.com/aws/karpenter/pkg/cloudprovider"
"github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1"
"github.com/aws/karpenter/pkg/scheduling"
"github.com/aws/karpenter/pkg/utils/resources"
"github.com/aws/karpenter/pkg/utils/sets"
Expand All @@ -44,6 +47,33 @@ type Node struct {

var nodeID int64

// TODO: deduplicate with unexported func (*InstanceType).computeResources
func getWellKnownRequests() v1.ResourceList {
return v1.ResourceList{
v1.ResourceCPU: resource.Quantity{},
v1.ResourceMemory: resource.Quantity{},
v1.ResourceEphemeralStorage: resource.Quantity{},
v1.ResourcePods: resource.Quantity{},
v1alpha1.ResourceAWSPodENI: resource.Quantity{},
v1alpha1.ResourceNVIDIAGPU: resource.Quantity{},
v1alpha1.ResourceAMDGPU: resource.Quantity{},
v1alpha1.ResourceAWSNeuron: resource.Quantity{},
}
}

func filterWellKnownRequests(ctx context.Context, podResources v1.ResourceList) (filteredReq v1.ResourceList) {
wkReq := getWellKnownRequests()
filteredReq = make(v1.ResourceList, len(wkReq))
for k, v := range podResources {
if _, ok := wkReq[k]; ok {
filteredReq[k] = v
} else {
logging.FromContext(ctx).With(k, v).Info("Ignoring custom pod request")
}
}
return
}

func NewNode(nodeTemplate *scheduling.NodeTemplate, topology *Topology, daemonResources v1.ResourceList, instanceTypes []cloudprovider.InstanceType) *Node {
// Copy the template, and add hostname
hostname := fmt.Sprintf("hostname-placeholder-%04d", atomic.AddInt64(&nodeID, 1))
Expand Down Expand Up @@ -90,10 +120,11 @@ func (n *Node) Add(ctx context.Context, pod *v1.Pod) error {
nodeRequirements.Add(topologyRequirements)

// Check instance type combinations
requests := resources.Merge(n.requests, resources.RequestsForPods(pod))
podRequests := filterWellKnownRequests(ctx, resources.RequestsForPods(pod))
requests := resources.Merge(n.requests, podRequests)
instanceTypes := filterInstanceTypes(n.InstanceTypeOptions, nodeRequirements, requests)
if len(instanceTypes) == 0 {
return fmt.Errorf("no instance type satisfied resources %s and requirements %s", resources.String(resources.RequestsForPods(pod)), nodeRequirements)
return fmt.Errorf("no instance type satisfied resources %s and requirements %s", resources.String(podRequests), nodeRequirements)
}

// Update node
Expand Down
26 changes: 4 additions & 22 deletions pkg/controllers/provisioning/scheduling/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2912,8 +2912,8 @@ var _ = Describe("Instance Type Compatibility", func() {
})
It("should launch pods with resources that aren't on any single instance type on different instances", func() {
cloudProv.InstanceTypes = fake.InstanceTypes(5)
const fakeGPU1 = "karpenter.sh/super-great-gpu"
const fakeGPU2 = "karpenter.sh/even-better-gpu"
fakeGPU1 := v1alpha1.ResourceNVIDIAGPU
fakeGPU2 := v1alpha1.ResourceAMDGPU
cloudProv.InstanceTypes[0].Resources()[fakeGPU1] = resource.MustParse("25")
cloudProv.InstanceTypes[1].Resources()[fakeGPU2] = resource.MustParse("25")

Expand All @@ -2936,24 +2936,6 @@ var _ = Describe("Instance Type Compatibility", func() {
}
Expect(nodeNames.Len()).To(Equal(2))
})
It("should fail to schedule a pod with resources requests that aren't on a single instance type", func() {
cloudProv.InstanceTypes = fake.InstanceTypes(5)
const fakeGPU1 = "karpenter.sh/super-great-gpu"
const fakeGPU2 = "karpenter.sh/even-better-gpu"
cloudProv.InstanceTypes[0].Resources()[fakeGPU1] = resource.MustParse("25")
cloudProv.InstanceTypes[1].Resources()[fakeGPU2] = resource.MustParse("25")

ExpectApplied(ctx, env.Client, provisioner)
pods := ExpectProvisioned(ctx, env.Client, controller,
test.UnschedulablePod(test.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Limits: v1.ResourceList{
fakeGPU1: resource.MustParse("1"),
fakeGPU2: resource.MustParse("1")},
},
}))
ExpectNotScheduled(ctx, env.Client, pods[0])
})
Context("Provider Specific Labels", func() {
It("should filter instance types that match labels", func() {
cloudProv.InstanceTypes = fake.InstanceTypes(5)
Expand Down Expand Up @@ -3288,8 +3270,8 @@ var _ = Describe("Binpacking", func() {
pod := ExpectProvisioned(ctx, env.Client, controller,
test.UnschedulablePod(test.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{"foo.com/weird-resources": resource.MustParse("0")},
Limits: v1.ResourceList{"foo.com/weird-resources": resource.MustParse("0")},
Requests: v1.ResourceList{v1alpha1.ResourceAWSNeuron: resource.MustParse("0")},
Limits: v1.ResourceList{v1alpha1.ResourceAWSNeuron: resource.MustParse("0")},
},
}))
// requesting a resource of quantity zero of a type unsupported by any instance is fine
Expand Down

0 comments on commit f12142f

Please sign in to comment.