Skip to content

Commit

Permalink
permit custom device requests
Browse files Browse the repository at this point in the history
Karpenter is negative towards custom device requests it is unaware of, assuming those cannot be scheduled.

fixes #1900

This changes the request handling to be scoped only to resource request that Karpenter is aware of and actively manages. The reasoning here is that it cannot influence those resource requests anyways, they come into existance by means of other concepts such as the device-plugin manager that even might be late bound and thus it is out of scope.
  • Loading branch information
universam1 committed Jul 22, 2022
1 parent 8ab2b20 commit b0a4520
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 27 deletions.
3 changes: 2 additions & 1 deletion pkg/controllers/provisioning/scheduling/inflightnode.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ func (n *InFlightNode) Add(ctx context.Context, pod *v1.Pod) error {

// check resource requests first since that's a pretty likely reason the pod won't schedule on an in-flight
// node, which at this point can't be increased in size
requests := resources.Merge(n.requests, resources.RequestsForPods(pod))
podRequests := state.FilterWellKnownRequests(ctx, resources.RequestsForPods(pod))
requests := resources.Merge(n.requests, podRequests)

if !resources.Fits(requests, n.available) {
return fmt.Errorf("exceeds node resources")
Expand Down
6 changes: 4 additions & 2 deletions pkg/controllers/provisioning/scheduling/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (

"github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5"
"github.com/aws/karpenter/pkg/cloudprovider"
"github.com/aws/karpenter/pkg/controllers/state"
"github.com/aws/karpenter/pkg/scheduling"
"github.com/aws/karpenter/pkg/utils/resources"
"github.com/aws/karpenter/pkg/utils/sets"
Expand Down Expand Up @@ -90,10 +91,11 @@ func (n *Node) Add(ctx context.Context, pod *v1.Pod) error {
nodeRequirements.Add(topologyRequirements)

// Check instance type combinations
requests := resources.Merge(n.requests, resources.RequestsForPods(pod))
podRequests := state.FilterWellKnownRequests(ctx, resources.RequestsForPods(pod))
requests := resources.Merge(n.requests, podRequests)
instanceTypes := filterInstanceTypes(n.InstanceTypeOptions, nodeRequirements, requests)
if len(instanceTypes) == 0 {
return fmt.Errorf("no instance type satisfied resources %s and requirements %s", resources.String(resources.RequestsForPods(pod)), nodeRequirements)
return fmt.Errorf("no instance type satisfied resources %s and requirements %s", resources.String(podRequests), nodeRequirements)
}

// Update node
Expand Down
37 changes: 15 additions & 22 deletions pkg/controllers/provisioning/scheduling/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2912,8 +2912,8 @@ var _ = Describe("Instance Type Compatibility", func() {
})
It("should launch pods with resources that aren't on any single instance type on different instances", func() {
cloudProv.InstanceTypes = fake.InstanceTypes(5)
const fakeGPU1 = "karpenter.sh/super-great-gpu"
const fakeGPU2 = "karpenter.sh/even-better-gpu"
fakeGPU1 := v1alpha1.ResourceNVIDIAGPU
fakeGPU2 := v1alpha1.ResourceAMDGPU
cloudProv.InstanceTypes[0].Resources()[fakeGPU1] = resource.MustParse("25")
cloudProv.InstanceTypes[1].Resources()[fakeGPU2] = resource.MustParse("25")

Expand All @@ -2936,24 +2936,6 @@ var _ = Describe("Instance Type Compatibility", func() {
}
Expect(nodeNames.Len()).To(Equal(2))
})
It("should fail to schedule a pod with resources requests that aren't on a single instance type", func() {
cloudProv.InstanceTypes = fake.InstanceTypes(5)
const fakeGPU1 = "karpenter.sh/super-great-gpu"
const fakeGPU2 = "karpenter.sh/even-better-gpu"
cloudProv.InstanceTypes[0].Resources()[fakeGPU1] = resource.MustParse("25")
cloudProv.InstanceTypes[1].Resources()[fakeGPU2] = resource.MustParse("25")

ExpectApplied(ctx, env.Client, provisioner)
pods := ExpectProvisioned(ctx, env.Client, controller,
test.UnschedulablePod(test.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Limits: v1.ResourceList{
fakeGPU1: resource.MustParse("1"),
fakeGPU2: resource.MustParse("1")},
},
}))
ExpectNotScheduled(ctx, env.Client, pods[0])
})
Context("Provider Specific Labels", func() {
It("should filter instance types that match labels", func() {
cloudProv.InstanceTypes = fake.InstanceTypes(5)
Expand Down Expand Up @@ -3288,13 +3270,24 @@ var _ = Describe("Binpacking", func() {
pod := ExpectProvisioned(ctx, env.Client, controller,
test.UnschedulablePod(test.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{"foo.com/weird-resources": resource.MustParse("0")},
Limits: v1.ResourceList{"foo.com/weird-resources": resource.MustParse("0")},
Requests: v1.ResourceList{v1alpha1.ResourceAWSNeuron: resource.MustParse("0")},
Limits: v1.ResourceList{v1alpha1.ResourceAWSNeuron: resource.MustParse("0")},
},
}))
// requesting a resource of quantity zero of a type unsupported by any instance is fine
ExpectScheduled(ctx, env.Client, pod[0])
})
It("should be neutral regarding custom, unknown resource requests", func() {
ExpectApplied(ctx, env.Client, provisioner)
pod := ExpectProvisioned(ctx, env.Client, controller,
test.UnschedulablePod(test.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{"smarter-device/fuse": resource.MustParse("1")},
Limits: v1.ResourceList{"smarter-device/fuse": resource.MustParse("1")},
},
}))
ExpectScheduled(ctx, env.Client, pod[0])
})
It("should not schedule pods that exceed every instance type's capacity", func() {
ExpectApplied(ctx, env.Client, provisioner)
pod := ExpectProvisioned(ctx, env.Client, controller, test.UnschedulablePod(
Expand Down
4 changes: 2 additions & 2 deletions pkg/controllers/state/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,8 @@ func (c *Cluster) populateResourceRequests(ctx context.Context, node *v1.Node, n
var daemonsetLimits []v1.ResourceList
for i := range pods.Items {
pod := &pods.Items[i]
requests := resources.RequestsForPods(pod)
podLimits := resources.LimitsForPods(pod)
requests := FilterWellKnownRequests(ctx, resources.RequestsForPods(pod))
podLimits := FilterWellKnownRequests(ctx, resources.LimitsForPods(pod))
podKey := client.ObjectKeyFromObject(pod)

n.podRequests[podKey] = requests
Expand Down
31 changes: 31 additions & 0 deletions pkg/controllers/state/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,14 @@ import (

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
"knative.dev/pkg/logging"
controllerruntime "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1"
)

var stateRetryPeriod = 1 * time.Minute
Expand Down Expand Up @@ -69,3 +72,31 @@ func (c *PodController) Register(ctx context.Context, m manager.Manager) error {
For(&v1.Pod{}).
Complete(c)
}

// TODO: deduplicate with unexported func (*InstanceType).computeResources
func getWellKnownRequests() v1.ResourceList {
return v1.ResourceList{
v1.ResourceCPU: resource.Quantity{},
v1.ResourceMemory: resource.Quantity{},
v1.ResourceStorage: resource.Quantity{},
v1.ResourceEphemeralStorage: resource.Quantity{},
v1.ResourcePods: resource.Quantity{},
v1alpha1.ResourceAWSPodENI: resource.Quantity{},
v1alpha1.ResourceNVIDIAGPU: resource.Quantity{},
v1alpha1.ResourceAMDGPU: resource.Quantity{},
v1alpha1.ResourceAWSNeuron: resource.Quantity{},
}
}

func FilterWellKnownRequests(ctx context.Context, podResources v1.ResourceList) (filteredReq v1.ResourceList) {
wkReq := getWellKnownRequests()
filteredReq = make(v1.ResourceList, len(wkReq))
for k, v := range podResources {
if _, ok := wkReq[k]; ok {
filteredReq[k] = v
} else {
logging.FromContext(ctx).With(k.String(), v.String()).Info("Ignoring custom pod request")
}
}
return
}

0 comments on commit b0a4520

Please sign in to comment.