diff --git a/charts/karpenter/crds/karpenter.sh_provisioners.yaml b/charts/karpenter/crds/karpenter.sh_provisioners.yaml index 1fb2d56e513e..a1feeedd2a2f 100644 --- a/charts/karpenter/crds/karpenter.sh_provisioners.yaml +++ b/charts/karpenter/crds/karpenter.sh_provisioners.yaml @@ -41,6 +41,13 @@ spec: Node properties are determined from a combination of provisioner and pod scheduling constraints. properties: + consolidation: + description: Consolidation are the consolidation parameters + properties: + enabled: + description: Enabled enables consolidation if it has been set + type: boolean + type: object kubeletConfiguration: description: KubeletConfiguration are options passed to the kubelet when provisioning nodes @@ -192,8 +199,8 @@ spec: will wait before attempting to delete a node, measured from when the node is detected to be empty. A Node is considered to be empty when it does not have pods scheduled to it, excluding daemonsets. - \n Termination due to underutilization is disabled if this field - is not set." + \n Termination due to no utilization is disabled if this field is + not set." format: int64 type: integer ttlSecondsUntilExpired: diff --git a/charts/karpenter/templates/clusterrole.yaml b/charts/karpenter/templates/clusterrole.yaml index c9541e46639a..65f872e87991 100644 --- a/charts/karpenter/templates/clusterrole.yaml +++ b/charts/karpenter/templates/clusterrole.yaml @@ -22,12 +22,15 @@ rules: - apiGroups: ["storage.k8s.io"] resources: ["storageclasses", "csinodes"] verbs: ["get", "watch", "list"] - - apiGroups: ["apps"] - resources: ["daemonsets"] + - apiGroups: ["apps", ""] + resources: ["daemonsets","replicasets", "replicationcontrollers"] verbs: ["list", "watch"] - apiGroups: ["admissionregistration.k8s.io"] resources: ["validatingwebhookconfigurations", "mutatingwebhookconfigurations"] verbs: ["get", "watch", "list"] + - apiGroups: [ "policy" ] + resources: [ "poddisruptionbudgets" ] + verbs: [ "get", "list", "watch" ] # Write - apiGroups: ["karpenter.sh"] resources: ["provisioners/status"] diff --git a/charts/karpenter/values.yaml b/charts/karpenter/values.yaml index 37ae6ce3bfd1..ca5cfd80ecd0 100644 --- a/charts/karpenter/values.yaml +++ b/charts/karpenter/values.yaml @@ -92,7 +92,9 @@ controller: # -- SecurityContext for the controller container. securityContext: {} # -- Additional environment variables for the controller pod. - env: [] + env: + - name: ENABLE_PROFILING + value: "true" # - name: AWS_REGION # value: eu-west-1 diff --git a/cmd/controller/main.go b/cmd/controller/main.go index a5437a13043d..e5ef79951817 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -22,6 +22,7 @@ import ( "github.com/go-logr/zapr" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/clock" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/client-go/kubernetes" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -43,6 +44,7 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider/registry" "github.com/aws/karpenter/pkg/config" "github.com/aws/karpenter/pkg/controllers" + "github.com/aws/karpenter/pkg/controllers/consolidation" "github.com/aws/karpenter/pkg/controllers/counter" metricspod "github.com/aws/karpenter/pkg/controllers/metrics/pod" metricsprovisioner "github.com/aws/karpenter/pkg/controllers/metrics/provisioner" @@ -112,16 +114,19 @@ func main() { } recorder := events.NewDedupeRecorder(events.NewRecorder(manager.GetEventRecorderFor(appName))) - cluster := state.NewCluster(cfg, manager.GetClient(), cloudProvider) + realClock := &clock.RealClock{} + cluster := state.NewCluster(realClock, cfg, manager.GetClient(), cloudProvider) + provisioner := provisioning.NewProvisioner(ctx, cfg, manager.GetClient(), clientSet.CoreV1(), recorder, cloudProvider, cluster) + consolidation.NewController(ctx, realClock, manager.GetClient(), provisioner, cloudProvider, recorder, cluster, manager.Elected()) statemetrics.StartMetricScraper(ctx, cluster) if err := manager.RegisterControllers(ctx, - provisioning.NewController(ctx, cfg, manager.GetClient(), clientSet.CoreV1(), recorder, cloudProvider, cluster), + provisioning.NewController(manager.GetClient(), provisioner, recorder), state.NewNodeController(manager.GetClient(), cluster), state.NewPodController(manager.GetClient(), cluster), - node.NewController(manager.GetClient(), cloudProvider, cluster), - termination.NewController(ctx, manager.GetClient(), clientSet.CoreV1(), recorder, cloudProvider), + termination.NewController(ctx, realClock, manager.GetClient(), clientSet.CoreV1(), recorder, cloudProvider), + node.NewController(realClock, manager.GetClient(), cloudProvider, cluster), metricspod.NewController(manager.GetClient()), metricsprovisioner.NewController(manager.GetClient()), counter.NewController(manager.GetClient(), cluster), diff --git a/hack/docs/metrics_gen_docs.go b/hack/docs/metrics_gen_docs.go index c5300db3029f..ca0e878ec690 100644 --- a/hack/docs/metrics_gen_docs.go +++ b/hack/docs/metrics_gen_docs.go @@ -163,7 +163,7 @@ func handleVariableDeclaration(v *ast.GenDecl) []metricInfo { if funcPkg != "prometheus" { continue } - if len(ce.Args) != 2 { + if len(ce.Args) == 0 { continue } arg := ce.Args[0].(*ast.CompositeLit) diff --git a/pkg/apis/provisioning/v1alpha5/labels.go b/pkg/apis/provisioning/v1alpha5/labels.go index df087023ad36..d6f8ac549465 100644 --- a/pkg/apis/provisioning/v1alpha5/labels.go +++ b/pkg/apis/provisioning/v1alpha5/labels.go @@ -30,10 +30,11 @@ var ( // Karpenter specific domains and labels KarpenterLabelDomain = "karpenter.sh" - ProvisionerNameLabelKey = Group + "/provisioner-name" - DoNotEvictPodAnnotationKey = Group + "/do-not-evict" - EmptinessTimestampAnnotationKey = Group + "/emptiness-timestamp" - TerminationFinalizer = Group + "/termination" + ProvisionerNameLabelKey = Group + "/provisioner-name" + DoNotEvictPodAnnotationKey = Group + "/do-not-evict" + DoNotConsolidateNodeAnnotationKey = KarpenterLabelDomain + "/do-not-consolidate" + EmptinessTimestampAnnotationKey = Group + "/emptiness-timestamp" + TerminationFinalizer = Group + "/termination" LabelCapacityType = KarpenterLabelDomain + "/capacity-type" LabelNodeInitialized = KarpenterLabelDomain + "/initialized" @@ -64,7 +65,7 @@ var ( ) // RestrictedLabels are labels that should not be used - // because they may interfer the internal provisioning logic. + // because they may interfere with the internal provisioning logic. RestrictedLabels = sets.NewString( EmptinessTimestampAnnotationKey, v1.LabelHostname, diff --git a/pkg/apis/provisioning/v1alpha5/provisioner.go b/pkg/apis/provisioning/v1alpha5/provisioner.go index dd4243ab1d90..f513051f8026 100644 --- a/pkg/apis/provisioning/v1alpha5/provisioner.go +++ b/pkg/apis/provisioning/v1alpha5/provisioner.go @@ -57,7 +57,7 @@ type ProvisionerSpec struct { // detected to be empty. A Node is considered to be empty when it does not // have pods scheduled to it, excluding daemonsets. // - // Termination due to underutilization is disabled if this field is not set. + // Termination due to no utilization is disabled if this field is not set. // +optional TTLSecondsAfterEmpty *int64 `json:"ttlSecondsAfterEmpty,omitempty"` // TTLSecondsUntilExpired is the number of seconds the controller will wait @@ -70,6 +70,14 @@ type ProvisionerSpec struct { TTLSecondsUntilExpired *int64 `json:"ttlSecondsUntilExpired,omitempty"` // Limits define a set of bounds for provisioning capacity. Limits *Limits `json:"limits,omitempty"` + // Consolidation are the consolidation parameters + // +optional + Consolidation *Consolidation `json:"consolidation,omitempty"` +} + +type Consolidation struct { + // Enabled enables consolidation if it has been set + Enabled *bool `json:"enabled,omitempty"` } // +kubebuilder:object:generate=false diff --git a/pkg/apis/provisioning/v1alpha5/provisioner_validation.go b/pkg/apis/provisioning/v1alpha5/provisioner_validation.go index f5067fa50112..b30c89464ded 100644 --- a/pkg/apis/provisioning/v1alpha5/provisioner_validation.go +++ b/pkg/apis/provisioning/v1alpha5/provisioner_validation.go @@ -18,6 +18,8 @@ import ( "context" "fmt" + "github.com/aws/aws-sdk-go/aws" + "go.uber.org/multierr" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/sets" @@ -27,8 +29,8 @@ import ( ) var ( - SupportedNodeSelectorOps sets.String = sets.NewString(string(v1.NodeSelectorOpIn), string(v1.NodeSelectorOpNotIn), string(v1.NodeSelectorOpExists), string(v1.NodeSelectorOpDoesNotExist)) - SupportedProvisionerOps sets.String = sets.NewString(string(v1.NodeSelectorOpIn), string(v1.NodeSelectorOpNotIn), string(v1.NodeSelectorOpExists)) + SupportedNodeSelectorOps = sets.NewString(string(v1.NodeSelectorOpIn), string(v1.NodeSelectorOpNotIn), string(v1.NodeSelectorOpExists), string(v1.NodeSelectorOpDoesNotExist)) + SupportedProvisionerOps = sets.NewString(string(v1.NodeSelectorOpIn), string(v1.NodeSelectorOpNotIn), string(v1.NodeSelectorOpExists)) ) const ( @@ -62,6 +64,10 @@ func (s *ProvisionerSpec) validateTTLSecondsAfterEmpty() (errs *apis.FieldError) if ptr.Int64Value(s.TTLSecondsAfterEmpty) < 0 { return errs.Also(apis.ErrInvalidValue("cannot be negative", "ttlSecondsAfterEmpty")) } + // TTLSecondsAfterEmpty and consolidation are mutually exclusive + if s.Consolidation != nil && aws.BoolValue(s.Consolidation.Enabled) && s.TTLSecondsAfterEmpty != nil { + return errs.Also(apis.ErrGeneric("must not set the field if consolidation is enabled", "ttlSecondsAfterEmpty")) + } return errs } diff --git a/pkg/apis/provisioning/v1alpha5/suite_test.go b/pkg/apis/provisioning/v1alpha5/suite_test.go index 4b1afa522d5c..98aa4b3e7a88 100644 --- a/pkg/apis/provisioning/v1alpha5/suite_test.go +++ b/pkg/apis/provisioning/v1alpha5/suite_test.go @@ -16,6 +16,7 @@ package v1alpha5 import ( "context" + "github.com/aws/aws-sdk-go/aws" "strings" "testing" @@ -64,6 +65,25 @@ var _ = Describe("Validation", func() { provisioner.Spec.TTLSecondsAfterEmpty = nil Expect(provisioner.Validate(ctx)).To(Succeed()) }) + It("should succeed on a valid empty ttl", func() { + provisioner.Spec.TTLSecondsAfterEmpty = aws.Int64(30) + Expect(provisioner.Validate(ctx)).To(Succeed()) + }) + It("should fail if both consolidation and TTLSecondsAfterEmpty are enabled", func() { + provisioner.Spec.TTLSecondsAfterEmpty = ptr.Int64(30) + provisioner.Spec.Consolidation = &Consolidation{Enabled: aws.Bool(true)} + Expect(provisioner.Validate(ctx)).ToNot(Succeed()) + }) + It("should succeed if consolidation is off and TTLSecondsAfterEmpty is set", func() { + provisioner.Spec.TTLSecondsAfterEmpty = ptr.Int64(30) + provisioner.Spec.Consolidation = &Consolidation{Enabled: aws.Bool(false)} + Expect(provisioner.Validate(ctx)).To(Succeed()) + }) + It("should succeed if consolidation is on and TTLSecondsAfterEmpty is not set", func() { + provisioner.Spec.TTLSecondsAfterEmpty = nil + provisioner.Spec.Consolidation = &Consolidation{Enabled: aws.Bool(true)} + Expect(provisioner.Validate(ctx)).To(Succeed()) + }) Context("Limits", func() { It("should allow undefined limits", func() { diff --git a/pkg/apis/provisioning/v1alpha5/zz_generated.deepcopy.go b/pkg/apis/provisioning/v1alpha5/zz_generated.deepcopy.go index a701be1bc31d..ab6afff339d4 100644 --- a/pkg/apis/provisioning/v1alpha5/zz_generated.deepcopy.go +++ b/pkg/apis/provisioning/v1alpha5/zz_generated.deepcopy.go @@ -25,6 +25,26 @@ import ( "knative.dev/pkg/apis" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Consolidation) DeepCopyInto(out *Consolidation) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Consolidation. +func (in *Consolidation) DeepCopy() *Consolidation { + if in == nil { + return nil + } + out := new(Consolidation) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { *out = *in @@ -207,6 +227,11 @@ func (in *ProvisionerSpec) DeepCopyInto(out *ProvisionerSpec) { *out = new(Limits) (*in).DeepCopyInto(*out) } + if in.Consolidation != nil { + in, out := &in.Consolidation, &out.Consolidation + *out = new(Consolidation) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProvisionerSpec. diff --git a/pkg/cloudprovider/aws/cloudprovider.go b/pkg/cloudprovider/aws/cloudprovider.go index 897e0a81f4cb..a78c7e1812f6 100644 --- a/pkg/cloudprovider/aws/cloudprovider.go +++ b/pkg/cloudprovider/aws/cloudprovider.go @@ -29,24 +29,23 @@ import ( "github.com/aws/aws-sdk-go/service/ec2" "github.com/aws/aws-sdk-go/service/ssm" "github.com/patrickmn/go-cache" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/transport" + "knative.dev/pkg/logging" + "knative.dev/pkg/ptr" + k8sClient "sigs.k8s.io/controller-runtime/pkg/client" awsv1alpha1 "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/cloudprovider" "github.com/aws/karpenter/pkg/cloudprovider/aws/amifamily" "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" + "github.com/aws/karpenter/pkg/scheduling" "github.com/aws/karpenter/pkg/utils/functional" "github.com/aws/karpenter/pkg/utils/injection" "github.com/aws/karpenter/pkg/utils/project" - - k8sClient "sigs.k8s.io/controller-runtime/pkg/client" - - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/transport" - "knative.dev/pkg/logging" - "knative.dev/pkg/ptr" ) const ( @@ -67,6 +66,8 @@ func init() { v1alpha5.NormalizedLabels = functional.UnionStringMaps(v1alpha5.NormalizedLabels, map[string]string{"topology.ebs.csi.aws.com/zone": v1.LabelTopologyZone}) } +var _ cloudprovider.CloudProvider = (*CloudProvider)(nil) + type CloudProvider struct { instanceTypeProvider *InstanceTypeProvider subnetProvider *SubnetProvider @@ -147,6 +148,31 @@ func (c *CloudProvider) Delete(ctx context.Context, node *v1.Node) error { return c.instanceProvider.Terminate(ctx, node) } +func (c *CloudProvider) ShouldConsolidate(existingNode *v1.Node, newNodeRequirements scheduling.Requirements, instanceTypeOptions []cloudprovider.InstanceType) bool { + // no replacement node is being launched, so we always prefer to delete a node + if newNodeRequirements == nil { + return true + } + + // If the entire list of replacement instance types are de-prioritized, then don't perform the node replacement. + // This prevents us from doing something like a c6g.large -> c6g.medium -> a1.medium. + allAreDeprioritized := true + for _, it := range instanceTypeOptions { + it, ok := it.(*InstanceType) + if !ok { + continue + } + allAreDeprioritized = allAreDeprioritized && isDeprioritized(it) + } + if allAreDeprioritized { + return false + } + + // We currently don't replace spot nodes as we don't know if the replacement node is less available than the node we already + // have. + return existingNode.Labels[v1alpha5.LabelCapacityType] != v1alpha1.CapacityTypeSpot +} + // Name returns the CloudProvider implementation name. func (c *CloudProvider) Name() string { return "aws" diff --git a/pkg/cloudprovider/aws/instance.go b/pkg/cloudprovider/aws/instance.go index 727d1414405c..49d7f2589428 100644 --- a/pkg/cloudprovider/aws/instance.go +++ b/pkg/cloudprovider/aws/instance.go @@ -351,6 +351,32 @@ func (p *InstanceProvider) getCapacityType(nodeRequest *cloudprovider.NodeReques return v1alpha1.CapacityTypeOnDemand } +func isDeprioritized(it *InstanceType) bool { + // allow regular instance families and prioritize all others last + if !functional.HasAnyPrefix(*it.InstanceType, "m", "c", "r", "a", "t", "i") { + return true + } + + // deprioritize some older instance types including 1st/2nd gen burstable, compute and graviton + if functional.HasAnyPrefix(*it.InstanceType, "t1", "t2", "a1", "c1") { + return true + } + + // deprioritize metal + if aws.BoolValue(it.BareMetal) { + return true + } + itRes := it.Resources() + + // deprioritize GPU instance types + if !resources.IsZero(itRes[v1alpha1.ResourceAWSNeuron]) || + !resources.IsZero(itRes[v1alpha1.ResourceAMDGPU]) || + !resources.IsZero(itRes[v1alpha1.ResourceNVIDIAGPU]) { + return true + } + return false +} + // filterInstanceTypes is used to eliminate less desirable instance types (like GPUs) from the list of possible instance types when // a set of more appropriate instance types would work. If a set of more desirable instance types is not found, then the original slice // of instance types are returned. @@ -358,24 +384,10 @@ func (p *InstanceProvider) filterInstanceTypes(instanceTypes []cloudprovider.Ins var genericInstanceTypes []cloudprovider.InstanceType for _, it := range instanceTypes { it := it.(*InstanceType) - // allow regular instance families and prioritize all others last - if !functional.HasAnyPrefix(*it.InstanceType, "m", "c", "r", "a", "t", "i") { - continue - } - // deprioritize some older instance types including 1st/2nd gen burstable, compute and graviton - if functional.HasAnyPrefix(*it.InstanceType, "t1", "t2", "a1", "c1") { - continue - } - // deprioritize metal - if aws.BoolValue(it.BareMetal) { - continue - } - itRes := it.Resources() - if !resources.IsZero(itRes[v1alpha1.ResourceAWSNeuron]) || - !resources.IsZero(itRes[v1alpha1.ResourceAMDGPU]) || - !resources.IsZero(itRes[v1alpha1.ResourceNVIDIAGPU]) { + if isDeprioritized(it) { continue } + genericInstanceTypes = append(genericInstanceTypes, it) } // if we got some subset of instance types, then prefer to use those diff --git a/pkg/cloudprovider/aws/suite_test.go b/pkg/cloudprovider/aws/suite_test.go index b1b515260ca5..14117d4463fd 100644 --- a/pkg/cloudprovider/aws/suite_test.go +++ b/pkg/cloudprovider/aws/suite_test.go @@ -20,6 +20,7 @@ import ( "errors" "fmt" "io/ioutil" + "k8s.io/apimachinery/pkg/util/clock" "strings" "testing" "time" @@ -28,19 +29,6 @@ import ( "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/ec2" "github.com/aws/aws-sdk-go/service/pricing" - "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter/pkg/cloudprovider" - "github.com/aws/karpenter/pkg/cloudprovider/aws/amifamily" - "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" - awsapisv1alpha5 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha5" - "github.com/aws/karpenter/pkg/cloudprovider/aws/fake" - "github.com/aws/karpenter/pkg/controllers/provisioning" - "github.com/aws/karpenter/pkg/controllers/state" - "github.com/aws/karpenter/pkg/test" - "github.com/aws/karpenter/pkg/utils/injection" - "github.com/aws/karpenter/pkg/utils/options" - . "github.com/onsi/ginkgo" - . "github.com/onsi/gomega" "github.com/patrickmn/go-cache" "github.com/samber/lo" v1 "k8s.io/api/core/v1" @@ -48,11 +36,26 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/kubernetes" + corev1 "k8s.io/client-go/kubernetes/typed/core/v1" "knative.dev/pkg/ptr" "sigs.k8s.io/controller-runtime/pkg/client" . "github.com/aws/karpenter/pkg/test/expectations" + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" . "knative.dev/pkg/logging/testing" + + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider" + "github.com/aws/karpenter/pkg/cloudprovider/aws/amifamily" + "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" + awsapisv1alpha5 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider/aws/fake" + "github.com/aws/karpenter/pkg/controllers/provisioning" + "github.com/aws/karpenter/pkg/controllers/state" + "github.com/aws/karpenter/pkg/test" + "github.com/aws/karpenter/pkg/utils/injection" + "github.com/aws/karpenter/pkg/utils/options" ) var ctx context.Context @@ -74,6 +77,7 @@ var clientSet *kubernetes.Clientset var cluster *state.Cluster var recorder *test.EventRecorder var cfg *test.Config +var fakeClock *clock.FakeClock func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) @@ -136,9 +140,11 @@ var _ = BeforeSuite(func() { kubeClient: e.Client, } cfg = test.NewConfig() - cluster = state.NewCluster(cfg, e.Client, cloudProvider) + fakeClock = clock.NewFakeClock(time.Now()) + cluster = state.NewCluster(fakeClock, cfg, e.Client, cloudProvider) recorder = test.NewEventRecorder() - controller = provisioning.NewController(ctx, cfg, e.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) + prov := provisioning.NewProvisioner(ctx, cfg, e.Client, corev1.NewForConfigOrDie(e.Config), recorder, cloudProvider, cluster) + controller = provisioning.NewController(e.Client, prov, recorder) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") @@ -274,7 +280,9 @@ var _ = Describe("Allocation", func() { cancelCtx, cancelFunc := context.WithCancel(injection.WithOptions(ctx, optsCopy)) // ensure the provisioner is shut down at the end of this test defer cancelFunc() - provisionContoller := provisioning.NewController(cancelCtx, cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) + + prov := provisioning.NewProvisioner(cancelCtx, cfg, env.Client, corev1.NewForConfigOrDie(env.Config), recorder, cloudProvider, cluster) + provisionContoller := provisioning.NewController(env.Client, prov, recorder) ExpectApplied(ctx, env.Client, provisioner) for _, pod := range ExpectProvisioned(cancelCtx, env.Client, provisionContoller, test.UnschedulablePod(test.PodOptions{ @@ -936,7 +944,8 @@ var _ = Describe("Allocation", func() { Context("User Data", func() { It("should not specify --use-max-pods=false when using ENI-based pod density", func() { opts.AWSENILimitedPodDensity = true - controller := provisioning.NewController(injection.WithOptions(ctx, opts), cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) + prov := provisioning.NewProvisioner(injection.WithOptions(ctx, opts), cfg, env.Client, corev1.NewForConfigOrDie(env.Config), recorder, cloudProvider, cluster) + controller := provisioning.NewController(env.Client, prov, recorder) ExpectApplied(ctx, env.Client, Provisioner(test.ProvisionerOptions{Provider: provider})) pod := ExpectProvisioned(ctx, env.Client, controller, test.UnschedulablePod())[0] ExpectScheduled(ctx, env.Client, pod) @@ -947,8 +956,8 @@ var _ = Describe("Allocation", func() { }) It("should specify --use-max-pods=false when not using ENI-based pod density", func() { opts.AWSENILimitedPodDensity = false - controller := provisioning.NewController(injection.WithOptions(ctx, opts), cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) - + prov := provisioning.NewProvisioner(injection.WithOptions(ctx, opts), cfg, env.Client, corev1.NewForConfigOrDie(env.Config), recorder, cloudProvider, cluster) + controller := provisioning.NewController(env.Client, prov, recorder) ExpectApplied(ctx, env.Client, Provisioner(test.ProvisionerOptions{Provider: provider})) pod := ExpectProvisioned(ctx, env.Client, controller, test.UnschedulablePod())[0] ExpectScheduled(ctx, env.Client, pod) @@ -1032,7 +1041,8 @@ var _ = Describe("Allocation", func() { AWS: *provider, ObjectMeta: metav1.ObjectMeta{Name: providerRefName}}) ExpectApplied(ctx, env.Client, nodeTemplate) - controller := provisioning.NewController(injection.WithOptions(ctx, opts), cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) + controller := provisioning.NewController(env.Client, + provisioning.NewProvisioner(injection.WithOptions(ctx, opts), cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster), recorder) newProvisioner := Provisioner(test.ProvisionerOptions{ProviderRef: providerRef}) ExpectApplied(ctx, env.Client, newProvisioner) env.Client.Get(ctx, client.ObjectKeyFromObject(newProvisioner), newProvisioner) @@ -1060,9 +1070,11 @@ var _ = Describe("Allocation", func() { AWS: *provider, ObjectMeta: metav1.ObjectMeta{Name: providerRefName}}) ExpectApplied(ctx, env.Client, nodeTemplate) - controller := provisioning.NewController(injection.WithOptions(ctx, opts), cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) + controller := provisioning.NewController(env.Client, + provisioning.NewProvisioner(injection.WithOptions(ctx, opts), cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster), recorder) newProvisioner := Provisioner(test.ProvisionerOptions{ProviderRef: providerRef}) ExpectApplied(ctx, env.Client, newProvisioner) + env.Client.Get(ctx, client.ObjectKeyFromObject(newProvisioner), newProvisioner) pod := ExpectProvisioned(ctx, env.Client, controller, test.UnschedulablePod())[0] ExpectScheduled(ctx, env.Client, pod) Expect(fakeEC2API.CalledWithCreateLaunchTemplateInput.Len()).To(Equal(1)) @@ -1080,7 +1092,6 @@ var _ = Describe("Allocation", func() { providerRef := &v1alpha5.ProviderRef{ Name: "doesnotexist", } - controller := provisioning.NewController(ctx, cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) newProvisioner := Provisioner(test.ProvisionerOptions{ProviderRef: providerRef}) ExpectApplied(ctx, env.Client, newProvisioner) pod := ExpectProvisioned(ctx, env.Client, controller, test.UnschedulablePod())[0] @@ -1099,7 +1110,6 @@ var _ = Describe("Allocation", func() { AWS: *provider, ObjectMeta: metav1.ObjectMeta{Name: providerRefName}}) ExpectApplied(ctx, env.Client, nodeTemplate) - controller := provisioning.NewController(ctx, cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) newProvisioner := Provisioner(test.ProvisionerOptions{ProviderRef: providerRef}) ExpectApplied(ctx, env.Client, newProvisioner) pod := ExpectProvisioned(ctx, env.Client, controller, test.UnschedulablePod())[0] @@ -1121,7 +1131,9 @@ var _ = Describe("Allocation", func() { AWS: *provider, ObjectMeta: metav1.ObjectMeta{Name: providerRefName}}) ExpectApplied(ctx, env.Client, nodeTemplate) - controller := provisioning.NewController(injection.WithOptions(ctx, opts), cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) + controller := provisioning.NewController(env.Client, + provisioning.NewProvisioner(injection.WithOptions(ctx, opts), cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster), + recorder) newProvisioner := Provisioner(test.ProvisionerOptions{ProviderRef: providerRef}) ExpectApplied(ctx, env.Client, newProvisioner) pod := ExpectProvisioned(ctx, env.Client, controller, test.UnschedulablePod())[0] @@ -1145,7 +1157,9 @@ var _ = Describe("Allocation", func() { AWS: *provider, ObjectMeta: metav1.ObjectMeta{Name: providerRefName}}) ExpectApplied(ctx, env.Client, nodeTemplate) - controller := provisioning.NewController(injection.WithOptions(ctx, opts), cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) + controller := provisioning.NewController(env.Client, + provisioning.NewProvisioner(injection.WithOptions(ctx, opts), cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster), + recorder) newProvisioner := Provisioner(test.ProvisionerOptions{ProviderRef: providerRef}) ExpectApplied(ctx, env.Client, newProvisioner) pod := ExpectProvisioned(ctx, env.Client, controller, test.UnschedulablePod())[0] @@ -1169,7 +1183,9 @@ var _ = Describe("Allocation", func() { AWS: *provider, ObjectMeta: metav1.ObjectMeta{Name: providerRefName}}) ExpectApplied(ctx, env.Client, nodeTemplate) - controller := provisioning.NewController(injection.WithOptions(ctx, opts), cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) + controller := provisioning.NewController(env.Client, + provisioning.NewProvisioner(injection.WithOptions(ctx, opts), cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster), + recorder) newProvisioner := Provisioner(test.ProvisionerOptions{ProviderRef: providerRef}) ExpectApplied(ctx, env.Client, newProvisioner) pod := ExpectProvisioned(ctx, env.Client, controller, test.UnschedulablePod())[0] diff --git a/pkg/cloudprovider/fake/cloudprovider.go b/pkg/cloudprovider/fake/cloudprovider.go index 978a8ee2a719..d8ab6e456880 100644 --- a/pkg/cloudprovider/fake/cloudprovider.go +++ b/pkg/cloudprovider/fake/cloudprovider.go @@ -36,6 +36,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +var _ cloudprovider.CloudProvider = (*CloudProvider)(nil) + type CloudProvider struct { InstanceTypes []cloudprovider.InstanceType @@ -148,6 +150,10 @@ func (c *CloudProvider) Validate(context.Context, *v1alpha5.Provisioner) *apis.F return nil } +func (c *CloudProvider) ShouldConsolidate(existingNode *v1.Node, newNodeRequirements scheduling.Requirements, instanceTypeOptions []cloudprovider.InstanceType) bool { + return true +} + // Name returns the CloudProvider implementation name. func (c *CloudProvider) Name() string { return "fake" diff --git a/pkg/cloudprovider/types.go b/pkg/cloudprovider/types.go index 8858dbb3d825..c74ebab88fbc 100644 --- a/pkg/cloudprovider/types.go +++ b/pkg/cloudprovider/types.go @@ -48,6 +48,13 @@ type CloudProvider interface { // availability, the GetInstanceTypes method should always return all instance types, // even those with no offerings available. GetInstanceTypes(context.Context, *v1alpha5.Provisioner) ([]InstanceType, error) + // ShouldConsolidate should return true if the cloud provider believes that consolidation should replace the given + // instance type with one of the instance type options provided. The instance types provided will all be cheaper + // than the node existing node. This method allows the cloud provider to provide insight to the consolidation + // algorithm beyond price, specifically regarding if a given node replacement decision is good with respect to the + // availability of the instance type options provided. Both the newNodeRequirements and instanceTypeOptions will be + // nil in the case of deleting a node for consolidation. + ShouldConsolidate(existingNode *v1.Node, newNodeRequirements scheduling.Requirements, instanceTypeOptions []InstanceType) bool // Name returns the CloudProvider implementation name. Name() string } @@ -72,8 +79,8 @@ type InstanceType interface { // Overhead is the amount of resource overhead expected to be used by kubelet and any other system daemons outside // of Kubernetes. Overhead() v1.ResourceList - // Price is a metric that is used to optimize pod placement onto nodes. This can be an actual monetary price per hour - // for the instance type, or just a weighting where lower 'prices' are preferred. + // Price is a metric that is used to optimize pod placement onto nodes as well as determining if replacing a node with + // a cheaper instance type is possible when performing consolidation. Price() float64 } diff --git a/pkg/controllers/consolidation/controller.go b/pkg/controllers/consolidation/controller.go new file mode 100644 index 000000000000..50c2d4beb838 --- /dev/null +++ b/pkg/controllers/consolidation/controller.go @@ -0,0 +1,645 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package consolidation + +import ( + "bytes" + "context" + "errors" + "fmt" + "math" + "sort" + "strconv" + "sync" + "time" + + "go.uber.org/multierr" + + appsv1 "k8s.io/api/apps/v1" + + "github.com/avast/retry-go" + "github.com/aws/aws-sdk-go/aws" + "github.com/prometheus/client_golang/prometheus" + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/clock" + "knative.dev/pkg/logging" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider" + "github.com/aws/karpenter/pkg/controllers/provisioning" + "github.com/aws/karpenter/pkg/controllers/provisioning/scheduling" + "github.com/aws/karpenter/pkg/controllers/state" + "github.com/aws/karpenter/pkg/events" + "github.com/aws/karpenter/pkg/metrics" + "github.com/aws/karpenter/pkg/utils/pod" +) + +// Controller is the consolidation controller. It is not a standard controller-runtime controller in that it doesn't +// have a reconcile method. +type Controller struct { + kubeClient client.Client + cluster *state.Cluster + provisioner *provisioning.Provisioner + recorder events.Recorder + clock clock.Clock + cloudProvider cloudprovider.CloudProvider + lastConsolidationState int64 +} + +// pollingPeriod that we inspect cluster to look for opportunities to consolidate +const pollingPeriod = 10 * time.Second + +func NewController(ctx context.Context, clk clock.Clock, kubeClient client.Client, provisioner *provisioning.Provisioner, + cp cloudprovider.CloudProvider, recorder events.Recorder, cluster *state.Cluster, startAsync <-chan struct{}) *Controller { + c := &Controller{ + clock: clk, + kubeClient: kubeClient, + cluster: cluster, + provisioner: provisioner, + recorder: recorder, + cloudProvider: cp, + } + + go func() { + select { + case <-ctx.Done(): + return + case <-startAsync: + c.run(ctx) + } + }() + + return c +} + +func (c *Controller) run(ctx context.Context) { + logger := logging.FromContext(ctx).Named("consolidation") + ctx = logging.WithLogger(ctx, logger) + for { + select { + case <-ctx.Done(): + logger.Infof("Shutting down") + return + case <-time.After(pollingPeriod): + // the last cluster consolidation wasn't able to improve things and nothing has changed regarding + // the cluster that makes us think we would be successful now + if c.lastConsolidationState == c.cluster.ClusterConsolidationState() { + continue + } + + // don't consolidate as we recently scaled down too soon + stabilizationTime := c.clock.Now().Add(-c.stabilizationWindow(ctx)) + if c.cluster.LastNodeDeletionTime().Before(stabilizationTime) { + if c.ProcessCluster(ctx) == ProcessResultNothingToDo { + c.lastConsolidationState = c.cluster.ClusterConsolidationState() + } + } + } + } +} + +// consolidatableNode is a node that we are considering for consolidation along with extra information to be used in +// making that determination +type consolidatableNode struct { + *v1.Node + instanceType cloudprovider.InstanceType + provisioner *v1alpha5.Provisioner + disruptionCost float64 + pods []*v1.Pod +} + +type ProcessResult byte + +const ( + ProcessResultNothingToDo ProcessResult = iota + ProcessResultFailed + ProcessResultConsolidated +) + +// ProcessCluster is exposed for unit testing purposes +func (c *Controller) ProcessCluster(ctx context.Context) ProcessResult { + candidates := c.candidateNodes(ctx) + if len(candidates) == 0 { + return ProcessResultNothingToDo + } + + emptyNodes := lo.Filter(candidates, func(n consolidatableNode, _ int) bool { return len(n.pods) == 0 }) + // first see if there are empty nodes that we can delete immediately, and if so delete them all at once + if len(emptyNodes) > 0 { + c.performConsolidation(ctx, consolidationAction{ + oldNodes: lo.Map(emptyNodes, func(n consolidatableNode, _ int) *v1.Node { return n.Node }), + result: consolidateResultDeleteEmpty, + }) + return ProcessResultConsolidated + } + + pdbs, err := NewPDBLimits(ctx, c.kubeClient) + if err != nil { + logging.FromContext(ctx).Errorf("Tracking PodDisruptionBudgets, %s", err) + return ProcessResultFailed + } + + // the remaining nodes are all non-empty, so we just consolidate the first one that we can + sort.Slice(candidates, byNodeDisruptionCost(candidates)) + for _, node := range candidates { + // is this a node that we can terminate? This check is meant to be fast so we can save the expense of simulated + // scheduling unless its really needed + if err = c.canBeTerminated(node, pdbs); err != nil { + continue + } + action := c.nodeConsolidationActions(ctx, node) + if action.result == consolidateResultDelete || action.result == consolidateResultShrink { + // perform the first consolidation we can since we are looking at nodes in ascending order of disruption cost + c.performConsolidation(ctx, action) + return ProcessResultConsolidated + } + } + return ProcessResultNothingToDo +} + +// candidateNodes returns nodes that appear to be currently consolidatable based off of their provisioner +func (c *Controller) candidateNodes(ctx context.Context) []consolidatableNode { + var nodes []consolidatableNode + uninitializedNodeExists := false + c.cluster.ForEachNode(func(n *state.Node) bool { + // skip any nodes that we can't determine the instance of or for which we don't have consolidation enabled + if n.InstanceType == nil || n.Provisioner == nil || n.Provisioner.Spec.Consolidation == nil || !aws.BoolValue(n.Provisioner.Spec.Consolidation.Enabled) { + return true + } + // already found one un-initialized node, so we can skip the rest + if uninitializedNodeExists { + return true + } + + if n.Node.Labels[v1alpha5.LabelNodeInitialized] != "true" { + uninitializedNodeExists = true + return true + } + // skip nodes that are annotated as do-not-consolidate + if n.Node.Annotations[v1alpha5.DoNotConsolidateNodeAnnotationKey] == "true" { + return true + } + + // skip nodes that the scheduler thinks will soon have pending pods bound to them + if c.cluster.IsNodeNominated(n.Node.Name) { + return true + } + + pods, err := c.getNodePods(ctx, n.Node.Name) + if err != nil { + logging.FromContext(ctx).Errorf("Determining node pods, %s", err) + return true + } + + nodes = append(nodes, consolidatableNode{ + Node: n.Node, + instanceType: n.InstanceType, + provisioner: n.Provisioner, + pods: pods, + disruptionCost: disruptionCost(ctx, pods), + }) + return true + }) + + // we have some node that hasn't fully become ready yet, so don't perform any consolidation + if uninitializedNodeExists { + return nil + } + return nodes +} + +func (c *Controller) performConsolidation(ctx context.Context, action consolidationAction) { + if action.result != consolidateResultDelete && + action.result != consolidateResultShrink && + action.result != consolidateResultDeleteEmpty { + logging.FromContext(ctx).Errorf("Invalid disruption action calculated: %s", action.result) + return + } + + consolidationActionsPerformedCounter.With(prometheus.Labels{"action": action.result.String()}).Add(1) + + // action's stringer + logging.FromContext(ctx).Infof("Consolidating via %s", action.String()) + + if action.result == consolidateResultShrink { + if err := c.launchReplacementNode(ctx, action); err != nil { + // If we failed to launch the replacement, don't consolidate. If this is some permanent failure, + // we don't want to disrupt workloads with no way to provision new nodes for them. + logging.FromContext(ctx).Errorf("Launching replacement node, %s", err) + return + } + } + + for _, oldNode := range action.oldNodes { + c.recorder.TerminatingNodeForConsolidation(oldNode, action.String()) + if err := c.kubeClient.Delete(ctx, oldNode); err != nil { + logging.FromContext(ctx).Errorf("Deleting node, %s", err) + } else { + consolidationNodesTerminatedCounter.Add(1) + } + } +} + +func byNodeDisruptionCost(nodes []consolidatableNode) func(i int, j int) bool { + return func(a, b int) bool { + if nodes[a].disruptionCost == nodes[b].disruptionCost { + // if costs are equal, choose the older node + return nodes[a].CreationTimestamp.Before(&nodes[b].CreationTimestamp) + } + return nodes[a].disruptionCost < nodes[b].disruptionCost + } +} + +// launchReplacementNode launches a replacement node and blocks until it is ready +func (c *Controller) launchReplacementNode(ctx context.Context, minCost consolidationAction) error { + defer metrics.Measure(consolidationReplacementNodeInitializedHistogram)() + if len(minCost.oldNodes) != 1 { + return fmt.Errorf("expected a single node to replace, found %d", len(minCost.oldNodes)) + } + + // cordon the node before we launch the replacement to prevent new pods from scheduling to the node + if err := c.setNodeUnschedulable(ctx, minCost.oldNodes[0].Name, true); err != nil { + return fmt.Errorf("cordoning node %s, %w", minCost.oldNodes[0].Name, err) + } + + nodeNames, err := c.provisioner.LaunchNodes(ctx, minCost.replacementNode) + if err != nil { + return err + } + if len(nodeNames) != 1 { + return fmt.Errorf("expected a single node name, got %d", len(nodeNames)) + } + + consolidationNodesCreatedCounter.Add(1) + + var k8Node v1.Node + // Wait for the node to be ready, we can wait for up to about 3 minutes which is ok as all this blocks is + // further consolidation. + var once sync.Once + if err := retry.Do(func() error { + if err := c.kubeClient.Get(ctx, client.ObjectKey{Name: nodeNames[0]}, &k8Node); err != nil { + return fmt.Errorf("getting node, %w", err) + } + once.Do(func() { + c.recorder.LaunchingNodeForConsolidation(&k8Node, minCost.String()) + }) + + if _, ok := k8Node.Labels[v1alpha5.LabelNodeInitialized]; !ok { + // make the user aware of why consolidation is paused + c.recorder.WaitingOnReadinessForConsolidation(&k8Node) + return errors.New("node is not initialized") + } + return nil + }, retry.Delay(2*time.Second), + retry.LastErrorOnly(true), + retry.Attempts(30), + retry.MaxDelay(10*time.Second), // ~ 4.5 minutes in total + ); err != nil { + // node never become ready, so uncordon the node we were trying to delete and report the error + return multierr.Combine(c.setNodeUnschedulable(ctx, minCost.oldNodes[0].Name, false), + fmt.Errorf("checking node readiness, %w", err)) + } + return nil +} + +func (c *Controller) getNodePods(ctx context.Context, nodeName string) ([]*v1.Pod, error) { + var podList v1.PodList + if err := c.kubeClient.List(ctx, &podList, client.MatchingFields{"spec.nodeName": nodeName}); err != nil { + return nil, fmt.Errorf("listing pods, %w", err) + } + var pods []*v1.Pod + for i := range podList.Items { + // these pods don't need to be rescheduled + if pod.IsOwnedByNode(&podList.Items[i]) || + pod.IsOwnedByDaemonSet(&podList.Items[i]) || + pod.IsTerminal(&podList.Items[i]) { + continue + } + pods = append(pods, &podList.Items[i]) + } + return pods, nil +} + +func (c *Controller) canBeTerminated(node consolidatableNode, pdbs *PDBLimits) error { + if !node.DeletionTimestamp.IsZero() { + return fmt.Errorf("already being deleted") + } + if !pdbs.CanEvictPods(node.pods) { + return fmt.Errorf("not eligible for termination due to PDBs") + } + return c.podsPreventEviction(node) +} + +func (c *Controller) podsPreventEviction(node consolidatableNode) error { + for _, p := range node.pods { + // don't care about pods that are finishing, finished or owned by the node + if pod.IsTerminating(p) || pod.IsTerminal(p) || pod.IsOwnedByNode(p) { + continue + } + + if pod.HasDoNotEvict(p) { + return fmt.Errorf("found do-not-evict pod") + } + + if pod.IsNotOwned(p) { + return fmt.Errorf("found pod with no controller") + } + } + return nil +} + +type consolidateResult byte + +const ( + consolidateResultUnknown = iota + consolidateResultNotPossible + consolidateResultDelete + consolidateResultDeleteEmpty + consolidateResultShrink +) + +func (r consolidateResult) String() string { + switch r { + case consolidateResultUnknown: + return "Unknown" + case consolidateResultNotPossible: + return "Not Possible" + case consolidateResultDelete: + return "Delete" + case consolidateResultDeleteEmpty: + return "Delete (empty node)" + case consolidateResultShrink: + return "Shrink" + default: + return fmt.Sprintf("Unknown (%d)", r) + } +} + +type consolidationAction struct { + oldNodes []*v1.Node + disruptionCost float64 + result consolidateResult + replacementNode *scheduling.Node + savings float64 +} + +func (o consolidationAction) String() string { + var buf bytes.Buffer + fmt.Fprintf(&buf, "%s, terminating %d nodes ", o.result, len(o.oldNodes)) + for i, old := range o.oldNodes { + if i != 0 { + fmt.Fprint(&buf, ", ") + } + fmt.Fprintf(&buf, "%s", old.Name) + if instanceType, ok := old.Labels[v1.LabelInstanceTypeStable]; ok { + fmt.Fprintf(&buf, "/%s", instanceType) + } + } + if o.replacementNode != nil { + fmt.Fprintf(&buf, " and replacing with a node from types %s", + scheduling.InstanceTypeList(o.replacementNode.InstanceTypeOptions)) + } + return buf.String() +} + +func clamp(min, val, max float64) float64 { + if val < min { + return min + } + if val > max { + return max + } + return val +} + +func (c *Controller) nodeConsolidationActions(ctx context.Context, node consolidatableNode) consolidationAction { + // lifetimeRemaining is the fraction of node lifetime remaining in the range [0.0, 1.0]. If the TTLSecondsUntilExpired + // is non-zero, we use it to scale down the disruption costs of nodes that are going to expire. Just after creation, the + // disruption cost is highest and it approaches zero as the node ages towards its expiration time. + lifetimeRemaining := c.calculateLifetimeRemaining(node) + + cost, err := c.nodeConsolidationOptionShrinkOrDelete(ctx, node) + if err != nil { + logging.FromContext(ctx).Errorf("Consolidating node (shrink), %s", err) + } + cost.disruptionCost *= lifetimeRemaining + + // we only care about possibly successful consolidations + return cost +} + +// calculateLifetimeRemaining calculates the fraction of node lifetime remaining in the range [0.0, 1.0]. If the TTLSecondsUntilExpired +// is non-zero, we use it to scale down the disruption costs of nodes that are going to expire. Just after creation, the +// disruption cost is highest and it approaches zero as the node ages towards its expiration time. +func (c *Controller) calculateLifetimeRemaining(node consolidatableNode) float64 { + remaining := 1.0 + if node.provisioner.Spec.TTLSecondsUntilExpired != nil { + ageInSeconds := c.clock.Since(node.CreationTimestamp.Time).Seconds() + totalLifetimeSeconds := float64(*node.provisioner.Spec.TTLSecondsUntilExpired) + lifetimeRemainingSeconds := totalLifetimeSeconds - ageInSeconds + remaining = clamp(0.0, lifetimeRemainingSeconds/totalLifetimeSeconds, 1.0) + } + return remaining +} + +func (c *Controller) nodeConsolidationOptionShrinkOrDelete(ctx context.Context, node consolidatableNode) (consolidationAction, error) { + defer metrics.Measure(consolidationDurationHistogram.WithLabelValues("Shrink/Delete"))() + + scheduler, err := c.provisioner.NewScheduler(ctx, node.pods, scheduling.SchedulerOptions{ + SimulationMode: true, + ExcludeNodes: []string{node.Name}, + }) + + if err != nil { + return consolidationAction{result: consolidateResultUnknown}, fmt.Errorf("creating scheduler, %w", err) + } + + newNodes, inflightNodes, err := scheduler.Solve(ctx, node.pods) + if err != nil { + return consolidationAction{result: consolidateResultUnknown}, fmt.Errorf("simulating scheduling, %w", err) + } + + // were we able to schedule all the pods on the inflight nodes? + if len(newNodes) == 0 { + schedulableCount := 0 + for _, inflight := range inflightNodes { + schedulableCount += len(inflight.Pods) + } + if len(node.pods) == schedulableCount && c.cloudProvider.ShouldConsolidate(node.Node, nil, nil) { + savings := node.instanceType.Price() + return consolidationAction{ + oldNodes: []*v1.Node{node.Node}, + disruptionCost: disruptionCost(ctx, node.pods), + savings: savings, + result: consolidateResultDelete, + }, nil + } + } + + // we're not going to turn a single node into multiple nodes + if len(newNodes) != 1 { + return consolidationAction{result: consolidateResultNotPossible}, nil + } + + nodePrice := node.instanceType.Price() + newNodes[0].InstanceTypeOptions = filterByPrice(newNodes[0].InstanceTypeOptions, nodePrice, false) + if len(newNodes[0].InstanceTypeOptions) == 0 { + // no instance types remain after filtering by price + return consolidationAction{result: consolidateResultNotPossible}, nil + } + + // Since we are looking at replacing one node with another, we check with the cloud provider to see if that makes + // sense availability wise. E.g. if we have an xlarge spot node when a large would suffice, but the xlarge is from + // a deep pool where the large may not even launch, or would get reclaimed near immediately we should not replace + // the xlarge node. + if !c.cloudProvider.ShouldConsolidate(node.Node, newNodes[0].Requirements, newNodes[0].InstanceTypeOptions) { + return consolidationAction{result: consolidateResultNotPossible}, nil + } + + savings := nodePrice + // savings is reduced by the price of the new node + savings -= newNodes[0].InstanceTypeOptions[0].Price() + + return consolidationAction{ + oldNodes: []*v1.Node{node.Node}, + disruptionCost: disruptionCost(ctx, node.pods), + savings: savings, + result: consolidateResultShrink, + replacementNode: newNodes[0], + }, nil +} + +func filterByPrice(options []cloudprovider.InstanceType, price float64, inclusive bool) []cloudprovider.InstanceType { + var result []cloudprovider.InstanceType + for _, it := range options { + if (it.Price() < price) || (inclusive && it.Price() == price) { + result = append(result, it) + } + } + return result +} + +func disruptionCost(ctx context.Context, pods []*v1.Pod) float64 { + cost := 0.0 + for _, p := range pods { + cost += GetPodEvictionCost(ctx, p) + } + return cost +} + +func (c *Controller) hasPendingPods(ctx context.Context) bool { + var podList v1.PodList + if err := c.kubeClient.List(ctx, &podList, client.MatchingFields{"spec.nodeName": ""}); err != nil { + // failed to list pods, assume there must be pending as it's harmless and just ensures we wait longer + return true + } + for i := range podList.Items { + if pod.IsProvisionable(&podList.Items[i]) { + return true + } + } + return false +} + +func (c *Controller) replicaSetsReady(ctx context.Context) bool { + var rsList appsv1.ReplicaSetList + if err := c.kubeClient.List(ctx, &rsList); err != nil { + // failed to list pods, assume there must be pending as it's harmless and just ensures we wait longer + return true + } + for _, rs := range rsList.Items { + desired := aws.Int32Value(rs.Spec.Replicas) + if rs.Spec.Replicas == nil { + // unspecified defaults to 1 + desired = 1 + } + if rs.Status.ReadyReplicas < desired { + return false + } + } + return true +} + +func (c *Controller) replicationControllersReady(ctx context.Context) bool { + var rsList v1.ReplicationControllerList + if err := c.kubeClient.List(ctx, &rsList); err != nil { + // failed to list pods, assume there must be pending as it's harmless and just ensures we wait longer + return true + } + for _, rs := range rsList.Items { + desired := aws.Int32Value(rs.Spec.Replicas) + if rs.Spec.Replicas == nil { + // unspecified defaults to 1 + desired = 1 + } + if rs.Status.ReadyReplicas < desired { + return false + } + } + return true +} + +func (c *Controller) stabilizationWindow(ctx context.Context) time.Duration { + // no pending pods, and all replica sets/replication controllers are reporting ready so quickly consider another consolidation + if !c.hasPendingPods(ctx) && c.replicaSetsReady(ctx) && c.replicationControllersReady(ctx) { + return 0 + } + return 5 * time.Minute +} + +func (c *Controller) setNodeUnschedulable(ctx context.Context, nodeName string, isUnschedulable bool) error { + var node v1.Node + if err := c.kubeClient.Get(ctx, client.ObjectKey{Name: nodeName}, &node); err != nil { + return fmt.Errorf("getting node, %w", err) + } + + // already matches the state we want to be in + if node.Spec.Unschedulable == isUnschedulable { + return nil + } + + persisted := node.DeepCopy() + node.Spec.Unschedulable = isUnschedulable + if err := c.kubeClient.Patch(ctx, &node, client.MergeFrom(persisted)); err != nil { + return fmt.Errorf("patching node %s, %w", node.Name, err) + } + return nil +} + +// GetPodEvictionCost returns the disruption cost computed for evicting the given pod. +func GetPodEvictionCost(ctx context.Context, p *v1.Pod) float64 { + cost := 1.0 + podDeletionCostStr, ok := p.Annotations[v1.PodDeletionCost] + if ok { + podDeletionCost, err := strconv.ParseFloat(podDeletionCostStr, 64) + if err != nil { + logging.FromContext(ctx).Errorf("parsing %s=%s from pod %s, %s", + v1.PodDeletionCost, podDeletionCostStr, client.ObjectKeyFromObject(p), err) + } else { + // the pod deletion disruptionCost is in [-2147483647, 2147483647] + // the min pod disruptionCost makes one pod ~ -15 pods, and the max pod disruptionCost to ~ 17 pods. + cost += podDeletionCost / math.Pow(2, 27.0) + } + } + // the scheduling priority is in [-2147483648, 1000000000] + if p.Spec.Priority != nil { + cost += float64(*p.Spec.Priority) / math.Pow(2, 25) + } + + // overall we clamp the pod cost to the range [-10.0, 10.0] with the default being 1.0 + return clamp(-10.0, cost, 10.0) +} diff --git a/pkg/controllers/consolidation/metrics.go b/pkg/controllers/consolidation/metrics.go new file mode 100644 index 000000000000..1323f09e9b6a --- /dev/null +++ b/pkg/controllers/consolidation/metrics.go @@ -0,0 +1,76 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package consolidation + +import ( + "github.com/prometheus/client_golang/prometheus" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" + + "github.com/aws/karpenter/pkg/metrics" +) + +func init() { + crmetrics.Registry.MustRegister(consolidationDurationHistogram) + crmetrics.Registry.MustRegister(consolidationReplacementNodeInitializedHistogram) + crmetrics.Registry.MustRegister(consolidationNodesCreatedCounter) + crmetrics.Registry.MustRegister(consolidationNodesTerminatedCounter) + crmetrics.Registry.MustRegister(consolidationActionsPerformedCounter) +} + +var consolidationDurationHistogram = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: metrics.Namespace, + Subsystem: "consolidation", + Name: "evaluation_duration_seconds", + Help: "Duration of the consolidation evaluation process in seconds.", + Buckets: metrics.DurationBuckets(), + }, + []string{"method"}, +) + +var consolidationReplacementNodeInitializedHistogram = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: metrics.Namespace, + Subsystem: "consolidation", + Name: "replacement_node_initialized_seconds", + Help: "Amount of time required for a replacement node to become initialized.", + Buckets: metrics.DurationBuckets(), + }) + +var consolidationNodesCreatedCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: metrics.Namespace, + Subsystem: "consolidation", + Name: "nodes_created", + Help: "Number of nodes created in total by consolidation.", + }, +) +var consolidationNodesTerminatedCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: metrics.Namespace, + Subsystem: "consolidation", + Name: "nodes_terminated", + Help: "Number of nodes terminated in total by consolidation.", + }, +) +var consolidationActionsPerformedCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metrics.Namespace, + Subsystem: "consolidation", + Name: "actions_performed", + Help: "Number of consolidation actions performed. Labeled by action.", + }, + []string{"action"}, +) diff --git a/pkg/controllers/consolidation/pdblimits.go b/pkg/controllers/consolidation/pdblimits.go new file mode 100644 index 000000000000..a49f8e095c7f --- /dev/null +++ b/pkg/controllers/consolidation/pdblimits.go @@ -0,0 +1,86 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package consolidation + +import ( + "context" + + v1 "k8s.io/api/core/v1" + policyv1 "k8s.io/api/policy/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// PDBLimits is used to evaluate if evicting a list of pods is possible. +type PDBLimits struct { + ctx context.Context + kubeClient client.Client + pdbs []*pdbItem +} + +func NewPDBLimits(ctx context.Context, kubeClient client.Client) (*PDBLimits, error) { + ps := &PDBLimits{ + ctx: ctx, + kubeClient: kubeClient, + } + + var pdbList policyv1.PodDisruptionBudgetList + if err := kubeClient.List(ctx, &pdbList); err != nil { + return nil, err + } + for _, pdb := range pdbList.Items { + pi, err := newPdb(pdb) + if err != nil { + return nil, err + } + ps.pdbs = append(ps.pdbs, pi) + } + + return ps, nil +} + +// CanEvictPods returns true if every pod in the list is evictable. They may not all be evictable simultaneously, but +// for every PDB that controls the pods at least one pod can be evicted. +func (s *PDBLimits) CanEvictPods(pods []*v1.Pod) bool { + for _, pod := range pods { + for _, pdb := range s.pdbs { + if pdb.selector.Matches(labels.Set(pod.Labels)) { + if pdb.disruptionsAllowed == 0 { + return false + } + } + } + } + return true +} + +type pdbItem struct { + name client.ObjectKey + selector labels.Selector + disruptionsAllowed int32 +} + +func newPdb(pdb policyv1.PodDisruptionBudget) (*pdbItem, error) { + selector, err := metav1.LabelSelectorAsSelector(pdb.Spec.Selector) + if err != nil { + return nil, err + } + return &pdbItem{ + name: client.ObjectKeyFromObject(&pdb), + selector: selector, + disruptionsAllowed: pdb.Status.DisruptionsAllowed, + }, nil +} diff --git a/pkg/controllers/consolidation/suite_test.go b/pkg/controllers/consolidation/suite_test.go new file mode 100644 index 000000000000..be07f7ce001f --- /dev/null +++ b/pkg/controllers/consolidation/suite_test.go @@ -0,0 +1,1083 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package consolidation_test + +import ( + "context" + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider" + "github.com/aws/karpenter/pkg/cloudprovider/fake" + "github.com/aws/karpenter/pkg/controllers/consolidation" + "github.com/aws/karpenter/pkg/controllers/provisioning" + "github.com/aws/karpenter/pkg/controllers/state" + "github.com/aws/karpenter/pkg/test" + . "github.com/aws/karpenter/pkg/test/expectations" + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + policyv1 "k8s.io/api/policy/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/clock" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/client-go/kubernetes" + . "knative.dev/pkg/logging/testing" + "sigs.k8s.io/controller-runtime/pkg/client" + "sync" + "testing" + "time" +) + +var ctx context.Context +var env *test.Environment +var cluster *state.Cluster +var controller *consolidation.Controller +var provisioner *provisioning.Provisioner +var cloudProvider *fake.CloudProvider +var clientSet *kubernetes.Clientset +var recorder *test.EventRecorder +var nodeStateController *state.NodeController +var fakeClock *clock.FakeClock +var cfg *test.Config +var mostExpensiveInstance cloudprovider.InstanceType +var leastExpensiveInstance cloudprovider.InstanceType + +func TestAPIs(t *testing.T) { + ctx = TestContextWithLogger(t) + RegisterFailHandler(Fail) + RunSpecs(t, "Consolidation") +} + +var _ = BeforeSuite(func() { + env = test.NewEnvironment(ctx, func(e *test.Environment) { + cloudProvider = &fake.CloudProvider{} + cfg = test.NewConfig() + fakeClock = clock.NewFakeClock(time.Now()) + cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) + nodeStateController = state.NewNodeController(env.Client, cluster) + clientSet = kubernetes.NewForConfigOrDie(e.Config) + recorder = test.NewEventRecorder() + provisioner = provisioning.NewProvisioner(ctx, cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) + }) + Expect(env.Start()).To(Succeed(), "Failed to start environment") +}) + +var _ = AfterSuite(func() { + Expect(env.Stop()).To(Succeed(), "Failed to stop environment") +}) + +var _ = BeforeEach(func() { + cloudProvider.CreateCalls = nil + cloudProvider.InstanceTypes = fake.InstanceTypesAssorted() + mostExpensiveInstance = lo.MaxBy(cloudProvider.InstanceTypes, func(lhs, rhs cloudprovider.InstanceType) bool { + return lhs.Price() > rhs.Price() + }) + // los MaxBy & MinBy functions are identical. https://github.com/samber/lo/issues/129 + leastExpensiveInstance = lo.MaxBy(cloudProvider.InstanceTypes, func(lhs, rhs cloudprovider.InstanceType) bool { + return lhs.Price() < rhs.Price() + }) + + recorder.Reset() + fakeClock.SetTime(time.Now()) + controller = consolidation.NewController(env.Ctx, fakeClock, env.Client, provisioner, cloudProvider, recorder, cluster, nil) +}) +var _ = AfterEach(func() { + ExpectCleanedUp(ctx, env.Client) + var nodes []client.ObjectKey + cluster.ForEachNode(func(n *state.Node) bool { + nodes = append(nodes, client.ObjectKeyFromObject(n.Node)) + return true + }) + + // inform cluster state of node deletion + for _, nodeKey := range nodes { + ExpectReconcileSucceeded(ctx, nodeStateController, nodeKey) + } +}) + +var _ = Describe("Pod Eviction Cost", func() { + const standardPodCost = 1.0 + It("should have a standard disruptionCost for a pod with no priority or disruptionCost specified", func() { + cost := consolidation.GetPodEvictionCost(ctx, &v1.Pod{}) + Expect(cost).To(BeNumerically("==", standardPodCost)) + }) + It("should have a higher disruptionCost for a pod with a positive deletion disruptionCost", func() { + cost := consolidation.GetPodEvictionCost(ctx, &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ + v1.PodDeletionCost: "100", + }}, + }) + Expect(cost).To(BeNumerically(">", standardPodCost)) + }) + It("should have a lower disruptionCost for a pod with a positive deletion disruptionCost", func() { + cost := consolidation.GetPodEvictionCost(ctx, &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ + v1.PodDeletionCost: "-100", + }}, + }) + Expect(cost).To(BeNumerically("<", standardPodCost)) + }) + It("should have higher costs for higher deletion costs", func() { + cost1 := consolidation.GetPodEvictionCost(ctx, &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ + v1.PodDeletionCost: "101", + }}, + }) + cost2 := consolidation.GetPodEvictionCost(ctx, &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ + v1.PodDeletionCost: "100", + }}, + }) + cost3 := consolidation.GetPodEvictionCost(ctx, &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{ + v1.PodDeletionCost: "99", + }}, + }) + Expect(cost1).To(BeNumerically(">", cost2)) + Expect(cost2).To(BeNumerically(">", cost3)) + }) + It("should have a higher disruptionCost for a pod with a higher priority", func() { + cost := consolidation.GetPodEvictionCost(ctx, &v1.Pod{ + Spec: v1.PodSpec{Priority: aws.Int32(1)}, + }) + Expect(cost).To(BeNumerically(">", standardPodCost)) + }) + It("should have a lower disruptionCost for a pod with a lower priority", func() { + cost := consolidation.GetPodEvictionCost(ctx, &v1.Pod{ + Spec: v1.PodSpec{Priority: aws.Int32(-1)}, + }) + Expect(cost).To(BeNumerically("<", standardPodCost)) + }) +}) + +var _ = Describe("Shrink Nodes", func() { + It("can shrink node", func() { + labels := map[string]string{ + "app": "test", + } + // create our RS so we can link a pod to it + rs := test.ReplicaSet() + ExpectApplied(ctx, env.Client, rs) + Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(rs), rs)).To(Succeed()) + + pod := test.Pod(test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: labels, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "ReplicaSet", + Name: rs.Name, + UID: rs.UID, + Controller: aws.Bool(true), + BlockOwnerDeletion: aws.Bool(true), + }, + }}}) + + prov := test.Provisioner(test.ProvisionerOptions{Consolidation: &v1alpha5.Consolidation{Enabled: aws.Bool(true)}}) + node := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{v1.ResourceCPU: resource.MustParse("32")}}) + + ExpectApplied(ctx, env.Client, rs, pod, node, prov) + ExpectMakeNodesReady(ctx, env.Client, node) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node)) + ExpectManualBinding(ctx, env.Client, pod, node) + ExpectScheduled(ctx, env.Client, pod) + Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(node), node)).To(Succeed()) + + // consolidation won't delete the old node until the new node is ready + wg := ExpectMakeNewNodesReady(ctx, env.Client, 1, node) + fakeClock.Step(10 * time.Minute) + controller.ProcessCluster(ctx) + wg.Wait() + + // should create a new node as there is a cheaper one that can hold the pod + Expect(cloudProvider.CreateCalls).To(HaveLen(1)) + // and delete the old one + ExpectNotFound(ctx, env.Client, node) + }) + It("can shrink nodes, considers PDB", func() { + labels := map[string]string{ + "app": "test", + } + // create our RS so we can link a pod to it + rs := test.ReplicaSet() + ExpectApplied(ctx, env.Client, rs) + Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(rs), rs)).To(Succeed()) + + pods := test.Pods(3, test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: labels, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "ReplicaSet", + Name: rs.Name, + UID: rs.UID, + Controller: aws.Bool(true), + BlockOwnerDeletion: aws.Bool(true), + }, + }}}) + + pdb := test.PodDisruptionBudget(test.PDBOptions{ + Labels: labels, + MaxUnavailable: fromInt(0), + Status: &policyv1.PodDisruptionBudgetStatus{ + ObservedGeneration: 1, + DisruptionsAllowed: 0, + CurrentHealthy: 1, + DesiredHealthy: 1, + ExpectedPods: 1, + }, + }) + + prov := test.Provisioner(test.ProvisionerOptions{Consolidation: &v1alpha5.Consolidation{Enabled: aws.Bool(true)}}) + node1 := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], node1, prov, pdb) + ExpectApplied(ctx, env.Client, node1) + // all pods on node1 + ExpectManualBinding(ctx, env.Client, pods[0], node1) + ExpectManualBinding(ctx, env.Client, pods[1], node1) + ExpectManualBinding(ctx, env.Client, pods[2], node1) + ExpectScheduled(ctx, env.Client, pods[0]) + ExpectScheduled(ctx, env.Client, pods[1]) + ExpectScheduled(ctx, env.Client, pods[2]) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node1)) + + // inform cluster state about the nodes + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node1)) + fakeClock.Step(10 * time.Minute) + controller.ProcessCluster(ctx) + + // we don't need a new node + Expect(cloudProvider.CreateCalls).To(HaveLen(0)) + // and can't delete the node due to the PDB + ExpectNodeExists(ctx, env.Client, node1.Name) + }) + It("can shrink nodes, considers do-not-consolidate annotation", func() { + labels := map[string]string{ + "app": "test", + } + + // create our RS so we can link a pod to it + rs := test.ReplicaSet() + ExpectApplied(ctx, env.Client, rs) + Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(rs), rs)).To(Succeed()) + + pods := test.Pods(3, test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: labels, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "ReplicaSet", + Name: rs.Name, + UID: rs.UID, + Controller: aws.Bool(true), + BlockOwnerDeletion: aws.Bool(true), + }, + }}}) + + prov := test.Provisioner(test.ProvisionerOptions{TTLSecondsUntilExpired: aws.Int64(30), Consolidation: &v1alpha5.Consolidation{Enabled: aws.Bool(true)}}) + regularNode := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + annotatedNode := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + v1alpha5.DoNotConsolidateNodeAnnotationKey: "true", + }, + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], prov) + ExpectApplied(ctx, env.Client, regularNode, annotatedNode) + ExpectApplied(ctx, env.Client, regularNode, annotatedNode) + ExpectMakeNodesReady(ctx, env.Client, regularNode, annotatedNode) + ExpectManualBinding(ctx, env.Client, pods[0], regularNode) + ExpectManualBinding(ctx, env.Client, pods[1], regularNode) + ExpectManualBinding(ctx, env.Client, pods[2], annotatedNode) + ExpectScheduled(ctx, env.Client, pods[0]) + ExpectScheduled(ctx, env.Client, pods[1]) + ExpectScheduled(ctx, env.Client, pods[2]) + + // inform cluster state about the nodes + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(regularNode)) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(annotatedNode)) + fakeClock.Step(10 * time.Minute) + controller.ProcessCluster(ctx) + + Expect(cloudProvider.CreateCalls).To(HaveLen(0)) + // we should delete the non-annotated node + ExpectNotFound(ctx, env.Client, regularNode) + }) +}) + +var _ = Describe("Delete Node", func() { + It("can delete nodes", func() { + labels := map[string]string{ + "app": "test", + } + // create our RS so we can link a pod to it + rs := test.ReplicaSet() + ExpectApplied(ctx, env.Client, rs) + Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(rs), rs)).To(Succeed()) + + pods := test.Pods(3, test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: labels, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "ReplicaSet", + Name: rs.Name, + UID: rs.UID, + Controller: aws.Bool(true), + BlockOwnerDeletion: aws.Bool(true), + }, + }}}) + + prov := test.Provisioner(test.ProvisionerOptions{Consolidation: &v1alpha5.Consolidation{Enabled: aws.Bool(true)}}) + node1 := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + node2 := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], node1, node2, prov) + ExpectMakeNodesReady(ctx, env.Client, node1, node2) + + ExpectManualBinding(ctx, env.Client, pods[0], node1) + ExpectManualBinding(ctx, env.Client, pods[1], node1) + ExpectManualBinding(ctx, env.Client, pods[2], node2) + ExpectScheduled(ctx, env.Client, pods[0]) + ExpectScheduled(ctx, env.Client, pods[1]) + ExpectScheduled(ctx, env.Client, pods[2]) + + // inform cluster state about the nodes + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node1)) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node2)) + fakeClock.Step(10 * time.Minute) + controller.ProcessCluster(ctx) + + // we don't need a new node, but we should evict everything off one of node2 which only has a single pod + Expect(cloudProvider.CreateCalls).To(HaveLen(0)) + // and delete the old one + ExpectNotFound(ctx, env.Client, node2) + }) + It("can delete nodes, considers PDB", func() { + var nl v1.NodeList + Expect(env.Client.List(ctx, &nl)).To(Succeed()) + Expect(nl.Items).To(HaveLen(0)) + labels := map[string]string{ + "app": "test", + } + // create our RS so we can link a pod to it + rs := test.ReplicaSet() + ExpectApplied(ctx, env.Client, rs) + Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(rs), rs)).To(Succeed()) + + pods := test.Pods(3, test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "ReplicaSet", + Name: rs.Name, + UID: rs.UID, + Controller: aws.Bool(true), + BlockOwnerDeletion: aws.Bool(true), + }, + }}}) + + // only pod[2] is covered by the PDB + pods[2].Labels = labels + pdb := test.PodDisruptionBudget(test.PDBOptions{ + Labels: labels, + MaxUnavailable: fromInt(0), + Status: &policyv1.PodDisruptionBudgetStatus{ + ObservedGeneration: 1, + DisruptionsAllowed: 0, + CurrentHealthy: 1, + DesiredHealthy: 1, + ExpectedPods: 1, + }, + }) + + prov := test.Provisioner(test.ProvisionerOptions{Consolidation: &v1alpha5.Consolidation{Enabled: aws.Bool(true)}}) + node1 := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + node2 := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], node1, node2, prov, pdb) + ExpectMakeNodesReady(ctx, env.Client, node1, node2) + // two pods on node 1 + ExpectManualBinding(ctx, env.Client, pods[0], node1) + ExpectManualBinding(ctx, env.Client, pods[1], node1) + // one on node 2, but it has a PDB with zero disruptions allowed + ExpectManualBinding(ctx, env.Client, pods[2], node2) + ExpectScheduled(ctx, env.Client, pods[0]) + ExpectScheduled(ctx, env.Client, pods[1]) + ExpectScheduled(ctx, env.Client, pods[2]) + + // inform cluster state about the nodes + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node1)) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node2)) + fakeClock.Step(10 * time.Minute) + controller.ProcessCluster(ctx) + + // we don't need a new node + Expect(cloudProvider.CreateCalls).To(HaveLen(0)) + // but we expect to delete the nmode with more pods (node1) as the pod on node2 has a PDB preventing + // eviction + ExpectNotFound(ctx, env.Client, node1) + }) + It("can delete nodes, considers do-not-evict", func() { + // create our RS so we can link a pod to it + rs := test.ReplicaSet() + ExpectApplied(ctx, env.Client, rs) + Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(rs), rs)).To(Succeed()) + + pods := test.Pods(3, test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "ReplicaSet", + Name: rs.Name, + UID: rs.UID, + Controller: aws.Bool(true), + BlockOwnerDeletion: aws.Bool(true), + }, + }}}) + + // only pod[2] has a do not evict annotation + pods[2].Annotations = map[string]string{ + v1alpha5.DoNotEvictPodAnnotationKey: "true", + } + + prov := test.Provisioner(test.ProvisionerOptions{Consolidation: &v1alpha5.Consolidation{Enabled: aws.Bool(true)}}) + node1 := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + node2 := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], node1, node2, prov) + ExpectMakeNodesReady(ctx, env.Client, node1, node2) + // two pods on node 1 + ExpectManualBinding(ctx, env.Client, pods[0], node1) + ExpectManualBinding(ctx, env.Client, pods[1], node1) + // one on node 2, but it has a do-not-evict annotation + ExpectManualBinding(ctx, env.Client, pods[2], node2) + ExpectScheduled(ctx, env.Client, pods[0]) + ExpectScheduled(ctx, env.Client, pods[1]) + ExpectScheduled(ctx, env.Client, pods[2]) + + // inform cluster state about the nodes + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node1)) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node2)) + fakeClock.Step(10 * time.Minute) + controller.ProcessCluster(ctx) + + // we don't need a new node + Expect(cloudProvider.CreateCalls).To(HaveLen(0)) + // but we expect to delete the node with more pods (node1) as the pod on node2 has a do-not-evict annotation + ExpectNotFound(ctx, env.Client, node1) + }) + It("can delete nodes, doesn't evict standalone pods", func() { + // create our RS so we can link a pod to it + rs := test.ReplicaSet() + ExpectApplied(ctx, env.Client, rs) + Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(rs), rs)).To(Succeed()) + + pods := test.Pods(3, test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "ReplicaSet", + Name: rs.Name, + UID: rs.UID, + Controller: aws.Bool(true), + BlockOwnerDeletion: aws.Bool(true), + }, + }}}) + + // pod[2] is a stand-alone (non ReplicaSet) pod + pods[2].OwnerReferences = nil + + prov := test.Provisioner(test.ProvisionerOptions{Consolidation: &v1alpha5.Consolidation{Enabled: aws.Bool(true)}}) + node1 := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + node2 := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], node1, node2, prov) + ExpectMakeNodesReady(ctx, env.Client, node1, node2) + // two pods on node 1 + ExpectManualBinding(ctx, env.Client, pods[0], node1) + ExpectManualBinding(ctx, env.Client, pods[1], node1) + // one on node 2, but it's a standalone pod + ExpectManualBinding(ctx, env.Client, pods[2], node2) + ExpectScheduled(ctx, env.Client, pods[0]) + ExpectScheduled(ctx, env.Client, pods[1]) + ExpectScheduled(ctx, env.Client, pods[2]) + + // inform cluster state about the nodes + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node1)) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node2)) + fakeClock.Step(10 * time.Minute) + controller.ProcessCluster(ctx) + + // we don't need a new node + Expect(cloudProvider.CreateCalls).To(HaveLen(0)) + // but we expect to delete the node with more pods (node1) as the pod on node2 doesn't have a controller to + // recreate it + ExpectNotFound(ctx, env.Client, node1) + }) +}) + +var _ = Describe("Node Lifetime Consideration", func() { + It("should consider node lifetime remaining when calculating disruption cost", func() { + labels := map[string]string{ + "app": "test", + } + // create our RS so we can link a pod to it + rs := test.ReplicaSet() + ExpectApplied(ctx, env.Client, rs) + Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(rs), rs)).To(Succeed()) + + pods := test.Pods(2, test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: labels, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "ReplicaSet", + Name: rs.Name, + UID: rs.UID, + Controller: aws.Bool(true), + BlockOwnerDeletion: aws.Bool(true), + }, + }}}) + + prov := test.Provisioner(test.ProvisionerOptions{TTLSecondsUntilExpired: aws.Int64(30), Consolidation: &v1alpha5.Consolidation{Enabled: aws.Bool(true)}}) + node1 := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + node2 := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], prov) + ExpectApplied(ctx, env.Client, node1) // ensure node1 is the oldest node + time.Sleep(1 * time.Second) // this sleep is unfortunate, but necessary. The creation time is from etcd and we can't mock it, so we + // need to sleep to force the second node to be created a bit after the first node. + ExpectApplied(ctx, env.Client, node2) + ExpectMakeNodesReady(ctx, env.Client, node1, node2) + ExpectManualBinding(ctx, env.Client, pods[0], node1) + ExpectManualBinding(ctx, env.Client, pods[1], node2) + ExpectScheduled(ctx, env.Client, pods[0]) + ExpectScheduled(ctx, env.Client, pods[1]) + + // inform cluster state about the nodes + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node1)) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node2)) + fakeClock.Step(10 * time.Minute) + controller.ProcessCluster(ctx) + + // the nodes are identical (same size, price, disruption cost, etc.) except for age. We should prefer to + // delete the older one + Expect(cloudProvider.CreateCalls).To(HaveLen(0)) + ExpectNotFound(ctx, env.Client, node1) + }) +}) + +var _ = Describe("Topology Consideration", func() { + It("can shrink node maintaining zonal topology spread", func() { + labels := map[string]string{ + "app": "test-zonal-spread", + } + + // create our RS so we can link a pod to it + rs := test.ReplicaSet() + ExpectApplied(ctx, env.Client, rs) + Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(rs), rs)).To(Succeed()) + + tsc := v1.TopologySpreadConstraint{ + MaxSkew: 1, + TopologyKey: v1.LabelTopologyZone, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + } + pods := test.Pods(4, test.PodOptions{ + ResourceRequirements: v1.ResourceRequirements{Requests: map[v1.ResourceName]resource.Quantity{v1.ResourceCPU: resource.MustParse("1")}}, + TopologySpreadConstraints: []v1.TopologySpreadConstraint{tsc}, + ObjectMeta: metav1.ObjectMeta{ + Labels: labels, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "ReplicaSet", + Name: rs.Name, + UID: rs.UID, + Controller: aws.Bool(true), + BlockOwnerDeletion: aws.Bool(true), + }, + }}}) + + prov := test.Provisioner(test.ProvisionerOptions{Consolidation: &v1alpha5.Consolidation{Enabled: aws.Bool(true)}}) + zone1Node := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelTopologyZone: "test-zone-1", + v1.LabelInstanceTypeStable: leastExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{v1.ResourceCPU: resource.MustParse("1")}}) + + zone2Node := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelTopologyZone: "test-zone-2", + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{v1.ResourceCPU: resource.MustParse("1")}}) + + zone3Node := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelTopologyZone: "test-zone-3", + v1.LabelInstanceTypeStable: leastExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{v1.ResourceCPU: resource.MustParse("1")}}) + + ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], zone1Node, zone2Node, zone3Node, prov) + ExpectMakeNodesReady(ctx, env.Client, zone1Node, zone2Node, zone3Node) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(zone1Node)) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(zone2Node)) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(zone3Node)) + ExpectManualBinding(ctx, env.Client, pods[0], zone1Node) + ExpectManualBinding(ctx, env.Client, pods[1], zone2Node) + ExpectManualBinding(ctx, env.Client, pods[2], zone3Node) + ExpectScheduled(ctx, env.Client, pods[0]) + ExpectScheduled(ctx, env.Client, pods[1]) + ExpectScheduled(ctx, env.Client, pods[2]) + + ExpectSkew(ctx, env.Client, "default", &tsc).To(ConsistOf(1, 1, 1)) + + // consolidation won't delete the old node until the new node is ready + wg := ExpectMakeNewNodesReady(ctx, env.Client, 1, zone1Node, zone2Node, zone3Node) + fakeClock.Step(10 * time.Minute) + controller.ProcessCluster(ctx) + wg.Wait() + + // should create a new node as there is a cheaper one that can hold the pod + Expect(cloudProvider.CreateCalls).To(HaveLen(1)) + + // we need to emulate the replicaset controller and bind a new pod to the newly created node + ExpectApplied(ctx, env.Client, pods[3]) + var nodes v1.NodeList + Expect(env.Client.List(ctx, &nodes)).To(Succeed()) + Expect(nodes.Items).To(HaveLen(3)) + for i, n := range nodes.Items { + // bind the pod to the new node we don't recognize as it is the one that consolidation created + if n.Name != zone1Node.Name && n.Name != zone2Node.Name && n.Name != zone3Node.Name { + ExpectManualBinding(ctx, env.Client, pods[3], &nodes.Items[i]) + } + } + // we should maintain our skew, the new node must be in the same zone as the old node it replaced + ExpectSkew(ctx, env.Client, "default", &tsc).To(ConsistOf(1, 1, 1)) + }) + It("won't delete node if it would violate pod anti-affinity", func() { + labels := map[string]string{ + "app": "test", + } + // create our RS so we can link a pod to it + rs := test.ReplicaSet() + ExpectApplied(ctx, env.Client, rs) + Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(rs), rs)).To(Succeed()) + + pods := test.Pods(3, test.PodOptions{ + ResourceRequirements: v1.ResourceRequirements{Requests: map[v1.ResourceName]resource.Quantity{v1.ResourceCPU: resource.MustParse("1")}}, + PodAntiRequirements: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + TopologyKey: v1.LabelHostname, + }, + }, + ObjectMeta: metav1.ObjectMeta{Labels: labels, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "ReplicaSet", + Name: rs.Name, + UID: rs.UID, + Controller: aws.Bool(true), + BlockOwnerDeletion: aws.Bool(true), + }, + }}}) + + prov := test.Provisioner(test.ProvisionerOptions{Consolidation: &v1alpha5.Consolidation{Enabled: aws.Bool(true)}}) + zone1Node := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelTopologyZone: "test-zone-1", + v1.LabelInstanceTypeStable: leastExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{v1.ResourceCPU: resource.MustParse("1")}}) + + zone2Node := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelTopologyZone: "test-zone-2", + v1.LabelInstanceTypeStable: leastExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{v1.ResourceCPU: resource.MustParse("1")}}) + + zone3Node := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelTopologyZone: "test-zone-3", + v1.LabelInstanceTypeStable: leastExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{v1.ResourceCPU: resource.MustParse("1")}}) + + ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], zone1Node, zone2Node, zone3Node, prov) + ExpectMakeNodesReady(ctx, env.Client, zone1Node, zone2Node, zone3Node) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(zone1Node)) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(zone2Node)) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(zone3Node)) + ExpectManualBinding(ctx, env.Client, pods[0], zone1Node) + ExpectManualBinding(ctx, env.Client, pods[1], zone2Node) + ExpectManualBinding(ctx, env.Client, pods[2], zone3Node) + ExpectScheduled(ctx, env.Client, pods[0]) + ExpectScheduled(ctx, env.Client, pods[1]) + ExpectScheduled(ctx, env.Client, pods[2]) + + wg := ExpectMakeNewNodesReady(ctx, env.Client, 1, zone1Node, zone2Node, zone3Node) + fakeClock.Step(10 * time.Minute) + controller.ProcessCluster(ctx) + wg.Wait() + + // our nodes are already the cheapest available, so we can't shrink them. If we delete or merge, it would + // violate the anti-affinity rule so we can't do anything. + Expect(cloudProvider.CreateCalls).To(HaveLen(0)) + ExpectNodeExists(ctx, env.Client, zone1Node.Name) + ExpectNodeExists(ctx, env.Client, zone2Node.Name) + ExpectNodeExists(ctx, env.Client, zone3Node.Name) + + }) +}) + +var _ = Describe("Empty Nodes", func() { + It("can delete empty nodes", func() { + prov := test.Provisioner(test.ProvisionerOptions{Consolidation: &v1alpha5.Consolidation{Enabled: aws.Bool(true)}}) + + node1 := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + v1alpha5.LabelNodeInitialized: "true", + }, + }, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + ExpectApplied(ctx, env.Client, node1, prov) + + // inform cluster state about the nodes + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node1)) + fakeClock.Step(10 * time.Minute) + controller.ProcessCluster(ctx) + + // we don't need any new nodes + Expect(cloudProvider.CreateCalls).To(HaveLen(0)) + // and should delete the empty one + ExpectNotFound(ctx, env.Client, node1) + }) + It("can delete multiple empty nodes", func() { + prov := test.Provisioner(test.ProvisionerOptions{Consolidation: &v1alpha5.Consolidation{Enabled: aws.Bool(true)}}) + + node1 := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + node2 := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }}, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + ExpectApplied(ctx, env.Client, node1, node2, prov) + ExpectMakeNodesReady(ctx, env.Client, node1, node2) + + // inform cluster state about the nodes + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node1)) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node2)) + fakeClock.Step(10 * time.Minute) + controller.ProcessCluster(ctx) + + // we don't need any new nodes + Expect(cloudProvider.CreateCalls).To(HaveLen(0)) + // and should delete both empty ones + ExpectNotFound(ctx, env.Client, node1) + ExpectNotFound(ctx, env.Client, node2) + }) +}) + +var _ = Describe("Special Cases", func() { + It("doesn't consolidate in the presence of uninitialized nodes", func() { + prov := test.Provisioner(test.ProvisionerOptions{Consolidation: &v1alpha5.Consolidation{Enabled: aws.Bool(true)}}) + + uninitializedNode := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + }, + }, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + emptyNode := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: prov.Name, + v1.LabelInstanceTypeStable: mostExpensiveInstance.Name(), + v1alpha5.LabelNodeInitialized: "true", + }, + }, + Allocatable: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse("32"), + v1.ResourcePods: resource.MustParse("100"), + }}) + + ExpectApplied(ctx, env.Client, emptyNode, uninitializedNode, prov) + + // inform cluster state about the nodes + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(emptyNode)) + ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(uninitializedNode)) + fakeClock.Step(10 * time.Minute) + controller.ProcessCluster(ctx) + + // we don't need any new nodes + Expect(cloudProvider.CreateCalls).To(HaveLen(0)) + // and shouldn't delete the empty one due to the un-initialized node + ExpectNodeExists(ctx, env.Client, emptyNode.Name) + }) +}) + +func fromInt(i int) *intstr.IntOrString { + v := intstr.FromInt(i) + return &v +} + +func ExpectMakeNewNodesReady(ctx context.Context, client client.Client, numNewNodes int, existingNodes ...*v1.Node) *sync.WaitGroup { + var wg sync.WaitGroup + + existingNodeNames := sets.NewString() + for _, existing := range existingNodes { + existingNodeNames.Insert(existing.Name) + } + go func() { + defer GinkgoRecover() + wg.Add(1) + defer wg.Add(-1) + start := time.Now() + for { + select { + case <-time.After(50 * time.Millisecond): + // give up after 2 seconds + if time.Since(start) > 2*time.Second { + return + } + var nodeList v1.NodeList + client.List(ctx, &nodeList) + nodesMadeReady := 0 + for i := range nodeList.Items { + n := &nodeList.Items[i] + if existingNodeNames.Has(n.Name) { + continue + } + ExpectMakeNodesReady(ctx, env.Client, n) + nodesMadeReady++ + // did we make all of the nodes ready that we expected? + if nodesMadeReady == numNewNodes { + return + } + } + case <-ctx.Done(): + return + } + } + }() + return &wg +} + +func ExpectMakeNodesReady(ctx context.Context, c client.Client, nodes ...*v1.Node) { + for _, node := range nodes { + var n v1.Node + Expect(c.Get(ctx, client.ObjectKeyFromObject(node), &n)).To(Succeed()) + n.Status.Phase = v1.NodeRunning + n.Status.Conditions = []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Now(), + LastTransitionTime: metav1.Now(), + Reason: "KubeletReady", + }, + } + if n.Labels == nil { + n.Labels = map[string]string{} + } + n.Labels[v1alpha5.LabelNodeInitialized] = "true" + n.Spec.Taints = nil + ExpectApplied(ctx, c, &n) + } +} diff --git a/pkg/controllers/counter/controller.go b/pkg/controllers/counter/controller.go index 80e5dda453f7..f7c277ac63f7 100644 --- a/pkg/controllers/counter/controller.go +++ b/pkg/controllers/counter/controller.go @@ -79,6 +79,10 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco } provisioner.Status.Resources = resourceCounts if err := c.kubeClient.Status().Patch(ctx, provisioner, client.MergeFrom(persisted)); err != nil { + // provisioner was deleted + if errors.IsNotFound(err) { + return reconcile.Result{}, nil + } return reconcile.Result{}, fmt.Errorf("patching provisioner, %w", err) } return reconcile.Result{}, nil diff --git a/pkg/controllers/metrics/state/suite_test.go b/pkg/controllers/metrics/state/suite_test.go index 68a6933784e3..bbc41a7a2bab 100644 --- a/pkg/controllers/metrics/state/suite_test.go +++ b/pkg/controllers/metrics/state/suite_test.go @@ -17,7 +17,9 @@ package metrics_test import ( "context" "fmt" + "k8s.io/apimachinery/pkg/util/clock" "testing" + "time" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/cloudprovider/fake" @@ -38,6 +40,7 @@ import ( var ctx context.Context var cfg *test.Config +var fakeClock *clock.FakeClock var env *test.Environment var cluster *state.Cluster var nodeController *state.NodeController @@ -58,7 +61,8 @@ var _ = BeforeSuite(func() { Expect(env.Start()).To(Succeed(), "Failed to start environment") cloudProvider = &fake.CloudProvider{InstanceTypes: fake.InstanceTypesAssorted()} - cluster = state.NewCluster(cfg, env.Client, cloudProvider) + fakeClock = clock.NewFakeClock(time.Now()) + cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) provisioner = test.Provisioner(test.ProvisionerOptions{ObjectMeta: metav1.ObjectMeta{Name: "default"}}) nodeController = state.NewNodeController(env.Client, cluster) podController = state.NewPodController(env.Client, cluster) diff --git a/pkg/controllers/node/controller.go b/pkg/controllers/node/controller.go index 78578c38c58a..52dd90acd540 100644 --- a/pkg/controllers/node/controller.go +++ b/pkg/controllers/node/controller.go @@ -23,6 +23,7 @@ import ( "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/clock" "knative.dev/pkg/logging" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -41,12 +42,12 @@ import ( const controllerName = "node" // NewController constructs a controller instance -func NewController(kubeClient client.Client, cloudProvider cloudprovider.CloudProvider, cluster *state.Cluster) *Controller { +func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider cloudprovider.CloudProvider, cluster *state.Cluster) *Controller { return &Controller{ kubeClient: kubeClient, initialization: &Initialization{kubeClient: kubeClient, cloudProvider: cloudProvider}, - emptiness: &Emptiness{kubeClient: kubeClient, cluster: cluster}, - expiration: &Expiration{kubeClient: kubeClient}, + emptiness: &Emptiness{kubeClient: kubeClient, clock: clk, cluster: cluster}, + expiration: &Expiration{kubeClient: kubeClient, clock: clk}, } } diff --git a/pkg/controllers/node/emptiness.go b/pkg/controllers/node/emptiness.go index bed37f1fd733..ba0a2255e969 100644 --- a/pkg/controllers/node/emptiness.go +++ b/pkg/controllers/node/emptiness.go @@ -19,6 +19,8 @@ import ( "fmt" "time" + "k8s.io/apimachinery/pkg/util/clock" + v1 "k8s.io/api/core/v1" "knative.dev/pkg/logging" "knative.dev/pkg/ptr" @@ -28,13 +30,13 @@ import ( "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/controllers/state" "github.com/aws/karpenter/pkg/utils/functional" - "github.com/aws/karpenter/pkg/utils/injectabletime" "github.com/aws/karpenter/pkg/utils/pod" ) // Emptiness is a subreconciler that deletes nodes that are empty after a ttl type Emptiness struct { kubeClient client.Client + clock clock.Clock cluster *state.Cluster } @@ -73,7 +75,7 @@ func (r *Emptiness) Reconcile(ctx context.Context, provisioner *v1alpha5.Provisi n.Annotations = functional.UnionStringMaps(n.Annotations) ttl := time.Duration(ptr.Int64Value(provisioner.Spec.TTLSecondsAfterEmpty)) * time.Second if !hasEmptinessTimestamp { - n.Annotations[v1alpha5.EmptinessTimestampAnnotationKey] = injectabletime.Now().Format(time.RFC3339) + n.Annotations[v1alpha5.EmptinessTimestampAnnotationKey] = r.clock.Now().Format(time.RFC3339) logging.FromContext(ctx).Infof("Added TTL to empty node") return reconcile.Result{RequeueAfter: ttl}, nil } @@ -82,13 +84,13 @@ func (r *Emptiness) Reconcile(ctx context.Context, provisioner *v1alpha5.Provisi if err != nil { return reconcile.Result{}, fmt.Errorf("parsing emptiness timestamp, %s", emptinessTimestamp) } - if injectabletime.Now().After(emptinessTime.Add(ttl)) { + if r.clock.Now().After(emptinessTime.Add(ttl)) { logging.FromContext(ctx).Infof("Triggering termination after %s for empty node", ttl) if err := r.kubeClient.Delete(ctx, n); err != nil { return reconcile.Result{}, fmt.Errorf("deleting node, %w", err) } } - return reconcile.Result{RequeueAfter: emptinessTime.Add(ttl).Sub(injectabletime.Now())}, nil + return reconcile.Result{RequeueAfter: emptinessTime.Add(ttl).Sub(r.clock.Now())}, nil } func (r *Emptiness) isEmpty(ctx context.Context, n *v1.Node) (bool, error) { diff --git a/pkg/controllers/node/expiration.go b/pkg/controllers/node/expiration.go index 1f2befced47e..2f98acfb7779 100644 --- a/pkg/controllers/node/expiration.go +++ b/pkg/controllers/node/expiration.go @@ -19,6 +19,8 @@ import ( "fmt" "time" + "k8s.io/apimachinery/pkg/util/clock" + v1 "k8s.io/api/core/v1" "knative.dev/pkg/logging" "knative.dev/pkg/ptr" @@ -26,12 +28,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter/pkg/utils/injectabletime" ) // Expiration is a subreconciler that terminates nodes after a period of time. type Expiration struct { kubeClient client.Client + clock clock.Clock } // Reconcile reconciles the node @@ -43,7 +45,7 @@ func (r *Expiration) Reconcile(ctx context.Context, provisioner *v1alpha5.Provis // 2. Trigger termination workflow if expired expirationTTL := time.Duration(ptr.Int64Value(provisioner.Spec.TTLSecondsUntilExpired)) * time.Second expirationTime := node.CreationTimestamp.Add(expirationTTL) - if injectabletime.Now().After(expirationTime) { + if r.clock.Now().After(expirationTime) { logging.FromContext(ctx).Infof("Triggering termination for expired node after %s (+%s)", expirationTTL, time.Since(expirationTime)) if err := r.kubeClient.Delete(ctx, node); err != nil { return reconcile.Result{}, fmt.Errorf("deleting node, %w", err) diff --git a/pkg/controllers/node/suite_test.go b/pkg/controllers/node/suite_test.go index d0e4db9e1881..ff9df3d8bd0c 100644 --- a/pkg/controllers/node/suite_test.go +++ b/pkg/controllers/node/suite_test.go @@ -19,26 +19,29 @@ import ( "testing" "time" - "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter/pkg/cloudprovider/fake" - "github.com/aws/karpenter/pkg/controllers/node" - "github.com/aws/karpenter/pkg/controllers/state" - "github.com/aws/karpenter/pkg/test" - . "github.com/aws/karpenter/pkg/test/expectations" - "github.com/aws/karpenter/pkg/utils/injectabletime" - . "github.com/onsi/ginkgo" - . "github.com/onsi/gomega" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - . "knative.dev/pkg/logging/testing" + "k8s.io/apimachinery/pkg/util/clock" "knative.dev/pkg/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" + + . "github.com/aws/karpenter/pkg/test/expectations" + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" + . "knative.dev/pkg/logging/testing" + + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider/fake" + "github.com/aws/karpenter/pkg/controllers/node" + "github.com/aws/karpenter/pkg/controllers/state" + "github.com/aws/karpenter/pkg/test" ) var ctx context.Context var controller *node.Controller var env *test.Environment +var fakeClock *clock.FakeClock func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) @@ -47,10 +50,11 @@ func TestAPIs(t *testing.T) { } var _ = BeforeSuite(func() { + fakeClock = clock.NewFakeClock(time.Now()) env = test.NewEnvironment(ctx, func(e *test.Environment) { cp := &fake.CloudProvider{} - cluster := state.NewCluster(test.NewConfig(), e.Client, cp) - controller = node.NewController(e.Client, cp, cluster) + cluster := state.NewCluster(fakeClock, test.NewConfig(), e.Client, cp) + controller = node.NewController(fakeClock, e.Client, cp, cluster) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) @@ -69,7 +73,7 @@ var _ = Describe("Controller", func() { }) AfterEach(func() { - injectabletime.Now = time.Now + fakeClock.SetTime(time.Now()) ExpectCleanedUp(ctx, env.Client) }) @@ -106,6 +110,7 @@ var _ = Describe("Controller", func() { }, }}) ExpectApplied(ctx, env.Client, provisioner, n) + fakeClock.SetTime(time.Now()) // Should still exist ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(n)) @@ -113,9 +118,8 @@ var _ = Describe("Controller", func() { Expect(n.DeletionTimestamp.IsZero()).To(BeTrue()) // Simulate time passing - injectabletime.Now = func() time.Time { - return time.Now().Add(time.Duration(*provisioner.Spec.TTLSecondsUntilExpired) * time.Second) - } + fakeClock.Step(time.Duration(*provisioner.Spec.TTLSecondsUntilExpired) * time.Second) + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(n)) n = ExpectNodeExists(ctx, env.Client, n.Name) Expect(n.DeletionTimestamp.IsZero()).To(BeFalse()) @@ -157,11 +161,11 @@ var _ = Describe("Controller", func() { ExpectApplied(ctx, env.Client, provisioner, node) // mark it empty first to get past the debounce check - injectabletime.Now = func() time.Time { return time.Now().Add(30 * time.Second) } + fakeClock.Step(30 * time.Second) ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(node)) // make the node more than 5 minutes old - injectabletime.Now = func() time.Time { return time.Now().Add(320 * time.Second) } + fakeClock.Step(320 * time.Second) ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(node)) node = ExpectNodeExists(ctx, env.Client, node.Name) @@ -172,7 +176,7 @@ var _ = Describe("Controller", func() { node := test.Node(test.NodeOptions{ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{v1alpha5.ProvisionerNameLabelKey: provisioner.Name}, Annotations: map[string]string{ - v1alpha5.EmptinessTimestampAnnotationKey: time.Now().Add(100 * time.Second).Format(time.RFC3339), + v1alpha5.EmptinessTimestampAnnotationKey: fakeClock.Now().Add(100 * time.Second).Format(time.RFC3339), }}, }) ExpectApplied(ctx, env.Client, provisioner, node, test.Pod(test.PodOptions{ @@ -180,7 +184,7 @@ var _ = Describe("Controller", func() { Conditions: []v1.PodCondition{{Type: v1.PodReady, Status: v1.ConditionTrue}}, })) // make the node more than 5 minutes old - injectabletime.Now = func() time.Time { return time.Now().Add(320 * time.Second) } + fakeClock.Step(320 * time.Second) ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(node)) node = ExpectNodeExists(ctx, env.Client, node.Name) @@ -196,12 +200,9 @@ var _ = Describe("Controller", func() { }}, }) ExpectApplied(ctx, env.Client, provisioner, node) - // debounce emptiness - injectabletime.Now = func() time.Time { return time.Now().Add(10 * time.Second) } - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(node)) // make the node more than 5 minutes old - injectabletime.Now = func() time.Time { return time.Now().Add(320 * time.Second) } + fakeClock.Step(320 * time.Second) ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(node)) node = ExpectNodeExists(ctx, env.Client, node.Name) @@ -209,8 +210,6 @@ var _ = Describe("Controller", func() { }) It("should requeue reconcile if node is empty, but not past emptiness TTL", func() { provisioner.Spec.TTLSecondsAfterEmpty = ptr.Int64(30) - now := time.Now() - injectabletime.Now = func() time.Time { return now } // injectabletime.Now() is called multiple times in function being tested. node := test.Node(test.NodeOptions{ObjectMeta: metav1.ObjectMeta{ Finalizers: []string{v1alpha5.TerminationFinalizer}, Labels: map[string]string{v1alpha5.ProvisionerNameLabelKey: provisioner.Name}, @@ -218,15 +217,10 @@ var _ = Describe("Controller", func() { ExpectApplied(ctx, env.Client, provisioner, node) - // debounce the emptiness - now = now.Add(10 * time.Second) - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(node)) - // make the node eligible to be expired - now = now.Add(320 * time.Second) - injectabletime.Now = func() time.Time { return now } + fakeClock.Step(320 * time.Second) - emptinessTime := injectabletime.Now().Add(-10 * time.Second) + emptinessTime := fakeClock.Now().Add(-10 * time.Second) node.Annotations = map[string]string{ v1alpha5.EmptinessTimestampAnnotationKey: emptinessTime.Format(time.RFC3339), } @@ -234,7 +228,7 @@ var _ = Describe("Controller", func() { // Emptiness timestamps are first formatted to a string friendly (time.RFC3339) (to put it in the node object) // and then eventually parsed back into time.Time when comparing ttls. Repeating that logic in the test. emptinessTimestamp, _ := time.Parse(time.RFC3339, emptinessTime.Format(time.RFC3339)) - expectedRequeueTime := emptinessTimestamp.Add(30 * time.Second).Sub(injectabletime.Now()) // we should requeue in ~20 seconds. + expectedRequeueTime := emptinessTimestamp.Add(30 * time.Second).Sub(fakeClock.Now()) // we should requeue in ~20 seconds. result := ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(node)) Expect(result).To(Equal(reconcile.Result{Requeue: true, RequeueAfter: expectedRequeueTime})) diff --git a/pkg/controllers/provisioning/controller.go b/pkg/controllers/provisioning/controller.go index a4cc872ff02e..9461b5405675 100644 --- a/pkg/controllers/provisioning/controller.go +++ b/pkg/controllers/provisioning/controller.go @@ -18,18 +18,11 @@ import ( "context" "time" - "github.com/aws/karpenter/pkg/config" - "github.com/aws/karpenter/pkg/utils/pod" - "github.com/aws/karpenter/pkg/events" - - "github.com/aws/karpenter/pkg/controllers/state" - - "github.com/aws/karpenter/pkg/cloudprovider" + "github.com/aws/karpenter/pkg/utils/pod" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" - corev1 "k8s.io/client-go/kubernetes/typed/core/v1" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" @@ -47,10 +40,10 @@ type Controller struct { } // NewController constructs a controller instance -func NewController(ctx context.Context, cfg config.Config, kubeClient client.Client, coreV1Client corev1.CoreV1Interface, recorder events.Recorder, cloudProvider cloudprovider.CloudProvider, cluster *state.Cluster) *Controller { +func NewController(kubeClient client.Client, provisioner *Provisioner, recorder events.Recorder) *Controller { return &Controller{ kubeClient: kubeClient, - provisioner: NewProvisioner(ctx, cfg, kubeClient, coreV1Client, recorder, cloudProvider, cluster), + provisioner: provisioner, recorder: recorder, } } diff --git a/pkg/controllers/provisioning/provisioner.go b/pkg/controllers/provisioning/provisioner.go index 15b4d3cce2da..8db3adce4ffb 100644 --- a/pkg/controllers/provisioning/provisioner.go +++ b/pkg/controllers/provisioning/provisioner.go @@ -133,23 +133,30 @@ func (p *Provisioner) Provision(ctx context.Context) error { return nil } + _, err = p.LaunchNodes(ctx, nodes...) + return err +} + +func (p *Provisioner) LaunchNodes(ctx context.Context, nodes ...*scheduler.Node) ([]string, error) { // Launch capacity and bind pods errs := make([]error, len(nodes)) + nodeNames := make([]string, len(nodes)) workqueue.ParallelizeUntil(ctx, len(nodes), len(nodes), func(i int) { // create a new context to avoid a data race on the ctx variable ctx := logging.WithLogger(ctx, logging.FromContext(ctx).With("provisioner", nodes[i].Labels[v1alpha5.ProvisionerNameLabelKey])) // register the provisioner on the context so we can pull it off for tagging purposes // TODO: rethink this, maybe just pass the provisioner down instead of hiding it in the context? ctx = injection.WithNamespacedName(ctx, types.NamespacedName{Name: nodes[i].Labels[v1alpha5.ProvisionerNameLabelKey]}) - if err := p.launch(ctx, nodes[i]); err != nil { + if nodeName, err := p.launch(ctx, nodes[i]); err != nil { errs[i] = fmt.Errorf("launching node, %w", err) + } else { + nodeNames[i] = nodeName } }) if err := multierr.Combine(errs...); err != nil { - return err + return nodeNames, err } - logging.FromContext(ctx).Infof("Waiting for unschedulable pods") - return nil + return nodeNames, nil } func (p *Provisioner) getPods(ctx context.Context) ([]*v1.Pod, error) { @@ -175,9 +182,7 @@ func (p *Provisioner) getPods(ctx context.Context) ([]*v1.Pod, error) { } // nolint: gocyclo -func (p *Provisioner) schedule(ctx context.Context, pods []*v1.Pod) ([]*scheduler.Node, error) { - defer metrics.Measure(schedulingDuration.WithLabelValues(injection.GetNamespacedName(ctx).Name))() - +func (p *Provisioner) NewScheduler(ctx context.Context, pods []*v1.Pod, opts scheduler.SchedulerOptions) (*scheduler.Scheduler, error) { // Build node templates var nodeTemplates []*scheduling.NodeTemplate var provisionerList v1alpha5.ProvisionerList @@ -228,19 +233,31 @@ func (p *Provisioner) schedule(ctx context.Context, pods []*v1.Pod) ([]*schedule if err != nil { return nil, fmt.Errorf("getting daemon overhead, %w", err) } + return scheduler.NewScheduler(ctx, p.kubeClient, nodeTemplates, provisionerList.Items, p.cluster, topology, instanceTypes, daemonOverhead, p.recorder, opts), nil +} - return scheduler.NewScheduler(ctx, p.kubeClient, nodeTemplates, provisionerList.Items, p.cluster, topology, instanceTypes, daemonOverhead, p.recorder).Solve(ctx, pods) +func (p *Provisioner) schedule(ctx context.Context, pods []*v1.Pod) ([]*scheduler.Node, error) { + defer metrics.Measure(schedulingDuration.WithLabelValues(injection.GetNamespacedName(ctx).Name))() + + scheduler, err := p.NewScheduler(ctx, pods, scheduler.SchedulerOptions{}) + if err != nil { + return nil, fmt.Errorf("creating scheduler, %w", err) + } + + // don't care about inflight scheduling results in this context + nodes, _, err := scheduler.Solve(ctx, pods) + return nodes, err } -func (p *Provisioner) launch(ctx context.Context, node *scheduler.Node) error { +func (p *Provisioner) launch(ctx context.Context, node *scheduler.Node) (string, error) { // Check limits latest := &v1alpha5.Provisioner{} name := node.Requirements.Get(v1alpha5.ProvisionerNameLabelKey).Any() if err := p.kubeClient.Get(ctx, types.NamespacedName{Name: name}, latest); err != nil { - return fmt.Errorf("getting current resource usage, %w", err) + return "", fmt.Errorf("getting current resource usage, %w", err) } if err := latest.Spec.Limits.ExceededBy(latest.Status.Resources); err != nil { - return err + return "", err } k8sNode, err := p.cloudProvider.Create( @@ -248,11 +265,11 @@ func (p *Provisioner) launch(ctx context.Context, node *scheduler.Node) error { &cloudprovider.NodeRequest{InstanceTypeOptions: node.InstanceTypeOptions, Template: &node.NodeTemplate}, ) if err != nil { - return fmt.Errorf("creating cloud provider instance, %w", err) + return "", fmt.Errorf("creating cloud provider instance, %w", err) } if err := mergo.Merge(k8sNode, node.ToNode()); err != nil { - return fmt.Errorf("merging cloud provider node, %w", err) + return "", fmt.Errorf("merging cloud provider node, %w", err) } // ensure we clear out the status k8sNode.Status = v1.NodeStatus{} @@ -266,7 +283,7 @@ func (p *Provisioner) launch(ctx context.Context, node *scheduler.Node) error { if errors.IsAlreadyExists(err) { logging.FromContext(ctx).Debugf("node %s already registered", k8sNode.Name) } else { - return fmt.Errorf("creating node %s, %w", k8sNode.Name, err) + return "", fmt.Errorf("creating node %s, %w", k8sNode.Name, err) } } logging.FromContext(ctx).Infof("Created %s", node) @@ -274,7 +291,7 @@ func (p *Provisioner) launch(ctx context.Context, node *scheduler.Node) error { for _, pod := range node.Pods { p.recorder.NominatePod(pod, k8sNode) } - return nil + return k8sNode.Name, nil } func (p *Provisioner) getDaemonOverhead(ctx context.Context, nodeTemplates []*scheduling.NodeTemplate) (map[*scheduling.NodeTemplate]v1.ResourceList, error) { diff --git a/pkg/controllers/provisioning/scheduling/node.go b/pkg/controllers/provisioning/scheduling/node.go index 0ceef8b3c1d0..7e10dd295cf8 100644 --- a/pkg/controllers/provisioning/scheduling/node.go +++ b/pkg/controllers/provisioning/scheduling/node.go @@ -91,7 +91,7 @@ func (n *Node) Add(ctx context.Context, pod *v1.Pod) error { // Check instance type combinations requests := resources.Merge(n.requests, resources.RequestsForPods(pod)) - instanceTypes := filterInstanceTypes(n.InstanceTypeOptions, nodeRequirements, requests) + instanceTypes := filterInstanceTypesByRequirements(n.InstanceTypeOptions, nodeRequirements, requests) if len(instanceTypes) == 0 { return fmt.Errorf("no instance type satisfied resources %s and requirements %s", resources.String(resources.RequestsForPods(pod)), nodeRequirements) } @@ -107,21 +107,26 @@ func (n *Node) Add(ctx context.Context, pod *v1.Pod) error { } func (n *Node) String() string { + return fmt.Sprintf("node with %d pods requesting %s from types %s", len(n.Pods), resources.String(n.requests), + InstanceTypeList(n.InstanceTypeOptions)) +} + +func InstanceTypeList(instanceTypeOptions []cloudprovider.InstanceType) string { var itSb strings.Builder - for i, it := range n.InstanceTypeOptions { + for i, it := range instanceTypeOptions { // print the first 5 instance types only (indices 0-4) if i > 4 { - fmt.Fprintf(&itSb, " and %d other(s)", len(n.InstanceTypeOptions)-i) + fmt.Fprintf(&itSb, " and %d other(s)", len(instanceTypeOptions)-i) break } else if i > 0 { fmt.Fprint(&itSb, ", ") } fmt.Fprint(&itSb, it.Name()) } - return fmt.Sprintf("node with %d pods requesting %s from types %s", len(n.Pods), resources.String(n.requests), itSb.String()) + return itSb.String() } -func filterInstanceTypes(instanceTypes []cloudprovider.InstanceType, requirements scheduling.Requirements, requests v1.ResourceList) []cloudprovider.InstanceType { +func filterInstanceTypesByRequirements(instanceTypes []cloudprovider.InstanceType, requirements scheduling.Requirements, requests v1.ResourceList) []cloudprovider.InstanceType { return lo.Filter(instanceTypes, func(instanceType cloudprovider.InstanceType, _ int) bool { return compatible(instanceType, requirements) && fits(instanceType, requests) && hasOffering(instanceType, requirements) }) diff --git a/pkg/controllers/provisioning/scheduling/preferences.go b/pkg/controllers/provisioning/scheduling/preferences.go index f57244397821..60fcc778844b 100644 --- a/pkg/controllers/provisioning/scheduling/preferences.go +++ b/pkg/controllers/provisioning/scheduling/preferences.go @@ -27,17 +27,25 @@ import ( "github.com/aws/karpenter/pkg/utils/pretty" ) -type Preferences struct{} +type Preferences struct { + // ToleratePreferNoSchedule controls if preference relaxation adds a toleration for PreferNoSchedule taints. This only + // helps if there is a corresponding taint, so we don't always add it. + ToleratePreferNoSchedule bool +} func (p *Preferences) Relax(ctx context.Context, pod *v1.Pod) bool { - for _, relaxFunc := range []func(*v1.Pod) *string{ + relaxations := []func(*v1.Pod) *string{ p.removeRequiredNodeAffinityTerm, p.removePreferredPodAffinityTerm, p.removePreferredPodAntiAffinityTerm, p.removePreferredNodeAffinityTerm, - p.removeTopologySpreadScheduleAnyway, - p.toleratePreferNoScheduleTaints, - } { + p.removeTopologySpreadScheduleAnyway} + + if p.ToleratePreferNoSchedule { + relaxations = append(relaxations, p.toleratePreferNoScheduleTaints) + } + + for _, relaxFunc := range relaxations { if reason := relaxFunc(pod); reason != nil { logging.FromContext(ctx). diff --git a/pkg/controllers/provisioning/scheduling/scheduler.go b/pkg/controllers/provisioning/scheduling/scheduler.go index 2d81959ce114..bc04d948c006 100644 --- a/pkg/controllers/provisioning/scheduling/scheduler.go +++ b/pkg/controllers/provisioning/scheduling/scheduler.go @@ -22,6 +22,7 @@ import ( "github.com/samber/lo" "go.uber.org/multierr" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" "knative.dev/pkg/logging" "sigs.k8s.io/controller-runtime/pkg/client" @@ -33,7 +34,25 @@ import ( "github.com/aws/karpenter/pkg/utils/resources" ) -func NewScheduler(ctx context.Context, kubeClient client.Client, nodeTemplates []*scheduling.NodeTemplate, provisioners []v1alpha5.Provisioner, cluster *state.Cluster, topology *Topology, instanceTypes map[string][]cloudprovider.InstanceType, daemonOverhead map[*scheduling.NodeTemplate]v1.ResourceList, recorder events.Recorder) *Scheduler { +// SchedulerOptions can be used to control the scheduling, these options are currently only used during consolidation. +type SchedulerOptions struct { + // SimulationMode if true will prevent recording of the pod nomination decisions as events + SimulationMode bool + // ExcludeNodes are a list of node names that are excluded from inflight nodes for scheduling purposes. + ExcludeNodes []string +} + +func NewScheduler(ctx context.Context, kubeClient client.Client, nodeTemplates []*scheduling.NodeTemplate, provisioners []v1alpha5.Provisioner, cluster *state.Cluster, topology *Topology, instanceTypes map[string][]cloudprovider.InstanceType, daemonOverhead map[*scheduling.NodeTemplate]v1.ResourceList, recorder events.Recorder, opts SchedulerOptions) *Scheduler { + // if any of the provisioners add a taint with a prefer no schedule effect, we add a toleration for the taint + // during preference relaxation + toleratePreferNoSchedule := false + for _, prov := range provisioners { + for _, taint := range prov.Spec.Taints { + if taint.Effect == v1.TaintEffectPreferNoSchedule { + toleratePreferNoSchedule = true + } + } + } for provisioner := range instanceTypes { sort.Slice(instanceTypes[provisioner], func(i, j int) bool { return instanceTypes[provisioner][i].Price() < instanceTypes[provisioner][j].Price() @@ -48,7 +67,8 @@ func NewScheduler(ctx context.Context, kubeClient client.Client, nodeTemplates [ instanceTypes: instanceTypes, daemonOverhead: daemonOverhead, recorder: recorder, - preferences: &Preferences{}, + opts: opts, + preferences: &Preferences{ToleratePreferNoSchedule: toleratePreferNoSchedule}, remainingResources: map[string]v1.ResourceList{}, } @@ -63,7 +83,12 @@ func NewScheduler(ctx context.Context, kubeClient client.Client, nodeTemplates [ } // create our in-flight nodes + excluded := sets.NewString(opts.ExcludeNodes...) s.cluster.ForEachNode(func(node *state.Node) bool { + // skip any nodes that have been excluded + if excluded.Has(node.Node.Name) { + return true + } name, ok := node.Node.Labels[v1alpha5.ProvisionerNameLabelKey] if !ok { // ignoring this node as it wasn't launched by us @@ -97,10 +122,11 @@ type Scheduler struct { topology *Topology cluster *state.Cluster recorder events.Recorder + opts SchedulerOptions kubeClient client.Client } -func (s *Scheduler) Solve(ctx context.Context, pods []*v1.Pod) ([]*Node, error) { +func (s *Scheduler) Solve(ctx context.Context, pods []*v1.Pod) ([]*Node, []*InFlightNode, error) { // We loop trying to schedule unschedulable pods as long as we are making progress. This solves a few // issues including pods with affinity to another pod in the batch. We could topo-sort to solve this, but it wouldn't // solve the problem of scheduling pods where a particular order is needed to prevent a max-skew violation. E.g. if we @@ -129,8 +155,10 @@ func (s *Scheduler) Solve(ctx context.Context, pods []*v1.Pod) ([]*Node, error) } } } - s.recordSchedulingResults(ctx, pods, q.List(), errors) - return s.nodes, nil + if !s.opts.SimulationMode { + s.recordSchedulingResults(ctx, pods, q.List(), errors) + } + return s.nodes, s.inflight, nil } func (s *Scheduler) recordSchedulingResults(ctx context.Context, pods []*v1.Pod, failedToSchedule []*v1.Pod, errors map[*v1.Pod]error) { diff --git a/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go b/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go index aec8d1433c13..a9d52427f322 100644 --- a/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go +++ b/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go @@ -120,7 +120,8 @@ func benchmarkScheduler(b *testing.B, instanceCount, podCount int) { &Topology{}, map[string][]cloudprovider.InstanceType{provisioner.Name: instanceTypes}, map[*scheduling.NodeTemplate]v1.ResourceList{}, - test.NewEventRecorder()) + test.NewEventRecorder(), + SchedulerOptions{}) pods := makeDiversePods(podCount) @@ -130,7 +131,7 @@ func benchmarkScheduler(b *testing.B, instanceCount, podCount int) { podsScheduledInRound1 := 0 nodesInRound1 := 0 for i := 0; i < b.N; i++ { - nodes, err := scheduler.Solve(ctx, pods) + nodes, _, err := scheduler.Solve(ctx, pods) if err != nil { b.FailNow() } diff --git a/pkg/controllers/provisioning/scheduling/suite_test.go b/pkg/controllers/provisioning/scheduling/suite_test.go index 0224a51d3f8d..083e3114177f 100644 --- a/pkg/controllers/provisioning/scheduling/suite_test.go +++ b/pkg/controllers/provisioning/scheduling/suite_test.go @@ -17,6 +17,7 @@ package scheduling_test import ( "context" "fmt" + "k8s.io/apimachinery/pkg/util/clock" "math" "math/rand" "testing" @@ -51,6 +52,7 @@ var ctx context.Context var provisioner *v1alpha5.Provisioner var controller *provisioning.Controller var env *test.Environment +var fakeClock *clock.FakeClock var cloudProv *fake.CloudProvider var cluster *state.Cluster var nodeStateController *state.NodeController @@ -71,11 +73,14 @@ var _ = BeforeSuite(func() { instanceTypes, _ := cloudProv.GetInstanceTypes(ctx, nil) // set these on the cloud provider so we can manipulate them if needed cloudProv.InstanceTypes = instanceTypes - cluster = state.NewCluster(cfg, e.Client, cloudProv) + fakeClock = clock.NewFakeClock(time.Now()) + cluster = state.NewCluster(fakeClock, cfg, e.Client, cloudProv) nodeStateController = state.NewNodeController(e.Client, cluster) podStateController = state.NewPodController(e.Client, cluster) recorder = test.NewEventRecorder() - controller = provisioning.NewController(ctx, cfg, e.Client, corev1.NewForConfigOrDie(e.Config), recorder, cloudProv, cluster) + cfg = test.NewConfig() + prov := provisioning.NewProvisioner(ctx, cfg, e.Client, corev1.NewForConfigOrDie(e.Config), recorder, cloudProv, cluster) + controller = provisioning.NewController(e.Client, prov, recorder) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) @@ -4240,7 +4245,6 @@ func ExpectMaxSkew(ctx context.Context, c client.Client, namespace string, const for _, count := range skew { if count < minCount { minCount = count - } if count > maxCount { maxCount = count @@ -4248,28 +4252,3 @@ func ExpectMaxSkew(ctx context.Context, c client.Client, namespace string, const } return Expect(maxCount - minCount) } -func ExpectSkew(ctx context.Context, c client.Client, namespace string, constraint *v1.TopologySpreadConstraint) Assertion { - nodes := &v1.NodeList{} - Expect(c.List(ctx, nodes)).To(Succeed()) - pods := &v1.PodList{} - Expect(c.List(ctx, pods, scheduling.TopologyListOptions(namespace, constraint.LabelSelector))).To(Succeed()) - skew := map[string]int{} - for i, pod := range pods.Items { - if scheduling.IgnoredForTopology(&pods.Items[i]) { - continue - } - for _, node := range nodes.Items { - if pod.Spec.NodeName == node.Name { - switch constraint.TopologyKey { - case v1.LabelHostname: - skew[node.Name]++ // Check node name since hostname labels aren't applied - default: - if key, ok := node.Labels[constraint.TopologyKey]; ok { - skew[key]++ - } - } - } - } - } - return Expect(skew) -} diff --git a/pkg/controllers/provisioning/scheduling/topology.go b/pkg/controllers/provisioning/scheduling/topology.go index c93e4a6ba0b0..88329eb59c86 100644 --- a/pkg/controllers/provisioning/scheduling/topology.go +++ b/pkg/controllers/provisioning/scheduling/topology.go @@ -49,7 +49,10 @@ type Topology struct { inverseTopologies map[uint64]*TopologyGroup // The universe of domains by topology key domains map[string]utilsets.String - cluster *state.Cluster + // excludedPods are the pod UIDs of pods that are excluded from counting. This is used so we can simulate + // moving pods to prevent them from being double counted. + excludedPods sets.Set + cluster *state.Cluster } func NewTopology(ctx context.Context, kubeClient client.Client, cluster *state.Cluster, domains map[string]utilsets.String, pods []*v1.Pod) (*Topology, error) { @@ -59,7 +62,15 @@ func NewTopology(ctx context.Context, kubeClient client.Client, cluster *state.C domains: domains, topologies: map[uint64]*TopologyGroup{}, inverseTopologies: map[uint64]*TopologyGroup{}, + excludedPods: sets.NewSet(), } + + // these are the pods that we intend to schedule, so if they are currently in the cluster we shouldn't count them for + // topology purposes + for _, p := range pods { + t.excludedPods.Insert(string(p.UID)) + } + errs := t.updateInverseAffinities(ctx) for i := range pods { errs = multierr.Append(errs, t.Update(ctx, pods[i])) @@ -176,6 +187,10 @@ func (t *Topology) Register(topologyKey string, domain string) { func (t *Topology) updateInverseAffinities(ctx context.Context) error { var errs error t.cluster.ForPodsWithAntiAffinity(func(pod *v1.Pod, node *v1.Node) bool { + // don't count the pod we are excluding + if t.excludedPods.Has(string(pod.UID)) { + return true + } if err := t.updateInverseAntiAffinity(ctx, pod, node.Labels); err != nil { errs = multierr.Append(errs, fmt.Errorf("tracking existing pod anti-affinity, %w", err)) } @@ -232,6 +247,10 @@ func (t *Topology) countDomains(ctx context.Context, tg *TopologyGroup) error { if IgnoredForTopology(&pods[i]) { continue } + // pod is excluded for counting purposes + if t.excludedPods.Has(string(p.UID)) { + continue + } node := &v1.Node{} if err := t.kubeClient.Get(ctx, types.NamespacedName{Name: p.Spec.NodeName}, node); err != nil { return fmt.Errorf("getting node %s, %w", p.Spec.NodeName, err) diff --git a/pkg/controllers/provisioning/suite_test.go b/pkg/controllers/provisioning/suite_test.go index 808f71141039..f94782dd68bf 100644 --- a/pkg/controllers/provisioning/suite_test.go +++ b/pkg/controllers/provisioning/suite_test.go @@ -16,6 +16,7 @@ package provisioning_test import ( "context" + "k8s.io/apimachinery/pkg/util/clock" "testing" "time" @@ -42,6 +43,7 @@ import ( ) var ctx context.Context +var fakeClock *clock.FakeClock var controller *provisioning.Controller var env *test.Environment var recorder *test.EventRecorder @@ -59,13 +61,16 @@ var _ = BeforeSuite(func() { cloudProvider := &fake.CloudProvider{} recorder = test.NewEventRecorder() cfg = test.NewConfig() + recorder = test.NewEventRecorder() + fakeClock = clock.NewFakeClock(time.Now()) + cluster := state.NewCluster(fakeClock, cfg, e.Client, cloudProvider) + prov := provisioning.NewProvisioner(ctx, cfg, e.Client, corev1.NewForConfigOrDie(e.Config), recorder, cloudProvider, cluster) + controller = provisioning.NewController(e.Client, prov, recorder) instanceTypes, _ := cloudProvider.GetInstanceTypes(context.Background(), nil) instanceTypeMap = map[string]cloudprovider.InstanceType{} for _, it := range instanceTypes { instanceTypeMap[it.Name()] = it } - cluster := state.NewCluster(cfg, e.Client, cloudProvider) - controller = provisioning.NewController(ctx, cfg, e.Client, corev1.NewForConfigOrDie(e.Config), recorder, cloudProvider, cluster) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) diff --git a/pkg/controllers/state/cluster.go b/pkg/controllers/state/cluster.go index 1e9fbb305124..f011907a2cbe 100644 --- a/pkg/controllers/state/cluster.go +++ b/pkg/controllers/state/cluster.go @@ -19,11 +19,15 @@ import ( "fmt" "sort" "sync" + "sync/atomic" "time" + "k8s.io/apimachinery/pkg/util/clock" + + "go.uber.org/multierr" + "github.com/aws/aws-sdk-go/aws" "github.com/patrickmn/go-cache" - "go.uber.org/multierr" v1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -42,6 +46,7 @@ import ( type Cluster struct { kubeClient client.Client cloudProvider cloudprovider.CloudProvider + clock clock.Clock // Pod Specific Tracking antiAffinityPods sync.Map // mapping of pod namespaced name to *v1.Pod of pods that have required anti affinities @@ -52,9 +57,15 @@ type Cluster struct { mu sync.RWMutex nodes map[string]*Node // node name -> node bindings map[types.NamespacedName]string // pod namespaced named -> node name + + // consolidationState is a number indicating the state of the cluster with respect to consolidation. If this number + // hasn't changed, it indicates that the cluster hasn't changed in a state which would enable consolidation if + // it previously couldn't occur. + consolidationState int64 + lastNodeDeletionTime int64 } -func NewCluster(cfg config.Config, client client.Client, cp cloudprovider.CloudProvider) *Cluster { +func NewCluster(clk clock.Clock, cfg config.Config, client client.Client, cp cloudprovider.CloudProvider) *Cluster { // The nominationPeriod is how long we consider a node as 'likely to be used' after a pending pod was // nominated for it. This time can very depending on the batching window size + time spent scheduling // so we try to adjust based off the window size. @@ -64,6 +75,7 @@ func NewCluster(cfg config.Config, client client.Client, cp cloudprovider.CloudP } c := &Cluster{ + clock: clk, kubeClient: client, cloudProvider: cp, nominatedNodes: cache.New(nominationPeriod, 10*time.Second), @@ -288,6 +300,7 @@ func (c *Cluster) deleteNode(nodeName string) { c.mu.Lock() defer c.mu.Unlock() delete(c.nodes, nodeName) + c.recordConsolidationChange() } // updateNode is called for every node reconciliation @@ -301,13 +314,41 @@ func (c *Cluster) updateNode(ctx context.Context, node *v1.Node) error { return err } c.nodes[node.Name] = n + + if node.DeletionTimestamp != nil { + nodeDeletionTime := node.DeletionTimestamp.UnixMilli() + if nodeDeletionTime > atomic.LoadInt64(&c.lastNodeDeletionTime) { + atomic.StoreInt64(&c.lastNodeDeletionTime, nodeDeletionTime) + } + } return nil } +// ClusterConsolidationState returns a number representing the state of the cluster with respect to consolidation. If +// consolidation can't occur and this number hasn't changed, there is no point in re-attempting consolidation. This +// allows reducing overall CPU utilization by pausing consolidation when the cluster is in a static state. +func (c *Cluster) ClusterConsolidationState() int64 { + cs := atomic.LoadInt64(&c.consolidationState) + // If 5 minutes elapsed since the last time the consolidation state was changed, we change the state anyway. This + // ensures that at least once every 5 minutes we consider consolidating our cluster in case something else has + // changed (e.g. instance type availability) that we can't detect which would allow consolidation to occur. + if c.clock.Now().After(time.UnixMilli(cs).Add(5 * time.Minute)) { + c.recordConsolidationChange() + return atomic.LoadInt64(&c.consolidationState) + } + return cs +} + +// LastNodeDeletionTime returns the last time that at a node was marked for deletion. +func (c *Cluster) LastNodeDeletionTime() time.Time { + return time.UnixMilli(atomic.LoadInt64(&c.lastNodeDeletionTime)) +} + // deletePod is called when the pod has been deleted func (c *Cluster) deletePod(podKey types.NamespacedName) { c.antiAffinityPods.Delete(podKey) c.updateNodeUsageFromPodDeletion(podKey) + c.recordConsolidationChange() } func (c *Cluster) updateNodeUsageFromPodDeletion(podKey types.NamespacedName) { @@ -392,9 +433,12 @@ func (c *Cluster) updateNodeUsageFromPod(ctx context.Context, pod *v1.Pod) error delete(n.podRequests, podKey) delete(n.podLimits, podKey) } + } else { + // new pod binding has occurred + c.recordConsolidationChange() } - // we have noticed that the pod is bound to a node and didn't know about the binding before + // did we notice that the pod is bound to a node and didn't know about the node before? n, ok := c.nodes[pod.Spec.NodeName] if !ok { var node v1.Node @@ -450,3 +494,7 @@ func (c *Cluster) populateInstanceType(ctx context.Context, node *v1.Node, n *No } return fmt.Errorf("instance type '%s' not found", instanceTypeName) } + +func (c *Cluster) recordConsolidationChange() { + atomic.StoreInt64(&c.consolidationState, c.clock.Now().UnixMilli()) +} diff --git a/pkg/controllers/state/suite_test.go b/pkg/controllers/state/suite_test.go index 922722995451..58f2c27f95a9 100644 --- a/pkg/controllers/state/suite_test.go +++ b/pkg/controllers/state/suite_test.go @@ -17,8 +17,10 @@ package state_test import ( "context" "fmt" + "k8s.io/apimachinery/pkg/util/clock" "math/rand" "testing" + "time" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" @@ -43,6 +45,7 @@ import ( var ctx context.Context var cfg *test.Config var env *test.Environment +var fakeClock *clock.FakeClock var cluster *state.Cluster var nodeController *state.NodeController var podController *state.PodController @@ -67,7 +70,8 @@ var _ = AfterSuite(func() { var _ = BeforeEach(func() { cloudProvider = &fake.CloudProvider{InstanceTypes: fake.InstanceTypesAssorted()} - cluster = state.NewCluster(cfg, env.Client, cloudProvider) + fakeClock = clock.NewFakeClock(time.Now()) + cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) nodeController = state.NewNodeController(env.Client, cluster) podController = state.NewPodController(env.Client, cluster) provisioner = test.Provisioner(test.ProvisionerOptions{ObjectMeta: metav1.ObjectMeta{Name: "default"}}) diff --git a/pkg/controllers/termination/controller.go b/pkg/controllers/termination/controller.go index 35c917e5550a..b0fd225e6782 100644 --- a/pkg/controllers/termination/controller.go +++ b/pkg/controllers/termination/controller.go @@ -19,6 +19,8 @@ import ( "fmt" "time" + "k8s.io/apimachinery/pkg/util/clock" + "golang.org/x/time/rate" "knative.dev/pkg/logging" @@ -49,7 +51,7 @@ type Controller struct { } // NewController constructs a controller instance -func NewController(ctx context.Context, kubeClient client.Client, coreV1Client corev1.CoreV1Interface, recorder events.Recorder, cloudProvider cloudprovider.CloudProvider) *Controller { +func NewController(ctx context.Context, clk clock.Clock, kubeClient client.Client, coreV1Client corev1.CoreV1Interface, recorder events.Recorder, cloudProvider cloudprovider.CloudProvider) *Controller { return &Controller{ KubeClient: kubeClient, Terminator: &Terminator{ @@ -57,6 +59,7 @@ func NewController(ctx context.Context, kubeClient client.Client, coreV1Client c CoreV1Client: coreV1Client, CloudProvider: cloudProvider, EvictionQueue: NewEvictionQueue(ctx, coreV1Client, recorder), + Clock: clk, }, Recorder: recorder, } diff --git a/pkg/controllers/termination/eviction.go b/pkg/controllers/termination/eviction.go index f624ce61c0fe..e9cabbe78ee0 100644 --- a/pkg/controllers/termination/eviction.go +++ b/pkg/controllers/termination/eviction.go @@ -112,6 +112,5 @@ func (e *EvictionQueue) evict(ctx context.Context, nn types.NamespacedName) bool logging.FromContext(ctx).Error(err) return false } - logging.FromContext(ctx).Debug("Evicted pod") return true } diff --git a/pkg/controllers/termination/suite_test.go b/pkg/controllers/termination/suite_test.go index 7eabb549d8f8..3ef2782b243b 100644 --- a/pkg/controllers/termination/suite_test.go +++ b/pkg/controllers/termination/suite_test.go @@ -17,6 +17,7 @@ package termination_test import ( "context" "fmt" + "k8s.io/apimachinery/pkg/util/clock" "testing" "time" @@ -25,7 +26,6 @@ import ( "github.com/aws/karpenter/pkg/controllers/termination" "github.com/aws/karpenter/pkg/test" "github.com/aws/karpenter/pkg/utils/functional" - "github.com/aws/karpenter/pkg/utils/injectabletime" "sigs.k8s.io/controller-runtime/pkg/client" . "github.com/aws/karpenter/pkg/test/expectations" @@ -43,6 +43,7 @@ var ctx context.Context var controller *termination.Controller var evictionQueue *termination.EvictionQueue var env *test.Environment +var fakeClock *clock.FakeClock func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) @@ -51,6 +52,7 @@ func TestAPIs(t *testing.T) { } var _ = BeforeSuite(func() { + fakeClock = clock.NewFakeClock(time.Now()) env = test.NewEnvironment(ctx, func(e *test.Environment) { cloudProvider := &fake.CloudProvider{} coreV1Client := corev1.NewForConfigOrDie(e.Config) @@ -63,6 +65,7 @@ var _ = BeforeSuite(func() { CoreV1Client: coreV1Client, CloudProvider: cloudProvider, EvictionQueue: evictionQueue, + Clock: fakeClock, }, Recorder: recorder, } @@ -83,7 +86,7 @@ var _ = Describe("Termination", func() { AfterEach(func() { ExpectCleanedUp(ctx, env.Client) - injectabletime.Now = time.Now + fakeClock.SetTime(time.Now()) }) Context("Reconciliation", func() { @@ -223,7 +226,7 @@ var _ = Describe("Termination", func() { ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(node)) node = ExpectNodeExists(ctx, env.Client, node.Name) // Simulate stuck terminating - injectabletime.Now = func() time.Time { return time.Now().Add(1 * time.Minute) } + fakeClock.Step(1 * time.Minute) ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(node)) ExpectNotFound(ctx, env.Client, node) }) @@ -382,7 +385,7 @@ var _ = Describe("Termination", func() { ExpectEvicted(env.Client, pod) // After grace period, node should delete - injectabletime.Now = func() time.Time { return time.Now().Add(30 * time.Second) } + fakeClock.Step(31 * time.Second) ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(node)) ExpectNotFound(ctx, env.Client, node) }) diff --git a/pkg/controllers/termination/terminate.go b/pkg/controllers/termination/terminate.go index 8e707f98a37e..9e4825c069d1 100644 --- a/pkg/controllers/termination/terminate.go +++ b/pkg/controllers/termination/terminate.go @@ -19,6 +19,8 @@ import ( "errors" "fmt" + "k8s.io/apimachinery/pkg/util/clock" + v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" corev1 "k8s.io/client-go/kubernetes/typed/core/v1" @@ -29,8 +31,7 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider" "github.com/aws/karpenter/pkg/scheduling" "github.com/aws/karpenter/pkg/utils/functional" - "github.com/aws/karpenter/pkg/utils/injectabletime" - "github.com/aws/karpenter/pkg/utils/pod" + podutil "github.com/aws/karpenter/pkg/utils/pod" "github.com/aws/karpenter/pkg/utils/ptr" ) @@ -39,6 +40,7 @@ type Terminator struct { KubeClient client.Client CoreV1Client corev1.CoreV1Interface CloudProvider cloudprovider.CloudProvider + Clock clock.Clock } type NodeDrainErr error @@ -77,7 +79,7 @@ func (t *Terminator) drain(ctx context.Context, node *v1.Node) (bool, error) { // if a pod doesn't have owner references then we can't expect a controller to manage its lifecycle if len(pod.ObjectMeta.OwnerReferences) == 0 { return false, NodeDrainErr(fmt.Errorf("pod %s/%s does not have any owner references", pod.Namespace, pod.Name)) - } else if val := pod.Annotations[v1alpha5.DoNotEvictPodAnnotationKey]; val == "true" { + } else if podutil.HasDoNotEvict(pod) { return false, NodeDrainErr(fmt.Errorf("pod %s/%s has do-not-evict annotation", pod.Namespace, pod.Name)) } } @@ -118,11 +120,11 @@ func (t *Terminator) getPods(ctx context.Context, node *v1.Node) ([]*v1.Pod, err continue } // Ignore if kubelet is partitioned and pods are beyond graceful termination window - if IsStuckTerminating(ptr.Pod(p)) { + if t.isStuckTerminating(ptr.Pod(p)) { continue } // Ignore static mirror pods - if pod.IsOwnedByNode(ptr.Pod(p)) { + if podutil.IsOwnedByNode(ptr.Pod(p)) { continue } pods = append(pods, ptr.Pod(p)) @@ -152,9 +154,9 @@ func (t *Terminator) evict(pods []*v1.Pod) { } } -func IsStuckTerminating(pod *v1.Pod) bool { +func (t *Terminator) isStuckTerminating(pod *v1.Pod) bool { if pod.DeletionTimestamp == nil { return false } - return injectabletime.Now().After(pod.DeletionTimestamp.Time) + return t.Clock.Now().After(pod.DeletionTimestamp.Time) } diff --git a/pkg/events/dedupe.go b/pkg/events/dedupe.go index 312bade3c771..31aa1447e69a 100644 --- a/pkg/events/dedupe.go +++ b/pkg/events/dedupe.go @@ -34,29 +34,52 @@ type dedupe struct { cache *cache.Cache } +func (d *dedupe) WaitingOnReadinessForConsolidation(node *v1.Node) { + if !d.shouldCreateEvent(fmt.Sprintf("wait-node-consolidate-%s", node.UID)) { + return + } + d.rec.WaitingOnReadinessForConsolidation(node) +} + +func (d *dedupe) TerminatingNodeForConsolidation(node *v1.Node, reason string) { + if !d.shouldCreateEvent(fmt.Sprintf("terminate-node-consolidate-%s-%s", node.UID, reason)) { + return + } + d.rec.TerminatingNodeForConsolidation(node, reason) +} + +func (d *dedupe) LaunchingNodeForConsolidation(node *v1.Node, reason string) { + if !d.shouldCreateEvent(fmt.Sprintf("launch-node-consolidate-%s-%s", node.UID, reason)) { + return + } + d.rec.LaunchingNodeForConsolidation(node, reason) +} + func (d *dedupe) NominatePod(pod *v1.Pod, node *v1.Node) { - key := fmt.Sprintf("nominate-node-%s-%s", pod.Name, node.Name) - if _, exists := d.cache.Get(key); exists { + if !d.shouldCreateEvent(fmt.Sprintf("nominate-node-%s-%s", pod.UID, node.UID)) { return } - d.cache.SetDefault(key, nil) d.rec.NominatePod(pod, node) } func (d *dedupe) PodFailedToSchedule(pod *v1.Pod, err error) { - key := fmt.Sprintf("failed-to-schedule-%s-%s", pod.Name, err.Error()) - if _, exists := d.cache.Get(key); exists { + if !d.shouldCreateEvent(fmt.Sprintf("failed-to-schedule-%s-%s", pod.UID, err)) { return } - d.cache.SetDefault(key, nil) d.rec.PodFailedToSchedule(pod, err) } func (d *dedupe) NodeFailedToDrain(node *v1.Node, err error) { - key := fmt.Sprintf("failed-to-drain-%s", node.Name) - if _, exists := d.cache.Get(key); exists { + if !d.shouldCreateEvent(fmt.Sprintf("failed-to-drain-%s", node.Name)) { return } - d.cache.SetDefault(key, nil) d.rec.NodeFailedToDrain(node, err) } + +func (d *dedupe) shouldCreateEvent(key string) bool { + if _, exists := d.cache.Get(key); exists { + return false + } + d.cache.SetDefault(key, nil) + return true +} \ No newline at end of file diff --git a/pkg/events/recorder.go b/pkg/events/recorder.go index a84f1df53ae3..106d981726f4 100644 --- a/pkg/events/recorder.go +++ b/pkg/events/recorder.go @@ -29,6 +29,14 @@ type Recorder interface { PodFailedToSchedule(*v1.Pod, error) // NodeFailedToDrain is called when a pod causes a node draining to fail NodeFailedToDrain(*v1.Node, error) + // TerminatingNodeForConsolidation is called just before terminating the node due to consolidation with a user + // presentable string describing the consolidation operation + TerminatingNodeForConsolidation(node *v1.Node, reason string) + // LaunchingNodeForConsolidation is called with the new node that was just created due to a consolidation operation. + LaunchingNodeForConsolidation(v *v1.Node, reason string) + // WaitingOnReadinessForConsolidation is called when consolidation is waiting on a node to become ready prior to + // continuing consolidation + WaitingOnReadinessForConsolidation(v *v1.Node) } type recorder struct { @@ -39,6 +47,18 @@ func NewRecorder(rec record.EventRecorder) Recorder { return &recorder{rec: rec} } +func (r recorder) WaitingOnReadinessForConsolidation(node *v1.Node) { + r.rec.Eventf(node, "Normal", "ConsolidateWaiting", "Waiting on readiness to continue consolidation") +} + +func (r recorder) TerminatingNodeForConsolidation(node *v1.Node, reason string) { + r.rec.Eventf(node, "Normal", "ConsolidateTerminateNode", "Consolidating node via %s", reason) +} + +func (r recorder) LaunchingNodeForConsolidation(node *v1.Node, reason string) { + r.rec.Eventf(node, "Normal", "ConsolidateLaunchNode", "Launching node for %s", reason) +} + func (r recorder) NominatePod(pod *v1.Pod, node *v1.Node) { r.rec.Eventf(pod, "Normal", "NominatePod", "Pod should schedule on %s", node.Name) } diff --git a/pkg/test/eventrecorder.go b/pkg/test/eventrecorder.go index 39c1de71b6d6..8abc130b529e 100644 --- a/pkg/test/eventrecorder.go +++ b/pkg/test/eventrecorder.go @@ -36,6 +36,10 @@ func NewEventRecorder() *EventRecorder { return &EventRecorder{} } +func (e *EventRecorder) WaitingOnReadinessForConsolidation(v *v1.Node) {} +func (e *EventRecorder) TerminatingNodeForConsolidation(node *v1.Node, reason string) {} +func (e *EventRecorder) LaunchingNodeForConsolidation(node *v1.Node, reason string) {} + func (e *EventRecorder) NominatePod(pod *v1.Pod, node *v1.Node) { e.mu.Lock() defer e.mu.Unlock() diff --git a/pkg/test/expectations/expectations.go b/pkg/test/expectations/expectations.go index c7730ea425a6..aa8a4a08604d 100644 --- a/pkg/test/expectations/expectations.go +++ b/pkg/test/expectations/expectations.go @@ -21,13 +21,9 @@ import ( "sync" "time" - "github.com/aws/karpenter/pkg/test" - "github.com/onsi/ginkgo" . "github.com/onsi/gomega" //nolint:revive,stylecheck prometheus "github.com/prometheus/client_model/go" - "sigs.k8s.io/controller-runtime/pkg/metrics" - appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" "k8s.io/api/policy/v1beta1" @@ -36,10 +32,13 @@ import ( "k8s.io/apimachinery/pkg/types" "knative.dev/pkg/ptr" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/controller-runtime/pkg/reconcile" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/controllers/provisioning" + "github.com/aws/karpenter/pkg/controllers/provisioning/scheduling" + "github.com/aws/karpenter/pkg/test" ) const ( @@ -64,7 +63,7 @@ func ExpectNotFound(ctx context.Context, c client.Client, objects ...client.Obje Eventually(func() bool { return errors.IsNotFound(c.Get(ctx, types.NamespacedName{Name: object.GetName(), Namespace: object.GetNamespace()}, object)) }, ReconcilerPropagationTime, RequestInterval).Should(BeTrue(), func() string { - return fmt.Sprintf("expected %s to be deleted, but it still exists", object.GetSelfLink()) + return fmt.Sprintf("expected %s to be deleted, but it still exists", client.ObjectKeyFromObject(object)) }) } } @@ -216,3 +215,29 @@ func ExpectManualBinding(ctx context.Context, c client.Client, pod *v1.Pod, node }, })).To(Succeed()) } + +func ExpectSkew(ctx context.Context, c client.Client, namespace string, constraint *v1.TopologySpreadConstraint) Assertion { + nodes := &v1.NodeList{} + Expect(c.List(ctx, nodes)).To(Succeed()) + pods := &v1.PodList{} + Expect(c.List(ctx, pods, scheduling.TopologyListOptions(namespace, constraint.LabelSelector))).To(Succeed()) + skew := map[string]int{} + for i, pod := range pods.Items { + if scheduling.IgnoredForTopology(&pods.Items[i]) { + continue + } + for _, node := range nodes.Items { + if pod.Spec.NodeName == node.Name { + switch constraint.TopologyKey { + case v1.LabelHostname: + skew[node.Name]++ // Check node name since hostname labels aren't applied + default: + if key, ok := node.Labels[constraint.TopologyKey]; ok { + skew[key]++ + } + } + } + } + } + return Expect(skew) +} diff --git a/pkg/test/nodes.go b/pkg/test/nodes.go index fd168159c541..88463876ccdb 100644 --- a/pkg/test/nodes.go +++ b/pkg/test/nodes.go @@ -50,6 +50,7 @@ func Node(overrides ...NodeOptions) *v1.Node { }, Status: v1.NodeStatus{ Allocatable: options.Allocatable, + Capacity: options.Allocatable, Conditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: options.ReadyStatus, Reason: options.ReadyReason}}, }, } diff --git a/pkg/test/pods.go b/pkg/test/pods.go index 99b93571f441..440c35c1b920 100644 --- a/pkg/test/pods.go +++ b/pkg/test/pods.go @@ -19,7 +19,7 @@ import ( "github.com/imdario/mergo" v1 "k8s.io/api/core/v1" - "k8s.io/api/policy/v1beta1" + policyv1 "k8s.io/api/policy/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" ) @@ -45,6 +45,7 @@ type PodOptions struct { PersistentVolumeClaims []string Conditions []v1.PodCondition Phase v1.PodPhase + RestartPolicy v1.RestartPolicy } type PDBOptions struct { @@ -52,6 +53,7 @@ type PDBOptions struct { Labels map[string]string MinAvailable *intstr.IntOrString MaxUnavailable *intstr.IntOrString + Status *policyv1.PodDisruptionBudgetStatus } // Pod creates a test pod with defaults that can be overridden by PodOptions. @@ -92,6 +94,7 @@ func Pod(overrides ...PodOptions) *v1.Pod { NodeName: options.NodeName, Volumes: volumes, PriorityClassName: options.PriorityClassName, + RestartPolicy: options.RestartPolicy, }, Status: v1.PodStatus{ Conditions: options.Conditions, @@ -137,29 +140,34 @@ func UnschedulablePod(options ...PodOptions) *v1.Pod { } // PodDisruptionBudget creates a PodDisruptionBudget. To function properly, it should have its status applied -func PodDisruptionBudget(overrides ...PDBOptions) *v1beta1.PodDisruptionBudget { +func PodDisruptionBudget(overrides ...PDBOptions) *policyv1.PodDisruptionBudget { options := PDBOptions{} for _, opts := range overrides { if err := mergo.Merge(&options, opts, mergo.WithOverride); err != nil { panic(fmt.Sprintf("Failed to merge pdb options: %s", err)) } } - return &v1beta1.PodDisruptionBudget{ + status := policyv1.PodDisruptionBudgetStatus{ + // To be considered for application by eviction, the Status.ObservedGeneration must be >= the PDB generation. + // kube-controller-manager normally sets ObservedGeneration, but we don't have one when running under + // EnvTest. If this isn't modified the eviction controller assumes that the PDB hasn't been processed + // by the disruption controller yet and adds a 10 second retry to our evict() call + ObservedGeneration: 1, + } + if options.Status != nil { + status = *options.Status + } + + return &policyv1.PodDisruptionBudget{ ObjectMeta: ObjectMeta(options.ObjectMeta), - Spec: v1beta1.PodDisruptionBudgetSpec{ + Spec: policyv1.PodDisruptionBudgetSpec{ MinAvailable: options.MinAvailable, Selector: &metav1.LabelSelector{ MatchLabels: options.Labels, }, MaxUnavailable: options.MaxUnavailable, }, - Status: v1beta1.PodDisruptionBudgetStatus{ - // To be considered for application by eviction, the Status.ObservedGeneration must be >= the PDB generation. - // kube-controller-manager normally sets ObservedGeneration, but we don't have one when running under - // EnvTest. If this isn't modified the eviction controller assumes that the PDB hasn't been processed - // by the disruption controller yet and adds a 10 second retry to our evict() call - ObservedGeneration: 1, - }, + Status: status, } } diff --git a/pkg/test/provisioner.go b/pkg/test/provisioner.go index c427d5e09b9e..e7dce025bfb0 100644 --- a/pkg/test/provisioner.go +++ b/pkg/test/provisioner.go @@ -33,16 +33,18 @@ import ( // ProvisionerOptions customizes a Provisioner. type ProvisionerOptions struct { metav1.ObjectMeta - Limits v1.ResourceList - Provider interface{} - ProviderRef *v1alpha5.ProviderRef - Kubelet *v1alpha5.KubeletConfiguration - Labels map[string]string - Taints []v1.Taint - StartupTaints []v1.Taint - Requirements []v1.NodeSelectorRequirement - Status v1alpha5.ProvisionerStatus - TTLSecondsAfterEmpty *int64 + Limits v1.ResourceList + Provider interface{} + ProviderRef *v1alpha5.ProviderRef + Kubelet *v1alpha5.KubeletConfiguration + Labels map[string]string + Taints []v1.Taint + StartupTaints []v1.Taint + Requirements []v1.NodeSelectorRequirement + Status v1alpha5.ProvisionerStatus + TTLSecondsUntilExpired *int64 + TTLSecondsAfterEmpty *int64 + Consolidation *v1alpha5.Consolidation } // Provisioner creates a test provisioner with defaults that can be overridden by ProvisionerOptions. @@ -60,21 +62,26 @@ func Provisioner(overrides ...ProvisionerOptions) *v1alpha5.Provisioner { if options.Limits == nil { options.Limits = v1.ResourceList{v1.ResourceCPU: resource.MustParse("1000")} } - if options.TTLSecondsAfterEmpty == nil { + // mutually exclusive + if options.Consolidation != nil { + options.TTLSecondsAfterEmpty = nil + } else if options.TTLSecondsAfterEmpty == nil { options.TTLSecondsAfterEmpty = ptr.Int64(10) } provisioner := &v1alpha5.Provisioner{ ObjectMeta: ObjectMeta(options.ObjectMeta), Spec: v1alpha5.ProvisionerSpec{ - Requirements: options.Requirements, - KubeletConfiguration: options.Kubelet, - ProviderRef: options.ProviderRef, - Taints: options.Taints, - StartupTaints: options.StartupTaints, - Labels: options.Labels, - Limits: &v1alpha5.Limits{Resources: options.Limits}, - TTLSecondsAfterEmpty: options.TTLSecondsAfterEmpty, + Requirements: options.Requirements, + KubeletConfiguration: options.Kubelet, + ProviderRef: options.ProviderRef, + Taints: options.Taints, + StartupTaints: options.StartupTaints, + Labels: options.Labels, + Limits: &v1alpha5.Limits{Resources: options.Limits}, + TTLSecondsAfterEmpty: options.TTLSecondsAfterEmpty, + TTLSecondsUntilExpired: options.TTLSecondsUntilExpired, + Consolidation: options.Consolidation, }, Status: options.Status, } diff --git a/pkg/test/replicaset.go b/pkg/test/replicaset.go new file mode 100644 index 000000000000..06a9e96f4b50 --- /dev/null +++ b/pkg/test/replicaset.go @@ -0,0 +1,64 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package test + +import ( + "fmt" + "strings" + + "github.com/Pallinder/go-randomdata" + "github.com/imdario/mergo" + appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// ReplicaSetOptions customizes a ReplicaSet. +type ReplicaSetOptions struct { + metav1.ObjectMeta + Selector map[string]string + PodOptions PodOptions +} + +// ReplicaSet creates a test ReplicaSet with defaults that can be overridden by ReplicaSetOptions. +// Overrides are applied in order, with a last write wins semantic. +func ReplicaSet(overrides ...ReplicaSetOptions) *appsv1.ReplicaSet { + options := ReplicaSetOptions{} + for _, opts := range overrides { + if err := mergo.Merge(&options, opts, mergo.WithOverride); err != nil { + panic(fmt.Sprintf("Failed to merge pod options: %s", err)) + } + } + if options.Name == "" { + options.Name = strings.ToLower(randomdata.SillyName()) + } + if options.Namespace == "" { + options.Namespace = "default" + } + if options.Selector == nil { + options.Selector = map[string]string{"app": options.Name} + } + return &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{Name: options.Name, Namespace: options.Namespace}, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: options.Selector}, + Template: v1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{Labels: options.Selector}, + Spec: Pod(options.PodOptions).Spec, + }, + }, + } +} diff --git a/pkg/utils/injectabletime/time.go b/pkg/utils/injectabletime/time.go deleted file mode 100644 index 1a804f55bfa0..000000000000 --- a/pkg/utils/injectabletime/time.go +++ /dev/null @@ -1,20 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package injectabletime - -import "time" - -// Now is a time.Now() that may be mocked by tests. -var Now = time.Now diff --git a/pkg/utils/pod/scheduling.go b/pkg/utils/pod/scheduling.go index 5d430307f7e7..e11b5dc7ad66 100644 --- a/pkg/utils/pod/scheduling.go +++ b/pkg/utils/pod/scheduling.go @@ -17,6 +17,8 @@ package pod import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime/schema" + + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" ) func IsProvisionable(pod *v1.Pod) bool { @@ -65,6 +67,10 @@ func IsOwnedByNode(pod *v1.Pod) bool { }) } +func IsNotOwned(pod *v1.Pod) bool { + return len(pod.ObjectMeta.OwnerReferences) == 0 +} + func IsOwnedBy(pod *v1.Pod, gvks []schema.GroupVersionKind) bool { for _, ignoredOwner := range gvks { for _, owner := range pod.ObjectMeta.OwnerReferences { @@ -76,6 +82,13 @@ func IsOwnedBy(pod *v1.Pod, gvks []schema.GroupVersionKind) bool { return false } +func HasDoNotEvict(pod *v1.Pod) bool { + if pod.Annotations == nil { + return false + } + return pod.Annotations[v1alpha5.DoNotEvictPodAnnotationKey] == "true" +} + // HasRequiredPodAntiAffinity returns true if a non-empty PodAntiAffinity/RequiredDuringSchedulingIgnoredDuringExecution // is defined in the pod spec func HasRequiredPodAntiAffinity(pod *v1.Pod) bool { diff --git a/test/pkg/environment/expectations.go b/test/pkg/environment/expectations.go index ee4d07408413..c7924359dd15 100644 --- a/test/pkg/environment/expectations.go +++ b/test/pkg/environment/expectations.go @@ -3,11 +3,12 @@ package environment import ( "fmt" "sync" + "time" "github.com/samber/lo" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" - "k8s.io/api/policy/v1beta1" + policyv1 "k8s.io/api/policy/v1" storagev1 "k8s.io/api/storage/v1" "k8s.io/apimachinery/pkg/labels" "knative.dev/pkg/ptr" @@ -28,7 +29,7 @@ var ( &v1.Pod{}, &appsv1.Deployment{}, &appsv1.DaemonSet{}, - &v1beta1.PodDisruptionBudget{}, + &policyv1.PodDisruptionBudget{}, &v1.PersistentVolumeClaim{}, &v1.PersistentVolume{}, &storagev1.StorageClass{}, @@ -67,6 +68,15 @@ func (env *Environment) ExpectCreated(objects ...client.Object) { } } +func (env *Environment) ExpectUpdate(objects ...client.Object) { + for _, o := range objects { + current := o.DeepCopyObject().(client.Object) + Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(current), current)).To(Succeed()) + o.SetResourceVersion(current.GetResourceVersion()) + Expect(env.Client.Update(env.Context, o)).To(Succeed()) + } +} + func (env *Environment) ExpectDeleted(objects ...client.Object) { for _, object := range objects { Expect(env.Client.Delete(env, object, &client.DeleteOptions{GracePeriodSeconds: ptr.Int64(0)})).To(Succeed()) @@ -129,3 +139,15 @@ func (env *Environment) ExpectNoCrashes() { fmt.Sprintf("expected restart count of %s = 0, had %d", name, restartCount)) } } + +func (env *Environment) EventuallyExpectMinUtilization(resource v1.ResourceName, comparator string, value float64) { + Eventually(func(g Gomega) { + g.Expect(env.Monitor.MinUtilization(resource)).To(BeNumerically(comparator, value)) + }).Should(Succeed()) +} + +func (env *Environment) EventuallyExpectAvgUtilization(resource v1.ResourceName, comparator string, value float64) { + Eventually(func(g Gomega) { + g.Expect(env.Monitor.AvgUtilization(resource)).To(BeNumerically(comparator, value)) + }, 10*time.Minute).Should(Succeed()) +} diff --git a/test/pkg/environment/monitor.go b/test/pkg/environment/monitor.go index bebdd26b35c6..c49e5c2e9847 100644 --- a/test/pkg/environment/monitor.go +++ b/test/pkg/environment/monitor.go @@ -3,15 +3,16 @@ package environment import ( "context" "fmt" + "math" "sync" - "time" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/sets" - - v1 "k8s.io/api/core/v1" "knative.dev/pkg/logging" "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/aws/karpenter/pkg/utils/resources" ) // Monitor is used to monitor the cluster state during a running test @@ -20,13 +21,14 @@ type Monitor struct { kubeClient client.Client mu sync.RWMutex - recordings []recording nodesSeen sets.String numberNodesAtReset int } -type recording struct { - nodes v1.NodeList - pods v1.PodList +type state struct { + pods v1.PodList + nodes map[string]*v1.Node // node name -> node + nodePods map[string][]*v1.Pod // node name -> pods bound to the node + nodeRequests map[string]v1.ResourceList // node name -> sum of pod resource requests } func NewClusterMonitor(ctx context.Context, kubeClient client.Client) *Monitor { @@ -36,24 +38,12 @@ func NewClusterMonitor(ctx context.Context, kubeClient client.Client) *Monitor { nodesSeen: sets.NewString(), } m.Reset() - - go func() { - for { - select { - case <-ctx.Done(): - return - case <-time.After(1 * time.Second): - m.poll() - } - } - }() return m } // Reset resets the cluster monitor prior to running a test. func (m *Monitor) Reset() { m.mu.Lock() - m.recordings = nil m.nodesSeen = map[string]sets.Empty{} m.mu.Unlock() m.poll() @@ -63,13 +53,12 @@ func (m *Monitor) Reset() { // RestartCount returns the containers and number of restarts for that container for all containers in the pods in the // given namespace func (m *Monitor) RestartCount() map[string]int { - m.poll() + st := m.poll() m.mu.RLock() defer m.mu.RUnlock() restarts := map[string]int{} - last := m.recordings[len(m.recordings)-1] - for _, pod := range last.pods.Items { + for _, pod := range st.pods.Items { if pod.Namespace != "karpenter" { continue } @@ -83,11 +72,7 @@ func (m *Monitor) RestartCount() map[string]int { // NodeCount returns the current number of nodes func (m *Monitor) NodeCount() int { - m.poll() - m.mu.RLock() - defer m.mu.RUnlock() - last := m.recordings[len(m.recordings)-1] - return len(last.nodes.Items) + return len(m.poll().nodes) } // NodeCountAtReset returns the number of nodes that were running when the monitor was last reset, typically at the @@ -113,12 +98,8 @@ func (m *Monitor) CreatedNodes() int { // RunningPods returns the number of running pods matching the given selector func (m *Monitor) RunningPods(selector labels.Selector) int { - m.poll() - m.mu.RLock() - defer m.mu.RUnlock() - last := m.recordings[len(m.recordings)-1] count := 0 - for _, pod := range last.pods.Items { + for _, pod := range m.poll().pods.Items { if pod.Status.Phase != v1.PodRunning { continue } @@ -129,7 +110,7 @@ func (m *Monitor) RunningPods(selector labels.Selector) int { return count } -func (m *Monitor) poll() { +func (m *Monitor) poll() state { var nodes v1.NodeList if err := m.kubeClient.List(m.ctx, &nodes); err != nil { logging.FromContext(m.ctx).Errorf("listing nodes, %s", err) @@ -138,18 +119,65 @@ func (m *Monitor) poll() { if err := m.kubeClient.List(m.ctx, &pods); err != nil { logging.FromContext(m.ctx).Errorf("listing pods, %s", err) } - m.record(nodes, pods) -} -func (m *Monitor) record(nodes v1.NodeList, pods v1.PodList) { m.mu.Lock() - defer m.mu.Unlock() - m.recordings = append(m.recordings, recording{ - nodes: nodes, - pods: pods, - }) - for _, node := range nodes.Items { m.nodesSeen.Insert(node.Name) } + m.mu.Unlock() + + st := state{ + nodes: map[string]*v1.Node{}, + pods: pods, + nodePods: map[string][]*v1.Pod{}, + nodeRequests: map[string]v1.ResourceList{}, + } + for i := range nodes.Items { + st.nodes[nodes.Items[i].Name] = &nodes.Items[i] + } + + // collect pods per node + for i := range pods.Items { + pod := &pods.Items[i] + if pod.Spec.NodeName == "" { + continue + } + st.nodePods[pod.Spec.NodeName] = append(st.nodePods[pod.Spec.NodeName], pod) + } + + for _, n := range nodes.Items { + st.nodeRequests[n.Name] = resources.RequestsForPods(st.nodePods[n.Name]...) + } + return st +} + +func (m *Monitor) AvgUtilization(resource v1.ResourceName) float64 { + utilization := m.nodeUtilization(resource) + sum := 0.0 + for _, v := range utilization { + sum += v + } + return sum / float64(len(utilization)) +} + +func (m *Monitor) MinUtilization(resource v1.ResourceName) float64 { + min := math.MaxFloat64 + for _, v := range m.nodeUtilization(resource) { + min = math.Min(v, min) + } + return min +} + +func (m *Monitor) nodeUtilization(resource v1.ResourceName) []float64 { + st := m.poll() + var utilization []float64 + for nodeName, requests := range st.nodeRequests { + allocatable := st.nodes[nodeName].Status.Allocatable[resource] + if allocatable.IsZero() { + continue + } + requested := requests[resource] + utilization = append(utilization, requested.AsApproximateFloat64()/allocatable.AsApproximateFloat64()) + } + return utilization } diff --git a/test/suites/consolidation/suite_test.go b/test/suites/consolidation/suite_test.go new file mode 100644 index 000000000000..cb450222cbe8 --- /dev/null +++ b/test/suites/consolidation/suite_test.go @@ -0,0 +1,225 @@ +package consolidation + +import ( + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" + "github.com/aws/karpenter/pkg/test" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "strings" + "testing" + + "github.com/aws/karpenter/test/pkg/environment" + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" +) + +var env *environment.Environment + +func TestConsolidation(t *testing.T) { + RegisterFailHandler(Fail) + BeforeSuite(func() { + var err error + env, err = environment.NewEnvironment(t) + Expect(err).ToNot(HaveOccurred()) + }) + RunSpecs(t, "Consolidation") +} + +var _ = BeforeEach(func() { + env.BeforeEach() +}) + +var _ = AfterEach(func() { + env.AfterEach() +}) + +var _ = Describe("Consolidation", func() { + It("should consolidate nodes (delete)", func() { + provider := test.AWSNodeTemplate(test.AWSNodeTemplateOptions{AWS: v1alpha1.AWS{ + SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.Options.EnvironmentName}, + SubnetSelector: map[string]string{"karpenter.sh/discovery": env.Options.EnvironmentName}, + }}) + provisioner := test.Provisioner(test.ProvisionerOptions{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: v1alpha5.LabelCapacityType, + Operator: v1.NodeSelectorOpIn, + // we don't shrink spot nodes, so this forces us to only delete nodes + Values: []string{"spot"}, + }, + { + Key: v1alpha1.LabelInstanceSize, + Operator: v1.NodeSelectorOpIn, + Values: []string{"medium", "large", "xlarge"}, + }, + { + Key: v1alpha1.LabelInstanceFamily, + Operator: v1.NodeSelectorOpNotIn, + // remove some cheap burstable and the odd c1 instance types so we have + // more control over what gets provisioned + Values: []string{"t2", "t3", "c1", "t3a", "t4g"}, + }, + }, + // prevent emptiness from deleting the nodes + TTLSecondsAfterEmpty: aws.Int64(99999), + ProviderRef: &v1alpha5.ProviderRef{Name: provider.Name}, + }) + + var numPods int32 = 100 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: numPods, + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "large-app"}, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1")}, + }, + }, + }) + + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + env.ExpectCreatedNodeCount("==", 0) + env.ExpectCreated(provisioner, provider, dep) + + env.EventuallyExpectHealthyPodCount(selector, int(numPods)) + + // reduce the number of pods by 60% + dep.Spec.Replicas = aws.Int32(40) + env.ExpectUpdate(dep) + env.EventuallyExpectAvgUtilization(v1.ResourceCPU, "<", 0.5) + + provisioner.Spec.TTLSecondsAfterEmpty = nil + provisioner.Spec.Consolidation = &v1alpha5.Consolidation{ + Enabled: aws.Bool(true), + } + env.ExpectUpdate(provisioner) + + // With consolidation enabled, we now must delete nodes + env.EventuallyExpectAvgUtilization(v1.ResourceCPU, ">", 0.6) + + env.ExpectDeleted(dep) + env.EventuallyExpectScaleDown() + env.ExpectNoCrashes() + }) + It("should consolidate nodes (shrink)", func() { + provider := test.AWSNodeTemplate(test.AWSNodeTemplateOptions{AWS: v1alpha1.AWS{ + SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.Options.EnvironmentName}, + SubnetSelector: map[string]string{"karpenter.sh/discovery": env.Options.EnvironmentName}, + }}) + provisioner := test.Provisioner(test.ProvisionerOptions{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: v1alpha5.LabelCapacityType, + Operator: v1.NodeSelectorOpIn, + Values: []string{"on-demand"}, + }, + { + Key: v1alpha1.LabelInstanceSize, + Operator: v1.NodeSelectorOpIn, + Values: []string{"medium", "large", "xlarge"}, + }, + { + Key: v1alpha1.LabelInstanceFamily, + Operator: v1.NodeSelectorOpNotIn, + // remove some cheap burstable and the odd c1 instance types so we have + // more control over what gets provisioned + Values: []string{"t2", "t3", "c1", "t3a", "t4g"}, + }, + }, + ProviderRef: &v1alpha5.ProviderRef{Name: provider.Name}, + }) + + var numPods int32 = 5 + largeDep := test.Deployment(test.DeploymentOptions{ + Replicas: numPods, + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "large-app"}, + }, + TopologySpreadConstraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: v1.LabelHostname, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "large-app", + }, + }, + }, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1")}, + }, + }, + }) + smallDep := test.Deployment(test.DeploymentOptions{ + Replicas: numPods, + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "small-app"}, + }, + TopologySpreadConstraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: v1.LabelHostname, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "small-app", + }, + }, + }, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("500m")}, + }, + }, + }) + + selector := labels.SelectorFromSet(largeDep.Spec.Selector.MatchLabels) + env.ExpectCreatedNodeCount("==", 0) + env.ExpectCreated(provisioner, provider, largeDep, smallDep) + + env.EventuallyExpectHealthyPodCount(selector, int(numPods)) + + // 5 nodes due to the anti-affinity rules + env.ExpectCreatedNodeCount("==", 5) + + // scaling down the large deployment leaves only small pods on each node + largeDep.Spec.Replicas = aws.Int32(0) + env.ExpectUpdate(largeDep) + env.EventuallyExpectAvgUtilization(v1.ResourceCPU, "<", 0.5) + + provisioner.Spec.TTLSecondsAfterEmpty = nil + provisioner.Spec.Consolidation = &v1alpha5.Consolidation{ + Enabled: aws.Bool(true), + } + env.ExpectUpdate(provisioner) + + // With consolidation enabled, we now must shrink each node in turn to consolidate due to the anti-affinity + // rules on the smaller deployment. The large nodes should go to a medium + env.EventuallyExpectAvgUtilization(v1.ResourceCPU, ">", 0.6) + + var nodes v1.NodeList + Expect(env.Client.List(env.Context, &nodes)).To(Succeed()) + numMediumNodes := 0 + for _, n := range nodes.Items { + if strings.HasSuffix(n.Labels[v1.LabelInstanceTypeStable], ".medium") { + numMediumNodes++ + } + } + + // all of the nodes should have been replaced with medium instance types + Expect(numMediumNodes).To(Equal(5)) + + env.ExpectDeleted(largeDep, smallDep) + env.EventuallyExpectScaleDown() + env.ExpectNoCrashes() + }) +}) diff --git a/website/content/en/preview/concepts/_index.md b/website/content/en/preview/concepts/_index.md index e482f1a06468..446b7ffa6c5a 100644 --- a/website/content/en/preview/concepts/_index.md +++ b/website/content/en/preview/concepts/_index.md @@ -68,6 +68,7 @@ Karpenter handles all clean-up work needed to properly delete the node. * **Node Expiry**: If a node expiry time-to-live value (`ttlSecondsUntilExpired`) is reached, that node is drained of pods and deleted (even if it is still running workloads). * **Empty nodes**: When the last workload pod running on a Karpenter-managed node is gone, the node is annotated with an emptiness timestamp. Once that "node empty" time-to-live (`ttlSecondsAfterEmpty`) is reached, finalization is triggered. +* **Consolidation**: If enabled, Karpenter will work to actively reduce cluster cost by identifying when nodes can be removed as their workloads will run on other nodes in the cluster and when nodes can be replaced with cheaper variants due to a change in the workloads. For more details on how Karpenter deletes nodes, see [Deprovisioning nodes](../tasks/deprovisioning) for details. diff --git a/website/content/en/preview/provisioner.md b/website/content/en/preview/provisioner.md index c41f557ccb73..8a037626c536 100644 --- a/website/content/en/preview/provisioner.md +++ b/website/content/en/preview/provisioner.md @@ -15,12 +15,17 @@ kind: Provisioner metadata: name: default spec: + # Enables consolidation which attempts to reduce cluster cost by both removing un-needed nodes and down-sizing those + # that can't be removed. Mutually exclusive with the ttlSecondsAfterEmpty parameter. + consolidation: + enabled: true + # If omitted, the feature is disabled and nodes will never expire. If set to less time than it requires for a node # to become ready, the node may expire before any pods successfully start. ttlSecondsUntilExpired: 2592000 # 30 Days = 60 * 60 * 24 * 30 Seconds; # If omitted, the feature is disabled, nodes will never scale down due to low utilization - ttlSecondsAfterEmpty: 30 + # ttlSecondsAfterEmpty: 30 # Provisioned nodes will have these taints # Taints may prevent pods from scheduling if they are not tolerated by the pod. diff --git a/website/content/en/preview/tasks/deprovisioning.md b/website/content/en/preview/tasks/deprovisioning.md index b2e8d13f8ce4..0e09f10b6b13 100644 --- a/website/content/en/preview/tasks/deprovisioning.md +++ b/website/content/en/preview/tasks/deprovisioning.md @@ -22,16 +22,17 @@ There are both automated and manual ways of deprovisioning nodes provisioned by * **Provisioner Deletion**: Nodes are considered to be "owned" by the Provisioner that launched them. Karpenter will gracefully terminate nodes when a provisioner is deleted. Nodes may be reparented to another provisioner by modifying their labels. For example: `kubectl label node -l karpenter.sh/provisioner-name=source-provisioner-name karpenter.sh/provisioner-name=destination-provisioner-name --overwrite`. * **Node empty**: Karpenter notes when the last workload (non-daemonset) pod stops running on a node. From that point, Karpenter waits the number of seconds set by `ttlSecondsAfterEmpty` in the provisioner, then Karpenter requests to delete the node. This feature can keep costs down by removing nodes that are no longer being used for workloads. * **Node expired**: Karpenter requests to delete the node after a set number of seconds, based on the provisioner `ttlSecondsUntilExpired` value, from the time the node was provisioned. One use case for node expiry is to handle node upgrades. Old nodes (with a potentially outdated Kubernetes version or operating system) are deleted, and replaced with nodes on the current version (assuming that you requested the latest version, rather than a specific version). +* **Consolidation**: Karpenter works to actively reduce cluster cost by identifying when nodes can be removed as their workloads will run on other nodes in the cluster and when nodes can be replaced with cheaper variants due to a change in the workloads. - {{% alert title="Note" color="primary" %}} - - Automated deprovisioning is configured through the ProvisionerSpec .ttlSecondsAfterEmpty - and .ttlSecondsUntilExpired fields. If either field is left empty, Karpenter will not - default a value and will not terminate nodes in that condition. +{{% alert title="Note" color="primary" %}} +- Automated deprovisioning is configured through the ProvisionerSpec `.ttlSecondsAfterEmpty` +, `.ttlSecondsUntilExpired` and `.consolidation.enabled` fields. If these are not configured, Karpenter will not +default values for them and will not terminate nodes for that purpose. - - Keep in mind that a small NodeExpiry results in a higher churn in cluster activity. So, for - example, if a cluster brings up all nodes at once, all the pods on those nodes would fall into - the same batching window on expiration. - {{% /alert %}} +- Keep in mind that a small NodeExpiry results in a higher churn in cluster activity. So, for +example, if a cluster brings up all nodes at once, all the pods on those nodes would fall into +the same batching window on expiration. +{{% /alert %}} * **Node deleted**: You could use `kubectl` to manually remove a single Karpenter node: @@ -57,6 +58,20 @@ The kubelet isn’t watching for its own existence, so if a node is deleted the All the pod objects get deleted by a garbage collection process later, because the pods’ node is gone. {{% /alert %}} +## Consolidation + + +Karpenter has two mechanisms for cluster consolidation: +- Deletion - A node is eligible for deletion if all of its pods can run on free capacity of other nodes in the cluster. +- Shrink - A node can be replaced if all of its pods can run on a combination of free capacity of other nodes in the cluster and a single cheaper replacement node. + +When there are multiple nodes that could be potentially deleted or replaced, Karpenter choose to consolidate the node that overall disrupts your workloads the least by preferring to terminate: + +* nodes running fewer pods +* nodes that will expire soon +* nodes with lower priority pods + + ## What can cause deprovisioning to fail? There are a few cases where requesting to deprovision a Karpenter node will fail. These include Pod Disruption Budgets and pods that have the `do-not-evict` annotation set. @@ -86,9 +101,17 @@ Review what [disruptions are](https://kubernetes.io/docs/concepts/workloads/pods ### Pod set to do-not-evict -If a pod exists with the annotation `karpenter.sh/do-not-evict: true` on a node, and a request is made to delete the node, Karpenter will not drain any pods from that node or otherwise try to delete the node. This annotation will have no effect for static pods, pods that tolerate `NoSchedule`, or pods terminating past their graceful termination period. +If a pod exists with the annotation `karpenter.sh/do-not-evict: true` on a node, and a request is made to delete the node, Karpenter will not drain any pods from that node or otherwise try to delete the node. Nodes that have pods with a `do-not-evict` annotation are not considered for consolidation, though their unused capacity is considered for the purposes of running pods from other nodes which can ber consolidated. This annotation will have no effect for static pods, pods that tolerate `NoSchedule`, or pods terminating past their graceful termination period. This is useful for pods that you want to run from start to finish without interruption. Examples might include a real-time, interactive game that you don't want to interrupt or a long batch job (such as you might have with machine learning) that would need to start over if it were interrupted. If you want to terminate a node with a `do-not-evict` pod, you can simply remove the annotation and the deprovisioning process will continue. + +### Scheduling Constraints (Consolidation Only) + +Consolidation will be unable to consolidate a node if, as a result of its scheduling simulation, it determines that the pods on a node cannot run on other nodes due to inter-pod affinity/anti-affinity, topology spread constraints, or some other scheduling restriction that couldn't be fulfilled. + +### Controllerless Pods (Consolidation Only) + +Consolidation will not attempt to consolidate a node that is running pods that are not owned by a controller (e.g. a `ReplicAset`). In general we cannot assume that these pods would be recreated if they were evicted from the node that they are currently running on. \ No newline at end of file diff --git a/website/content/en/preview/tasks/metrics.md b/website/content/en/preview/tasks/metrics.md index c8459b543ca5..1fb957822aa3 100644 --- a/website/content/en/preview/tasks/metrics.md +++ b/website/content/en/preview/tasks/metrics.md @@ -8,6 +8,23 @@ description: > --- Karpenter makes several metrics available in Prometheus format to allow monitoring cluster provisioning status. These metrics are available by default at `karpenter.karpenter.svc.cluster.local:8080/metrics` configurable via the `METRICS_PORT` environment variable documented [here](../configuration) +## Consolidation Metrics + +### `karpenter_consolidation_actions_performed` +Number of consolidation actions performed. Labeled by action. + +### `karpenter_consolidation_evaluation_duration_seconds` +Duration of the consolidation evaluation process in seconds. + +### `karpenter_consolidation_nodes_created` +Number of nodes created in total by consolidation. + +### `karpenter_consolidation_nodes_terminated` +Number of nodes terminated in total by consolidation. + +### `karpenter_consolidation_replacement_node_initialized_seconds` +Amount of time required for a replacement node to become initialized. + ## Provisioner Metrics ### `karpenter_provisioner_limit`