From b8ec486e1f45e2f8dca8d8c208f1968c3b23de6f Mon Sep 17 00:00:00 2001 From: Andrew McDermott Date: Fri, 23 Aug 2019 10:36:25 +0100 Subject: [PATCH] UPSTREAM: : openshift: add custom nodeset comparator Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1731011 In OpenShift clusters we see very small memory differences between instances, typically 8Ki. The default comparator which is used when --balance-similar-node-groups is enabled only considers nodes in a nodegroup to be equal when the memory capacity is identical. This new comparator is identical to the default but will tolerate a 128Ki difference in memory capacity. With this change, and with --balance-similar-node-groups enabled, I now see the following: ```console $ oc get machinesets NAME DESIRED CURRENT READY AVAILABLE AGE amcdermo-w6m6z-worker-us-east-2a 3 3 3 3 24h amcdermo-w6m6z-worker-us-east-2b 4 4 4 4 24h amcdermo-w6m6z-worker-us-east-2c 4 4 4 4 24h e2e-59443-w-0 3 3 1 1 20m e2e-59443-w-1 3 3 1 1 20m e2e-59443-w-2 3 3 1 1 20m e2e-59443-w-3 3 3 1 1 20m e2e-59443-w-4 3 3 1 1 20m e2e-59443-w-5 3 3 1 1 20m e2e-59443-w-6 3 3 1 1 20m e2e-59443-w-7 3 3 1 1 20m e2e-59443-w-8 3 3 1 1 20m ``` --- cluster-autoscaler/main.go | 9 ++ .../openshiftmachineapi_compare_nodegroups.go | 90 +++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 cluster-autoscaler/processors/nodegroupset/openshiftmachineapi_compare_nodegroups.go diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index 49df4b5bd7c1..440010b20f3c 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -31,6 +31,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" cloudBuilder "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/builder" + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/openshiftmachineapi" "k8s.io/autoscaler/cluster-autoscaler/config" "k8s.io/autoscaler/cluster-autoscaler/core" "k8s.io/autoscaler/cluster-autoscaler/estimator" @@ -281,6 +282,14 @@ func buildAutoscaler() (core.Autoscaler, error) { processors.NodeGroupSetProcessor = &nodegroupset.BalancingNodeGroupSetProcessor{ Comparator: nodegroupset.IsGkeNodeInfoSimilar} + } + if autoscalingOptions.CloudProviderName == openshiftmachineapi.ProviderName { + // We do this because of: + // - https://bugzilla.redhat.com/show_bug.cgi?id=1731011 + // - https://bugzilla.redhat.com/show_bug.cgi?id=1733235 + processors.NodeGroupSetProcessor = &nodegroupset.BalancingNodeGroupSetProcessor{ + Comparator: nodegroupset.IsOpenShiftMachineApiNodeInfoSimilar} + } opts := core.AutoscalerOptions{ AutoscalingOptions: autoscalingOptions, diff --git a/cluster-autoscaler/processors/nodegroupset/openshiftmachineapi_compare_nodegroups.go b/cluster-autoscaler/processors/nodegroupset/openshiftmachineapi_compare_nodegroups.go new file mode 100644 index 000000000000..73424943f831 --- /dev/null +++ b/cluster-autoscaler/processors/nodegroupset/openshiftmachineapi_compare_nodegroups.go @@ -0,0 +1,90 @@ +package nodegroupset + +import ( + apiv1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo" +) + +// maxMemoryDifferenceInKiloBytes describes how much +// Node.Status.Capacity can differ but still be considered equal. +var maxMemoryDifferenceInKiloBytes = resource.MustParse("128Ki") + +// IsOpenShiftMachineApiNodeInfoSimilar compares if two nodes should +// be considered part of the same NodeGroupSet. +func IsOpenShiftMachineApiNodeInfoSimilar(n1, n2 *schedulernodeinfo.NodeInfo) bool { + return isOpenShiftNodeInfoSimilar(n1, n2) +} + +// Note: this is a copy of isNodeInfoSimilar() and the only change is +// to tolerate a small memory capacity difference. +func isOpenShiftNodeInfoSimilar(n1, n2 *schedulernodeinfo.NodeInfo) bool { + capacity := make(map[apiv1.ResourceName][]resource.Quantity) + allocatable := make(map[apiv1.ResourceName][]resource.Quantity) + free := make(map[apiv1.ResourceName][]resource.Quantity) + nodes := []*schedulernodeinfo.NodeInfo{n1, n2} + for _, node := range nodes { + for res, quantity := range node.Node().Status.Capacity { + capacity[res] = append(capacity[res], quantity) + } + for res, quantity := range node.Node().Status.Allocatable { + allocatable[res] = append(allocatable[res], quantity) + } + requested := node.RequestedResource() + for res, quantity := range (&requested).ResourceList() { + freeRes := node.Node().Status.Allocatable[res].DeepCopy() + freeRes.Sub(quantity) + free[res] = append(free[res], freeRes) + } + } + + // For capacity we allow quantities to be within a few Kb + // because we find that some cloud instances are slightly + // smaller/larger than other, typically 8-16Ki. + // See: + // https://bugzilla.redhat.com/show_bug.cgi?id=1731011 + // https://bugzilla.redhat.com/show_bug.cgi?id=1733235 + for _, qtyList := range capacity { + if len(qtyList) != 2 || !compareResourceEqualWithTolerance(qtyList[0], qtyList[1], maxMemoryDifferenceInKiloBytes) { + return false + } + } + // For allocatable and free we allow resource quantities to be within a few % of each other + if !compareResourceMapsWithTolerance(allocatable, MaxAllocatableDifferenceRatio) { + return false + } + if !compareResourceMapsWithTolerance(free, MaxFreeDifferenceRatio) { + return false + } + + ignoredLabels := map[string]bool{ + apiv1.LabelHostname: true, + apiv1.LabelZoneFailureDomain: true, + apiv1.LabelZoneRegion: true, + "beta.kubernetes.io/fluentd-ds-ready": true, // this is internal label used for determining if fluentd should be installed as deamon set. Used for migration 1.8 to 1.9. + } + + labels := make(map[string][]string) + for _, node := range nodes { + for label, value := range node.Node().ObjectMeta.Labels { + ignore, _ := ignoredLabels[label] + if !ignore { + labels[label] = append(labels[label], value) + } + } + } + for _, labelValues := range labels { + if len(labelValues) != 2 || labelValues[0] != labelValues[1] { + return false + } + } + return true +} + +func compareResourceEqualWithTolerance(x, y, tolerance resource.Quantity) bool { + x.Sub(y) + if x.Sign() == -1 { + x.Neg() + } + return x.Cmp(tolerance) != 1 // <= max +}