From be1d9cb8d625348de9d8cb9642b81db6df90dbc2 Mon Sep 17 00:00:00 2001 From: Joel Speed Date: Thu, 7 May 2020 18:03:29 +0100 Subject: [PATCH] Allow 1.5% tolerance in memory capacity when comparing nodegroups In testing, AWS M5 instances can on occasion display approximately a 1% difference in memory capacity between availability zones, deployed with the same launch configuration and same AMI. Allow a 1.5% tolerance to give some buffer on the actual amount of memory discrepancy since in testing, some examples were just over 1% (eg 1.05%, 1.1%). Tests are included with capacity values taken from real instances to prevent future regression. --- .../nodegroupset/compare_nodegroups.go | 32 +++++------ .../nodegroupset/compare_nodegroups_test.go | 54 +++++++++++++++++-- 2 files changed, 66 insertions(+), 20 deletions(-) diff --git a/cluster-autoscaler/processors/nodegroupset/compare_nodegroups.go b/cluster-autoscaler/processors/nodegroupset/compare_nodegroups.go index 24fcc4bf415..e1c9d7fe8da 100644 --- a/cluster-autoscaler/processors/nodegroupset/compare_nodegroups.go +++ b/cluster-autoscaler/processors/nodegroupset/compare_nodegroups.go @@ -31,9 +31,9 @@ const ( // MaxFreeDifferenceRatio describes how free resources (allocatable - daemon and system pods) // can differ between groups in the same NodeGroupSet MaxFreeDifferenceRatio = 0.05 - // MaxMemoryDifferenceInKiloBytes describes how much memory - // capacity can differ but still be considered equal. - MaxMemoryDifferenceInKiloBytes = 256000 + // MaxCapacityMemoryDifferenceRatio describes how Node.Status.Capacity.Memory can differ between + // groups in the same NodeGroupSet + MaxCapacityMemoryDifferenceRatio = 0.015 ) // BasicIgnoredLabels define a set of basic labels that should be ignored when comparing the similarity @@ -53,21 +53,25 @@ var BasicIgnoredLabels = map[string]bool{ // similar enough to be considered a part of a single NodeGroupSet. type NodeInfoComparator func(n1, n2 *schedulerframework.NodeInfo) bool -func compareResourceMapsWithTolerance(resources map[apiv1.ResourceName][]resource.Quantity, +func resourceMapsWithinTolerance(resources map[apiv1.ResourceName][]resource.Quantity, maxDifferenceRatio float64) bool { for _, qtyList := range resources { - if len(qtyList) != 2 { - return false - } - larger := math.Max(float64(qtyList[0].MilliValue()), float64(qtyList[1].MilliValue())) - smaller := math.Min(float64(qtyList[0].MilliValue()), float64(qtyList[1].MilliValue())) - if larger-smaller > larger*maxDifferenceRatio { + if !resourceListWithinTolerance(qtyList, maxDifferenceRatio) { return false } } return true } +func resourceListWithinTolerance(qtyList []resource.Quantity, maxDifferenceRatio float64) bool { + if len(qtyList) != 2 { + return false + } + larger := math.Max(float64(qtyList[0].MilliValue()), float64(qtyList[1].MilliValue())) + smaller := math.Min(float64(qtyList[0].MilliValue()), float64(qtyList[1].MilliValue())) + return larger-smaller <= larger*maxDifferenceRatio +} + func compareLabels(nodes []*schedulerframework.NodeInfo, ignoredLabels map[string]bool) bool { labels := make(map[string][]string) for _, node := range nodes { @@ -131,9 +135,7 @@ func IsCloudProviderNodeInfoSimilar(n1, n2 *schedulerframework.NodeInfo, ignored } switch kind { case apiv1.ResourceMemory: - // For memory capacity we allow a small tolerance - memoryDifference := math.Abs(float64(qtyList[0].Value()) - float64(qtyList[1].Value())) - if memoryDifference > MaxMemoryDifferenceInKiloBytes { + if !resourceListWithinTolerance(qtyList, MaxCapacityMemoryDifferenceRatio) { return false } default: @@ -147,10 +149,10 @@ func IsCloudProviderNodeInfoSimilar(n1, n2 *schedulerframework.NodeInfo, ignored } // For allocatable and free we allow resource quantities to be within a few % of each other - if !compareResourceMapsWithTolerance(allocatable, MaxAllocatableDifferenceRatio) { + if !resourceMapsWithinTolerance(allocatable, MaxAllocatableDifferenceRatio) { return false } - if !compareResourceMapsWithTolerance(free, MaxFreeDifferenceRatio) { + if !resourceMapsWithinTolerance(free, MaxFreeDifferenceRatio) { return false } diff --git a/cluster-autoscaler/processors/nodegroupset/compare_nodegroups_test.go b/cluster-autoscaler/processors/nodegroupset/compare_nodegroups_test.go index b5ae8ac581c..49a9b8708b7 100644 --- a/cluster-autoscaler/processors/nodegroupset/compare_nodegroups_test.go +++ b/cluster-autoscaler/processors/nodegroupset/compare_nodegroups_test.go @@ -101,16 +101,60 @@ func TestNodesSimilarVariousRequirementsAndPods(t *testing.T) { func TestNodesSimilarVariousMemoryRequirements(t *testing.T) { comparator := CreateGenericNodeInfoComparator([]string{}) - n1 := BuildTestNode("node1", 1000, MaxMemoryDifferenceInKiloBytes) + n1 := BuildTestNode("node1", 1000, 1000) // Different memory capacity within tolerance - n2 := BuildTestNode("node2", 1000, MaxMemoryDifferenceInKiloBytes) - n2.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(2*MaxMemoryDifferenceInKiloBytes, resource.DecimalSI) + n2 := BuildTestNode("node2", 1000, 1000) + n2.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(1000-(1000*MaxCapacityMemoryDifferenceRatio)+1, resource.DecimalSI) checkNodesSimilar(t, n1, n2, comparator, true) // Different memory capacity exceeds tolerance - n3 := BuildTestNode("node3", 1000, MaxMemoryDifferenceInKiloBytes) - n3.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(2*MaxMemoryDifferenceInKiloBytes+1, resource.DecimalSI) + n3 := BuildTestNode("node3", 1000, 1000) + n3.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(1000-(1000*MaxCapacityMemoryDifferenceRatio)-1, resource.DecimalSI) + checkNodesSimilar(t, n1, n3, comparator, false) +} + +func TestNodesSimilarVariousLargeMemoryRequirementsM5XLarge(t *testing.T) { + comparator := CreateGenericNodeInfoComparator([]string{}) + + // Use realistic memory capacity (taken from real nodes) + // 15944120 KB ~= 16GiB (m5.xLarge) + q1 := resource.MustParse("16116152Ki") + q2 := resource.MustParse("15944120Ki") + + n1 := BuildTestNode("node1", 1000, q1.Value()) + + // Different memory capacity within tolerance + // Value taken from another m5.xLarge in a different zone + n2 := BuildTestNode("node2", 1000, q2.Value()) + checkNodesSimilar(t, n1, n2, comparator, true) + + // Different memory capacity exceeds tolerance + // Value of q1 * 1.02 + q3 := resource.MustParse("16438475Ki") + n3 := BuildTestNode("node3", 1000, q3.Value()) + checkNodesSimilar(t, n1, n3, comparator, false) +} + +func TestNodesSimilarVariousLargeMemoryRequirementsM516XLarge(t *testing.T) { + comparator := CreateGenericNodeInfoComparator([]string{}) + + // Use realistic memory capacity (taken from real nodes) + // 257217528 KB ~= 256GiB (m5.16xLarge) + q1 := resource.MustParse("259970052Ki") + q2 := resource.MustParse("257217528Ki") + + n1 := BuildTestNode("node1", 1000, q1.Value()) + + // Different memory capacity within tolerance + // Value taken from another m5.xLarge in a different zone + n2 := BuildTestNode("node2", 1000, q2.Value()) + checkNodesSimilar(t, n1, n2, comparator, true) + + // Different memory capacity exceeds tolerance + // Value of q1 * 1.02 + q3 := resource.MustParse("265169453Ki") + n3 := BuildTestNode("node3", 1000, q3.Value()) checkNodesSimilar(t, n1, n3, comparator, false) }