From 17e062b69fc84d9e1063867a63e63a42d7f112d2 Mon Sep 17 00:00:00 2001
From: Kubernetes Prow Robot
Date: Mon, 5 Jul 2021 07:38:54 -0700
Subject: [PATCH 1/2] cherry-pick #4022 [cluster-autoscaler] Publish node group min/max metrics

---
 cluster-autoscaler/FAQ.md                    |  1 +
 cluster-autoscaler/core/scale_up_test.go     |  2 +-
 cluster-autoscaler/core/static_autoscaler.go |  6 +++
 cluster-autoscaler/main.go                   |  4 +-
 cluster-autoscaler/metrics/metrics.go        | 33 ++++++++++++++-
 cluster-autoscaler/metrics/metrics_test.go   | 42 ++++++++++++++++++++
 6 files changed, 85 insertions(+), 3 deletions(-)
 create mode 100644 cluster-autoscaler/metrics/metrics_test.go

diff --git a/cluster-autoscaler/FAQ.md b/cluster-autoscaler/FAQ.md
index 681c7899a603..22d680a29e5e 100644
--- a/cluster-autoscaler/FAQ.md
+++ b/cluster-autoscaler/FAQ.md
@@ -673,6 +673,7 @@ The following startup parameters are supported for cluster autoscaler:
 | `max-node-provision-time` | Maximum time CA waits for node to be provisioned | 15 minutes
 | `nodes` | sets min,max size and other configuration data for a node group in a format accepted by cloud provider. Can be used multiple times. Format: <min>:<max>:<name> | ""
 | `node-group-auto-discovery` | One or more definition(s) of node group auto-discovery. <br>A definition is expressed `<name of discoverer>:[<key>[=<value>]]` <br>The `aws`, `gce`, and `azure` cloud providers are currently supported. AWS matches by ASG tags, e.g. `asg:tag=tagKey,anotherTagKey` <br>GCE matches by IG name prefix, and requires you to specify min and max nodes per IG, e.g. `mig:namePrefix=pfx,min=0,max=10` <br>Azure matches by tags on VMSS, e.g. `label:foo=bar`, and will auto-detect `min` and `max` tags on the VMSS to set scaling limits. <br>Can be used multiple times | ""
+| `emit-per-nodegroup-metrics` | If true, emit per node group metrics. | false
 | `estimator` | Type of resource estimator to be used in scale up | binpacking
 | `expander` | Type of node group expander to be used in scale up. | random
 | `write-status-configmap` | Should CA write status information to a configmap | true
diff --git a/cluster-autoscaler/core/scale_up_test.go b/cluster-autoscaler/core/scale_up_test.go
index 6ff0902bbe33..6da8cb6d509e 100644
--- a/cluster-autoscaler/core/scale_up_test.go
+++ b/cluster-autoscaler/core/scale_up_test.go
@@ -978,7 +978,7 @@ func TestCheckScaleUpDeltaWithinLimits(t *testing.T) {
 }
 
 func TestAuthError(t *testing.T) {
-	metrics.RegisterAll()
+	metrics.RegisterAll(false)
 	context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{}, &fake.Clientset{}, nil, nil, nil)
 	assert.NoError(t, err)
 
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index 0b2df38946ba..2b7a97738eb8 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -258,6 +258,12 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		return errors.ToAutoscalerError(errors.CloudProviderError, err)
 	}
 
+	// Update node groups min/max after cloud provider refresh
+	for _, nodeGroup := range a.AutoscalingContext.CloudProvider.NodeGroups() {
+		metrics.UpdateNodeGroupMin(nodeGroup.Id(), nodeGroup.MinSize())
+		metrics.UpdateNodeGroupMax(nodeGroup.Id(), nodeGroup.MaxSize())
+	}
+
 	nonExpendableScheduledPods := core_utils.FilterOutExpendablePods(originalScheduledPods, a.ExpendablePodsPriorityCutoff)
 	// Initialize cluster state to ClusterSnapshot
 	if typedErr := a.initializeClusterSnapshot(allNodes, nonExpendableScheduledPods); typedErr != nil {
diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go
index d6316c051f87..17564b3da896 100644
--- a/cluster-autoscaler/main.go
+++ b/cluster-autoscaler/main.go
@@ -180,6 +180,8 @@ var (
 	cordonNodeBeforeTerminate = flag.Bool("cordon-node-before-terminating", false, "Should CA cordon nodes before terminating during downscale process")
 	daemonSetEvictionForEmptyNodes = flag.Bool("daemonset-eviction-for-empty-nodes", false, "DaemonSet pods will be gracefully terminated from empty nodes")
 	userAgent = flag.String("user-agent", "cluster-autoscaler", "User agent used for HTTP calls.")
+
+	emitPerNodeGroupMetrics = flag.Bool("emit-per-nodegroup-metrics", false, "If true, emit per node group metrics.")
 )
 
 func createAutoscalingOptions() config.AutoscalingOptions {
@@ -339,7 +341,7 @@ func buildAutoscaler() (core.Autoscaler, error) {
 }
 
 func run(healthCheck *metrics.HealthCheck) {
-	metrics.RegisterAll()
+	metrics.RegisterAll(*emitPerNodeGroupMetrics)
 
 	autoscaler, err := buildAutoscaler()
 	if err != nil {
diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
index e3b5ed22edbd..6c501b7965d0 100644
--- a/cluster-autoscaler/metrics/metrics.go
+++ b/cluster-autoscaler/metrics/metrics.go
@@ -138,6 +138,22 @@ var (
 		},
 	)
 
+	nodesGroupMinNodes = k8smetrics.NewGaugeVec(
+		&k8smetrics.GaugeOpts{
+			Namespace: caNamespace,
+			Name: "node_group_min_count",
+			Help: "Minimum number of nodes in the node group",
+		}, []string{"node_group"},
+	)
+
+	nodesGroupMaxNodes = k8smetrics.NewGaugeVec(
+		&k8smetrics.GaugeOpts{
+			Namespace: caNamespace,
+			Name: "node_group_max_count",
+			Help: "Maximum number of nodes in the node group",
+		}, []string{"node_group"},
+	)
+
 	/**** Metrics related to autoscaler execution ****/
 	lastActivity = k8smetrics.NewGaugeVec(
 		&k8smetrics.GaugeOpts{
@@ -282,7 +298,7 @@ var (
 )
 
 // RegisterAll registers all metrics.
-func RegisterAll() {
+func RegisterAll(emitPerNodeGroupMetrics bool) {
 	legacyregistry.MustRegister(clusterSafeToAutoscale)
 	legacyregistry.MustRegister(nodesCount)
 	legacyregistry.MustRegister(nodeGroupsCount)
@@ -305,6 +321,11 @@
 	legacyregistry.MustRegister(napEnabled)
 	legacyregistry.MustRegister(nodeGroupCreationCount)
 	legacyregistry.MustRegister(nodeGroupDeletionCount)
+
+	if emitPerNodeGroupMetrics {
+		legacyregistry.MustRegister(nodesGroupMinNodes)
+		legacyregistry.MustRegister(nodesGroupMaxNodes)
+	}
 }
 
 // UpdateDurationFromStart records the duration of the step identified by the
@@ -364,6 +385,16 @@ func UpdateMaxNodesCount(nodesCount int) {
 	maxNodesCount.Set(float64(nodesCount))
 }
 
+// UpdateNodeGroupMin records the node group minimum allowed number of nodes
+func UpdateNodeGroupMin(nodeGroup string, minNodes int) {
+	nodesGroupMinNodes.WithLabelValues(nodeGroup).Set(float64(minNodes))
+}
+
+// UpdateNodeGroupMax records the node group maximum allowed number of nodes
+func UpdateNodeGroupMax(nodeGroup string, maxNodes int) {
+	nodesGroupMaxNodes.WithLabelValues(nodeGroup).Set(float64(maxNodes))
+}
+
 // RegisterError records any errors preventing Cluster Autoscaler from working.
 // No more than one error should be recorded per loop.
 func RegisterError(err errors.AutoscalerError) {
diff --git a/cluster-autoscaler/metrics/metrics_test.go b/cluster-autoscaler/metrics/metrics_test.go
new file mode 100644
index 000000000000..71789d2c7a48
--- /dev/null
+++ b/cluster-autoscaler/metrics/metrics_test.go
@@ -0,0 +1,42 @@
+/*
+Copyright 2021 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"testing"
+
+	"github.com/prometheus/client_golang/prometheus/testutil"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestDisabledPerNodeGroupMetrics(t *testing.T) {
+	RegisterAll(false)
+	assert.False(t, nodesGroupMinNodes.IsCreated())
+	assert.False(t, nodesGroupMaxNodes.IsCreated())
+}
+
+func TestEnabledPerNodeGroupMetrics(t *testing.T) {
+	RegisterAll(true)
+	assert.True(t, nodesGroupMinNodes.IsCreated())
+	assert.True(t, nodesGroupMaxNodes.IsCreated())
+
+	UpdateNodeGroupMin("foo", 2)
+	UpdateNodeGroupMax("foo", 100)
+
+	assert.Equal(t, 2, int(testutil.ToFloat64(nodesGroupMinNodes.GaugeVec.WithLabelValues("foo"))))
+	assert.Equal(t, 100, int(testutil.ToFloat64(nodesGroupMaxNodes.GaugeVec.WithLabelValues("foo"))))
+}

From 482cb1e71dd5a25d6bb7eead01adf5e921181c65 Mon Sep 17 00:00:00 2001
From: Maciek Pytel
Date: Thu, 8 Jul 2021 15:14:26 +0200
Subject: [PATCH 2/2] Skipping metrics tests added in #4022

Each test works in isolation, but they cause a panic when the entire
suite is run (e.g. make test-in-docker), because the underlying metrics
library panics when the same metric is registered twice.

(cherry picked from commit 52392b3707cb8192bd2841b6f2e8da9678c13fd9)
---
 cluster-autoscaler/metrics/metrics_test.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cluster-autoscaler/metrics/metrics_test.go b/cluster-autoscaler/metrics/metrics_test.go
index 71789d2c7a48..4bbe87b526f6 100644
--- a/cluster-autoscaler/metrics/metrics_test.go
+++ b/cluster-autoscaler/metrics/metrics_test.go
@@ -24,12 +24,14 @@ import (
 )
 
 func TestDisabledPerNodeGroupMetrics(t *testing.T) {
+	t.Skip("Registering metrics multiple times causes panic. Skipping until the test is fixed to not impact other tests.")
 	RegisterAll(false)
 	assert.False(t, nodesGroupMinNodes.IsCreated())
 	assert.False(t, nodesGroupMaxNodes.IsCreated())
 }
 
 func TestEnabledPerNodeGroupMetrics(t *testing.T) {
+	t.Skip("Registering metrics multiple times causes panic. Skipping until the test is fixed to not impact other tests.")
 	RegisterAll(true)
 	assert.True(t, nodesGroupMinNodes.IsCreated())
 	assert.True(t, nodesGroupMaxNodes.IsCreated())
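
Note on the skips above: one possible direction for un-skipping these tests is to make registration idempotent, so that repeated RegisterAll calls within a single test binary never trigger the duplicate-registration panic. The sketch below is only an illustration and is not part of either patch; the registerBaseOnce and registerPerNodeGroupOnce guards are hypothetical names, the elided collectors are the ones already registered by the patch, and it assumes it lives in the existing metrics package (with "sync" added to the imports).

var (
	// Hypothetical guards (not in the patches above): ensure each group of
	// collectors is registered at most once per process, even when several
	// tests call RegisterAll.
	registerBaseOnce         sync.Once
	registerPerNodeGroupOnce sync.Once
)

// RegisterAll registers all metrics and tolerates being called repeatedly.
func RegisterAll(emitPerNodeGroupMetrics bool) {
	registerBaseOnce.Do(func() {
		legacyregistry.MustRegister(clusterSafeToAutoscale)
		// ... remaining always-on collectors, as in the patch ...
	})
	if emitPerNodeGroupMetrics {
		registerPerNodeGroupOnce.Do(func() {
			legacyregistry.MustRegister(nodesGroupMinNodes)
			legacyregistry.MustRegister(nodesGroupMaxNodes)
		})
	}
}

The trade-off is that TestDisabledPerNodeGroupMetrics would then only prove the per-node-group collectors are absent if it runs before the enabled-path test, so the assertions would likely still need to move to a dedicated, per-test registry to be fully order-independent.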