From 17e062b69fc84d9e1063867a63e63a42d7f112d2 Mon Sep 17 00:00:00 2001
From: Kubernetes Prow Robot
Date: Mon, 5 Jul 2021 07:38:54 -0700
Subject: [PATCH 1/2] cherry-pick #4022 [cluster-autoscaler] Publish node group min/max metrics

---
 cluster-autoscaler/FAQ.md                    |  1 +
 cluster-autoscaler/core/scale_up_test.go     |  2 +-
 cluster-autoscaler/core/static_autoscaler.go |  6 +++
 cluster-autoscaler/main.go                   |  4 +-
 cluster-autoscaler/metrics/metrics.go        | 33 ++++++++++++++-
 cluster-autoscaler/metrics/metrics_test.go   | 42 ++++++++++++++++++++
 6 files changed, 85 insertions(+), 3 deletions(-)
 create mode 100644 cluster-autoscaler/metrics/metrics_test.go

diff --git a/cluster-autoscaler/FAQ.md b/cluster-autoscaler/FAQ.md
index 681c7899a603..22d680a29e5e 100644
--- a/cluster-autoscaler/FAQ.md
+++ b/cluster-autoscaler/FAQ.md
@@ -673,6 +673,7 @@ The following startup parameters are supported for cluster autoscaler:
 | `max-node-provision-time` | Maximum time CA waits for node to be provisioned | 15 minutes
 | `nodes` | sets min,max size and other configuration data for a node group in a format accepted by cloud provider. Can be used multiple times. Format: <min>:<max>:<name> | ""
 | `node-group-auto-discovery` | One or more definition(s) of node group auto-discovery. <br>A definition is expressed `<name of discoverer>:[<key>[=<value>]]` <br>The `aws`, `gce`, and `azure` cloud providers are currently supported. AWS matches by ASG tags, e.g. `asg:tag=tagKey,anotherTagKey` <br>GCE matches by IG name prefix, and requires you to specify min and max nodes per IG, e.g. `mig:namePrefix=pfx,min=0,max=10` <br>Azure matches by tags on VMSS, e.g. `label:foo=bar`, and will auto-detect `min` and `max` tags on the VMSS to set scaling limits. <br>Can be used multiple times | ""
+| `emit-per-nodegroup-metrics` | If true, emit per node group metrics. | false
 | `estimator` | Type of resource estimator to be used in scale up | binpacking
 | `expander` | Type of node group expander to be used in scale up. | random
 | `write-status-configmap` | Should CA write status information to a configmap | true
diff --git a/cluster-autoscaler/core/scale_up_test.go b/cluster-autoscaler/core/scale_up_test.go
index 6ff0902bbe33..6da8cb6d509e 100644
--- a/cluster-autoscaler/core/scale_up_test.go
+++ b/cluster-autoscaler/core/scale_up_test.go
@@ -978,7 +978,7 @@ func TestCheckScaleUpDeltaWithinLimits(t *testing.T) {
 }
 
 func TestAuthError(t *testing.T) {
-	metrics.RegisterAll()
+	metrics.RegisterAll(false)
 	context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{}, &fake.Clientset{}, nil, nil, nil)
 	assert.NoError(t, err)
 
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index 0b2df38946ba..2b7a97738eb8 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -258,6 +258,12 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		return errors.ToAutoscalerError(errors.CloudProviderError, err)
 	}
 
+	// Update node groups min/max after cloud provider refresh
+	for _, nodeGroup := range a.AutoscalingContext.CloudProvider.NodeGroups() {
+		metrics.UpdateNodeGroupMin(nodeGroup.Id(), nodeGroup.MinSize())
+		metrics.UpdateNodeGroupMax(nodeGroup.Id(), nodeGroup.MaxSize())
+	}
+
 	nonExpendableScheduledPods := core_utils.FilterOutExpendablePods(originalScheduledPods, a.ExpendablePodsPriorityCutoff)
 	// Initialize cluster state to ClusterSnapshot
 	if typedErr := a.initializeClusterSnapshot(allNodes, nonExpendableScheduledPods); typedErr != nil {
diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go
index d6316c051f87..17564b3da896 100644
--- a/cluster-autoscaler/main.go
+++ b/cluster-autoscaler/main.go
@@ -180,6 +180,8 @@ var (
 	cordonNodeBeforeTerminate = flag.Bool("cordon-node-before-terminating", false, "Should CA cordon nodes before terminating during downscale process")
 	daemonSetEvictionForEmptyNodes = flag.Bool("daemonset-eviction-for-empty-nodes", false, "DaemonSet pods will be gracefully terminated from empty nodes")
 	userAgent = flag.String("user-agent", "cluster-autoscaler", "User agent used for HTTP calls.")
+
+	emitPerNodeGroupMetrics = flag.Bool("emit-per-nodegroup-metrics", false, "If true, emit per node group metrics.")
 )
 
 func createAutoscalingOptions() config.AutoscalingOptions {
@@ -339,7 +341,7 @@ func buildAutoscaler() (core.Autoscaler, error) {
 }
 
 func run(healthCheck *metrics.HealthCheck) {
-	metrics.RegisterAll()
+	metrics.RegisterAll(*emitPerNodeGroupMetrics)
 
 	autoscaler, err := buildAutoscaler()
 	if err != nil {
diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
index e3b5ed22edbd..6c501b7965d0 100644
--- a/cluster-autoscaler/metrics/metrics.go
+++ b/cluster-autoscaler/metrics/metrics.go
@@ -138,6 +138,22 @@ var (
 		},
 	)
 
+	nodesGroupMinNodes = k8smetrics.NewGaugeVec(
+		&k8smetrics.GaugeOpts{
+			Namespace: caNamespace,
+			Name: "node_group_min_count",
+			Help: "Minimum number of nodes in the node group",
+		}, []string{"node_group"},
+	)
+
+	nodesGroupMaxNodes = k8smetrics.NewGaugeVec(
+		&k8smetrics.GaugeOpts{
+			Namespace: caNamespace,
+			Name: "node_group_max_count",
+			Help: "Maximum number of nodes in the node group",
+		}, []string{"node_group"},
+	)
+
 	/**** Metrics related to autoscaler execution ****/
 	lastActivity = k8smetrics.NewGaugeVec(
 		&k8smetrics.GaugeOpts{
@@ -282,7 +298,7 @@ var (
 )
 
 // RegisterAll registers all metrics.
-func RegisterAll() {
+func RegisterAll(emitPerNodeGroupMetrics bool) {
 	legacyregistry.MustRegister(clusterSafeToAutoscale)
 	legacyregistry.MustRegister(nodesCount)
 	legacyregistry.MustRegister(nodeGroupsCount)
@@ -305,6 +321,11 @@
 	legacyregistry.MustRegister(napEnabled)
 	legacyregistry.MustRegister(nodeGroupCreationCount)
 	legacyregistry.MustRegister(nodeGroupDeletionCount)
+
+	if emitPerNodeGroupMetrics {
+		legacyregistry.MustRegister(nodesGroupMinNodes)
+		legacyregistry.MustRegister(nodesGroupMaxNodes)
+	}
 }
 
 // UpdateDurationFromStart records the duration of the step identified by the
@@ -364,6 +385,16 @@ func UpdateMaxNodesCount(nodesCount int) {
 	maxNodesCount.Set(float64(nodesCount))
 }
 
+// UpdateNodeGroupMin records the node group minimum allowed number of nodes
+func UpdateNodeGroupMin(nodeGroup string, minNodes int) {
+	nodesGroupMinNodes.WithLabelValues(nodeGroup).Set(float64(minNodes))
+}
+
+// UpdateNodeGroupMax records the node group maximum allowed number of nodes
+func UpdateNodeGroupMax(nodeGroup string, maxNodes int) {
+	nodesGroupMaxNodes.WithLabelValues(nodeGroup).Set(float64(maxNodes))
+}
+
 // RegisterError records any errors preventing Cluster Autoscaler from working.
 // No more than one error should be recorded per loop.
 func RegisterError(err errors.AutoscalerError) {
diff --git a/cluster-autoscaler/metrics/metrics_test.go b/cluster-autoscaler/metrics/metrics_test.go
new file mode 100644
index 000000000000..71789d2c7a48
--- /dev/null
+++ b/cluster-autoscaler/metrics/metrics_test.go
@@ -0,0 +1,42 @@
+/*
+Copyright 2021 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"testing"
+
+	"github.com/prometheus/client_golang/prometheus/testutil"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestDisabledPerNodeGroupMetrics(t *testing.T) {
+	RegisterAll(false)
+	assert.False(t, nodesGroupMinNodes.IsCreated())
+	assert.False(t, nodesGroupMaxNodes.IsCreated())
+}
+
+func TestEnabledPerNodeGroupMetrics(t *testing.T) {
+	RegisterAll(true)
+	assert.True(t, nodesGroupMinNodes.IsCreated())
+	assert.True(t, nodesGroupMaxNodes.IsCreated())
+
+	UpdateNodeGroupMin("foo", 2)
+	UpdateNodeGroupMax("foo", 100)
+
+	assert.Equal(t, 2, int(testutil.ToFloat64(nodesGroupMinNodes.GaugeVec.WithLabelValues("foo"))))
+	assert.Equal(t, 100, int(testutil.ToFloat64(nodesGroupMaxNodes.GaugeVec.WithLabelValues("foo"))))
+}

From 482cb1e71dd5a25d6bb7eead01adf5e921181c65 Mon Sep 17 00:00:00 2001
From: Maciek Pytel
Date: Thu, 8 Jul 2021 15:14:26 +0200
Subject: [PATCH 2/2] Skipping metrics tests added in #4022

Each test works in isolation, but they cause a panic when the entire
suite is run (e.g. make test-in-docker), because the underlying metrics
library panics when the same metric is registered twice.

(cherry picked from commit 52392b3707cb8192bd2841b6f2e8da9678c13fd9)
---
 cluster-autoscaler/metrics/metrics_test.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cluster-autoscaler/metrics/metrics_test.go b/cluster-autoscaler/metrics/metrics_test.go
index 71789d2c7a48..4bbe87b526f6 100644
--- a/cluster-autoscaler/metrics/metrics_test.go
+++ b/cluster-autoscaler/metrics/metrics_test.go
@@ -24,12 +24,14 @@ import (
 )
 
 func TestDisabledPerNodeGroupMetrics(t *testing.T) {
+	t.Skip("Registering metrics multiple times causes panic. Skipping until the test is fixed to not impact other tests.")
 	RegisterAll(false)
 	assert.False(t, nodesGroupMinNodes.IsCreated())
 	assert.False(t, nodesGroupMaxNodes.IsCreated())
 }
 
 func TestEnabledPerNodeGroupMetrics(t *testing.T) {
+	t.Skip("Registering metrics multiple times causes panic. Skipping until the test is fixed to not impact other tests.")
 	RegisterAll(true)
 	assert.True(t, nodesGroupMinNodes.IsCreated())
 	assert.True(t, nodesGroupMaxNodes.IsCreated())
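
Note on the skips above: one possible direction for un-skipping these tests is to make registration idempotent, so that repeated RegisterAll calls within a single test binary never trigger the duplicate-registration panic. The sketch below is only an illustration and is not part of either patch; the registerBaseOnce and registerPerNodeGroupOnce guards are hypothetical names, the elided collectors are the ones already registered by the patch, and it assumes it lives in the existing metrics package (with "sync" added to the imports).

var (
	// Hypothetical guards (not in the patches above): ensure each group of
	// collectors is registered at most once per process, even when several
	// tests call RegisterAll.
	registerBaseOnce         sync.Once
	registerPerNodeGroupOnce sync.Once
)

// RegisterAll registers all metrics and tolerates being called repeatedly.
func RegisterAll(emitPerNodeGroupMetrics bool) {
	registerBaseOnce.Do(func() {
		legacyregistry.MustRegister(clusterSafeToAutoscale)
		// ... remaining always-on collectors, as in the patch ...
	})
	if emitPerNodeGroupMetrics {
		registerPerNodeGroupOnce.Do(func() {
			legacyregistry.MustRegister(nodesGroupMinNodes)
			legacyregistry.MustRegister(nodesGroupMaxNodes)
		})
	}
}

The trade-off is that TestDisabledPerNodeGroupMetrics would then only prove the per-node-group collectors are absent if it runs before the enabled-path test, so the assertions would likely still need to move to a dedicated, per-test registry to be fully order-independent.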