From a24ea6c66b5cc61bbaf834899e4d6e58453f78db Mon Sep 17 00:00:00 2001 From: Michael McCune Date: Tue, 23 Mar 2021 17:00:52 -0400 Subject: [PATCH] add cluster cores and memory bytes count metrics This change adds 4 metrics that can be used to monitor the minimum and maximum limits for CPU and memory, as well as the current counts in cores and bytes, respectively. The four metrics added are: * `cluster_autoscaler_cpu_limits_cores` * `cluster_autoscaler_cluster_cpu_current_cores` * `cluster_autoscaler_memory_limits_bytes` * `cluster_autoscaler_cluster_memory_current_bytes` This change also adds the `max_nodes_count` metric to the metrics proposal doc, as it was previously not recorded there. User story: As a cluster autoscaler user, I would like to monitor my cluster through metrics to determine when the cluster is nearing its limits for cores and memory usage. --- cluster-autoscaler/core/static_autoscaler.go | 23 ++++++++ cluster-autoscaler/main.go | 2 + cluster-autoscaler/metrics/metrics.go | 58 ++++++++++++++++++++ cluster-autoscaler/proposals/metrics.md | 5 ++ 4 files changed, 88 insertions(+) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 3dfced108c82..47afd916e000 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -246,6 +246,11 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError return nil } + // Update cluster resource usage metrics + coresTotal, memoryTotal := calculateCoresMemoryTotal(allNodes, currentTime) + metrics.UpdateClusterCPUCurrentCores(coresTotal) + metrics.UpdateClusterMemoryCurrentBytes(memoryTotal) + daemonsets, err := a.ListerRegistry.DaemonSetLister().List(labels.Everything()) if err != nil { klog.Errorf("Failed to get daemonset list: %v", err) @@ -800,3 +805,21 @@ func getUpcomingNodeInfos(registry *clusterstate.ClusterStateRegistry, nodeInfos } return upcomingNodes } + +func 
calculateCoresMemoryTotal(nodes []*apiv1.Node, timestamp time.Time) (int64, int64) { + // this function is essentially similar to the calculateScaleDownCoresMemoryTotal + // we want to check all nodes, aside from those deleting, to sum the cluster resource usage. + var coresTotal, memoryTotal int64 + for _, node := range nodes { + if isNodeBeingDeleted(node, timestamp) { + // Nodes being deleted do not count towards total cluster resources + continue + } + cores, memory := core_utils.GetNodeCoresAndMemory(node) + + coresTotal += cores + memoryTotal += memory + } + + return coresTotal, memoryTotal +} diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index 304bbedd6300..e7c8bc810c7b 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -330,6 +330,8 @@ func buildAutoscaler() (core.Autoscaler, error) { // These metrics should be published only once. metrics.UpdateNapEnabled(autoscalingOptions.NodeAutoprovisioningEnabled) metrics.UpdateMaxNodesCount(autoscalingOptions.MaxNodesTotal) + metrics.UpdateCPULimitsCores(autoscalingOptions.MinCoresTotal, autoscalingOptions.MaxCoresTotal) + metrics.UpdateMemoryLimitsBytes(autoscalingOptions.MinMemoryTotal, autoscalingOptions.MaxMemoryTotal) // Create autoscaler. 
return core.NewAutoscaler(opts) diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go index e3b5ed22edbd..9580ee3344c0 100644 --- a/cluster-autoscaler/metrics/metrics.go +++ b/cluster-autoscaler/metrics/metrics.go @@ -138,6 +138,38 @@ var ( }, ) + cpuCurrentCores = k8smetrics.NewGauge( + &k8smetrics.GaugeOpts{ + Namespace: caNamespace, + Name: "cluster_cpu_current_cores", + Help: "Current number of cores in the cluster, minus deleting nodes.", + }, + ) + + cpuLimitsCores = k8smetrics.NewGaugeVec( + &k8smetrics.GaugeOpts{ + Namespace: caNamespace, + Name: "cpu_limits_cores", + Help: "Minimum and maximum number of cores in the cluster.", + }, []string{"direction"}, + ) + + memoryCurrentBytes = k8smetrics.NewGauge( + &k8smetrics.GaugeOpts{ + Namespace: caNamespace, + Name: "cluster_memory_current_bytes", + Help: "Current number of bytes of memory in the cluster, minus deleting nodes.", + }, + ) + + memoryLimitsBytes = k8smetrics.NewGaugeVec( + &k8smetrics.GaugeOpts{ + Namespace: caNamespace, + Name: "memory_limits_bytes", + Help: "Minimum and maximum number of bytes of memory in cluster.", + }, []string{"direction"}, + ) + /**** Metrics related to autoscaler execution ****/ lastActivity = k8smetrics.NewGaugeVec( &k8smetrics.GaugeOpts{ @@ -288,6 +320,10 @@ func RegisterAll() { legacyregistry.MustRegister(nodeGroupsCount) legacyregistry.MustRegister(unschedulablePodsCount) legacyregistry.MustRegister(maxNodesCount) + legacyregistry.MustRegister(cpuCurrentCores) + legacyregistry.MustRegister(cpuLimitsCores) + legacyregistry.MustRegister(memoryCurrentBytes) + legacyregistry.MustRegister(memoryLimitsBytes) legacyregistry.MustRegister(lastActivity) legacyregistry.MustRegister(functionDuration) legacyregistry.MustRegister(functionDurationSummary) @@ -364,6 +400,28 @@ func UpdateMaxNodesCount(nodesCount int) { maxNodesCount.Set(float64(nodesCount)) } +// UpdateClusterCPUCurrentCores records the number of cores in the cluster, minus 
deleting nodes +func UpdateClusterCPUCurrentCores(coresCount int64) { + cpuCurrentCores.Set(float64(coresCount)) +} + +// UpdateCPULimitsCores records the minimum and maximum number of cores in the cluster +func UpdateCPULimitsCores(minCoresCount int64, maxCoresCount int64) { + cpuLimitsCores.WithLabelValues("minimum").Set(float64(minCoresCount)) + cpuLimitsCores.WithLabelValues("maximum").Set(float64(maxCoresCount)) +} + +// UpdateClusterMemoryCurrentBytes records the number of bytes of memory in the cluster, minus deleting nodes +func UpdateClusterMemoryCurrentBytes(memoryCount int64) { + memoryCurrentBytes.Set(float64(memoryCount)) +} + +// UpdateMemoryLimitsBytes records the minimum and maximum bytes of memory in the cluster +func UpdateMemoryLimitsBytes(minMemoryCount int64, maxMemoryCount int64) { + memoryLimitsBytes.WithLabelValues("minimum").Set(float64(minMemoryCount)) + memoryLimitsBytes.WithLabelValues("maximum").Set(float64(maxMemoryCount)) +} + // RegisterError records any errors preventing Cluster Autoscaler from working. // No more than one error should be recorded per loop. func RegisterError(err errors.AutoscalerError) { diff --git a/cluster-autoscaler/proposals/metrics.md b/cluster-autoscaler/proposals/metrics.md index 6cb3b5ac0d06..d21eb1a2f642 100644 --- a/cluster-autoscaler/proposals/metrics.md +++ b/cluster-autoscaler/proposals/metrics.md @@ -27,6 +27,11 @@ All the metrics are prefixed with `cluster_autoscaler_`. | nodes_count | Gauge | `state`=<node-state> | Number of nodes in cluster. | | unschedulable_pods_count | Gauge | | Number of unschedulable ("Pending") pods in the cluster. | | node_groups_count | Gauge | `node_group_type`=<node-group-type> | Number of node groups managed by CA. | +| max_nodes_count | Gauge | | Maximum number of nodes in all node groups. | +| cluster_cpu_current_cores | Gauge | | Current number of cores in the cluster, minus deleting nodes. 
| +| cpu_limits_cores | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of cores in the cluster. | +| cluster_memory_current_bytes | Gauge | | Current number of bytes of memory in the cluster, minus deleting nodes. | +| memory_limits_bytes | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of bytes of memory in cluster. | * `cluster_safe_to_autoscale` indicates whether cluster is healthy enough for autoscaling. CA stops all operations if significant number of nodes are unready (by default 33% as of CA 0.5.4). * `nodes_count` records the total number of nodes, labeled by node state. Possible