diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 3dfced108c82..3e537f14d764 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -246,6 +246,11 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError return nil } + // Update cluster resource usage metrics + coresTotal, memoryTotal := calculateCoresMemoryTotal(allNodes, currentTime) + metrics.UpdateClusterCoresCount(coresTotal) + metrics.UpdateClusterMemoryCount(memoryTotal) + daemonsets, err := a.ListerRegistry.DaemonSetLister().List(labels.Everything()) if err != nil { klog.Errorf("Failed to get daemonset list: %v", err) @@ -800,3 +805,21 @@ func getUpcomingNodeInfos(registry *clusterstate.ClusterStateRegistry, nodeInfos } return upcomingNodes } + +func calculateCoresMemoryTotal(nodes []*apiv1.Node, timestamp time.Time) (int64, int64) { + // this function is essentially similar to the calculateScaleDownCoresMemoryTotal + // we want to check all nodes, aside from those deleting, to sum the cluster resource usage. 
+ var coresTotal, memoryTotal int64 + for _, node := range nodes { + if isNodeBeingDeleted(node, timestamp) { + // Nodes being deleted do not count towards total cluster resources + continue + } + cores, memory := core_utils.GetNodeCoresAndMemory(node) + + coresTotal += cores + memoryTotal += memory + } + + return coresTotal, memoryTotal +} diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go index 782ba930e77e..ea1bc2bad601 100644 --- a/cluster-autoscaler/metrics/metrics.go +++ b/cluster-autoscaler/metrics/metrics.go @@ -138,6 +138,14 @@ var ( }, ) + clusterCoresCount = k8smetrics.NewGauge( + &k8smetrics.GaugeOpts{ + Namespace: caNamespace, + Name: "cluster_cores_count", + Help: "Current number of cores in the cluster, minus deleting nodes.", + }, + ) + coresLimitsCount = k8smetrics.NewGaugeVec( &k8smetrics.GaugeOpts{ Namespace: caNamespace, @@ -146,6 +154,14 @@ var ( }, []string{"direction"}, ) + clusterMemoryCount = k8smetrics.NewGauge( + &k8smetrics.GaugeOpts{ + Namespace: caNamespace, + Name: "cluster_memory_count", + Help: "Current number of bytes of memory in the cluster, minus deleting nodes.", + }, + ) + memoryLimitsCount = k8smetrics.NewGaugeVec( &k8smetrics.GaugeOpts{ Namespace: caNamespace, @@ -304,7 +320,9 @@ func RegisterAll() { legacyregistry.MustRegister(nodeGroupsCount) legacyregistry.MustRegister(unschedulablePodsCount) legacyregistry.MustRegister(maxNodesCount) + legacyregistry.MustRegister(clusterCoresCount) legacyregistry.MustRegister(coresLimitsCount) + legacyregistry.MustRegister(clusterMemoryCount) legacyregistry.MustRegister(memoryLimitsCount) legacyregistry.MustRegister(lastActivity) legacyregistry.MustRegister(functionDuration) @@ -382,12 +400,22 @@ func UpdateMaxNodesCount(nodesCount int) { maxNodesCount.Set(float64(nodesCount)) } +// UpdateClusterCoresCount records the number of cores in the cluster, minus deleting nodes +func UpdateClusterCoresCount(coresCount int64) { + 
clusterCoresCount.Set(float64(coresCount)) +} + // UpdateCoresLimitsCount records the minimum and maximum number of cores in the cluster func UpdateCoresLimitsCount(minCoresCount int64, maxCoresCount int64) { coresLimitsCount.WithLabelValues("minimum").Set(float64(minCoresCount)) coresLimitsCount.WithLabelValues("maximum").Set(float64(maxCoresCount)) } +// UpdateClusterMemoryCount records the number of bytes of memory in the cluster, minus deleting nodes +func UpdateClusterMemoryCount(memoryCount int64) { + clusterMemoryCount.Set(float64(memoryCount)) +} + // UpdateMemoryLimitsCount records the minimum and maximum bytes of memory in the cluster func UpdateMemoryLimitsCount(minMemoryCount int64, maxMemoryCount int64) { memoryLimitsCount.WithLabelValues("minimum").Set(float64(minMemoryCount)) diff --git a/cluster-autoscaler/proposals/metrics.md b/cluster-autoscaler/proposals/metrics.md index 3b535f46948e..31d7062ad5ee 100644 --- a/cluster-autoscaler/proposals/metrics.md +++ b/cluster-autoscaler/proposals/metrics.md @@ -28,7 +28,9 @@ All the metrics are prefixed with `cluster_autoscaler_`. | unschedulable_pods_count | Gauge | | Number of unschedulable ("Pending") pods in the cluster. | | node_groups_count | Gauge | `node_group_type`=<node-group-type> | Number of node groups managed by CA. | | max_nodes_count | Gauge | | Maximum number of nodes in all node groups. | +| cluster_cores_count | Gauge | | Current number of cores in the cluster, minus deleting nodes. | | cores_limits_count | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of cores in the cluster. | +| cluster_memory_count | Gauge | | Current number of bytes of memory in the cluster, minus deleting nodes. | | memory_limits_count | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of bytes of memory in cluster. | * `cluster_safe_to_autoscale` indicates whether cluster is healthy enough for autoscaling. 
CA stops all operations if significant number of nodes are unready (by default 33% as of CA 0.5.4).