Skip to content

Commit

Permalink
add cluster cores and memory bytes count metrics
Browse files Browse the repository at this point in the history
This change adds 4 metrics that can be used to monitor the minimum and
maximum limits for CPU and memory, as well as the current counts in
cores and bytes, respectively.

The four metrics added are:
* `cluster_autoscaler_cpu_limits_cores`
* `cluster_autoscaler_cluster_cpu_current_cores`
* `cluster_autoscaler_memory_limits_bytes`
* `cluster_autoscaler_cluster_memory_current_bytes`

This change also adds the `max_cores_total` metric to the metrics
proposal doc, as it was previously not recorded there.

User story: As a cluster autoscaler user, I would like to monitor my
cluster through metrics to determine when the cluster is nearing its
limits for cores and memory usage.
  • Loading branch information
elmiko committed Apr 6, 2021
1 parent 6dcda9d commit a24ea6c
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 0 deletions.
23 changes: 23 additions & 0 deletions cluster-autoscaler/core/static_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,11 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
return nil
}

// Update cluster resource usage metrics
coresTotal, memoryTotal := calculateCoresMemoryTotal(allNodes, currentTime)
metrics.UpdateClusterCPUCurrentCores(coresTotal)
metrics.UpdateClusterMemoryCurrentBytes(memoryTotal)

daemonsets, err := a.ListerRegistry.DaemonSetLister().List(labels.Everything())
if err != nil {
klog.Errorf("Failed to get daemonset list: %v", err)
Expand Down Expand Up @@ -800,3 +805,21 @@ func getUpcomingNodeInfos(registry *clusterstate.ClusterStateRegistry, nodeInfos
}
return upcomingNodes
}

func calculateCoresMemoryTotal(nodes []*apiv1.Node, timestamp time.Time) (int64, int64) {
// this function is essentially similar to the calculateScaleDownCoresMemoryTotal
// we want to check all nodes, aside from those deleting, to sum the cluster resource usage.
var coresTotal, memoryTotal int64
for _, node := range nodes {
if isNodeBeingDeleted(node, timestamp) {
// Nodes being deleted do not count towards total cluster resources
continue
}
cores, memory := core_utils.GetNodeCoresAndMemory(node)

coresTotal += cores
memoryTotal += memory
}

return coresTotal, memoryTotal
}
2 changes: 2 additions & 0 deletions cluster-autoscaler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,8 @@ func buildAutoscaler() (core.Autoscaler, error) {
// These metrics should be published only once.
metrics.UpdateNapEnabled(autoscalingOptions.NodeAutoprovisioningEnabled)
metrics.UpdateMaxNodesCount(autoscalingOptions.MaxNodesTotal)
metrics.UpdateCPULimitsCores(autoscalingOptions.MinCoresTotal, autoscalingOptions.MaxCoresTotal)
metrics.UpdateMemoryLimitsBytes(autoscalingOptions.MinMemoryTotal, autoscalingOptions.MaxMemoryTotal)

// Create autoscaler.
return core.NewAutoscaler(opts)
Expand Down
58 changes: 58 additions & 0 deletions cluster-autoscaler/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,38 @@ var (
},
)

cpuCurrentCores = k8smetrics.NewGauge(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "cluster_cpu_current_cores",
Help: "Current number of cores in the cluster, minus deleting nodes.",
},
)

cpuLimitsCores = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "cpu_limits_cores",
Help: "Minimum and maximum number of cores in the cluster.",
}, []string{"direction"},
)

memoryCurrentBytes = k8smetrics.NewGauge(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "cluster_memory_current_bytes",
Help: "Current number of bytes of memory in the cluster, minus deleting nodes.",
},
)

memoryLimitsBytes = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "memory_limits_bytes",
Help: "Minimum and maximum number of bytes of memory in cluster.",
}, []string{"direction"},
)

/**** Metrics related to autoscaler execution ****/
lastActivity = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Expand Down Expand Up @@ -288,6 +320,10 @@ func RegisterAll() {
legacyregistry.MustRegister(nodeGroupsCount)
legacyregistry.MustRegister(unschedulablePodsCount)
legacyregistry.MustRegister(maxNodesCount)
legacyregistry.MustRegister(cpuCurrentCores)
legacyregistry.MustRegister(cpuLimitsCores)
legacyregistry.MustRegister(memoryCurrentBytes)
legacyregistry.MustRegister(memoryLimitsBytes)
legacyregistry.MustRegister(lastActivity)
legacyregistry.MustRegister(functionDuration)
legacyregistry.MustRegister(functionDurationSummary)
Expand Down Expand Up @@ -364,6 +400,28 @@ func UpdateMaxNodesCount(nodesCount int) {
maxNodesCount.Set(float64(nodesCount))
}

// UpdateClusterCPUCurrentCores records the number of cores in the cluster, minus deleting nodes
func UpdateClusterCPUCurrentCores(coresCount int64) {
cpuCurrentCores.Set(float64(coresCount))
}

// UpdateCPULimitsCores records the minimum and maximum number of cores in the cluster
func UpdateCPULimitsCores(minCoresCount int64, maxCoresCount int64) {
cpuLimitsCores.WithLabelValues("minimum").Set(float64(minCoresCount))
cpuLimitsCores.WithLabelValues("maximum").Set(float64(maxCoresCount))
}

// UpdateClusterMemoryCurrentBytes records the number of bytes of memory in the cluster, minus deleting nodes
func UpdateClusterMemoryCurrentBytes(memoryCount int64) {
memoryCurrentBytes.Set(float64(memoryCount))
}

// UpdateMemoryLimitsBytes records the minimum and maximum bytes of memory in the cluster
func UpdateMemoryLimitsBytes(minMemoryCount int64, maxMemoryCount int64) {
memoryLimitsBytes.WithLabelValues("minimum").Set(float64(minMemoryCount))
memoryLimitsBytes.WithLabelValues("maximum").Set(float64(maxMemoryCount))
}

// RegisterError records any errors preventing Cluster Autoscaler from working.
// No more than one error should be recorded per loop.
func RegisterError(err errors.AutoscalerError) {
Expand Down
5 changes: 5 additions & 0 deletions cluster-autoscaler/proposals/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ All the metrics are prefixed with `cluster_autoscaler_`.
| nodes_count | Gauge | `state`=<node-state> | Number of nodes in cluster. |
| unschedulable_pods_count | Gauge | | Number of unschedulable ("Pending") pods in the cluster. |
| node_groups_count | Gauge | `node_group_type`=<node-group-type> | Number of node groups managed by CA. |
| max_nodes_count | Gauge | | Maximum number of nodes in all node groups. |
| cluster_cpu_current_cores | Gauge | | | Current number of cores in the cluster, minus deleting nodes. |
| cpu_limits_cores | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of cores in the cluster. |
| cluster_memory_current_bytes | Gauge | | Current number of bytes of memory in the cluster, minus deleting nodes. |
| memory_limits_bytes | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of bytes of memory in cluster. |

* `cluster_safe_to_autoscale` indicates whether cluster is healthy enough for autoscaling. CA stops all operations if significant number of nodes are unready (by default 33% as of CA 0.5.4).
* `nodes_count` records the total number of nodes, labeled by node state. Possible
Expand Down

1 comment on commit a24ea6c

@tehlers320
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When could this go in? AFAIK this is not on an officially released version. Its only on master.

Thanks!

Please sign in to comment.