Skip to content

Commit

Permalink
add cluster cores and memory count metrics
Browse files Browse the repository at this point in the history
This change adds metrics for the count of cores and memory bytes in use
by the cluster, excluding nodes in deleting state. This update is
performed once in each call to `RunOnce`, and as such will be calculated
once per scan interval.

User story: As a cluster autoscaler user, I would like to monitor my
cluster through metrics to determine when the cluster is nearing its
limits for cores and memory usage.
  • Loading branch information
elmiko committed Mar 30, 2021
1 parent dcbf12c commit cc12a63
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 0 deletions.
23 changes: 23 additions & 0 deletions cluster-autoscaler/core/static_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,11 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
return nil
}

// Update cluster resource usage metrics
coresTotal, memoryTotal := calculateCoresMemoryTotal(allNodes, currentTime)
metrics.UpdateClusterCoresCount(coresTotal)
metrics.UpdateClusterMemoryCount(memoryTotal)

daemonsets, err := a.ListerRegistry.DaemonSetLister().List(labels.Everything())
if err != nil {
klog.Errorf("Failed to get daemonset list: %v", err)
Expand Down Expand Up @@ -800,3 +805,21 @@ func getUpcomingNodeInfos(registry *clusterstate.ClusterStateRegistry, nodeInfos
}
return upcomingNodes
}

// calculateCoresMemoryTotal adds up the cores and memory reported by
// core_utils.GetNodeCoresAndMemory for every node in nodes, leaving out any
// node for which isNodeBeingDeleted(node, timestamp) reports true.
// The first return value is the cores total, the second the memory total.
// It follows the same approach as calculateScaleDownCoresMemoryTotal.
func calculateCoresMemoryTotal(nodes []*apiv1.Node, timestamp time.Time) (int64, int64) {
	var totalCores, totalMemory int64

	for i := range nodes {
		candidate := nodes[i]
		if !isNodeBeingDeleted(candidate, timestamp) {
			// Only nodes that are not on their way out contribute to the totals.
			nodeCores, nodeMemory := core_utils.GetNodeCoresAndMemory(candidate)
			totalCores, totalMemory = totalCores+nodeCores, totalMemory+nodeMemory
		}
	}

	return totalCores, totalMemory
}
28 changes: 28 additions & 0 deletions cluster-autoscaler/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,14 @@ var (
},
)

// clusterCoresCount is an unlabeled gauge for the number of cores in the
// cluster; per its help text, nodes that are being deleted are not counted.
clusterCoresCount = k8smetrics.NewGauge(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "cluster_cores_count",
Help: "Current number of cores in the cluster, minus deleting nodes.",
},
)

coresLimitsCount = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Expand All @@ -146,6 +154,14 @@ var (
}, []string{"direction"},
)

// clusterMemoryCount is an unlabeled gauge for the number of bytes of memory
// in the cluster; per its help text, nodes that are being deleted are not counted.
clusterMemoryCount = k8smetrics.NewGauge(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "cluster_memory_count",
Help: "Current number of bytes of memory in the cluster, minus deleting nodes.",
},
)

memoryLimitsCount = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Expand Down Expand Up @@ -304,7 +320,9 @@ func RegisterAll() {
legacyregistry.MustRegister(nodeGroupsCount)
legacyregistry.MustRegister(unschedulablePodsCount)
legacyregistry.MustRegister(maxNodesCount)
legacyregistry.MustRegister(clusterCoresCount)
legacyregistry.MustRegister(coresLimitsCount)
legacyregistry.MustRegister(clusterMemoryCount)
legacyregistry.MustRegister(memoryLimitsCount)
legacyregistry.MustRegister(lastActivity)
legacyregistry.MustRegister(functionDuration)
Expand Down Expand Up @@ -382,12 +400,22 @@ func UpdateMaxNodesCount(nodesCount int) {
maxNodesCount.Set(float64(nodesCount))
}

// UpdateClusterCoresCount sets the cluster cores gauge to coresCount, the
// number of cores in the cluster with deleting nodes left out.
func UpdateClusterCoresCount(coresCount int64) {
	cores := float64(coresCount)
	clusterCoresCount.Set(cores)
}

// UpdateCoresLimitsCount publishes the minimum and maximum number of cores in
// the cluster on the cores limits gauge, under the "minimum" and "maximum"
// label values respectively.
func UpdateCoresLimitsCount(minCoresCount int64, maxCoresCount int64) {
	lower, upper := float64(minCoresCount), float64(maxCoresCount)

	coresLimitsCount.WithLabelValues("minimum").Set(lower)
	coresLimitsCount.WithLabelValues("maximum").Set(upper)
}

// UpdateClusterMemoryCount sets the cluster memory gauge to memoryCount, the
// number of bytes of memory in the cluster with deleting nodes left out.
func UpdateClusterMemoryCount(memoryCount int64) {
	memoryBytes := float64(memoryCount)
	clusterMemoryCount.Set(memoryBytes)
}

// UpdateMemoryLimitsCount records the minimum and maximum bytes of memory in the cluster
func UpdateMemoryLimitsCount(minMemoryCount int64, maxMemoryCount int64) {
memoryLimitsCount.WithLabelValues("minimum").Set(float64(minMemoryCount))
Expand Down
2 changes: 2 additions & 0 deletions cluster-autoscaler/proposals/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ All the metrics are prefixed with `cluster_autoscaler_`.
| unschedulable_pods_count | Gauge | | Number of unschedulable ("Pending") pods in the cluster. |
| node_groups_count | Gauge | `node_group_type`=<node-group-type> | Number of node groups managed by CA. |
| max_nodes_count | Gauge | | Maximum number of nodes in all node groups. |
| cluster_cores_count | Gauge | | Current number of cores in the cluster, minus deleting nodes. |
| cores_limits_count | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of cores in the cluster. |
| cluster_memory_count | Gauge | | Current number of bytes of memory in the cluster, minus deleting nodes. |
| memory_limits_count | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of bytes of memory in the cluster. |

* `cluster_safe_to_autoscale` indicates whether the cluster is healthy enough for autoscaling. CA stops all operations if a significant number of nodes are unready (by default 33% as of CA 0.5.4).
Expand Down

0 comments on commit cc12a63

Please sign in to comment.