Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds and implements node taints count metric. #6201

Merged
merged 1 commit into from
Oct 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cluster-autoscaler/core/static_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -955,6 +955,7 @@ func (a *StaticAutoscaler) obtainNodeLists(cp cloudprovider.CloudProvider) ([]*a
klog.Errorf("Failed to list ready nodes: %v", err)
return nil, nil, caerrors.ToAutoscalerError(caerrors.ApiCallError, err)
}
a.reportTaintsCount(allNodes)

// Handle GPU case - allocatable GPU may be equal to 0 up to 15 minutes after
// node registers as ready. See https://github.com/kubernetes/kubernetes/issues/54959
Expand All @@ -978,6 +979,13 @@ func (a *StaticAutoscaler) updateClusterState(allNodes []*apiv1.Node, nodeInfosF
return nil
}

// reportTaintsCount publishes per-type node taint counts for the given nodes.
//
// The counts are computed by taints.CountNodeTaints with the autoscaler's
// taint config, and every (type, count) pair is forwarded to
// metrics.ObserveNodeTaintsCount.
//
// NOTE(review): only taint types present in this pass are observed. A type
// that was reported earlier and no longer appears on any node is not written
// here, so its previously set value presumably stays exported unless the
// metric is reset elsewhere — confirm.
func (a *StaticAutoscaler) reportTaintsCount(nodes []*apiv1.Node) {
	for reportedType, n := range taints.CountNodeTaints(nodes, a.taintConfig) {
		metrics.ObserveNodeTaintsCount(reportedType, float64(n))
	}
}

func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time) bool {
if core_utils.GetOldestCreateTime(pods).Add(unschedulablePodTimeBuffer).After(currentTime) {
return true
Expand Down
15 changes: 15 additions & 0 deletions cluster-autoscaler/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,15 @@ var (
Help: "Number of node groups deleted by Node Autoprovisioning.",
},
)

nodeTaintsCount = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_taints_count",
Help: "Number of taints per type used in the cluster.",
},
[]string{"type"},
)
)

// RegisterAll registers all metrics.
Expand Down Expand Up @@ -407,6 +416,7 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
legacyregistry.MustRegister(nodeGroupCreationCount)
legacyregistry.MustRegister(nodeGroupDeletionCount)
legacyregistry.MustRegister(pendingNodeDeletions)
legacyregistry.MustRegister(nodeTaintsCount)

if emitPerNodeGroupMetrics {
legacyregistry.MustRegister(nodesGroupMinNodes)
Expand Down Expand Up @@ -615,3 +625,8 @@ func RegisterSkippedScaleUpMemory() {
func ObservePendingNodeDeletions(value int) {
	// Set is fed a float64 sample, so convert the integer count first.
	pending := float64(value)
	pendingNodeDeletions.Set(pending)
}

// ObserveNodeTaintsCount sets the node taints count metric for the given
// taint type to count. taintType is used as the metric's label value.
func ObserveNodeTaintsCount(taintType string, count float64) {
	perType := nodeTaintsCount.WithLabelValues(taintType)
	perType.Set(count)
}
138 changes: 98 additions & 40 deletions cluster-autoscaler/utils/taints/taints.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package taints
import (
"context"
"fmt"
"maps"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -54,17 +55,48 @@ const (

// AWS: Indicates that a node has volumes stuck in attaching state and hence it is not fit for scheduling more pods
awsNodeWithImpairedVolumesTaint = "NodeWithImpairedVolumes"

// statusNodeTaintReportedType is the value used when reporting node taint count defined as status taint in given taintConfig.
statusNodeTaintReportedType = "status-taint"

// startupNodeTaintReportedType is the value used when reporting node taint count defined as startup taint in given taintConfig.
startupNodeTaintReportedType = "startup-taint"

// unlistedNodeTaintReportedType is the value used when reporting the count of node taints whose key is neither explicitly reported (see explicitlyReportedTaints) nor matched as a startup or status taint by the given taintConfig.
unlistedNodeTaintReportedType = "other"
)

var (
// NodeConditionTaints lists taint keys used as node conditions.
// It mixes core Kubernetes taints (apiv1), cloud-provider taints
// (cloudproviderapi) and the provider-specific keys declared in this
// package (GKE termination handler, AWS impaired volumes).
NodeConditionTaints = TaintKeySet{
apiv1.TaintNodeNotReady: true,
apiv1.TaintNodeUnreachable: true,
apiv1.TaintNodeUnschedulable: true,
apiv1.TaintNodeMemoryPressure: true,
apiv1.TaintNodeDiskPressure: true,
apiv1.TaintNodeNetworkUnavailable: true,
apiv1.TaintNodePIDPressure: true,
cloudproviderapi.TaintExternalCloudProvider: true,
cloudproviderapi.TaintNodeShutdown: true,
gkeNodeTerminationHandlerTaint: true,
awsNodeWithImpairedVolumesTaint: true,
}

// Mutable only in unit tests
// NOTE(review): presumably the overall deadline and the pause between
// attempts when a node update is retried after a conflict — the usage is
// outside this view; confirm.
maxRetryDeadline time.Duration = 5 * time.Second
conflictRetryInterval time.Duration = 750 * time.Millisecond
)

// TaintKeySet is a set of taint keys, represented as a map from key to bool.
// The TaintConfig lookups in this file test membership by key presence, so
// entries are expected to be stored with the value true.
type TaintKeySet map[string]bool

// TaintConfig is a config of taints that require special handling
type TaintConfig struct {
StartupTaints TaintKeySet
StatusTaints TaintKeySet
StartupTaintPrefixes []string
StatusTaintPrefixes []string
startupTaints TaintKeySet
statusTaints TaintKeySet
startupTaintPrefixes []string
statusTaintPrefixes []string
explicitlyReportedTaints TaintKeySet
}

// NewTaintConfig returns the taint config extracted from options
Expand All @@ -81,34 +113,41 @@ func NewTaintConfig(opts config.AutoscalingOptions) TaintConfig {
statusTaints[taintKey] = true
}

explicitlyReportedTaints := TaintKeySet{
ToBeDeletedTaint: true,
DeletionCandidateTaint: true,
}
maps.Copy(explicitlyReportedTaints, NodeConditionTaints)

return TaintConfig{
StartupTaints: startupTaints,
StatusTaints: statusTaints,
StartupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix},
StatusTaintPrefixes: []string{StatusTaintPrefix},
startupTaints: startupTaints,
statusTaints: statusTaints,
startupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix},
statusTaintPrefixes: []string{StatusTaintPrefix},
explicitlyReportedTaints: explicitlyReportedTaints,
}
}

var (
// NodeConditionTaints lists taint keys used as node conditions
NodeConditionTaints = TaintKeySet{
apiv1.TaintNodeNotReady: true,
apiv1.TaintNodeUnreachable: true,
apiv1.TaintNodeUnschedulable: true,
apiv1.TaintNodeMemoryPressure: true,
apiv1.TaintNodeDiskPressure: true,
apiv1.TaintNodeNetworkUnavailable: true,
apiv1.TaintNodePIDPressure: true,
cloudproviderapi.TaintExternalCloudProvider: true,
cloudproviderapi.TaintNodeShutdown: true,
gkeNodeTerminationHandlerTaint: true,
awsNodeWithImpairedVolumesTaint: true,
// IsStartupTaint reports whether the given taint key is treated as a startup taint.
//
// A key qualifies when it is present in the configured startup taint set, or
// when matchesAnyPrefix accepts it against the configured startup taint prefixes.
func (tc TaintConfig) IsStartupTaint(taint string) bool {
	_, listed := tc.startupTaints[taint]
	return listed || matchesAnyPrefix(tc.startupTaintPrefixes, taint)
}

// Mutable only in unit tests
maxRetryDeadline time.Duration = 5 * time.Second
conflictRetryInterval time.Duration = 750 * time.Millisecond
)
// IsStatusTaint reports whether the given taint key is treated as a status taint.
//
// A key qualifies when it is present in the configured status taint set, or
// when matchesAnyPrefix accepts it against the configured status taint prefixes.
func (tc TaintConfig) IsStatusTaint(taint string) bool {
	_, listed := tc.statusTaints[taint]
	return listed || matchesAnyPrefix(tc.statusTaintPrefixes, taint)
}

// isExplicitlyReportedTaint reports whether the taint key is present in the
// config's explicitlyReportedTaints set. Only exact key matches count; no
// prefix matching is applied.
func (tc TaintConfig) isExplicitlyReportedTaint(taint string) bool {
	if _, found := tc.explicitlyReportedTaints[taint]; found {
		return true
	}
	return false
}

// getKeyShortName converts taint key to short name for logging
func getKeyShortName(key string) string {
Expand Down Expand Up @@ -361,18 +400,8 @@ func SanitizeTaints(taints []apiv1.Taint, taintConfig TaintConfig) []apiv1.Taint
continue
}

if _, exists := taintConfig.StartupTaints[taint.Key]; exists {
klog.V(4).Infof("Removing startup taint %s, when creating template from node", taint.Key)
continue
}
shouldRemoveBasedOnPrefix := matchesAnyPrefix(taintConfig.StartupTaintPrefixes, taint.Key) || matchesAnyPrefix(taintConfig.StatusTaintPrefixes, taint.Key)
if shouldRemoveBasedOnPrefix {
klog.V(4).Infof("Removing taint %s based on prefix, when creation template from node", taint.Key)
continue
}

if _, exists := taintConfig.StatusTaints[taint.Key]; exists {
klog.V(4).Infof("Removing status taint %s, when creating template from node", taint.Key)
if taintConfig.IsStartupTaint(taint.Key) || taintConfig.IsStatusTaint(taint.Key) {
klog.V(4).Infof("Removing taint %s, when creating template from node", taint.Key)
continue
}

Expand All @@ -394,8 +423,7 @@ func FilterOutNodesWithStartupTaints(taintConfig TaintConfig, allNodes, readyNod
}
ready := true
for _, t := range node.Spec.Taints {
_, hasStartupTaint := taintConfig.StartupTaints[t.Key]
if hasStartupTaint || matchesAnyPrefix(taintConfig.StartupTaintPrefixes, t.Key) {
if taintConfig.IsStartupTaint(t.Key) {
ready = false
nodesWithStartupTaints[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.StartupNodes)
klog.V(3).Infof("Overriding status of node %v, which seems to have startup taint %q", node.Name, t.Key)
Expand All @@ -416,3 +444,33 @@ func FilterOutNodesWithStartupTaints(taintConfig TaintConfig, allNodes, readyNod
}
return newAllNodes, newReadyNodes
}

// CountNodeTaints tallies the taints found on the given nodes, grouped by the
// reported type that getTaintTypeToReport assigns to each taint key.
//
// Every taint in a node's Spec.Taints contributes one to its type's count.
// The returned map is never nil; types that were not encountered are absent.
func CountNodeTaints(nodes []*apiv1.Node, taintConfig TaintConfig) map[string]int {
	counts := map[string]int{}
	for i := range nodes {
		nodeTaints := nodes[i].Spec.Taints
		for j := range nodeTaints {
			reportedType := getTaintTypeToReport(nodeTaints[j].Key, taintConfig)
			counts[reportedType]++
		}
	}
	return counts
}

// getTaintTypeToReport maps a taint key to the type under which it is counted.
//
// The checks are ordered and the first match wins:
//  1. keys starting with IgnoreTaintPrefix are reported as IgnoreTaintPrefix;
//  2. explicitly reported keys are reported under their own key;
//  3. startup taints are reported as startupNodeTaintReportedType;
//  4. status taints are reported as statusNodeTaintReportedType;
//  5. anything else is reported as unlistedNodeTaintReportedType.
func getTaintTypeToReport(key string, taintConfig TaintConfig) string {
	switch {
	case strings.HasPrefix(key, IgnoreTaintPrefix):
		// Track deprecated taints. This has to come before the startup check:
		// NewTaintConfig lists IgnoreTaintPrefix among the startup prefixes, so
		// these keys would otherwise be folded into the startup type.
		return IgnoreTaintPrefix
	case taintConfig.isExplicitlyReportedTaint(key):
		return key
	case taintConfig.IsStartupTaint(key):
		return startupNodeTaintReportedType
	case taintConfig.IsStatusTaint(key):
		return statusNodeTaintReportedType
	default:
		return unlistedNodeTaintReportedType
	}
}
110 changes: 105 additions & 5 deletions cluster-autoscaler/utils/taints/taints_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"testing"
"time"

"k8s.io/autoscaler/cluster-autoscaler/config"
. "k8s.io/autoscaler/cluster-autoscaler/utils/test"

apiv1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -472,8 +473,8 @@ func TestFilterOutNodesWithStartupTaints(t *testing.T) {
nodes = append(nodes, tc.node)
}
taintConfig := TaintConfig{
StartupTaints: tc.startupTaints,
StartupTaintPrefixes: tc.startupTaintsPrefixes,
startupTaints: tc.startupTaints,
startupTaintPrefixes: tc.startupTaintsPrefixes,
}
allNodes, readyNodes := FilterOutNodesWithStartupTaints(taintConfig, nodes, nodes)
assert.Equal(t, tc.allNodes, len(allNodes))
Expand Down Expand Up @@ -562,13 +563,112 @@ func TestSanitizeTaints(t *testing.T) {
},
}
taintConfig := TaintConfig{
StartupTaints: map[string]bool{"ignore-me": true},
StatusTaints: map[string]bool{"status-me": true},
StartupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix},
startupTaints: map[string]bool{"ignore-me": true},
statusTaints: map[string]bool{"status-me": true},
startupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix},
}

newTaints := SanitizeTaints(node.Spec.Taints, taintConfig)
require.Equal(t, 2, len(newTaints))
assert.Equal(t, newTaints[0].Key, StatusTaintPrefix+"some-taint")
assert.Equal(t, newTaints[1].Key, "test-taint")
}

// TestCountNodeTaints checks that CountNodeTaints groups the taints of two
// nodes into the expected reported types, using a config built by
// NewTaintConfig with one extra status taint and one extra startup taint.
func TestCountNodeTaints(t *testing.T) {
// First node: carries one taint for (almost) every reporting category.
node := &apiv1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node-count-node-taints",
CreationTimestamp: metav1.NewTime(time.Now()),
},
Spec: apiv1.NodeSpec{
Taints: []apiv1.Taint{
{
// Deprecated ignore prefix: counted under the prefix itself.
Key: IgnoreTaintPrefix + "another-taint",
Value: "myValue",
Effect: apiv1.TaintEffectNoSchedule,
},
{
// Status taint by prefix.
Key: StatusTaintPrefix + "some-taint",
Value: "myValue",
Effect: apiv1.TaintEffectNoSchedule,
},
{
// Startup taint by prefix.
Key: StartupTaintPrefix + "some-taint",
Value: "myValue",
Effect: apiv1.TaintEffectNoSchedule,
},
{
// Matches nothing in the config: counted as "other".
Key: "test-taint",
Value: "test2",
Effect: apiv1.TaintEffectNoSchedule,
},
{
// Explicitly reported taint: counted under its own key.
Key: ToBeDeletedTaint,
Value: "1",
Effect: apiv1.TaintEffectNoSchedule,
},
{
// Startup taint by exact key (configured via StartupTaints below).
Key: "ignore-me",
Value: "1",
Effect: apiv1.TaintEffectNoSchedule,
},
{
// Status taint by exact key (configured via StatusTaints below).
Key: "status-me",
Value: "1",
Effect: apiv1.TaintEffectNoSchedule,
},
{
// Expected to be reported under its own key; presumably one of the
// NodeConditionTaints keys.
Key: "node.kubernetes.io/memory-pressure",
Value: "1",
Effect: apiv1.TaintEffectNoSchedule,
},
{
// Second ignore-prefixed key, spelled out literally; the expected
// count of 2 for the prefix relies on it matching IgnoreTaintPrefix.
Key: "ignore-taint.cluster-autoscaler.kubernetes.io/to-be-ignored",
Value: "myValue2",
Effect: apiv1.TaintEffectNoSchedule,
},
},
},
Status: apiv1.NodeStatus{
Conditions: []apiv1.NodeCondition{},
},
}
// Second node: verifies that counts are accumulated across nodes.
// NOTE(review): it reuses the first node's name; the name is not read by
// CountNodeTaints, so this does not affect the result.
node2 := &apiv1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node-count-node-taints",
CreationTimestamp: metav1.NewTime(time.Now()),
},
Spec: apiv1.NodeSpec{
Taints: []apiv1.Taint{
{
// Same status-prefixed key as on the first node.
Key: StatusTaintPrefix + "some-taint",
Value: "myValue",
Effect: apiv1.TaintEffectNoSchedule,
},
{
// Expected to be reported under its own key; presumably one of the
// NodeConditionTaints keys.
Key: "node.kubernetes.io/unschedulable",
Value: "1",
Effect: apiv1.TaintEffectNoSchedule,
},
},
},
Status: apiv1.NodeStatus{
Conditions: []apiv1.NodeCondition{},
},
}
taintConfig := NewTaintConfig(config.AutoscalingOptions{
StatusTaints: []string{"status-me"},
StartupTaints: []string{"ignore-me"},
})
// Expected tallies:
//   - ignore prefix: 2 (both ignore-prefixed keys on the first node)
//   - startup-taint: 2 (StartupTaintPrefix key + "ignore-me")
//   - status-taint:  3 (StatusTaintPrefix key on both nodes + "status-me")
//   - other:         1 ("test-taint")
//   - the remaining keys are reported individually, once each.
want := map[string]int{
"ignore-taint.cluster-autoscaler.kubernetes.io/": 2,
"ToBeDeletedByClusterAutoscaler": 1,
"node.kubernetes.io/memory-pressure": 1,
"node.kubernetes.io/unschedulable": 1,
"other": 1,
"startup-taint": 2,
"status-taint": 3,
}
got := CountNodeTaints([]*apiv1.Node{node, node2}, taintConfig)
assert.Equal(t, want, got)
}
Loading