diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 68b870786d02..ac3103973931 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -955,6 +955,7 @@ func (a *StaticAutoscaler) obtainNodeLists(cp cloudprovider.CloudProvider) ([]*a klog.Errorf("Failed to list ready nodes: %v", err) return nil, nil, caerrors.ToAutoscalerError(caerrors.ApiCallError, err) } + a.reportTaintsCount(allNodes) // Handle GPU case - allocatable GPU may be equal to 0 up to 15 minutes after // node registers as ready. See https://github.com/kubernetes/kubernetes/issues/54959 @@ -978,6 +979,13 @@ func (a *StaticAutoscaler) updateClusterState(allNodes []*apiv1.Node, nodeInfosF return nil } +func (a *StaticAutoscaler) reportTaintsCount(nodes []*apiv1.Node) { + foundTaints := taints.CountNodeTaints(nodes, a.taintConfig) + for taintType, count := range foundTaints { + metrics.ObserveNodeTaintsCount(taintType, float64(count)) + } +} + func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time) bool { if core_utils.GetOldestCreateTime(pods).Add(unschedulablePodTimeBuffer).After(currentTime) { return true diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go index 8f4e0d869ddd..44939c46e17d 100644 --- a/cluster-autoscaler/metrics/metrics.go +++ b/cluster-autoscaler/metrics/metrics.go @@ -373,6 +373,15 @@ var ( Help: "Number of node groups deleted by Node Autoprovisioning.", }, ) + + nodeTaintsCount = k8smetrics.NewGaugeVec( + &k8smetrics.GaugeOpts{ + Namespace: caNamespace, + Name: "node_taints_count", + Help: "Number of taints per type used in the cluster.", + }, + []string{"type"}, + ) ) // RegisterAll registers all metrics. 
@@ -407,6 +416,7 @@ func RegisterAll(emitPerNodeGroupMetrics bool) { legacyregistry.MustRegister(nodeGroupCreationCount) legacyregistry.MustRegister(nodeGroupDeletionCount) legacyregistry.MustRegister(pendingNodeDeletions) + legacyregistry.MustRegister(nodeTaintsCount) if emitPerNodeGroupMetrics { legacyregistry.MustRegister(nodesGroupMinNodes) @@ -615,3 +625,8 @@ func RegisterSkippedScaleUpMemory() { func ObservePendingNodeDeletions(value int) { pendingNodeDeletions.Set(float64(value)) } + +// ObserveNodeTaintsCount records the node taints count of given type. +func ObserveNodeTaintsCount(taintType string, count float64) { + nodeTaintsCount.WithLabelValues(taintType).Set(count) +} diff --git a/cluster-autoscaler/utils/taints/taints.go b/cluster-autoscaler/utils/taints/taints.go index c3a09db60e12..6910f8d75ae4 100644 --- a/cluster-autoscaler/utils/taints/taints.go +++ b/cluster-autoscaler/utils/taints/taints.go @@ -19,6 +19,7 @@ package taints import ( "context" "fmt" + "maps" "strconv" "strings" "time" @@ -54,6 +55,36 @@ const ( // AWS: Indicates that a node has volumes stuck in attaching state and hence it is not fit for scheduling more pods awsNodeWithImpairedVolumesTaint = "NodeWithImpairedVolumes" + + // statusNodeTaintReportedType is the value used when reporting node taint count defined as status taint in given taintConfig. + statusNodeTaintReportedType = "status-taint" + + // startupNodeTaintReportedType is the value used when reporting node taint count defined as startup taint in given taintConfig. + startupNodeTaintReportedType = "startup-taint" + + // unlistedNodeTaintReportedType is the value used when reporting the count of node taints whose key is neither listed in the taintConfig's explicitlyReportedTaints nor matched as a startup or status taint by the given taintConfig. 
+ unlistedNodeTaintReportedType = "other" +) + +var ( + // NodeConditionTaints lists taint keys used as node conditions + NodeConditionTaints = TaintKeySet{ + apiv1.TaintNodeNotReady: true, + apiv1.TaintNodeUnreachable: true, + apiv1.TaintNodeUnschedulable: true, + apiv1.TaintNodeMemoryPressure: true, + apiv1.TaintNodeDiskPressure: true, + apiv1.TaintNodeNetworkUnavailable: true, + apiv1.TaintNodePIDPressure: true, + cloudproviderapi.TaintExternalCloudProvider: true, + cloudproviderapi.TaintNodeShutdown: true, + gkeNodeTerminationHandlerTaint: true, + awsNodeWithImpairedVolumesTaint: true, + } + + // Mutable only in unit tests + maxRetryDeadline time.Duration = 5 * time.Second + conflictRetryInterval time.Duration = 750 * time.Millisecond ) // TaintKeySet is a set of taint key @@ -61,10 +92,11 @@ type TaintKeySet map[string]bool // TaintConfig is a config of taints that require special handling type TaintConfig struct { - StartupTaints TaintKeySet - StatusTaints TaintKeySet - StartupTaintPrefixes []string - StatusTaintPrefixes []string + startupTaints TaintKeySet + statusTaints TaintKeySet + startupTaintPrefixes []string + statusTaintPrefixes []string + explicitlyReportedTaints TaintKeySet } // NewTaintConfig returns the taint config extracted from options @@ -81,34 +113,41 @@ func NewTaintConfig(opts config.AutoscalingOptions) TaintConfig { statusTaints[taintKey] = true } + explicitlyReportedTaints := TaintKeySet{ + ToBeDeletedTaint: true, + DeletionCandidateTaint: true, + } + maps.Copy(explicitlyReportedTaints, NodeConditionTaints) + return TaintConfig{ - StartupTaints: startupTaints, - StatusTaints: statusTaints, - StartupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix}, - StatusTaintPrefixes: []string{StatusTaintPrefix}, + startupTaints: startupTaints, + statusTaints: statusTaints, + startupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix}, + statusTaintPrefixes: []string{StatusTaintPrefix}, + explicitlyReportedTaints: 
explicitlyReportedTaints, } } -var ( - // NodeConditionTaints lists taint keys used as node conditions - NodeConditionTaints = TaintKeySet{ - apiv1.TaintNodeNotReady: true, - apiv1.TaintNodeUnreachable: true, - apiv1.TaintNodeUnschedulable: true, - apiv1.TaintNodeMemoryPressure: true, - apiv1.TaintNodeDiskPressure: true, - apiv1.TaintNodeNetworkUnavailable: true, - apiv1.TaintNodePIDPressure: true, - cloudproviderapi.TaintExternalCloudProvider: true, - cloudproviderapi.TaintNodeShutdown: true, - gkeNodeTerminationHandlerTaint: true, - awsNodeWithImpairedVolumesTaint: true, +// IsStartupTaint checks whether given taint is a startup taint. +func (tc TaintConfig) IsStartupTaint(taint string) bool { + if _, ok := tc.startupTaints[taint]; ok { + return true } + return matchesAnyPrefix(tc.startupTaintPrefixes, taint) +} - // Mutable only in unit tests - maxRetryDeadline time.Duration = 5 * time.Second - conflictRetryInterval time.Duration = 750 * time.Millisecond -) +// IsStatusTaint checks whether given taint is a status taint. 
+func (tc TaintConfig) IsStatusTaint(taint string) bool { + if _, ok := tc.statusTaints[taint]; ok { + return true + } + return matchesAnyPrefix(tc.statusTaintPrefixes, taint) +} + +func (tc TaintConfig) isExplicitlyReportedTaint(taint string) bool { + _, ok := tc.explicitlyReportedTaints[taint] + return ok +} // getKeyShortName converts taint key to short name for logging func getKeyShortName(key string) string { @@ -361,18 +400,8 @@ func SanitizeTaints(taints []apiv1.Taint, taintConfig TaintConfig) []apiv1.Taint continue } - if _, exists := taintConfig.StartupTaints[taint.Key]; exists { - klog.V(4).Infof("Removing startup taint %s, when creating template from node", taint.Key) - continue - } - shouldRemoveBasedOnPrefix := matchesAnyPrefix(taintConfig.StartupTaintPrefixes, taint.Key) || matchesAnyPrefix(taintConfig.StatusTaintPrefixes, taint.Key) - if shouldRemoveBasedOnPrefix { - klog.V(4).Infof("Removing taint %s based on prefix, when creation template from node", taint.Key) - continue - } - - if _, exists := taintConfig.StatusTaints[taint.Key]; exists { - klog.V(4).Infof("Removing status taint %s, when creating template from node", taint.Key) + if taintConfig.IsStartupTaint(taint.Key) || taintConfig.IsStatusTaint(taint.Key) { + klog.V(4).Infof("Removing taint %s, when creating template from node", taint.Key) continue } @@ -394,8 +423,7 @@ func FilterOutNodesWithStartupTaints(taintConfig TaintConfig, allNodes, readyNod } ready := true for _, t := range node.Spec.Taints { - _, hasStartupTaint := taintConfig.StartupTaints[t.Key] - if hasStartupTaint || matchesAnyPrefix(taintConfig.StartupTaintPrefixes, t.Key) { + if taintConfig.IsStartupTaint(t.Key) { ready = false nodesWithStartupTaints[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.StartupNodes) klog.V(3).Infof("Overriding status of node %v, which seems to have startup taint %q", node.Name, t.Key) @@ -416,3 +444,33 @@ func FilterOutNodesWithStartupTaints(taintConfig TaintConfig, allNodes, readyNod } 
return newAllNodes, newReadyNodes } + +// CountNodeTaints returns the number of taints found on the given nodes, grouped by the taint type under which each taint key is reported. +func CountNodeTaints(nodes []*apiv1.Node, taintConfig TaintConfig) map[string]int { + foundTaintsCount := make(map[string]int) + for _, node := range nodes { + for _, taint := range node.Spec.Taints { + key := getTaintTypeToReport(taint.Key, taintConfig) + foundTaintsCount[key] += 1 + } + } + return foundTaintsCount +} + +func getTaintTypeToReport(key string, taintConfig TaintConfig) string { + // Track deprecated taints: any key starting with IgnoreTaintPrefix is reported under the prefix itself. + if strings.HasPrefix(key, IgnoreTaintPrefix) { + return IgnoreTaintPrefix + } + + if taintConfig.isExplicitlyReportedTaint(key) { + return key + } + if taintConfig.IsStartupTaint(key) { + return startupNodeTaintReportedType + } + if taintConfig.IsStatusTaint(key) { + return statusNodeTaintReportedType + } + return unlistedNodeTaintReportedType +} diff --git a/cluster-autoscaler/utils/taints/taints_test.go b/cluster-autoscaler/utils/taints/taints_test.go index fff009c81792..27a32e610d1d 100644 --- a/cluster-autoscaler/utils/taints/taints_test.go +++ b/cluster-autoscaler/utils/taints/taints_test.go @@ -24,6 +24,7 @@ import ( "testing" "time" + "k8s.io/autoscaler/cluster-autoscaler/config" . 
"k8s.io/autoscaler/cluster-autoscaler/utils/test" apiv1 "k8s.io/api/core/v1" @@ -472,8 +473,8 @@ func TestFilterOutNodesWithStartupTaints(t *testing.T) { nodes = append(nodes, tc.node) } taintConfig := TaintConfig{ - StartupTaints: tc.startupTaints, - StartupTaintPrefixes: tc.startupTaintsPrefixes, + startupTaints: tc.startupTaints, + startupTaintPrefixes: tc.startupTaintsPrefixes, } allNodes, readyNodes := FilterOutNodesWithStartupTaints(taintConfig, nodes, nodes) assert.Equal(t, tc.allNodes, len(allNodes)) @@ -562,9 +563,9 @@ func TestSanitizeTaints(t *testing.T) { }, } taintConfig := TaintConfig{ - StartupTaints: map[string]bool{"ignore-me": true}, - StatusTaints: map[string]bool{"status-me": true}, - StartupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix}, + startupTaints: map[string]bool{"ignore-me": true}, + statusTaints: map[string]bool{"status-me": true}, + startupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix}, } newTaints := SanitizeTaints(node.Spec.Taints, taintConfig) @@ -572,3 +573,102 @@ func TestSanitizeTaints(t *testing.T) { assert.Equal(t, newTaints[0].Key, StatusTaintPrefix+"some-taint") assert.Equal(t, newTaints[1].Key, "test-taint") } + +func TestCountNodeTaints(t *testing.T) { + node := &apiv1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-count-node-taints", + CreationTimestamp: metav1.NewTime(time.Now()), + }, + Spec: apiv1.NodeSpec{ + Taints: []apiv1.Taint{ + { + Key: IgnoreTaintPrefix + "another-taint", + Value: "myValue", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: StatusTaintPrefix + "some-taint", + Value: "myValue", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: StartupTaintPrefix + "some-taint", + Value: "myValue", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: "test-taint", + Value: "test2", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: ToBeDeletedTaint, + Value: "1", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: "ignore-me", + Value: "1", + Effect: 
apiv1.TaintEffectNoSchedule, + }, + { + Key: "status-me", + Value: "1", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: "node.kubernetes.io/memory-pressure", + Value: "1", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: "ignore-taint.cluster-autoscaler.kubernetes.io/to-be-ignored", + Value: "myValue2", + Effect: apiv1.TaintEffectNoSchedule, + }, + }, + }, + Status: apiv1.NodeStatus{ + Conditions: []apiv1.NodeCondition{}, + }, + } + node2 := &apiv1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-count-node-taints", + CreationTimestamp: metav1.NewTime(time.Now()), + }, + Spec: apiv1.NodeSpec{ + Taints: []apiv1.Taint{ + { + Key: StatusTaintPrefix + "some-taint", + Value: "myValue", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: "node.kubernetes.io/unschedulable", + Value: "1", + Effect: apiv1.TaintEffectNoSchedule, + }, + }, + }, + Status: apiv1.NodeStatus{ + Conditions: []apiv1.NodeCondition{}, + }, + } + taintConfig := NewTaintConfig(config.AutoscalingOptions{ + StatusTaints: []string{"status-me"}, + StartupTaints: []string{"ignore-me"}, + }) + want := map[string]int{ + "ignore-taint.cluster-autoscaler.kubernetes.io/": 2, + "ToBeDeletedByClusterAutoscaler": 1, + "node.kubernetes.io/memory-pressure": 1, + "node.kubernetes.io/unschedulable": 1, + "other": 1, + "startup-taint": 2, + "status-taint": 3, + } + got := CountNodeTaints([]*apiv1.Node{node, node2}, taintConfig) + assert.Equal(t, want, got) +}