Skip to content

Commit

Permalink
Reports node taints.
Browse files Browse the repository at this point in the history
  • Loading branch information
piotrwrotniak committed Oct 17, 2023
1 parent cc888a1 commit f424743
Show file tree
Hide file tree
Showing 4 changed files with 196 additions and 0 deletions.
8 changes: 8 additions & 0 deletions cluster-autoscaler/core/static_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -955,6 +955,7 @@ func (a *StaticAutoscaler) obtainNodeLists(cp cloudprovider.CloudProvider) ([]*a
klog.Errorf("Failed to list ready nodes: %v", err)
return nil, nil, caerrors.ToAutoscalerError(caerrors.ApiCallError, err)
}
a.reportTaintsCount(allNodes)

// Handle GPU case - allocatable GPU may be equal to 0 up to 15 minutes after
// node registers as ready. See https://github.com/kubernetes/kubernetes/issues/54959
Expand All @@ -978,6 +979,13 @@ func (a *StaticAutoscaler) updateClusterState(allNodes []*apiv1.Node, nodeInfosF
return nil
}

// reportTaintsCount publishes, for every taint type found on the given nodes,
// how many taints of that type were counted. Types are determined by
// taints.CountNodeTaints using the autoscaler's taintConfig. Only types that
// occur in this node list are observed by this call; a type that is absent
// from the result is not touched here.
func (a *StaticAutoscaler) reportTaintsCount(nodes []*apiv1.Node) {
	for reportedType, occurrences := range taints.CountNodeTaints(nodes, a.taintConfig) {
		metrics.ObserveNodeTaintsCount(reportedType, float64(occurrences))
	}
}

func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time) bool {
if core_utils.GetOldestCreateTime(pods).Add(unschedulablePodTimeBuffer).After(currentTime) {
return true
Expand Down
14 changes: 14 additions & 0 deletions cluster-autoscaler/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,15 @@ var (
Help: "Number of node groups deleted by Node Autoprovisioning.",
},
)

// nodeTaintsCount is a gauge vector with a single "type" label; it is written
// through ObserveNodeTaintsCount, which sets the gauge for one label value.
// NOTE(review): the registration code is not visible in this view — confirm
// that this metric is registered in RegisterAll, otherwise it will not be exported.
nodeTaintsCount = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_taints_count",
Help: "Number of taints per type used in the cluster.",
},
[]string{"type"},
)
)

// RegisterAll registers all metrics.
Expand Down Expand Up @@ -615,3 +624,8 @@ func RegisterSkippedScaleUpMemory() {
func ObservePendingNodeDeletions(value int) {
	// Convert the integer count to the float64 that Set is called with.
	pending := float64(value)
	pendingNodeDeletions.Set(pending)
}

// ObserveNodeTaintsCount sets the node taints gauge labelled with taintType
// to count, replacing whatever value was previously stored for that label.
func ObserveNodeTaintsCount(taintType string, count float64) {
	gaugeForType := nodeTaintsCount.WithLabelValues(taintType)
	gaugeForType.Set(count)
}
73 changes: 73 additions & 0 deletions cluster-autoscaler/utils/taints/taints.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,15 @@ const (

// AWS: Indicates that a node has volumes stuck in attaching state and hence it is not fit for scheduling more pods
awsNodeWithImpairedVolumesTaint = "NodeWithImpairedVolumes"

// statusNodeTaintReportedType is the type reported for a taint whose key matches a status taint (exact key or prefix) in the given taintConfig.
statusNodeTaintReportedType = "status-taint"

// startupNodeTaintReportedType is the type reported for a taint whose key matches a startup taint (exact key or prefix) in the given taintConfig.
startupNodeTaintReportedType = "startup-taint"

// unlistedNodeTaintReportedType is the type reported for a taint whose key does not carry the deprecated ignore prefix, is not listed in explicitlyReportedNodeTaints, and is not matched by the given taintConfig.
unlistedNodeTaintReportedType = "other"
)

// TaintKeySet is a set of taint key
Expand Down Expand Up @@ -108,6 +117,23 @@ var (
// Mutable only in unit tests
maxRetryDeadline time.Duration = 5 * time.Second
conflictRetryInterval time.Duration = 750 * time.Millisecond

// explicitlyReportedNodeTaints is the set of taint keys that are reported
// under their own key by getTaintTypeToReport instead of being folded into
// the startup, status, or "other" report types.
explicitlyReportedNodeTaints = TaintKeySet{
apiv1.TaintNodeNotReady: true,
apiv1.TaintNodeUnreachable: true,
apiv1.TaintNodeUnschedulable: true,
apiv1.TaintNodeMemoryPressure: true,
apiv1.TaintNodeDiskPressure: true,
apiv1.TaintNodeNetworkUnavailable: true,
apiv1.TaintNodePIDPressure: true,
apiv1.TaintNodeOutOfService: true,
cloudproviderapi.TaintExternalCloudProvider: true,
cloudproviderapi.TaintNodeShutdown: true,
gkeNodeTerminationHandlerTaint: true,
awsNodeWithImpairedVolumesTaint: true,
ToBeDeletedTaint: true,
DeletionCandidateTaint: true,
}
)

// getKeyShortName converts taint key to short name for logging
Expand Down Expand Up @@ -416,3 +442,50 @@ func FilterOutNodesWithStartupTaints(taintConfig TaintConfig, allNodes, readyNod
}
return newAllNodes, newReadyNodes
}

// CountNodeTaints counts the taints present on the given nodes, grouped by
// the report type that getTaintTypeToReport assigns to each taint key under
// taintConfig.
//
// Every taint on every node contributes one to its type's total, so the
// result counts taint occurrences rather than distinct nodes. Types with no
// occurrences are absent from the returned map; the map is never nil.
func CountNodeTaints(nodes []*apiv1.Node, taintConfig TaintConfig) map[string]int {
	foundTaintsCount := make(map[string]int)
	for _, node := range nodes {
		for _, taint := range node.Spec.Taints {
			// A key that is not yet in the map reads as zero, so a plain
			// increment covers both the first and later occurrences.
			foundTaintsCount[getTaintTypeToReport(taint.Key, taintConfig)]++
		}
	}
	return foundTaintsCount
}

// getTaintTypeToReport maps a taint key to the type under which it is counted.
// The checks are applied in this order, and the first match wins:
//  1. keys starting with the deprecated IgnoreTaintPrefix report the prefix itself;
//  2. keys listed in explicitlyReportedNodeTaints report the key unchanged;
//  3. keys matching taintConfig startup taints (exact key or prefix) report
//     startupNodeTaintReportedType;
//  4. keys matching taintConfig status taints (exact key or prefix) report
//     statusNodeTaintReportedType;
//  5. everything else reports unlistedNodeTaintReportedType.
func getTaintTypeToReport(key string, taintConfig TaintConfig) string {
	// startsWithAny reports whether key begins with at least one of the prefixes.
	startsWithAny := func(prefixes []string) bool {
		for _, prefix := range prefixes {
			if strings.HasPrefix(key, prefix) {
				return true
			}
		}
		return false
	}

	_, reportedExplicitly := explicitlyReportedNodeTaints[key]
	_, isStartupKey := taintConfig.StartupTaints[key]
	_, isStatusKey := taintConfig.StatusTaints[key]

	switch {
	case strings.HasPrefix(key, IgnoreTaintPrefix):
		// Track deprecated taints.
		return IgnoreTaintPrefix
	case reportedExplicitly:
		return key
	case isStartupKey || startsWithAny(taintConfig.StartupTaintPrefixes):
		return startupNodeTaintReportedType
	case isStatusKey || startsWithAny(taintConfig.StatusTaintPrefixes):
		return statusNodeTaintReportedType
	default:
		return unlistedNodeTaintReportedType
	}
}
101 changes: 101 additions & 0 deletions cluster-autoscaler/utils/taints/taints_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -572,3 +572,104 @@ func TestSanitizeTaints(t *testing.T) {
assert.Equal(t, newTaints[0].Key, StatusTaintPrefix+"some-taint")
assert.Equal(t, newTaints[1].Key, "test-taint")
}

// TestCountNodeTaints checks that CountNodeTaints groups taints from several
// nodes by reported type: deprecated ignore-prefixed keys, explicitly reported
// keys, startup and status taints (matched by exact key or by prefix), and
// the "other" bucket for everything else.
func TestCountNodeTaints(t *testing.T) {
	// noSchedule builds a taint with the NoSchedule effect.
	noSchedule := func(key, value string) apiv1.Taint {
		return apiv1.Taint{
			Key:    key,
			Value:  value,
			Effect: apiv1.TaintEffectNoSchedule,
		}
	}
	// nodeWith builds a node carrying the given taints.
	nodeWith := func(nodeTaints []apiv1.Taint) *apiv1.Node {
		return &apiv1.Node{
			ObjectMeta: metav1.ObjectMeta{
				Name:              "node-count-node-taints",
				CreationTimestamp: metav1.NewTime(time.Now()),
			},
			Spec: apiv1.NodeSpec{
				Taints: nodeTaints,
			},
			Status: apiv1.NodeStatus{
				Conditions: []apiv1.NodeCondition{},
			},
		}
	}

	firstNode := nodeWith([]apiv1.Taint{
		noSchedule(IgnoreTaintPrefix+"another-taint", "myValue"),
		noSchedule(StatusTaintPrefix+"some-taint", "myValue"),
		noSchedule(StartupTaintPrefix+"some-taint", "myValue"),
		noSchedule("test-taint", "test2"),
		noSchedule(ToBeDeletedTaint, "1"),
		noSchedule("ignore-me", "1"),
		noSchedule("status-me", "1"),
		noSchedule("node.kubernetes.io/memory-pressure", "1"),
		noSchedule("ignore-taint.cluster-autoscaler.kubernetes.io/to-be-ignored", "myValue2"),
	})
	secondNode := nodeWith([]apiv1.Taint{
		noSchedule(StatusTaintPrefix+"some-taint", "myValue"),
		noSchedule("node.kubernetes.io/unschedulable", "1"),
	})

	cfg := TaintConfig{
		StartupTaints:        map[string]bool{"ignore-me": true},
		StatusTaints:         map[string]bool{"status-me": true},
		StartupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix},
		StatusTaintPrefixes:  []string{StatusTaintPrefix},
	}

	expected := map[string]int{
		"ignore-taint.cluster-autoscaler.kubernetes.io/": 2,
		"ToBeDeletedByClusterAutoscaler":                 1,
		"node.kubernetes.io/memory-pressure":             1,
		"node.kubernetes.io/unschedulable":               1,
		"other":                                          1,
		"startup-taint":                                  2,
		"status-taint":                                   3,
	}

	actual := CountNodeTaints([]*apiv1.Node{firstNode, secondNode}, cfg)
	assert.Equal(t, expected, actual)
}

0 comments on commit f424743

Please sign in to comment.