-
Notifications
You must be signed in to change notification settings - Fork 4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix race condition between ca and scheduler
Implement dynamically adjustment of NodeDeleteDelayAfterTaint based on round trip time between ca and apiserver
- Loading branch information
Showing
5 changed files
with
284 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
116 changes: 116 additions & 0 deletions
116
cluster-autoscaler/core/scaledown/actuation/update_latency_tracker.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
package actuation | ||
|
||
import ( | ||
"time" | ||
|
||
"k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" | ||
"k8s.io/autoscaler/cluster-autoscaler/utils/taints" | ||
"k8s.io/klog/v2" | ||
) | ||
|
||
const sleepDurationWhenPolling = 50 * time.Millisecond | ||
const waitForTaintingTimeoutDuration = 30 * time.Second | ||
|
||
type NodeTaintStartTime struct { | ||
nodeName string | ||
startTime time.Time | ||
} | ||
|
||
// UpdateLatencyTracker can be used to calculate round-trip time between CA and api-server | ||
// when adding ToBeDeletedTaint to nodes | ||
type UpdateLatencyTracker struct { | ||
startTimestamp map[string]time.Time | ||
finishTimestamp map[string]time.Time | ||
remainingNodeCount int | ||
nodeLister kubernetes.NodeLister | ||
StartTimeChan chan NodeTaintStartTime | ||
// Passing a bool will wait for all the started nodes to get tainted and calculate | ||
// latency based on latencies observed. (If all the nodes did not get tained within | ||
// waitForTaintingTimeoutDuration after passing a bool, latency calculation will be | ||
// aborted and the ResultChan will be closed without returning a value) Closing the | ||
// AwaitOrStopChan without passing any bool will abort the latency calculation. | ||
AwaitOrStopChan chan bool | ||
ResultChan chan time.Duration | ||
// now is used only to make the testing easier | ||
now func() time.Time | ||
} | ||
|
||
func NewUpdateLatencyTracker(nodeLister kubernetes.NodeLister, now func() time.Time) *UpdateLatencyTracker { | ||
return &UpdateLatencyTracker{ | ||
startTimestamp: map[string]time.Time{}, | ||
finishTimestamp: map[string]time.Time{}, | ||
remainingNodeCount: 0, | ||
nodeLister: nodeLister, | ||
StartTimeChan: make(chan NodeTaintStartTime), | ||
AwaitOrStopChan: make(chan bool), | ||
ResultChan: make(chan time.Duration), | ||
now: now, | ||
} | ||
} | ||
|
||
func (u *UpdateLatencyTracker) Start() { | ||
for { | ||
select { | ||
case _, ok := <-u.AwaitOrStopChan: | ||
if ok { | ||
u.await() | ||
} | ||
return | ||
case ntst := <-u.StartTimeChan: | ||
u.startTimestamp[ntst.nodeName] = ntst.startTime | ||
u.remainingNodeCount += 1 | ||
continue | ||
default: | ||
} | ||
u.updateFinishTime() | ||
time.Sleep(sleepDurationWhenPolling) | ||
} | ||
} | ||
|
||
func (u *UpdateLatencyTracker) updateFinishTime() { | ||
for nodeName, _ := range u.startTimestamp { | ||
if _, ok := u.finishTimestamp[nodeName]; ok { | ||
continue | ||
} | ||
node, err := u.nodeLister.Get(nodeName) | ||
if err != nil { | ||
klog.Errorf("Error getting node: %v", err) | ||
continue | ||
} | ||
if taints.HasToBeDeletedTaint(node) { | ||
u.finishTimestamp[node.Name] = u.now() | ||
u.remainingNodeCount -= 1 | ||
} | ||
} | ||
} | ||
|
||
func (u *UpdateLatencyTracker) calculateLatency() time.Duration { | ||
var maxLatency time.Duration = 0 | ||
for node, startTime := range u.startTimestamp { | ||
endTime, _ := u.finishTimestamp[node] | ||
currentLatency := endTime.Sub(startTime) | ||
if currentLatency > maxLatency { | ||
maxLatency = currentLatency | ||
} | ||
} | ||
return maxLatency | ||
} | ||
|
||
func (u *UpdateLatencyTracker) await() { | ||
waitingForTaintingStartTime := time.Now() | ||
for { | ||
switch { | ||
case u.remainingNodeCount == 0: | ||
latency := u.calculateLatency() | ||
u.ResultChan <- latency | ||
return | ||
case time.Now().After(waitingForTaintingStartTime.Add(waitForTaintingTimeoutDuration)): | ||
klog.Errorf("Timeout before tainting all nodes, latency measurement will be stale") | ||
close(u.ResultChan) | ||
return | ||
default: | ||
time.Sleep(sleepDurationWhenPolling) | ||
u.updateFinishTime() | ||
} | ||
} | ||
} |
112 changes: 112 additions & 0 deletions
112
cluster-autoscaler/core/scaledown/actuation/update_latency_tracker_test.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
package actuation | ||
|
||
import ( | ||
"sync" | ||
"testing" | ||
"time" | ||
|
||
"github.com/stretchr/testify/assert" | ||
apiv1 "k8s.io/api/core/v1" | ||
"k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" | ||
"k8s.io/autoscaler/cluster-autoscaler/utils/taints" | ||
"k8s.io/autoscaler/cluster-autoscaler/utils/test" | ||
) | ||
|
||
// mockClock is used to mock time.Now() when testing UpdateLatencyTracker | ||
// For the n th call to Now() it will return a timestamp after duration[n] to | ||
// the startTime if n < the length of durations. Otherwise, it will return current time. | ||
type mockClock struct { | ||
startTime time.Time | ||
durations []time.Duration | ||
index int | ||
mutex sync.Mutex | ||
} | ||
|
||
func NewMockClock(startTime time.Time, durations []time.Duration) mockClock { | ||
return mockClock{ | ||
startTime: startTime, | ||
durations: durations, | ||
index: 0, | ||
} | ||
} | ||
|
||
func (m *mockClock) Now() time.Time { | ||
m.mutex.Lock() | ||
defer m.mutex.Unlock() | ||
var timeToSend time.Time | ||
if m.index < len(m.durations) { | ||
timeToSend = m.startTime.Add(m.durations[m.index]) | ||
} else { | ||
timeToSend = time.Now() | ||
} | ||
m.index += 1 | ||
return timeToSend | ||
} | ||
|
||
func (m *mockClock) getIndex() int { | ||
m.mutex.Lock() | ||
defer m.mutex.Unlock() | ||
return m.index | ||
} | ||
|
||
func TestUpdateLatencyCalculation(t *testing.T) { | ||
toBeDeletedTaint := apiv1.Taint{Key: taints.ToBeDeletedTaint, Effect: apiv1.TaintEffectNoSchedule} | ||
n1 := test.BuildTestNode("n1", 100, 100) | ||
n1.Spec.Taints = append(n1.Spec.Taints, toBeDeletedTaint) | ||
n2 := test.BuildTestNode("n2", 100, 100) | ||
n2.Spec.Taints = append(n2.Spec.Taints, toBeDeletedTaint) | ||
n3 := test.BuildTestNode("n3", 100, 100) | ||
|
||
testCases := []struct { | ||
description string | ||
startTime time.Time | ||
nodes []*apiv1.Node | ||
durations []time.Duration | ||
wantLatency time.Duration | ||
wantResultChanNotClosed bool | ||
}{ | ||
{ | ||
description: "latency when tainting a single node", | ||
startTime: time.Now(), | ||
nodes: []*apiv1.Node{n1}, | ||
durations: []time.Duration{100 * time.Millisecond}, | ||
wantLatency: 100 * time.Millisecond, | ||
wantResultChanNotClosed: true, | ||
}, | ||
{ | ||
description: "latency when tainting multiple nodes", | ||
startTime: time.Now(), | ||
nodes: []*apiv1.Node{n1, n2}, | ||
durations: []time.Duration{100 * time.Millisecond, 150 * time.Millisecond}, | ||
wantLatency: 150 * time.Millisecond, | ||
wantResultChanNotClosed: true, | ||
}, | ||
{ | ||
description: "Some nodes fails to taint before timeout", | ||
startTime: time.Now(), | ||
nodes: []*apiv1.Node{n1, n3}, | ||
durations: []time.Duration{100 * time.Millisecond, 150 * time.Millisecond}, | ||
wantResultChanNotClosed: false, | ||
}, | ||
} | ||
|
||
for _, tc := range testCases { | ||
t.Run(tc.description, func(t *testing.T) { | ||
assert.Equal(t, len(tc.nodes), len(tc.durations)) | ||
mc := NewMockClock(tc.startTime, tc.durations) | ||
nodeLister := kubernetes.NewTestNodeLister(tc.nodes) | ||
updateLatencyTracker := NewUpdateLatencyTracker(nodeLister, mc.Now) | ||
go updateLatencyTracker.Start() | ||
for _, node := range tc.nodes { | ||
updateLatencyTracker.StartTimeChan <- NodeTaintStartTime{node.Name, tc.startTime} | ||
} | ||
updateLatencyTracker.AwaitOrStopChan <- true | ||
latency, ok := <-updateLatencyTracker.ResultChan | ||
assert.Equal(t, tc.wantResultChanNotClosed, ok) | ||
if ok { | ||
assert.Equal(t, tc.wantLatency, latency) | ||
assert.Equal(t, len(tc.durations), mc.getIndex()) | ||
} | ||
}) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters