-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
178 additions
and
59 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
111 changes: 111 additions & 0 deletions
111
pkg/kv/kvclient/kvcoord/dist_sender_rangefeed_canceler.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
// Copyright 2022 The Cockroach Authors. | ||
// | ||
// Use of this software is governed by the Business Source License | ||
// included in the file licenses/BSL.txt. | ||
// | ||
// As of the Change Date specified in that file, in accordance with | ||
// the Business Source License, use of this software will be governed | ||
// by the Apache License, Version 2.0, included in the file | ||
// licenses/APL.txt. | ||
// | ||
|
||
package kvcoord | ||
|
||
import ( | ||
"context" | ||
"sync/atomic" | ||
"time" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/util/timeutil" | ||
) | ||
|
||
// stuckRangeFeedCanceler is a defense-in-depth mechanism to restart rangefeeds
// that have not received events from the KV layer in some time. Rangefeeds are
// supposed to receive regular updates, as at the very least they ought to be
// receiving closed timestamps. However, issues at the KV layer could prevent
// this.
//
// The canceler is notified via ping() whenever the associated RangeFeed
// receives an event. Should ping() not be called for the configured threshold
// duration, the provided cancel function will be invoked.
//
// This is implemented without incurring nontrivial work on each call to
// ping(). Instead, work is done roughly on each threshold interval, which is
// assumed to be large enough (i.e. at least a couple of seconds) to make this
// negligible. Concretely, a timer is set that would invoke the cancellation,
// and the timer is reset on the first call to ping() after the timer is at
// least half expired. That way, we allocate only ~twice per threshold
// interval, which is acceptable.
//
// The canceler detects changes to the configured threshold duration on each
// call to ping(), i.e. in the common case of no stuck rangefeeds, it will
// ~immediately pick up the new value and apply it.
//
// Only _stuck is accessed atomically. The remaining fields are read and
// written by ping() and stop() without synchronization.
// NOTE(review): this presumably means ping() and stop() must be called from a
// single goroutine; confirm against the callers.
type stuckRangeFeedCanceler struct {
	// threshold returns the currently configured stuck threshold. It is
	// consulted on every ping(); a zero duration disables the canceler.
	threshold func() time.Duration
	// cancel is invoked by the timer once no ping() arrived in time.
	cancel context.CancelFunc
	// t is the currently armed timer, or nil if none is active.
	t *time.Timer
	// resetTimerAfter is the point in time after which the next ping()
	// re-arms the timer (half a threshold after the timer was created).
	resetTimerAfter time.Time
	// activeThreshold is the threshold the armed timer was created with. It
	// is compared against threshold() to detect configuration changes.
	activeThreshold time.Duration

	_stuck int32 // atomic; nonzero once the timer has fired
}
|
||
// stuck returns true if the stuck detection got triggered. | ||
// If this returns true, the cancel function will be invoked | ||
// shortly, if it hasn't already. | ||
func (w *stuckRangeFeedCanceler) stuck() bool { | ||
return atomic.LoadInt32(&w._stuck) != 0 | ||
} | ||
|
||
// stop releases the active timer, if any. It should be invoked | ||
// unconditionally before the canceler goes out of scope. | ||
func (w *stuckRangeFeedCanceler) stop() { | ||
if w.t != nil { | ||
w.t.Stop() | ||
w.t = nil | ||
w.activeThreshold = 0 | ||
} | ||
} | ||
|
||
// ping notifies the canceler that the rangefeed has received an | ||
// event, i.e. is making progress. | ||
func (w *stuckRangeFeedCanceler) ping() { | ||
threshold := w.threshold() | ||
if threshold == 0 { | ||
w.stop() | ||
return | ||
} | ||
|
||
mkTimer := func() { | ||
w.activeThreshold = threshold | ||
w.t = time.AfterFunc(threshold, func() { | ||
// NB: important to store _stuck before canceling, since we | ||
// want the caller to be able to detect stuck() after ctx | ||
// cancels. | ||
atomic.StoreInt32(&w._stuck, 1) | ||
w.cancel() | ||
}) | ||
w.resetTimerAfter = timeutil.Now().Add(threshold / 2) | ||
} | ||
|
||
if w.t == nil { | ||
mkTimer() | ||
} else if w.resetTimerAfter.Before(timeutil.Now()) || w.activeThreshold != threshold { | ||
w.stop() | ||
mkTimer() | ||
} | ||
} | ||
|
||
// newStuckRangeFeedCanceler sets up a canceler with the provided | ||
// cancel function (which should cancel the rangefeed if invoked) | ||
// and uses the kv.rangefeed.range_stuck_threshold cluster setting | ||
// to (reactively) configure the timeout. | ||
func newStuckRangeFeedCanceler( | ||
cancel context.CancelFunc, threshold func() time.Duration, | ||
) *stuckRangeFeedCanceler { | ||
w := &stuckRangeFeedCanceler{ | ||
threshold: threshold, | ||
cancel: cancel, | ||
} | ||
w.ping() | ||
return w | ||
} |
62 changes: 62 additions & 0 deletions
62
pkg/kv/kvclient/kvcoord/dist_sender_rangefeed_canceler_test.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
// Copyright 2022 The Cockroach Authors. | ||
// | ||
// Use of this software is governed by the Business Source License | ||
// included in the file licenses/BSL.txt. | ||
// | ||
// As of the Change Date specified in that file, in accordance with | ||
// the Business Source License, use of this software will be governed | ||
// by the Apache License, Version 2.0, included in the file | ||
// licenses/APL.txt. | ||
// | ||
|
||
package kvcoord | ||
|
||
import ( | ||
"sync/atomic" | ||
"testing" | ||
"time" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/util/leaktest" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
// cancelRec records whether its cancel method has been called. The
// underlying int32 is only accessed atomically, so cancel and canceled may
// be used from different goroutines.
type cancelRec int32

// cancel marks the recorder as canceled.
func (c *cancelRec) cancel() {
	flag := (*int32)(c)
	atomic.StoreInt32(flag, 1)
}

// canceled reports whether cancel has been called at least once.
func (c *cancelRec) canceled() bool {
	flag := (*int32)(c)
	return atomic.LoadInt32(flag) != 0
}
|
||
func TestStuckRangeFeedCanceler(t *testing.T) { | ||
defer leaktest.AfterTest(t)() | ||
|
||
var dur int64 = int64(24 * time.Hour) // atomic | ||
var cr cancelRec | ||
c := newStuckRangeFeedCanceler(cr.cancel, func() time.Duration { | ||
return time.Duration(atomic.LoadInt64(&dur)) | ||
}) | ||
for i := 0; i < 10; i++ { | ||
time.Sleep(time.Millisecond) | ||
require.False(t, c.stuck()) | ||
c.ping() | ||
} | ||
atomic.StoreInt64(&dur, int64(time.Nanosecond)) | ||
// Nothing has reset the timer yet, so we won't be stuck here. | ||
// This isn't great but it is true, so documenting it. | ||
require.False(t, c.stuck()) | ||
// Ping will update the timer, so it will fire very soon. | ||
c.ping() | ||
require.Eventually(t, c.stuck, time.Second /* max */, 5*time.Nanosecond /* tick */) | ||
|
||
atomic.StoreInt64(&dur, int64(24*time.Hour)) | ||
|
||
// Stays marked as stuck even when we ping it again. | ||
for i := 0; i < 10; i++ { | ||
time.Sleep(time.Nanosecond) | ||
require.True(t, c.stuck()) | ||
c.ping() | ||
} | ||
} |