Add jitter and backoff to prevent thundering herd on auth (#9133)
Move the cache and resourceWatcher watchers from a fixed 10s retry to a jittered backoff retry of up to ~1min.
Replace the reconnectToAuthService interval with a retry as well, so that jitter and backoff also apply when a
node restarts due to the changes introduced in #8102.

Fixes #6889.
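
For context, here is a minimal, self-contained sketch of the jittered linear backoff scheme the diff below configures through utils.NewLinear (first wait = half-jittered max/10, step = max/5, capped at max, with a half-jitter applied to every wait). The linearRetry type and halfJitter helper are illustrative stand-ins, not Teleport's actual utils retry API:

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// halfJitter returns a random duration in [d/2, d), so that peers which
// disconnected at the same moment do not reconnect in lockstep.
func halfJitter(d time.Duration) time.Duration {
	if d < 2 {
		return d
	}
	return d/2 + time.Duration(rand.Int63n(int64(d/2)))
}

// linearRetry mimics the retry configured in lib/cache/cache.go: the first
// wait is a half-jittered tenth of the max, each failure adds a fifth of
// the max, and every wait is re-jittered and capped at the max (~1min).
type linearRetry struct {
	first, step, max time.Duration
	attempt          int
}

func newLinearRetry(max time.Duration) *linearRetry {
	return &linearRetry{first: halfJitter(max / 10), step: max / 5, max: max}
}

// duration returns the jittered wait before the next reconnect attempt.
func (r *linearRetry) duration() time.Duration {
	if r.attempt == 0 {
		return r.first
	}
	d := r.first + time.Duration(r.attempt)*r.step
	if d > r.max {
		d = r.max
	}
	return halfJitter(d)
}

// inc records a failed attempt so the next wait is longer.
func (r *linearRetry) inc() { r.attempt++ }

func main() {
	r := newLinearRetry(time.Minute)
	for i := 0; i < 5; i++ {
		fmt.Printf("attempt %d: wait %v\n", i, r.duration())
		r.inc()
	}
}

With a one-minute maximum, the first reconnect lands somewhere in [3s, 6s) and later attempts back off toward a jittered minute, instead of every node retrying auth at the same fixed 10s interval.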
rosstimothy committed Jan 20, 2022
1 parent eb95301 commit 787bd8c
Showing 16 changed files with 727 additions and 525 deletions.
1 change: 1 addition & 0 deletions integration/helpers.go
@@ -614,6 +614,7 @@ func (i *TeleInstance) GenerateConfig(t *testing.T, trustedSecrets []*InstanceSe
 	tconf.Kube.CheckImpersonationPermissions = nullImpersonationCheck
 
 	tconf.Keygen = testauthority.New()
+	tconf.MaxRetryPeriod = defaults.HighResPollingPeriod
 	i.Config = tconf
 	return tconf, nil
 }
8 changes: 5 additions & 3 deletions lib/auth/helpers.go
@@ -35,6 +35,7 @@ import (
 	authority "github.com/gravitational/teleport/lib/auth/testauthority"
 	"github.com/gravitational/teleport/lib/backend"
 	"github.com/gravitational/teleport/lib/backend/memory"
+	"github.com/gravitational/teleport/lib/defaults"
 	"github.com/gravitational/teleport/lib/events"
 	"github.com/gravitational/teleport/lib/limiter"
 	"github.com/gravitational/teleport/lib/services"
@@ -326,9 +327,10 @@ func NewTestAuthServer(cfg TestAuthServerConfig) (*TestAuthServer, error) {

 	srv.LockWatcher, err = services.NewLockWatcher(ctx, services.LockWatcherConfig{
 		ResourceWatcherConfig: services.ResourceWatcherConfig{
-			Component: teleport.ComponentAuth,
-			Client:    srv.AuthServer,
-			Clock:     cfg.Clock,
+			Component:      teleport.ComponentAuth,
+			Client:         srv.AuthServer,
+			Clock:          cfg.Clock,
+			MaxRetryPeriod: defaults.HighResPollingPeriod,
 		},
 	})
 	if err != nil {
32 changes: 24 additions & 8 deletions lib/cache/cache.go
@@ -512,8 +512,8 @@ type Config struct {
 	WindowsDesktops services.WindowsDesktops
 	// Backend is a backend for local cache
 	Backend backend.Backend
-	// RetryPeriod is a period between cache retries on failures
-	RetryPeriod time.Duration
+	// MaxRetryPeriod is the maximum period between cache retries on failures
+	MaxRetryPeriod time.Duration
 	// WatcherInitTimeout is the maximum acceptable delay for an
 	// OpInit after a watcher has been started (default=1m).
 	WatcherInitTimeout time.Duration
@@ -552,8 +552,8 @@ func (c *Config) CheckAndSetDefaults() error {
 	if c.Clock == nil {
 		c.Clock = clockwork.NewRealClock()
 	}
-	if c.RetryPeriod == 0 {
-		c.RetryPeriod = defaults.HighResPollingPeriod
+	if c.MaxRetryPeriod == 0 {
+		c.MaxRetryPeriod = defaults.MaxWatcherBackoff
 	}
 	if c.WatcherInitTimeout == 0 {
 		c.WatcherInitTimeout = time.Minute
@@ -586,6 +586,9 @@ const (
 	// TombstoneWritten is emitted if cache is closed in a healthy
 	// state and successfully writes its tombstone.
 	TombstoneWritten = "tombstone_written"
+	// Reloading is emitted when an error occurs while watching events
+	// and the cache is waiting to create a new watcher
+	Reloading = "reloading_cache"
 )

// New creates a new instance of Cache
@@ -660,8 +663,11 @@ func New(config Config) (*Cache, error) {
 	}
 
 	retry, err := utils.NewLinear(utils.LinearConfig{
-		Step: cs.Config.RetryPeriod / 10,
-		Max:  cs.Config.RetryPeriod,
+		First:  utils.HalfJitter(cs.MaxRetryPeriod / 10),
+		Step:   cs.MaxRetryPeriod / 5,
+		Max:    cs.MaxRetryPeriod,
+		Jitter: utils.NewHalfJitter(),
+		Clock:  cs.Clock,
 	})
 	if err != nil {
 		cs.Close()
@@ -724,10 +730,20 @@ func (c *Cache) update(ctx context.Context, retry utils.Retry) {
 		if err != nil {
 			c.Warningf("Re-init the cache on error: %v.", err)
 		}
 
 		// events cache should be closed as well
-		c.Debugf("Reloading %v.", retry)
+		c.Debugf("Reloading cache.")
+
+		c.notify(ctx, Event{Type: Reloading, Event: types.Event{
+			Resource: &types.ResourceHeader{
+				Kind: retry.Duration().String(),
+			},
+		}})
+
+		startedWaiting := c.Clock.Now()
 		select {
-		case <-retry.After():
+		case t := <-retry.After():
+			c.Debugf("Initiating new watch after waiting %v.", t.Sub(startedWaiting))
+			retry.Inc()
 		case <-c.ctx.Done():
 			return