From d201f1b8139e1beb346e4cab38e3e58165d808fd Mon Sep 17 00:00:00 2001 From: Nick Cabatoff Date: Thu, 15 Dec 2022 13:09:36 -0500 Subject: [PATCH] Prevent panics in expiration invalidation, and make some changes for testing (#18401) --- changelog/18401.txt | 3 +++ vault/core.go | 4 ++++ vault/expiration.go | 36 +++++++++++++++++++------------ vault/logical_passthrough.go | 23 +++++++++++++++----- vault/logical_passthrough_test.go | 13 ++++++----- vault/testing.go | 7 +----- 6 files changed, 56 insertions(+), 30 deletions(-) create mode 100644 changelog/18401.txt diff --git a/changelog/18401.txt b/changelog/18401.txt new file mode 100644 index 000000000000..441e9a08b0df --- /dev/null +++ b/changelog/18401.txt @@ -0,0 +1,3 @@ +```release-note:bug +expiration: Prevent panics on perf standbys when an irrevocable release gets deleted. +``` diff --git a/vault/core.go b/vault/core.go index b4a0200b7932..3c45eb7f35f5 100644 --- a/vault/core.go +++ b/vault/core.go @@ -654,6 +654,7 @@ type Core struct { rollbackPeriod time.Duration pendingRemovalMountsAllowed bool + expirationRevokeRetryBase time.Duration } func (c *Core) HAState() consts.HAState { @@ -790,6 +791,8 @@ type CoreConfig struct { RollbackPeriod time.Duration PendingRemovalMountsAllowed bool + + ExpirationRevokeRetryBase time.Duration } // GetServiceRegistration returns the config's ServiceRegistration, or nil if it does @@ -944,6 +947,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) { effectiveSDKVersion: effectiveSDKVersion, userFailedLoginInfo: make(map[FailedLoginUser]*FailedLoginInfo), pendingRemovalMountsAllowed: conf.PendingRemovalMountsAllowed, + expirationRevokeRetryBase: conf.ExpirationRevokeRetryBase, } c.standbyStopCh.Store(make(chan struct{})) diff --git a/vault/expiration.go b/vault/expiration.go index 7c4cd77b4af1..a5f1918dcf22 100644 --- a/vault/expiration.go +++ b/vault/expiration.go @@ -151,7 +151,8 @@ type ExpirationManager struct { // request. This value should only be set by tests. testRegisterAuthFailure uberAtomic.Bool - jobManager *fairshare.JobManager + jobManager *fairshare.JobManager + revokeRetryBase time.Duration } type ExpireLeaseStrategy func(context.Context, *ExpirationManager, string, *namespace.Namespace) @@ -234,7 +235,6 @@ func (r *revocationJob) Execute() error { func (r *revocationJob) OnFailure(err error) { r.m.core.metricSink.IncrCounterWithLabels([]string{"expire", "lease_expiration", "error"}, 1, []metrics.Label{metricsutil.NamespaceLabel(r.ns)}) - r.m.logger.Error("failed to revoke lease", "lease_id", r.leaseID, "error", err) r.m.pendingLock.Lock() defer r.m.pendingLock.Unlock() @@ -246,12 +246,15 @@ func (r *revocationJob) OnFailure(err error) { pending := pendingRaw.(pendingInfo) pending.revokesAttempted++ + newTimer := r.revokeExponentialBackoff(pending.revokesAttempted) + if pending.revokesAttempted >= maxRevokeAttempts || errIsUnrecoverable(err) { - r.m.logger.Trace("marking lease as irrevocable", "lease_id", r.leaseID, "error", err) + reason := "unrecoverable error" if pending.revokesAttempted >= maxRevokeAttempts { - r.m.logger.Trace("lease has consumed all retry attempts", "lease_id", r.leaseID) + reason = "lease has consumed all retry attempts" err = fmt.Errorf("%v: %w", outOfRetriesMessage, err) } + r.m.logger.Trace("failed to revoke lease, marking lease as irrevocable", "lease_id", r.leaseID, "error", err, "reason", reason) le, loadErr := r.m.loadEntry(r.nsCtx, r.leaseID) if loadErr != nil { @@ -265,9 +268,12 @@ func (r *revocationJob) OnFailure(err error) { r.m.markLeaseIrrevocable(r.nsCtx, le, err) return + } else { + r.m.logger.Error("failed to revoke lease", "lease_id", r.leaseID, "error", err, + "attempts", pending.revokesAttempted, "next_attempt", newTimer) } - pending.timer.Reset(revokeExponentialBackoff(pending.revokesAttempted)) + pending.timer.Reset(newTimer) r.m.pending.Store(r.leaseID, pending) } @@ -285,8 +291,8 @@ func expireLeaseStrategyFairsharing(ctx context.Context, m *ExpirationManager, l m.jobManager.AddJob(job, mountAccessor) } -func revokeExponentialBackoff(attempt uint8) time.Duration { - exp := (1 << attempt) * revokeRetryBase +func (r *revocationJob) revokeExponentialBackoff(attempt uint8) time.Duration { + exp := (1 << attempt) * r.m.revokeRetryBase randomDelta := 0.5 * float64(exp) // Allow backoff time to be a random value between exp +/- (0.5*exp) @@ -351,7 +357,11 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log logLeaseExpirations: os.Getenv("VAULT_SKIP_LOGGING_LEASE_EXPIRATIONS") == "", expireFunc: e, - jobManager: jobManager, + jobManager: jobManager, + revokeRetryBase: c.expirationRevokeRetryBase, + } + if exp.revokeRetryBase == 0 { + exp.revokeRetryBase = revokeRetryBase } *exp.restoreMode = 1 @@ -495,16 +505,14 @@ func (m *ExpirationManager) invalidate(key string) { m.nonexpiring.Delete(leaseID) if info, ok := m.irrevocable.Load(leaseID); ok { - irrevocable := info.(pendingInfo) + ile := info.(*leaseEntry) m.irrevocable.Delete(leaseID) m.irrevocableLeaseCount-- m.leaseCount-- - // Avoid nil pointer dereference. Without cachedLeaseInfo we do not have enough information to - // accurately update quota lease information. - // Note that cachedLeaseInfo should never be nil under normal operation. - if irrevocable.cachedLeaseInfo != nil { - leaseInfo := "as.QuotaLeaseInformation{LeaseId: leaseID, Role: irrevocable.cachedLeaseInfo.LoginRole} + // Note that the leaseEntry should never be nil under normal operation. + if ile != nil { + leaseInfo := "as.QuotaLeaseInformation{LeaseId: leaseID, Role: ile.LoginRole} if err := m.core.quotasHandleLeases(ctx, quotas.LeaseActionDeleted, []*quotas.QuotaLeaseInformation{leaseInfo}); err != nil { m.logger.Error("failed to update quota on lease invalidation", "error", err) return diff --git a/vault/logical_passthrough.go b/vault/logical_passthrough.go index 0dececa9b68f..d67307f361fc 100644 --- a/vault/logical_passthrough.go +++ b/vault/logical_passthrough.go @@ -16,20 +16,32 @@ import ( // PassthroughBackendFactory returns a PassthroughBackend // with leases switched off func PassthroughBackendFactory(ctx context.Context, conf *logical.BackendConfig) (logical.Backend, error) { - return LeaseSwitchedPassthroughBackend(ctx, conf, false) + return LeaseSwitchedPassthroughBackend(ctx, conf, nil) } // LeasedPassthroughBackendFactory returns a PassthroughBackend // with leases switched on func LeasedPassthroughBackendFactory(ctx context.Context, conf *logical.BackendConfig) (logical.Backend, error) { - return LeaseSwitchedPassthroughBackend(ctx, conf, true) + return LeaseSwitchedPassthroughBackend(ctx, conf, func(ctx context.Context, req *logical.Request, data *framework.FieldData) (*logical.Response, error) { + return nil, nil + }) } +type revokeFunc func(ctx context.Context, req *logical.Request, data *framework.FieldData) (*logical.Response, error) + // LeaseSwitchedPassthroughBackend returns a PassthroughBackend // with leases switched on or off -func LeaseSwitchedPassthroughBackend(ctx context.Context, conf *logical.BackendConfig, leases bool) (logical.Backend, error) { +func LeaseSwitchedPassthroughBackend(ctx context.Context, conf *logical.BackendConfig, revoke revokeFunc) (logical.Backend, error) { var b PassthroughBackend - b.generateLeases = leases + if revoke == nil { + // We probably don't need this, since we should never have to handle revoke requests, but just in case... + b.revoke = func(ctx context.Context, req *logical.Request, data *framework.FieldData) (*logical.Response, error) { + return nil, nil + } + } else { + b.generateLeases = true + b.revoke = revoke + } b.Backend = &framework.Backend{ Help: strings.TrimSpace(passthroughHelp), @@ -65,7 +77,7 @@ func LeaseSwitchedPassthroughBackend(ctx context.Context, conf *logical.BackendC Type: "kv", Renew: b.handleRead, - Revoke: b.handleRevoke, + Revoke: b.revoke, }, } @@ -84,6 +96,7 @@ func LeaseSwitchedPassthroughBackend(ctx context.Context, conf *logical.BackendC type PassthroughBackend struct { *framework.Backend generateLeases bool + revoke func(ctx context.Context, req *logical.Request, data *framework.FieldData) (*logical.Response, error) } func (b *PassthroughBackend) handleRevoke(ctx context.Context, req *logical.Request, data *framework.FieldData) (*logical.Response, error) { diff --git a/vault/logical_passthrough_test.go b/vault/logical_passthrough_test.go index fa06c372bbd8..6baba86b415f 100644 --- a/vault/logical_passthrough_test.go +++ b/vault/logical_passthrough_test.go @@ -197,7 +197,8 @@ func TestPassthroughBackend_List(t *testing.T) { } func TestPassthroughBackend_Revoke(t *testing.T) { - test := func(b logical.Backend) { + test := func(t *testing.T, b logical.Backend) { + t.Helper() req := logical.TestRequest(t, logical.RevokeOperation, "kv") req.Secret = &logical.Secret{ InternalData: map[string]interface{}{ @@ -209,10 +210,12 @@ func TestPassthroughBackend_Revoke(t *testing.T) { t.Fatalf("err: %v", err) } } - b := testPassthroughBackend() - test(b) - b = testPassthroughLeasedBackend() - test(b) + t.Run("passthrough", func(t *testing.T) { + test(t, testPassthroughBackend()) + }) + t.Run("passthrough-leased", func(t *testing.T) { + test(t, testPassthroughLeasedBackend()) + }) } func testPassthroughBackend() logical.Backend { diff --git a/vault/testing.go b/vault/testing.go index 47103e350bb8..ae46a2f0cf8c 100644 --- a/vault/testing.go +++ b/vault/testing.go @@ -1701,20 +1701,15 @@ func NewTestCluster(t testing.T, base *CoreConfig, opts *TestClusterOptions) *Te } coreConfig.ClusterCipherSuites = base.ClusterCipherSuites - coreConfig.DisableCache = base.DisableCache - coreConfig.DevToken = base.DevToken coreConfig.RecoveryMode = base.RecoveryMode - coreConfig.ActivityLogConfig = base.ActivityLogConfig coreConfig.EnableResponseHeaderHostname = base.EnableResponseHeaderHostname coreConfig.EnableResponseHeaderRaftNodeID = base.EnableResponseHeaderRaftNodeID - coreConfig.RollbackPeriod = base.RollbackPeriod - coreConfig.PendingRemovalMountsAllowed = base.PendingRemovalMountsAllowed - + coreConfig.ExpirationRevokeRetryBase = base.ExpirationRevokeRetryBase testApplyEntBaseConfig(coreConfig, base) } if coreConfig.ClusterName == "" {