diff --git a/changelog/25448.txt b/changelog/25448.txt
new file mode 100644
index 000000000000..537cc8cdd1f8
--- /dev/null
+++ b/changelog/25448.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+core (enterprise): Fix a deadlock that can occur on performance secondary clusters when there are many mounts and a mount is deleted or filtered
+```
diff --git a/vault/rollback.go b/vault/rollback.go
index 050d97522e64..e08747cb0aca 100644
--- a/vault/rollback.go
+++ b/vault/rollback.go
@@ -25,6 +25,8 @@ const (
 	RollbackWorkersEnvVar = "VAULT_ROLLBACK_WORKERS"
 )
 
+var rollbackCanceled = errors.New("rollback attempt canceled")
+
 // RollbackManager is responsible for performing rollbacks of partial
 // secrets within logical backends.
 //
@@ -73,7 +75,9 @@ type rollbackState struct {
 	cancelLockGrabCtxCancel context.CancelFunc
 	// scheduled is the time that this job was created and submitted to the
 	// rollbackRunner
-	scheduled time.Time
+	scheduled  time.Time
+	isRunning  chan struct{}
+	isCanceled chan struct{}
 }
 
 // NewRollbackManager is used to create a new rollback manager
@@ -190,29 +194,45 @@ func (m *RollbackManager) triggerRollbacks() {
 	}
 }
 
-// startOrLookupRollback is used to start an async rollback attempt.
-// This must be called with the inflightLock held.
-func (m *RollbackManager) startOrLookupRollback(ctx context.Context, fullPath string, grabStatelock bool) *rollbackState {
-	m.inflightLock.Lock()
-	defer m.inflightLock.Unlock()
+// lookupRollbackLocked checks if there's an inflight rollback with the given
+// path. Callers must hold the inflightLock. The function also reports metrics,
+// since it is regularly called as part of the scheduled rollbacks.
+func (m *RollbackManager) lookupRollbackLocked(fullPath string) *rollbackState {
 	defer metrics.SetGauge([]string{"rollback", "queued"}, float32(m.runner.WaitingQueueSize()))
 	defer metrics.SetGauge([]string{"rollback", "inflight"}, float32(len(m.inflight)))
-	rsInflight, ok := m.inflight[fullPath]
-	if ok {
-		return rsInflight
-	}
+	rsInflight := m.inflight[fullPath]
+	return rsInflight
+}
 
+// newRollbackLocked creates a new rollback state and adds it to the inflight
+// rollback map. Callers must hold the inflightLock.
+func (m *RollbackManager) newRollbackLocked(fullPath string) *rollbackState {
 	cancelCtx, cancelFunc := context.WithCancel(context.Background())
 	rs := &rollbackState{
 		cancelLockGrabCtx:       cancelCtx,
 		cancelLockGrabCtxCancel: cancelFunc,
+		isRunning:               make(chan struct{}),
+		isCanceled:              make(chan struct{}),
+		scheduled:               time.Now(),
 	}
-
-	// If no inflight rollback is already running, kick one off
 	m.inflight[fullPath] = rs
 	rs.Add(1)
 	m.inflightAll.Add(1)
-	rs.scheduled = time.Now()
+	return rs
+}
+
+// startOrLookupRollback is used to start an async rollback attempt.
+func (m *RollbackManager) startOrLookupRollback(ctx context.Context, fullPath string, grabStatelock bool) *rollbackState {
+	m.inflightLock.Lock()
+	defer m.inflightLock.Unlock()
+	rs := m.lookupRollbackLocked(fullPath)
+	if rs != nil {
+		return rs
+	}
+
+	// If no inflight rollback is already running, kick one off
+	rs = m.newRollbackLocked(fullPath)
+
 	select {
 	case <-m.doneCh:
 		// if we've already shut down, then don't submit the task to avoid a panic
@@ -242,14 +262,23 @@
 		m.inflightLock.Lock()
 		defer m.inflightLock.Unlock()
 	}
-	delete(m.inflight, fullPath)
+	if _, ok := m.inflight[fullPath]; ok {
+		delete(m.inflight, fullPath)
+	}
 }
 
 // attemptRollback invokes a RollbackOperation for the given path
 func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string, rs *rollbackState, grabStatelock bool) (err error) {
+	close(rs.isRunning)
+	defer m.finishRollback(rs, err, fullPath, true)
+	select {
+	case <-rs.isCanceled:
+		return rollbackCanceled
+	default:
+	}
+
 	metrics.MeasureSince([]string{"rollback", "waiting"}, rs.scheduled)
 	defer metrics.MeasureSince([]string{"rollback", "attempt", strings.ReplaceAll(fullPath, "/", "-")}, time.Now())
-	defer m.finishRollback(rs, err, fullPath, true)
 
 	ns, err := namespace.FromContext(ctx)
 	if err != nil {
@@ -280,6 +309,7 @@ func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string,
 		case <-m.shutdownCh:
 		case <-rs.cancelLockGrabCtx.Done():
 		case <-doneCh:
+		case <-rs.isCanceled:
 		}
 	}()
 
@@ -292,6 +322,8 @@
 	select {
 	case <-m.shutdownCh:
 		return errors.New("rollback shutting down")
+	case <-rs.isCanceled:
+		return rollbackCanceled
 	default:
 		releaseLock = false
 	}
@@ -333,21 +365,37 @@ func (m *RollbackManager) Rollback(ctx context.Context, path string) error {
 	}
 	fullPath := ns.Path + path
 
-	// Check for an existing attempt or start one if none
-	rs := m.startOrLookupRollback(ctx, fullPath, false)
+	m.inflightLock.Lock()
+	rs := m.lookupRollbackLocked(fullPath)
+	if rs != nil {
+		// Since we have the statelock held, tell any inflight rollback to give up
+		// trying to acquire it. This will prevent deadlocks in the case where we
+		// have the write lock. In the case where it was waiting to grab
+		// a read lock it will then simply continue with the rollback
+		// operation under the protection of our write lock.
+		rs.cancelLockGrabCtxCancel()
+
+		select {
+		case <-rs.isRunning:
+			// if the rollback has started then we should wait for it to complete
+			m.inflightLock.Unlock()
+			rs.Wait()
+			return rs.lastError
+		default:
+		}
+
+		// if the rollback hasn't started and there's no capacity, we could
+		// end up deadlocking. Cancel the existing rollback and start a new
+		// one.
+		close(rs.isCanceled)
+	}
+	rs = m.newRollbackLocked(fullPath)
+	m.inflightLock.Unlock()
 
-	// Since we have the statelock held, tell any inflight rollback to give up
-	// trying to acquire it. This will prevent deadlocks in the case where we
-	// have the write lock. In the case where it was waiting to grab
-	// a read lock it will then simply continue with the rollback
-	// operation under the protection of our write lock.
-	rs.cancelLockGrabCtxCancel()
+	// we can ignore the error, since it's going to be set in rs.lastError
+	m.attemptRollback(ctx, fullPath, rs, false)
 
-	// It's safe to do this, since the other thread either already has the lock
-	// held, or we just canceled it above.
 	rs.Wait()
-
-	// Return the last error
 	return rs.lastError
 }
diff --git a/website/content/docs/release-notes/1.13.0.mdx b/website/content/docs/release-notes/1.13.0.mdx
index eea72204d1dd..c7d30a36b1bf 100644
--- a/website/content/docs/release-notes/1.13.0.mdx
+++ b/website/content/docs/release-notes/1.13.0.mdx
@@ -128,6 +128,8 @@ The fix for this UI issue is coming in the Vault 1.13.1 release.
 
 @include 'known-issues/expiration-metrics-fatal-error.mdx'
 
+@include 'known-issues/perf-secondary-many-mounts-deadlock.mdx'
+
 ## Feature deprecations and EOL
 
 Please refer to the [Deprecation Plans and Notice](/vault/docs/deprecation) page
diff --git a/website/content/docs/release-notes/1.14.0.mdx b/website/content/docs/release-notes/1.14.0.mdx
index 8471a9d734c3..395090fd54d6 100644
--- a/website/content/docs/release-notes/1.14.0.mdx
+++ b/website/content/docs/release-notes/1.14.0.mdx
@@ -24,6 +24,7 @@ All | [API calls to update-primary may lead to data loss](/vault/docs/upgrad
 1.14.0+ | [Sublogger levels not adjusted on reload](/vault/docs/upgrading/upgrade-to-1.14.x#sublogger-levels-unchanged-on-reload)
 1.14.5 | [Fatal error during expiration metrics gathering causing Vault crash](/vault/docs/upgrading/upgrade-to-1.14.x#fatal-error-during-expiration-metrics-gathering-causing-vault-crash)
 1.14.5 | [User lockout potential double logging](/vault/docs/upgrading/upgrade-to-1.14.x#user-lockout-logging)
+1.14.5 - 1.14.9 | [Deadlock can occur on performance secondary clusters with many mounts](/vault/docs/upgrading/upgrade-to-1.14.x#deadlock-can-occur-on-performance-secondary-clusters-with-many-mounts)
 
 ## Vault companion updates
diff --git a/website/content/docs/upgrading/upgrade-to-1.13.x.mdx b/website/content/docs/upgrading/upgrade-to-1.13.x.mdx
index 49f97f0c06a5..2865555d3aa4 100644
--- a/website/content/docs/upgrading/upgrade-to-1.13.x.mdx
+++ b/website/content/docs/upgrading/upgrade-to-1.13.x.mdx
@@ -194,3 +194,5 @@ Affects Vault 1.13.0+
 @include 'known-issues/sublogger-levels-unchanged-on-reload.mdx'
 
 @include 'known-issues/expiration-metrics-fatal-error.mdx'
+
+@include 'known-issues/perf-secondary-many-mounts-deadlock.mdx'
diff --git a/website/content/docs/upgrading/upgrade-to-1.14.x.mdx b/website/content/docs/upgrading/upgrade-to-1.14.x.mdx
index 6d1896503be5..8baf08cb3e43 100644
--- a/website/content/docs/upgrading/upgrade-to-1.14.x.mdx
+++ b/website/content/docs/upgrading/upgrade-to-1.14.x.mdx
@@ -62,3 +62,5 @@ is measuring cumulative time writing, and not the distribution of individual wri
 @include 'known-issues/sublogger-levels-unchanged-on-reload.mdx'
 
 @include 'known-issues/expiration-metrics-fatal-error.mdx'
+
+@include 'known-issues/perf-secondary-many-mounts-deadlock.mdx'
diff --git a/website/content/partials/known-issues/perf-secondary-many-mounts-deadlock.mdx b/website/content/partials/known-issues/perf-secondary-many-mounts-deadlock.mdx
new file mode 100644
index 000000000000..113b3b3a573d
--- /dev/null
+++ b/website/content/partials/known-issues/perf-secondary-many-mounts-deadlock.mdx
@@ -0,0 +1,34 @@
+### Deadlock can occur on performance secondary clusters with many mounts
+
+#### Affected versions
+
+- 1.15.0 - 1.15.5
+- 1.14.5 - 1.14.9
+- 1.13.9 - 1.13.13
+
+#### Issue
+
+Vault 1.15.0, 1.14.5, and 1.13.9 introduced a worker pool to schedule periodic
+rollback operations on all mounts. This worker pool defaulted to using 256
+workers.
+The worker pool introduced a risk of deadlocking on the active node of
+**performance secondary clusters**, leaving that cluster unable to service any
+requests.
+
+All of the following conditions are required to cause the deadlock on the
+performance secondary:
+
+- Performance replication is enabled
+- The performance primary cluster has more than 256 non-local mounts. The more
+mounts the cluster has, the more likely the deadlock becomes
+- One of the following occurs:
+  - A replicated mount is unmounted or remounted OR
+  - A replicated namespace is deleted OR
+  - Replication path filters are used to filter at least one mount or namespace
+
+#### Workaround
+
+Set the `VAULT_ROLLBACK_WORKERS` environment variable to a number larger than
+the number of mounts in your Vault cluster and restart Vault:
+
+```shell-session
+$ export VAULT_ROLLBACK_WORKERS=1000
+```
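+
+As a rule of thumb, the value only needs to exceed the total number of mounts,
+counting both secrets engines and auth methods across all namespaces. One rough
+way to tally the mounts visible in a single namespace is to count the CLI
+listings; the `tail -n +3` below assumes the default table output with its two
+header lines:
+
+```shell-session
+$ vault secrets list | tail -n +3 | wc -l   # skips the two table header lines
+$ vault auth list | tail -n +3 | wc -l
+```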
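+
+An `export` only affects Vault servers started from that shell. If your servers
+run under systemd, one way to make the setting persistent across restarts is a
+unit drop-in; the `vault.service` unit name below is an assumption and may
+differ in your installation:
+
+```shell-session
+$ # assumes the unit is named vault.service; adjust for your installation
+$ sudo mkdir -p /etc/systemd/system/vault.service.d
+$ sudo tee /etc/systemd/system/vault.service.d/rollback-workers.conf <<'EOF'
+[Service]
+Environment=VAULT_ROLLBACK_WORKERS=1000
+EOF
+$ sudo systemctl daemon-reload
+$ sudo systemctl restart vault
+```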