backport of commit 5f0638a (#25554)
Co-authored-by: miagilepner <[email protected]>
1 parent bef8a3a commit 5cb6a88
Showing 9 changed files with 122 additions and 28 deletions.
3 changes: 3 additions & 0 deletions changelog/25448.txt
@@ -0,0 +1,3 @@
```release-note:bug
core (enterprise): Fix a deadlock that can occur on performance secondary clusters when there are many mounts and a mount is deleted or filtered
```
102 changes: 75 additions & 27 deletions vault/rollback.go
@@ -25,6 +25,8 @@ const (
RollbackWorkersEnvVar = "VAULT_ROLLBACK_WORKERS"
)

var rollbackCanceled = errors.New("rollback attempt canceled")

// RollbackManager is responsible for performing rollbacks of partial
// secrets within logical backends.
//
@@ -74,7 +76,9 @@ type rollbackState struct {
cancelLockGrabCtxCancel context.CancelFunc
// scheduled is the time that this job was created and submitted to the
// rollbackRunner
scheduled time.Time
scheduled time.Time
isRunning chan struct{}
isCanceled chan struct{}
}

// NewRollbackManager is used to create a new rollback manager
@@ -192,29 +196,45 @@ func (m *RollbackManager) triggerRollbacks() {
}
}

// startOrLookupRollback is used to start an async rollback attempt.
// This must be called with the inflightLock held.
func (m *RollbackManager) startOrLookupRollback(ctx context.Context, fullPath string, grabStatelock bool) *rollbackState {
m.inflightLock.Lock()
defer m.inflightLock.Unlock()
// lookupRollbackLocked checks if there's an inflight rollback with the given
// path. Callers must have the inflightLock. The function also reports metrics,
// since it is regularly called as part of the scheduled rollbacks.
func (m *RollbackManager) lookupRollbackLocked(fullPath string) *rollbackState {
defer metrics.SetGauge([]string{"rollback", "queued"}, float32(m.runner.WaitingQueueSize()))
defer metrics.SetGauge([]string{"rollback", "inflight"}, float32(len(m.inflight)))
rsInflight, ok := m.inflight[fullPath]
if ok {
return rsInflight
}
rsInflight := m.inflight[fullPath]
return rsInflight
}

// newRollbackLocked creates a new rollback state and adds it to the inflight
// rollback map. Callers must have the inflightLock
func (m *RollbackManager) newRollbackLocked(fullPath string) *rollbackState {
cancelCtx, cancelFunc := context.WithCancel(context.Background())
rs := &rollbackState{
cancelLockGrabCtx: cancelCtx,
cancelLockGrabCtxCancel: cancelFunc,
isRunning: make(chan struct{}),
isCanceled: make(chan struct{}),
scheduled: time.Now(),
}

// If no inflight rollback is already running, kick one off
m.inflight[fullPath] = rs
rs.Add(1)
m.inflightAll.Add(1)
rs.scheduled = time.Now()
return rs
}

// startOrLookupRollback is used to start an async rollback attempt.
func (m *RollbackManager) startOrLookupRollback(ctx context.Context, fullPath string, grabStatelock bool) *rollbackState {
m.inflightLock.Lock()
defer m.inflightLock.Unlock()
rs := m.lookupRollbackLocked(fullPath)
if rs != nil {
return rs
}

// If no inflight rollback is already running, kick one off
rs = m.newRollbackLocked(fullPath)

select {
case <-m.doneCh:
// if we've already shut down, then don't submit the task to avoid a panic
@@ -244,18 +264,27 @@ func (m *RollbackManager) finishRollback(rs *rollbackState, err error, fullPath
m.inflightLock.Lock()
defer m.inflightLock.Unlock()
}
delete(m.inflight, fullPath)
if _, ok := m.inflight[fullPath]; ok {
delete(m.inflight, fullPath)
}
}

// attemptRollback invokes a RollbackOperation for the given path
func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string, rs *rollbackState, grabStatelock bool) (err error) {
close(rs.isRunning)
defer m.finishRollback(rs, err, fullPath, true)
select {
case <-rs.isCanceled:
return rollbackCanceled
default:
}

metrics.MeasureSince([]string{"rollback", "waiting"}, rs.scheduled)
metricName := []string{"rollback", "attempt"}
if m.rollbackMetricsMountName {
metricName = append(metricName, strings.ReplaceAll(fullPath, "/", "-"))
}
defer metrics.MeasureSince(metricName, time.Now())
defer m.finishRollback(rs, err, fullPath, true)

ns, err := namespace.FromContext(ctx)
if err != nil {
@@ -286,6 +315,7 @@ func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string,
case <-m.shutdownCh:
case <-rs.cancelLockGrabCtx.Done():
case <-doneCh:
case <-rs.isCanceled:
}
}()

@@ -298,6 +328,8 @@ func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string,
select {
case <-m.shutdownCh:
return errors.New("rollback shutting down")
case <-rs.isCanceled:
return rollbackCanceled
default:
releaseLock = false
}
@@ -339,21 +371,37 @@ func (m *RollbackManager) Rollback(ctx context.Context, path string) error {
}
fullPath := ns.Path + path

// Check for an existing attempt or start one if none
rs := m.startOrLookupRollback(ctx, fullPath, false)
m.inflightLock.Lock()
rs := m.lookupRollbackLocked(fullPath)
if rs != nil {
// Since we have the statelock held, tell any inflight rollback to give up
// trying to acquire it. This will prevent deadlocks in the case where we
// have the write lock. In the case where it was waiting to grab
// a read lock it will then simply continue with the rollback
// operation under the protection of our write lock.
rs.cancelLockGrabCtxCancel()

select {
case <-rs.isRunning:
// if the rollback has started then we should wait for it to complete
m.inflightLock.Unlock()
rs.Wait()
return rs.lastError
default:
}

// if the rollback hasn't started and there's no capacity, we could
// end up deadlocking. Cancel the existing rollback and start a new
// one.
close(rs.isCanceled)
}
rs = m.newRollbackLocked(fullPath)
m.inflightLock.Unlock()

// Since we have the statelock held, tell any inflight rollback to give up
// trying to acquire it. This will prevent deadlocks in the case where we
// have the write lock. In the case where it was waiting to grab
// a read lock it will then simply continue with the rollback
// operation under the protection of our write lock.
rs.cancelLockGrabCtxCancel()
// we can ignore the error, since it's going to be set in rs.lastError
m.attemptRollback(ctx, fullPath, rs, false)

// It's safe to do this, since the other thread either already has the lock
// held, or we just canceled it above.
rs.Wait()

// Return the last error
return rs.lastError
}

2 changes: 2 additions & 0 deletions website/content/docs/release-notes/1.13.0.mdx
@@ -128,6 +128,8 @@ The fix for this UI issue is coming in the Vault 1.13.1 release.

@include 'known-issues/expiration-metrics-fatal-error.mdx'

@include 'known-issues/perf-secondary-many-mounts-deadlock.mdx'

## Feature deprecations and EOL

Please refer to the [Deprecation Plans and Notice](/vault/docs/deprecation) page
1 change: 1 addition & 0 deletions website/content/docs/release-notes/1.14.0.mdx
@@ -24,6 +24,7 @@ All | [API calls to update-primary may lead to data loss](/vault/docs/upgrad
1.14.0+ | [Sublogger levels not adjusted on reload](/vault/docs/upgrading/upgrade-to-1.14.x#sublogger-levels-unchanged-on-reload)
1.14.5 | [Fatal error during expiration metrics gathering causing Vault crash](/vault/docs/upgrading/upgrade-to-1.15.x#fatal-error-during-expiration-metrics-gathering-causing-vault-crash)
1.14.5 | [User lockout potential double logging](/vault/docs/upgrading/upgrade-to-1.14.x#user-lockout-logging)
1.14.5 - 1.14.9 | [Deadlock can occur on performance secondary clusters with many mounts](/vault/docs/upgrading/upgrade-to-1.14.x#deadlock-can-occur-on-performance-secondary-clusters-with-many-mounts)

## Vault companion updates

2 changes: 1 addition & 1 deletion website/content/docs/release-notes/1.15.0.mdx
@@ -25,7 +25,7 @@ description: |-
| 1.15.0+ | [URL change for KV v2 plugin](/vault/docs/upgrading/upgrade-to-1.15.x#kv2-url-change) |
| 1.15.1 | [Fatal error during expiration metrics gathering causing Vault crash](/vault/docs/upgrading/upgrade-to-1.15.x#fatal-error-during-expiration-metrics-gathering-causing-vault-crash) |
| 1.15.0 - 1.15.4 | [Audit devices could log raw data despite configuration](/vault/docs/upgrading/upgrade-to-1.15.x#audit-devices-could-log-raw-data-despite-configuration) |
| 1.15.0 - 1.15.5 | [Deadlock can occur on performance secondary clusters with many mounts](/vault/docs/upgrading/upgrade-to-1.15.x#deadlock-can-occur-on-performance-secondary-clusters-with-many-mounts) |

## Vault companion updates

2 changes: 2 additions & 0 deletions website/content/docs/upgrading/upgrade-to-1.13.x.mdx
@@ -194,3 +194,5 @@ Affects Vault 1.13.0+
@include 'known-issues/sublogger-levels-unchanged-on-reload.mdx'

@include 'known-issues/expiration-metrics-fatal-error.mdx'

@include 'known-issues/perf-secondary-many-mounts-deadlock.mdx'
2 changes: 2 additions & 0 deletions website/content/docs/upgrading/upgrade-to-1.14.x.mdx
@@ -62,3 +62,5 @@ is measuring cumulative time writing, and not the distribution of individual wri
@include 'known-issues/sublogger-levels-unchanged-on-reload.mdx'

@include 'known-issues/expiration-metrics-fatal-error.mdx'

@include 'known-issues/perf-secondary-many-mounts-deadlock.mdx'
2 changes: 2 additions & 0 deletions website/content/docs/upgrading/upgrade-to-1.15.x.mdx
@@ -68,3 +68,5 @@ option.
@include 'known-issues/expiration-metrics-fatal-error.mdx'

@include 'known-issues/1_15_audit-use-of-log-raw-applies-to-all-devices.mdx'

@include 'known-issues/perf-secondary-many-mounts-deadlock.mdx'
34 changes: 34 additions & 0 deletions website/content/partials/known-issues/perf-secondary-many-mounts-deadlock.mdx
@@ -0,0 +1,34 @@
### Deadlock can occur on performance secondary clusters with many mounts

#### Affected versions

- 1.15.0 - 1.15.5
- 1.14.5 - 1.14.9
- 1.13.9 - 1.13.13

#### Issue

Vault 1.15.0, 1.14.5, and 1.13.9 introduced a worker pool to schedule periodic
rollback operations on all mounts. This worker pool defaulted to using 256
workers. The worker pool introduced a risk of deadlocking on the active node of
**performance secondary clusters**, leaving that cluster unable to service any
requests.
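
To gauge whether a cluster is anywhere near the 256-worker default, you can
count the secret and auth mounts on the performance primary. This is a rough
sketch rather than official guidance: it assumes an authenticated `vault` CLI
pointed at the primary and `jq` installed, and the counts include local
mounts, so they overestimate the number of replicated (non-local) mounts.

```shell-session
$ vault secrets list -format=json | jq 'length'
$ vault auth list -format=json | jq 'length'
```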

The following conditions are all required to cause the deadlock on the
performance secondary:

- Performance replication is enabled
- The performance primary cluster has more than 256 non-local mounts. The more
mounts the cluster has, the more likely the deadlock becomes
- One of the following occurs:
  - A replicated mount is unmounted or remounted OR
  - A replicated namespace is deleted OR
  - Replication paths filters are used to filter at least one mount or namespace

#### Workaround

Set the `VAULT_ROLLBACK_WORKERS` environment variable to a number larger than
the number of mounts in your Vault cluster and restart Vault:

```shell-session
$ export VAULT_ROLLBACK_WORKERS=1000
```
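
Note that the `export` above only affects processes started from that shell.
If Vault runs as a system service, the variable needs to be present in the
server process environment when it starts. A hedged sketch, assuming Vault
runs under systemd with a unit named `vault`:

```shell-session
$ sudo systemctl edit vault
# In the editor that opens, add:
#   [Service]
#   Environment=VAULT_ROLLBACK_WORKERS=1000
$ sudo systemctl restart vault
```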
