Backport of VAULT-23926: Fix rollback deadlock on perf secondaries into release/1.16.x #25554

Merged
3 changes: 3 additions & 0 deletions changelog/25448.txt
@@ -0,0 +1,3 @@
```release-note:bug
core (enterprise): Fix a deadlock that can occur on performance secondary clusters when there are many mounts and a mount is deleted or filtered
```
102 changes: 75 additions & 27 deletions vault/rollback.go
@@ -25,6 +25,8 @@ const (
RollbackWorkersEnvVar = "VAULT_ROLLBACK_WORKERS"
)

var rollbackCanceled = errors.New("rollback attempt canceled")

// RollbackManager is responsible for performing rollbacks of partial
// secrets within logical backends.
//
@@ -74,7 +76,9 @@ type rollbackState struct {
cancelLockGrabCtxCancel context.CancelFunc
// scheduled is the time that this job was created and submitted to the
// rollbackRunner
scheduled time.Time
scheduled time.Time
isRunning chan struct{}
isCanceled chan struct{}
}

// NewRollbackManager is used to create a new rollback manager
@@ -192,29 +196,45 @@ func (m *RollbackManager) triggerRollbacks() {
}
}

// startOrLookupRollback is used to start an async rollback attempt.
// This must be called with the inflightLock held.
func (m *RollbackManager) startOrLookupRollback(ctx context.Context, fullPath string, grabStatelock bool) *rollbackState {
m.inflightLock.Lock()
defer m.inflightLock.Unlock()
// lookupRollbackLocked checks if there's an inflight rollback with the given
// path. Callers must have the inflightLock. The function also reports metrics,
// since it is regularly called as part of the scheduled rollbacks.
func (m *RollbackManager) lookupRollbackLocked(fullPath string) *rollbackState {
defer metrics.SetGauge([]string{"rollback", "queued"}, float32(m.runner.WaitingQueueSize()))
defer metrics.SetGauge([]string{"rollback", "inflight"}, float32(len(m.inflight)))
rsInflight, ok := m.inflight[fullPath]
if ok {
return rsInflight
}
rsInflight := m.inflight[fullPath]
return rsInflight
}

// newRollbackLocked creates a new rollback state and adds it to the inflight
// rollback map. Callers must have the inflightLock
func (m *RollbackManager) newRollbackLocked(fullPath string) *rollbackState {
cancelCtx, cancelFunc := context.WithCancel(context.Background())
rs := &rollbackState{
cancelLockGrabCtx: cancelCtx,
cancelLockGrabCtxCancel: cancelFunc,
isRunning: make(chan struct{}),
isCanceled: make(chan struct{}),
scheduled: time.Now(),
}

// If no inflight rollback is already running, kick one off
m.inflight[fullPath] = rs
rs.Add(1)
m.inflightAll.Add(1)
rs.scheduled = time.Now()
return rs
}

// startOrLookupRollback is used to start an async rollback attempt.
func (m *RollbackManager) startOrLookupRollback(ctx context.Context, fullPath string, grabStatelock bool) *rollbackState {
m.inflightLock.Lock()
defer m.inflightLock.Unlock()
rs := m.lookupRollbackLocked(fullPath)
if rs != nil {
return rs
}

// If no inflight rollback is already running, kick one off
rs = m.newRollbackLocked(fullPath)

select {
case <-m.doneCh:
// if we've already shut down, then don't submit the task to avoid a panic
@@ -244,18 +264,27 @@ func (m *RollbackManager) finishRollback(rs *rollbackState, err error, fullPath
m.inflightLock.Lock()
defer m.inflightLock.Unlock()
}
delete(m.inflight, fullPath)
if _, ok := m.inflight[fullPath]; ok {
delete(m.inflight, fullPath)
}
}

// attemptRollback invokes a RollbackOperation for the given path
func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string, rs *rollbackState, grabStatelock bool) (err error) {
close(rs.isRunning)
defer m.finishRollback(rs, err, fullPath, true)
select {
case <-rs.isCanceled:
return rollbackCanceled
default:
}

metrics.MeasureSince([]string{"rollback", "waiting"}, rs.scheduled)
metricName := []string{"rollback", "attempt"}
if m.rollbackMetricsMountName {
metricName = append(metricName, strings.ReplaceAll(fullPath, "/", "-"))
}
defer metrics.MeasureSince(metricName, time.Now())
defer m.finishRollback(rs, err, fullPath, true)

ns, err := namespace.FromContext(ctx)
if err != nil {
@@ -286,6 +315,7 @@ func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string,
case <-m.shutdownCh:
case <-rs.cancelLockGrabCtx.Done():
case <-doneCh:
case <-rs.isCanceled:
}
}()

@@ -298,6 +328,8 @@ func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string,
select {
case <-m.shutdownCh:
return errors.New("rollback shutting down")
case <-rs.isCanceled:
return rollbackCanceled
default:
releaseLock = false
}
@@ -339,21 +371,37 @@ func (m *RollbackManager) Rollback(ctx context.Context, path string) error {
}
fullPath := ns.Path + path

// Check for an existing attempt or start one if none
rs := m.startOrLookupRollback(ctx, fullPath, false)
m.inflightLock.Lock()
rs := m.lookupRollbackLocked(fullPath)
if rs != nil {
// Since we have the statelock held, tell any inflight rollback to give up
// trying to acquire it. This will prevent deadlocks in the case where we
// have the write lock. In the case where it was waiting to grab
// a read lock it will then simply continue with the rollback
// operation under the protection of our write lock.
rs.cancelLockGrabCtxCancel()

select {
case <-rs.isRunning:
// if the rollback has started then we should wait for it to complete
m.inflightLock.Unlock()
rs.Wait()
return rs.lastError
default:
}

// if the rollback hasn't started and there's no capacity, we could
// end up deadlocking. Cancel the existing rollback and start a new
// one.
close(rs.isCanceled)
}
rs = m.newRollbackLocked(fullPath)
m.inflightLock.Unlock()

// Since we have the statelock held, tell any inflight rollback to give up
// trying to acquire it. This will prevent deadlocks in the case where we
// have the write lock. In the case where it was waiting to grab
// a read lock it will then simply continue with the rollback
// operation under the protection of our write lock.
rs.cancelLockGrabCtxCancel()
// we can ignore the error, since it's going to be set in rs.lastError
m.attemptRollback(ctx, fullPath, rs, false)

// It's safe to do this, since the other thread either already has the lock
// held, or we just canceled it above.
rs.Wait()

// Return the last error
return rs.lastError
}

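For reviewers who want the shape of the fix without reading the full diff, here is a minimal standalone sketch of the `isRunning`/`isCanceled` handshake that `rollback.go` now uses. The names and types below are simplified illustrations of the pattern, not Vault's actual implementation:

```go
package main

import (
	"errors"
	"fmt"
	"sync"
)

var errCanceled = errors.New("attempt canceled")

type task struct {
	sync.WaitGroup
	isRunning  chan struct{}
	isCanceled chan struct{}
	lastError  error
}

func newTask() *task {
	t := &task{
		isRunning:  make(chan struct{}),
		isCanceled: make(chan struct{}),
	}
	t.Add(1)
	return t
}

// run is what a pool worker would execute for a queued attempt.
func (t *task) run(work func() error) {
	defer t.Done()
	close(t.isRunning) // signal that a worker has picked the task up
	select {
	case <-t.isCanceled:
		t.lastError = errCanceled // the caller gave up on this queued copy
		return
	default:
	}
	t.lastError = work()
}

// waitOrReplace mirrors the shape of RollbackManager.Rollback: if the queued
// task has already started, wait for it; otherwise cancel it and run the work
// inline so the caller never blocks on a worker that may never become free.
func waitOrReplace(t *task, work func() error) error {
	select {
	case <-t.isRunning:
		t.Wait()
		return t.lastError
	default:
		close(t.isCanceled) // the queued copy will see this and bail out
	}
	return work()
}

func main() {
	t := newTask()
	go t.run(func() error { return nil }) // a worker may or may not start first
	fmt.Println(waitOrReplace(t, func() error { return nil }))
}
```
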
2 changes: 2 additions & 0 deletions website/content/docs/release-notes/1.13.0.mdx
@@ -128,6 +128,8 @@ The fix for this UI issue is coming in the Vault 1.13.1 release.

@include 'known-issues/expiration-metrics-fatal-error.mdx'

@include 'known-issues/perf-secondary-many-mounts-deadlock.mdx'

## Feature deprecations and EOL

Please refer to the [Deprecation Plans and Notice](/vault/docs/deprecation) page
1 change: 1 addition & 0 deletions website/content/docs/release-notes/1.14.0.mdx
@@ -24,6 +24,7 @@ All | [API calls to update-primary may lead to data loss](/vault/docs/upgrad
1.14.0+ | [Sublogger levels not adjusted on reload](/vault/docs/upgrading/upgrade-to-1.14.x#sublogger-levels-unchanged-on-reload)
1.14.5 | [Fatal error during expiration metrics gathering causing Vault crash](/vault/docs/upgrading/upgrade-to-1.15.x#fatal-error-during-expiration-metrics-gathering-causing-vault-crash)
1.14.5 | [User lockout potential double logging](/vault/docs/upgrading/upgrade-to-1.14.x#user-lockout-logging)
1.14.5 - 1.14.9 | [Deadlock can occur on performance secondary clusters with many mounts](/vault/docs/upgrading/upgrade-to-1.14.x#deadlock-can-occur-on-performance-secondary-clusters-with-many-mounts)

## Vault companion updates

2 changes: 1 addition & 1 deletion website/content/docs/release-notes/1.15.0.mdx
@@ -25,7 +25,7 @@ description: |-
| 1.15.0+ | [URL change for KV v2 plugin](/vault/docs/upgrading/upgrade-to-1.15.x#kv2-url-change) |
| 1.15.1 | [Fatal error during expiration metrics gathering causing Vault crash](/vault/docs/upgrading/upgrade-to-1.15.x#fatal-error-during-expiration-metrics-gathering-causing-vault-crash) |
| 1.15.0 - 1.15.4 | [Audit devices could log raw data despite configuration](/vault/docs/upgrading/upgrade-to-1.15.x#audit-devices-could-log-raw-data-despite-configuration) |

| 1.15.0 - 1.15.5 | [Deadlock can occur on performance secondary clusters with many mounts](/vault/docs/upgrading/upgrade-to-1.15.x#deadlock-can-occur-on-performance-secondary-clusters-with-many-mounts) |

## Vault companion updates

2 changes: 2 additions & 0 deletions website/content/docs/upgrading/upgrade-to-1.13.x.mdx
@@ -194,3 +194,5 @@ Affects Vault 1.13.0+
@include 'known-issues/sublogger-levels-unchanged-on-reload.mdx'

@include 'known-issues/expiration-metrics-fatal-error.mdx'

@include 'known-issues/perf-secondary-many-mounts-deadlock.mdx'
2 changes: 2 additions & 0 deletions website/content/docs/upgrading/upgrade-to-1.14.x.mdx
@@ -62,3 +62,5 @@ is measuring cumulative time writing, and not the distribution of individual wri
@include 'known-issues/sublogger-levels-unchanged-on-reload.mdx'

@include 'known-issues/expiration-metrics-fatal-error.mdx'

@include 'known-issues/perf-secondary-many-mounts-deadlock.mdx'
2 changes: 2 additions & 0 deletions website/content/docs/upgrading/upgrade-to-1.15.x.mdx
@@ -68,3 +68,5 @@ option.
@include 'known-issues/expiration-metrics-fatal-error.mdx'

@include 'known-issues/1_15_audit-use-of-log-raw-applies-to-all-devices.mdx'

@include 'known-issues/perf-secondary-many-mounts-deadlock.mdx'
@@ -0,0 +1,34 @@
### Deadlock can occur on performance secondary clusters with many mounts

#### Affected versions

- 1.15.0 - 1.15.5
- 1.14.5 - 1.14.9
- 1.13.9 - 1.13.13

#### Issue

Vault 1.15.0, 1.14.5, and 1.13.9 introduced a worker pool to schedule periodic
rollback operations on all mounts. This worker pool defaulted to using 256
workers. The worker pool introduced a risk of deadlocking on the active node of
**performance secondary clusters**, leaving that cluster unable to service any
requests.
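
To make the failure mode concrete, the sketch below (illustrative Go, not Vault's implementation) shows how a fixed-size worker pool combined with a shared read/write lock can deadlock once the lock holder depends on a queued job. The workaround below sidesteps this by making the pool larger than the number of mounts that can queue rollbacks at once.

```go
// Illustrative only (not Vault code): a bounded worker pool plus a shared
// sync.RWMutex can wedge when the writer holds the lock and waits for a
// queued job while every worker is blocked waiting for a read lock.
package main

import "sync"

func main() {
	var state sync.RWMutex
	jobs := make(chan func(), 64)

	const workers = 2 // stand-in for the 256 rollback workers
	for i := 0; i < workers; i++ {
		go func() {
			for job := range jobs {
				job()
			}
		}()
	}

	state.Lock() // writer, e.g. handling a deleted or filtered mount

	// Occupy every worker with a job that needs a read lock on the state.
	for i := 0; i < workers; i++ {
		jobs <- func() {
			state.RLock() // blocks: the writer above holds the lock
			state.RUnlock()
		}
	}

	// The writer now waits on one more queued job, but no worker is free,
	// and no worker can finish until the writer releases the lock.
	done := make(chan struct{})
	jobs <- func() { close(done) }
	<-done // hangs; the Go runtime reports a deadlock once every goroutine blocks
	state.Unlock()
}
```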

The conditions required to cause the deadlock on the performance secondary:

- Performance replication is enabled
- The performance primary cluster has more than 256 non-local mounts. The more
mounts the cluster has, the more likely the deadlock becomes
- One of the following occurs:
- A replicated mount is unmounted or remounted OR
- A replicated namespace is deleted OR
- Replication paths filters are used to filter at least one mount or namespace

#### Workaround

Set the `VAULT_ROLLBACK_WORKERS` environment variable to a number larger than
the number of mounts in your Vault cluster and restart Vault:

```shell-session
$ export VAULT_ROLLBACK_WORKERS=1000
```