Commit 66494c8

core: fix bug where deadlock detection was always on for expiration and quotas (#23902)

* server: fix bug where deadlock detection was on for expiration and quotas

* trim spaces

* Add tests

* Use trimspace and lower

* Update test

* changelog

* fix config parsing
jasonodonnell authored Oct 30, 2023
1 parent 26bae55 commit 66494c8
Showing 9 changed files with 147 additions and 26 deletions.
5 changes: 5 additions & 0 deletions changelog/23902.txt
@@ -0,0 +1,5 @@
```release-note:bug
core: fix bug where deadlock detection was always on for expiration and quotas.
These can now be configured individually with `detect_deadlocks`.
```
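For illustration, a hedged sketch of what this looks like from Go, using the `DetectDeadlocks` string this commit threads through `CoreConfig` (the value shown is an example; all other required `CoreConfig` wiring is elided):

```go
// Sketch only: enable deadlock detection for just the expiration and
// quota managers, leaving the core state lock on the plain sync.RWMutex.
// Every other required CoreConfig field (storage, seal, logger, ...) is
// elided here.
conf := &vault.CoreConfig{
	DetectDeadlocks: "expiration,quotas",
}
_ = conf // passed to vault.NewCore in real wiring
```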

39 changes: 31 additions & 8 deletions vault/core.go
@@ -20,6 +20,7 @@ import (
"os"
"path/filepath"
"runtime"
"slices"
"strconv"
"strings"
"sync"
@@ -690,6 +691,9 @@ type Core struct {

// If any role based quota (LCQ or RLQ) is enabled, don't track lease counts by role
impreciseLeaseRoleTracking bool

// Config value for "detect_deadlocks".
detectDeadlocks []string
}

// c.stateLock needs to be held in read mode before calling this function.
@@ -947,19 +951,28 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
if conf.NumRollbackWorkers == 0 {
conf.NumRollbackWorkers = RollbackDefaultNumWorkers
}
- // Use imported logging deadlock if requested
- var stateLock locking.RWMutex
- if strings.Contains(conf.DetectDeadlocks, "statelock") {
- stateLock = &locking.DeadlockRWMutex{}
- } else {
- stateLock = &locking.SyncRWMutex{}
- }

effectiveSDKVersion := conf.EffectiveSDKVersion
if effectiveSDKVersion == "" {
effectiveSDKVersion = version.GetVersion().Version
}

+ var detectDeadlocks []string
+ if conf.DetectDeadlocks != "" {
+ detectDeadlocks = strings.Split(conf.DetectDeadlocks, ",")
+ for k, v := range detectDeadlocks {
+ detectDeadlocks[k] = strings.ToLower(strings.TrimSpace(v))
+ }
+ }
+
+ // Use imported logging deadlock if requested
+ var stateLock locking.RWMutex
+ stateLock = &locking.SyncRWMutex{}
+
+ if slices.Contains(detectDeadlocks, "statelock") {
+ stateLock = &locking.DeadlockRWMutex{}
+ }

// Setup the core
c := &Core{
entCore: entCore{},
@@ -1033,6 +1046,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
rollbackMountPathMetrics: conf.MetricSink.TelemetryConsts.RollbackMetricsIncludeMountPoint,
numRollbackWorkers: conf.NumRollbackWorkers,
impreciseLeaseRoleTracking: conf.ImpreciseLeaseRoleTracking,
detectDeadlocks: detectDeadlocks,
}

c.standbyStopCh.Store(make(chan struct{}))
@@ -1219,7 +1233,9 @@ func NewCore(conf *CoreConfig) (*Core, error) {

// Quotas
quotasLogger := conf.Logger.Named("quotas")
- c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink)
+
+ detectDeadlocks := slices.Contains(c.detectDeadlocks, "quotas")
+ c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink, detectDeadlocks)
if err != nil {
return nil, err
}
@@ -4188,3 +4204,10 @@ func (c *Core) GetRaftAutopilotState(ctx context.Context) (*raft.AutopilotState,
func (c *Core) Events() *eventbus.EventBus {
return c.events
}

func (c *Core) DetectStateLockDeadlocks() bool {
if _, ok := c.stateLock.(*locking.DeadlockRWMutex); ok {
return true
}
return false
}
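The swap works because both mutex types satisfy a common interface from `sdk/helper/locking`; the following is a simplified sketch of that package's shape, an assumption for illustration rather than the actual source (`deadlock` is `github.com/sasha-s/go-deadlock`):

```go
package locking

import (
	"sync"

	"github.com/sasha-s/go-deadlock"
)

// RWMutex is the common interface both implementations satisfy, letting
// callers pick an implementation once at construction time.
type RWMutex interface {
	Lock()
	Unlock()
	RLock()
	RUnlock()
}

// SyncRWMutex is the default: a plain sync.RWMutex with no tracking overhead.
type SyncRWMutex struct {
	sync.RWMutex
}

// DeadlockRWMutex wraps go-deadlock's RWMutex, which logs
// "POTENTIAL DEADLOCK:" when a lock attempt appears stuck.
type DeadlockRWMutex struct {
	deadlock.RWMutex
}
```

Because `Core`, `ExpirationManager`, and `Manager` now hold the interface type, the accessors added in this commit can recover which implementation was chosen with a simple type assertion.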
48 changes: 48 additions & 0 deletions vault/core_test.go
@@ -3361,3 +3361,51 @@ func InduceDeadlock(t *testing.T, vaultcore *Core, expected uint32) {
t.Fatalf("expected 1 deadlock, detected %d", deadlocks)
}
}

func TestExpiration_DeadlockDetection(t *testing.T) {
testCore := TestCore(t)
testCoreUnsealed(t, testCore)

if testCore.expiration.DetectDeadlocks() {
t.Fatal("expiration has deadlock detection enabled, it shouldn't")
}

testCore = TestCoreWithDeadlockDetection(t, nil, false)
testCoreUnsealed(t, testCore)

if !testCore.expiration.DetectDeadlocks() {
t.Fatal("expiration doesn't have deadlock detection enabled, it should")
}
}

func TestQuotas_DeadlockDetection(t *testing.T) {
testCore := TestCore(t)
testCoreUnsealed(t, testCore)

if testCore.quotaManager.DetectDeadlocks() {
t.Fatal("quotas has deadlock detection enabled, it shouldn't")
}

testCore = TestCoreWithDeadlockDetection(t, nil, false)
testCoreUnsealed(t, testCore)

if !testCore.quotaManager.DetectDeadlocks() {
t.Fatal("quotas doesn't have deadlock detection enabled, it should")
}
}

func TestStatelock_DeadlockDetection(t *testing.T) {
testCore := TestCore(t)
testCoreUnsealed(t, testCore)

if testCore.DetectStateLockDeadlocks() {
t.Fatal("statelock has deadlock detection enabled, it shouldn't")
}

testCore = TestCoreWithDeadlockDetection(t, nil, false)
testCoreUnsealed(t, testCore)

if !testCore.DetectStateLockDeadlocks() {
t.Fatal("statelock doesn't have deadlock detection enabled, it should")
}
}
22 changes: 19 additions & 3 deletions vault/expiration.go
Expand Up @@ -11,6 +11,7 @@ import (
"math/rand"
"os"
"path"
"slices"
"sort"
"strconv"
"strings"
@@ -114,7 +115,7 @@ type ExpirationManager struct {
pending sync.Map
nonexpiring sync.Map
leaseCount int
- pendingLock locking.DeadlockRWMutex
+ pendingLock locking.RWMutex

// A sync.Lock for every active leaseID
lockPerLease sync.Map
@@ -327,7 +328,7 @@ func getNumExpirationWorkers(c *Core, l log.Logger) int {

// NewExpirationManager creates a new ExpirationManager that is backed
// using a given view, and uses the provided router for revocation.
- func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger) *ExpirationManager {
+ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger, detectDeadlocks bool) *ExpirationManager {
managerLogger := logger.Named("job-manager")
jobManager := fairshare.NewJobManager("expire", getNumExpirationWorkers(c, logger), managerLogger, c.metricSink)
jobManager.Start()
@@ -340,6 +341,7 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log
tokenStore: c.tokenStore,
logger: logger,
pending: sync.Map{},
pendingLock: &locking.SyncRWMutex{},
nonexpiring: sync.Map{},
leaseCount: 0,
tidyLock: new(int32),
@@ -375,6 +377,11 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log
exp.logger = log.New(&opts)
}

if detectDeadlocks {
managerLogger.Debug("enabling deadlock detection")
exp.pendingLock = &locking.DeadlockRWMutex{}
}

go exp.uniquePoliciesGc()

return exp
@@ -390,7 +397,9 @@ func (c *Core) setupExpiration(e ExpireLeaseStrategy) error {

// Create the manager
expLogger := c.baseLogger.Named("expiration")
- mgr := NewExpirationManager(c, view, e, expLogger)
+
+ detectDeadlocks := slices.Contains(c.detectDeadlocks, "expiration")
+ mgr := NewExpirationManager(c, view, e, expLogger, detectDeadlocks)
c.expiration = mgr

// Link the token store to this
@@ -2821,3 +2830,10 @@ func decodeLeaseEntry(buf []byte) (*leaseEntry, error) {
out := new(leaseEntry)
return out, jsonutil.DecodeJSON(buf, out)
}

func (e *ExpirationManager) DetectDeadlocks() bool {
if _, ok := e.pendingLock.(*locking.DeadlockRWMutex); ok {
return true
}
return false
}
28 changes: 21 additions & 7 deletions vault/quotas/quotas.go
@@ -170,13 +170,13 @@ type Manager struct {
metricSink *metricsutil.ClusterMetricSink

// quotaLock is a lock for manipulating quotas and anything not covered by a more specific lock
- quotaLock *locking.DeadlockRWMutex
+ quotaLock locking.RWMutex

// quotaConfigLock is a lock for accessing config items, such as RateLimitExemptPaths
- quotaConfigLock *locking.DeadlockRWMutex
+ quotaConfigLock locking.RWMutex

// dbAndCacheLock is a lock for db and path caches that need to be reset during Reset()
- dbAndCacheLock *locking.DeadlockRWMutex
+ dbAndCacheLock locking.RWMutex
}

// QuotaLeaseInformation contains all of the information lease-count quotas require
@@ -275,7 +275,7 @@ type Request struct {

// NewManager creates and initializes a new quota manager to hold all the quota
// rules and to process incoming requests.
- func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink) (*Manager, error) {
+ func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink, detectDeadlocks bool) (*Manager, error) {
db, err := memdb.NewMemDB(dbSchema())
if err != nil {
return nil, err
@@ -287,9 +287,16 @@ func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.Clust
metricSink: ms,
rateLimitPathManager: pathmanager.New(),
config: new(Config),
- quotaLock: new(locking.DeadlockRWMutex),
- quotaConfigLock: new(locking.DeadlockRWMutex),
- dbAndCacheLock: new(locking.DeadlockRWMutex),
+ quotaLock: &locking.SyncRWMutex{},
+ quotaConfigLock: &locking.SyncRWMutex{},
+ dbAndCacheLock: &locking.SyncRWMutex{},
}

if detectDeadlocks {
logger.Debug("enabling deadlock detection")
manager.quotaLock = &locking.DeadlockRWMutex{}
manager.quotaConfigLock = &locking.DeadlockRWMutex{}
manager.dbAndCacheLock = &locking.DeadlockRWMutex{}
}

manager.init(walkFunc)
@@ -1319,3 +1326,10 @@ func (m *Manager) HandleBackendDisabling(ctx context.Context, nsPath, mountPath

return nil
}

func (m *Manager) DetectDeadlocks() bool {
if _, ok := m.quotaLock.(*locking.DeadlockRWMutex); ok {
return true
}
return false
}
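Callers now opt in through the new constructor parameter; a minimal sketch of a test exercising it (the test name is hypothetical; the call mirrors the updated quota test call sites below):

```go
// Sketch: construct a quota manager with deadlock detection enabled and
// confirm the setting took effect via the new accessor.
func TestNewManager_DeadlockDetection(t *testing.T) {
	qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
	if err != nil {
		t.Fatal(err)
	}
	if !qm.DetectDeadlocks() {
		t.Fatal("expected quota deadlock detection to be enabled")
	}
}
```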
2 changes: 1 addition & 1 deletion vault/quotas/quotas_rate_limit_test.go
@@ -218,7 +218,7 @@ func TestRateLimitQuota_Allow_WithBlock(t *testing.T) {

func TestRateLimitQuota_Update(t *testing.T) {
defer goleak.VerifyNone(t)
- qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
+ qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
require.NoError(t, err)

quota := NewRateLimitQuota("quota1", "", "", "", "", false, time.Second, 0, 10)
6 changes: 3 additions & 3 deletions vault/quotas/quotas_test.go
@@ -16,7 +16,7 @@ import (
)

func TestQuotas_MountPathOverwrite(t *testing.T) {
- qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
+ qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
require.NoError(t, err)

quota := NewRateLimitQuota("tq", "", "kv1/", "", "", false, time.Second, 0, 10)
@@ -43,7 +43,7 @@ }
}

func TestQuotas_Precedence(t *testing.T) {
- qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
+ qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
require.NoError(t, err)

setQuotaFunc := func(t *testing.T, name, nsPath, mountPath, pathSuffix, role string, inheritable bool) Quota {
@@ -142,7 +142,7 @@ func TestQuotas_QueryResolveRole_RateLimitQuotas(t *testing.T) {
leaseWalkFunc := func(context.Context, func(request *Request) bool) error {
return nil
}
- qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink())
+ qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink(), true)
require.NoError(t, err)

rlqReq := &Request{
14 changes: 14 additions & 0 deletions vault/testing.go
@@ -142,6 +142,20 @@ func TestCoreWithSeal(t testing.T, testSeal Seal, enableRaw bool) *Core {
return TestCoreWithSealAndUI(t, conf)
}

func TestCoreWithDeadlockDetection(t testing.T, testSeal Seal, enableRaw bool) *Core {
conf := &CoreConfig{
Seal: testSeal,
EnableUI: false,
EnableRaw: enableRaw,
BuiltinRegistry: corehelpers.NewMockBuiltinRegistry(),
AuditBackends: map[string]audit.Factory{
"file": auditFile.Factory,
},
DetectDeadlocks: "expiration,quotas,statelock",
}
return TestCoreWithSealAndUI(t, conf)
}

func TestCoreWithCustomResponseHeaderAndUI(t testing.T, CustomResponseHeaders map[string]map[string]string, enableUI bool) (*Core, [][]byte, string) {
confRaw := &server.Config{
SharedConfig: &configutil.SharedConfig{
9 changes: 5 additions & 4 deletions website/content/docs/configuration/index.mdx
@@ -159,10 +159,11 @@ to specify where the configuration is.
maximum request duration allowed before Vault cancels the request. This can
be overridden per listener via the `max_request_duration` value.

- - `detect_deadlocks` `(string: "")` - Specifies the internal mutex locks that should be monitored for
- potential deadlocks. Currently supported value is `statelock`, which will cause "POTENTIAL DEADLOCK:"
- to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this can have
- a negative effect on performance due to the tracking of each lock attempt.
+ - `detect_deadlocks` `(string: "")` - A comma-separated string that specifies the internal
+ mutex locks that should be monitored for potential deadlocks. Currently supported values
+ are `statelock`, `quotas`, and `expiration`. When an attempt to acquire a monitored lock
+ appears to be deadlocked, "POTENTIAL DEADLOCK:" is logged. Enabling this can have a
+ negative effect on performance due to the tracking of each lock attempt.

- `raw_storage_endpoint` `(bool: false)` – Enables the `sys/raw` endpoint which
allows the decryption/encryption of raw data into and out of the security
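The parsing added in `CreateCore` makes these values whitespace- and case-insensitive (see the "trim spaces" and "Use trimspace and lower" commits above); a standalone sketch of that normalization (the helper name is hypothetical):

```go
package main

import (
	"fmt"
	"slices"
	"strings"
)

// normalizeDetectDeadlocks mirrors the parsing this commit adds in
// CreateCore: split on commas, then trim whitespace and lowercase
// each entry.
func normalizeDetectDeadlocks(raw string) []string {
	if raw == "" {
		return nil
	}
	parts := strings.Split(raw, ",")
	for i, p := range parts {
		parts[i] = strings.ToLower(strings.TrimSpace(p))
	}
	return parts
}

func main() {
	opts := normalizeDetectDeadlocks("StateLock, Quotas ,expiration")
	fmt.Println(slices.Contains(opts, "statelock"))  // true
	fmt.Println(slices.Contains(opts, "quotas"))     // true
	fmt.Println(slices.Contains(opts, "expiration")) // true
}
```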
