From a5cd06dc0473c26e3eead2398bda5f97c1ad687d Mon Sep 17 00:00:00 2001 From: hc-github-team-secure-vault-core <82990506+hc-github-team-secure-vault-core@users.noreply.github.com> Date: Mon, 30 Oct 2023 13:21:47 -0400 Subject: [PATCH] Backport of core: fix bug where deadlock detection was always on for expiration and quotas into release/1.14.x (#23904) * backport of commit 66494c8129cddf33eb0cf435b6cb2f76bc47416f * Remove slices package * remove slices --------- Co-authored-by: Jason O'Donnell <2160810+jasonodonnell@users.noreply.github.com> --- changelog/23902.txt | 5 ++ vault/core.go | 46 +++++++++++++++---- vault/core_test.go | 48 ++++++++++++++++++++ vault/expiration.go | 27 +++++++++-- vault/quotas/quotas.go | 28 +++++++++--- vault/quotas/quotas_rate_limit_test.go | 2 +- vault/quotas/quotas_test.go | 6 +-- vault/testing.go | 14 ++++++ website/content/docs/configuration/index.mdx | 9 ++-- 9 files changed, 159 insertions(+), 26 deletions(-) create mode 100644 changelog/23902.txt diff --git a/changelog/23902.txt b/changelog/23902.txt new file mode 100644 index 000000000..cbfec6509 --- /dev/null +++ b/changelog/23902.txt @@ -0,0 +1,5 @@ +```release-note:bug +core: fix bug where deadlock detection was always on for expiration and quotas. +These can now be configured individually with `detect_deadlocks`. +``` + diff --git a/vault/core.go b/vault/core.go index 18cb720a8..56322af80 100644 --- a/vault/core.go +++ b/vault/core.go @@ -706,6 +706,9 @@ type Core struct { // If any role based quota (LCQ or RLQ) is enabled, don't track lease counts by role impreciseLeaseRoleTracking bool + + // Config value for "detect_deadlocks". + detectDeadlocks []string } // c.stateLock needs to be held in read mode before calling this function. 
@@ -963,19 +966,30 @@ func CreateCore(conf *CoreConfig) (*Core, error) { if conf.NumRollbackWorkers == 0 { conf.NumRollbackWorkers = RollbackDefaultNumWorkers } - // Use imported logging deadlock if requested - var stateLock locking.RWMutex - if strings.Contains(conf.DetectDeadlocks, "statelock") { - stateLock = &locking.DeadlockRWMutex{} - } else { - stateLock = &locking.SyncRWMutex{} - } effectiveSDKVersion := conf.EffectiveSDKVersion if effectiveSDKVersion == "" { effectiveSDKVersion = version.GetVersion().Version } + var detectDeadlocks []string + if conf.DetectDeadlocks != "" { + detectDeadlocks = strings.Split(conf.DetectDeadlocks, ",") + for k, v := range detectDeadlocks { + detectDeadlocks[k] = strings.ToLower(strings.TrimSpace(v)) + } + } + + // Use imported logging deadlock if requested + var stateLock locking.RWMutex + stateLock = &locking.SyncRWMutex{} + + for _, v := range detectDeadlocks { + if v == "statelock" { + stateLock = &locking.DeadlockRWMutex{} + } + } + // Setup the core c := &Core{ entCore: entCore{}, @@ -1048,6 +1062,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) { expirationRevokeRetryBase: conf.ExpirationRevokeRetryBase, numRollbackWorkers: conf.NumRollbackWorkers, impreciseLeaseRoleTracking: conf.ImpreciseLeaseRoleTracking, + detectDeadlocks: detectDeadlocks, } c.standbyStopCh.Store(make(chan struct{})) @@ -1229,7 +1244,15 @@ func NewCore(conf *CoreConfig) (*Core, error) { // Quotas quotasLogger := conf.Logger.Named("quotas") - c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink) + + detectDeadlocks := false + for _, v := range c.detectDeadlocks { + if v == "quotas" { + detectDeadlocks = true + } + } + + c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink, detectDeadlocks) if err != nil { return nil, err } @@ -4133,3 +4156,10 @@ func (c *Core) GetRaftAutopilotState(ctx context.Context) (*raft.AutopilotState, func (c *Core) Events() *eventbus.EventBus { return 
c.events } + +func (c *Core) DetectStateLockDeadlocks() bool { + if _, ok := c.stateLock.(*locking.DeadlockRWMutex); ok { + return true + } + return false +} diff --git a/vault/core_test.go b/vault/core_test.go index bdac8dff1..2e442417a 100644 --- a/vault/core_test.go +++ b/vault/core_test.go @@ -3322,3 +3322,51 @@ func InduceDeadlock(t *testing.T, vaultcore *Core, expected uint32) { t.Fatalf("expected 1 deadlock, detected %d", deadlocks) } } + +func TestExpiration_DeadlockDetection(t *testing.T) { + testCore := TestCore(t) + testCoreUnsealed(t, testCore) + + if testCore.expiration.DetectDeadlocks() { + t.Fatal("expiration has deadlock detection enabled, it shouldn't") + } + + testCore = TestCoreWithDeadlockDetection(t, nil, false) + testCoreUnsealed(t, testCore) + + if !testCore.expiration.DetectDeadlocks() { + t.Fatal("expiration doesn't have deadlock detection enabled, it should") + } +} + +func TestQuotas_DeadlockDetection(t *testing.T) { + testCore := TestCore(t) + testCoreUnsealed(t, testCore) + + if testCore.quotaManager.DetectDeadlocks() { + t.Fatal("quotas has deadlock detection enabled, it shouldn't") + } + + testCore = TestCoreWithDeadlockDetection(t, nil, false) + testCoreUnsealed(t, testCore) + + if !testCore.quotaManager.DetectDeadlocks() { + t.Fatal("quotas doesn't have deadlock detection enabled, it should") + } +} + +func TestStatelock_DeadlockDetection(t *testing.T) { + testCore := TestCore(t) + testCoreUnsealed(t, testCore) + + if testCore.DetectStateLockDeadlocks() { + t.Fatal("statelock has deadlock detection enabled, it shouldn't") + } + + testCore = TestCoreWithDeadlockDetection(t, nil, false) + testCoreUnsealed(t, testCore) + + if !testCore.DetectStateLockDeadlocks() { + t.Fatal("statelock doesn't have deadlock detection enabled, it should") + } +} diff --git a/vault/expiration.go b/vault/expiration.go index 1eb3a5a1e..f1872b551 100644 --- a/vault/expiration.go +++ b/vault/expiration.go @@ -114,7 +114,7 @@ type ExpirationManager struct { 
pending sync.Map nonexpiring sync.Map leaseCount int - pendingLock locking.DeadlockRWMutex + pendingLock locking.RWMutex // A sync.Lock for every active leaseID lockPerLease sync.Map @@ -327,7 +327,7 @@ func getNumExpirationWorkers(c *Core, l log.Logger) int { // NewExpirationManager creates a new ExpirationManager that is backed // using a given view, and uses the provided router for revocation. -func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger) *ExpirationManager { +func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger, detectDeadlocks bool) *ExpirationManager { managerLogger := logger.Named("job-manager") jobManager := fairshare.NewJobManager("expire", getNumExpirationWorkers(c, logger), managerLogger, c.metricSink) jobManager.Start() @@ -340,6 +340,7 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log tokenStore: c.tokenStore, logger: logger, pending: sync.Map{}, + pendingLock: &locking.SyncRWMutex{}, nonexpiring: sync.Map{}, leaseCount: 0, tidyLock: new(int32), @@ -375,6 +376,11 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log exp.logger = log.New(&opts) } + if detectDeadlocks { + managerLogger.Debug("enabling deadlock detection") + exp.pendingLock = &locking.DeadlockRWMutex{} + } + go exp.uniquePoliciesGc() return exp @@ -390,7 +396,15 @@ func (c *Core) setupExpiration(e ExpireLeaseStrategy) error { // Create the manager expLogger := c.baseLogger.Named("expiration") - mgr := NewExpirationManager(c, view, e, expLogger) + + detectDeadlocks := false + for _, v := range c.detectDeadlocks { + if v == "expiration" { + detectDeadlocks = true + } + } + + mgr := NewExpirationManager(c, view, e, expLogger, detectDeadlocks) c.expiration = mgr // Link the token store to this @@ -2821,3 +2835,10 @@ func decodeLeaseEntry(buf []byte) (*leaseEntry, error) { out := new(leaseEntry) return out, jsonutil.DecodeJSON(buf, out) } 
+ +func (e *ExpirationManager) DetectDeadlocks() bool { + if _, ok := e.pendingLock.(*locking.DeadlockRWMutex); ok { + return true + } + return false +} diff --git a/vault/quotas/quotas.go b/vault/quotas/quotas.go index c48ca6414..609b01493 100644 --- a/vault/quotas/quotas.go +++ b/vault/quotas/quotas.go @@ -170,13 +170,13 @@ type Manager struct { metricSink *metricsutil.ClusterMetricSink // quotaLock is a lock for manipulating quotas and anything not covered by a more specific lock - quotaLock *locking.DeadlockRWMutex + quotaLock locking.RWMutex // quotaConfigLock is a lock for accessing config items, such as RateLimitExemptPaths - quotaConfigLock *locking.DeadlockRWMutex + quotaConfigLock locking.RWMutex // dbAndCacheLock is a lock for db and path caches that need to be reset during Reset() - dbAndCacheLock *locking.DeadlockRWMutex + dbAndCacheLock locking.RWMutex } // QuotaLeaseInformation contains all of the information lease-count quotas require @@ -272,7 +272,7 @@ type Request struct { // NewManager creates and initializes a new quota manager to hold all the quota // rules and to process incoming requests. 
-func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink) (*Manager, error) { +func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink, detectDeadlocks bool) (*Manager, error) { db, err := memdb.NewMemDB(dbSchema()) if err != nil { return nil, err @@ -284,9 +284,16 @@ func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.Clust metricSink: ms, rateLimitPathManager: pathmanager.New(), config: new(Config), - quotaLock: new(locking.DeadlockRWMutex), - quotaConfigLock: new(locking.DeadlockRWMutex), - dbAndCacheLock: new(locking.DeadlockRWMutex), + quotaLock: &locking.SyncRWMutex{}, + quotaConfigLock: &locking.SyncRWMutex{}, + dbAndCacheLock: &locking.SyncRWMutex{}, + } + + if detectDeadlocks { + logger.Debug("enabling deadlock detection") + manager.quotaLock = &locking.DeadlockRWMutex{} + manager.quotaConfigLock = &locking.DeadlockRWMutex{} + manager.dbAndCacheLock = &locking.DeadlockRWMutex{} } manager.init(walkFunc) @@ -1302,3 +1309,10 @@ func (m *Manager) HandleBackendDisabling(ctx context.Context, nsPath, mountPath return nil } + +func (m *Manager) DetectDeadlocks() bool { + if _, ok := m.quotaLock.(*locking.DeadlockRWMutex); ok { + return true + } + return false +} diff --git a/vault/quotas/quotas_rate_limit_test.go b/vault/quotas/quotas_rate_limit_test.go index ff5711acf..2d034efcb 100644 --- a/vault/quotas/quotas_rate_limit_test.go +++ b/vault/quotas/quotas_rate_limit_test.go @@ -218,7 +218,7 @@ func TestRateLimitQuota_Allow_WithBlock(t *testing.T) { func TestRateLimitQuota_Update(t *testing.T) { defer goleak.VerifyNone(t) - qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink()) + qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true) require.NoError(t, err) quota := NewRateLimitQuota("quota1", "", "", "", "", 10, time.Second, 0) diff --git a/vault/quotas/quotas_test.go 
b/vault/quotas/quotas_test.go index 6ab12a20f..0ef483616 100644 --- a/vault/quotas/quotas_test.go +++ b/vault/quotas/quotas_test.go @@ -16,7 +16,7 @@ import ( ) func TestQuotas_MountPathOverwrite(t *testing.T) { - qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink()) + qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true) require.NoError(t, err) quota := NewRateLimitQuota("tq", "", "kv1/", "", "", 10, time.Second, 0) @@ -43,7 +43,7 @@ func TestQuotas_MountPathOverwrite(t *testing.T) { } func TestQuotas_Precedence(t *testing.T) { - qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink()) + qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true) require.NoError(t, err) setQuotaFunc := func(t *testing.T, name, nsPath, mountPath, pathSuffix, role string) Quota { @@ -124,7 +124,7 @@ func TestQuotas_QueryResolveRole_RateLimitQuotas(t *testing.T) { leaseWalkFunc := func(context.Context, func(request *Request) bool) error { return nil } - qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink()) + qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink(), true) require.NoError(t, err) rlqReq := &Request{ diff --git a/vault/testing.go b/vault/testing.go index 649268a1b..162e7dfa6 100644 --- a/vault/testing.go +++ b/vault/testing.go @@ -141,6 +141,20 @@ func TestCoreWithSeal(t testing.T, testSeal Seal, enableRaw bool) *Core { return TestCoreWithSealAndUI(t, conf) } +func TestCoreWithDeadlockDetection(t testing.T, testSeal Seal, enableRaw bool) *Core { + conf := &CoreConfig{ + Seal: testSeal, + EnableUI: false, + EnableRaw: enableRaw, + BuiltinRegistry: corehelpers.NewMockBuiltinRegistry(), + AuditBackends: map[string]audit.Factory{ + "file": auditFile.Factory, + }, + DetectDeadlocks: "expiration,quotas,statelock", + } + return 
TestCoreWithSealAndUI(t, conf) +} + func TestCoreWithCustomResponseHeaderAndUI(t testing.T, CustomResponseHeaders map[string]map[string]string, enableUI bool) (*Core, [][]byte, string) { confRaw := &server.Config{ SharedConfig: &configutil.SharedConfig{ diff --git a/website/content/docs/configuration/index.mdx b/website/content/docs/configuration/index.mdx index 1ae7f8d89..cd414e606 100644 --- a/website/content/docs/configuration/index.mdx +++ b/website/content/docs/configuration/index.mdx @@ -159,10 +159,11 @@ to specify where the configuration is. maximum request duration allowed before Vault cancels the request. This can be overridden per listener via the `max_request_duration` value. -- `detect_deadlocks` `(string: "")` - Specifies the internal mutex locks that should be monitored for -potential deadlocks. Currently supported value is `statelock`, which will cause "POTENTIAL DEADLOCK:" -to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this can have -a negative effect on performance due to the tracking of each lock attempt. +- `detect_deadlocks` `(string: "")` - A comma-separated string that specifies the internal +mutex locks that should be monitored for potential deadlocks. Currently supported values +include `statelock`, `quotas`, and `expiration`, which will cause "POTENTIAL DEADLOCK:" +to be logged when an attempt at one of these locks appears to be deadlocked. Enabling this +can have a negative effect on performance due to the tracking of each lock attempt. - `raw_storage_endpoint` `(bool: false)` – Enables the `sys/raw` endpoint which allows the decryption/encryption of raw data into and out of the security