diff --git a/changelog/18604.txt b/changelog/18604.txt new file mode 100644 index 000000000..7645cbb40 --- /dev/null +++ b/changelog/18604.txt @@ -0,0 +1,3 @@ +```release-note:improvement +core: add `detect_deadlocks` config to optionally detect core state deadlocks +``` \ No newline at end of file diff --git a/command/server.go b/command/server.go index f3f7db537..ed194f07c 100644 --- a/command/server.go +++ b/command/server.go @@ -2619,6 +2619,7 @@ func createCoreConfig(c *ServerCommand, config *server.Config, backend physical. CredentialBackends: c.CredentialBackends, LogicalBackends: c.LogicalBackends, Logger: c.logger, + DetectDeadlocks: config.DetectDeadlocks, DisableSentinelTrace: config.DisableSentinelTrace, DisableCache: config.DisableCache, DisableMlock: config.DisableMlock, diff --git a/command/server/config.go b/command/server/config.go index b83a9fe2f..63b43def4 100644 --- a/command/server/config.go +++ b/command/server/config.go @@ -97,6 +97,8 @@ type Config struct { LogRequestsLevel string `hcl:"-"` LogRequestsLevelRaw interface{} `hcl:"log_requests_level"` + DetectDeadlocks string `hcl:"detect_deadlocks"` + EnableResponseHeaderRaftNodeID bool `hcl:"-"` EnableResponseHeaderRaftNodeIDRaw interface{} `hcl:"enable_response_header_raft_node_id"` @@ -389,6 +391,11 @@ func (c *Config) Merge(c2 *Config) *Config { result.LogRequestsLevel = c2.LogRequestsLevel } + result.DetectDeadlocks = c.DetectDeadlocks + if c2.DetectDeadlocks != "" { + result.DetectDeadlocks = c2.DetectDeadlocks + } + result.EnableResponseHeaderRaftNodeID = c.EnableResponseHeaderRaftNodeID if c2.EnableResponseHeaderRaftNodeID { result.EnableResponseHeaderRaftNodeID = c2.EnableResponseHeaderRaftNodeID @@ -1025,6 +1032,8 @@ func (c *Config) Sanitized() map[string]interface{} { "enable_response_header_raft_node_id": c.EnableResponseHeaderRaftNodeID, "log_requests_level": c.LogRequestsLevel, + + "detect_deadlocks": c.DetectDeadlocks, } for k, v := range sharedResult { result[k] = v diff --git a/command/server/config_test_helpers.go b/command/server/config_test_helpers.go index aac19b5df..bb06dda93 100644 --- a/command/server/config_test_helpers.go +++ b/command/server/config_test_helpers.go @@ -745,6 +745,7 @@ func testConfig_Sanitized(t *testing.T) { "raw_storage_endpoint": true, "introspection_endpoint": false, "disable_sentinel_trace": true, + "detect_deadlocks": "", "enable_ui": true, "enable_response_header_hostname": false, "enable_response_header_raft_node_id": false, diff --git a/helper/locking/deadlock.go b/helper/locking/deadlock.go deleted file mode 100644 index e250abd1a..000000000 --- a/helper/locking/deadlock.go +++ /dev/null @@ -1,21 +0,0 @@ -//go:build deadlock - -package locking - -import ( - "github.com/sasha-s/go-deadlock" -) - -// DeadlockMutex, when the build tag `deadlock` is present, behaves like a -// sync.Mutex but does periodic checking to see if outstanding locks and requests -// look like a deadlock. If it finds a deadlock candidate it will output it -// prefixed with "POTENTIAL DEADLOCK", as described at -// https://github.com/sasha-s/go-deadlock -type DeadlockMutex struct { - deadlock.Mutex -} - -// DeadlockRWMutex is the RW version of DeadlockMutex. -type DeadlockRWMutex struct { - deadlock.RWMutex -} diff --git a/helper/locking/lock.go b/helper/locking/lock.go index 1b1fae3af..8043f01ad 100644 --- a/helper/locking/lock.go +++ b/helper/locking/lock.go @@ -1,19 +1,46 @@ -//go:build !deadlock - package locking import ( "sync" + + "github.com/sasha-s/go-deadlock" ) -// DeadlockMutex is just a sync.Mutex when the build tag `deadlock` is absent. -// See its other definition in the corresponding deadlock-build-tag-constrained -// file for more details. +// Common mutex interface to allow either built-in or imported deadlock use +type Mutex interface { + Lock() + Unlock() +} + +// Common r/w mutex interface to allow either built-in or imported deadlock use +type RWMutex interface { + Lock() + RLock() + RLocker() sync.Locker + RUnlock() + Unlock() +} + +// DeadlockMutex (used when requested via config option `detact_deadlocks`), +// behaves like a sync.Mutex but does periodic checking to see if outstanding +// locks and requests look like a deadlock. If it finds a deadlock candidate it +// will output it prefixed with "POTENTIAL DEADLOCK", as described at +// https://github.com/sasha-s/go-deadlock type DeadlockMutex struct { - sync.Mutex + deadlock.Mutex } // DeadlockRWMutex is the RW version of DeadlockMutex. type DeadlockRWMutex struct { + deadlock.RWMutex +} + +// Regular sync/mutex. +type SyncMutex struct { + sync.Mutex +} + +// DeadlockRWMutex is the RW version of SyncMutex. +type SyncRWMutex struct { sync.RWMutex } diff --git a/http/sys_config_state_test.go b/http/sys_config_state_test.go index 4cd2aae8b..d55897854 100644 --- a/http/sys_config_state_test.go +++ b/http/sys_config_state_test.go @@ -39,6 +39,7 @@ func TestSysConfigState_Sanitized(t *testing.T) { "disable_printable_check": false, "disable_sealwrap": false, "raw_storage_endpoint": false, + "detect_deadlocks": "", "introspection_endpoint": false, "disable_sentinel_trace": false, "enable_ui": false, diff --git a/vault/core.go b/vault/core.go index 8252e1bee..6624d7b5f 100644 --- a/vault/core.go +++ b/vault/core.go @@ -316,7 +316,7 @@ type Core struct { auditBackends map[string]audit.Factory // stateLock protects mutable state - stateLock locking.DeadlockRWMutex + stateLock locking.RWMutex sealed *uint32 standby bool @@ -732,6 +732,9 @@ type CoreConfig struct { Logger log.Logger + // Use the deadlocks library to detect deadlocks + DetectDeadlocks string + // Disables the trace display for Sentinel checks DisableSentinelTrace bool @@ -904,6 +907,14 @@ func CreateCore(conf *CoreConfig) (*Core, error) { conf.NumExpirationWorkers = numExpirationWorkersDefault } + // Use imported logging deadlock if requested + var stateLock locking.RWMutex + if strings.Contains(conf.DetectDeadlocks, "statelock") { + stateLock = &locking.DeadlockRWMutex{} + } else { + stateLock = &locking.SyncRWMutex{} + } + effectiveSDKVersion := conf.EffectiveSDKVersion if effectiveSDKVersion == "" { effectiveSDKVersion = version.GetVersion().Version @@ -922,6 +933,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) { clusterListener: new(atomic.Value), customListenerHeader: new(atomic.Value), seal: conf.Seal, + stateLock: stateLock, router: NewRouter(), sealed: new(uint32), sealMigrationDone: new(uint32), diff --git a/vault/core_test.go b/vault/core_test.go index 3789c6853..e72e705e8 100644 --- a/vault/core_test.go +++ b/vault/core_test.go @@ -6,6 +6,7 @@ import ( "reflect" "strings" "sync" + "sync/atomic" "testing" "time" @@ -21,6 +22,7 @@ import ( "github.com/hashicorp/vault/sdk/physical" "github.com/hashicorp/vault/sdk/physical/inmem" "github.com/hashicorp/vault/version" + "github.com/sasha-s/go-deadlock" ) // invalidKey is used to test Unseal @@ -2836,3 +2838,51 @@ func TestCore_ServiceRegistration(t *testing.T) { t.Fatal(diff) } } + +func TestDetectedDeadlock(t *testing.T) { + testCore, _, _ := TestCoreUnsealedWithConfig(t, &CoreConfig{DetectDeadlocks: "statelock"}) + InduceDeadlock(t, testCore, 1) +} + +func TestDefaultDeadlock(t *testing.T) { + testCore, _, _ := TestCoreUnsealed(t) + InduceDeadlock(t, testCore, 0) +} + +func RestoreDeadlockOpts() func() { + opts := deadlock.Opts + return func() { + deadlock.Opts = opts + } +} + +func InduceDeadlock(t *testing.T, vaultcore *Core, expected uint32) { + defer RestoreDeadlockOpts()() + var deadlocks uint32 + deadlock.Opts.OnPotentialDeadlock = func() { + atomic.AddUint32(&deadlocks, 1) + } + var mtx deadlock.Mutex + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + vaultcore.expiration.coreStateLock.Lock() + mtx.Lock() + mtx.Unlock() + vaultcore.expiration.coreStateLock.Unlock() + }() + wg.Wait() + wg.Add(1) + go func() { + defer wg.Done() + mtx.Lock() + vaultcore.expiration.coreStateLock.RLock() + vaultcore.expiration.coreStateLock.RUnlock() + mtx.Unlock() + }() + wg.Wait() + if atomic.LoadUint32(&deadlocks) != expected { + t.Fatalf("expected 1 deadlock, detected %d", deadlocks) + } +} diff --git a/vault/expiration.go b/vault/expiration.go index a5f1918dc..a49fed466 100644 --- a/vault/expiration.go +++ b/vault/expiration.go @@ -139,7 +139,7 @@ type ExpirationManager struct { quitCh chan struct{} // do not hold coreStateLock in any API handler code - it is already held - coreStateLock *locking.DeadlockRWMutex + coreStateLock locking.RWMutex quitContext context.Context leaseCheckCounter *uint32 @@ -350,7 +350,7 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log restoreLocks: locksutil.CreateLocks(), quitCh: make(chan struct{}), - coreStateLock: &c.stateLock, + coreStateLock: c.stateLock, quitContext: c.activeContext, leaseCheckCounter: new(uint32), diff --git a/vault/test_cluster_detect_deadlock.go b/vault/test_cluster_detect_deadlock.go new file mode 100644 index 000000000..154a948f4 --- /dev/null +++ b/vault/test_cluster_detect_deadlock.go @@ -0,0 +1,5 @@ +//go:build deadlock + +package vault + +const TestDeadlockDetection = "statelock" diff --git a/vault/test_cluster_do_not_detect_deadlock.go b/vault/test_cluster_do_not_detect_deadlock.go new file mode 100644 index 000000000..06cf1a94a --- /dev/null +++ b/vault/test_cluster_do_not_detect_deadlock.go @@ -0,0 +1,5 @@ +//go:build !deadlock + +package vault + +const TestDeadlockDetection = "" diff --git a/vault/testing.go b/vault/testing.go index 05853c09e..0b3d63dd7 100644 --- a/vault/testing.go +++ b/vault/testing.go @@ -212,6 +212,7 @@ func TestCoreWithSealAndUINoCleanup(t testing.T, opts *CoreConfig) *Core { conf.EnableResponseHeaderHostname = opts.EnableResponseHeaderHostname conf.DisableSSCTokens = opts.DisableSSCTokens conf.PluginDirectory = opts.PluginDirectory + conf.DetectDeadlocks = opts.DetectDeadlocks if opts.Logger != nil { conf.Logger = opts.Logger @@ -1382,6 +1383,7 @@ func NewTestCluster(t testing.T, base *CoreConfig, opts *TestClusterOptions) *Te var baseAddr *net.TCPAddr if opts != nil && opts.BaseListenAddress != "" { baseAddr, err = net.ResolveTCPAddr("tcp", opts.BaseListenAddress) + if err != nil { t.Fatal("could not parse given base IP") } @@ -1633,6 +1635,7 @@ func NewTestCluster(t testing.T, base *CoreConfig, opts *TestClusterOptions) *Te } if base != nil { + coreConfig.DetectDeadlocks = TestDeadlockDetection coreConfig.RawConfig = base.RawConfig coreConfig.DisableCache = base.DisableCache coreConfig.EnableUI = base.EnableUI diff --git a/website/content/docs/configuration/index.mdx b/website/content/docs/configuration/index.mdx index b7166aa2e..287d64d1f 100644 --- a/website/content/docs/configuration/index.mdx +++ b/website/content/docs/configuration/index.mdx @@ -149,6 +149,11 @@ to specify where the configuration is. maximum request duration allowed before Vault cancels the request. This can be overridden per listener via the `max_request_duration` value. +- `detect_deadlocks` `(string: "")` - Specifies the internal mutex locks that should be monitored for +potential deadlocks. Currently supported value is `statelock`, which will cause "POTENTIAL DEADLOCK:" +to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this can have +a negative effect on performance due to the tracking of each lock attempt. + - `raw_storage_endpoint` `(bool: false)` – Enables the `sys/raw` endpoint which allows the decryption/encryption of raw data into and out of the security barrier. This is a highly privileged endpoint.