Backport of core: fix bug where deadlock detection was always on for expiration and quotas into release/1.14.x (#23904)

* backport of commit 66494c8129cddf33eb0cf435b6cb2f76bc47416f

* Remove slices package

* remove slices

---------

Co-authored-by: Jason O'Donnell <2160810+jasonodonnell@users.noreply.github.com>
This commit is contained in:
hc-github-team-secure-vault-core 2023-10-30 13:21:47 -04:00 committed by GitHub
parent f752283c08
commit a5cd06dc04
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 159 additions and 26 deletions

5
changelog/23902.txt Normal file
View File

@ -0,0 +1,5 @@
```release-note:bug
core: fix bug where deadlock detection was always on for expiration and quotas.
These can now be configured individually with `detect_deadlocks`.
```

View File

@ -706,6 +706,9 @@ type Core struct {
// If any role based quota (LCQ or RLQ) is enabled, don't track lease counts by role
impreciseLeaseRoleTracking bool
// Config value for "detect_deadlocks".
detectDeadlocks []string
}
// c.stateLock needs to be held in read mode before calling this function.
@ -963,19 +966,30 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
if conf.NumRollbackWorkers == 0 {
conf.NumRollbackWorkers = RollbackDefaultNumWorkers
}
// Use imported logging deadlock if requested
var stateLock locking.RWMutex
if strings.Contains(conf.DetectDeadlocks, "statelock") {
stateLock = &locking.DeadlockRWMutex{}
} else {
stateLock = &locking.SyncRWMutex{}
}
effectiveSDKVersion := conf.EffectiveSDKVersion
if effectiveSDKVersion == "" {
effectiveSDKVersion = version.GetVersion().Version
}
var detectDeadlocks []string
if conf.DetectDeadlocks != "" {
detectDeadlocks = strings.Split(conf.DetectDeadlocks, ",")
for k, v := range detectDeadlocks {
detectDeadlocks[k] = strings.ToLower(strings.TrimSpace(v))
}
}
// Use imported logging deadlock if requested
var stateLock locking.RWMutex
stateLock = &locking.SyncRWMutex{}
for _, v := range detectDeadlocks {
if v == "statelock" {
stateLock = &locking.DeadlockRWMutex{}
}
}
// Setup the core
c := &Core{
entCore: entCore{},
@ -1048,6 +1062,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
expirationRevokeRetryBase: conf.ExpirationRevokeRetryBase,
numRollbackWorkers: conf.NumRollbackWorkers,
impreciseLeaseRoleTracking: conf.ImpreciseLeaseRoleTracking,
detectDeadlocks: detectDeadlocks,
}
c.standbyStopCh.Store(make(chan struct{}))
@ -1229,7 +1244,15 @@ func NewCore(conf *CoreConfig) (*Core, error) {
// Quotas
quotasLogger := conf.Logger.Named("quotas")
c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink)
detectDeadlocks := false
for _, v := range c.detectDeadlocks {
if v == "quotas" {
detectDeadlocks = true
}
}
c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink, detectDeadlocks)
if err != nil {
return nil, err
}
@ -4133,3 +4156,10 @@ func (c *Core) GetRaftAutopilotState(ctx context.Context) (*raft.AutopilotState,
func (c *Core) Events() *eventbus.EventBus {
return c.events
}
func (c *Core) DetectStateLockDeadlocks() bool {
if _, ok := c.stateLock.(*locking.DeadlockRWMutex); ok {
return true
}
return false
}

View File

@ -3322,3 +3322,51 @@ func InduceDeadlock(t *testing.T, vaultcore *Core, expected uint32) {
t.Fatalf("expected 1 deadlock, detected %d", deadlocks)
}
}
func TestExpiration_DeadlockDetection(t *testing.T) {
testCore := TestCore(t)
testCoreUnsealed(t, testCore)
if testCore.expiration.DetectDeadlocks() {
t.Fatal("expiration has deadlock detection enabled, it shouldn't")
}
testCore = TestCoreWithDeadlockDetection(t, nil, false)
testCoreUnsealed(t, testCore)
if !testCore.expiration.DetectDeadlocks() {
t.Fatal("expiration doesn't have deadlock detection enabled, it should")
}
}
func TestQuotas_DeadlockDetection(t *testing.T) {
testCore := TestCore(t)
testCoreUnsealed(t, testCore)
if testCore.quotaManager.DetectDeadlocks() {
t.Fatal("quotas has deadlock detection enabled, it shouldn't")
}
testCore = TestCoreWithDeadlockDetection(t, nil, false)
testCoreUnsealed(t, testCore)
if !testCore.quotaManager.DetectDeadlocks() {
t.Fatal("quotas doesn't have deadlock detection enabled, it should")
}
}
func TestStatelock_DeadlockDetection(t *testing.T) {
testCore := TestCore(t)
testCoreUnsealed(t, testCore)
if testCore.DetectStateLockDeadlocks() {
t.Fatal("statelock has deadlock detection enabled, it shouldn't")
}
testCore = TestCoreWithDeadlockDetection(t, nil, false)
testCoreUnsealed(t, testCore)
if !testCore.DetectStateLockDeadlocks() {
t.Fatal("statelock doesn't have deadlock detection enabled, it should")
}
}

View File

@ -114,7 +114,7 @@ type ExpirationManager struct {
pending sync.Map
nonexpiring sync.Map
leaseCount int
pendingLock locking.DeadlockRWMutex
pendingLock locking.RWMutex
// A sync.Lock for every active leaseID
lockPerLease sync.Map
@ -327,7 +327,7 @@ func getNumExpirationWorkers(c *Core, l log.Logger) int {
// NewExpirationManager creates a new ExpirationManager that is backed
// using a given view, and uses the provided router for revocation.
func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger) *ExpirationManager {
func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger, detectDeadlocks bool) *ExpirationManager {
managerLogger := logger.Named("job-manager")
jobManager := fairshare.NewJobManager("expire", getNumExpirationWorkers(c, logger), managerLogger, c.metricSink)
jobManager.Start()
@ -340,6 +340,7 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log
tokenStore: c.tokenStore,
logger: logger,
pending: sync.Map{},
pendingLock: &locking.SyncRWMutex{},
nonexpiring: sync.Map{},
leaseCount: 0,
tidyLock: new(int32),
@ -375,6 +376,11 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log
exp.logger = log.New(&opts)
}
if detectDeadlocks {
managerLogger.Debug("enabling deadlock detection")
exp.pendingLock = &locking.DeadlockRWMutex{}
}
go exp.uniquePoliciesGc()
return exp
@ -390,7 +396,15 @@ func (c *Core) setupExpiration(e ExpireLeaseStrategy) error {
// Create the manager
expLogger := c.baseLogger.Named("expiration")
mgr := NewExpirationManager(c, view, e, expLogger)
detectDeadlocks := false
for _, v := range c.detectDeadlocks {
if v == "expiration" {
detectDeadlocks = true
}
}
mgr := NewExpirationManager(c, view, e, expLogger, detectDeadlocks)
c.expiration = mgr
// Link the token store to this
@ -2821,3 +2835,10 @@ func decodeLeaseEntry(buf []byte) (*leaseEntry, error) {
out := new(leaseEntry)
return out, jsonutil.DecodeJSON(buf, out)
}
func (e *ExpirationManager) DetectDeadlocks() bool {
if _, ok := e.pendingLock.(*locking.DeadlockRWMutex); ok {
return true
}
return false
}

View File

@ -170,13 +170,13 @@ type Manager struct {
metricSink *metricsutil.ClusterMetricSink
// quotaLock is a lock for manipulating quotas and anything not covered by a more specific lock
quotaLock *locking.DeadlockRWMutex
quotaLock locking.RWMutex
// quotaConfigLock is a lock for accessing config items, such as RateLimitExemptPaths
quotaConfigLock *locking.DeadlockRWMutex
quotaConfigLock locking.RWMutex
// dbAndCacheLock is a lock for db and path caches that need to be reset during Reset()
dbAndCacheLock *locking.DeadlockRWMutex
dbAndCacheLock locking.RWMutex
}
// QuotaLeaseInformation contains all of the information lease-count quotas require
@ -272,7 +272,7 @@ type Request struct {
// NewManager creates and initializes a new quota manager to hold all the quota
// rules and to process incoming requests.
func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink) (*Manager, error) {
func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink, detectDeadlocks bool) (*Manager, error) {
db, err := memdb.NewMemDB(dbSchema())
if err != nil {
return nil, err
@ -284,9 +284,16 @@ func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.Clust
metricSink: ms,
rateLimitPathManager: pathmanager.New(),
config: new(Config),
quotaLock: new(locking.DeadlockRWMutex),
quotaConfigLock: new(locking.DeadlockRWMutex),
dbAndCacheLock: new(locking.DeadlockRWMutex),
quotaLock: &locking.SyncRWMutex{},
quotaConfigLock: &locking.SyncRWMutex{},
dbAndCacheLock: &locking.SyncRWMutex{},
}
if detectDeadlocks {
logger.Debug("enabling deadlock detection")
manager.quotaLock = &locking.DeadlockRWMutex{}
manager.quotaConfigLock = &locking.DeadlockRWMutex{}
manager.dbAndCacheLock = &locking.DeadlockRWMutex{}
}
manager.init(walkFunc)
@ -1302,3 +1309,10 @@ func (m *Manager) HandleBackendDisabling(ctx context.Context, nsPath, mountPath
return nil
}
func (m *Manager) DetectDeadlocks() bool {
if _, ok := m.quotaLock.(*locking.DeadlockRWMutex); ok {
return true
}
return false
}

View File

@ -218,7 +218,7 @@ func TestRateLimitQuota_Allow_WithBlock(t *testing.T) {
func TestRateLimitQuota_Update(t *testing.T) {
defer goleak.VerifyNone(t)
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
require.NoError(t, err)
quota := NewRateLimitQuota("quota1", "", "", "", "", 10, time.Second, 0)

View File

@ -16,7 +16,7 @@ import (
)
func TestQuotas_MountPathOverwrite(t *testing.T) {
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
require.NoError(t, err)
quota := NewRateLimitQuota("tq", "", "kv1/", "", "", 10, time.Second, 0)
@ -43,7 +43,7 @@ func TestQuotas_MountPathOverwrite(t *testing.T) {
}
func TestQuotas_Precedence(t *testing.T) {
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
require.NoError(t, err)
setQuotaFunc := func(t *testing.T, name, nsPath, mountPath, pathSuffix, role string) Quota {
@ -124,7 +124,7 @@ func TestQuotas_QueryResolveRole_RateLimitQuotas(t *testing.T) {
leaseWalkFunc := func(context.Context, func(request *Request) bool) error {
return nil
}
qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink())
qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink(), true)
require.NoError(t, err)
rlqReq := &Request{

View File

@ -141,6 +141,20 @@ func TestCoreWithSeal(t testing.T, testSeal Seal, enableRaw bool) *Core {
return TestCoreWithSealAndUI(t, conf)
}
func TestCoreWithDeadlockDetection(t testing.T, testSeal Seal, enableRaw bool) *Core {
conf := &CoreConfig{
Seal: testSeal,
EnableUI: false,
EnableRaw: enableRaw,
BuiltinRegistry: corehelpers.NewMockBuiltinRegistry(),
AuditBackends: map[string]audit.Factory{
"file": auditFile.Factory,
},
DetectDeadlocks: "expiration,quotas,statelock",
}
return TestCoreWithSealAndUI(t, conf)
}
func TestCoreWithCustomResponseHeaderAndUI(t testing.T, CustomResponseHeaders map[string]map[string]string, enableUI bool) (*Core, [][]byte, string) {
confRaw := &server.Config{
SharedConfig: &configutil.SharedConfig{

View File

@ -159,10 +159,11 @@ to specify where the configuration is.
maximum request duration allowed before Vault cancels the request. This can
be overridden per listener via the `max_request_duration` value.
- `detect_deadlocks` `(string: "")` - Specifies the internal mutex locks that should be monitored for
potential deadlocks. Currently supported value is `statelock`, which will cause "POTENTIAL DEADLOCK:"
to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this can have
a negative effect on performance due to the tracking of each lock attempt.
- `detect_deadlocks` `(string: "")` - A comma separated string that specifies the internal
mutex locks that should be monitored for potential deadlocks. Currently supported values
include `statelock`, `quotas` and `expiration` which will cause "POTENTIAL DEADLOCK:"
to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this
can have a negative effect on performance due to the tracking of each lock attempt.
- `raw_storage_endpoint` `(bool: false)` Enables the `sys/raw` endpoint which
allows the decryption/encryption of raw data into and out of the security