Backport of core: fix bug where deadlock detection was always on for expiration and quotas into release/1.14.x (#23904)
* backport of commit 66494c8129cddf33eb0cf435b6cb2f76bc47416f * Remove slices package * remove slices --------- Co-authored-by: Jason O'Donnell <2160810+jasonodonnell@users.noreply.github.com>
This commit is contained in:
parent
f752283c08
commit
a5cd06dc04
|
@ -0,0 +1,5 @@
|
|||
```release-note:bug
|
||||
core: fix bug where deadlock detection was always on for expiration and quotas.
|
||||
These can now be configured individually with `detect_deadlocks`.
|
||||
```
|
||||
|
|
@ -706,6 +706,9 @@ type Core struct {
|
|||
|
||||
// If any role based quota (LCQ or RLQ) is enabled, don't track lease counts by role
|
||||
impreciseLeaseRoleTracking bool
|
||||
|
||||
// Config value for "detect_deadlocks".
|
||||
detectDeadlocks []string
|
||||
}
|
||||
|
||||
// c.stateLock needs to be held in read mode before calling this function.
|
||||
|
@ -963,19 +966,30 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
|
|||
if conf.NumRollbackWorkers == 0 {
|
||||
conf.NumRollbackWorkers = RollbackDefaultNumWorkers
|
||||
}
|
||||
// Use imported logging deadlock if requested
|
||||
var stateLock locking.RWMutex
|
||||
if strings.Contains(conf.DetectDeadlocks, "statelock") {
|
||||
stateLock = &locking.DeadlockRWMutex{}
|
||||
} else {
|
||||
stateLock = &locking.SyncRWMutex{}
|
||||
}
|
||||
|
||||
effectiveSDKVersion := conf.EffectiveSDKVersion
|
||||
if effectiveSDKVersion == "" {
|
||||
effectiveSDKVersion = version.GetVersion().Version
|
||||
}
|
||||
|
||||
var detectDeadlocks []string
|
||||
if conf.DetectDeadlocks != "" {
|
||||
detectDeadlocks = strings.Split(conf.DetectDeadlocks, ",")
|
||||
for k, v := range detectDeadlocks {
|
||||
detectDeadlocks[k] = strings.ToLower(strings.TrimSpace(v))
|
||||
}
|
||||
}
|
||||
|
||||
// Use imported logging deadlock if requested
|
||||
var stateLock locking.RWMutex
|
||||
stateLock = &locking.SyncRWMutex{}
|
||||
|
||||
for _, v := range detectDeadlocks {
|
||||
if v == "statelock" {
|
||||
stateLock = &locking.DeadlockRWMutex{}
|
||||
}
|
||||
}
|
||||
|
||||
// Setup the core
|
||||
c := &Core{
|
||||
entCore: entCore{},
|
||||
|
@ -1048,6 +1062,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
|
|||
expirationRevokeRetryBase: conf.ExpirationRevokeRetryBase,
|
||||
numRollbackWorkers: conf.NumRollbackWorkers,
|
||||
impreciseLeaseRoleTracking: conf.ImpreciseLeaseRoleTracking,
|
||||
detectDeadlocks: detectDeadlocks,
|
||||
}
|
||||
|
||||
c.standbyStopCh.Store(make(chan struct{}))
|
||||
|
@ -1229,7 +1244,15 @@ func NewCore(conf *CoreConfig) (*Core, error) {
|
|||
|
||||
// Quotas
|
||||
quotasLogger := conf.Logger.Named("quotas")
|
||||
c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink)
|
||||
|
||||
detectDeadlocks := false
|
||||
for _, v := range c.detectDeadlocks {
|
||||
if v == "quotas" {
|
||||
detectDeadlocks = true
|
||||
}
|
||||
}
|
||||
|
||||
c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink, detectDeadlocks)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -4133,3 +4156,10 @@ func (c *Core) GetRaftAutopilotState(ctx context.Context) (*raft.AutopilotState,
|
|||
func (c *Core) Events() *eventbus.EventBus {
|
||||
return c.events
|
||||
}
|
||||
|
||||
func (c *Core) DetectStateLockDeadlocks() bool {
|
||||
if _, ok := c.stateLock.(*locking.DeadlockRWMutex); ok {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
|
|
@ -3322,3 +3322,51 @@ func InduceDeadlock(t *testing.T, vaultcore *Core, expected uint32) {
|
|||
t.Fatalf("expected 1 deadlock, detected %d", deadlocks)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExpiration_DeadlockDetection(t *testing.T) {
|
||||
testCore := TestCore(t)
|
||||
testCoreUnsealed(t, testCore)
|
||||
|
||||
if testCore.expiration.DetectDeadlocks() {
|
||||
t.Fatal("expiration has deadlock detection enabled, it shouldn't")
|
||||
}
|
||||
|
||||
testCore = TestCoreWithDeadlockDetection(t, nil, false)
|
||||
testCoreUnsealed(t, testCore)
|
||||
|
||||
if !testCore.expiration.DetectDeadlocks() {
|
||||
t.Fatal("expiration doesn't have deadlock detection enabled, it should")
|
||||
}
|
||||
}
|
||||
|
||||
func TestQuotas_DeadlockDetection(t *testing.T) {
|
||||
testCore := TestCore(t)
|
||||
testCoreUnsealed(t, testCore)
|
||||
|
||||
if testCore.quotaManager.DetectDeadlocks() {
|
||||
t.Fatal("quotas has deadlock detection enabled, it shouldn't")
|
||||
}
|
||||
|
||||
testCore = TestCoreWithDeadlockDetection(t, nil, false)
|
||||
testCoreUnsealed(t, testCore)
|
||||
|
||||
if !testCore.quotaManager.DetectDeadlocks() {
|
||||
t.Fatal("quotas doesn't have deadlock detection enabled, it should")
|
||||
}
|
||||
}
|
||||
|
||||
func TestStatelock_DeadlockDetection(t *testing.T) {
|
||||
testCore := TestCore(t)
|
||||
testCoreUnsealed(t, testCore)
|
||||
|
||||
if testCore.DetectStateLockDeadlocks() {
|
||||
t.Fatal("statelock has deadlock detection enabled, it shouldn't")
|
||||
}
|
||||
|
||||
testCore = TestCoreWithDeadlockDetection(t, nil, false)
|
||||
testCoreUnsealed(t, testCore)
|
||||
|
||||
if !testCore.DetectStateLockDeadlocks() {
|
||||
t.Fatal("statelock doesn't have deadlock detection enabled, it should")
|
||||
}
|
||||
}
|
||||
|
|
|
@ -114,7 +114,7 @@ type ExpirationManager struct {
|
|||
pending sync.Map
|
||||
nonexpiring sync.Map
|
||||
leaseCount int
|
||||
pendingLock locking.DeadlockRWMutex
|
||||
pendingLock locking.RWMutex
|
||||
|
||||
// A sync.Lock for every active leaseID
|
||||
lockPerLease sync.Map
|
||||
|
@ -327,7 +327,7 @@ func getNumExpirationWorkers(c *Core, l log.Logger) int {
|
|||
|
||||
// NewExpirationManager creates a new ExpirationManager that is backed
|
||||
// using a given view, and uses the provided router for revocation.
|
||||
func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger) *ExpirationManager {
|
||||
func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger, detectDeadlocks bool) *ExpirationManager {
|
||||
managerLogger := logger.Named("job-manager")
|
||||
jobManager := fairshare.NewJobManager("expire", getNumExpirationWorkers(c, logger), managerLogger, c.metricSink)
|
||||
jobManager.Start()
|
||||
|
@ -340,6 +340,7 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log
|
|||
tokenStore: c.tokenStore,
|
||||
logger: logger,
|
||||
pending: sync.Map{},
|
||||
pendingLock: &locking.SyncRWMutex{},
|
||||
nonexpiring: sync.Map{},
|
||||
leaseCount: 0,
|
||||
tidyLock: new(int32),
|
||||
|
@ -375,6 +376,11 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log
|
|||
exp.logger = log.New(&opts)
|
||||
}
|
||||
|
||||
if detectDeadlocks {
|
||||
managerLogger.Debug("enabling deadlock detection")
|
||||
exp.pendingLock = &locking.DeadlockRWMutex{}
|
||||
}
|
||||
|
||||
go exp.uniquePoliciesGc()
|
||||
|
||||
return exp
|
||||
|
@ -390,7 +396,15 @@ func (c *Core) setupExpiration(e ExpireLeaseStrategy) error {
|
|||
|
||||
// Create the manager
|
||||
expLogger := c.baseLogger.Named("expiration")
|
||||
mgr := NewExpirationManager(c, view, e, expLogger)
|
||||
|
||||
detectDeadlocks := false
|
||||
for _, v := range c.detectDeadlocks {
|
||||
if v == "expiration" {
|
||||
detectDeadlocks = true
|
||||
}
|
||||
}
|
||||
|
||||
mgr := NewExpirationManager(c, view, e, expLogger, detectDeadlocks)
|
||||
c.expiration = mgr
|
||||
|
||||
// Link the token store to this
|
||||
|
@ -2821,3 +2835,10 @@ func decodeLeaseEntry(buf []byte) (*leaseEntry, error) {
|
|||
out := new(leaseEntry)
|
||||
return out, jsonutil.DecodeJSON(buf, out)
|
||||
}
|
||||
|
||||
func (e *ExpirationManager) DetectDeadlocks() bool {
|
||||
if _, ok := e.pendingLock.(*locking.DeadlockRWMutex); ok {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
|
|
@ -170,13 +170,13 @@ type Manager struct {
|
|||
metricSink *metricsutil.ClusterMetricSink
|
||||
|
||||
// quotaLock is a lock for manipulating quotas and anything not covered by a more specific lock
|
||||
quotaLock *locking.DeadlockRWMutex
|
||||
quotaLock locking.RWMutex
|
||||
|
||||
// quotaConfigLock is a lock for accessing config items, such as RateLimitExemptPaths
|
||||
quotaConfigLock *locking.DeadlockRWMutex
|
||||
quotaConfigLock locking.RWMutex
|
||||
|
||||
// dbAndCacheLock is a lock for db and path caches that need to be reset during Reset()
|
||||
dbAndCacheLock *locking.DeadlockRWMutex
|
||||
dbAndCacheLock locking.RWMutex
|
||||
}
|
||||
|
||||
// QuotaLeaseInformation contains all of the information lease-count quotas require
|
||||
|
@ -272,7 +272,7 @@ type Request struct {
|
|||
|
||||
// NewManager creates and initializes a new quota manager to hold all the quota
|
||||
// rules and to process incoming requests.
|
||||
func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink) (*Manager, error) {
|
||||
func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink, detectDeadlocks bool) (*Manager, error) {
|
||||
db, err := memdb.NewMemDB(dbSchema())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -284,9 +284,16 @@ func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.Clust
|
|||
metricSink: ms,
|
||||
rateLimitPathManager: pathmanager.New(),
|
||||
config: new(Config),
|
||||
quotaLock: new(locking.DeadlockRWMutex),
|
||||
quotaConfigLock: new(locking.DeadlockRWMutex),
|
||||
dbAndCacheLock: new(locking.DeadlockRWMutex),
|
||||
quotaLock: &locking.SyncRWMutex{},
|
||||
quotaConfigLock: &locking.SyncRWMutex{},
|
||||
dbAndCacheLock: &locking.SyncRWMutex{},
|
||||
}
|
||||
|
||||
if detectDeadlocks {
|
||||
logger.Debug("enabling deadlock detection")
|
||||
manager.quotaLock = &locking.DeadlockRWMutex{}
|
||||
manager.quotaConfigLock = &locking.DeadlockRWMutex{}
|
||||
manager.dbAndCacheLock = &locking.DeadlockRWMutex{}
|
||||
}
|
||||
|
||||
manager.init(walkFunc)
|
||||
|
@ -1302,3 +1309,10 @@ func (m *Manager) HandleBackendDisabling(ctx context.Context, nsPath, mountPath
|
|||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Manager) DetectDeadlocks() bool {
|
||||
if _, ok := m.quotaLock.(*locking.DeadlockRWMutex); ok {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
|
|
@ -218,7 +218,7 @@ func TestRateLimitQuota_Allow_WithBlock(t *testing.T) {
|
|||
|
||||
func TestRateLimitQuota_Update(t *testing.T) {
|
||||
defer goleak.VerifyNone(t)
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
|
||||
require.NoError(t, err)
|
||||
|
||||
quota := NewRateLimitQuota("quota1", "", "", "", "", 10, time.Second, 0)
|
||||
|
|
|
@ -16,7 +16,7 @@ import (
|
|||
)
|
||||
|
||||
func TestQuotas_MountPathOverwrite(t *testing.T) {
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
|
||||
require.NoError(t, err)
|
||||
|
||||
quota := NewRateLimitQuota("tq", "", "kv1/", "", "", 10, time.Second, 0)
|
||||
|
@ -43,7 +43,7 @@ func TestQuotas_MountPathOverwrite(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestQuotas_Precedence(t *testing.T) {
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
|
||||
require.NoError(t, err)
|
||||
|
||||
setQuotaFunc := func(t *testing.T, name, nsPath, mountPath, pathSuffix, role string) Quota {
|
||||
|
@ -124,7 +124,7 @@ func TestQuotas_QueryResolveRole_RateLimitQuotas(t *testing.T) {
|
|||
leaseWalkFunc := func(context.Context, func(request *Request) bool) error {
|
||||
return nil
|
||||
}
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink())
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink(), true)
|
||||
require.NoError(t, err)
|
||||
|
||||
rlqReq := &Request{
|
||||
|
|
|
@ -141,6 +141,20 @@ func TestCoreWithSeal(t testing.T, testSeal Seal, enableRaw bool) *Core {
|
|||
return TestCoreWithSealAndUI(t, conf)
|
||||
}
|
||||
|
||||
func TestCoreWithDeadlockDetection(t testing.T, testSeal Seal, enableRaw bool) *Core {
|
||||
conf := &CoreConfig{
|
||||
Seal: testSeal,
|
||||
EnableUI: false,
|
||||
EnableRaw: enableRaw,
|
||||
BuiltinRegistry: corehelpers.NewMockBuiltinRegistry(),
|
||||
AuditBackends: map[string]audit.Factory{
|
||||
"file": auditFile.Factory,
|
||||
},
|
||||
DetectDeadlocks: "expiration,quotas,statelock",
|
||||
}
|
||||
return TestCoreWithSealAndUI(t, conf)
|
||||
}
|
||||
|
||||
func TestCoreWithCustomResponseHeaderAndUI(t testing.T, CustomResponseHeaders map[string]map[string]string, enableUI bool) (*Core, [][]byte, string) {
|
||||
confRaw := &server.Config{
|
||||
SharedConfig: &configutil.SharedConfig{
|
||||
|
|
|
@ -159,10 +159,11 @@ to specify where the configuration is.
|
|||
maximum request duration allowed before Vault cancels the request. This can
|
||||
be overridden per listener via the `max_request_duration` value.
|
||||
|
||||
- `detect_deadlocks` `(string: "")` - Specifies the internal mutex locks that should be monitored for
|
||||
potential deadlocks. Currently supported value is `statelock`, which will cause "POTENTIAL DEADLOCK:"
|
||||
to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this can have
|
||||
a negative effect on performance due to the tracking of each lock attempt.
|
||||
- `detect_deadlocks` `(string: "")` - A comma separated string that specifies the internal
|
||||
mutex locks that should be monitored for potential deadlocks. Currently supported values
|
||||
include `statelock`, `quotas` and `expiration` which will cause "POTENTIAL DEADLOCK:"
|
||||
to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this
|
||||
can have a negative effect on performance due to the tracking of each lock attempt.
|
||||
|
||||
- `raw_storage_endpoint` `(bool: false)` – Enables the `sys/raw` endpoint which
|
||||
allows the decryption/encryption of raw data into and out of the security
|
||||
|
|
Loading…
Reference in New Issue