diff --git a/changelog/17187.txt b/changelog/17187.txt new file mode 100644 index 000000000..71476ef31 --- /dev/null +++ b/changelog/17187.txt @@ -0,0 +1,3 @@ +```release-note:bug +core: Refactor lock grabbing code to simplify stateLock deadlock investigations +``` \ No newline at end of file diff --git a/vault/core_metrics.go b/vault/core_metrics.go index 446c07ca4..b63eabd2d 100644 --- a/vault/core_metrics.go +++ b/vault/core_metrics.go @@ -19,8 +19,9 @@ func (c *Core) metricsLoop(stopCh chan struct{}) { emitTimer := time.Tick(time.Second) stopOrHAState := func() (bool, consts.HAState) { - stopped := grabLockOrStop(c.stateLock.RLock, c.stateLock.RUnlock, stopCh) - if stopped { + l := newLockGrabber(c.stateLock.RLock, c.stateLock.RUnlock, stopCh) + go l.grab() + if stopped := l.lockOrStop(); stopped { return true, 0 } defer c.stateLock.RUnlock() @@ -117,7 +118,9 @@ func (c *Core) metricsLoop(stopCh chan struct{}) { rb.CollectMetrics(c.MetricSink()) } case <-writeTimer: - if stopped := grabLockOrStop(c.stateLock.RLock, c.stateLock.RUnlock, stopCh); stopped { + l := newLockGrabber(c.stateLock.RLock, c.stateLock.RUnlock, stopCh) + go l.grab() + if stopped := l.lockOrStop(); stopped { return } // Ship barrier encryption counts if a perf standby or the active node diff --git a/vault/ha.go b/vault/ha.go index c2d55bbf1..1cc1a98e4 100644 --- a/vault/ha.go +++ b/vault/ha.go @@ -468,7 +468,9 @@ func (c *Core) waitForLeadership(newLeaderCh chan func(), manualStepDownCh, stop continueCh := interruptPerfStandby(newLeaderCh, stopCh) // Grab the statelock or stop - if stopped := grabLockOrStop(c.stateLock.Lock, c.stateLock.Unlock, stopCh); stopped { + l := newLockGrabber(c.stateLock.Lock, c.stateLock.Unlock, stopCh) + go l.grab() + if stopped := l.lockOrStop(); stopped { lock.Unlock() close(continueCh) metrics.MeasureSince([]string{"core", "leadership_setup_failed"}, activeTime) @@ -614,7 +616,9 @@ func (c *Core) waitForLeadership(newLeaderCh chan func(), manualStepDownCh, stop }() // Grab lock if we are not stopped - stopped := grabLockOrStop(c.stateLock.Lock, c.stateLock.Unlock, stopCh) + l := newLockGrabber(c.stateLock.Lock, c.stateLock.Unlock, stopCh) + go l.grab() + stopped := l.lockOrStop() // Cancel the context incase the above go routine hasn't done it // yet @@ -657,48 +661,76 @@ func (c *Core) waitForLeadership(newLeaderCh chan func(), manualStepDownCh, stop // lock was acquired (stopped=false) then it's up to the caller to unlock. If // the lock was not acquired (stopped=true), the caller does not hold the lock and // should not call unlock. +// It's probably better to inline the body of grabLockOrStop into your function +// instead of calling it. If multiple functions call grabLockOrStop, when a deadlock +// occurs, we have no way of knowing who launched the grab goroutine, complicating +// investigation. func grabLockOrStop(lockFunc, unlockFunc func(), stopCh chan struct{}) (stopped bool) { - // lock protects these variables which are shared by parent and child. - var lock sync.Mutex - parentWaiting := true - locked := false + l := newLockGrabber(lockFunc, unlockFunc, stopCh) + go l.grab() + return l.lockOrStop() +} +type lockGrabber struct { + // stopCh provides a way to interrupt the grab-or-stop + stopCh chan struct{} // doneCh is closed when the child goroutine is done. - doneCh := make(chan struct{}) - go func() { - defer close(doneCh) - lockFunc() + doneCh chan struct{} + lockFunc func() + unlockFunc func() + // lock protects these variables which are shared by parent and child. + lock sync.Mutex + parentWaiting bool + locked bool +} - // The parent goroutine may or may not be waiting. - lock.Lock() - defer lock.Unlock() - if !parentWaiting { - unlockFunc() - } else { - locked = true - } - }() +func newLockGrabber(lockFunc, unlockFunc func(), stopCh chan struct{}) *lockGrabber { + return &lockGrabber{ + doneCh: make(chan struct{}), + lockFunc: lockFunc, + unlockFunc: unlockFunc, + parentWaiting: true, + stopCh: stopCh, + } +} +// lockOrStop waits for grab to get a lock or give up, see grabLockOrStop for how to use it. +func (l *lockGrabber) lockOrStop() (stopped bool) { stop := false select { - case <-stopCh: + case <-l.stopCh: stop = true - case <-doneCh: + case <-l.doneCh: } // The child goroutine may not have acquired the lock yet. - lock.Lock() - defer lock.Unlock() - parentWaiting = false + l.lock.Lock() + defer l.lock.Unlock() + l.parentWaiting = false if stop { - if locked { - unlockFunc() + if l.locked { + l.unlockFunc() } return true } return false } +// grab tries to get a lock, see grabLockOrStop for how to use it. +func (l *lockGrabber) grab() { + defer close(l.doneCh) + l.lockFunc() + + // The parent goroutine may or may not be waiting. + l.lock.Lock() + defer l.lock.Unlock() + if !l.parentWaiting { + l.unlockFunc() + } else { + l.locked = true + } +} + // This checks the leader periodically to ensure that we switch RPC to a new // leader pretty quickly. There is logic in Leader() already to not make this // onerous and avoid more traffic than needed, so we just call that and ignore diff --git a/vault/logical_system_raft.go b/vault/logical_system_raft.go index 62145663f..e44a1c472 100644 --- a/vault/logical_system_raft.go +++ b/vault/logical_system_raft.go @@ -589,7 +589,9 @@ func (b *SystemBackend) handleStorageRaftSnapshotWrite(force bool) framework.Ope defer cleanup() // Grab statelock - if stopped := grabLockOrStop(b.Core.stateLock.Lock, b.Core.stateLock.Unlock, b.Core.standbyStopCh.Load().(chan struct{})); stopped { + l := newLockGrabber(b.Core.stateLock.Lock, b.Core.stateLock.Unlock, b.Core.standbyStopCh.Load().(chan struct{})) + go l.grab() + if stopped := l.lockOrStop(); stopped { b.Core.logger.Error("not applying snapshot; shutting down") return } diff --git a/vault/raft.go b/vault/raft.go index a081325ba..b87a82df9 100644 --- a/vault/raft.go +++ b/vault/raft.go @@ -635,7 +635,9 @@ func (c *Core) raftSnapshotRestoreCallback(grabLock bool, sealNode bool) func(co if grabLock { // Grab statelock - if stopped := grabLockOrStop(c.stateLock.Lock, c.stateLock.Unlock, c.standbyStopCh.Load().(chan struct{})); stopped { + l := newLockGrabber(c.stateLock.Lock, c.stateLock.Unlock, c.standbyStopCh.Load().(chan struct{})) + go l.grab() + if stopped := l.lockOrStop(); stopped { c.logger.Error("did not apply snapshot; vault is shutting down") return errors.New("did not apply snapshot; vault is shutting down") } diff --git a/vault/rollback.go b/vault/rollback.go index 8ba0e27b2..4316c84d4 100644 --- a/vault/rollback.go +++ b/vault/rollback.go @@ -209,7 +209,9 @@ func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string, }() // Grab the statelock or stop - if stopped := grabLockOrStop(m.core.stateLock.RLock, m.core.stateLock.RUnlock, stopCh); stopped { + l := newLockGrabber(m.core.stateLock.RLock, m.core.stateLock.RUnlock, stopCh) + go l.grab() + if stopped := l.lockOrStop(); stopped { // If we stopped due to shutdown, return. Otherwise another thread // is holding the lock for us, continue on. select {