Break grabLockOrStop into two pieces to facilitate investigating deadlocks (#17187)

Break grabLockOrStop into two pieces to facilitate investigating deadlocks.  Without this change, the "grab" goroutine looks the same regardless of who was calling grabLockOrStop, so there's no way to identify one of the deadlock parties.
This commit is contained in:
Nick Cabatoff 2022-09-20 11:03:16 -04:00 committed by GitHub
parent ba096f9dfa
commit 559754d580
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 76 additions and 32 deletions

3
changelog/17187.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
core: Refactor lock grabbing code to simplify stateLock deadlock investigations
```

View File

@ -19,8 +19,9 @@ func (c *Core) metricsLoop(stopCh chan struct{}) {
emitTimer := time.Tick(time.Second) emitTimer := time.Tick(time.Second)
stopOrHAState := func() (bool, consts.HAState) { stopOrHAState := func() (bool, consts.HAState) {
stopped := grabLockOrStop(c.stateLock.RLock, c.stateLock.RUnlock, stopCh) l := newLockGrabber(c.stateLock.RLock, c.stateLock.RUnlock, stopCh)
if stopped { go l.grab()
if stopped := l.lockOrStop(); stopped {
return true, 0 return true, 0
} }
defer c.stateLock.RUnlock() defer c.stateLock.RUnlock()
@ -117,7 +118,9 @@ func (c *Core) metricsLoop(stopCh chan struct{}) {
rb.CollectMetrics(c.MetricSink()) rb.CollectMetrics(c.MetricSink())
} }
case <-writeTimer: case <-writeTimer:
if stopped := grabLockOrStop(c.stateLock.RLock, c.stateLock.RUnlock, stopCh); stopped { l := newLockGrabber(c.stateLock.RLock, c.stateLock.RUnlock, stopCh)
go l.grab()
if stopped := l.lockOrStop(); stopped {
return return
} }
// Ship barrier encryption counts if a perf standby or the active node // Ship barrier encryption counts if a perf standby or the active node

View File

@ -468,7 +468,9 @@ func (c *Core) waitForLeadership(newLeaderCh chan func(), manualStepDownCh, stop
continueCh := interruptPerfStandby(newLeaderCh, stopCh) continueCh := interruptPerfStandby(newLeaderCh, stopCh)
// Grab the statelock or stop // Grab the statelock or stop
if stopped := grabLockOrStop(c.stateLock.Lock, c.stateLock.Unlock, stopCh); stopped { l := newLockGrabber(c.stateLock.Lock, c.stateLock.Unlock, stopCh)
go l.grab()
if stopped := l.lockOrStop(); stopped {
lock.Unlock() lock.Unlock()
close(continueCh) close(continueCh)
metrics.MeasureSince([]string{"core", "leadership_setup_failed"}, activeTime) metrics.MeasureSince([]string{"core", "leadership_setup_failed"}, activeTime)
@ -614,7 +616,9 @@ func (c *Core) waitForLeadership(newLeaderCh chan func(), manualStepDownCh, stop
}() }()
// Grab lock if we are not stopped // Grab lock if we are not stopped
stopped := grabLockOrStop(c.stateLock.Lock, c.stateLock.Unlock, stopCh) l := newLockGrabber(c.stateLock.Lock, c.stateLock.Unlock, stopCh)
go l.grab()
stopped := l.lockOrStop()
// Cancel the context incase the above go routine hasn't done it // Cancel the context incase the above go routine hasn't done it
// yet // yet
@ -657,48 +661,76 @@ func (c *Core) waitForLeadership(newLeaderCh chan func(), manualStepDownCh, stop
// lock was acquired (stopped=false) then it's up to the caller to unlock. If // lock was acquired (stopped=false) then it's up to the caller to unlock. If
// the lock was not acquired (stopped=true), the caller does not hold the lock and // the lock was not acquired (stopped=true), the caller does not hold the lock and
// should not call unlock. // should not call unlock.
// It's probably better to inline the body of grabLockOrStop into your function
// instead of calling it. If multiple functions call grabLockOrStop, when a deadlock
// occurs, we have no way of knowing who launched the grab goroutine, complicating
// investigation.
func grabLockOrStop(lockFunc, unlockFunc func(), stopCh chan struct{}) (stopped bool) { func grabLockOrStop(lockFunc, unlockFunc func(), stopCh chan struct{}) (stopped bool) {
// lock protects these variables which are shared by parent and child. l := newLockGrabber(lockFunc, unlockFunc, stopCh)
var lock sync.Mutex go l.grab()
parentWaiting := true return l.lockOrStop()
locked := false }
type lockGrabber struct {
// stopCh provides a way to interrupt the grab-or-stop
stopCh chan struct{}
// doneCh is closed when the child goroutine is done. // doneCh is closed when the child goroutine is done.
doneCh := make(chan struct{}) doneCh chan struct{}
go func() { lockFunc func()
defer close(doneCh) unlockFunc func()
lockFunc() // lock protects these variables which are shared by parent and child.
lock sync.Mutex
parentWaiting bool
locked bool
}
// The parent goroutine may or may not be waiting. func newLockGrabber(lockFunc, unlockFunc func(), stopCh chan struct{}) *lockGrabber {
lock.Lock() return &lockGrabber{
defer lock.Unlock() doneCh: make(chan struct{}),
if !parentWaiting { lockFunc: lockFunc,
unlockFunc() unlockFunc: unlockFunc,
} else { parentWaiting: true,
locked = true stopCh: stopCh,
} }
}() }
// lockOrStop waits for grab to get a lock or give up, see grabLockOrStop for how to use it.
func (l *lockGrabber) lockOrStop() (stopped bool) {
stop := false stop := false
select { select {
case <-stopCh: case <-l.stopCh:
stop = true stop = true
case <-doneCh: case <-l.doneCh:
} }
// The child goroutine may not have acquired the lock yet. // The child goroutine may not have acquired the lock yet.
lock.Lock() l.lock.Lock()
defer lock.Unlock() defer l.lock.Unlock()
parentWaiting = false l.parentWaiting = false
if stop { if stop {
if locked { if l.locked {
unlockFunc() l.unlockFunc()
} }
return true return true
} }
return false return false
} }
// grab tries to get a lock, see grabLockOrStop for how to use it.
func (l *lockGrabber) grab() {
defer close(l.doneCh)
l.lockFunc()
// The parent goroutine may or may not be waiting.
l.lock.Lock()
defer l.lock.Unlock()
if !l.parentWaiting {
l.unlockFunc()
} else {
l.locked = true
}
}
// This checks the leader periodically to ensure that we switch RPC to a new // This checks the leader periodically to ensure that we switch RPC to a new
// leader pretty quickly. There is logic in Leader() already to not make this // leader pretty quickly. There is logic in Leader() already to not make this
// onerous and avoid more traffic than needed, so we just call that and ignore // onerous and avoid more traffic than needed, so we just call that and ignore

View File

@ -589,7 +589,9 @@ func (b *SystemBackend) handleStorageRaftSnapshotWrite(force bool) framework.Ope
defer cleanup() defer cleanup()
// Grab statelock // Grab statelock
if stopped := grabLockOrStop(b.Core.stateLock.Lock, b.Core.stateLock.Unlock, b.Core.standbyStopCh.Load().(chan struct{})); stopped { l := newLockGrabber(b.Core.stateLock.Lock, b.Core.stateLock.Unlock, b.Core.standbyStopCh.Load().(chan struct{}))
go l.grab()
if stopped := l.lockOrStop(); stopped {
b.Core.logger.Error("not applying snapshot; shutting down") b.Core.logger.Error("not applying snapshot; shutting down")
return return
} }

View File

@ -635,7 +635,9 @@ func (c *Core) raftSnapshotRestoreCallback(grabLock bool, sealNode bool) func(co
if grabLock { if grabLock {
// Grab statelock // Grab statelock
if stopped := grabLockOrStop(c.stateLock.Lock, c.stateLock.Unlock, c.standbyStopCh.Load().(chan struct{})); stopped { l := newLockGrabber(c.stateLock.Lock, c.stateLock.Unlock, c.standbyStopCh.Load().(chan struct{}))
go l.grab()
if stopped := l.lockOrStop(); stopped {
c.logger.Error("did not apply snapshot; vault is shutting down") c.logger.Error("did not apply snapshot; vault is shutting down")
return errors.New("did not apply snapshot; vault is shutting down") return errors.New("did not apply snapshot; vault is shutting down")
} }

View File

@ -209,7 +209,9 @@ func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string,
}() }()
// Grab the statelock or stop // Grab the statelock or stop
if stopped := grabLockOrStop(m.core.stateLock.RLock, m.core.stateLock.RUnlock, stopCh); stopped { l := newLockGrabber(m.core.stateLock.RLock, m.core.stateLock.RUnlock, stopCh)
go l.grab()
if stopped := l.lockOrStop(); stopped {
// If we stopped due to shutdown, return. Otherwise another thread // If we stopped due to shutdown, return. Otherwise another thread
// is holding the lock for us, continue on. // is holding the lock for us, continue on.
select { select {