Break grabLockOrStop into two pieces to facilitate investigating deadlocks (#17187)
Break grabLockOrStop into two pieces to facilitate investigating deadlocks. Without this change, the "grab" goroutine looks the same regardless of who was calling grabLockOrStop, so there's no way to identify one of the deadlock parties.
This commit is contained in:
parent
ba096f9dfa
commit
559754d580
|
@ -0,0 +1,3 @@
|
||||||
|
```release-note:bug
|
||||||
|
core: Refactor lock grabbing code to simplify stateLock deadlock investigations
|
||||||
|
```
|
|
@ -19,8 +19,9 @@ func (c *Core) metricsLoop(stopCh chan struct{}) {
|
||||||
emitTimer := time.Tick(time.Second)
|
emitTimer := time.Tick(time.Second)
|
||||||
|
|
||||||
stopOrHAState := func() (bool, consts.HAState) {
|
stopOrHAState := func() (bool, consts.HAState) {
|
||||||
stopped := grabLockOrStop(c.stateLock.RLock, c.stateLock.RUnlock, stopCh)
|
l := newLockGrabber(c.stateLock.RLock, c.stateLock.RUnlock, stopCh)
|
||||||
if stopped {
|
go l.grab()
|
||||||
|
if stopped := l.lockOrStop(); stopped {
|
||||||
return true, 0
|
return true, 0
|
||||||
}
|
}
|
||||||
defer c.stateLock.RUnlock()
|
defer c.stateLock.RUnlock()
|
||||||
|
@ -117,7 +118,9 @@ func (c *Core) metricsLoop(stopCh chan struct{}) {
|
||||||
rb.CollectMetrics(c.MetricSink())
|
rb.CollectMetrics(c.MetricSink())
|
||||||
}
|
}
|
||||||
case <-writeTimer:
|
case <-writeTimer:
|
||||||
if stopped := grabLockOrStop(c.stateLock.RLock, c.stateLock.RUnlock, stopCh); stopped {
|
l := newLockGrabber(c.stateLock.RLock, c.stateLock.RUnlock, stopCh)
|
||||||
|
go l.grab()
|
||||||
|
if stopped := l.lockOrStop(); stopped {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// Ship barrier encryption counts if a perf standby or the active node
|
// Ship barrier encryption counts if a perf standby or the active node
|
||||||
|
|
88
vault/ha.go
88
vault/ha.go
|
@ -468,7 +468,9 @@ func (c *Core) waitForLeadership(newLeaderCh chan func(), manualStepDownCh, stop
|
||||||
continueCh := interruptPerfStandby(newLeaderCh, stopCh)
|
continueCh := interruptPerfStandby(newLeaderCh, stopCh)
|
||||||
|
|
||||||
// Grab the statelock or stop
|
// Grab the statelock or stop
|
||||||
if stopped := grabLockOrStop(c.stateLock.Lock, c.stateLock.Unlock, stopCh); stopped {
|
l := newLockGrabber(c.stateLock.Lock, c.stateLock.Unlock, stopCh)
|
||||||
|
go l.grab()
|
||||||
|
if stopped := l.lockOrStop(); stopped {
|
||||||
lock.Unlock()
|
lock.Unlock()
|
||||||
close(continueCh)
|
close(continueCh)
|
||||||
metrics.MeasureSince([]string{"core", "leadership_setup_failed"}, activeTime)
|
metrics.MeasureSince([]string{"core", "leadership_setup_failed"}, activeTime)
|
||||||
|
@ -614,7 +616,9 @@ func (c *Core) waitForLeadership(newLeaderCh chan func(), manualStepDownCh, stop
|
||||||
}()
|
}()
|
||||||
|
|
||||||
// Grab lock if we are not stopped
|
// Grab lock if we are not stopped
|
||||||
stopped := grabLockOrStop(c.stateLock.Lock, c.stateLock.Unlock, stopCh)
|
l := newLockGrabber(c.stateLock.Lock, c.stateLock.Unlock, stopCh)
|
||||||
|
go l.grab()
|
||||||
|
stopped := l.lockOrStop()
|
||||||
|
|
||||||
// Cancel the context incase the above go routine hasn't done it
|
// Cancel the context incase the above go routine hasn't done it
|
||||||
// yet
|
// yet
|
||||||
|
@ -657,48 +661,76 @@ func (c *Core) waitForLeadership(newLeaderCh chan func(), manualStepDownCh, stop
|
||||||
// lock was acquired (stopped=false) then it's up to the caller to unlock. If
|
// lock was acquired (stopped=false) then it's up to the caller to unlock. If
|
||||||
// the lock was not acquired (stopped=true), the caller does not hold the lock and
|
// the lock was not acquired (stopped=true), the caller does not hold the lock and
|
||||||
// should not call unlock.
|
// should not call unlock.
|
||||||
|
// It's probably better to inline the body of grabLockOrStop into your function
|
||||||
|
// instead of calling it. If multiple functions call grabLockOrStop, when a deadlock
|
||||||
|
// occurs, we have no way of knowing who launched the grab goroutine, complicating
|
||||||
|
// investigation.
|
||||||
func grabLockOrStop(lockFunc, unlockFunc func(), stopCh chan struct{}) (stopped bool) {
|
func grabLockOrStop(lockFunc, unlockFunc func(), stopCh chan struct{}) (stopped bool) {
|
||||||
// lock protects these variables which are shared by parent and child.
|
l := newLockGrabber(lockFunc, unlockFunc, stopCh)
|
||||||
var lock sync.Mutex
|
go l.grab()
|
||||||
parentWaiting := true
|
return l.lockOrStop()
|
||||||
locked := false
|
|
||||||
|
|
||||||
// doneCh is closed when the child goroutine is done.
|
|
||||||
doneCh := make(chan struct{})
|
|
||||||
go func() {
|
|
||||||
defer close(doneCh)
|
|
||||||
lockFunc()
|
|
||||||
|
|
||||||
// The parent goroutine may or may not be waiting.
|
|
||||||
lock.Lock()
|
|
||||||
defer lock.Unlock()
|
|
||||||
if !parentWaiting {
|
|
||||||
unlockFunc()
|
|
||||||
} else {
|
|
||||||
locked = true
|
|
||||||
}
|
}
|
||||||
}()
|
|
||||||
|
|
||||||
|
type lockGrabber struct {
|
||||||
|
// stopCh provides a way to interrupt the grab-or-stop
|
||||||
|
stopCh chan struct{}
|
||||||
|
// doneCh is closed when the child goroutine is done.
|
||||||
|
doneCh chan struct{}
|
||||||
|
lockFunc func()
|
||||||
|
unlockFunc func()
|
||||||
|
// lock protects these variables which are shared by parent and child.
|
||||||
|
lock sync.Mutex
|
||||||
|
parentWaiting bool
|
||||||
|
locked bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func newLockGrabber(lockFunc, unlockFunc func(), stopCh chan struct{}) *lockGrabber {
|
||||||
|
return &lockGrabber{
|
||||||
|
doneCh: make(chan struct{}),
|
||||||
|
lockFunc: lockFunc,
|
||||||
|
unlockFunc: unlockFunc,
|
||||||
|
parentWaiting: true,
|
||||||
|
stopCh: stopCh,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// lockOrStop waits for grab to get a lock or give up, see grabLockOrStop for how to use it.
|
||||||
|
func (l *lockGrabber) lockOrStop() (stopped bool) {
|
||||||
stop := false
|
stop := false
|
||||||
select {
|
select {
|
||||||
case <-stopCh:
|
case <-l.stopCh:
|
||||||
stop = true
|
stop = true
|
||||||
case <-doneCh:
|
case <-l.doneCh:
|
||||||
}
|
}
|
||||||
|
|
||||||
// The child goroutine may not have acquired the lock yet.
|
// The child goroutine may not have acquired the lock yet.
|
||||||
lock.Lock()
|
l.lock.Lock()
|
||||||
defer lock.Unlock()
|
defer l.lock.Unlock()
|
||||||
parentWaiting = false
|
l.parentWaiting = false
|
||||||
if stop {
|
if stop {
|
||||||
if locked {
|
if l.locked {
|
||||||
unlockFunc()
|
l.unlockFunc()
|
||||||
}
|
}
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// grab tries to get a lock, see grabLockOrStop for how to use it.
|
||||||
|
func (l *lockGrabber) grab() {
|
||||||
|
defer close(l.doneCh)
|
||||||
|
l.lockFunc()
|
||||||
|
|
||||||
|
// The parent goroutine may or may not be waiting.
|
||||||
|
l.lock.Lock()
|
||||||
|
defer l.lock.Unlock()
|
||||||
|
if !l.parentWaiting {
|
||||||
|
l.unlockFunc()
|
||||||
|
} else {
|
||||||
|
l.locked = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// This checks the leader periodically to ensure that we switch RPC to a new
|
// This checks the leader periodically to ensure that we switch RPC to a new
|
||||||
// leader pretty quickly. There is logic in Leader() already to not make this
|
// leader pretty quickly. There is logic in Leader() already to not make this
|
||||||
// onerous and avoid more traffic than needed, so we just call that and ignore
|
// onerous and avoid more traffic than needed, so we just call that and ignore
|
||||||
|
|
|
@ -589,7 +589,9 @@ func (b *SystemBackend) handleStorageRaftSnapshotWrite(force bool) framework.Ope
|
||||||
defer cleanup()
|
defer cleanup()
|
||||||
|
|
||||||
// Grab statelock
|
// Grab statelock
|
||||||
if stopped := grabLockOrStop(b.Core.stateLock.Lock, b.Core.stateLock.Unlock, b.Core.standbyStopCh.Load().(chan struct{})); stopped {
|
l := newLockGrabber(b.Core.stateLock.Lock, b.Core.stateLock.Unlock, b.Core.standbyStopCh.Load().(chan struct{}))
|
||||||
|
go l.grab()
|
||||||
|
if stopped := l.lockOrStop(); stopped {
|
||||||
b.Core.logger.Error("not applying snapshot; shutting down")
|
b.Core.logger.Error("not applying snapshot; shutting down")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
|
@ -635,7 +635,9 @@ func (c *Core) raftSnapshotRestoreCallback(grabLock bool, sealNode bool) func(co
|
||||||
|
|
||||||
if grabLock {
|
if grabLock {
|
||||||
// Grab statelock
|
// Grab statelock
|
||||||
if stopped := grabLockOrStop(c.stateLock.Lock, c.stateLock.Unlock, c.standbyStopCh.Load().(chan struct{})); stopped {
|
l := newLockGrabber(c.stateLock.Lock, c.stateLock.Unlock, c.standbyStopCh.Load().(chan struct{}))
|
||||||
|
go l.grab()
|
||||||
|
if stopped := l.lockOrStop(); stopped {
|
||||||
c.logger.Error("did not apply snapshot; vault is shutting down")
|
c.logger.Error("did not apply snapshot; vault is shutting down")
|
||||||
return errors.New("did not apply snapshot; vault is shutting down")
|
return errors.New("did not apply snapshot; vault is shutting down")
|
||||||
}
|
}
|
||||||
|
|
|
@ -209,7 +209,9 @@ func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string,
|
||||||
}()
|
}()
|
||||||
|
|
||||||
// Grab the statelock or stop
|
// Grab the statelock or stop
|
||||||
if stopped := grabLockOrStop(m.core.stateLock.RLock, m.core.stateLock.RUnlock, stopCh); stopped {
|
l := newLockGrabber(m.core.stateLock.RLock, m.core.stateLock.RUnlock, stopCh)
|
||||||
|
go l.grab()
|
||||||
|
if stopped := l.lockOrStop(); stopped {
|
||||||
// If we stopped due to shutdown, return. Otherwise another thread
|
// If we stopped due to shutdown, return. Otherwise another thread
|
||||||
// is holding the lock for us, continue on.
|
// is holding the lock for us, continue on.
|
||||||
select {
|
select {
|
||||||
|
|
Loading…
Reference in New Issue