diff --git a/changelog/23282.txt b/changelog/23282.txt new file mode 100644 index 000000000..1026ccf41 --- /dev/null +++ b/changelog/23282.txt @@ -0,0 +1,3 @@ +```release-note:bug +expiration: Prevent large lease loads from delaying state changes, e.g. becoming active or standby. +``` \ No newline at end of file diff --git a/vault/expiration.go b/vault/expiration.go index 188299e8b..1eb3a5a1e 100644 --- a/vault/expiration.go +++ b/vault/expiration.go @@ -147,7 +147,7 @@ type ExpirationManager struct { leaseCheckCounter *uint32 logLeaseExpirations bool - expireFunc ExpireLeaseStrategy + expireFunc atomic.Pointer[ExpireLeaseStrategy] // testRegisterAuthFailure, if set to true, triggers an explicit failure on // RegisterAuth to simulate a partial failure during a token creation @@ -360,11 +360,11 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log leaseCheckCounter: new(uint32), logLeaseExpirations: os.Getenv("VAULT_SKIP_LOGGING_LEASE_EXPIRATIONS") == "", - expireFunc: e, jobManager: jobManager, revokeRetryBase: c.expirationRevokeRetryBase, } + exp.expireFunc.Store(&e) if exp.revokeRetryBase == 0 { exp.revokeRetryBase = revokeRetryBase } @@ -846,6 +846,8 @@ func (m *ExpirationManager) processRestore(ctx context.Context, leaseID string) return nil } +func expireNoop(ctx context.Context, manager *ExpirationManager, s string, n *namespace.Namespace) {} + // Stop is used to prevent further automatic revocations. // This must be called before sealing the view. func (m *ExpirationManager) Stop() error { @@ -860,27 +862,24 @@ func (m *ExpirationManager) Stop() error { close(m.quitCh) m.pendingLock.Lock() - // Replacing the entire map would cause a race with - // a simultaneous WalkTokens, which doesn't hold pendingLock. - m.pending.Range(func(key, value interface{}) bool { - info := value.(pendingInfo) - info.timer.Stop() - m.pending.Delete(key) - return true - }) + // We don't want any lease timers that fire to do anything; they can wait + // for the next ExpirationManager to handle them. + newStrategy := ExpireLeaseStrategy(expireNoop) + m.expireFunc.Store(&newStrategy) + oldPending := m.pending + m.pending, m.nonexpiring, m.irrevocable = sync.Map{}, sync.Map{}, sync.Map{} m.leaseCount = 0 - m.nonexpiring.Range(func(key, value interface{}) bool { - m.nonexpiring.Delete(key) - return true - }) m.uniquePolicies = make(map[string][]string) - m.irrevocable.Range(func(key, _ interface{}) bool { - m.irrevocable.Delete(key) - return true - }) m.irrevocableLeaseCount = 0 m.pendingLock.Unlock() + go oldPending.Range(func(key, value interface{}) bool { + info := value.(pendingInfo) + // We need to stop the timers to prevent memory leaks. + info.timer.Stop() + return true + }) + if m.inRestoreMode() { for { if !m.inRestoreMode() { @@ -1881,7 +1880,8 @@ func (m *ExpirationManager) updatePendingInternal(le *leaseEntry) { leaseID, namespace := le.LeaseID, le.namespace // Extend the timer by the lease total timer := time.AfterFunc(leaseTotal, func() { - m.expireFunc(m.quitContext, m, leaseID, namespace) + expFn := *m.expireFunc.Load() // Load and deref the pointer + expFn(m.quitContext, m, leaseID, namespace) }) pending = pendingInfo{ timer: timer, @@ -2471,8 +2471,13 @@ func (m *ExpirationManager) WalkTokens(walkFn ExpirationWalkFunction) error { return true } - m.pending.Range(callback) - m.nonexpiring.Range(callback) + m.pendingLock.RLock() + toWalk := []sync.Map{m.pending, m.nonexpiring} + m.pendingLock.RUnlock() + + for _, m := range toWalk { + m.Range(callback) + } return nil } @@ -2495,8 +2500,13 @@ func (m *ExpirationManager) walkLeases(walkFn leaseWalkFunction) error { return walkFn(key.(string), expireTime) } - m.pending.Range(callback) - m.nonexpiring.Range(callback) + m.pendingLock.RLock() + toWalk := []sync.Map{m.pending, m.nonexpiring} + m.pendingLock.RUnlock() + + for _, m := range toWalk { + m.Range(callback) + } return nil }