cache: prevent goroutine leak in agent cache (#14908)
A bug was discovered in the error handling code for the Agent cache subsystem: 1. NotifyCallback calls notifyBlockingQuery, which calls getWithIndex in a loop (backing off on error, up to 1 minute). 2. getWithIndex calls fetch if there is no valid entry in the cache. 3. fetch starts a goroutine which calls Fetch on the cache-type, waits for a while (again with backoff up to 1 minute on error), and then calls fetch to trigger a refresh. The end result is that every minute notifyBlockingQuery spawns an ancestry of goroutines that essentially lives forever. This PR ensures that the goroutine started by `fetch` cancels any prior goroutine spawned for the same key. In isolated testing where a cache type was tweaked to error indefinitely, this patch prevented goroutine counts from skyrocketing.
This commit is contained in:
parent
ca916eec32
commit
9f41cc4a25
|
@ -0,0 +1,3 @@
|
||||||
|
```release-note:bug
|
||||||
|
cache: prevent goroutine leak in agent cache
|
||||||
|
```
|
|
@ -139,6 +139,10 @@ type Cache struct {
|
||||||
entries map[string]cacheEntry
|
entries map[string]cacheEntry
|
||||||
entriesExpiryHeap *ttlcache.ExpiryHeap
|
entriesExpiryHeap *ttlcache.ExpiryHeap
|
||||||
|
|
||||||
|
fetchLock sync.Mutex
|
||||||
|
lastFetchID uint64
|
||||||
|
fetchHandles map[string]fetchHandle
|
||||||
|
|
||||||
// stopped is used as an atomic flag to signal that the Cache has been
|
// stopped is used as an atomic flag to signal that the Cache has been
|
||||||
// discarded so background fetches and expiry processing should stop.
|
// discarded so background fetches and expiry processing should stop.
|
||||||
stopped uint32
|
stopped uint32
|
||||||
|
@ -150,6 +154,11 @@ type Cache struct {
|
||||||
rateLimitCancel context.CancelFunc
|
rateLimitCancel context.CancelFunc
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// fetchHandle identifies one background fetch goroutine for a cache key.
// Closing stopCh signals that goroutine to exit; id lets the goroutine
// remove only its own registration on exit, so it does not clobber a newer
// handle that has already replaced it for the same key.
type fetchHandle struct {
	id     uint64
	stopCh chan struct{}
}
|
||||||
|
|
||||||
// typeEntry is a single type that is registered with a Cache.
|
// typeEntry is a single type that is registered with a Cache.
|
||||||
type typeEntry struct {
|
type typeEntry struct {
|
||||||
// Name that was used to register the Type
|
// Name that was used to register the Type
|
||||||
|
@ -225,6 +234,7 @@ func New(options Options) *Cache {
|
||||||
types: make(map[string]typeEntry),
|
types: make(map[string]typeEntry),
|
||||||
entries: make(map[string]cacheEntry),
|
entries: make(map[string]cacheEntry),
|
||||||
entriesExpiryHeap: ttlcache.NewExpiryHeap(),
|
entriesExpiryHeap: ttlcache.NewExpiryHeap(),
|
||||||
|
fetchHandles: make(map[string]fetchHandle),
|
||||||
stopCh: make(chan struct{}),
|
stopCh: make(chan struct{}),
|
||||||
options: options,
|
options: options,
|
||||||
rateLimitContext: ctx,
|
rateLimitContext: ctx,
|
||||||
|
@ -614,8 +624,18 @@ func (c *Cache) fetch(key string, r getOptions, allowNew bool, attempt uint, ign
|
||||||
metrics.SetGauge([]string{"cache", "entries_count"}, float32(len(c.entries)))
|
metrics.SetGauge([]string{"cache", "entries_count"}, float32(len(c.entries)))
|
||||||
|
|
||||||
tEntry := r.TypeEntry
|
tEntry := r.TypeEntry
|
||||||
// The actual Fetch must be performed in a goroutine.
|
|
||||||
go func() {
|
// The actual Fetch must be performed in a goroutine. Ensure that we only
|
||||||
|
// have one in-flight at a time, but don't use a deferred
|
||||||
|
// context.WithCancel style termination so that these things outlive their
|
||||||
|
// requester.
|
||||||
|
//
|
||||||
|
// By the time we get here the system WANTS to make a replacement fetcher, so
|
||||||
|
// we terminate the prior one and replace it.
|
||||||
|
handle := c.getOrReplaceFetchHandle(key)
|
||||||
|
go func(handle fetchHandle) {
|
||||||
|
defer c.deleteFetchHandle(key, handle.id)
|
||||||
|
|
||||||
// If we have background refresh and currently are in "disconnected" state,
|
// If we have background refresh and currently are in "disconnected" state,
|
||||||
// waiting for a response might mean we mark our results as stale for up to
|
// waiting for a response might mean we mark our results as stale for up to
|
||||||
// 10 minutes (max blocking timeout) after connection is restored. To reduce
|
// 10 minutes (max blocking timeout) after connection is restored. To reduce
|
||||||
|
@ -666,6 +686,14 @@ func (c *Cache) fetch(key string, r getOptions, allowNew bool, attempt uint, ign
|
||||||
connectedTimer.Stop()
|
connectedTimer.Stop()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If we were stopped while waiting on a blocking query now would be a
|
||||||
|
// good time to detect that.
|
||||||
|
select {
|
||||||
|
case <-handle.stopCh:
|
||||||
|
return
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
// Copy the existing entry to start.
|
// Copy the existing entry to start.
|
||||||
newEntry := entry
|
newEntry := entry
|
||||||
newEntry.Fetching = false
|
newEntry.Fetching = false
|
||||||
|
@ -820,13 +848,15 @@ func (c *Cache) fetch(key string, r getOptions, allowNew bool, attempt uint, ign
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we're over the attempt minimum, start an exponential backoff.
|
// If we're over the attempt minimum, start an exponential backoff.
|
||||||
if wait := backOffWait(attempt); wait > 0 {
|
wait := backOffWait(attempt)
|
||||||
time.Sleep(wait)
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we have a timer, wait for it
|
// If we have a timer, wait for it
|
||||||
if tEntry.Opts.RefreshTimer > 0 {
|
wait += tEntry.Opts.RefreshTimer
|
||||||
time.Sleep(tEntry.Opts.RefreshTimer)
|
|
||||||
|
select {
|
||||||
|
case <-time.After(wait):
|
||||||
|
case <-handle.stopCh:
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Trigger. The "allowNew" field is false because in the time we were
|
// Trigger. The "allowNew" field is false because in the time we were
|
||||||
|
@ -836,11 +866,46 @@ func (c *Cache) fetch(key string, r getOptions, allowNew bool, attempt uint, ign
|
||||||
r.Info.MinIndex = 0
|
r.Info.MinIndex = 0
|
||||||
c.fetch(key, r, false, attempt, true)
|
c.fetch(key, r, false, attempt, true)
|
||||||
}
|
}
|
||||||
}()
|
}(handle)
|
||||||
|
|
||||||
return entry.Waiter
|
return entry.Waiter
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *Cache) getOrReplaceFetchHandle(key string) fetchHandle {
|
||||||
|
c.fetchLock.Lock()
|
||||||
|
defer c.fetchLock.Unlock()
|
||||||
|
|
||||||
|
if prevHandle, ok := c.fetchHandles[key]; ok {
|
||||||
|
close(prevHandle.stopCh)
|
||||||
|
}
|
||||||
|
|
||||||
|
c.lastFetchID++
|
||||||
|
|
||||||
|
handle := fetchHandle{
|
||||||
|
id: c.lastFetchID,
|
||||||
|
stopCh: make(chan struct{}),
|
||||||
|
}
|
||||||
|
|
||||||
|
c.fetchHandles[key] = handle
|
||||||
|
|
||||||
|
return handle
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Cache) deleteFetchHandle(key string, fetchID uint64) {
|
||||||
|
c.fetchLock.Lock()
|
||||||
|
defer c.fetchLock.Unlock()
|
||||||
|
|
||||||
|
// Only remove a fetchHandle if it's YOUR fetchHandle.
|
||||||
|
handle, ok := c.fetchHandles[key]
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if handle.id == fetchID {
|
||||||
|
delete(c.fetchHandles, key)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func backOffWait(failures uint) time.Duration {
|
func backOffWait(failures uint) time.Duration {
|
||||||
if failures > CacheRefreshBackoffMin {
|
if failures > CacheRefreshBackoffMin {
|
||||||
shift := failures - CacheRefreshBackoffMin
|
shift := failures - CacheRefreshBackoffMin
|
||||||
|
|
|
@ -18,6 +18,7 @@ import (
|
||||||
"github.com/hashicorp/consul/acl"
|
"github.com/hashicorp/consul/acl"
|
||||||
"github.com/hashicorp/consul/lib/ttlcache"
|
"github.com/hashicorp/consul/lib/ttlcache"
|
||||||
"github.com/hashicorp/consul/sdk/testutil"
|
"github.com/hashicorp/consul/sdk/testutil"
|
||||||
|
"github.com/hashicorp/consul/sdk/testutil/retry"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Test a basic Get with no indexes (and therefore no blocking queries).
|
// Test a basic Get with no indexes (and therefore no blocking queries).
|
||||||
|
@ -1750,12 +1751,22 @@ func TestCache_RefreshLifeCycle(t *testing.T) {
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.Equal(t, true, result)
|
require.Equal(t, true, result)
|
||||||
|
|
||||||
// ensure that the entry is fetching again
|
waitUntilFetching := func(expectValue bool) {
|
||||||
|
retry.Run(t, func(t *retry.R) {
|
||||||
c.entriesLock.Lock()
|
c.entriesLock.Lock()
|
||||||
|
defer c.entriesLock.Unlock()
|
||||||
entry, ok := c.entries[key]
|
entry, ok := c.entries[key]
|
||||||
require.True(t, ok)
|
require.True(t, ok)
|
||||||
|
if expectValue {
|
||||||
require.True(t, entry.Fetching)
|
require.True(t, entry.Fetching)
|
||||||
c.entriesLock.Unlock()
|
} else {
|
||||||
|
require.False(t, entry.Fetching)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ensure that the entry is fetching again
|
||||||
|
waitUntilFetching(true)
|
||||||
|
|
||||||
requestChan := make(chan error)
|
requestChan := make(chan error)
|
||||||
|
|
||||||
|
@ -1789,11 +1800,7 @@ func TestCache_RefreshLifeCycle(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// ensure that the entry is fetching again
|
// ensure that the entry is fetching again
|
||||||
c.entriesLock.Lock()
|
waitUntilFetching(true)
|
||||||
entry, ok = c.entries[key]
|
|
||||||
require.True(t, ok)
|
|
||||||
require.True(t, entry.Fetching)
|
|
||||||
c.entriesLock.Unlock()
|
|
||||||
|
|
||||||
// background a call that will wait for a newer version - will result in an acl not found error
|
// background a call that will wait for a newer version - will result in an acl not found error
|
||||||
go getError(5)
|
go getError(5)
|
||||||
|
@ -1814,11 +1821,7 @@ func TestCache_RefreshLifeCycle(t *testing.T) {
|
||||||
|
|
||||||
// ensure that the ACL not found error killed off the background refresh
|
// ensure that the ACL not found error killed off the background refresh
|
||||||
// but didn't remove it from the cache
|
// but didn't remove it from the cache
|
||||||
c.entriesLock.Lock()
|
waitUntilFetching(false)
|
||||||
entry, ok = c.entries[key]
|
|
||||||
require.True(t, ok)
|
|
||||||
require.False(t, entry.Fetching)
|
|
||||||
c.entriesLock.Unlock()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type fakeType struct {
|
type fakeType struct {
|
||||||
|
|
Loading…
Reference in New Issue