Fix hot loop in cache for RPC returning zero index.

2018-06-15 21:03:19 +01:00 · 2018-06-15 21:03:19 +01:00 · 81bd1b43a3
parent 3d51c2aeac
commit 81bd1b43a3
5 changed files with 133 additions and 23 deletions
--- a/agent/cache/cache.go
+++ b/agent/cache/cache.go
@ -325,14 +325,6 @@ func (c *Cache) fetch(t, key string, r Request, allowNew bool, attempt uint) (<-
 		entry = cacheEntry{Valid: false, Waiter: make(chan struct{})}
 	}

-	// We always specify an index greater than zero since index of zero
-	// means to always return immediately and we want to block if possible.
-	// Index 1 is always safe since Consul's own initialization always results
-	// in a higher index (around 10 or above).
-	if entry.Index == 0 {
-		entry.Index = 1
-	}
-
 	// Set that we're fetching to true, which makes it so that future
 	// identical calls to fetch will return the same waiter rather than
 	// perform multiple fetches.
@ -355,6 +347,17 @@ func (c *Cache) fetch(t, key string, r Request, allowNew bool, attempt uint) (<-
 			// A new value was given, so we create a brand new entry.
 			newEntry.Value = result.Value
 			newEntry.Index = result.Index
+			if newEntry.Index < 1 {
+				// Less than one is invalid unless there was an error and in this case
+				// there wasn't since a value was returned. If a badly behaved RPC
+				// returns 0 when it has no data, we might get into a busy loop here. We
+				// set this to minimum of 1 which is safe because no valid user data can
+				// ever be written at raft index 1 due to the bootstrap process for
+				// raft. This insure that any subsequent background refresh request will
+				// always block, but allows the initial request to return immediately
+				// even if there is no data.
+				newEntry.Index = 1
+			}

 			// This is a valid entry with a result
 			newEntry.Valid = true
@ -365,8 +368,25 @@ func (c *Cache) fetch(t, key string, r Request, allowNew bool, attempt uint) (<-
 			metrics.IncrCounter([]string{"consul", "cache", "fetch_success"}, 1)
 			metrics.IncrCounter([]string{"consul", "cache", t, "fetch_success"}, 1)

+			if result.Index > 0 {
 				// Reset the attempts counter so we don't have any backoff
 				attempt = 0
+			} else {
+				// Result having a zero index is an implicit error case. There was no
+				// actual error but it implies the RPC found in index (nothing written
+				// yet for that type) but didn't take care to return safe "1" index. We
+				// don't want to actually treat it like an error by setting
+				// newEntry.Error to something non-nil, but we should guard against 100%
+				// CPU burn hot loops caused by that case which will never block but
+				// also won't backoff either. So we treat it as a failed attempt so that
+				// at least the failure backoff will save our CPU while still
+				// periodically refreshing so normal service can resume when the servers
+				// actually have something to return from the RPC. If we get in this
+				// state it can be considered a bug in the RPC implementation (to ever
+				// return a zero index) however since it can happen this is a safety net
+				// for the future.
+				attempt++
+			}
 		} else {
 			metrics.IncrCounter([]string{"consul", "cache", "fetch_error"}, 1)
 			metrics.IncrCounter([]string{"consul", "cache", t, "fetch_error"}, 1)
--- a/agent/cache/cache_test.go
+++ b/agent/cache/cache_test.go
@ -471,10 +471,8 @@ func TestCacheGet_periodicRefreshErrorBackoff(t *testing.T) {
 	require.True(t, actual < 10, fmt.Sprintf("actual: %d", actual))
 }

-// Test that fetching with no index actually sets to index to one, including
-// for background refreshes. This ensures we don't end up with any index 0
-// loops.
-func TestCacheGet_noIndexSetsOne(t *testing.T) {
+// Test that a badly behaved RPC that returns 0 index will perform a backoff.
+func TestCacheGet_periodicRefreshBadRPCZeroIndexErrorBackoff(t *testing.T) {
 	t.Parallel()

 	typ := TestType(t)
@ -487,20 +485,93 @@ func TestCacheGet_noIndexSetsOne(t *testing.T) {
 	})

 	// Configure the type
-	fetchErr := fmt.Errorf("test fetch error")
-	typ.Static(FetchResult{Value: 42, Index: 0}, fetchErr).Run(func(args mock.Arguments) {
-		opts := args.Get(0).(FetchOptions)
-		assert.True(t, opts.MinIndex > 0, "minIndex > 0")
+	var retries uint32
+	typ.Static(FetchResult{Value: 0, Index: 0}, nil).Run(func(args mock.Arguments) {
+		atomic.AddUint32(&retries, 1)
 	})

 	// Fetch
 	resultCh := TestCacheGetCh(t, c, "t", TestRequest(t, RequestInfo{Key: "hello"}))
-	TestCacheGetChResult(t, resultCh, 42)
+	TestCacheGetChResult(t, resultCh, 0)
+
+	// Sleep a bit. The refresh will quietly fail in the background. What we
+	// want to verify is that it doesn't retry too much. "Too much" is hard
+	// to measure since its CPU dependent if this test is failing. But due
+	// to the short sleep below, we can calculate about what we'd expect if
+	// backoff IS working.
+	time.Sleep(500 * time.Millisecond)
+
+	// Fetch should work, we should get a 0 still. Errors are ignored.
+	resultCh = TestCacheGetCh(t, c, "t", TestRequest(t, RequestInfo{Key: "hello"}))
+	TestCacheGetChResult(t, resultCh, 0)
+
+	// Check the number
+	actual := atomic.LoadUint32(&retries)
+	require.True(t, actual < 10, fmt.Sprintf("%d retries, should be < 10", actual))
+}
+
+// Test that fetching with no index makes an initial request with no index, but
+// then ensures all background refreshes have > 0. This ensures we don't end up
+// with any index 0 loops from background refreshed while also returning
+// immediately on the initial request if there is no data written to that table
+// yet.
+func TestCacheGet_noIndexSetsOne(t *testing.T) {
+	t.Parallel()
+
+	typ := TestType(t)
+	defer typ.AssertExpectations(t)
+	c := TestCache(t)
+	c.RegisterType("t", typ, &RegisterOptions{
+		Refresh:        true,
+		RefreshTimer:   0,
+		RefreshTimeout: 5 * time.Minute,
+	})
+
+	// Simulate "well behaved" RPC with no data yet but returning 1
+	{
+		first := int32(1)
+
+		typ.Static(FetchResult{Value: 0, Index: 1}, nil).Run(func(args mock.Arguments) {
+			opts := args.Get(0).(FetchOptions)
+			isFirst := atomic.SwapInt32(&first, 0)
+			if isFirst == 1 {
+				assert.Equal(t, uint64(0), opts.MinIndex)
+			} else {
+				assert.True(t, opts.MinIndex > 0, "minIndex > 0")
+			}
+		})
+
+		// Fetch
+		resultCh := TestCacheGetCh(t, c, "t", TestRequest(t, RequestInfo{Key: "hello"}))
+		TestCacheGetChResult(t, resultCh, 0)

 		// Sleep a bit so background refresh happens
 		time.Sleep(100 * time.Millisecond)
 	}

+	// Same for "badly behaved" RPC that returns 0 index and no data
+	{
+		first := int32(1)
+
+		typ.Static(FetchResult{Value: 0, Index: 0}, nil).Run(func(args mock.Arguments) {
+			opts := args.Get(0).(FetchOptions)
+			isFirst := atomic.SwapInt32(&first, 0)
+			if isFirst == 1 {
+				assert.Equal(t, uint64(0), opts.MinIndex)
+			} else {
+				assert.True(t, opts.MinIndex > 0, "minIndex > 0")
+			}
+		})
+
+		// Fetch
+		resultCh := TestCacheGetCh(t, c, "t", TestRequest(t, RequestInfo{Key: "hello"}))
+		TestCacheGetChResult(t, resultCh, 0)
+
+		// Sleep a bit so background refresh happens
+		time.Sleep(100 * time.Millisecond)
+	}
+}
+
 // Test that the backend fetch sets the proper timeout.
 func TestCacheGet_fetchTimeout(t *testing.T) {
 	t.Parallel()
--- a/agent/consul/state/intention.go
+++ b/agent/consul/state/intention.go
@ -132,6 +132,9 @@ func (s *Store) Intentions(ws memdb.WatchSet) (uint64, structs.Intentions, error

 	// Get the index
 	idx := maxIndexTxn(tx, intentionsTableName)
+	if idx < 1 {
+		idx = 1
+	}

 	// Get all intentions
 	iter, err := tx.Get(intentionsTableName, "id")
@ -228,6 +231,9 @@ func (s *Store) IntentionGet(ws memdb.WatchSet, id string) (uint64, *structs.Int

 	// Get the table index.
 	idx := maxIndexTxn(tx, intentionsTableName)
+	if idx < 1 {
+		idx = 1
+	}

 	// Look up by its ID.
 	watchCh, intention, err := tx.FirstWatch(intentionsTableName, "id", id)
@ -295,6 +301,9 @@ func (s *Store) IntentionMatch(ws memdb.WatchSet, args *structs.IntentionQueryMa

 	// Get the table index.
 	idx := maxIndexTxn(tx, intentionsTableName)
+	if idx < 1 {
+		idx = 1
+	}

 	// Make all the calls and accumulate the results
 	results := make([]structs.Intentions, len(args.Entries))
--- a/agent/consul/state/intention_test.go
+++ b/agent/consul/state/intention_test.go
@ -17,7 +17,7 @@ func TestStore_IntentionGet_none(t *testing.T) {
 	// Querying with no results returns nil.
 	ws := memdb.NewWatchSet()
 	idx, res, err := s.IntentionGet(ws, testUUID())
-	assert.Equal(idx, uint64(0))
+	assert.Equal(uint64(1), idx)
 	assert.Nil(res)
 	assert.Nil(err)
 }
@ -231,7 +231,7 @@ func TestStore_IntentionsList(t *testing.T) {
 	idx, res, err := s.Intentions(ws)
 	assert.NoError(err)
 	assert.Nil(res)
-	assert.Equal(idx, uint64(0))
+	assert.Equal(uint64(1), idx)

 	// Create some intentions
 	ixns := structs.Intentions{
--- a/agent/testagent.go
+++ b/agent/testagent.go
@ -352,6 +352,16 @@ func TestConfig(sources ...config.Source) *config.RuntimeConfig {
 				ca_config {
 					cluster_id = "` + connect.TestClusterID + `"
 				}
+				proxy_defaults {
+					// Generally we don't actually need to test real proxy startup except
+					// in the Daemon package which explicitly manages how it starts things
+					// so making this a no-op long running command like /bin/sleep would
+					// be fine, but there is no such thing on windows etc. We hackily rely
+					// on the fact that if the executable doesn't exist, the Daemon
+					// manager will get an exec error and retry it with a backoff beningly
+					// until tests pass.
+				  daemon_command = ["/bin/sleep", "3600"]
+				}
 			}
 			performance {
 				raft_multiplier = 1