From cf0c5110be0f071da33ee49af2c54fafc326b154 Mon Sep 17 00:00:00 2001 From: "R.B. Boyer" <4903+rboyer@users.noreply.github.com> Date: Wed, 20 Apr 2022 12:21:47 -0500 Subject: [PATCH] ca: fix a bug that caused a non blocking leaf cert query after a blocking leaf cert query to block (#12820) Fixes #12048 Fixes #12319 Regression introduced in #11693 Local reproduction steps: 1. `consul agent -dev` 2. `curl -sLiv 'localhost:8500/v1/agent/connect/ca/leaf/web'` 3. make note of the `X-Consul-Index` header returned 4. `curl -sLi 'localhost:8500/v1/agent/connect/ca/leaf/web?index='` 5. Kill the above curl when it hangs with Ctrl-C 6. Repeat (2) and it should not hang. --- .changelog/12820.txt | 3 ++ agent/agent_endpoint_test.go | 95 ++++++++++++++++++++++++++++++++++++ agent/cache/cache.go | 7 +++ 3 files changed, 105 insertions(+) create mode 100644 .changelog/12820.txt diff --git a/.changelog/12820.txt b/.changelog/12820.txt new file mode 100644 index 000000000..af5533b77 --- /dev/null +++ b/.changelog/12820.txt @@ -0,0 +1,3 @@ +```release-note:bug +ca: fix a bug that caused a non blocking leaf cert query after a blocking leaf cert query to block +``` diff --git a/agent/agent_endpoint_test.go b/agent/agent_endpoint_test.go index 103243497..fc29333e9 100644 --- a/agent/agent_endpoint_test.go +++ b/agent/agent_endpoint_test.go @@ -6202,6 +6202,101 @@ func TestAgentConnectCALeafCert_goodNotLocal(t *testing.T) { } } +func TestAgentConnectCALeafCert_nonBlockingQuery_after_blockingQuery_shouldNotBlock(t *testing.T) { + // see: https://github.com/hashicorp/consul/issues/12048 + + runStep := func(t *testing.T, name string, fn func(t *testing.T)) { + t.Helper() + if !t.Run(name, fn) { + t.FailNow() + } + } + + if testing.Short() { + t.Skip("too slow for testing.Short") + } + + t.Parallel() + + a := NewTestAgent(t, "") + testrpc.WaitForTestAgent(t, a.RPC, "dc1") + testrpc.WaitForActiveCARoot(t, a.RPC, "dc1", nil) + + { + // Register a local service + args := &structs.ServiceDefinition{ + ID: "foo", + Name: "test", + Address: "127.0.0.1", + Port: 8000, + Check: structs.CheckType{ + TTL: 15 * time.Second, + }, + } + req := httptest.NewRequest("PUT", "/v1/agent/service/register", jsonReader(args)) + resp := httptest.NewRecorder() + a.srv.h.ServeHTTP(resp, req) + if !assert.Equal(t, 200, resp.Code) { + t.Log("Body: ", resp.Body.String()) + } + } + + var ( + serialNumber string + index string + issued structs.IssuedCert + ) + runStep(t, "do initial non-blocking query", func(t *testing.T) { + req := httptest.NewRequest("GET", "/v1/agent/connect/ca/leaf/test", nil) + resp := httptest.NewRecorder() + a.srv.h.ServeHTTP(resp, req) + + dec := json.NewDecoder(resp.Body) + require.NoError(t, dec.Decode(&issued)) + serialNumber = issued.SerialNumber + + require.Equal(t, "MISS", resp.Header().Get("X-Cache"), + "for the leaf cert cache type these are always MISS") + index = resp.Header().Get("X-Consul-Index") + }) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go func() { + // launch goroutine for blocking query + req := httptest.NewRequest("GET", "/v1/agent/connect/ca/leaf/test?index="+index, nil).Clone(ctx) + resp := httptest.NewRecorder() + a.srv.h.ServeHTTP(resp, req) + }() + + // We just need to ensure that the above blocking query is in-flight before + // the next step, so do a little sleep. + time.Sleep(50 * time.Millisecond) + + // The initial non-blocking query populated the leaf cert cache entry + // implicitly. The agent cache doesn't prune entries very often at all, so + // in between both of these steps the data should still be there, causing + // this to be a HIT that completes in less than 10m (the default inner leaf + // cert blocking query timeout). + runStep(t, "do a non-blocking query that should not block", func(t *testing.T) { + req := httptest.NewRequest("GET", "/v1/agent/connect/ca/leaf/test", nil) + resp := httptest.NewRecorder() + a.srv.h.ServeHTTP(resp, req) + + var issued2 structs.IssuedCert + dec := json.NewDecoder(resp.Body) + require.NoError(t, dec.Decode(&issued2)) + + require.Equal(t, "HIT", resp.Header().Get("X-Cache")) + + // If this is actually returning a cached result, the serial number + // should be unchanged. + require.Equal(t, serialNumber, issued2.SerialNumber) + + require.Equal(t, issued, issued2) + }) +} + func TestAgentConnectCALeafCert_Vault_doesNotChurnLeafCertsAtIdle(t *testing.T) { ca.SkipIfVaultNotPresent(t) diff --git a/agent/cache/cache.go b/agent/cache/cache.go index d104cdd3b..e012b6868 100644 --- a/agent/cache/cache.go +++ b/agent/cache/cache.go @@ -376,6 +376,13 @@ func (c *Cache) getEntryLocked( // Check if re-validate is requested. If so the first time round the // loop is not a hit but subsequent ones should be treated normally. if !tEntry.Opts.Refresh && info.MustRevalidate { + if entry.Fetching { + // There is an active blocking query for this data, which has not + // returned. We can logically deduce that the contents of the cache + // are actually current, and we can simply return this while + // leaving the blocking query alone. + return true, true, entry + } return true, false, entry }