diff --git a/.changelog/11693.txt b/.changelog/11693.txt new file mode 100644 index 000000000..1088a24c5 --- /dev/null +++ b/.changelog/11693.txt @@ -0,0 +1,3 @@ +```release-note:bug +ca: fixes a bug that caused non blocking leaf cert queries to return the same cached response regardless of ca rotation or leaf cert expiry +``` \ No newline at end of file diff --git a/agent/agent_endpoint.go b/agent/agent_endpoint.go index 0222950f4..f736e4b57 100644 --- a/agent/agent_endpoint.go +++ b/agent/agent_endpoint.go @@ -1494,7 +1494,9 @@ func (s *HTTPHandlers) AgentConnectCARoots(resp http.ResponseWriter, req *http.R } // AgentConnectCALeafCert returns the certificate bundle for a service -// instance. This supports blocking queries to update the returned bundle. +// instance. This endpoint ignores all "Cache-Control" attributes. +// This supports blocking queries to update the returned bundle. +// Non-blocking queries will always verify that the cache entry is still valid. func (s *HTTPHandlers) AgentConnectCALeafCert(resp http.ResponseWriter, req *http.Request) (interface{}, error) { // Get the service name. Note that this is the name of the service, // not the ID of the service instance. @@ -1518,6 +1520,14 @@ func (s *HTTPHandlers) AgentConnectCALeafCert(resp http.ResponseWriter, req *htt args.MaxQueryTime = qOpts.MaxQueryTime args.Token = qOpts.Token + // TODO(ffmmmm): maybe set MustRevalidate in ConnectCALeafRequest (as part of CacheInfo()) + // We don't want non-blocking queries to return expired leaf certs + // or leaf certs not valid under the current CA. So always revalidate + // the leaf cert on non-blocking queries (ie when MinQueryIndex == 0) + if args.MinQueryIndex == 0 { + args.MustRevalidate = true + } + if !s.validateRequestPartition(resp, &args.EnterpriseMeta) { return nil, nil } diff --git a/agent/agent_endpoint_test.go b/agent/agent_endpoint_test.go index a1984c946..e9f0d12d8 100644 --- a/agent/agent_endpoint_test.go +++ b/agent/agent_endpoint_test.go @@ -5797,16 +5797,13 @@ func TestAgentConnectCALeafCert_good(t *testing.T) { obj2, err := a.srv.AgentConnectCALeafCert(resp, req) require.NoError(err) require.Equal(obj, obj2) - - // Should cache hit this time and not make request - require.Equal("HIT", resp.Header().Get("X-Cache")) } + // Set a new CA + ca2 := connect.TestCAConfigSet(t, a, nil) + // Issue a blocking query to ensure that the cert gets updated appropriately { - // Set a new CA - ca := connect.TestCAConfigSet(t, a, nil) - resp := httptest.NewRecorder() req, _ := http.NewRequest("GET", "/v1/agent/connect/ca/leaf/test?index="+index, nil) obj, err := a.srv.AgentConnectCALeafCert(resp, req) @@ -5816,12 +5813,59 @@ func TestAgentConnectCALeafCert_good(t *testing.T) { require.NotEqual(issued.PrivateKeyPEM, issued2.PrivateKeyPEM) // Verify that the cert is signed by the new CA - requireLeafValidUnderCA(t, issued2, ca) + requireLeafValidUnderCA(t, issued2, ca2) // Should not be a cache hit! The data was updated in response to the blocking // query being made. require.Equal("MISS", resp.Header().Get("X-Cache")) } + + t.Run("test non-blocking queries update leaf cert", func(t *testing.T) { + resp := httptest.NewRecorder() + obj, err := a.srv.AgentConnectCALeafCert(resp, req) + require.NoError(err) + + // Get the issued cert + issued, ok := obj.(*structs.IssuedCert) + assert.True(ok) + + // Verify that the cert is signed by the CA + requireLeafValidUnderCA(t, issued, ca2) + + // Issue a non blocking query to ensure that the cert gets updated appropriately + { + // Set a new CA + ca3 := connect.TestCAConfigSet(t, a, nil) + + resp := httptest.NewRecorder() + req, err := http.NewRequest("GET", "/v1/agent/connect/ca/leaf/test", nil) + require.NoError(err) + obj, err = a.srv.AgentConnectCALeafCert(resp, req) + require.NoError(err) + issued2 := obj.(*structs.IssuedCert) + require.NotEqual(issued.CertPEM, issued2.CertPEM) + require.NotEqual(issued.PrivateKeyPEM, issued2.PrivateKeyPEM) + + // Verify that the cert is signed by the new CA + requireLeafValidUnderCA(t, issued2, ca3) + + // Should not be a cache hit! + require.Equal("MISS", resp.Header().Get("X-Cache")) + } + + // Test caching for the leaf cert + { + + for fetched := 0; fetched < 4; fetched++ { + + // Fetch it again + resp := httptest.NewRecorder() + obj2, err := a.srv.AgentConnectCALeafCert(resp, req) + require.NoError(err) + require.Equal(obj, obj2) + } + } + }) } // Test we can request a leaf cert for a service we have permission for @@ -5894,9 +5938,6 @@ func TestAgentConnectCALeafCert_goodNotLocal(t *testing.T) { obj2, err := a.srv.AgentConnectCALeafCert(resp, req) require.NoError(err) require.Equal(obj, obj2) - - // Should cache hit this time and not make request - require.Equal("HIT", resp.Header().Get("X-Cache")) } // Test Blocking - see https://github.com/hashicorp/consul/issues/4462 @@ -5944,12 +5985,7 @@ func TestAgentConnectCALeafCert_goodNotLocal(t *testing.T) { // Verify that the cert is signed by the new CA requireLeafValidUnderCA(t, issued2, ca) - // Should be a cache hit! The data should've updated in the cache - // in the background so this should've been fetched directly from - // the cache. - if resp.Header().Get("X-Cache") != "HIT" { - r.Fatalf("should be a cache hit") - } + require.NotEqual(issued, issued2) }) } } @@ -6044,9 +6080,6 @@ func TestAgentConnectCALeafCert_Vault_doesNotChurnLeafCertsAtIdle(t *testing.T) obj2, err := a.srv.AgentConnectCALeafCert(resp, req) require.NoError(err) require.Equal(obj, obj2) - - // Should cache hit this time and not make request - require.Equal("HIT", resp.Header().Get("X-Cache")) } // Test that we aren't churning leaves for no reason at idle. @@ -6153,7 +6186,8 @@ func TestAgentConnectCALeafCert_secondaryDC_good(t *testing.T) { } // List - req, _ := http.NewRequest("GET", "/v1/agent/connect/ca/leaf/test", nil) + req, err := http.NewRequest("GET", "/v1/agent/connect/ca/leaf/test", nil) + require.NoError(err) resp := httptest.NewRecorder() obj, err := a2.srv.AgentConnectCALeafCert(resp, req) require.NoError(err) @@ -6178,9 +6212,6 @@ func TestAgentConnectCALeafCert_secondaryDC_good(t *testing.T) { obj2, err := a2.srv.AgentConnectCALeafCert(resp, req) require.NoError(err) require.Equal(obj, obj2) - - // Should cache hit this time and not make request - require.Equal("HIT", resp.Header().Get("X-Cache")) } // Test that we aren't churning leaves for no reason at idle. @@ -6245,12 +6276,7 @@ func TestAgentConnectCALeafCert_secondaryDC_good(t *testing.T) { // Verify that the cert is signed by the new CA requireLeafValidUnderCA(t, issued2, dc1_ca2) - // Should be a cache hit! The data should've updated in the cache - // in the background so this should've been fetched directly from - // the cache. - if resp.Header().Get("X-Cache") != "HIT" { - r.Fatalf("should be a cache hit") - } + require.NotEqual(issued, issued2) }) } diff --git a/agent/cache-types/connect_ca_leaf.go b/agent/cache-types/connect_ca_leaf.go index 9a7fcd216..1950ef756 100644 --- a/agent/cache-types/connect_ca_leaf.go +++ b/agent/cache-types/connect_ca_leaf.go @@ -380,6 +380,25 @@ func (c *ConnectCALeaf) Fetch(opts cache.FetchOptions, req cache.Request) (cache return c.generateNewLeaf(reqReal, lastResultWithNewState()) } + // If we called Fetch() with MustRevalidate then this call came from a non-blocking query. + // Any prior CA rotations should've already expired the cert. + // All we need to do is check whether the current CA is the one that signed the leaf. If not, generate a new leaf. + // This is not a perfect solution (as a CA rotation update can be missed) but it should take care of instances like + // see https://github.com/hashicorp/consul/issues/10871, https://github.com/hashicorp/consul/issues/9862 + // This seems to me like a hack, so maybe we can revisit the caching/ fetching logic in this case + if req.CacheInfo().MustRevalidate { + roots, err := c.rootsFromCache() + if err != nil { + return lastResultWithNewState(), err + } + if activeRootHasKey(roots, state.authorityKeyID) { + return lastResultWithNewState(), nil + } + + // if we reach here then the current leaf was not signed by the same CAs, just regen + return c.generateNewLeaf(reqReal, lastResultWithNewState()) + } + // We are about to block and wait for a change or timeout. // Make a chan we can be notified of changes to CA roots on. It must be @@ -401,7 +420,7 @@ func (c *ConnectCALeaf) Fetch(opts cache.FetchOptions, req cache.Request) (cache c.fetchStart(rootUpdateCh) defer c.fetchDone(rootUpdateCh) - // Setup the timeout chan outside the loop so we don't keep bumping the timout + // Setup the timeout chan outside the loop so we don't keep bumping the timeout // later if we loop around. timeoutCh := time.After(opts.Timeout) @@ -492,7 +511,7 @@ func (c *ConnectCALeaf) rootsFromCache() (*structs.IndexedCARoots, error) { // generateNewLeaf does the actual work of creating a new private key, // generating a CSR and getting it signed by the servers. result argument -// represents the last result currently in cache if any along with it's state. +// represents the last result currently in cache if any along with its state. func (c *ConnectCALeaf) generateNewLeaf(req *ConnectCALeafRequest, result cache.FetchResult) (cache.FetchResult, error) { @@ -643,14 +662,15 @@ func (c *ConnectCALeaf) generateNewLeaf(req *ConnectCALeafRequest, // since this is only used for cache-related requests and not forwarded // directly to any Consul servers. type ConnectCALeafRequest struct { - Token string - Datacenter string - Service string // Service name, not ID - Agent string // Agent name, not ID - DNSSAN []string - IPSAN []net.IP - MinQueryIndex uint64 - MaxQueryTime time.Duration + Token string + Datacenter string + Service string // Service name, not ID + Agent string // Agent name, not ID + DNSSAN []string + IPSAN []net.IP + MinQueryIndex uint64 + MaxQueryTime time.Duration + MustRevalidate bool structs.EnterpriseMeta } @@ -684,10 +704,11 @@ func (req *ConnectCALeafRequest) TargetPartition() string { func (r *ConnectCALeafRequest) CacheInfo() cache.RequestInfo { return cache.RequestInfo{ - Token: r.Token, - Key: r.Key(), - Datacenter: r.Datacenter, - MinIndex: r.MinQueryIndex, - Timeout: r.MaxQueryTime, + Token: r.Token, + Key: r.Key(), + Datacenter: r.Datacenter, + MinIndex: r.MinQueryIndex, + Timeout: r.MaxQueryTime, + MustRevalidate: r.MustRevalidate, } }