Fix multiple issues related to proxycfg health queries. (#17241)

Fix multiple issues related to proxycfg health queries. 1. The datacenter was not being provided to a proxycfg query, which resulted in bypassing agentless query optimizations and using the normal API instead. 2. The health rpc endpoint would return a zero index when insufficient ACLs were detected. This would result in the agent cache performing an infinite loop of queries in rapid succession without backoff.
2023-05-09 12:37:58 -05:00 · 2023-05-09 12:37:58 -05:00 · 3ce5277217
parent 91f76b6fb2
commit 3ce5277217
5 changed files with 37 additions and 30 deletions
--- a/.changelog/17241.txt
+++ b/.changelog/17241.txt
@ -0,0 +1,3 @@
+```release-note:bug
+connect: Fix multiple inefficient behaviors when querying service health.
+```
--- a/agent/consul/health_endpoint.go
+++ b/agent/consul/health_endpoint.go
@ -214,28 +214,10 @@ func (h *Health) ServiceNodes(args *structs.ServiceSpecificRequest, reply *struc
 		f = h.serviceNodesDefault
 	}

-	authzContext := acl.AuthorizerContext{
-		Peer: args.PeerName,
-	}
-	authz, err := h.srv.ResolveTokenAndDefaultMeta(args.Token, &args.EnterpriseMeta, &authzContext)
-	if err != nil {
-		return err
-	}
-
 	if err := h.srv.validateEnterpriseRequest(&args.EnterpriseMeta, false); err != nil {
 		return err
 	}

-	// If we're doing a connect or ingress query, we need read access to the service
-	// we're trying to find proxies for, so check that.
-	if args.Connect || args.Ingress {
-		// TODO(acl-error-enhancements) Look for ways to percolate this information up to give any feedback to the user.
-		if authz.ServiceRead(args.ServiceName, &authzContext) != acl.Allow {
-			// Just return nil, which will return an empty response (tested)
-			return nil
-		}
-	}
-
 	filter, err := bexpr.CreateFilter(args.Filter, nil, reply.Nodes)
 	if err != nil {
 		return err
@ -257,6 +239,25 @@ func (h *Health) ServiceNodes(args *structs.ServiceSpecificRequest, reply *struc
 				return err
 			}

+			authzContext := acl.AuthorizerContext{
+				Peer: args.PeerName,
+			}
+			authz, err := h.srv.ResolveTokenAndDefaultMeta(args.Token, &args.EnterpriseMeta, &authzContext)
+			if err != nil {
+				return err
+			}
+
+			// If we're doing a connect or ingress query, we need read access to the service
+			// we're trying to find proxies for, so check that.
+			if args.Connect || args.Ingress {
+				// TODO(acl-error-enhancements) Look for ways to percolate this information up to give any feedback to the user.
+				if authz.ServiceRead(args.ServiceName, &authzContext) != acl.Allow {
+					// Return the index here so that the agent cache does not infinitely loop.
+					reply.Index = index
+					return nil
+				}
+			}
+
 			resolvedNodes := nodes
 			if args.MergeCentralConfig {
 				for _, node := range resolvedNodes {
--- a/agent/consul/health_endpoint_test.go
+++ b/agent/consul/health_endpoint_test.go
@ -1127,6 +1127,7 @@ node "foo" {
 	var resp structs.IndexedCheckServiceNodes
 	assert.Nil(t, msgpackrpc.CallWithCodec(codec, "Health.ServiceNodes", &req, &resp))
 	assert.Len(t, resp.Nodes, 0)
+	assert.Greater(t, resp.Index, uint64(0))

 	// List w/ token. This should work since we're requesting "foo", but should
 	// also only contain the proxies with names that adhere to our ACL.
--- a/agent/proxycfg/state_test.go
+++ b/agent/proxycfg/state_test.go
@ -795,7 +795,7 @@ func TestState_WatchesAndUpdates(t *testing.T) {
 				fmt.Sprintf("upstream-target:api-failover-remote.default.default.dc2:%s-failover-remote?dc=dc2", apiUID.String()): genVerifyServiceSpecificRequest("api-failover-remote", "", "dc2", true),
 				fmt.Sprintf("upstream-target:api-failover-local.default.default.dc2:%s-failover-local?dc=dc2", apiUID.String()):   genVerifyServiceSpecificRequest("api-failover-local", "", "dc2", true),
 				fmt.Sprintf("upstream-target:api-failover-direct.default.default.dc2:%s-failover-direct?dc=dc2", apiUID.String()): genVerifyServiceSpecificRequest("api-failover-direct", "", "dc2", true),
-				upstreamPeerWatchIDPrefix + fmt.Sprintf("%s-failover-to-peer?peer=cluster-01", apiUID.String()):                   genVerifyServiceSpecificPeeredRequest("api-failover-to-peer", "", "", "cluster-01", true),
+				upstreamPeerWatchIDPrefix + fmt.Sprintf("%s-failover-to-peer?peer=cluster-01", apiUID.String()):                   genVerifyServiceSpecificPeeredRequest("api-failover-to-peer", "", "dc1", "cluster-01", true),
 				fmt.Sprintf("mesh-gateway:dc2:%s-failover-remote?dc=dc2", apiUID.String()):                                        genVerifyGatewayWatch("dc2"),
 				fmt.Sprintf("mesh-gateway:dc1:%s-failover-local?dc=dc2", apiUID.String()):                                         genVerifyGatewayWatch("dc1"),
 			},
--- a/agent/proxycfg/upstreams.go
+++ b/agent/proxycfg/upstreams.go
@ -316,8 +316,19 @@ func (s *handlerUpstreams) resetWatchesFromChain(
 			watchedChainEndpoints = true
 		}

-		opts := targetWatchOpts{upstreamID: uid}
-		opts.fromChainTarget(target)
+		opts := targetWatchOpts{
+			upstreamID: uid,
+			chainID:    target.ID,
+			service:    target.Service,
+			filter:     target.Subset.Filter,
+			datacenter: target.Datacenter,
+			peer:       target.Peer,
+			entMeta:    target.GetEnterpriseMetadata(),
+		}
+		// Peering targets do not set the datacenter field, so we should default it here.
+		if opts.datacenter == "" {
+			opts.datacenter = s.source.Datacenter
+		}

 		err := s.watchUpstreamTarget(ctx, snap, opts)
 		if err != nil {
@ -435,15 +446,6 @@ type targetWatchOpts struct {
 	entMeta    *acl.EnterpriseMeta
 }

-func (o *targetWatchOpts) fromChainTarget(t *structs.DiscoveryTarget) {
-	o.chainID = t.ID
-	o.service = t.Service
-	o.filter = t.Subset.Filter
-	o.datacenter = t.Datacenter
-	o.peer = t.Peer
-	o.entMeta = t.GetEnterpriseMetadata()
-}
-
 func (s *handlerUpstreams) watchUpstreamTarget(ctx context.Context, snap *ConfigSnapshotUpstreams, opts targetWatchOpts) error {
 	s.logger.Trace("initializing watch of target",
 		"upstream", opts.upstreamID,