From 1ad327ddf537c941ed5c4c98446657bc2f0100a9 Mon Sep 17 00:00:00 2001
From: Paul Glass <pglass@hashicorp.com>
Date: Fri, 3 Feb 2023 08:45:11 -0600
Subject: [PATCH] Use agent token for service/check deregistration during
 anti-entropy (#16097)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use only the agent token for deregistration during anti-entropy

The previous behavior had the agent attempt to use the "service" token
(i.e. from the `token` field in a service definition file), and if that
was not set then it would use the agent token.

The previous behavior was problematic because, if the service token had
been deleted, the deregistration request would fail. The agent would
retry the deregistration during each anti-entropy sync, and the
situation would never resolve.

The new behavior is to only/always use the agent token for service and
check deregistration during anti-entropy. This approach is:

* Simpler: No fallback logic to try different tokens
* Faster (slightly): No time spent attempting the service token
* Correct: The agent token is able to deregister services on that
  agent's node, because:
  * node:write permissions allow deregistration of services/checks on
    that node.
  * The agent token must have node:write permission, or else the agent
    is not be able to (de)register itself into the catalog

Co-authored-by: Vesa Hagström <weeezes@gmail.com>
---
 .changelog/16097.txt      |  3 +++
 agent/local/state.go      | 11 +++++++++--
 agent/local/state_test.go | 41 ++++++++++++++++++++++++++++++++-------
 3 files changed, 46 insertions(+), 9 deletions(-)
 create mode 100644 .changelog/16097.txt

diff --git a/.changelog/16097.txt b/.changelog/16097.txt
new file mode 100644
index 000000000..e25bf8961
--- /dev/null
+++ b/.changelog/16097.txt
@@ -0,0 +1,3 @@
+```release-note:bug-fix
+agent: Only use the `agent` token for internal deregistration of checks and services during anti-entropy syncing. The service token specified in the `token` field of the check or service definition is no longer used for deregistration. This fixes an issue where the agent failed to ever deregister a service or check if the service token was deleted.
+```
diff --git a/agent/local/state.go b/agent/local/state.go
index d6ff091ad..d3ffa1e02 100644
--- a/agent/local/state.go
+++ b/agent/local/state.go
@@ -1293,7 +1293,12 @@ func (l *State) deleteService(key structs.ServiceID) error {
 		return fmt.Errorf("ServiceID missing")
 	}
 
-	st := l.aclTokenForServiceSync(key, l.tokens.AgentToken)
+	// Always use the agent token to delete without trying the service token.
+	// This works because the agent token really must have node:write
+	// permission and node:write allows deregistration of services/checks on
+	// that node. Because the service token may have been deleted, using the
+	// agent token without fallback logic is a bit faster, simpler, and safer.
+	st := l.tokens.AgentToken()
 	req := structs.DeregisterRequest{
 		Datacenter:     l.config.Datacenter,
 		Node:           l.config.NodeName,
@@ -1344,7 +1349,9 @@ func (l *State) deleteCheck(key structs.CheckID) error {
 		return fmt.Errorf("CheckID missing")
 	}
 
-	ct := l.aclTokenForCheckSync(key, l.tokens.AgentToken)
+	// Always use the agent token for deletion. Refer to deleteService() for
+	// an explanation.
+	ct := l.tokens.AgentToken()
 	req := structs.DeregisterRequest{
 		Datacenter:     l.config.Datacenter,
 		Node:           l.config.NodeName,
diff --git a/agent/local/state_test.go b/agent/local/state_test.go
index 63256f848..6052fa478 100644
--- a/agent/local/state_test.go
+++ b/agent/local/state_test.go
@@ -829,10 +829,6 @@ func TestAgentAntiEntropy_Services_WithChecks(t *testing.T) {
 }
 
 var testRegisterRules = `
- node "" {
- 	policy = "write"
- }
-
  service "api" {
  	policy = "write"
  }
@@ -863,6 +859,9 @@ func TestAgentAntiEntropy_Services_ACLDeny(t *testing.T) {
 	defer a.Shutdown()
 	testrpc.WaitForLeader(t, a.RPC, "dc1")
 
+	// The agent token is the only token used for deleteService.
+	setAgentToken(t, a)
+
 	token := createToken(t, a, testRegisterRules)
 
 	// Create service (disallowed)
@@ -1153,15 +1152,19 @@ type RPC interface {
 func createToken(t *testing.T, rpc RPC, policyRules string) string {
 	t.Helper()
 
+	uniqueId, err := uuid.GenerateUUID()
+	require.NoError(t, err)
+	policyName := "the-policy-" + uniqueId
+
 	reqPolicy := structs.ACLPolicySetRequest{
 		Datacenter: "dc1",
 		Policy: structs.ACLPolicy{
-			Name:  "the-policy",
+			Name:  policyName,
 			Rules: policyRules,
 		},
 		WriteRequest: structs.WriteRequest{Token: "root"},
 	}
-	err := rpc.RPC(context.Background(), "ACL.PolicySet", &reqPolicy, &structs.ACLPolicy{})
+	err = rpc.RPC(context.Background(), "ACL.PolicySet", &reqPolicy, &structs.ACLPolicy{})
 	require.NoError(t, err)
 
 	token, err := uuid.GenerateUUID()
@@ -1171,7 +1174,7 @@ func createToken(t *testing.T, rpc RPC, policyRules string) string {
 		Datacenter: "dc1",
 		ACLToken: structs.ACLToken{
 			SecretID: token,
-			Policies: []structs.ACLTokenPolicyLink{{Name: "the-policy"}},
+			Policies: []structs.ACLTokenPolicyLink{{Name: policyName}},
 		},
 		WriteRequest: structs.WriteRequest{Token: "root"},
 	}
@@ -1180,6 +1183,27 @@ func createToken(t *testing.T, rpc RPC, policyRules string) string {
 	return token
 }
 
+// setAgentToken sets the 'agent' token for this agent. It creates a new token
+// with node:write for the agent's node name, and service:write for any
+// service.
+func setAgentToken(t *testing.T, a *agent.TestAgent) {
+	var policy = fmt.Sprintf(`
+	  node "%s" {
+		policy = "write"
+	  }
+	  service_prefix "" {
+		policy = "read"
+	  }
+	`, a.Config.NodeName)
+
+	token := createToken(t, a, policy)
+
+	_, err := a.Client().Agent().UpdateAgentACLToken(token, &api.WriteOptions{Token: "root"})
+	if err != nil {
+		t.Fatalf("setting agent token: %v", err)
+	}
+}
+
 func TestAgentAntiEntropy_Checks(t *testing.T) {
 	if testing.Short() {
 		t.Skip("too slow for testing.Short")
@@ -1481,6 +1505,9 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) {
 
 	testrpc.WaitForLeader(t, a.RPC, dc)
 
+	// The agent token is the only token used for deleteCheck.
+	setAgentToken(t, a)
+
 	token := createToken(t, a, testRegisterRules)
 
 	// Create services using the root token