From db62541676468080d673bc710a026f3df4456d20 Mon Sep 17 00:00:00 2001
From: "R.B. Boyer"
Date: Tue, 5 Jan 2021 17:04:27 -0600
Subject: [PATCH] acl: use the presence of a management policy in the state
 store as a sign that we already migrated to v2 acls (#9505)

This way we only have to wait for the serf barrier to pass once before we
can upgrade to v2 acls. Without this patch every restart needs to re-compute
the change, and potentially if a stray older node joins after a migration it
might regress back to v1 mode which would be problematic.
---
 .changelog/9505.txt         |  3 ++
 agent/consul/acl_server.go  | 10 ++++
 agent/consul/leader_test.go | 94 +++++++++++++++++++++++++++++++++++++
 3 files changed, 107 insertions(+)
 create mode 100644 .changelog/9505.txt

diff --git a/.changelog/9505.txt b/.changelog/9505.txt
new file mode 100644
index 000000000..51320af7b
--- /dev/null
+++ b/.changelog/9505.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+acl: use the presence of a management policy in the state store as a sign that we already migrated to v2 acls
+```

diff --git a/agent/consul/acl_server.go b/agent/consul/acl_server.go
index a495a312b..56627a487 100644
--- a/agent/consul/acl_server.go
+++ b/agent/consul/acl_server.go
@@ -108,6 +108,16 @@ func (s *Server) canUpgradeToNewACLs(isLeader bool) bool {
 		return false
 	}
 
+	// Check to see if we already upgraded the last time we ran by seeing if we
+	// have a copy of any global management policy stored locally. This should
+	// always be true because policies always replicate.
+	_, mgmtPolicy, err := s.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMeta())
+	if err != nil {
+		s.logger.Warn("Failed to get the builtin global-management policy to check for a completed ACL upgrade; skipping this optimization", "error", err)
+	} else if mgmtPolicy != nil {
+		return true
+	}
+
 	if !s.InACLDatacenter() {
 		foundServers, mode, _ := ServersGetACLMode(s, "", s.config.ACLDatacenter)
 		if mode != structs.ACLModeEnabled || !foundServers {
diff --git a/agent/consul/leader_test.go b/agent/consul/leader_test.go
index dfe9ffe39..08466c840 100644
--- a/agent/consul/leader_test.go
+++ b/agent/consul/leader_test.go
@@ -10,6 +10,7 @@ import (
 	"time"
 
 	"github.com/hashicorp/consul/agent/structs"
+	tokenStore "github.com/hashicorp/consul/agent/token"
 	"github.com/hashicorp/consul/api"
 	"github.com/hashicorp/consul/sdk/testutil"
 	"github.com/hashicorp/consul/sdk/testutil/retry"
@@ -1272,6 +1273,99 @@ func TestLeader_ACLUpgrade(t *testing.T) {
 	})
 }
 
+func TestLeader_ACLUpgrade_IsStickyEvenIfSerfTagsRegress(t *testing.T) {
+	if testing.Short() {
+		t.Skip("too slow for testing.Short")
+	}
+
+	t.Parallel()
+
+	// We test this by having two datacenters with one server each. They
+	// initially come up and complete the migration, then we power them both
+	// off. We leave the primary off permanently, and then we stand up the
+	// secondary. Hopefully it should transition to ENABLED instead of being
+	// stuck in LEGACY.
+
+	dir1, s1 := testServerWithConfig(t, func(c *Config) {
+		c.Datacenter = "dc1"
+		c.ACLDatacenter = "dc1"
+		c.ACLsEnabled = true
+		c.ACLMasterToken = "root"
+	})
+	defer os.RemoveAll(dir1)
+	defer s1.Shutdown()
+	codec := rpcClient(t, s1)
+	defer codec.Close()
+
+	waitForLeaderEstablishment(t, s1)
+
+	dir2, s2 := testServerWithConfig(t, func(c *Config) {
+		c.Datacenter = "dc2"
+		c.ACLDatacenter = "dc1"
+		c.ACLsEnabled = true
+		c.ACLTokenReplication = false
+		c.ACLReplicationRate = 100
+		c.ACLReplicationBurst = 100
+		c.ACLReplicationApplyLimit = 1000000
+	})
+	defer os.RemoveAll(dir2)
+	defer s2.Shutdown()
+	codec2 := rpcClient(t, s2)
+	defer codec2.Close()
+
+	s2.tokens.UpdateReplicationToken("root", tokenStore.TokenSourceConfig)
+
+	testrpc.WaitForLeader(t, s2.RPC, "dc2")
+	waitForLeaderEstablishment(t, s2)
+
+	// Create the WAN link
+	joinWAN(t, s2, s1)
+	waitForLeaderEstablishment(t, s1)
+	waitForLeaderEstablishment(t, s2)
+
+	waitForNewACLs(t, s1)
+	waitForNewACLs(t, s2)
+	waitForNewACLReplication(t, s2, structs.ACLReplicatePolicies, 1, 0, 0)
+
+	// Everybody has the management policy.
+	retry.Run(t, func(r *retry.R) {
+		_, policy1, err := s1.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMeta())
+		require.NoError(r, err)
+		require.NotNil(r, policy1)
+
+		_, policy2, err := s2.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMeta())
+		require.NoError(r, err)
+		require.NotNil(r, policy2)
+	})
+
+	// Shutdown s1 and s2.
+	s1.Shutdown()
+	s2.Shutdown()
+
+	// Restart just s2
+
+	dir2new, s2new := testServerWithConfig(t, func(c *Config) {
+		c.Datacenter = "dc2"
+		c.ACLDatacenter = "dc1"
+		c.ACLsEnabled = true
+		c.ACLTokenReplication = false
+		c.ACLReplicationRate = 100
+		c.ACLReplicationBurst = 100
+		c.ACLReplicationApplyLimit = 1000000
+
+		c.DataDir = s2.config.DataDir
+		c.NodeName = s2.config.NodeName
+		c.NodeID = s2.config.NodeID
+	})
+	defer os.RemoveAll(dir2new)
+	defer s2new.Shutdown()
+
+	waitForLeaderEstablishment(t, s2new)
+
+	// It should be able to transition without connectivity to the primary.
+	waitForNewACLs(t, s2new)
+}
+
 func TestLeader_ConfigEntryBootstrap(t *testing.T) {
 	if testing.Short() {
 		t.Skip("too slow for testing.Short")