diff --git a/agent/consul/connect_ca_endpoint.go b/agent/consul/connect_ca_endpoint.go index 47672ee55..4cdb72ff7 100644 --- a/agent/consul/connect_ca_endpoint.go +++ b/agent/consul/connect_ca_endpoint.go @@ -5,6 +5,7 @@ import ( "fmt" "reflect" "strings" + "time" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/connect" @@ -177,6 +178,7 @@ func (s *ConnectCA) ConfigurationSet( newRoot := *r if newRoot.Active { newRoot.Active = false + newRoot.RotatedOutAt = time.Now() } newRoots = append(newRoots, &newRoot) } diff --git a/agent/consul/leader.go b/agent/consul/leader.go index 5fe4eebec..8b32226dc 100644 --- a/agent/consul/leader.go +++ b/agent/consul/leader.go @@ -28,7 +28,18 @@ const ( barrierWriteTimeout = 2 * time.Minute ) -var minAutopilotVersion = version.Must(version.NewVersion("0.8.0")) +var ( + // caRootPruneInterval is how often we check for stale CARoots to remove. + caRootPruneInterval = time.Hour + + // caRootExpireDuration is the duration after which an inactive root is considered + // "expired". Currently this is based on the default leaf cert TTL of 3 days. + caRootExpireDuration = 7 * 24 * time.Hour + + // minAutopilotVersion is the minimum Consul version in which Autopilot features + // are supported. + minAutopilotVersion = version.Must(version.NewVersion("0.8.0")) +) // monitorLeadership is used to monitor if we acquire or lose our role // as the leader in the Raft cluster. There is some work the leader is @@ -220,6 +231,8 @@ func (s *Server) establishLeadership() error { return err } + s.startCARootPruning() + s.setConsistentReadReady() return nil } @@ -236,6 +249,8 @@ func (s *Server) revokeLeadership() error { return err } + s.stopCARootPruning() + s.setCAProvider(nil, nil) s.resetConsistentReadReady() @@ -550,6 +565,92 @@ func (s *Server) setCAProvider(newProvider ca.Provider, root *structs.CARoot) { s.caProviderRoot = root } +// startCARootPruning starts a goroutine that looks for stale CARoots +// and removes them from the state store. +func (s *Server) startCARootPruning() { + s.caPruningLock.Lock() + defer s.caPruningLock.Unlock() + + if s.caPruningEnabled { + return + } + + s.caPruningCh = make(chan struct{}) + + go func() { + ticker := time.NewTicker(caRootPruneInterval) + defer ticker.Stop() + + for { + select { + case <-s.caPruningCh: + return + case <-ticker.C: + if err := s.pruneCARoots(); err != nil { + s.logger.Printf("[ERR] connect: error pruning CA roots: %v", err) + } + } + } + }() + + s.caPruningEnabled = true +} + +// pruneCARoots looks for any CARoots that have been rotated out and expired. +func (s *Server) pruneCARoots() error { + if !s.config.ConnectEnabled { + return nil + } + + idx, roots, err := s.fsm.State().CARoots(nil) + if err != nil { + return err + } + + var newRoots structs.CARoots + for _, r := range roots { + if !r.Active && !r.RotatedOutAt.IsZero() && time.Now().Sub(r.RotatedOutAt) > caRootExpireDuration { + s.logger.Printf("[INFO] connect: pruning old unused root CA (ID: %s)", r.ID) + continue + } + newRoot := *r + newRoots = append(newRoots, &newRoot) + } + + // Return early if there's nothing to remove. + if len(newRoots) == len(roots) { + return nil + } + + // Commit the new root state. + var args structs.CARequest + args.Op = structs.CAOpSetRoots + args.Index = idx + args.Roots = newRoots + resp, err := s.raftApply(structs.ConnectCARequestType, args) + if err != nil { + return err + } + if respErr, ok := resp.(error); ok { + return respErr + } + + return nil +} + +// stopCARootPruning stops the CARoot pruning process. +func (s *Server) stopCARootPruning() { + s.caPruningLock.Lock() + defer s.caPruningLock.Unlock() + + if !s.caPruningEnabled { + return + } + + close(s.caPruningCh) + s.caPruningEnabled = false +} + // reconcileReaped is used to reconcile nodes that have failed and been reaped // from Serf but remain in the catalog. This is done by looking for unknown nodes with serfHealth checks registered. // We generate a "reap" event to cause the node to be cleaned up. diff --git a/agent/consul/leader_test.go b/agent/consul/leader_test.go index da2092fda..a685223cb 100644 --- a/agent/consul/leader_test.go +++ b/agent/consul/leader_test.go @@ -5,12 +5,14 @@ import ( "testing" "time" + "github.com/hashicorp/consul/agent/connect" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/api" "github.com/hashicorp/consul/testrpc" "github.com/hashicorp/consul/testutil/retry" "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/serf/serf" + "github.com/stretchr/testify/require" ) func TestLeader_RegisterMember(t *testing.T) { @@ -1001,3 +1003,64 @@ func TestLeader_ACL_Initialization(t *testing.T) { }) } } + +func TestLeader_CARootPruning(t *testing.T) { + t.Parallel() + + caRootExpireDuration = 500 * time.Millisecond + caRootPruneInterval = 200 * time.Millisecond + + require := require.New(t) + dir1, s1 := testServer(t) + defer os.RemoveAll(dir1) + defer s1.Shutdown() + codec := rpcClient(t, s1) + defer codec.Close() + + testrpc.WaitForLeader(t, s1.RPC, "dc1") + + // Get the current root + rootReq := &structs.DCSpecificRequest{ + Datacenter: "dc1", + } + var rootList structs.IndexedCARoots + require.Nil(msgpackrpc.CallWithCodec(codec, "ConnectCA.Roots", rootReq, &rootList)) + require.Len(rootList.Roots, 1) + oldRoot := rootList.Roots[0] + + // Update the provider config to use a new private key, which should + // cause a rotation. + _, newKey, err := connect.GeneratePrivateKey() + require.NoError(err) + newConfig := &structs.CAConfiguration{ + Provider: "consul", + Config: map[string]interface{}{ + "PrivateKey": newKey, + "RootCert": "", + "RotationPeriod": 90 * 24 * time.Hour, + }, + } + { + args := &structs.CARequest{ + Datacenter: "dc1", + Config: newConfig, + } + var reply interface{} + + require.NoError(msgpackrpc.CallWithCodec(codec, "ConnectCA.ConfigurationSet", args, &reply)) + } + + // Should have 2 roots now. + _, roots, err := s1.fsm.State().CARoots(nil) + require.NoError(err) + require.Len(roots, 2) + + time.Sleep(caRootExpireDuration * 2) + + // Now the old root should be pruned. + _, roots, err = s1.fsm.State().CARoots(nil) + require.NoError(err) + require.Len(roots, 1) + require.True(roots[0].Active) + require.NotEqual(roots[0].ID, oldRoot.ID) +} diff --git a/agent/consul/server.go b/agent/consul/server.go index ca363e7a1..4520b1111 100644 --- a/agent/consul/server.go +++ b/agent/consul/server.go @@ -107,6 +107,12 @@ type Server struct { caProviderRoot *structs.CARoot caProviderLock sync.RWMutex + // caPruningCh is used to shut down the CA root pruning goroutine when we + // lose leadership. + caPruningCh chan struct{} + caPruningLock sync.RWMutex + caPruningEnabled bool + // Consul configuration config *Config diff --git a/agent/structs/connect_ca.go b/agent/structs/connect_ca.go index 2577a99a6..375a7df32 100644 --- a/agent/structs/connect_ca.go +++ b/agent/structs/connect_ca.go @@ -73,6 +73,11 @@ type CARoot struct { // cannot be active. Active bool + // RotatedOutAt is the time at which this CA was removed from the state. + // This will only be set on roots that have been rotated out from being the + // active root. + RotatedOutAt time.Time `json:"-"` + RaftIndex }