From 3c520019e9482ec9b241c457ab34f195eed27459 Mon Sep 17 00:00:00 2001 From: Kyle Havlovitz Date: Thu, 21 Jun 2018 15:42:28 -0700 Subject: [PATCH 1/2] connect/ca: add logic for pruning old stale RootCA entries --- agent/consul/connect_ca_endpoint.go | 2 + agent/consul/leader.go | 107 +++++++++++++++++++++++++++- agent/consul/leader_test.go | 63 ++++++++++++++++ agent/consul/server.go | 6 ++ agent/structs/connect_ca.go | 5 ++ 5 files changed, 182 insertions(+), 1 deletion(-) diff --git a/agent/consul/connect_ca_endpoint.go b/agent/consul/connect_ca_endpoint.go index 47672ee55..1e04f6733 100644 --- a/agent/consul/connect_ca_endpoint.go +++ b/agent/consul/connect_ca_endpoint.go @@ -5,6 +5,7 @@ import ( "fmt" "reflect" "strings" + "time" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/connect" @@ -177,6 +178,7 @@ func (s *ConnectCA) ConfigurationSet( newRoot := *r if newRoot.Active { newRoot.Active = false + newRoot.RotateOutAt = time.Now().Add(caRootExpireDuration) } newRoots = append(newRoots, &newRoot) } diff --git a/agent/consul/leader.go b/agent/consul/leader.go index 5fe4eebec..2b0497dc3 100644 --- a/agent/consul/leader.go +++ b/agent/consul/leader.go @@ -28,7 +28,18 @@ const ( barrierWriteTimeout = 2 * time.Minute ) -var minAutopilotVersion = version.Must(version.NewVersion("0.8.0")) +var ( + // caRootPruneInterval is how often we check for stale CARoots to remove. + caRootPruneInterval = time.Hour + + // caRootExpireDuration is the duration after which an inactive root is considered + // "expired". + caRootExpireDuration = 7 * 24 * time.Hour + + // minAutopilotVersion is the minimum Consul version in which Autopilot features + // are supported. + minAutopilotVersion = version.Must(version.NewVersion("0.8.0")) +) // monitorLeadership is used to monitor if we acquire or lose our role // as the leader in the Raft cluster. There is some work the leader is @@ -220,6 +231,8 @@ func (s *Server) establishLeadership() error { return err } + s.startCARootPruning() + s.setConsistentReadReady() return nil } @@ -236,6 +249,8 @@ func (s *Server) revokeLeadership() error { return err } + s.stopCARootPruning() + s.setCAProvider(nil, nil) s.resetConsistentReadReady() @@ -550,6 +565,96 @@ func (s *Server) setCAProvider(newProvider ca.Provider, root *structs.CARoot) { s.caProviderRoot = root } +// startCARootPruning starts a goroutine that looks for stale CARoots +// and removes them from the state store. +func (s *Server) startCARootPruning() { + if !s.config.ConnectEnabled { + return + } + + s.caPruningLock.Lock() + defer s.caPruningLock.Unlock() + + if s.caPruningEnabled { + return + } + + s.caPruningCh = make(chan struct{}) + + go func() { + ticker := time.NewTicker(caRootPruneInterval) + defer ticker.Stop() + + for { + select { + case <-s.caPruningCh: + return + case <-ticker.C: + if err := s.pruneCARoots(); err != nil { + s.logger.Printf("[ERR] connect: error pruning CA roots: %v", err) + } + } + } + }() + + s.caPruningEnabled = true +} + +// pruneCARoots looks for any CARoots that have been rotated out and expired. +func (s *Server) pruneCARoots() error { + idx, roots, err := s.fsm.State().CARoots(nil) + if err != nil { + return err + } + + var newRoots structs.CARoots + for _, r := range roots { + if !r.Active && !r.RotateOutAt.IsZero() && r.RotateOutAt.Before(time.Now()) { + s.logger.Printf("[INFO] connect: pruning old unused root CA (ID: %s)", r.ID) + continue + } + newRoot := *r + newRoots = append(newRoots, &newRoot) + } + + // Return early if there's nothing to remove. + if len(newRoots) == len(roots) { + return nil + } + + // Commit the new root state. + var args structs.CARequest + args.Op = structs.CAOpSetRoots + args.Index = idx + args.Roots = newRoots + resp, err := s.raftApply(structs.ConnectCARequestType, args) + if err != nil { + return err + } + if respErr, ok := resp.(error); ok { + return respErr + } + + return nil +} + +// stopCARootPruning stops the CARoot pruning process. +func (s *Server) stopCARootPruning() { + if !s.config.ConnectEnabled { + return + } + + s.caPruningLock.Lock() + defer s.caPruningLock.Unlock() + + if !s.caPruningEnabled { + return + } + + close(s.caPruningCh) + s.caPruningEnabled = false +} + // reconcileReaped is used to reconcile nodes that have failed and been reaped // from Serf but remain in the catalog. This is done by looking for unknown nodes with serfHealth checks registered. // We generate a "reap" event to cause the node to be cleaned up. diff --git a/agent/consul/leader_test.go b/agent/consul/leader_test.go index da2092fda..a685223cb 100644 --- a/agent/consul/leader_test.go +++ b/agent/consul/leader_test.go @@ -5,12 +5,14 @@ import ( "testing" "time" + "github.com/hashicorp/consul/agent/connect" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/api" "github.com/hashicorp/consul/testrpc" "github.com/hashicorp/consul/testutil/retry" "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/serf/serf" + "github.com/stretchr/testify/require" ) func TestLeader_RegisterMember(t *testing.T) { @@ -1001,3 +1003,64 @@ func TestLeader_ACL_Initialization(t *testing.T) { }) } } + +func TestLeader_CARootPruning(t *testing.T) { + t.Parallel() + + caRootExpireDuration = 500 * time.Millisecond + caRootPruneInterval = 200 * time.Millisecond + + require := require.New(t) + dir1, s1 := testServer(t) + defer os.RemoveAll(dir1) + defer s1.Shutdown() + codec := rpcClient(t, s1) + defer codec.Close() + + testrpc.WaitForLeader(t, s1.RPC, "dc1") + + // Get the current root + rootReq := &structs.DCSpecificRequest{ + Datacenter: "dc1", + } + var rootList structs.IndexedCARoots + require.Nil(msgpackrpc.CallWithCodec(codec, "ConnectCA.Roots", rootReq, &rootList)) + require.Len(rootList.Roots, 1) + oldRoot := rootList.Roots[0] + + // Update the provider config to use a new private key, which should + // cause a rotation. + _, newKey, err := connect.GeneratePrivateKey() + require.NoError(err) + newConfig := &structs.CAConfiguration{ + Provider: "consul", + Config: map[string]interface{}{ + "PrivateKey": newKey, + "RootCert": "", + "RotationPeriod": 90 * 24 * time.Hour, + }, + } + { + args := &structs.CARequest{ + Datacenter: "dc1", + Config: newConfig, + } + var reply interface{} + + require.NoError(msgpackrpc.CallWithCodec(codec, "ConnectCA.ConfigurationSet", args, &reply)) + } + + // Should have 2 roots now. + _, roots, err := s1.fsm.State().CARoots(nil) + require.NoError(err) + require.Len(roots, 2) + + time.Sleep(caRootExpireDuration * 2) + + // Now the old root should be pruned. + _, roots, err = s1.fsm.State().CARoots(nil) + require.NoError(err) + require.Len(roots, 1) + require.True(roots[0].Active) + require.NotEqual(roots[0].ID, oldRoot.ID) +} diff --git a/agent/consul/server.go b/agent/consul/server.go index ca363e7a1..4520b1111 100644 --- a/agent/consul/server.go +++ b/agent/consul/server.go @@ -107,6 +107,12 @@ type Server struct { caProviderRoot *structs.CARoot caProviderLock sync.RWMutex + // caPruningCh is used to shut down the CA root pruning goroutine when we + // lose leadership. + caPruningCh chan struct{} + caPruningLock sync.RWMutex + caPruningEnabled bool + // Consul configuration config *Config diff --git a/agent/structs/connect_ca.go b/agent/structs/connect_ca.go index 2577a99a6..52b9c16f1 100644 --- a/agent/structs/connect_ca.go +++ b/agent/structs/connect_ca.go @@ -73,6 +73,11 @@ type CARoot struct { // cannot be active. Active bool + // RotateOutAt is the time at which this CA can be removed from the state. + // This will only be set on roots that have been rotated out from being the + // active one. + RotateOutAt time.Time `json:"-"` + RaftIndex } From 883b2a518a60121434b3895604e97e0c67fd8095 Mon Sep 17 00:00:00 2001 From: Kyle Havlovitz Date: Fri, 6 Jul 2018 16:05:25 -0700 Subject: [PATCH 2/2] Store the time CARoot is rotated out instead of when to prune --- agent/consul/connect_ca_endpoint.go | 2 +- agent/consul/leader.go | 16 ++++++---------- agent/structs/connect_ca.go | 6 +++--- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/agent/consul/connect_ca_endpoint.go b/agent/consul/connect_ca_endpoint.go index 1e04f6733..4cdb72ff7 100644 --- a/agent/consul/connect_ca_endpoint.go +++ b/agent/consul/connect_ca_endpoint.go @@ -178,7 +178,7 @@ func (s *ConnectCA) ConfigurationSet( newRoot := *r if newRoot.Active { newRoot.Active = false - newRoot.RotateOutAt = time.Now().Add(caRootExpireDuration) + newRoot.RotatedOutAt = time.Now() } newRoots = append(newRoots, &newRoot) } diff --git a/agent/consul/leader.go b/agent/consul/leader.go index 2b0497dc3..8b32226dc 100644 --- a/agent/consul/leader.go +++ b/agent/consul/leader.go @@ -33,7 +33,7 @@ var ( caRootPruneInterval = time.Hour // caRootExpireDuration is the duration after which an inactive root is considered - // "expired". + // "expired". Currently this is based on the default leaf cert TTL of 3 days. caRootExpireDuration = 7 * 24 * time.Hour // minAutopilotVersion is the minimum Consul version in which Autopilot features @@ -568,10 +568,6 @@ func (s *Server) setCAProvider(newProvider ca.Provider, root *structs.CARoot) { // startCARootPruning starts a goroutine that looks for stale CARoots // and removes them from the state store. func (s *Server) startCARootPruning() { - if !s.config.ConnectEnabled { - return - } - s.caPruningLock.Lock() defer s.caPruningLock.Unlock() @@ -602,6 +598,10 @@ func (s *Server) startCARootPruning() { // pruneCARoots looks for any CARoots that have been rotated out and expired. func (s *Server) pruneCARoots() error { + if !s.config.ConnectEnabled { + return nil + } + idx, roots, err := s.fsm.State().CARoots(nil) if err != nil { return err @@ -609,7 +609,7 @@ func (s *Server) pruneCARoots() error { var newRoots structs.CARoots for _, r := range roots { - if !r.Active && !r.RotateOutAt.IsZero() && r.RotateOutAt.Before(time.Now()) { + if !r.Active && !r.RotatedOutAt.IsZero() && time.Now().Sub(r.RotatedOutAt) > caRootExpireDuration { s.logger.Printf("[INFO] connect: pruning old unused root CA (ID: %s)", r.ID) continue } @@ -640,10 +640,6 @@ func (s *Server) pruneCARoots() error { // stopCARootPruning stops the CARoot pruning process. func (s *Server) stopCARootPruning() { - if !s.config.ConnectEnabled { - return - } - s.caPruningLock.Lock() defer s.caPruningLock.Unlock() diff --git a/agent/structs/connect_ca.go b/agent/structs/connect_ca.go index 52b9c16f1..375a7df32 100644 --- a/agent/structs/connect_ca.go +++ b/agent/structs/connect_ca.go @@ -73,10 +73,10 @@ type CARoot struct { // cannot be active. Active bool - // RotateOutAt is the time at which this CA can be removed from the state. + // RotatedOutAt is the time at which this CA was removed from the state. // This will only be set on roots that have been rotated out from being the - // active one. - RotateOutAt time.Time `json:"-"` + // active root. + RotatedOutAt time.Time `json:"-"` RaftIndex }