From 0b4876f906c8e49fe989636c84b7f4cf03506512 Mon Sep 17 00:00:00 2001 From: Kyle Havlovitz Date: Wed, 21 Oct 2020 12:49:21 -0700 Subject: [PATCH 1/7] connect: Add logic for updating secondary DC intermediate on config set --- agent/connect/testing_spiffe.go | 8 +- agent/consul/connect_ca_endpoint.go | 27 ++++- agent/consul/connect_ca_endpoint_test.go | 134 +++++++++++++++++++++++ agent/consul/leader_connect.go | 68 +++++++----- 4 files changed, 207 insertions(+), 30 deletions(-) diff --git a/agent/connect/testing_spiffe.go b/agent/connect/testing_spiffe.go index c7fa6f753..4a0577880 100644 --- a/agent/connect/testing_spiffe.go +++ b/agent/connect/testing_spiffe.go @@ -12,10 +12,16 @@ func TestSpiffeIDService(t testing.T, service string) *SpiffeIDService { // TestSpiffeIDServiceWithHost returns a SPIFFE ID representing a service with // the specified trust domain. func TestSpiffeIDServiceWithHost(t testing.T, service, host string) *SpiffeIDService { + return TestSpiffeIDServiceWithHostDC(t, service, host, "dc1") +} + +// TestSpiffeIDServiceWithHostDC returns a SPIFFE ID representing a service with +// the specified trust domain for the given datacenter. +func TestSpiffeIDServiceWithHostDC(t testing.T, service, host, datacenter string) *SpiffeIDService { return &SpiffeIDService{ Host: host, Namespace: "default", - Datacenter: "dc1", + Datacenter: datacenter, Service: service, } } diff --git a/agent/consul/connect_ca_endpoint.go b/agent/consul/connect_ca_endpoint.go index 2198c0ec6..9733f12b4 100644 --- a/agent/consul/connect_ca_endpoint.go +++ b/agent/consul/connect_ca_endpoint.go @@ -163,6 +163,27 @@ func (s *ConnectCA) ConfigurationSet( } }() + // If this is a secondary, check if the intermediate needs to be regenerated. + if s.srv.config.Datacenter != s.srv.config.PrimaryDatacenter { + // Get the current root certs from the primary DC. + var roots structs.IndexedCARoots + rootArgs := structs.DCSpecificRequest{ + Datacenter: s.srv.config.PrimaryDatacenter, + } + if err := s.srv.forwardDC("ConnectCA.Roots", s.srv.config.PrimaryDatacenter, &rootArgs, &roots); err != nil { + return fmt.Errorf("Error retrieving the primary datacenter's roots: %v", err) + } + + s.srv.caProviderReconfigurationLock.Lock() + defer s.srv.caProviderReconfigurationLock.Unlock() + if err := s.srv.initializeSecondaryCA(newProvider, roots, args.Config); err != nil { + return fmt.Errorf("Error updating secondary datacenter CA config: %v", err) + } + cleanupNewProvider = false + s.logger.Info("Secondary CA provider config updated") + return nil + } + if err := newProvider.GenerateRoot(); err != nil { return fmt.Errorf("error generating CA root certificate: %v", err) } @@ -192,10 +213,8 @@ func (s *ConnectCA) ConfigurationSet( return err } - // If the root didn't change or if this is a secondary DC, just update the - // config and return. - if (s.srv.config.Datacenter != s.srv.config.PrimaryDatacenter) || - root != nil && root.ID == newActiveRoot.ID { + // If the root didn't change, just update the config and return. + if root != nil && root.ID == newActiveRoot.ID { args.Op = structs.CAOpSetConfig resp, err := s.srv.raftApply(structs.ConnectCARequestType, args) if err != nil { diff --git a/agent/consul/connect_ca_endpoint_test.go b/agent/consul/connect_ca_endpoint_test.go index d55c7e4b6..315358911 100644 --- a/agent/consul/connect_ca_endpoint_test.go +++ b/agent/consul/connect_ca_endpoint_test.go @@ -405,6 +405,140 @@ func TestConnectCAConfig_TriggerRotation(t *testing.T) { } } +func TestConnectCAConfig_UpdateSecondary(t *testing.T) { + t.Parallel() + + assert := assert.New(t) + require := require.New(t) + + // Initialize primary as the primary DC + dir1, s1 := testServerWithConfig(t, func(c *Config) { + c.Datacenter = "primary" + }) + defer os.RemoveAll(dir1) + defer s1.Shutdown() + + testrpc.WaitForLeader(t, s1.RPC, "primary") + + // secondary as a secondary DC + dir2, s2 := testServerWithConfig(t, func(c *Config) { + c.Datacenter = "secondary" + c.PrimaryDatacenter = "primary" + }) + defer os.RemoveAll(dir2) + defer s2.Shutdown() + codec := rpcClient(t, s2) + defer codec.Close() + + // Create the WAN link + joinWAN(t, s2, s1) + testrpc.WaitForLeader(t, s2.RPC, "secondary") + + // Capture the current root + rootList, activeRoot, err := getTestRoots(s1, "primary") + require.NoError(err) + require.Len(rootList.Roots, 1) + rootCert := activeRoot + + waitForActiveCARoot(t, s1, rootCert) + waitForActiveCARoot(t, s2, rootCert) + + // Capture the current intermediate + rootList, activeRoot, err = getTestRoots(s2, "secondary") + require.NoError(err) + require.Len(rootList.Roots, 1) + require.Len(activeRoot.IntermediateCerts, 1) + oldIntermediatePEM := activeRoot.IntermediateCerts[0] + + // Update the secondary CA config to use a new private key, which should + // cause a re-signing with a new intermediate. + _, newKey, err := connect.GeneratePrivateKey() + assert.NoError(err) + newConfig := &structs.CAConfiguration{ + Provider: "consul", + Config: map[string]interface{}{ + "PrivateKey": newKey, + "RootCert": "", + "RotationPeriod": 90 * 24 * time.Hour, + }, + } + { + args := &structs.CARequest{ + Datacenter: "secondary", + Config: newConfig, + } + var reply interface{} + + require.NoError(msgpackrpc.CallWithCodec(codec, "ConnectCA.ConfigurationSet", args, &reply)) + } + + // Make sure the new intermediate has replaced the old one in the active root, + // and that the root itself hasn't changed. + var newIntermediatePEM string + { + args := &structs.DCSpecificRequest{ + Datacenter: "secondary", + } + var reply structs.IndexedCARoots + require.Nil(msgpackrpc.CallWithCodec(codec, "ConnectCA.Roots", args, &reply)) + require.Len(reply.Roots, 1) + require.Len(reply.Roots[0].IntermediateCerts, 1) + newIntermediatePEM = reply.Roots[0].IntermediateCerts[0] + require.NotEqual(oldIntermediatePEM, newIntermediatePEM) + require.Equal(reply.Roots[0].RootCert, rootCert.RootCert) + } + + // Verify the new config was set. + { + args := &structs.DCSpecificRequest{ + Datacenter: "secondary", + } + var reply structs.CAConfiguration + require.NoError(msgpackrpc.CallWithCodec(codec, "ConnectCA.ConfigurationGet", args, &reply)) + + actual, err := ca.ParseConsulCAConfig(reply.Config) + require.NoError(err) + expected, err := ca.ParseConsulCAConfig(newConfig.Config) + require.NoError(err) + assert.Equal(reply.Provider, newConfig.Provider) + assert.Equal(actual, expected) + } + + // Verify that new leaf certs get the new intermediate bundled + { + // Generate a CSR and request signing + spiffeId := connect.TestSpiffeIDServiceWithHostDC(t, "web", connect.TestClusterID+".consul", "secondary") + csr, _ := connect.TestCSR(t, spiffeId) + args := &structs.CASignRequest{ + Datacenter: "secondary", + CSR: csr, + } + var reply structs.IssuedCert + require.NoError(msgpackrpc.CallWithCodec(codec, "ConnectCA.Sign", args, &reply)) + + // Verify the leaf cert has the new intermediate. + { + roots := x509.NewCertPool() + assert.True(roots.AppendCertsFromPEM([]byte(rootCert.RootCert))) + leaf, err := connect.ParseCert(reply.CertPEM) + require.NoError(err) + + intermediates := x509.NewCertPool() + require.True(intermediates.AppendCertsFromPEM([]byte(newIntermediatePEM))) + + _, err = leaf.Verify(x509.VerifyOptions{ + Roots: roots, + Intermediates: intermediates, + }) + require.NoError(err) + } + + // Verify other fields + assert.Equal("web", reply.Service) + assert.Equal(spiffeId.URI().String(), reply.ServiceURI) + } +} + // Test CA signing func TestConnectCASign(t *testing.T) { t.Parallel() diff --git a/agent/consul/leader_connect.go b/agent/consul/leader_connect.go index f87641a1c..67c56520a 100644 --- a/agent/consul/leader_connect.go +++ b/agent/consul/leader_connect.go @@ -214,7 +214,7 @@ func (s *Server) initializeCA() error { if err := s.initializeSecondaryProvider(provider, roots); err != nil { return fmt.Errorf("error configuring provider: %v", err) } - if err := s.initializeSecondaryCA(provider, roots); err != nil { + if err := s.initializeSecondaryCA(provider, roots, nil); err != nil { return err } @@ -337,7 +337,7 @@ func (s *Server) initializeRootCA(provider ca.Provider, conf *structs.CAConfigur // intermediate. // It is being called while holding caProviderReconfigurationLock // which means it must never take that lock itself or call anything that does. -func (s *Server) initializeSecondaryCA(provider ca.Provider, primaryRoots structs.IndexedCARoots) error { +func (s *Server) initializeSecondaryCA(provider ca.Provider, primaryRoots structs.IndexedCARoots, config *structs.CAConfiguration) error { activeIntermediate, err := provider.ActiveIntermediate() if err != nil { return err @@ -432,19 +432,25 @@ func (s *Server) initializeSecondaryCA(provider ca.Provider, primaryRoots struct if err != nil { return err } + + // Determine whether a root update is needed, and persist the roots/config accordingly. + var newRoot *structs.CARoot if activeRoot == nil || activeRoot.ID != newActiveRoot.ID || newIntermediate { - if err := s.persistNewRoot(provider, newActiveRoot); err != nil { - return err - } + newRoot = newActiveRoot + } + if err := s.persistNewRootAndConfig(provider, newRoot, config); err != nil { + return err } s.setCAProvider(provider, newActiveRoot) return nil } -// persistNewRoot is being called while holding caProviderReconfigurationLock +// persistNewRootAndConfig is being called while holding caProviderReconfigurationLock // which means it must never take that lock itself or call anything that does. -func (s *Server) persistNewRoot(provider ca.Provider, newActiveRoot *structs.CARoot) error { +// If newActiveRoot is non-nil, it will be appended to the current roots list. +// If config is non-nil, it will be used to overwrite the existing config. +func (s *Server) persistNewRootAndConfig(provider ca.Provider, newActiveRoot *structs.CARoot, config *structs.CAConfiguration) error { connectLogger := s.loggers.Named(logging.Connect) state := s.fsm.State() idx, oldRoots, err := state.CARoots(nil) @@ -452,15 +458,23 @@ func (s *Server) persistNewRoot(provider ca.Provider, newActiveRoot *structs.CAR return err } - _, config, err := state.CAConfig(nil) + var newConf structs.CAConfiguration + _, storedConfig, err := state.CAConfig(nil) if err != nil { return err } - if config == nil { + if storedConfig == nil { return fmt.Errorf("local CA not initialized yet") } - newConf := *config - newConf.ClusterID = newActiveRoot.ExternalTrustDomain + if config != nil { + newConf = *config + } else { + newConf = *storedConfig + } + newConf.ModifyIndex = storedConfig.ModifyIndex + if newActiveRoot != nil { + newConf.ClusterID = newActiveRoot.ExternalTrustDomain + } // Persist any state the provider needs us to newConf.State, err = provider.State() @@ -468,21 +482,25 @@ func (s *Server) persistNewRoot(provider ca.Provider, newActiveRoot *structs.CAR return fmt.Errorf("error getting provider state: %v", err) } - // Copy the root list and append the new active root, updating the old root - // with the time it was rotated out. + // If there's a new active root, copy the root list and append it, updating + // the old root with the time it was rotated out. var newRoots structs.CARoots - for _, r := range oldRoots { - newRoot := *r - if newRoot.Active { - newRoot.Active = false - newRoot.RotatedOutAt = time.Now() + if newActiveRoot != nil { + for _, r := range oldRoots { + newRoot := *r + if newRoot.Active { + newRoot.Active = false + newRoot.RotatedOutAt = time.Now() + } + if newRoot.ExternalTrustDomain == "" { + newRoot.ExternalTrustDomain = config.ClusterID + } + newRoots = append(newRoots, &newRoot) } - if newRoot.ExternalTrustDomain == "" { - newRoot.ExternalTrustDomain = config.ClusterID - } - newRoots = append(newRoots, &newRoot) + newRoots = append(newRoots, newActiveRoot) + } else { + newRoots = oldRoots } - newRoots = append(newRoots, newActiveRoot) args := &structs.CARequest{ Op: structs.CAOpSetRootsAndConfig, @@ -748,7 +766,7 @@ func (s *Server) intermediateCertRenewalWatch(ctx context.Context) error { return err } - if err := s.persistNewRoot(provider, activeRoot); err != nil { + if err := s.persistNewRootAndConfig(provider, activeRoot, nil); err != nil { return err } @@ -808,7 +826,7 @@ func (s *Server) secondaryCARootWatch(ctx context.Context) error { // Run the secondary CA init routine to see if we need to request a new // intermediate. if s.configuredSecondaryCA() { - if err := s.initializeSecondaryCA(provider, roots); err != nil { + if err := s.initializeSecondaryCA(provider, roots, nil); err != nil { return fmt.Errorf("Failed to initialize the secondary CA: %v", err) } } From 5de81c13758065e364913eb909c18708b0448ad0 Mon Sep 17 00:00:00 2001 From: Kyle Havlovitz Date: Wed, 11 Nov 2020 17:05:04 -0800 Subject: [PATCH 2/7] connect: Add CAManager for synchronizing CA operations --- agent/connect/ca/provider_vault.go | 2 +- agent/consul/connect_ca_endpoint.go | 23 +- agent/consul/leader.go | 4 +- agent/consul/leader_connect.go | 786 ++------------------------ agent/consul/leader_connect_ca.go | 821 ++++++++++++++++++++++++++++ agent/consul/leader_connect_test.go | 8 +- agent/consul/server.go | 22 +- agent/consul/server_connect.go | 2 +- agent/consul/server_test.go | 2 +- 9 files changed, 875 insertions(+), 795 deletions(-) create mode 100644 agent/consul/leader_connect_ca.go diff --git a/agent/connect/ca/provider_vault.go b/agent/connect/ca/provider_vault.go index ea39801a2..0510fe866 100644 --- a/agent/connect/ca/provider_vault.go +++ b/agent/connect/ca/provider_vault.go @@ -135,7 +135,7 @@ func (v *VaultProvider) renewToken(ctx context.Context, watcher *vaultapi.Lifeti go watcher.Start() case <-watcher.RenewCh(): - v.logger.Error("Successfully renewed token for Vault provider") + v.logger.Info("Successfully renewed token for Vault provider") } } } diff --git a/agent/consul/connect_ca_endpoint.go b/agent/consul/connect_ca_endpoint.go index 9733f12b4..f75ac3326 100644 --- a/agent/consul/connect_ca_endpoint.go +++ b/agent/consul/connect_ca_endpoint.go @@ -165,18 +165,13 @@ func (s *ConnectCA) ConfigurationSet( // If this is a secondary, check if the intermediate needs to be regenerated. if s.srv.config.Datacenter != s.srv.config.PrimaryDatacenter { - // Get the current root certs from the primary DC. - var roots structs.IndexedCARoots - rootArgs := structs.DCSpecificRequest{ - Datacenter: s.srv.config.PrimaryDatacenter, - } - if err := s.srv.forwardDC("ConnectCA.Roots", s.srv.config.PrimaryDatacenter, &rootArgs, &roots); err != nil { - return fmt.Errorf("Error retrieving the primary datacenter's roots: %v", err) + // Attempt to take the CA lock in order to update the config. + if err := s.srv.caManager.setState(CAStateReconfig); err != nil { + return err } + defer s.srv.caManager.setReady() - s.srv.caProviderReconfigurationLock.Lock() - defer s.srv.caProviderReconfigurationLock.Unlock() - if err := s.srv.initializeSecondaryCA(newProvider, roots, args.Config); err != nil { + if err := s.srv.caManager.initializeSecondaryCA(newProvider, args.Config); err != nil { return fmt.Errorf("Error updating secondary datacenter CA config: %v", err) } cleanupNewProvider = false @@ -226,7 +221,7 @@ func (s *ConnectCA) ConfigurationSet( // If the config has been committed, update the local provider instance cleanupNewProvider = false - s.srv.setCAProvider(newProvider, newActiveRoot) + s.srv.caManager.setCAProvider(newProvider, newActiveRoot) s.logger.Info("CA provider config updated") @@ -239,7 +234,7 @@ func (s *ConnectCA) ConfigurationSet( // First up, sanity check that the current provider actually supports // cross-signing. - oldProvider, _ := s.srv.getCAProvider() + oldProvider, _ := s.srv.caManager.getCAProvider() if oldProvider == nil { return fmt.Errorf("internal error: CA provider is nil") } @@ -323,7 +318,7 @@ func (s *ConnectCA) ConfigurationSet( // If the config has been committed, update the local provider instance // and call teardown on the old provider cleanupNewProvider = false - s.srv.setCAProvider(newProvider, newActiveRoot) + s.srv.caManager.setCAProvider(newProvider, newActiveRoot) if err := oldProvider.Cleanup(); err != nil { s.logger.Warn("failed to clean up old provider", "provider", config.Provider) @@ -457,7 +452,7 @@ func (s *ConnectCA) SignIntermediate( return acl.ErrPermissionDenied } - provider, _ := s.srv.getCAProvider() + provider, _ := s.srv.caManager.getCAProvider() if provider == nil { return fmt.Errorf("internal error: CA provider is nil") } diff --git a/agent/consul/leader.go b/agent/consul/leader.go index a1d90131a..a251d63b3 100644 --- a/agent/consul/leader.go +++ b/agent/consul/leader.go @@ -328,7 +328,7 @@ func (s *Server) establishLeadership(ctx context.Context) error { s.autopilot.Start(ctx) // todo(kyhavlov): start a goroutine here for handling periodic CA rotation - if err := s.initializeCA(); err != nil { + if err := s.caManager.initializeCA(); err != nil { return err } @@ -375,7 +375,7 @@ func (s *Server) revokeLeadership() { s.stopConnectLeader() - s.setCAProvider(nil, nil) + s.caManager.setCAProvider(nil, nil) s.stopACLTokenReaping() diff --git a/agent/consul/leader_connect.go b/agent/consul/leader_connect.go index 67c56520a..64a157198 100644 --- a/agent/consul/leader_connect.go +++ b/agent/consul/leader_connect.go @@ -3,17 +3,13 @@ package consul import ( "context" "fmt" - "reflect" - "strings" "time" "golang.org/x/time/rate" - "github.com/hashicorp/consul/agent/connect" "github.com/hashicorp/consul/agent/connect/ca" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/logging" - uuid "github.com/hashicorp/go-uuid" ) const ( @@ -32,76 +28,36 @@ var ( maxRetryBackoff = 256 ) -// initializeCAConfig is used to initialize the CA config if necessary -// when setting up the CA during establishLeadership -func (s *Server) initializeCAConfig() (*structs.CAConfiguration, error) { - state := s.fsm.State() - _, config, err := state.CAConfig(nil) - if err != nil { - return nil, err - } - if config == nil { - config = s.config.CAConfig - if config.ClusterID == "" { - id, err := uuid.GenerateUUID() - if err != nil { - return nil, err - } - config.ClusterID = id - } - } else if _, ok := config.Config["IntermediateCertTTL"]; !ok { - dup := *config - copied := make(map[string]interface{}) - for k, v := range dup.Config { - copied[k] = v - } - copied["IntermediateCertTTL"] = connect.DefaultIntermediateCertTTL.String() - dup.Config = copied - config = &dup - } else { - return config, nil +// startConnectLeader starts multi-dc connect leader routines. +func (s *Server) startConnectLeader() error { + if !s.config.ConnectEnabled { + return nil } - req := structs.CARequest{ - Op: structs.CAOpSetConfig, - Config: config, - } - if resp, err := s.raftApply(structs.ConnectCARequestType, req); err != nil { - return nil, err - } else if respErr, ok := resp.(error); ok { - return nil, respErr + // Start the Connect secondary DC actions if enabled. + if s.config.Datacenter != s.config.PrimaryDatacenter { + s.leaderRoutineManager.Start(secondaryCARootWatchRoutineName, s.caManager.secondaryCARootWatch) } - return config, nil + s.leaderRoutineManager.Start(intermediateCertRenewWatchRoutineName, s.caManager.intermediateCertRenewalWatch) + s.leaderRoutineManager.Start(caRootPruningRoutineName, s.runCARootPruning) + return s.startIntentionConfigEntryMigration() } -// parseCARoot returns a filled-in structs.CARoot from a raw PEM value. -func parseCARoot(pemValue, provider, clusterID string) (*structs.CARoot, error) { - id, err := connect.CalculateCertFingerprint(pemValue) - if err != nil { - return nil, fmt.Errorf("error parsing root fingerprint: %v", err) +// stopConnectLeader stops connect specific leader functions. +func (s *Server) stopConnectLeader() { + s.leaderRoutineManager.Stop(intentionMigrationRoutineName) + s.leaderRoutineManager.Stop(secondaryCARootWatchRoutineName) + s.leaderRoutineManager.Stop(intermediateCertRenewWatchRoutineName) + s.leaderRoutineManager.Stop(caRootPruningRoutineName) + + // If the provider implements NeedsStop, we call Stop to perform any shutdown actions. + provider, _ := s.caManager.getCAProvider() + if provider != nil { + if needsStop, ok := provider.(ca.NeedsStop); ok { + needsStop.Stop() + } } - rootCert, err := connect.ParseCert(pemValue) - if err != nil { - return nil, fmt.Errorf("error parsing root cert: %v", err) - } - keyType, keyBits, err := connect.KeyInfoFromCert(rootCert) - if err != nil { - return nil, fmt.Errorf("error extracting root key info: %v", err) - } - return &structs.CARoot{ - ID: id, - Name: fmt.Sprintf("%s CA Root Cert", strings.Title(provider)), - SerialNumber: rootCert.SerialNumber.Uint64(), - SigningKeyID: connect.EncodeSigningKeyID(rootCert.SubjectKeyId), - ExternalTrustDomain: clusterID, - NotBefore: rootCert.NotBefore, - NotAfter: rootCert.NotAfter, - RootCert: pemValue, - PrivateKeyType: keyType, - PrivateKeyBits: keyBits, - Active: true, - }, nil } // createProvider returns a connect CA provider from the given config. @@ -126,505 +82,6 @@ func (s *Server) createCAProvider(conf *structs.CAConfiguration) (ca.Provider, e return p, nil } -// getCAProvider is being called while holding caProviderReconfigurationLock -// which means it must never take that lock itself or call anything that does. -func (s *Server) getCAProvider() (ca.Provider, *structs.CARoot) { - retries := 0 - var result ca.Provider - var resultRoot *structs.CARoot - for result == nil { - s.caProviderLock.RLock() - result = s.caProvider - resultRoot = s.caProviderRoot - s.caProviderLock.RUnlock() - - // In cases where an agent is started with managed proxies, we may ask - // for the provider before establishLeadership completes. If we're the - // leader, then wait and get the provider again - if result == nil && s.IsLeader() && retries < 10 { - retries++ - time.Sleep(50 * time.Millisecond) - continue - } - - break - } - - return result, resultRoot -} - -// setCAProvider is being called while holding caProviderReconfigurationLock -// which means it must never take that lock itself or call anything that does. -func (s *Server) setCAProvider(newProvider ca.Provider, root *structs.CARoot) { - s.caProviderLock.Lock() - defer s.caProviderLock.Unlock() - s.caProvider = newProvider - s.caProviderRoot = root -} - -// initializeCA sets up the CA provider when gaining leadership, either bootstrapping -// the CA if this is the primary DC or making a remote RPC for intermediate signing -// if this is a secondary DC. -func (s *Server) initializeCA() error { - connectLogger := s.loggers.Named(logging.Connect) - // Bail if connect isn't enabled. - if !s.config.ConnectEnabled { - return nil - } - - // Initialize the provider based on the current config. - conf, err := s.initializeCAConfig() - if err != nil { - return err - } - provider, err := s.createCAProvider(conf) - if err != nil { - return err - } - - s.caProviderReconfigurationLock.Lock() - defer s.caProviderReconfigurationLock.Unlock() - s.setCAProvider(provider, nil) - - // If this isn't the primary DC, run the secondary DC routine if the primary has already been upgraded to at least 1.6.0 - if s.config.PrimaryDatacenter != s.config.Datacenter { - versionOk, foundPrimary := ServersInDCMeetMinimumVersion(s, s.config.PrimaryDatacenter, minMultiDCConnectVersion) - if !foundPrimary { - connectLogger.Warn("primary datacenter is configured but unreachable - deferring initialization of the secondary datacenter CA") - // return nil because we will initialize the secondary CA later - return nil - } else if !versionOk { - // return nil because we will initialize the secondary CA later - connectLogger.Warn("servers in the primary datacenter are not at least at the minimum version - deferring initialization of the secondary datacenter CA", - "min_version", minMultiDCConnectVersion.String(), - ) - return nil - } - - // Get the root CA to see if we need to refresh our intermediate. - args := structs.DCSpecificRequest{ - Datacenter: s.config.PrimaryDatacenter, - } - var roots structs.IndexedCARoots - if err := s.forwardDC("ConnectCA.Roots", s.config.PrimaryDatacenter, &args, &roots); err != nil { - return err - } - - // Configure the CA provider and initialize the intermediate certificate if necessary. - if err := s.initializeSecondaryProvider(provider, roots); err != nil { - return fmt.Errorf("error configuring provider: %v", err) - } - if err := s.initializeSecondaryCA(provider, roots, nil); err != nil { - return err - } - - connectLogger.Info("initialized secondary datacenter CA with provider", "provider", conf.Provider) - return nil - } - - return s.initializeRootCA(provider, conf) -} - -// initializeRootCA runs the initialization logic for a root CA. -// It is being called while holding caProviderReconfigurationLock -// which means it must never take that lock itself or call anything that does. -func (s *Server) initializeRootCA(provider ca.Provider, conf *structs.CAConfiguration) error { - connectLogger := s.loggers.Named(logging.Connect) - pCfg := ca.ProviderConfig{ - ClusterID: conf.ClusterID, - Datacenter: s.config.Datacenter, - IsPrimary: true, - RawConfig: conf.Config, - State: conf.State, - } - if err := provider.Configure(pCfg); err != nil { - return fmt.Errorf("error configuring provider: %v", err) - } - if err := provider.GenerateRoot(); err != nil { - return fmt.Errorf("error generating CA root certificate: %v", err) - } - - // Get the active root cert from the CA - rootPEM, err := provider.ActiveRoot() - if err != nil { - return fmt.Errorf("error getting root cert: %v", err) - } - rootCA, err := parseCARoot(rootPEM, conf.Provider, conf.ClusterID) - if err != nil { - return err - } - - // Also create the intermediate CA, which is the one that actually signs leaf certs - interPEM, err := provider.GenerateIntermediate() - if err != nil { - return fmt.Errorf("error generating intermediate cert: %v", err) - } - _, err = connect.ParseCert(interPEM) - if err != nil { - return fmt.Errorf("error getting intermediate cert: %v", err) - } - - // If the provider has state to persist and it's changed or new then update - // CAConfig. - pState, err := provider.State() - if err != nil { - return fmt.Errorf("error getting provider state: %v", err) - } - if !reflect.DeepEqual(conf.State, pState) { - // Update the CAConfig in raft to persist the provider state - conf.State = pState - req := structs.CARequest{ - Op: structs.CAOpSetConfig, - Config: conf, - } - if _, err = s.raftApply(structs.ConnectCARequestType, req); err != nil { - return fmt.Errorf("error persisting provider state: %v", err) - } - } - - // Check if the CA root is already initialized and exit if it is, - // adding on any existing intermediate certs since they aren't directly - // tied to the provider. - // Every change to the CA after this initial bootstrapping should - // be done through the rotation process. - state := s.fsm.State() - _, activeRoot, err := state.CARootActive(nil) - if err != nil { - return err - } - if activeRoot != nil { - // This state shouldn't be possible to get into because we update the root and - // CA config in the same FSM operation. - if activeRoot.ID != rootCA.ID { - return fmt.Errorf("stored CA root %q is not the active root (%s)", rootCA.ID, activeRoot.ID) - } - - rootCA.IntermediateCerts = activeRoot.IntermediateCerts - s.setCAProvider(provider, rootCA) - - return nil - } - - // Get the highest index - idx, _, err := state.CARoots(nil) - if err != nil { - return err - } - - // Store the root cert in raft - resp, err := s.raftApply(structs.ConnectCARequestType, &structs.CARequest{ - Op: structs.CAOpSetRoots, - Index: idx, - Roots: []*structs.CARoot{rootCA}, - }) - if err != nil { - connectLogger.Error("Raft apply failed", "error", err) - return err - } - if respErr, ok := resp.(error); ok { - return respErr - } - - s.setCAProvider(provider, rootCA) - - connectLogger.Info("initialized primary datacenter CA with provider", "provider", conf.Provider) - - return nil -} - -// initializeSecondaryCA runs the routine for generating an intermediate CA CSR and getting -// it signed by the primary DC if the root CA of the primary DC has changed since the last -// intermediate. -// It is being called while holding caProviderReconfigurationLock -// which means it must never take that lock itself or call anything that does. -func (s *Server) initializeSecondaryCA(provider ca.Provider, primaryRoots structs.IndexedCARoots, config *structs.CAConfiguration) error { - activeIntermediate, err := provider.ActiveIntermediate() - if err != nil { - return err - } - - var ( - storedRootID string - expectedSigningKeyID string - currentSigningKeyID string - activeSecondaryRoot *structs.CARoot - ) - if activeIntermediate != "" { - // In the event that we already have an intermediate, we must have - // already replicated some primary root information locally, so check - // to see if we're up to date by fetching the rootID and the - // signingKeyID used in the secondary. - // - // Note that for the same rootID the primary representation of the root - // will have a different SigningKeyID field than the secondary - // representation of the same root. This is because it's derived from - // the intermediate which is different in all datacenters. - storedRoot, err := provider.ActiveRoot() - if err != nil { - return err - } - - storedRootID, err = connect.CalculateCertFingerprint(storedRoot) - if err != nil { - return fmt.Errorf("error parsing root fingerprint: %v, %#v", err, storedRoot) - } - - intermediateCert, err := connect.ParseCert(activeIntermediate) - if err != nil { - return fmt.Errorf("error parsing active intermediate cert: %v", err) - } - expectedSigningKeyID = connect.EncodeSigningKeyID(intermediateCert.SubjectKeyId) - - // This will fetch the secondary's exact current representation of the - // active root. Note that this data should only be used if the IDs - // match, otherwise it's out of date and should be regenerated. - _, activeSecondaryRoot, err = s.fsm.State().CARootActive(nil) - if err != nil { - return err - } - if activeSecondaryRoot != nil { - currentSigningKeyID = activeSecondaryRoot.SigningKeyID - } - } - - // Determine which of the provided PRIMARY representations of roots is the - // active one. We'll use this as a template to generate any new root - // representations meant for this secondary. - var newActiveRoot *structs.CARoot - for _, root := range primaryRoots.Roots { - if root.ID == primaryRoots.ActiveRootID && root.Active { - newActiveRoot = root - break - } - } - if newActiveRoot == nil { - return fmt.Errorf("primary datacenter does not have an active root CA for Connect") - } - - // Get a signed intermediate from the primary DC if the provider - // hasn't been initialized yet or if the primary's root has changed. - needsNewIntermediate := false - if activeIntermediate == "" || storedRootID != primaryRoots.ActiveRootID { - needsNewIntermediate = true - } - - // Also we take this opportunity to correct an incorrectly persisted SigningKeyID - // in secondary datacenters (see PR-6513). - if expectedSigningKeyID != "" && currentSigningKeyID != expectedSigningKeyID { - needsNewIntermediate = true - } - - newIntermediate := false - if needsNewIntermediate { - if err := s.getIntermediateCASigned(provider, newActiveRoot); err != nil { - return err - } - newIntermediate = true - } else { - // Discard the primary's representation since our local one is - // sufficiently up to date. - newActiveRoot = activeSecondaryRoot - } - - // Update the roots list in the state store if there's a new active root. - state := s.fsm.State() - _, activeRoot, err := state.CARootActive(nil) - if err != nil { - return err - } - - // Determine whether a root update is needed, and persist the roots/config accordingly. - var newRoot *structs.CARoot - if activeRoot == nil || activeRoot.ID != newActiveRoot.ID || newIntermediate { - newRoot = newActiveRoot - } - if err := s.persistNewRootAndConfig(provider, newRoot, config); err != nil { - return err - } - - s.setCAProvider(provider, newActiveRoot) - return nil -} - -// persistNewRootAndConfig is being called while holding caProviderReconfigurationLock -// which means it must never take that lock itself or call anything that does. -// If newActiveRoot is non-nil, it will be appended to the current roots list. -// If config is non-nil, it will be used to overwrite the existing config. -func (s *Server) persistNewRootAndConfig(provider ca.Provider, newActiveRoot *structs.CARoot, config *structs.CAConfiguration) error { - connectLogger := s.loggers.Named(logging.Connect) - state := s.fsm.State() - idx, oldRoots, err := state.CARoots(nil) - if err != nil { - return err - } - - var newConf structs.CAConfiguration - _, storedConfig, err := state.CAConfig(nil) - if err != nil { - return err - } - if storedConfig == nil { - return fmt.Errorf("local CA not initialized yet") - } - if config != nil { - newConf = *config - } else { - newConf = *storedConfig - } - newConf.ModifyIndex = storedConfig.ModifyIndex - if newActiveRoot != nil { - newConf.ClusterID = newActiveRoot.ExternalTrustDomain - } - - // Persist any state the provider needs us to - newConf.State, err = provider.State() - if err != nil { - return fmt.Errorf("error getting provider state: %v", err) - } - - // If there's a new active root, copy the root list and append it, updating - // the old root with the time it was rotated out. - var newRoots structs.CARoots - if newActiveRoot != nil { - for _, r := range oldRoots { - newRoot := *r - if newRoot.Active { - newRoot.Active = false - newRoot.RotatedOutAt = time.Now() - } - if newRoot.ExternalTrustDomain == "" { - newRoot.ExternalTrustDomain = config.ClusterID - } - newRoots = append(newRoots, &newRoot) - } - newRoots = append(newRoots, newActiveRoot) - } else { - newRoots = oldRoots - } - - args := &structs.CARequest{ - Op: structs.CAOpSetRootsAndConfig, - Index: idx, - Roots: newRoots, - Config: &newConf, - } - resp, err := s.raftApply(structs.ConnectCARequestType, &args) - if err != nil { - return err - } - if respErr, ok := resp.(error); ok { - return respErr - } - if respOk, ok := resp.(bool); ok && !respOk { - return fmt.Errorf("could not atomically update roots and config") - } - - connectLogger.Info("updated root certificates from primary datacenter") - return nil -} - -// getIntermediateCAPrimary regenerates the intermediate cert in the primary datacenter. -// This is only run for CAs that require an intermediary in the primary DC, such as Vault. -// This function is being called while holding caProviderReconfigurationLock -// which means it must never take that lock itself or call anything that does. -func (s *Server) getIntermediateCAPrimary(provider ca.Provider, newActiveRoot *structs.CARoot) error { - connectLogger := s.loggers.Named(logging.Connect) - // Generate and sign an intermediate cert using the root CA. - intermediatePEM, err := provider.GenerateIntermediate() - if err != nil { - return fmt.Errorf("error generating new intermediate cert: %v", err) - } - - intermediateCert, err := connect.ParseCert(intermediatePEM) - if err != nil { - return fmt.Errorf("error parsing intermediate cert: %v", err) - } - - // Append the new intermediate to our local active root entry. This is - // where the root representations start to diverge. - newActiveRoot.IntermediateCerts = append(newActiveRoot.IntermediateCerts, intermediatePEM) - newActiveRoot.SigningKeyID = connect.EncodeSigningKeyID(intermediateCert.SubjectKeyId) - - connectLogger.Info("generated new intermediate certificate for primary datacenter") - return nil -} - -// getIntermediateCASigned is being called while holding caProviderReconfigurationLock -// which means it must never take that lock itself or call anything that does. -func (s *Server) getIntermediateCASigned(provider ca.Provider, newActiveRoot *structs.CARoot) error { - connectLogger := s.loggers.Named(logging.Connect) - csr, err := provider.GenerateIntermediateCSR() - if err != nil { - return err - } - - var intermediatePEM string - if err := s.forwardDC("ConnectCA.SignIntermediate", s.config.PrimaryDatacenter, s.generateCASignRequest(csr), &intermediatePEM); err != nil { - // this is a failure in the primary and shouldn't be capable of erroring out our establishing leadership - connectLogger.Warn("Primary datacenter refused to sign our intermediate CA certificate", "error", err) - return nil - } - - if err := provider.SetIntermediate(intermediatePEM, newActiveRoot.RootCert); err != nil { - return fmt.Errorf("Failed to set the intermediate certificate with the CA provider: %v", err) - } - - intermediateCert, err := connect.ParseCert(intermediatePEM) - if err != nil { - return fmt.Errorf("error parsing intermediate cert: %v", err) - } - - // Append the new intermediate to our local active root entry. This is - // where the root representations start to diverge. - newActiveRoot.IntermediateCerts = append(newActiveRoot.IntermediateCerts, intermediatePEM) - newActiveRoot.SigningKeyID = connect.EncodeSigningKeyID(intermediateCert.SubjectKeyId) - - connectLogger.Info("received new intermediate certificate from primary datacenter") - return nil -} - -func (s *Server) generateCASignRequest(csr string) *structs.CASignRequest { - return &structs.CASignRequest{ - Datacenter: s.config.PrimaryDatacenter, - CSR: csr, - WriteRequest: structs.WriteRequest{Token: s.tokens.ReplicationToken()}, - } -} - -// startConnectLeader starts multi-dc connect leader routines. -func (s *Server) startConnectLeader() error { - if !s.config.ConnectEnabled { - return nil - } - - // Start the Connect secondary DC actions if enabled. - if s.config.Datacenter != s.config.PrimaryDatacenter { - s.leaderRoutineManager.Start(secondaryCARootWatchRoutineName, s.secondaryCARootWatch) - } - - s.leaderRoutineManager.Start(intermediateCertRenewWatchRoutineName, s.intermediateCertRenewalWatch) - s.leaderRoutineManager.Start(caRootPruningRoutineName, s.runCARootPruning) - return s.startIntentionConfigEntryMigration() -} - -// stopConnectLeader stops connect specific leader functions. -func (s *Server) stopConnectLeader() { - s.leaderRoutineManager.Stop(intentionMigrationRoutineName) - s.leaderRoutineManager.Stop(secondaryCARootWatchRoutineName) - s.leaderRoutineManager.Stop(intermediateCertRenewWatchRoutineName) - s.leaderRoutineManager.Stop(caRootPruningRoutineName) - - // If the provider implements NeedsStop, we call Stop to perform any shutdown actions. - s.caProviderReconfigurationLock.Lock() - defer s.caProviderReconfigurationLock.Unlock() - provider, _ := s.getCAProvider() - if provider != nil { - if needsStop, ok := provider.(ca.NeedsStop); ok { - needsStop.Stop() - } - } -} - func (s *Server) runCARootPruning(ctx context.Context) error { ticker := time.NewTicker(caRootPruneInterval) defer ticker.Stop() @@ -694,155 +151,6 @@ func (s *Server) pruneCARoots() error { return nil } -// intermediateCertRenewalWatch checks the intermediate cert for -// expiration. As soon as more than half the time a cert is valid has passed, -// it will try to renew it. -func (s *Server) intermediateCertRenewalWatch(ctx context.Context) error { - connectLogger := s.loggers.Named(logging.Connect) - isPrimary := s.config.Datacenter == s.config.PrimaryDatacenter - - for { - select { - case <-ctx.Done(): - return nil - case <-time.After(structs.IntermediateCertRenewInterval): - retryLoopBackoffAbortOnSuccess(ctx, func() error { - s.caProviderReconfigurationLock.Lock() - defer s.caProviderReconfigurationLock.Unlock() - - provider, _ := s.getCAProvider() - if provider == nil { - // this happens when leadership is being revoked and this go routine will be stopped - return nil - } - // If this isn't the primary, make sure the CA has been initialized. - if !isPrimary && !s.configuredSecondaryCA() { - return fmt.Errorf("secondary CA is not yet configured.") - } - - state := s.fsm.State() - _, activeRoot, err := state.CARootActive(nil) - if err != nil { - return err - } - - // If this is the primary, check if this is a provider that uses an intermediate cert. If - // it isn't, we don't need to check for a renewal. - if isPrimary { - _, config, err := state.CAConfig(nil) - if err != nil { - return err - } - - if _, ok := ca.PrimaryIntermediateProviders[config.Provider]; !ok { - return nil - } - } - - activeIntermediate, err := provider.ActiveIntermediate() - if err != nil { - return err - } - - if activeIntermediate == "" { - return fmt.Errorf("datacenter doesn't have an active intermediate.") - } - - intermediateCert, err := connect.ParseCert(activeIntermediate) - if err != nil { - return fmt.Errorf("error parsing active intermediate cert: %v", err) - } - - if lessThanHalfTimePassed(time.Now(), intermediateCert.NotBefore.Add(ca.CertificateTimeDriftBuffer), - intermediateCert.NotAfter) { - return nil - } - - renewalFunc := s.getIntermediateCAPrimary - if !isPrimary { - renewalFunc = s.getIntermediateCASigned - } - if err := renewalFunc(provider, activeRoot); err != nil { - return err - } - - if err := s.persistNewRootAndConfig(provider, activeRoot, nil); err != nil { - return err - } - - s.setCAProvider(provider, activeRoot) - return nil - }, func(err error) { - connectLogger.Error("error renewing intermediate certs", - "routine", intermediateCertRenewWatchRoutineName, - "error", err, - ) - }) - } - } -} - -// secondaryCARootWatch maintains a blocking query to the primary datacenter's -// ConnectCA.Roots endpoint to monitor when it needs to request a new signed -// intermediate certificate. -func (s *Server) secondaryCARootWatch(ctx context.Context) error { - connectLogger := s.loggers.Named(logging.Connect) - args := structs.DCSpecificRequest{ - Datacenter: s.config.PrimaryDatacenter, - QueryOptions: structs.QueryOptions{ - // the maximum time the primary roots watch query can block before returning - MaxQueryTime: s.config.MaxQueryTime, - }, - } - - connectLogger.Debug("starting Connect CA root replication from primary datacenter", "primary", s.config.PrimaryDatacenter) - - retryLoopBackoff(ctx, func() error { - var roots structs.IndexedCARoots - if err := s.forwardDC("ConnectCA.Roots", s.config.PrimaryDatacenter, &args, &roots); err != nil { - return fmt.Errorf("Error retrieving the primary datacenter's roots: %v", err) - } - - // Check to see if the primary has been upgraded in case we're waiting to switch to - // secondary mode. - provider, _ := s.getCAProvider() - if provider == nil { - // this happens when leadership is being revoked and this go routine will be stopped - return nil - } - if !s.configuredSecondaryCA() { - versionOk, primaryFound := ServersInDCMeetMinimumVersion(s, s.config.PrimaryDatacenter, minMultiDCConnectVersion) - if !primaryFound { - return fmt.Errorf("Primary datacenter is unreachable - deferring secondary CA initialization") - } - - if versionOk { - if err := s.initializeSecondaryProvider(provider, roots); err != nil { - return fmt.Errorf("Failed to initialize secondary CA provider: %v", err) - } - } - } - - // Run the secondary CA init routine to see if we need to request a new - // intermediate. - if s.configuredSecondaryCA() { - if err := s.initializeSecondaryCA(provider, roots, nil); err != nil { - return fmt.Errorf("Failed to initialize the secondary CA: %v", err) - } - } - - args.QueryOptions.MinQueryIndex = nextIndexVal(args.QueryOptions.MinQueryIndex, roots.QueryMeta.Index) - return nil - }, func(err error) { - connectLogger.Error("CA root replication failed, will retry", - "routine", secondaryCARootWatchRoutineName, - "error", err, - ) - }) - - return nil -} - // retryLoopBackoff loops a given function indefinitely, backing off exponentially // upon errors up to a maximum of maxRetryBackoff seconds. func retryLoopBackoff(ctx context.Context, loopFn func() error, errFn func(error)) { @@ -898,46 +206,6 @@ func nextIndexVal(prevIdx, idx uint64) uint64 { return idx } -// initializeSecondaryProvider configures the given provider for a secondary, non-root datacenter. -// It is being called while holding caProviderReconfigurationLock which means -// it must never take that lock itself or call anything that does. -func (s *Server) initializeSecondaryProvider(provider ca.Provider, roots structs.IndexedCARoots) error { - if roots.TrustDomain == "" { - return fmt.Errorf("trust domain from primary datacenter is not initialized") - } - - clusterID := strings.Split(roots.TrustDomain, ".")[0] - _, conf, err := s.fsm.State().CAConfig(nil) - if err != nil { - return err - } - - pCfg := ca.ProviderConfig{ - ClusterID: clusterID, - Datacenter: s.config.Datacenter, - IsPrimary: false, - RawConfig: conf.Config, - State: conf.State, - } - if err := provider.Configure(pCfg); err != nil { - return fmt.Errorf("error configuring provider: %v", err) - } - - s.actingSecondaryLock.Lock() - s.actingSecondaryCA = true - s.actingSecondaryLock.Unlock() - - return nil -} - -// configuredSecondaryCA is being called while holding caProviderReconfigurationLock -// which means it must never take that lock itself or call anything that does. -func (s *Server) configuredSecondaryCA() bool { - s.actingSecondaryLock.RLock() - defer s.actingSecondaryLock.RUnlock() - return s.actingSecondaryCA -} - // halfTime returns a duration that is half the time between notBefore and // notAfter. func halfTime(notBefore, notAfter time.Time) time.Duration { @@ -953,3 +221,11 @@ func lessThanHalfTimePassed(now, notBefore, notAfter time.Time) bool { t := notBefore.Add(halfTime(notBefore, notAfter)) return t.Sub(now) > 0 } + +func (s *Server) generateCASignRequest(csr string) *structs.CASignRequest { + return &structs.CASignRequest{ + Datacenter: s.config.PrimaryDatacenter, + CSR: csr, + WriteRequest: structs.WriteRequest{Token: s.tokens.ReplicationToken()}, + } +} diff --git a/agent/consul/leader_connect_ca.go b/agent/consul/leader_connect_ca.go new file mode 100644 index 000000000..e1d7ffec1 --- /dev/null +++ b/agent/consul/leader_connect_ca.go @@ -0,0 +1,821 @@ +package consul + +import ( + "context" + "fmt" + "reflect" + "strings" + "sync" + "time" + + "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/connect/ca" + "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/logging" + "github.com/hashicorp/go-hclog" + uuid "github.com/hashicorp/go-uuid" +) + +type CAState string + +const ( + CAStateUninitialized CAState = "UNINITIALIZED" + CAStateInitializing = "INITIALIZING" + CAStateReady = "READY" + CAStateRenewIntermediate = "RENEWING" + CAStateReconfig = "RECONFIGURING" +) + +// CAManager is a wrapper around CA operations such as updating roots, an intermediate +// or the configuration. All operations should go through the CAManager in order to +// avoid data races. +type CAManager struct { + srv *Server + logger hclog.Logger + + // provider is the current CA provider in use for Connect. This is + // only non-nil when we are the leader. + provider ca.Provider + + // providerRoot is the CARoot that was stored along with the ca.Provider + // active. It's only updated in lock-step with the provider. This prevents + // races between state updates to active roots and the fetch of the provider + // instance. + providerRoot *structs.CARoot + providerLock sync.RWMutex + + // primaryRoots is the most recently seen state of the root CAs from the primary datacenter. + // This is protected by the stateLock and updated by initializeCA and the root CA watch routine. + primaryRoots structs.IndexedCARoots + + // actingSecondaryCA is whether this datacenter has been initialized as a secondary CA. + actingSecondaryCA bool + state CAState + stateLock sync.RWMutex +} + +func NewCAManager(srv *Server) *CAManager { + return &CAManager{ + srv: srv, + logger: srv.loggers.Named(logging.Connect), + state: CAStateUninitialized, + } +} + +// setState attempts to update the CA state to the given state. +// If the current state is not READY, this will fail. The only exception is when +// the current state is UNINITIALIZED, and the function is called with CAStateInitializing. +func (c *CAManager) setState(newState CAState) error { + c.stateLock.RLock() + state := c.state + c.stateLock.RUnlock() + + if state == CAStateReady || (state == CAStateUninitialized && newState == CAStateInitializing) { + c.stateLock.Lock() + c.state = newState + c.stateLock.Unlock() + } else { + return fmt.Errorf("CA is already in %s state", state) + } + return nil +} + +// setReady sets the CA state back to READY. This should only be called by a function +// that has successfully called setState beforehand. +func (c *CAManager) setReady() { + c.stateLock.Lock() + c.state = CAStateReady + c.stateLock.Unlock() +} + +// initializeCAConfig is used to initialize the CA config if necessary +// when setting up the CA during establishLeadership +func (c *CAManager) initializeCAConfig() (*structs.CAConfiguration, error) { + state := c.srv.fsm.State() + _, config, err := state.CAConfig(nil) + if err != nil { + return nil, err + } + if config == nil { + config = c.srv.config.CAConfig + if config.ClusterID == "" { + id, err := uuid.GenerateUUID() + if err != nil { + return nil, err + } + config.ClusterID = id + } + } else if _, ok := config.Config["IntermediateCertTTL"]; !ok { + dup := *config + copied := make(map[string]interface{}) + for k, v := range dup.Config { + copied[k] = v + } + copied["IntermediateCertTTL"] = connect.DefaultIntermediateCertTTL.String() + dup.Config = copied + config = &dup + } else { + return config, nil + } + + req := structs.CARequest{ + Op: structs.CAOpSetConfig, + Config: config, + } + if resp, err := c.srv.raftApply(structs.ConnectCARequestType, req); err != nil { + return nil, err + } else if respErr, ok := resp.(error); ok { + return nil, respErr + } + + return config, nil +} + +// parseCARoot returns a filled-in structs.CARoot from a raw PEM value. +func parseCARoot(pemValue, provider, clusterID string) (*structs.CARoot, error) { + id, err := connect.CalculateCertFingerprint(pemValue) + if err != nil { + return nil, fmt.Errorf("error parsing root fingerprint: %v", err) + } + rootCert, err := connect.ParseCert(pemValue) + if err != nil { + return nil, fmt.Errorf("error parsing root cert: %v", err) + } + keyType, keyBits, err := connect.KeyInfoFromCert(rootCert) + if err != nil { + return nil, fmt.Errorf("error extracting root key info: %v", err) + } + return &structs.CARoot{ + ID: id, + Name: fmt.Sprintf("%s CA Root Cert", strings.Title(provider)), + SerialNumber: rootCert.SerialNumber.Uint64(), + SigningKeyID: connect.EncodeSigningKeyID(rootCert.SubjectKeyId), + ExternalTrustDomain: clusterID, + NotBefore: rootCert.NotBefore, + NotAfter: rootCert.NotAfter, + RootCert: pemValue, + PrivateKeyType: keyType, + PrivateKeyBits: keyBits, + Active: true, + }, nil +} + +// getCAProvider is being called while holding caProviderReconfigurationLock +// which means it must never take that lock itself or call anything that does. +func (c *CAManager) getCAProvider() (ca.Provider, *structs.CARoot) { + retries := 0 + var result ca.Provider + var resultRoot *structs.CARoot + for result == nil { + c.providerLock.RLock() + result = c.provider + resultRoot = c.providerRoot + c.providerLock.RUnlock() + + // In cases where an agent is started with managed proxies, we may ask + // for the provider before establishLeadership completes. If we're the + // leader, then wait and get the provider again + if result == nil && c.srv.IsLeader() && retries < 10 { + retries++ + time.Sleep(50 * time.Millisecond) + continue + } + + break + } + + return result, resultRoot +} + +// setCAProvider is being called while holding the stateLock +// which means it must never take that lock itself or call anything that does. +func (c *CAManager) setCAProvider(newProvider ca.Provider, root *structs.CARoot) { + c.providerLock.Lock() + c.provider = newProvider + c.providerRoot = root + c.providerLock.Unlock() +} + +// initializeCA sets up the CA provider when gaining leadership, either bootstrapping +// the CA if this is the primary DC or making a remote RPC for intermediate signing +// if this is a secondary DC. +func (c *CAManager) initializeCA() error { + // Bail if connect isn't enabled. + if !c.srv.config.ConnectEnabled { + return nil + } + + err := c.setState(CAStateInitializing) + if err != nil { + return err + } + defer c.setReady() + + // Initialize the provider based on the current config. + conf, err := c.initializeCAConfig() + if err != nil { + return err + } + provider, err := c.srv.createCAProvider(conf) + if err != nil { + return err + } + + c.setCAProvider(provider, nil) + + // If this isn't the primary DC, run the secondary DC routine if the primary has already been upgraded to at least 1.6.0 + if c.srv.config.PrimaryDatacenter != c.srv.config.Datacenter { + versionOk, foundPrimary := ServersInDCMeetMinimumVersion(c.srv, c.srv.config.PrimaryDatacenter, minMultiDCConnectVersion) + if !foundPrimary { + c.logger.Warn("primary datacenter is configured but unreachable - deferring initialization of the secondary datacenter CA") + // return nil because we will initialize the secondary CA later + return nil + } else if !versionOk { + // return nil because we will initialize the secondary CA later + c.logger.Warn("servers in the primary datacenter are not at least at the minimum version - deferring initialization of the secondary datacenter CA", + "min_version", minMultiDCConnectVersion.String(), + ) + return nil + } + + // Get the root CA to see if we need to refresh our intermediate. + args := structs.DCSpecificRequest{ + Datacenter: c.srv.config.PrimaryDatacenter, + } + var roots structs.IndexedCARoots + if err := c.srv.forwardDC("ConnectCA.Roots", c.srv.config.PrimaryDatacenter, &args, &roots); err != nil { + return err + } + c.primaryRoots = roots + + // Configure the CA provider and initialize the intermediate certificate if necessary. + if err := c.initializeSecondaryProvider(provider, roots); err != nil { + return fmt.Errorf("error configuring provider: %v", err) + } + if err := c.initializeSecondaryCA(provider, nil); err != nil { + return err + } + + c.logger.Info("initialized secondary datacenter CA with provider", "provider", conf.Provider) + return nil + } + + return c.initializeRootCA(provider, conf) +} + +// initializeRootCA runs the initialization logic for a root CA. +// It is being called while holding caProviderReconfigurationLock +// which means it must never take that lock itself or call anything that does. +func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfiguration) error { + pCfg := ca.ProviderConfig{ + ClusterID: conf.ClusterID, + Datacenter: c.srv.config.Datacenter, + IsPrimary: true, + RawConfig: conf.Config, + State: conf.State, + } + if err := provider.Configure(pCfg); err != nil { + return fmt.Errorf("error configuring provider: %v", err) + } + if err := provider.GenerateRoot(); err != nil { + return fmt.Errorf("error generating CA root certificate: %v", err) + } + + // Get the active root cert from the CA + rootPEM, err := provider.ActiveRoot() + if err != nil { + return fmt.Errorf("error getting root cert: %v", err) + } + rootCA, err := parseCARoot(rootPEM, conf.Provider, conf.ClusterID) + if err != nil { + return err + } + + // Also create the intermediate CA, which is the one that actually signs leaf certs + interPEM, err := provider.GenerateIntermediate() + if err != nil { + return fmt.Errorf("error generating intermediate cert: %v", err) + } + _, err = connect.ParseCert(interPEM) + if err != nil { + return fmt.Errorf("error getting intermediate cert: %v", err) + } + + // If the provider has state to persist and it's changed or new then update + // CAConfig. + pState, err := provider.State() + if err != nil { + return fmt.Errorf("error getting provider state: %v", err) + } + if !reflect.DeepEqual(conf.State, pState) { + // Update the CAConfig in raft to persist the provider state + conf.State = pState + req := structs.CARequest{ + Op: structs.CAOpSetConfig, + Config: conf, + } + if _, err = c.srv.raftApply(structs.ConnectCARequestType, req); err != nil { + return fmt.Errorf("error persisting provider state: %v", err) + } + } + + // Check if the CA root is already initialized and exit if it is, + // adding on any existing intermediate certs since they aren't directly + // tied to the provider. + // Every change to the CA after this initial bootstrapping should + // be done through the rotation process. + state := c.srv.fsm.State() + _, activeRoot, err := state.CARootActive(nil) + if err != nil { + return err + } + if activeRoot != nil { + // This state shouldn't be possible to get into because we update the root and + // CA config in the same FSM operation. + if activeRoot.ID != rootCA.ID { + return fmt.Errorf("stored CA root %q is not the active root (%s)", rootCA.ID, activeRoot.ID) + } + + rootCA.IntermediateCerts = activeRoot.IntermediateCerts + c.setCAProvider(provider, rootCA) + + return nil + } + + // Get the highest index + idx, _, err := state.CARoots(nil) + if err != nil { + return err + } + + // Store the root cert in raft + resp, err := c.srv.raftApply(structs.ConnectCARequestType, &structs.CARequest{ + Op: structs.CAOpSetRoots, + Index: idx, + Roots: []*structs.CARoot{rootCA}, + }) + if err != nil { + c.logger.Error("Raft apply failed", "error", err) + return err + } + if respErr, ok := resp.(error); ok { + return respErr + } + + c.setCAProvider(provider, rootCA) + + c.logger.Info("initialized primary datacenter CA with provider", "provider", conf.Provider) + + return nil +} + +// initializeSecondaryCA runs the routine for generating an intermediate CA CSR and getting +// it signed by the primary DC if the root CA of the primary DC has changed since the last +// intermediate. +// It is being called while holding caProviderReconfigurationLock +// which means it must never take that lock itself or call anything that does. +func (c *CAManager) initializeSecondaryCA(provider ca.Provider, config *structs.CAConfiguration) error { + activeIntermediate, err := provider.ActiveIntermediate() + if err != nil { + return err + } + + var ( + storedRootID string + expectedSigningKeyID string + currentSigningKeyID string + activeSecondaryRoot *structs.CARoot + ) + if activeIntermediate != "" { + // In the event that we already have an intermediate, we must have + // already replicated some primary root information locally, so check + // to see if we're up to date by fetching the rootID and the + // signingKeyID used in the secondary. + // + // Note that for the same rootID the primary representation of the root + // will have a different SigningKeyID field than the secondary + // representation of the same root. This is because it's derived from + // the intermediate which is different in all datacenters. + storedRoot, err := provider.ActiveRoot() + if err != nil { + return err + } + + storedRootID, err = connect.CalculateCertFingerprint(storedRoot) + if err != nil { + return fmt.Errorf("error parsing root fingerprint: %v, %#v", err, storedRoot) + } + + intermediateCert, err := connect.ParseCert(activeIntermediate) + if err != nil { + return fmt.Errorf("error parsing active intermediate cert: %v", err) + } + expectedSigningKeyID = connect.EncodeSigningKeyID(intermediateCert.SubjectKeyId) + + // This will fetch the secondary's exact current representation of the + // active root. Note that this data should only be used if the IDs + // match, otherwise it's out of date and should be regenerated. + _, activeSecondaryRoot, err = c.srv.fsm.State().CARootActive(nil) + if err != nil { + return err + } + if activeSecondaryRoot != nil { + currentSigningKeyID = activeSecondaryRoot.SigningKeyID + } + } + + // Determine which of the provided PRIMARY representations of roots is the + // active one. We'll use this as a template to generate any new root + // representations meant for this secondary. + var newActiveRoot *structs.CARoot + for _, root := range c.primaryRoots.Roots { + if root.ID == c.primaryRoots.ActiveRootID && root.Active { + newActiveRoot = root + break + } + } + if newActiveRoot == nil { + return fmt.Errorf("primary datacenter does not have an active root CA for Connect") + } + + // Get a signed intermediate from the primary DC if the provider + // hasn't been initialized yet or if the primary's root has changed. + needsNewIntermediate := false + if activeIntermediate == "" || storedRootID != c.primaryRoots.ActiveRootID { + needsNewIntermediate = true + } + + // Also we take this opportunity to correct an incorrectly persisted SigningKeyID + // in secondary datacenters (see PR-6513). + if expectedSigningKeyID != "" && currentSigningKeyID != expectedSigningKeyID { + needsNewIntermediate = true + } + + newIntermediate := false + if needsNewIntermediate { + if err := c.getIntermediateCASigned(provider, newActiveRoot); err != nil { + return err + } + newIntermediate = true + } else { + // Discard the primary's representation since our local one is + // sufficiently up to date. + newActiveRoot = activeSecondaryRoot + } + + // Update the roots list in the state store if there's a new active root. + state := c.srv.fsm.State() + _, activeRoot, err := state.CARootActive(nil) + if err != nil { + return err + } + + // Determine whether a root update is needed, and persist the roots/config accordingly. + var newRoot *structs.CARoot + if activeRoot == nil || activeRoot.ID != newActiveRoot.ID || newIntermediate { + newRoot = newActiveRoot + } + if err := c.persistNewRootAndConfig(provider, newRoot, config); err != nil { + return err + } + + c.setCAProvider(provider, newActiveRoot) + return nil +} + +// persistNewRootAndConfig is being called while holding caProviderReconfigurationLock +// which means it must never take that lock itself or call anything that does. +// If newActiveRoot is non-nil, it will be appended to the current roots list. +// If config is non-nil, it will be used to overwrite the existing config. +func (c *CAManager) persistNewRootAndConfig(provider ca.Provider, newActiveRoot *structs.CARoot, config *structs.CAConfiguration) error { + state := c.srv.fsm.State() + idx, oldRoots, err := state.CARoots(nil) + if err != nil { + return err + } + + var newConf structs.CAConfiguration + _, storedConfig, err := state.CAConfig(nil) + if err != nil { + return err + } + if storedConfig == nil { + return fmt.Errorf("local CA not initialized yet") + } + if config != nil { + newConf = *config + } else { + newConf = *storedConfig + } + newConf.ModifyIndex = storedConfig.ModifyIndex + if newActiveRoot != nil { + newConf.ClusterID = newActiveRoot.ExternalTrustDomain + } + + // Persist any state the provider needs us to + newConf.State, err = provider.State() + if err != nil { + return fmt.Errorf("error getting provider state: %v", err) + } + + // If there's a new active root, copy the root list and append it, updating + // the old root with the time it was rotated out. + var newRoots structs.CARoots + if newActiveRoot != nil { + for _, r := range oldRoots { + newRoot := *r + if newRoot.Active { + newRoot.Active = false + newRoot.RotatedOutAt = time.Now() + } + if newRoot.ExternalTrustDomain == "" { + newRoot.ExternalTrustDomain = config.ClusterID + } + newRoots = append(newRoots, &newRoot) + } + newRoots = append(newRoots, newActiveRoot) + } else { + newRoots = oldRoots + } + + args := &structs.CARequest{ + Op: structs.CAOpSetRootsAndConfig, + Index: idx, + Roots: newRoots, + Config: &newConf, + } + resp, err := c.srv.raftApply(structs.ConnectCARequestType, &args) + if err != nil { + return err + } + if respErr, ok := resp.(error); ok { + return respErr + } + if respOk, ok := resp.(bool); ok && !respOk { + return fmt.Errorf("could not atomically update roots and config") + } + + c.logger.Info("updated root certificates from primary datacenter") + return nil +} + +// getIntermediateCAPrimary regenerates the intermediate cert in the primary datacenter. +// This is only run for CAs that require an intermediary in the primary DC, such as Vault. +// This function is being called while holding caProviderReconfigurationLock +// which means it must never take that lock itself or call anything that does. +func (c *CAManager) getIntermediateCAPrimary(provider ca.Provider, newActiveRoot *structs.CARoot) error { + // Generate and sign an intermediate cert using the root CA. + intermediatePEM, err := provider.GenerateIntermediate() + if err != nil { + return fmt.Errorf("error generating new intermediate cert: %v", err) + } + + intermediateCert, err := connect.ParseCert(intermediatePEM) + if err != nil { + return fmt.Errorf("error parsing intermediate cert: %v", err) + } + + // Append the new intermediate to our local active root entry. This is + // where the root representations start to diverge. + newActiveRoot.IntermediateCerts = append(newActiveRoot.IntermediateCerts, intermediatePEM) + newActiveRoot.SigningKeyID = connect.EncodeSigningKeyID(intermediateCert.SubjectKeyId) + + c.logger.Info("generated new intermediate certificate for primary datacenter") + return nil +} + +// getIntermediateCASigned is being called while holding caProviderReconfigurationLock +// which means it must never take that lock itself or call anything that does. +func (c *CAManager) getIntermediateCASigned(provider ca.Provider, newActiveRoot *structs.CARoot) error { + csr, err := provider.GenerateIntermediateCSR() + if err != nil { + return err + } + + var intermediatePEM string + if err := c.srv.forwardDC("ConnectCA.SignIntermediate", c.srv.config.PrimaryDatacenter, c.srv.generateCASignRequest(csr), &intermediatePEM); err != nil { + // this is a failure in the primary and shouldn't be capable of erroring out our establishing leadership + c.logger.Warn("Primary datacenter refused to sign our intermediate CA certificate", "error", err) + return nil + } + + if err := provider.SetIntermediate(intermediatePEM, newActiveRoot.RootCert); err != nil { + return fmt.Errorf("Failed to set the intermediate certificate with the CA provider: %v", err) + } + + intermediateCert, err := connect.ParseCert(intermediatePEM) + if err != nil { + return fmt.Errorf("error parsing intermediate cert: %v", err) + } + + // Append the new intermediate to our local active root entry. This is + // where the root representations start to diverge. + newActiveRoot.IntermediateCerts = append(newActiveRoot.IntermediateCerts, intermediatePEM) + newActiveRoot.SigningKeyID = connect.EncodeSigningKeyID(intermediateCert.SubjectKeyId) + + c.logger.Info("received new intermediate certificate from primary datacenter") + return nil +} + +// intermediateCertRenewalWatch checks the intermediate cert for +// expiration. As soon as more than half the time a cert is valid has passed, +// it will try to renew it. +func (c *CAManager) intermediateCertRenewalWatch(ctx context.Context) error { + isPrimary := c.srv.config.Datacenter == c.srv.config.PrimaryDatacenter + + for { + select { + case <-ctx.Done(): + return nil + case <-time.After(structs.IntermediateCertRenewInterval): + retryLoopBackoffAbortOnSuccess(ctx, func() error { + if !isPrimary { + c.logger.Info("starting check for intermediate renewal") + } + // Grab the 'lock' right away so the provider/config can't be changed out while we check + // the intermediate. + if err := c.setState(CAStateRenewIntermediate); err != nil { + return err + } + defer c.setReady() + + provider, _ := c.getCAProvider() + if provider == nil { + // this happens when leadership is being revoked and this go routine will be stopped + return nil + } + // If this isn't the primary, make sure the CA has been initialized. + if !isPrimary && !c.configuredSecondaryCA() { + return fmt.Errorf("secondary CA is not yet configured.") + } + + state := c.srv.fsm.State() + _, activeRoot, err := state.CARootActive(nil) + if err != nil { + return err + } + + // If this is the primary, check if this is a provider that uses an intermediate cert. If + // it isn't, we don't need to check for a renewal. + if isPrimary { + _, config, err := state.CAConfig(nil) + if err != nil { + return err + } + + if _, ok := ca.PrimaryIntermediateProviders[config.Provider]; !ok { + return nil + } + } else { + c.logger.Info("Checking for intermediate renewal") + } + + activeIntermediate, err := provider.ActiveIntermediate() + if err != nil { + return err + } + + if activeIntermediate == "" { + return fmt.Errorf("datacenter doesn't have an active intermediate.") + } + + intermediateCert, err := connect.ParseCert(activeIntermediate) + if err != nil { + return fmt.Errorf("error parsing active intermediate cert: %v", err) + } + + if lessThanHalfTimePassed(time.Now(), intermediateCert.NotBefore.Add(ca.CertificateTimeDriftBuffer), + intermediateCert.NotAfter) { + return nil + } + + // Enough time has passed, go ahead with getting a new intermediate. + renewalFunc := c.getIntermediateCAPrimary + if !isPrimary { + renewalFunc = c.getIntermediateCASigned + } + if err := renewalFunc(provider, activeRoot); err != nil { + return err + } + + if err := c.persistNewRootAndConfig(provider, activeRoot, nil); err != nil { + return err + } + + c.setCAProvider(provider, activeRoot) + return nil + }, func(err error) { + c.logger.Error("error renewing intermediate certs", + "routine", intermediateCertRenewWatchRoutineName, + "error", err, + ) + }) + } + } +} + +// secondaryCARootWatch maintains a blocking query to the primary datacenter's +// ConnectCA.Roots endpoint to monitor when it needs to request a new signed +// intermediate certificate. +func (c *CAManager) secondaryCARootWatch(ctx context.Context) error { + args := structs.DCSpecificRequest{ + Datacenter: c.srv.config.PrimaryDatacenter, + QueryOptions: structs.QueryOptions{ + // the maximum time the primary roots watch query can block before returning + MaxQueryTime: c.srv.config.MaxQueryTime, + }, + } + + c.logger.Debug("starting Connect CA root replication from primary datacenter", "primary", c.srv.config.PrimaryDatacenter) + + retryLoopBackoff(ctx, func() error { + var roots structs.IndexedCARoots + if err := c.srv.forwardDC("ConnectCA.Roots", c.srv.config.PrimaryDatacenter, &args, &roots); err != nil { + return fmt.Errorf("Error retrieving the primary datacenter's roots: %v", err) + } + + // Update the state first to claim the 'lock'. + if err := c.setState(CAStateReconfig); err != nil { + return err + } + defer c.setReady() + + // Update the cached primary roots now that the lock is held. + c.primaryRoots = roots + + // Check to see if the primary has been upgraded in case we're waiting to switch to + // secondary mode. + provider, _ := c.getCAProvider() + if provider == nil { + // this happens when leadership is being revoked and this go routine will be stopped + return nil + } + if !c.configuredSecondaryCA() { + versionOk, primaryFound := ServersInDCMeetMinimumVersion(c.srv, c.srv.config.PrimaryDatacenter, minMultiDCConnectVersion) + if !primaryFound { + return fmt.Errorf("Primary datacenter is unreachable - deferring secondary CA initialization") + } + + if versionOk { + if err := c.initializeSecondaryProvider(provider, roots); err != nil { + return fmt.Errorf("Failed to initialize secondary CA provider: %v", err) + } + } + } + + // Run the secondary CA init routine to see if we need to request a new + // intermediate. + if c.configuredSecondaryCA() { + if err := c.initializeSecondaryCA(provider, nil); err != nil { + return fmt.Errorf("Failed to initialize the secondary CA: %v", err) + } + } + + args.QueryOptions.MinQueryIndex = nextIndexVal(args.QueryOptions.MinQueryIndex, roots.QueryMeta.Index) + return nil + }, func(err error) { + c.logger.Error("CA root replication failed, will retry", + "routine", secondaryCARootWatchRoutineName, + "error", err, + ) + }) + + return nil +} + +// initializeSecondaryProvider configures the given provider for a secondary, non-root datacenter. +// It is being called while holding the stateLock in order to update actingSecondaryCA, which means +// it must never take that lock itself or call anything that does. +func (c *CAManager) initializeSecondaryProvider(provider ca.Provider, roots structs.IndexedCARoots) error { + if roots.TrustDomain == "" { + return fmt.Errorf("trust domain from primary datacenter is not initialized") + } + + clusterID := strings.Split(roots.TrustDomain, ".")[0] + _, conf, err := c.srv.fsm.State().CAConfig(nil) + if err != nil { + return err + } + + pCfg := ca.ProviderConfig{ + ClusterID: clusterID, + Datacenter: c.srv.config.Datacenter, + IsPrimary: false, + RawConfig: conf.Config, + State: conf.State, + } + if err := provider.Configure(pCfg); err != nil { + return fmt.Errorf("error configuring provider: %v", err) + } + + c.actingSecondaryCA = true + + return nil +} + +// configuredSecondaryCA returns true if we have been initialized as a secondary datacenter's CA. +func (c *CAManager) configuredSecondaryCA() bool { + c.stateLock.RLock() + defer c.stateLock.RUnlock() + return c.actingSecondaryCA +} diff --git a/agent/consul/leader_connect_test.go b/agent/consul/leader_connect_test.go index 6f861c17e..5beca904b 100644 --- a/agent/consul/leader_connect_test.go +++ b/agent/consul/leader_connect_test.go @@ -80,11 +80,11 @@ func TestLeader_SecondaryCA_Initialize(t *testing.T) { s2.tokens.UpdateAgentToken(masterToken, token.TokenSourceConfig) s2.tokens.UpdateReplicationToken(masterToken, token.TokenSourceConfig) - testrpc.WaitForLeader(t, s2.RPC, "secondary") - // Create the WAN link joinWAN(t, s2, s1) + testrpc.WaitForLeader(t, s2.RPC, "secondary") + waitForNewACLs(t, s1) waitForNewACLs(t, s2) @@ -175,9 +175,7 @@ func waitForActiveCARoot(t *testing.T, srv *Server, expect *structs.CARoot) { } func getCAProviderWithLock(s *Server) (ca.Provider, *structs.CARoot) { - s.caProviderReconfigurationLock.Lock() - defer s.caProviderReconfigurationLock.Unlock() - return s.getCAProvider() + return s.caManager.getCAProvider() } func TestLeader_Vault_PrimaryCA_IntermediateRenew(t *testing.T) { diff --git a/agent/consul/server.go b/agent/consul/server.go index 13fece406..914eeb198 100644 --- a/agent/consul/server.go +++ b/agent/consul/server.go @@ -30,7 +30,6 @@ import ( "google.golang.org/grpc" "github.com/hashicorp/consul/acl" - ca "github.com/hashicorp/consul/agent/connect/ca" "github.com/hashicorp/consul/agent/consul/authmethod" "github.com/hashicorp/consul/agent/consul/authmethod/ssoauth" "github.com/hashicorp/consul/agent/consul/fsm" @@ -137,17 +136,11 @@ type Server struct { // autopilot is the Autopilot instance for this server. autopilot *autopilot.Autopilot - // caProviderReconfigurationLock guards the provider reconfiguration. - caProviderReconfigurationLock sync.Mutex - // caProvider is the current CA provider in use for Connect. This is - // only non-nil when we are the leader. - caProvider ca.Provider - // caProviderRoot is the CARoot that was stored along with the ca.Provider - // active. It's only updated in lock-step with the caProvider. This prevents - // races between state updates to active roots and the fetch of the provider - // instance. - caProviderRoot *structs.CARoot - caProviderLock sync.RWMutex + // autopilotWaitGroup is used to block until Autopilot shuts down. + autopilotWaitGroup sync.WaitGroup + + // caManager is used to synchronize CA operations across the leader and RPC functions. + caManager *CAManager // rate limiter to use when signing leaf certificates caLeafLimiter connectSignRateLimiter @@ -298,10 +291,6 @@ type Server struct { shutdownCh chan struct{} shutdownLock sync.Mutex - // State for whether this datacenter is acting as a secondary CA. - actingSecondaryCA bool - actingSecondaryLock sync.RWMutex - // dcSupportsIntentionsAsConfigEntries is used to determine whether we can // migrate old intentions into service-intentions config entries. All // servers in the local DC must be on a version of Consul supporting @@ -480,6 +469,7 @@ func NewServer(config *Config, flat Deps) (*Server, error) { return nil, fmt.Errorf("Failed to start Raft: %v", err) } + s.caManager = NewCAManager(s) if s.config.ConnectEnabled && (s.config.AutoEncryptAllowTLS || s.config.AutoConfigAuthzEnabled) { go s.connectCARootsMonitor(&lib.StopChannelContext{StopCh: s.shutdownCh}) } diff --git a/agent/consul/server_connect.go b/agent/consul/server_connect.go index ecc648158..e854d813a 100644 --- a/agent/consul/server_connect.go +++ b/agent/consul/server_connect.go @@ -132,7 +132,7 @@ func (s *Server) getCARoots(ws memdb.WatchSet, state *state.Store) (*structs.Ind } func (s *Server) SignCertificate(csr *x509.CertificateRequest, spiffeID connect.CertURI) (*structs.IssuedCert, error) { - provider, caRoot := s.getCAProvider() + provider, caRoot := s.caManager.getCAProvider() if provider == nil { return nil, fmt.Errorf("internal error: CA provider is nil") } else if caRoot == nil { diff --git a/agent/consul/server_test.go b/agent/consul/server_test.go index 928e59257..5bfaba555 100644 --- a/agent/consul/server_test.go +++ b/agent/consul/server_test.go @@ -1483,7 +1483,7 @@ func TestServer_CALogging(t *testing.T) { defer s1.Shutdown() testrpc.WaitForLeader(t, s1.RPC, "dc1") - if _, ok := s1.caProvider.(ca.NeedsLogger); !ok { + if _, ok := s1.caManager.provider.(ca.NeedsLogger); !ok { t.Fatalf("provider does not implement NeedsLogger") } From 0a86533e20968ff6fd858e44f8d3dc51b47d6bfb Mon Sep 17 00:00:00 2001 From: Kyle Havlovitz Date: Fri, 13 Nov 2020 14:33:19 -0800 Subject: [PATCH 3/7] Reorganize some CA manager code for correctness/readability --- agent/consul/connect_ca_endpoint.go | 227 +----------- agent/consul/leader.go | 3 +- agent/consul/leader_connect_ca.go | 551 ++++++++++++++++++++-------- agent/consul/server.go | 3 - 4 files changed, 408 insertions(+), 376 deletions(-) diff --git a/agent/consul/connect_ca_endpoint.go b/agent/consul/connect_ca_endpoint.go index f75ac3326..46607e3b6 100644 --- a/agent/consul/connect_ca_endpoint.go +++ b/agent/consul/connect_ca_endpoint.go @@ -3,14 +3,12 @@ package consul import ( "errors" "fmt" - "reflect" "time" "github.com/hashicorp/go-hclog" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/connect" - "github.com/hashicorp/consul/agent/connect/ca" "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/go-memdb" @@ -103,230 +101,7 @@ func (s *ConnectCA) ConfigurationSet( return acl.ErrPermissionDenied } - // Exit early if it's a no-op change - state := s.srv.fsm.State() - confIdx, config, err := state.CAConfig(nil) - if err != nil { - return err - } - - // Don't allow state changes. Either it needs to be empty or the same to allow - // read-modify-write loops that don't touch the State field. - if len(args.Config.State) > 0 && - !reflect.DeepEqual(args.Config.State, config.State) { - return ErrStateReadOnly - } - - // Don't allow users to change the ClusterID. - args.Config.ClusterID = config.ClusterID - if args.Config.Provider == config.Provider && reflect.DeepEqual(args.Config.Config, config.Config) { - return nil - } - - // If the provider hasn't changed, we need to load the current Provider state - // so it can decide if it needs to change resources or not based on the config - // change. - if args.Config.Provider == config.Provider { - // Note this is a shallow copy since the State method doc requires the - // provider return a map that will not be further modified and should not - // modify the one we pass to Configure. - args.Config.State = config.State - } - - // Create a new instance of the provider described by the config - // and get the current active root CA. This acts as a good validation - // of the config and makes sure the provider is functioning correctly - // before we commit any changes to Raft. - newProvider, err := s.srv.createCAProvider(args.Config) - if err != nil { - return fmt.Errorf("could not initialize provider: %v", err) - } - pCfg := ca.ProviderConfig{ - ClusterID: args.Config.ClusterID, - Datacenter: s.srv.config.Datacenter, - // This endpoint can be called in a secondary DC too so set this correctly. - IsPrimary: s.srv.config.Datacenter == s.srv.config.PrimaryDatacenter, - RawConfig: args.Config.Config, - State: args.Config.State, - } - if err := newProvider.Configure(pCfg); err != nil { - return fmt.Errorf("error configuring provider: %v", err) - } - - // Set up a defer to clean up the new provider if we exit early due to an error. - cleanupNewProvider := true - defer func() { - if cleanupNewProvider { - if err := newProvider.Cleanup(); err != nil { - s.logger.Warn("failed to clean up CA provider while handling startup failure", "provider", newProvider, "error", err) - } - } - }() - - // If this is a secondary, check if the intermediate needs to be regenerated. - if s.srv.config.Datacenter != s.srv.config.PrimaryDatacenter { - // Attempt to take the CA lock in order to update the config. - if err := s.srv.caManager.setState(CAStateReconfig); err != nil { - return err - } - defer s.srv.caManager.setReady() - - if err := s.srv.caManager.initializeSecondaryCA(newProvider, args.Config); err != nil { - return fmt.Errorf("Error updating secondary datacenter CA config: %v", err) - } - cleanupNewProvider = false - s.logger.Info("Secondary CA provider config updated") - return nil - } - - if err := newProvider.GenerateRoot(); err != nil { - return fmt.Errorf("error generating CA root certificate: %v", err) - } - - newRootPEM, err := newProvider.ActiveRoot() - if err != nil { - return err - } - - newActiveRoot, err := parseCARoot(newRootPEM, args.Config.Provider, args.Config.ClusterID) - if err != nil { - return err - } - - // See if the provider needs to persist any state along with the config - pState, err := newProvider.State() - if err != nil { - return fmt.Errorf("error getting provider state: %v", err) - } - args.Config.State = pState - - // Compare the new provider's root CA ID to the current one. If they - // match, just update the existing provider with the new config. - // If they don't match, begin the root rotation process. - _, root, err := state.CARootActive(nil) - if err != nil { - return err - } - - // If the root didn't change, just update the config and return. - if root != nil && root.ID == newActiveRoot.ID { - args.Op = structs.CAOpSetConfig - resp, err := s.srv.raftApply(structs.ConnectCARequestType, args) - if err != nil { - return err - } - if respErr, ok := resp.(error); ok { - return respErr - } - - // If the config has been committed, update the local provider instance - cleanupNewProvider = false - s.srv.caManager.setCAProvider(newProvider, newActiveRoot) - - s.logger.Info("CA provider config updated") - - return nil - } - - // At this point, we know the config change has trigged a root rotation, - // either by swapping the provider type or changing the provider's config - // to use a different root certificate. - - // First up, sanity check that the current provider actually supports - // cross-signing. - oldProvider, _ := s.srv.caManager.getCAProvider() - if oldProvider == nil { - return fmt.Errorf("internal error: CA provider is nil") - } - canXSign, err := oldProvider.SupportsCrossSigning() - if err != nil { - return fmt.Errorf("CA provider error: %s", err) - } - if !canXSign && !args.Config.ForceWithoutCrossSigning { - return errors.New("The current CA Provider does not support cross-signing. " + - "You can try again with ForceWithoutCrossSigningSet but this may cause " + - "disruption - see documentation for more.") - } - if !canXSign && args.Config.ForceWithoutCrossSigning { - s.logger.Warn("current CA doesn't support cross signing but " + - "CA reconfiguration forced anyway with ForceWithoutCrossSigning") - } - - // If it's a config change that would trigger a rotation (different provider/root): - // 1. Get the root from the new provider. - // 2. Call CrossSignCA on the old provider to sign the new root with the old one to - // get a cross-signed certificate. - // 3. Take the active root for the new provider and append the intermediate from step 2 - // to its list of intermediates. - newRoot, err := connect.ParseCert(newRootPEM) - if err != nil { - return err - } - - if canXSign { - // Have the old provider cross-sign the new root - xcCert, err := oldProvider.CrossSignCA(newRoot) - if err != nil { - return err - } - - // Add the cross signed cert to the new CA's intermediates (to be attached - // to leaf certs). - newActiveRoot.IntermediateCerts = []string{xcCert} - } - - intermediate, err := newProvider.GenerateIntermediate() - if err != nil { - return err - } - if intermediate != newRootPEM { - newActiveRoot.IntermediateCerts = append(newActiveRoot.IntermediateCerts, intermediate) - } - - // Update the roots and CA config in the state store at the same time - idx, roots, err := state.CARoots(nil) - if err != nil { - return err - } - - var newRoots structs.CARoots - for _, r := range roots { - newRoot := *r - if newRoot.Active { - newRoot.Active = false - newRoot.RotatedOutAt = time.Now() - } - newRoots = append(newRoots, &newRoot) - } - newRoots = append(newRoots, newActiveRoot) - - args.Op = structs.CAOpSetRootsAndConfig - args.Index = idx - args.Config.ModifyIndex = confIdx - args.Roots = newRoots - resp, err := s.srv.raftApply(structs.ConnectCARequestType, args) - if err != nil { - return err - } - if respErr, ok := resp.(error); ok { - return respErr - } - if respOk, ok := resp.(bool); ok && !respOk { - return fmt.Errorf("could not atomically update roots and config") - } - - // If the config has been committed, update the local provider instance - // and call teardown on the old provider - cleanupNewProvider = false - s.srv.caManager.setCAProvider(newProvider, newActiveRoot) - - if err := oldProvider.Cleanup(); err != nil { - s.logger.Warn("failed to clean up old provider", "provider", config.Provider) - } - - s.logger.Info("CA rotated to new root under provider", "provider", args.Config.Provider) - - return nil + return s.srv.caManager.UpdateConfiguration(args) } // Roots returns the currently trusted root certificates. diff --git a/agent/consul/leader.go b/agent/consul/leader.go index a251d63b3..b4b200c80 100644 --- a/agent/consul/leader.go +++ b/agent/consul/leader.go @@ -327,8 +327,7 @@ func (s *Server) establishLeadership(ctx context.Context) error { s.getOrCreateAutopilotConfig() s.autopilot.Start(ctx) - // todo(kyhavlov): start a goroutine here for handling periodic CA rotation - if err := s.caManager.initializeCA(); err != nil { + if err := s.caManager.InitializeCA(); err != nil { return err } diff --git a/agent/consul/leader_connect_ca.go b/agent/consul/leader_connect_ca.go index e1d7ffec1..39103d8e1 100644 --- a/agent/consul/leader_connect_ca.go +++ b/agent/consul/leader_connect_ca.go @@ -2,6 +2,7 @@ package consul import ( "context" + "errors" "fmt" "reflect" "strings" @@ -33,25 +34,21 @@ type CAManager struct { srv *Server logger hclog.Logger + providerLock sync.RWMutex // provider is the current CA provider in use for Connect. This is // only non-nil when we are the leader. provider ca.Provider - // providerRoot is the CARoot that was stored along with the ca.Provider // active. It's only updated in lock-step with the provider. This prevents // races between state updates to active roots and the fetch of the provider // instance. providerRoot *structs.CARoot - providerLock sync.RWMutex - // primaryRoots is the most recently seen state of the root CAs from the primary datacenter. - // This is protected by the stateLock and updated by initializeCA and the root CA watch routine. - primaryRoots structs.IndexedCARoots - - // actingSecondaryCA is whether this datacenter has been initialized as a secondary CA. - actingSecondaryCA bool + // stateLock protects the internal state used for administrative CA tasks. + stateLock sync.Mutex state CAState - stateLock sync.RWMutex + primaryRoots structs.IndexedCARoots // The most recently seen state of the root CAs from the primary datacenter. + actingSecondaryCA bool // True if this datacenter has been initialized as a secondary CA. } func NewCAManager(srv *Server) *CAManager { @@ -65,27 +62,36 @@ func NewCAManager(srv *Server) *CAManager { // setState attempts to update the CA state to the given state. // If the current state is not READY, this will fail. The only exception is when // the current state is UNINITIALIZED, and the function is called with CAStateInitializing. -func (c *CAManager) setState(newState CAState) error { - c.stateLock.RLock() +func (c *CAManager) setState(newState CAState, validateState bool) error { + c.stateLock.Lock() + defer c.stateLock.Unlock() state := c.state - c.stateLock.RUnlock() - if state == CAStateReady || (state == CAStateUninitialized && newState == CAStateInitializing) { - c.stateLock.Lock() + if !validateState || state == CAStateReady || (state == CAStateUninitialized && newState == CAStateInitializing) { c.state = newState - c.stateLock.Unlock() } else { - return fmt.Errorf("CA is already in %s state", state) + return fmt.Errorf("CA is already in state %q", state) } return nil } -// setReady sets the CA state back to READY. This should only be called by a function -// that has successfully called setState beforehand. -func (c *CAManager) setReady() { +// setPrimaryRoots updates the most recently seen roots from the primary. +func (c *CAManager) setPrimaryRoots(newRoots structs.IndexedCARoots) error { c.stateLock.Lock() - c.state = CAStateReady - c.stateLock.Unlock() + defer c.stateLock.Unlock() + + if c.state == CAStateInitializing || c.state == CAStateReconfig { + c.primaryRoots = newRoots + } else { + return fmt.Errorf("Cannot update primary roots in state %q", c.state) + } + return nil +} + +func (c *CAManager) getPrimaryRoots() structs.IndexedCARoots { + c.stateLock.Lock() + defer c.stateLock.Unlock() + return c.primaryRoots } // initializeCAConfig is used to initialize the CA config if necessary @@ -196,20 +202,21 @@ func (c *CAManager) setCAProvider(newProvider ca.Provider, root *structs.CARoot) c.providerLock.Unlock() } -// initializeCA sets up the CA provider when gaining leadership, either bootstrapping +// InitializeCA sets up the CA provider when gaining leadership, either bootstrapping // the CA if this is the primary DC or making a remote RPC for intermediate signing // if this is a secondary DC. -func (c *CAManager) initializeCA() error { +func (c *CAManager) InitializeCA() error { // Bail if connect isn't enabled. if !c.srv.config.ConnectEnabled { return nil } - err := c.setState(CAStateInitializing) + // Update the state before doing anything else. + err := c.setState(CAStateInitializing, true) if err != nil { return err } - defer c.setReady() + defer c.setState(CAStateReady, false) // Initialize the provider based on the current config. conf, err := c.initializeCAConfig() @@ -246,7 +253,9 @@ func (c *CAManager) initializeCA() error { if err := c.srv.forwardDC("ConnectCA.Roots", c.srv.config.PrimaryDatacenter, &args, &roots); err != nil { return err } - c.primaryRoots = roots + if err := c.setPrimaryRoots(roots); err != nil { + return err + } // Configure the CA provider and initialize the intermediate certificate if necessary. if err := c.initializeSecondaryProvider(provider, roots); err != nil { @@ -428,8 +437,9 @@ func (c *CAManager) initializeSecondaryCA(provider ca.Provider, config *structs. // active one. We'll use this as a template to generate any new root // representations meant for this secondary. var newActiveRoot *structs.CARoot - for _, root := range c.primaryRoots.Roots { - if root.ID == c.primaryRoots.ActiveRootID && root.Active { + primaryRoots := c.getPrimaryRoots() + for _, root := range primaryRoots.Roots { + if root.ID == primaryRoots.ActiveRootID && root.Active { newActiveRoot = root break } @@ -441,7 +451,7 @@ func (c *CAManager) initializeSecondaryCA(provider ca.Provider, config *structs. // Get a signed intermediate from the primary DC if the provider // hasn't been initialized yet or if the primary's root has changed. needsNewIntermediate := false - if activeIntermediate == "" || storedRootID != c.primaryRoots.ActiveRootID { + if activeIntermediate == "" || storedRootID != primaryRoots.ActiveRootID { needsNewIntermediate = true } @@ -559,6 +569,233 @@ func (c *CAManager) persistNewRootAndConfig(provider ca.Provider, newActiveRoot return nil } +func (c *CAManager) UpdateConfiguration(args *structs.CARequest) error { + // Attempt to update the state first. + if err := c.setState(CAStateReconfig, true); err != nil { + return err + } + defer c.setState(CAStateReady, false) + + // Exit early if it's a no-op change + state := c.srv.fsm.State() + confIdx, config, err := state.CAConfig(nil) + if err != nil { + return err + } + + // Don't allow state changes. Either it needs to be empty or the same to allow + // read-modify-write loops that don't touch the State field. + if len(args.Config.State) > 0 && + !reflect.DeepEqual(args.Config.State, config.State) { + return ErrStateReadOnly + } + + // Don't allow users to change the ClusterID. + args.Config.ClusterID = config.ClusterID + if args.Config.Provider == config.Provider && reflect.DeepEqual(args.Config.Config, config.Config) { + return nil + } + + // If the provider hasn't changed, we need to load the current Provider state + // so it can decide if it needs to change resources or not based on the config + // change. + if args.Config.Provider == config.Provider { + // Note this is a shallow copy since the State method doc requires the + // provider return a map that will not be further modified and should not + // modify the one we pass to Configure. + args.Config.State = config.State + } + + // Create a new instance of the provider described by the config + // and get the current active root CA. This acts as a good validation + // of the config and makes sure the provider is functioning correctly + // before we commit any changes to Raft. + newProvider, err := c.srv.createCAProvider(args.Config) + if err != nil { + return fmt.Errorf("could not initialize provider: %v", err) + } + pCfg := ca.ProviderConfig{ + ClusterID: args.Config.ClusterID, + Datacenter: c.srv.config.Datacenter, + // This endpoint can be called in a secondary DC too so set this correctly. + IsPrimary: c.srv.config.Datacenter == c.srv.config.PrimaryDatacenter, + RawConfig: args.Config.Config, + State: args.Config.State, + } + if err := newProvider.Configure(pCfg); err != nil { + return fmt.Errorf("error configuring provider: %v", err) + } + + // Set up a defer to clean up the new provider if we exit early due to an error. + cleanupNewProvider := true + defer func() { + if cleanupNewProvider { + if err := newProvider.Cleanup(); err != nil { + c.logger.Warn("failed to clean up CA provider while handling startup failure", "provider", newProvider, "error", err) + } + } + }() + + // If this is a secondary, just check if the intermediate needs to be regenerated. + if c.srv.config.Datacenter != c.srv.config.PrimaryDatacenter { + if err := c.srv.caManager.initializeSecondaryCA(newProvider, args.Config); err != nil { + return fmt.Errorf("Error updating secondary datacenter CA config: %v", err) + } + cleanupNewProvider = false + c.logger.Info("Secondary CA provider config updated") + return nil + } + + if err := newProvider.GenerateRoot(); err != nil { + return fmt.Errorf("error generating CA root certificate: %v", err) + } + + newRootPEM, err := newProvider.ActiveRoot() + if err != nil { + return err + } + + newActiveRoot, err := parseCARoot(newRootPEM, args.Config.Provider, args.Config.ClusterID) + if err != nil { + return err + } + + // See if the provider needs to persist any state along with the config + pState, err := newProvider.State() + if err != nil { + return fmt.Errorf("error getting provider state: %v", err) + } + args.Config.State = pState + + // Compare the new provider's root CA ID to the current one. If they + // match, just update the existing provider with the new config. + // If they don't match, begin the root rotation process. + _, root, err := state.CARootActive(nil) + if err != nil { + return err + } + + // If the root didn't change, just update the config and return. + if root != nil && root.ID == newActiveRoot.ID { + args.Op = structs.CAOpSetConfig + resp, err := c.srv.raftApply(structs.ConnectCARequestType, args) + if err != nil { + return err + } + if respErr, ok := resp.(error); ok { + return respErr + } + + // If the config has been committed, update the local provider instance + cleanupNewProvider = false + c.setCAProvider(newProvider, newActiveRoot) + + c.logger.Info("CA provider config updated") + + return nil + } + + // At this point, we know the config change has trigged a root rotation, + // either by swapping the provider type or changing the provider's config + // to use a different root certificate. + + // First up, sanity check that the current provider actually supports + // cross-signing. + oldProvider, _ := c.getCAProvider() + if oldProvider == nil { + return fmt.Errorf("internal error: CA provider is nil") + } + canXSign, err := oldProvider.SupportsCrossSigning() + if err != nil { + return fmt.Errorf("CA provider error: %s", err) + } + if !canXSign && !args.Config.ForceWithoutCrossSigning { + return errors.New("The current CA Provider does not support cross-signing. " + + "You can try again with ForceWithoutCrossSigningSet but this may cause " + + "disruption - see documentation for more.") + } + if !canXSign && args.Config.ForceWithoutCrossSigning { + c.logger.Warn("current CA doesn't support cross signing but " + + "CA reconfiguration forced anyway with ForceWithoutCrossSigning") + } + + // If it's a config change that would trigger a rotation (different provider/root): + // 1. Get the root from the new provider. + // 2. Call CrossSignCA on the old provider to sign the new root with the old one to + // get a cross-signed certificate. + // 3. Take the active root for the new provider and append the intermediate from step 2 + // to its list of intermediates. + newRoot, err := connect.ParseCert(newRootPEM) + if err != nil { + return err + } + + if canXSign { + // Have the old provider cross-sign the new root + xcCert, err := oldProvider.CrossSignCA(newRoot) + if err != nil { + return err + } + + // Add the cross signed cert to the new CA's intermediates (to be attached + // to leaf certs). + newActiveRoot.IntermediateCerts = []string{xcCert} + } + + intermediate, err := newProvider.GenerateIntermediate() + if err != nil { + return err + } + if intermediate != newRootPEM { + newActiveRoot.IntermediateCerts = append(newActiveRoot.IntermediateCerts, intermediate) + } + + // Update the roots and CA config in the state store at the same time + idx, roots, err := state.CARoots(nil) + if err != nil { + return err + } + + var newRoots structs.CARoots + for _, r := range roots { + newRoot := *r + if newRoot.Active { + newRoot.Active = false + newRoot.RotatedOutAt = time.Now() + } + newRoots = append(newRoots, &newRoot) + } + newRoots = append(newRoots, newActiveRoot) + + args.Op = structs.CAOpSetRootsAndConfig + args.Index = idx + args.Config.ModifyIndex = confIdx + args.Roots = newRoots + resp, err := c.srv.raftApply(structs.ConnectCARequestType, args) + if err != nil { + return err + } + if respErr, ok := resp.(error); ok { + return respErr + } + if respOk, ok := resp.(bool); ok && !respOk { + return fmt.Errorf("could not atomically update roots and config") + } + + // If the config has been committed, update the local provider instance + // and call teardown on the old provider + cleanupNewProvider = false + c.setCAProvider(newProvider, newActiveRoot) + + if err := oldProvider.Cleanup(); err != nil { + c.logger.Warn("failed to clean up old provider", "provider", config.Provider) + } + + c.logger.Info("CA rotated to new root under provider", "provider", args.Config.Provider) + + return nil +} + // getIntermediateCAPrimary regenerates the intermediate cert in the primary datacenter. // This is only run for CAs that require an intermediary in the primary DC, such as Vault. // This function is being called while holding caProviderReconfigurationLock @@ -617,9 +854,7 @@ func (c *CAManager) getIntermediateCASigned(provider ca.Provider, newActiveRoot return nil } -// intermediateCertRenewalWatch checks the intermediate cert for -// expiration. As soon as more than half the time a cert is valid has passed, -// it will try to renew it. +// intermediateCertRenewalWatch periodically attempts to renew the intermediate cert. func (c *CAManager) intermediateCertRenewalWatch(ctx context.Context) error { isPrimary := c.srv.config.Datacenter == c.srv.config.PrimaryDatacenter @@ -629,81 +864,7 @@ func (c *CAManager) intermediateCertRenewalWatch(ctx context.Context) error { return nil case <-time.After(structs.IntermediateCertRenewInterval): retryLoopBackoffAbortOnSuccess(ctx, func() error { - if !isPrimary { - c.logger.Info("starting check for intermediate renewal") - } - // Grab the 'lock' right away so the provider/config can't be changed out while we check - // the intermediate. - if err := c.setState(CAStateRenewIntermediate); err != nil { - return err - } - defer c.setReady() - - provider, _ := c.getCAProvider() - if provider == nil { - // this happens when leadership is being revoked and this go routine will be stopped - return nil - } - // If this isn't the primary, make sure the CA has been initialized. - if !isPrimary && !c.configuredSecondaryCA() { - return fmt.Errorf("secondary CA is not yet configured.") - } - - state := c.srv.fsm.State() - _, activeRoot, err := state.CARootActive(nil) - if err != nil { - return err - } - - // If this is the primary, check if this is a provider that uses an intermediate cert. If - // it isn't, we don't need to check for a renewal. - if isPrimary { - _, config, err := state.CAConfig(nil) - if err != nil { - return err - } - - if _, ok := ca.PrimaryIntermediateProviders[config.Provider]; !ok { - return nil - } - } else { - c.logger.Info("Checking for intermediate renewal") - } - - activeIntermediate, err := provider.ActiveIntermediate() - if err != nil { - return err - } - - if activeIntermediate == "" { - return fmt.Errorf("datacenter doesn't have an active intermediate.") - } - - intermediateCert, err := connect.ParseCert(activeIntermediate) - if err != nil { - return fmt.Errorf("error parsing active intermediate cert: %v", err) - } - - if lessThanHalfTimePassed(time.Now(), intermediateCert.NotBefore.Add(ca.CertificateTimeDriftBuffer), - intermediateCert.NotAfter) { - return nil - } - - // Enough time has passed, go ahead with getting a new intermediate. - renewalFunc := c.getIntermediateCAPrimary - if !isPrimary { - renewalFunc = c.getIntermediateCASigned - } - if err := renewalFunc(provider, activeRoot); err != nil { - return err - } - - if err := c.persistNewRootAndConfig(provider, activeRoot, nil); err != nil { - return err - } - - c.setCAProvider(provider, activeRoot) - return nil + return c.RenewIntermediate(isPrimary) }, func(err error) { c.logger.Error("error renewing intermediate certs", "routine", intermediateCertRenewWatchRoutineName, @@ -714,6 +875,82 @@ func (c *CAManager) intermediateCertRenewalWatch(ctx context.Context) error { } } +// RenewIntermediate checks the intermediate cert for +// expiration. If more than half the time a cert is valid has passed, +// it will try to renew it. +func (c *CAManager) RenewIntermediate(isPrimary bool) error { + // Grab the 'lock' right away so the provider/config can't be changed out while we check + // the intermediate. + if err := c.setState(CAStateRenewIntermediate, true); err != nil { + return err + } + defer c.setState(CAStateReady, false) + + provider, _ := c.getCAProvider() + if provider == nil { + // this happens when leadership is being revoked and this go routine will be stopped + return nil + } + // If this isn't the primary, make sure the CA has been initialized. + if !isPrimary && !c.configuredSecondaryCA() { + return fmt.Errorf("secondary CA is not yet configured.") + } + + state := c.srv.fsm.State() + _, activeRoot, err := state.CARootActive(nil) + if err != nil { + return err + } + + // If this is the primary, check if this is a provider that uses an intermediate cert. If + // it isn't, we don't need to check for a renewal. + if isPrimary { + _, config, err := state.CAConfig(nil) + if err != nil { + return err + } + + if _, ok := ca.PrimaryIntermediateProviders[config.Provider]; !ok { + return nil + } + } + + activeIntermediate, err := provider.ActiveIntermediate() + if err != nil { + return err + } + + if activeIntermediate == "" { + return fmt.Errorf("datacenter doesn't have an active intermediate.") + } + + intermediateCert, err := connect.ParseCert(activeIntermediate) + if err != nil { + return fmt.Errorf("error parsing active intermediate cert: %v", err) + } + + if lessThanHalfTimePassed(time.Now(), intermediateCert.NotBefore.Add(ca.CertificateTimeDriftBuffer), + intermediateCert.NotAfter) { + return nil + } + + // Enough time has passed, go ahead with getting a new intermediate. + renewalFunc := c.getIntermediateCAPrimary + if !isPrimary { + renewalFunc = c.getIntermediateCASigned + } + if err := renewalFunc(provider, activeRoot); err != nil { + return err + } + + if err := c.persistNewRootAndConfig(provider, activeRoot, nil); err != nil { + return err + } + + c.setCAProvider(provider, activeRoot) + return nil +} + // secondaryCARootWatch maintains a blocking query to the primary datacenter's // ConnectCA.Roots endpoint to monitor when it needs to request a new signed // intermediate certificate. @@ -734,43 +971,10 @@ func (c *CAManager) secondaryCARootWatch(ctx context.Context) error { return fmt.Errorf("Error retrieving the primary datacenter's roots: %v", err) } - // Update the state first to claim the 'lock'. - if err := c.setState(CAStateReconfig); err != nil { + // Attempt to update the roots using the returned data. + if err := c.UpdateRoots(roots); err != nil { return err } - defer c.setReady() - - // Update the cached primary roots now that the lock is held. - c.primaryRoots = roots - - // Check to see if the primary has been upgraded in case we're waiting to switch to - // secondary mode. - provider, _ := c.getCAProvider() - if provider == nil { - // this happens when leadership is being revoked and this go routine will be stopped - return nil - } - if !c.configuredSecondaryCA() { - versionOk, primaryFound := ServersInDCMeetMinimumVersion(c.srv, c.srv.config.PrimaryDatacenter, minMultiDCConnectVersion) - if !primaryFound { - return fmt.Errorf("Primary datacenter is unreachable - deferring secondary CA initialization") - } - - if versionOk { - if err := c.initializeSecondaryProvider(provider, roots); err != nil { - return fmt.Errorf("Failed to initialize secondary CA provider: %v", err) - } - } - } - - // Run the secondary CA init routine to see if we need to request a new - // intermediate. - if c.configuredSecondaryCA() { - if err := c.initializeSecondaryCA(provider, nil); err != nil { - return fmt.Errorf("Failed to initialize the secondary CA: %v", err) - } - } - args.QueryOptions.MinQueryIndex = nextIndexVal(args.QueryOptions.MinQueryIndex, roots.QueryMeta.Index) return nil }, func(err error) { @@ -783,6 +987,51 @@ func (c *CAManager) secondaryCARootWatch(ctx context.Context) error { return nil } +// UpdateRoots updates the cached roots from the primary and regenerates the intermediate +// certificate if necessary. +func (c *CAManager) UpdateRoots(roots structs.IndexedCARoots) error { + // Update the state first to claim the 'lock'. + if err := c.setState(CAStateReconfig, true); err != nil { + return err + } + defer c.setState(CAStateReady, false) + + // Update the cached primary roots now that the lock is held. + if err := c.setPrimaryRoots(roots); err != nil { + return err + } + + // Check to see if the primary has been upgraded in case we're waiting to switch to + // secondary mode. + provider, _ := c.getCAProvider() + if provider == nil { + // this happens when leadership is being revoked and this go routine will be stopped + return nil + } + if !c.configuredSecondaryCA() { + versionOk, primaryFound := ServersInDCMeetMinimumVersion(c.srv, c.srv.config.PrimaryDatacenter, minMultiDCConnectVersion) + if !primaryFound { + return fmt.Errorf("Primary datacenter is unreachable - deferring secondary CA initialization") + } + + if versionOk { + if err := c.initializeSecondaryProvider(provider, roots); err != nil { + return fmt.Errorf("Failed to initialize secondary CA provider: %v", err) + } + } + } + + // Run the secondary CA init routine to see if we need to request a new + // intermediate. + if c.configuredSecondaryCA() { + if err := c.initializeSecondaryCA(provider, nil); err != nil { + return fmt.Errorf("Failed to initialize the secondary CA: %v", err) + } + } + + return nil +} + // initializeSecondaryProvider configures the given provider for a secondary, non-root datacenter. // It is being called while holding the stateLock in order to update actingSecondaryCA, which means // it must never take that lock itself or call anything that does. @@ -808,14 +1057,26 @@ func (c *CAManager) initializeSecondaryProvider(provider ca.Provider, roots stru return fmt.Errorf("error configuring provider: %v", err) } - c.actingSecondaryCA = true + return c.setSecondaryCA() +} + +// setSecondaryCA sets the flag for acting as a secondary CA to true. +func (c *CAManager) setSecondaryCA() error { + c.stateLock.Lock() + defer c.stateLock.Unlock() + + if c.state == CAStateInitializing || c.state == CAStateReconfig { + c.actingSecondaryCA = true + } else { + return fmt.Errorf("Cannot update secondary CA flag in state %q", c.state) + } return nil } // configuredSecondaryCA returns true if we have been initialized as a secondary datacenter's CA. func (c *CAManager) configuredSecondaryCA() bool { - c.stateLock.RLock() - defer c.stateLock.RUnlock() + c.stateLock.Lock() + defer c.stateLock.Unlock() return c.actingSecondaryCA } diff --git a/agent/consul/server.go b/agent/consul/server.go index 914eeb198..0f34f25ba 100644 --- a/agent/consul/server.go +++ b/agent/consul/server.go @@ -136,9 +136,6 @@ type Server struct { // autopilot is the Autopilot instance for this server. autopilot *autopilot.Autopilot - // autopilotWaitGroup is used to block until Autopilot shuts down. - autopilotWaitGroup sync.WaitGroup - // caManager is used to synchronize CA operations across the leader and RPC functions. caManager *CAManager From c8d4a40a87c1a6d38ba52b2462683e8fa82913f4 Mon Sep 17 00:00:00 2001 From: Kyle Havlovitz Date: Tue, 17 Nov 2020 15:35:33 -0800 Subject: [PATCH 4/7] connect: update some function comments in CA manager --- agent/consul/leader_connect_ca.go | 115 ++++++++++++++++-------------- agent/structs/connect_ca.go | 10 +++ 2 files changed, 70 insertions(+), 55 deletions(-) diff --git a/agent/consul/leader_connect_ca.go b/agent/consul/leader_connect_ca.go index 39103d8e1..d6b94978d 100644 --- a/agent/consul/leader_connect_ca.go +++ b/agent/consul/leader_connect_ca.go @@ -95,7 +95,8 @@ func (c *CAManager) getPrimaryRoots() structs.IndexedCARoots { } // initializeCAConfig is used to initialize the CA config if necessary -// when setting up the CA during establishLeadership +// when setting up the CA during establishLeadership. The state should be set to +// non-ready before calling this. func (c *CAManager) initializeCAConfig() (*structs.CAConfiguration, error) { state := c.srv.fsm.State() _, config, err := state.CAConfig(nil) @@ -166,8 +167,8 @@ func parseCARoot(pemValue, provider, clusterID string) (*structs.CARoot, error) }, nil } -// getCAProvider is being called while holding caProviderReconfigurationLock -// which means it must never take that lock itself or call anything that does. +// getCAProvider returns the currently active instance of the CA Provider, +// as well as the active root. func (c *CAManager) getCAProvider() (ca.Provider, *structs.CARoot) { retries := 0 var result ca.Provider @@ -230,51 +231,51 @@ func (c *CAManager) InitializeCA() error { c.setCAProvider(provider, nil) + // Run the root CA initialization if this is the primary DC. + if c.srv.config.PrimaryDatacenter == c.srv.config.Datacenter { + return c.initializeRootCA(provider, conf) + } + // If this isn't the primary DC, run the secondary DC routine if the primary has already been upgraded to at least 1.6.0 - if c.srv.config.PrimaryDatacenter != c.srv.config.Datacenter { - versionOk, foundPrimary := ServersInDCMeetMinimumVersion(c.srv, c.srv.config.PrimaryDatacenter, minMultiDCConnectVersion) - if !foundPrimary { - c.logger.Warn("primary datacenter is configured but unreachable - deferring initialization of the secondary datacenter CA") - // return nil because we will initialize the secondary CA later - return nil - } else if !versionOk { - // return nil because we will initialize the secondary CA later - c.logger.Warn("servers in the primary datacenter are not at least at the minimum version - deferring initialization of the secondary datacenter CA", - "min_version", minMultiDCConnectVersion.String(), - ) - return nil - } - - // Get the root CA to see if we need to refresh our intermediate. - args := structs.DCSpecificRequest{ - Datacenter: c.srv.config.PrimaryDatacenter, - } - var roots structs.IndexedCARoots - if err := c.srv.forwardDC("ConnectCA.Roots", c.srv.config.PrimaryDatacenter, &args, &roots); err != nil { - return err - } - if err := c.setPrimaryRoots(roots); err != nil { - return err - } - - // Configure the CA provider and initialize the intermediate certificate if necessary. - if err := c.initializeSecondaryProvider(provider, roots); err != nil { - return fmt.Errorf("error configuring provider: %v", err) - } - if err := c.initializeSecondaryCA(provider, nil); err != nil { - return err - } - - c.logger.Info("initialized secondary datacenter CA with provider", "provider", conf.Provider) + versionOk, foundPrimary := ServersInDCMeetMinimumVersion(c.srv, c.srv.config.PrimaryDatacenter, minMultiDCConnectVersion) + if !foundPrimary { + c.logger.Warn("primary datacenter is configured but unreachable - deferring initialization of the secondary datacenter CA") + // return nil because we will initialize the secondary CA later + return nil + } else if !versionOk { + // return nil because we will initialize the secondary CA later + c.logger.Warn("servers in the primary datacenter are not at least at the minimum version - deferring initialization of the secondary datacenter CA", + "min_version", minMultiDCConnectVersion.String(), + ) return nil } - return c.initializeRootCA(provider, conf) + // Get the root CA to see if we need to refresh our intermediate. + args := structs.DCSpecificRequest{ + Datacenter: c.srv.config.PrimaryDatacenter, + } + var roots structs.IndexedCARoots + if err := c.srv.forwardDC("ConnectCA.Roots", c.srv.config.PrimaryDatacenter, &args, &roots); err != nil { + return err + } + if err := c.setPrimaryRoots(roots); err != nil { + return err + } + + // Configure the CA provider and initialize the intermediate certificate if necessary. + if err := c.initializeSecondaryProvider(provider, roots); err != nil { + return fmt.Errorf("error configuring provider: %v", err) + } + if err := c.initializeSecondaryCA(provider, nil); err != nil { + return err + } + + c.logger.Info("initialized secondary datacenter CA with provider", "provider", conf.Provider) + return nil } -// initializeRootCA runs the initialization logic for a root CA. -// It is being called while holding caProviderReconfigurationLock -// which means it must never take that lock itself or call anything that does. +// initializeRootCA runs the initialization logic for a root CA. It should only +// be called while the state lock is held by setting the state to non-ready. func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfiguration) error { pCfg := ca.ProviderConfig{ ClusterID: conf.ClusterID, @@ -380,9 +381,8 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi // initializeSecondaryCA runs the routine for generating an intermediate CA CSR and getting // it signed by the primary DC if the root CA of the primary DC has changed since the last -// intermediate. -// It is being called while holding caProviderReconfigurationLock -// which means it must never take that lock itself or call anything that does. +// intermediate. It should only be called while the state lock is held by setting the state +// to non-ready. func (c *CAManager) initializeSecondaryCA(provider ca.Provider, config *structs.CAConfiguration) error { activeIntermediate, err := provider.ActiveIntermediate() if err != nil { @@ -493,8 +493,8 @@ func (c *CAManager) initializeSecondaryCA(provider ca.Provider, config *structs. return nil } -// persistNewRootAndConfig is being called while holding caProviderReconfigurationLock -// which means it must never take that lock itself or call anything that does. +// persistNewRootAndConfig should only be called while the state lock is held +// by setting the state to non-ready. // If newActiveRoot is non-nil, it will be appended to the current roots list. // If config is non-nil, it will be used to overwrite the existing config. func (c *CAManager) persistNewRootAndConfig(provider ca.Provider, newActiveRoot *structs.CARoot, config *structs.CAConfiguration) error { @@ -539,7 +539,7 @@ func (c *CAManager) persistNewRootAndConfig(provider ca.Provider, newActiveRoot newRoot.RotatedOutAt = time.Now() } if newRoot.ExternalTrustDomain == "" { - newRoot.ExternalTrustDomain = config.ClusterID + newRoot.ExternalTrustDomain = newConf.ClusterID } newRoots = append(newRoots, &newRoot) } @@ -798,8 +798,7 @@ func (c *CAManager) UpdateConfiguration(args *structs.CARequest) error { // getIntermediateCAPrimary regenerates the intermediate cert in the primary datacenter. // This is only run for CAs that require an intermediary in the primary DC, such as Vault. -// This function is being called while holding caProviderReconfigurationLock -// which means it must never take that lock itself or call anything that does. +// It should only be called while the state lock is held by setting the state to non-ready. func (c *CAManager) getIntermediateCAPrimary(provider ca.Provider, newActiveRoot *structs.CARoot) error { // Generate and sign an intermediate cert using the root CA. intermediatePEM, err := provider.GenerateIntermediate() @@ -821,8 +820,8 @@ func (c *CAManager) getIntermediateCAPrimary(provider ca.Provider, newActiveRoot return nil } -// getIntermediateCASigned is being called while holding caProviderReconfigurationLock -// which means it must never take that lock itself or call anything that does. +// getIntermediateCASigned should only be called while the state lock is held by +// setting the state to non-ready. func (c *CAManager) getIntermediateCASigned(provider ca.Provider, newActiveRoot *structs.CARoot) error { csr, err := provider.GenerateIntermediateCSR() if err != nil { @@ -897,10 +896,11 @@ func (c *CAManager) RenewIntermediate(isPrimary bool) error { } state := c.srv.fsm.State() - _, activeRoot, err := state.CARootActive(nil) + _, root, err := state.CARootActive(nil) if err != nil { return err } + activeRoot := root.Clone() // If this is the primary, check if this is a provider that uses an intermediate cert. If // it isn't, we don't need to check for a renewal. @@ -971,6 +971,13 @@ func (c *CAManager) secondaryCARootWatch(ctx context.Context) error { return fmt.Errorf("Error retrieving the primary datacenter's roots: %v", err) } + // Return if the context has been canceled while waiting on the RPC. + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + // Attempt to update the roots using the returned data. if err := c.UpdateRoots(roots); err != nil { return err @@ -1033,8 +1040,6 @@ func (c *CAManager) UpdateRoots(roots structs.IndexedCARoots) error { } // initializeSecondaryProvider configures the given provider for a secondary, non-root datacenter. -// It is being called while holding the stateLock in order to update actingSecondaryCA, which means -// it must never take that lock itself or call anything that does. func (c *CAManager) initializeSecondaryProvider(provider ca.Provider, roots structs.IndexedCARoots) error { if roots.TrustDomain == "" { return fmt.Errorf("trust domain from primary datacenter is not initialized") diff --git a/agent/structs/connect_ca.go b/agent/structs/connect_ca.go index 6b57e9e39..dc96334f8 100644 --- a/agent/structs/connect_ca.go +++ b/agent/structs/connect_ca.go @@ -122,6 +122,16 @@ type CARoot struct { RaftIndex } +func (c *CARoot) Clone() *CARoot { + if c == nil { + return nil + } + + newCopy := *c + copy(c.IntermediateCerts, newCopy.IntermediateCerts) + return &newCopy +} + // CARoots is a list of CARoot structures. type CARoots []*CARoot From 26a9c985c5362ee8f073d0c87b97ce269d6c5013 Mon Sep 17 00:00:00 2001 From: Kyle Havlovitz Date: Thu, 19 Nov 2020 20:08:06 -0800 Subject: [PATCH 5/7] Add CA server delegate interface for testing --- agent/connect/ca/provider_consul_test.go | 25 +-- agent/connect/ca/testing.go | 29 +++ agent/connect/testing_ca.go | 24 ++- agent/consul/leader.go | 1 + agent/consul/leader_connect_ca.go | 136 +++++++----- agent/consul/leader_connect_ca_test.go | 255 +++++++++++++++++++++++ agent/consul/server.go | 2 +- agent/structs/acl.go | 6 +- agent/structs/config_entry_intentions.go | 2 +- agent/structs/connect_ca.go | 2 +- 10 files changed, 400 insertions(+), 82 deletions(-) create mode 100644 agent/consul/leader_connect_ca_test.go diff --git a/agent/connect/ca/provider_consul_test.go b/agent/connect/ca/provider_consul_test.go index 759769049..8733293d8 100644 --- a/agent/connect/ca/provider_consul_test.go +++ b/agent/connect/ca/provider_consul_test.go @@ -22,30 +22,7 @@ func (c *consulCAMockDelegate) State() *state.Store { } func (c *consulCAMockDelegate) ApplyCARequest(req *structs.CARequest) (interface{}, error) { - idx, _, err := c.state.CAConfig(nil) - if err != nil { - return nil, err - } - - switch req.Op { - case structs.CAOpSetProviderState: - _, err := c.state.CASetProviderState(idx+1, req.ProviderState) - if err != nil { - return nil, err - } - - return true, nil - case structs.CAOpDeleteProviderState: - if err := c.state.CADeleteProviderState(idx+1, req.ProviderState.ID); err != nil { - return nil, err - } - - return true, nil - case structs.CAOpIncrementProviderSerialNumber: - return uint64(2), nil - default: - return nil, fmt.Errorf("Invalid CA operation '%s'", req.Op) - } + return ApplyCARequestToStore(c.state, req) } func newMockDelegate(t *testing.T, conf *structs.CAConfiguration) *consulCAMockDelegate { diff --git a/agent/connect/ca/testing.go b/agent/connect/ca/testing.go index 25533f8dd..0bd837f7f 100644 --- a/agent/connect/ca/testing.go +++ b/agent/connect/ca/testing.go @@ -8,6 +8,8 @@ import ( "sync" "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/consul/state" + "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/sdk/freeport" "github.com/hashicorp/consul/sdk/testutil/retry" "github.com/hashicorp/go-hclog" @@ -234,3 +236,30 @@ func (v *TestVaultServer) Stop() error { return nil } + +func ApplyCARequestToStore(store *state.Store, req *structs.CARequest) (interface{}, error) { + idx, _, err := store.CAConfig(nil) + if err != nil { + return nil, err + } + + switch req.Op { + case structs.CAOpSetProviderState: + _, err := store.CASetProviderState(idx+1, req.ProviderState) + if err != nil { + return nil, err + } + + return true, nil + case structs.CAOpDeleteProviderState: + if err := store.CADeleteProviderState(idx+1, req.ProviderState.ID); err != nil { + return nil, err + } + + return true, nil + case structs.CAOpIncrementProviderSerialNumber: + return uint64(2), nil + default: + return nil, fmt.Errorf("Invalid CA operation '%s'", req.Op) + } +} diff --git a/agent/connect/testing_ca.go b/agent/connect/testing_ca.go index e623c1872..4851b9db5 100644 --- a/agent/connect/testing_ca.go +++ b/agent/connect/testing_ca.go @@ -56,7 +56,7 @@ func ValidateLeaf(caPEM string, leafPEM string, intermediatePEMs []string) error return err } -func testCA(t testing.T, xc *structs.CARoot, keyType string, keyBits int) *structs.CARoot { +func testCA(t testing.T, xc *structs.CARoot, keyType string, keyBits int, ttl time.Duration) *structs.CARoot { var result structs.CARoot result.Active = true result.Name = fmt.Sprintf("Test CA %d", atomic.AddUint64(&testCACounter, 1)) @@ -76,6 +76,14 @@ func testCA(t testing.T, xc *structs.CARoot, keyType string, keyBits int) *struc id := &SpiffeIDSigning{ClusterID: TestClusterID, Domain: "consul"} // Create the CA cert + now := time.Now() + before := now + after := now + if ttl != 0 { + after = after.Add(ttl) + } else { + after = after.AddDate(10, 0, 0) + } template := x509.Certificate{ SerialNumber: sn, Subject: pkix.Name{CommonName: result.Name}, @@ -85,8 +93,8 @@ func testCA(t testing.T, xc *structs.CARoot, keyType string, keyBits int) *struc x509.KeyUsageCRLSign | x509.KeyUsageDigitalSignature, IsCA: true, - NotAfter: time.Now().AddDate(10, 0, 0), - NotBefore: time.Now(), + NotAfter: after, + NotBefore: before, AuthorityKeyId: testKeyID(t, signer.Public()), SubjectKeyId: testKeyID(t, signer.Public()), } @@ -159,13 +167,19 @@ func testCA(t testing.T, xc *structs.CARoot, keyType string, keyBits int) *struc // that is cross-signed with the previous cert, and this will be set as // SigningCert. func TestCA(t testing.T, xc *structs.CARoot) *structs.CARoot { - return testCA(t, xc, DefaultPrivateKeyType, DefaultPrivateKeyBits) + return testCA(t, xc, DefaultPrivateKeyType, DefaultPrivateKeyBits, 0) +} + +// TestCAWithTTL is similar to TestCA, except that it +// takes a custom duration for the lifetime of the certificate. +func TestCAWithTTL(t testing.T, xc *structs.CARoot, ttl time.Duration) *structs.CARoot { + return testCA(t, xc, DefaultPrivateKeyType, DefaultPrivateKeyBits, ttl) } // TestCAWithKeyType is similar to TestCA, except that it // takes two additional arguments to override the default private key type and size. func TestCAWithKeyType(t testing.T, xc *structs.CARoot, keyType string, keyBits int) *structs.CARoot { - return testCA(t, xc, keyType, keyBits) + return testCA(t, xc, keyType, keyBits, 0) } // testCertID is an interface to be implemented the various spiffe ID / CertURI types diff --git a/agent/consul/leader.go b/agent/consul/leader.go index b4b200c80..797ffeae8 100644 --- a/agent/consul/leader.go +++ b/agent/consul/leader.go @@ -375,6 +375,7 @@ func (s *Server) revokeLeadership() { s.stopConnectLeader() s.caManager.setCAProvider(nil, nil) + s.caManager.setState(CAStateUninitialized, false) s.stopACLTokenReaping() diff --git a/agent/consul/leader_connect_ca.go b/agent/consul/leader_connect_ca.go index d6b94978d..5e42b5f9d 100644 --- a/agent/consul/leader_connect_ca.go +++ b/agent/consul/leader_connect_ca.go @@ -11,8 +11,8 @@ import ( "github.com/hashicorp/consul/agent/connect" "github.com/hashicorp/consul/agent/connect/ca" + "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" - "github.com/hashicorp/consul/logging" "github.com/hashicorp/go-hclog" uuid "github.com/hashicorp/go-uuid" ) @@ -27,12 +27,27 @@ const ( CAStateReconfig = "RECONFIGURING" ) +// caServerDelegate is an interface for server operations for facilitating +// easier testing. +type caServerDelegate interface { + State() *state.Store + IsLeader() bool + + createCAProvider(conf *structs.CAConfiguration) (ca.Provider, error) + forwardDC(method, dc string, args interface{}, reply interface{}) error + generateCASignRequest(csr string) *structs.CASignRequest + raftApply(t structs.MessageType, msg interface{}) (interface{}, error) + + checkServersProvider +} + // CAManager is a wrapper around CA operations such as updating roots, an intermediate // or the configuration. All operations should go through the CAManager in order to // avoid data races. type CAManager struct { - srv *Server - logger hclog.Logger + delegate caServerDelegate + serverConf *Config + logger hclog.Logger providerLock sync.RWMutex // provider is the current CA provider in use for Connect. This is @@ -51,14 +66,30 @@ type CAManager struct { actingSecondaryCA bool // True if this datacenter has been initialized as a secondary CA. } -func NewCAManager(srv *Server) *CAManager { +type caDelegateWithState struct { + *Server +} + +func (c *caDelegateWithState) State() *state.Store { + return c.fsm.State() +} + +func NewCAManager(delegate caServerDelegate, logger hclog.Logger, config *Config) *CAManager { return &CAManager{ - srv: srv, - logger: srv.loggers.Named(logging.Connect), - state: CAStateUninitialized, + delegate: delegate, + logger: logger, + serverConf: config, + state: CAStateUninitialized, } } +func (c *CAManager) reset() { + c.state = CAStateUninitialized + c.primaryRoots = structs.IndexedCARoots{} + c.actingSecondaryCA = false + c.setCAProvider(nil, nil) +} + // setState attempts to update the CA state to the given state. // If the current state is not READY, this will fail. The only exception is when // the current state is UNINITIALIZED, and the function is called with CAStateInitializing. @@ -98,13 +129,13 @@ func (c *CAManager) getPrimaryRoots() structs.IndexedCARoots { // when setting up the CA during establishLeadership. The state should be set to // non-ready before calling this. func (c *CAManager) initializeCAConfig() (*structs.CAConfiguration, error) { - state := c.srv.fsm.State() + state := c.delegate.State() _, config, err := state.CAConfig(nil) if err != nil { return nil, err } if config == nil { - config = c.srv.config.CAConfig + config = c.serverConf.CAConfig if config.ClusterID == "" { id, err := uuid.GenerateUUID() if err != nil { @@ -129,7 +160,7 @@ func (c *CAManager) initializeCAConfig() (*structs.CAConfiguration, error) { Op: structs.CAOpSetConfig, Config: config, } - if resp, err := c.srv.raftApply(structs.ConnectCARequestType, req); err != nil { + if resp, err := c.delegate.raftApply(structs.ConnectCARequestType, req); err != nil { return nil, err } else if respErr, ok := resp.(error); ok { return nil, respErr @@ -182,7 +213,7 @@ func (c *CAManager) getCAProvider() (ca.Provider, *structs.CARoot) { // In cases where an agent is started with managed proxies, we may ask // for the provider before establishLeadership completes. If we're the // leader, then wait and get the provider again - if result == nil && c.srv.IsLeader() && retries < 10 { + if result == nil && c.delegate.IsLeader() && retries < 10 { retries++ time.Sleep(50 * time.Millisecond) continue @@ -208,7 +239,7 @@ func (c *CAManager) setCAProvider(newProvider ca.Provider, root *structs.CARoot) // if this is a secondary DC. func (c *CAManager) InitializeCA() error { // Bail if connect isn't enabled. - if !c.srv.config.ConnectEnabled { + if !c.serverConf.ConnectEnabled { return nil } @@ -224,7 +255,7 @@ func (c *CAManager) InitializeCA() error { if err != nil { return err } - provider, err := c.srv.createCAProvider(conf) + provider, err := c.delegate.createCAProvider(conf) if err != nil { return err } @@ -232,12 +263,12 @@ func (c *CAManager) InitializeCA() error { c.setCAProvider(provider, nil) // Run the root CA initialization if this is the primary DC. - if c.srv.config.PrimaryDatacenter == c.srv.config.Datacenter { + if c.serverConf.PrimaryDatacenter == c.serverConf.Datacenter { return c.initializeRootCA(provider, conf) } // If this isn't the primary DC, run the secondary DC routine if the primary has already been upgraded to at least 1.6.0 - versionOk, foundPrimary := ServersInDCMeetMinimumVersion(c.srv, c.srv.config.PrimaryDatacenter, minMultiDCConnectVersion) + versionOk, foundPrimary := ServersInDCMeetMinimumVersion(c.delegate, c.serverConf.PrimaryDatacenter, minMultiDCConnectVersion) if !foundPrimary { c.logger.Warn("primary datacenter is configured but unreachable - deferring initialization of the secondary datacenter CA") // return nil because we will initialize the secondary CA later @@ -252,10 +283,10 @@ func (c *CAManager) InitializeCA() error { // Get the root CA to see if we need to refresh our intermediate. args := structs.DCSpecificRequest{ - Datacenter: c.srv.config.PrimaryDatacenter, + Datacenter: c.serverConf.PrimaryDatacenter, } var roots structs.IndexedCARoots - if err := c.srv.forwardDC("ConnectCA.Roots", c.srv.config.PrimaryDatacenter, &args, &roots); err != nil { + if err := c.delegate.forwardDC("ConnectCA.Roots", c.serverConf.PrimaryDatacenter, &args, &roots); err != nil { return err } if err := c.setPrimaryRoots(roots); err != nil { @@ -279,7 +310,7 @@ func (c *CAManager) InitializeCA() error { func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfiguration) error { pCfg := ca.ProviderConfig{ ClusterID: conf.ClusterID, - Datacenter: c.srv.config.Datacenter, + Datacenter: c.serverConf.Datacenter, IsPrimary: true, RawConfig: conf.Config, State: conf.State, @@ -324,7 +355,7 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi Op: structs.CAOpSetConfig, Config: conf, } - if _, err = c.srv.raftApply(structs.ConnectCARequestType, req); err != nil { + if _, err = c.delegate.raftApply(structs.ConnectCARequestType, req); err != nil { return fmt.Errorf("error persisting provider state: %v", err) } } @@ -334,7 +365,7 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi // tied to the provider. // Every change to the CA after this initial bootstrapping should // be done through the rotation process. - state := c.srv.fsm.State() + state := c.delegate.State() _, activeRoot, err := state.CARootActive(nil) if err != nil { return err @@ -359,7 +390,7 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi } // Store the root cert in raft - resp, err := c.srv.raftApply(structs.ConnectCARequestType, &structs.CARequest{ + resp, err := c.delegate.raftApply(structs.ConnectCARequestType, &structs.CARequest{ Op: structs.CAOpSetRoots, Index: idx, Roots: []*structs.CARoot{rootCA}, @@ -424,7 +455,7 @@ func (c *CAManager) initializeSecondaryCA(provider ca.Provider, config *structs. // This will fetch the secondary's exact current representation of the // active root. Note that this data should only be used if the IDs // match, otherwise it's out of date and should be regenerated. - _, activeSecondaryRoot, err = c.srv.fsm.State().CARootActive(nil) + _, activeSecondaryRoot, err = c.delegate.State().CARootActive(nil) if err != nil { return err } @@ -474,7 +505,7 @@ func (c *CAManager) initializeSecondaryCA(provider ca.Provider, config *structs. } // Update the roots list in the state store if there's a new active root. - state := c.srv.fsm.State() + state := c.delegate.State() _, activeRoot, err := state.CARootActive(nil) if err != nil { return err @@ -498,7 +529,7 @@ func (c *CAManager) initializeSecondaryCA(provider ca.Provider, config *structs. // If newActiveRoot is non-nil, it will be appended to the current roots list. // If config is non-nil, it will be used to overwrite the existing config. func (c *CAManager) persistNewRootAndConfig(provider ca.Provider, newActiveRoot *structs.CARoot, config *structs.CAConfiguration) error { - state := c.srv.fsm.State() + state := c.delegate.State() idx, oldRoots, err := state.CARoots(nil) if err != nil { return err @@ -554,7 +585,7 @@ func (c *CAManager) persistNewRootAndConfig(provider ca.Provider, newActiveRoot Roots: newRoots, Config: &newConf, } - resp, err := c.srv.raftApply(structs.ConnectCARequestType, &args) + resp, err := c.delegate.raftApply(structs.ConnectCARequestType, args) if err != nil { return err } @@ -577,7 +608,7 @@ func (c *CAManager) UpdateConfiguration(args *structs.CARequest) error { defer c.setState(CAStateReady, false) // Exit early if it's a no-op change - state := c.srv.fsm.State() + state := c.delegate.State() confIdx, config, err := state.CAConfig(nil) if err != nil { return err @@ -610,15 +641,15 @@ func (c *CAManager) UpdateConfiguration(args *structs.CARequest) error { // and get the current active root CA. This acts as a good validation // of the config and makes sure the provider is functioning correctly // before we commit any changes to Raft. - newProvider, err := c.srv.createCAProvider(args.Config) + newProvider, err := c.delegate.createCAProvider(args.Config) if err != nil { return fmt.Errorf("could not initialize provider: %v", err) } pCfg := ca.ProviderConfig{ ClusterID: args.Config.ClusterID, - Datacenter: c.srv.config.Datacenter, + Datacenter: c.serverConf.Datacenter, // This endpoint can be called in a secondary DC too so set this correctly. - IsPrimary: c.srv.config.Datacenter == c.srv.config.PrimaryDatacenter, + IsPrimary: c.serverConf.Datacenter == c.serverConf.PrimaryDatacenter, RawConfig: args.Config.Config, State: args.Config.State, } @@ -637,8 +668,8 @@ func (c *CAManager) UpdateConfiguration(args *structs.CARequest) error { }() // If this is a secondary, just check if the intermediate needs to be regenerated. - if c.srv.config.Datacenter != c.srv.config.PrimaryDatacenter { - if err := c.srv.caManager.initializeSecondaryCA(newProvider, args.Config); err != nil { + if c.serverConf.Datacenter != c.serverConf.PrimaryDatacenter { + if err := c.initializeSecondaryCA(newProvider, args.Config); err != nil { return fmt.Errorf("Error updating secondary datacenter CA config: %v", err) } cleanupNewProvider = false @@ -678,7 +709,7 @@ func (c *CAManager) UpdateConfiguration(args *structs.CARequest) error { // If the root didn't change, just update the config and return. if root != nil && root.ID == newActiveRoot.ID { args.Op = structs.CAOpSetConfig - resp, err := c.srv.raftApply(structs.ConnectCARequestType, args) + resp, err := c.delegate.raftApply(structs.ConnectCARequestType, args) if err != nil { return err } @@ -771,7 +802,7 @@ func (c *CAManager) UpdateConfiguration(args *structs.CARequest) error { args.Index = idx args.Config.ModifyIndex = confIdx args.Roots = newRoots - resp, err := c.srv.raftApply(structs.ConnectCARequestType, args) + resp, err := c.delegate.raftApply(structs.ConnectCARequestType, args) if err != nil { return err } @@ -829,7 +860,7 @@ func (c *CAManager) getIntermediateCASigned(provider ca.Provider, newActiveRoot } var intermediatePEM string - if err := c.srv.forwardDC("ConnectCA.SignIntermediate", c.srv.config.PrimaryDatacenter, c.srv.generateCASignRequest(csr), &intermediatePEM); err != nil { + if err := c.delegate.forwardDC("ConnectCA.SignIntermediate", c.serverConf.PrimaryDatacenter, c.delegate.generateCASignRequest(csr), &intermediatePEM); err != nil { // this is a failure in the primary and shouldn't be capable of erroring out our establishing leadership c.logger.Warn("Primary datacenter refused to sign our intermediate CA certificate", "error", err) return nil @@ -855,7 +886,7 @@ func (c *CAManager) getIntermediateCASigned(provider ca.Provider, newActiveRoot // intermediateCertRenewalWatch periodically attempts to renew the intermediate cert. func (c *CAManager) intermediateCertRenewalWatch(ctx context.Context) error { - isPrimary := c.srv.config.Datacenter == c.srv.config.PrimaryDatacenter + isPrimary := c.serverConf.Datacenter == c.serverConf.PrimaryDatacenter for { select { @@ -863,7 +894,7 @@ func (c *CAManager) intermediateCertRenewalWatch(ctx context.Context) error { return nil case <-time.After(structs.IntermediateCertRenewInterval): retryLoopBackoffAbortOnSuccess(ctx, func() error { - return c.RenewIntermediate(isPrimary) + return c.RenewIntermediate(ctx, isPrimary) }, func(err error) { c.logger.Error("error renewing intermediate certs", "routine", intermediateCertRenewWatchRoutineName, @@ -877,7 +908,7 @@ func (c *CAManager) intermediateCertRenewalWatch(ctx context.Context) error { // RenewIntermediate checks the intermediate cert for // expiration. If more than half the time a cert is valid has passed, // it will try to renew it. -func (c *CAManager) RenewIntermediate(isPrimary bool) error { +func (c *CAManager) RenewIntermediate(ctx context.Context, isPrimary bool) error { // Grab the 'lock' right away so the provider/config can't be changed out while we check // the intermediate. if err := c.setState(CAStateRenewIntermediate, true); err != nil { @@ -895,7 +926,7 @@ func (c *CAManager) RenewIntermediate(isPrimary bool) error { return fmt.Errorf("secondary CA is not yet configured.") } - state := c.srv.fsm.State() + state := c.delegate.State() _, root, err := state.CARootActive(nil) if err != nil { return err @@ -939,8 +970,19 @@ func (c *CAManager) RenewIntermediate(isPrimary bool) error { if !isPrimary { renewalFunc = c.getIntermediateCASigned } - if err := renewalFunc(provider, activeRoot); err != nil { - return err + errCh := make(chan error) + go func() { + errCh <- renewalFunc(provider, activeRoot) + }() + + // Wait for the renewal func to return or for the context to be canceled. + select { + case <-ctx.Done(): + return ctx.Err() + case err := <-errCh: + if err != nil { + return err + } } if err := c.persistNewRootAndConfig(provider, activeRoot, nil); err != nil { @@ -956,18 +998,18 @@ func (c *CAManager) RenewIntermediate(isPrimary bool) error { // intermediate certificate. func (c *CAManager) secondaryCARootWatch(ctx context.Context) error { args := structs.DCSpecificRequest{ - Datacenter: c.srv.config.PrimaryDatacenter, + Datacenter: c.serverConf.PrimaryDatacenter, QueryOptions: structs.QueryOptions{ // the maximum time the primary roots watch query can block before returning - MaxQueryTime: c.srv.config.MaxQueryTime, + MaxQueryTime: c.serverConf.MaxQueryTime, }, } - c.logger.Debug("starting Connect CA root replication from primary datacenter", "primary", c.srv.config.PrimaryDatacenter) + c.logger.Debug("starting Connect CA root replication from primary datacenter", "primary", c.serverConf.PrimaryDatacenter) retryLoopBackoff(ctx, func() error { var roots structs.IndexedCARoots - if err := c.srv.forwardDC("ConnectCA.Roots", c.srv.config.PrimaryDatacenter, &args, &roots); err != nil { + if err := c.delegate.forwardDC("ConnectCA.Roots", c.serverConf.PrimaryDatacenter, &args, &roots); err != nil { return fmt.Errorf("Error retrieving the primary datacenter's roots: %v", err) } @@ -1016,7 +1058,7 @@ func (c *CAManager) UpdateRoots(roots structs.IndexedCARoots) error { return nil } if !c.configuredSecondaryCA() { - versionOk, primaryFound := ServersInDCMeetMinimumVersion(c.srv, c.srv.config.PrimaryDatacenter, minMultiDCConnectVersion) + versionOk, primaryFound := ServersInDCMeetMinimumVersion(c.delegate, c.serverConf.PrimaryDatacenter, minMultiDCConnectVersion) if !primaryFound { return fmt.Errorf("Primary datacenter is unreachable - deferring secondary CA initialization") } @@ -1046,14 +1088,14 @@ func (c *CAManager) initializeSecondaryProvider(provider ca.Provider, roots stru } clusterID := strings.Split(roots.TrustDomain, ".")[0] - _, conf, err := c.srv.fsm.State().CAConfig(nil) + _, conf, err := c.delegate.State().CAConfig(nil) if err != nil { return err } pCfg := ca.ProviderConfig{ ClusterID: clusterID, - Datacenter: c.srv.config.Datacenter, + Datacenter: c.serverConf.Datacenter, IsPrimary: false, RawConfig: conf.Config, State: conf.State, diff --git a/agent/consul/leader_connect_ca_test.go b/agent/consul/leader_connect_ca_test.go new file mode 100644 index 000000000..a582f90ba --- /dev/null +++ b/agent/consul/leader_connect_ca_test.go @@ -0,0 +1,255 @@ +package consul + +import ( + "context" + "crypto/x509" + "errors" + "fmt" + "testing" + "time" + + "github.com/hashicorp/consul/agent/connect" + ca "github.com/hashicorp/consul/agent/connect/ca" + "github.com/hashicorp/consul/agent/consul/state" + "github.com/hashicorp/consul/agent/metadata" + "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/sdk/testutil" + "github.com/hashicorp/go-version" + "github.com/hashicorp/serf/serf" + "github.com/stretchr/testify/require" +) + +type mockCAServerDelegate struct { + t *testing.T + config *Config + store *state.Store + primaryRoot *structs.CARoot + callbackCh chan string +} + +func NewMockCAServerDelegate(t *testing.T, config *Config) *mockCAServerDelegate { + delegate := &mockCAServerDelegate{ + t: t, + config: config, + store: state.NewStateStore(nil), + primaryRoot: connect.TestCAWithTTL(t, nil, 1*time.Second), + callbackCh: make(chan string, 0), + } + delegate.store.CASetConfig(1, testCAConfig()) + + return delegate +} + +func (m *mockCAServerDelegate) State() *state.Store { + return m.store +} + +func (m *mockCAServerDelegate) IsLeader() bool { + return true +} + +func (m *mockCAServerDelegate) CheckServers(datacenter string, fn func(*metadata.Server) bool) { + ver, _ := version.NewVersion("1.6.0") + fn(&metadata.Server{ + Status: serf.StatusAlive, + Build: *ver, + }) +} + +func (m *mockCAServerDelegate) ApplyCARequest(req *structs.CARequest) (interface{}, error) { + return ca.ApplyCARequestToStore(m.store, req) +} + +func (m *mockCAServerDelegate) createCAProvider(conf *structs.CAConfiguration) (ca.Provider, error) { + return &mockCAProvider{ + callbackCh: m.callbackCh, + rootPEM: m.primaryRoot.RootCert, + }, nil +} + +func (m *mockCAServerDelegate) forwardDC(method, dc string, args interface{}, reply interface{}) error { + switch method { + case "ConnectCA.Roots": + roots := reply.(*structs.IndexedCARoots) + roots.TrustDomain = connect.TestClusterID + roots.Roots = []*structs.CARoot{m.primaryRoot} + roots.ActiveRootID = m.primaryRoot.ID + case "ConnectCA.SignIntermediate": + r := reply.(*string) + *r = m.primaryRoot.RootCert + } + + m.callbackCh <- fmt.Sprintf("forwardDC/%s", method) + + return nil +} + +func (m *mockCAServerDelegate) generateCASignRequest(csr string) *structs.CASignRequest { + return &structs.CASignRequest{ + Datacenter: m.config.PrimaryDatacenter, + CSR: csr, + } +} + +func (m *mockCAServerDelegate) raftApply(t structs.MessageType, msg interface{}) (interface{}, error) { + if t == structs.ConnectCARequestType { + req := msg.(*structs.CARequest) + act, err := m.store.CARootSetCAS(1, req.Index, req.Roots) + require.NoError(m.t, err) + require.True(m.t, act) + + act, err = m.store.CACheckAndSetConfig(1, req.Config.ModifyIndex, req.Config) + require.NoError(m.t, err) + } + m.callbackCh <- fmt.Sprintf("raftApply/%s", t) + return nil, nil +} + +// mockCAProvider mocks an empty provider implementation with a channel in order to coordinate +// waiting for certain methods to be called. +type mockCAProvider struct { + callbackCh chan string + rootPEM string +} + +func (m *mockCAProvider) Configure(cfg ca.ProviderConfig) error { return nil } +func (m *mockCAProvider) State() (map[string]string, error) { return nil, nil } +func (m *mockCAProvider) GenerateRoot() error { return nil } +func (m *mockCAProvider) ActiveRoot() (string, error) { return m.rootPEM, nil } +func (m *mockCAProvider) GenerateIntermediateCSR() (string, error) { + m.callbackCh <- "provider/GenerateIntermediateCSR" + return "", nil +} +func (m *mockCAProvider) SetIntermediate(intermediatePEM, rootPEM string) error { + m.callbackCh <- "provider/SetIntermediate" + return nil +} +func (m *mockCAProvider) ActiveIntermediate() (string, error) { return m.rootPEM, nil } +func (m *mockCAProvider) GenerateIntermediate() (string, error) { return "", nil } +func (m *mockCAProvider) Sign(*x509.CertificateRequest) (string, error) { return "", nil } +func (m *mockCAProvider) SignIntermediate(*x509.CertificateRequest) (string, error) { return "", nil } +func (m *mockCAProvider) CrossSignCA(*x509.Certificate) (string, error) { return "", nil } +func (m *mockCAProvider) SupportsCrossSigning() (bool, error) { return false, nil } +func (m *mockCAProvider) Cleanup() error { return nil } + +func waitForCh(t *testing.T, ch chan string, expected string) { + select { + case op := <-ch: + if op != expected { + t.Fatalf("got unexpected op %q, wanted %q", op, expected) + } + case <-time.After(3 * time.Second): + t.Fatalf("never got op %q", expected) + } +} + +func waitForEmptyCh(t *testing.T, ch chan string) { + select { + case op := <-ch: + t.Fatalf("got unexpected op %q", op) + case <-time.After(1 * time.Second): + } +} + +func testCAConfig() *structs.CAConfiguration { + return &structs.CAConfiguration{ + ClusterID: connect.TestClusterID, + Provider: "mock", + Config: map[string]interface{}{ + "LeafCertTTL": "72h", + "IntermediateCertTTL": "2160h", + }, + } +} + +// initTestManager initializes a CAManager with a mockCAServerDelegate, consuming +// the ops that come through the channels and returning when initialization has finished. +func initTestManager(t *testing.T, manager *CAManager, delegate *mockCAServerDelegate) { + initCh := make(chan struct{}) + go func() { + require.NoError(t, manager.InitializeCA()) + close(initCh) + }() + for i := 0; i < 5; i++ { + select { + case <-delegate.callbackCh: + case <-time.After(3 * time.Second): + t.Fatal("failed waiting for initialization events") + } + } + select { + case <-initCh: + case <-time.After(3 * time.Second): + t.Fatal("failed waiting for initialization") + } +} + +func TestCAManager_Initialize(t *testing.T) { + conf := DefaultConfig() + conf.ConnectEnabled = true + conf.PrimaryDatacenter = "dc1" + conf.Datacenter = "dc2" + delegate := NewMockCAServerDelegate(t, conf) + manager := NewCAManager(delegate, testutil.Logger(t), conf) + + // Call InitializeCA and then confirm the RPCs and provider calls + // happen in the expected order. + go func() { + require.EqualValues(t, CAStateUninitialized, manager.state) + require.NoError(t, manager.InitializeCA()) + require.EqualValues(t, CAStateReady, manager.state) + }() + + waitForCh(t, delegate.callbackCh, "forwardDC/ConnectCA.Roots") + require.EqualValues(t, CAStateInitializing, manager.state) + waitForCh(t, delegate.callbackCh, "provider/GenerateIntermediateCSR") + waitForCh(t, delegate.callbackCh, "forwardDC/ConnectCA.SignIntermediate") + waitForCh(t, delegate.callbackCh, "provider/SetIntermediate") + waitForCh(t, delegate.callbackCh, "raftApply/ConnectCA") + waitForEmptyCh(t, delegate.callbackCh) +} + +func TestCAManager_UpdateConfigWhileRenewIntermediate(t *testing.T) { + // No parallel execution because we change globals + // Set the interval and drift buffer low for renewing the cert. + origInterval := structs.IntermediateCertRenewInterval + origDriftBuffer := ca.CertificateTimeDriftBuffer + defer func() { + structs.IntermediateCertRenewInterval = origInterval + ca.CertificateTimeDriftBuffer = origDriftBuffer + }() + structs.IntermediateCertRenewInterval = time.Millisecond + ca.CertificateTimeDriftBuffer = 0 + + conf := DefaultConfig() + conf.ConnectEnabled = true + conf.PrimaryDatacenter = "dc1" + conf.Datacenter = "dc2" + delegate := NewMockCAServerDelegate(t, conf) + manager := NewCAManager(delegate, testutil.Logger(t), conf) + initTestManager(t, manager, delegate) + + // Wait half the TTL for the cert to need renewing. + time.Sleep(500 * time.Millisecond) + + // Call RenewIntermediate and then confirm the RPCs and provider calls + // happen in the expected order. + go func() { + require.NoError(t, manager.RenewIntermediate(context.TODO(), false)) + require.EqualValues(t, CAStateReady, manager.state) + }() + + waitForCh(t, delegate.callbackCh, "provider/GenerateIntermediateCSR") + + // Call UpdateConfiguration while RenewIntermediate is still in-flight to + // make sure we get an error about the state being occupied. + go func() { + require.EqualValues(t, CAStateRenewIntermediate, manager.state) + require.Error(t, errors.New("already in state"), manager.UpdateConfiguration(&structs.CARequest{})) + }() + + waitForCh(t, delegate.callbackCh, "forwardDC/ConnectCA.SignIntermediate") + waitForCh(t, delegate.callbackCh, "provider/SetIntermediate") + waitForCh(t, delegate.callbackCh, "raftApply/ConnectCA") + waitForEmptyCh(t, delegate.callbackCh) +} diff --git a/agent/consul/server.go b/agent/consul/server.go index 0f34f25ba..d5884a869 100644 --- a/agent/consul/server.go +++ b/agent/consul/server.go @@ -466,7 +466,7 @@ func NewServer(config *Config, flat Deps) (*Server, error) { return nil, fmt.Errorf("Failed to start Raft: %v", err) } - s.caManager = NewCAManager(s) + s.caManager = NewCAManager(&caDelegateWithState{s}, s.loggers.Named(logging.Connect), s.config) if s.config.ConnectEnabled && (s.config.AutoEncryptAllowTLS || s.config.AutoConfigAuthzEnabled) { go s.connectCARootsMonitor(&lib.StopChannelContext{StopCh: s.shutdownCh}) } diff --git a/agent/structs/acl.go b/agent/structs/acl.go index de01c6e2f..fef8d0875 100644 --- a/agent/structs/acl.go +++ b/agent/structs/acl.go @@ -151,7 +151,7 @@ type ACLServiceIdentity struct { func (s *ACLServiceIdentity) Clone() *ACLServiceIdentity { s2 := *s - s2.Datacenters = cloneStringSlice(s.Datacenters) + s2.Datacenters = CloneStringSlice(s.Datacenters) return &s2 } @@ -666,7 +666,7 @@ func (t *ACLPolicy) UnmarshalJSON(data []byte) error { func (p *ACLPolicy) Clone() *ACLPolicy { p2 := *p - p2.Datacenters = cloneStringSlice(p.Datacenters) + p2.Datacenters = CloneStringSlice(p.Datacenters) return &p2 } @@ -1460,7 +1460,7 @@ type ACLPolicyBatchDeleteRequest struct { PolicyIDs []string } -func cloneStringSlice(s []string) []string { +func CloneStringSlice(s []string) []string { if len(s) == 0 { return nil } diff --git a/agent/structs/config_entry_intentions.go b/agent/structs/config_entry_intentions.go index 7a737d67d..21e55224c 100644 --- a/agent/structs/config_entry_intentions.go +++ b/agent/structs/config_entry_intentions.go @@ -300,7 +300,7 @@ func (p *IntentionHTTPPermission) Clone() *IntentionHTTPPermission { } } - p2.Methods = cloneStringSlice(p.Methods) + p2.Methods = CloneStringSlice(p.Methods) return &p2 } diff --git a/agent/structs/connect_ca.go b/agent/structs/connect_ca.go index dc96334f8..54f156141 100644 --- a/agent/structs/connect_ca.go +++ b/agent/structs/connect_ca.go @@ -128,7 +128,7 @@ func (c *CARoot) Clone() *CARoot { } newCopy := *c - copy(c.IntermediateCerts, newCopy.IntermediateCerts) + newCopy.IntermediateCerts = CloneStringSlice(c.IntermediateCerts) return &newCopy } From a01f853aa5e93ef46a571fb7606e5efcbd6cdb74 Mon Sep 17 00:00:00 2001 From: Kyle Havlovitz Date: Fri, 20 Nov 2020 15:54:44 -0800 Subject: [PATCH 6/7] Clean up the logic in persistNewRootAndConfig --- agent/consul/leader_connect_ca.go | 39 +++++++++++++++++--------- agent/consul/leader_connect_ca_test.go | 1 + agent/consul/leader_connect_test.go | 16 ----------- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/agent/consul/leader_connect_ca.go b/agent/consul/leader_connect_ca.go index 5e42b5f9d..cf010b353 100644 --- a/agent/consul/leader_connect_ca.go +++ b/agent/consul/leader_connect_ca.go @@ -535,6 +535,7 @@ func (c *CAManager) persistNewRootAndConfig(provider ca.Provider, newActiveRoot return err } + // Look up the existing CA config if a new one wasn't provided. var newConf structs.CAConfiguration _, storedConfig, err := state.CAConfig(nil) if err != nil { @@ -543,14 +544,28 @@ func (c *CAManager) persistNewRootAndConfig(provider ca.Provider, newActiveRoot if storedConfig == nil { return fmt.Errorf("local CA not initialized yet") } + // Exit early if the change is a no-op. + if newActiveRoot == nil && config != nil && config.Provider == storedConfig.Provider && reflect.DeepEqual(config.Config, storedConfig.Config) { + return nil + } + if config != nil { newConf = *config } else { newConf = *storedConfig } + + // Update the trust domain for the config if there's a new root, or keep the old + // one if the root isn't being updated. newConf.ModifyIndex = storedConfig.ModifyIndex if newActiveRoot != nil { newConf.ClusterID = newActiveRoot.ExternalTrustDomain + } else { + _, activeRoot, err := state.CARootActive(nil) + if err != nil { + return err + } + newConf.ClusterID = activeRoot.ExternalTrustDomain } // Persist any state the provider needs us to @@ -562,21 +577,19 @@ func (c *CAManager) persistNewRootAndConfig(provider ca.Provider, newActiveRoot // If there's a new active root, copy the root list and append it, updating // the old root with the time it was rotated out. var newRoots structs.CARoots - if newActiveRoot != nil { - for _, r := range oldRoots { - newRoot := *r - if newRoot.Active { - newRoot.Active = false - newRoot.RotatedOutAt = time.Now() - } - if newRoot.ExternalTrustDomain == "" { - newRoot.ExternalTrustDomain = newConf.ClusterID - } - newRoots = append(newRoots, &newRoot) + for _, r := range oldRoots { + newRoot := *r + if newRoot.Active { + newRoot.Active = false + newRoot.RotatedOutAt = time.Now() } + if newRoot.ExternalTrustDomain == "" { + newRoot.ExternalTrustDomain = newConf.ClusterID + } + newRoots = append(newRoots, &newRoot) + } + if newActiveRoot != nil { newRoots = append(newRoots, newActiveRoot) - } else { - newRoots = oldRoots } args := &structs.CARequest{ diff --git a/agent/consul/leader_connect_ca_test.go b/agent/consul/leader_connect_ca_test.go index a582f90ba..25708bc96 100644 --- a/agent/consul/leader_connect_ca_test.go +++ b/agent/consul/leader_connect_ca_test.go @@ -100,6 +100,7 @@ func (m *mockCAServerDelegate) raftApply(t structs.MessageType, msg interface{}) act, err = m.store.CACheckAndSetConfig(1, req.Config.ModifyIndex, req.Config) require.NoError(m.t, err) + require.True(m.t, act) } m.callbackCh <- fmt.Sprintf("raftApply/%s", t) return nil, nil diff --git a/agent/consul/leader_connect_test.go b/agent/consul/leader_connect_test.go index 5beca904b..cf7ddff42 100644 --- a/agent/consul/leader_connect_test.go +++ b/agent/consul/leader_connect_test.go @@ -700,22 +700,6 @@ func TestLeader_SecondaryCA_TransitionFromPrimary(t *testing.T) { require.NoError(t, s2.RPC("ConnectCA.Roots", &args, &dc2PrimaryRoots)) require.Len(t, dc2PrimaryRoots.Roots, 1) - // Set the ExternalTrustDomain to a blank string to simulate an old version (pre-1.4.0) - // it's fine to change the roots struct directly here because the RPC endpoint already - // makes a copy to return. - dc2PrimaryRoots.Roots[0].ExternalTrustDomain = "" - rootSetArgs := structs.CARequest{ - Op: structs.CAOpSetRoots, - Datacenter: "dc2", - Index: dc2PrimaryRoots.Index, - Roots: dc2PrimaryRoots.Roots, - } - resp, err := s2.raftApply(structs.ConnectCARequestType, rootSetArgs) - require.NoError(t, err) - if respErr, ok := resp.(error); ok { - t.Fatal(respErr) - } - // Shutdown s2 and restart it with the dc1 as the primary s2.Shutdown() dir3, s3 := testServerWithConfig(t, func(c *Config) { From c5167cf9c4abc9b0c774b9ec67e137763c4e2419 Mon Sep 17 00:00:00 2001 From: Kyle Havlovitz Date: Mon, 30 Nov 2020 14:37:24 -0800 Subject: [PATCH 7/7] Use a buffered channel for CA intermediate renew func --- agent/consul/leader_connect_ca.go | 2 +- agent/consul/leader_connect_ca_test.go | 43 +++++++++++++++++++++----- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/agent/consul/leader_connect_ca.go b/agent/consul/leader_connect_ca.go index cf010b353..a00ffe2b5 100644 --- a/agent/consul/leader_connect_ca.go +++ b/agent/consul/leader_connect_ca.go @@ -983,7 +983,7 @@ func (c *CAManager) RenewIntermediate(ctx context.Context, isPrimary bool) error if !isPrimary { renewalFunc = c.getIntermediateCASigned } - errCh := make(chan error) + errCh := make(chan error, 1) go func() { errCh <- renewalFunc(provider, activeRoot) }() diff --git a/agent/consul/leader_connect_ca_test.go b/agent/consul/leader_connect_ca_test.go index 25708bc96..f6c62a163 100644 --- a/agent/consul/leader_connect_ca_test.go +++ b/agent/consul/leader_connect_ca_test.go @@ -19,6 +19,9 @@ import ( "github.com/stretchr/testify/require" ) +// TODO(kyhavlov): replace with t.Deadline() +const CATestTimeout = 7 * time.Second + type mockCAServerDelegate struct { t *testing.T config *Config @@ -77,6 +80,8 @@ func (m *mockCAServerDelegate) forwardDC(method, dc string, args interface{}, re case "ConnectCA.SignIntermediate": r := reply.(*string) *r = m.primaryRoot.RootCert + default: + return fmt.Errorf("received call to unsupported method %q", method) } m.callbackCh <- fmt.Sprintf("forwardDC/%s", method) @@ -101,6 +106,8 @@ func (m *mockCAServerDelegate) raftApply(t structs.MessageType, msg interface{}) act, err = m.store.CACheckAndSetConfig(1, req.Config.ModifyIndex, req.Config) require.NoError(m.t, err) require.True(m.t, act) + } else { + return nil, fmt.Errorf("got invalid MessageType %v", t) } m.callbackCh <- fmt.Sprintf("raftApply/%s", t) return nil, nil @@ -139,7 +146,7 @@ func waitForCh(t *testing.T, ch chan string, expected string) { if op != expected { t.Fatalf("got unexpected op %q, wanted %q", op, expected) } - case <-time.After(3 * time.Second): + case <-time.After(CATestTimeout): t.Fatalf("never got op %q", expected) } } @@ -174,13 +181,13 @@ func initTestManager(t *testing.T, manager *CAManager, delegate *mockCAServerDel for i := 0; i < 5; i++ { select { case <-delegate.callbackCh: - case <-time.After(3 * time.Second): + case <-time.After(CATestTimeout): t.Fatal("failed waiting for initialization events") } } select { case <-initCh: - case <-time.After(3 * time.Second): + case <-time.After(CATestTimeout): t.Fatal("failed waiting for initialization") } } @@ -195,10 +202,10 @@ func TestCAManager_Initialize(t *testing.T) { // Call InitializeCA and then confirm the RPCs and provider calls // happen in the expected order. + require.EqualValues(t, CAStateUninitialized, manager.state) + errCh := make(chan error) go func() { - require.EqualValues(t, CAStateUninitialized, manager.state) - require.NoError(t, manager.InitializeCA()) - require.EqualValues(t, CAStateReady, manager.state) + errCh <- manager.InitializeCA() }() waitForCh(t, delegate.callbackCh, "forwardDC/ConnectCA.Roots") @@ -208,6 +215,16 @@ func TestCAManager_Initialize(t *testing.T) { waitForCh(t, delegate.callbackCh, "provider/SetIntermediate") waitForCh(t, delegate.callbackCh, "raftApply/ConnectCA") waitForEmptyCh(t, delegate.callbackCh) + + // Make sure the InitializeCA call returned successfully. + select { + case err := <-errCh: + require.NoError(t, err) + case <-time.After(CATestTimeout): + t.Fatal("never got result from errCh") + } + + require.EqualValues(t, CAStateReady, manager.state) } func TestCAManager_UpdateConfigWhileRenewIntermediate(t *testing.T) { @@ -235,9 +252,9 @@ func TestCAManager_UpdateConfigWhileRenewIntermediate(t *testing.T) { // Call RenewIntermediate and then confirm the RPCs and provider calls // happen in the expected order. + errCh := make(chan error) go func() { - require.NoError(t, manager.RenewIntermediate(context.TODO(), false)) - require.EqualValues(t, CAStateReady, manager.state) + errCh <- manager.RenewIntermediate(context.TODO(), false) }() waitForCh(t, delegate.callbackCh, "provider/GenerateIntermediateCSR") @@ -253,4 +270,14 @@ func TestCAManager_UpdateConfigWhileRenewIntermediate(t *testing.T) { waitForCh(t, delegate.callbackCh, "provider/SetIntermediate") waitForCh(t, delegate.callbackCh, "raftApply/ConnectCA") waitForEmptyCh(t, delegate.callbackCh) + + // Make sure the RenewIntermediate call returned successfully. + select { + case err := <-errCh: + require.NoError(t, err) + case <-time.After(CATestTimeout): + t.Fatal("never got result from errCh") + } + + require.EqualValues(t, CAStateReady, manager.state) }