From 335c604ced2653a8c891662e1b6125a8385665a2 Mon Sep 17 00:00:00 2001 From: Matt Keeler Date: Mon, 31 Aug 2020 13:12:17 -0400 Subject: [PATCH] Merge of auto-config and auto-encrypt code (#8523) auto-encrypt is now handled as a special case of auto-config. This also is moving all the cert-monitor code into the auto-config package. --- agent/agent.go | 69 +- agent/auto-config/auto_config.go | 539 +++----- agent/auto-config/auto_config_test.go | 1307 +++++++++++++------- agent/auto-config/auto_encrypt.go | 111 ++ agent/auto-config/auto_encrypt_test.go | 562 +++++++++ agent/auto-config/config.go | 72 +- agent/auto-config/config_translate.go | 20 +- agent/auto-config/config_translate_test.go | 41 + agent/auto-config/mock_test.go | 337 +++++ agent/auto-config/persist.go | 86 ++ agent/auto-config/run.go | 192 +++ agent/auto-config/server_addr.go | 111 ++ agent/auto-config/tls.go | 280 +++++ agent/auto-config/tls_test.go | 56 + agent/cert-monitor/cert_monitor.go | 505 -------- agent/cert-monitor/cert_monitor_test.go | 731 ----------- agent/cert-monitor/config.go | 150 --- agent/consul/auto_encrypt.go | 239 ---- agent/consul/auto_encrypt_test.go | 205 --- agent/setup.go | 52 +- proto/translate.go | 14 +- proto/translate_test.go | 25 +- 22 files changed, 2934 insertions(+), 2770 deletions(-) create mode 100644 agent/auto-config/auto_encrypt.go create mode 100644 agent/auto-config/auto_encrypt_test.go create mode 100644 agent/auto-config/mock_test.go create mode 100644 agent/auto-config/persist.go create mode 100644 agent/auto-config/run.go create mode 100644 agent/auto-config/server_addr.go create mode 100644 agent/auto-config/tls.go create mode 100644 agent/auto-config/tls_test.go delete mode 100644 agent/cert-monitor/cert_monitor.go delete mode 100644 agent/cert-monitor/cert_monitor_test.go delete mode 100644 agent/cert-monitor/config.go delete mode 100644 agent/consul/auto_encrypt.go delete mode 100644 agent/consul/auto_encrypt_test.go diff --git a/agent/agent.go b/agent/agent.go index f6c7f3418..0c639da9a 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -31,7 +31,6 @@ import ( autoconf "github.com/hashicorp/consul/agent/auto-config" "github.com/hashicorp/consul/agent/cache" cachetype "github.com/hashicorp/consul/agent/cache-types" - certmon "github.com/hashicorp/consul/agent/cert-monitor" "github.com/hashicorp/consul/agent/checks" "github.com/hashicorp/consul/agent/config" "github.com/hashicorp/consul/agent/consul" @@ -162,8 +161,6 @@ type notifier interface { type Agent struct { autoConf *autoconf.AutoConfig - certMonitor *certmon.CertMonitor - // config is the agent configuration. config *config.RuntimeConfig @@ -373,6 +370,11 @@ func New(bd BaseDeps) (*Agent, error) { // pass the agent itself so its safe to move here. a.registerCache() + // TODO: move to newBaseDeps + // TODO: handle error + a.loadTokens(a.config) + a.loadEnterpriseTokens(a.config) + return &a, nil } @@ -426,11 +428,6 @@ func (a *Agent) Start(ctx context.Context) error { return fmt.Errorf("Failed to load TLS configurations after applying auto-config settings: %w", err) } - // TODO: move to newBaseDeps - // TODO: handle error - a.loadTokens(a.config) - a.loadEnterpriseTokens(a.config) - // create the local state a.State = local.NewState(LocalConfig(c), a.logger, a.tokens) @@ -495,43 +492,6 @@ func (a *Agent) Start(ctx context.Context) error { a.State.Delegate = a.delegate a.State.TriggerSyncChanges = a.sync.SyncChanges.Trigger - if a.config.AutoEncryptTLS && !a.config.ServerMode { - reply, err := a.autoEncryptInitialCertificate(ctx) - if err != nil { - return fmt.Errorf("AutoEncrypt failed: %s", err) - } - - cmConfig := new(certmon.Config). - WithCache(a.cache). - WithLogger(a.logger.Named(logging.AutoEncrypt)). - WithTLSConfigurator(a.tlsConfigurator). - WithTokens(a.tokens). - WithFallback(a.autoEncryptInitialCertificate). - WithDNSSANs(a.config.AutoEncryptDNSSAN). - WithIPSANs(a.config.AutoEncryptIPSAN). - WithDatacenter(a.config.Datacenter). - WithNodeName(a.config.NodeName) - - monitor, err := certmon.New(cmConfig) - if err != nil { - return fmt.Errorf("AutoEncrypt failed to setup certificate monitor: %w", err) - } - if err := monitor.Update(reply); err != nil { - return fmt.Errorf("AutoEncrypt failed to setup certificate monitor: %w", err) - } - a.certMonitor = monitor - - // we don't need to worry about ever calling Stop as we have tied the go routines - // to the agents lifetime by using the StopCh. Also the agent itself doesn't have - // a need of ensuring that the go routine was stopped before performing any action - // so we can ignore the chan in the return. - if _, err := a.certMonitor.Start(&lib.StopChannelContext{StopCh: a.shutdownCh}); err != nil { - return fmt.Errorf("AutoEncrypt failed to start certificate monitor: %w", err) - } - - a.logger.Info("automatically upgraded to TLS") - } - if err := a.autoConf.Start(&lib.StopChannelContext{StopCh: a.shutdownCh}); err != nil { return fmt.Errorf("AutoConf failed to start certificate monitor: %w", err) } @@ -645,19 +605,6 @@ func (a *Agent) Start(ctx context.Context) error { return nil } -func (a *Agent) autoEncryptInitialCertificate(ctx context.Context) (*structs.SignedResponse, error) { - client := a.delegate.(*consul.Client) - - addrs := a.config.StartJoinAddrsLAN - disco, err := newDiscover() - if err != nil && len(addrs) == 0 { - return nil, err - } - addrs = append(addrs, retryJoinAddrs(disco, retryJoinSerfVariant, "LAN", a.config.RetryJoinLAN, a.logger)...) - - return client.RequestAutoEncryptCerts(ctx, addrs, a.config.ServerPort, a.tokens.AgentToken(), a.config.AutoEncryptDNSSAN, a.config.AutoEncryptIPSAN) -} - func (a *Agent) listenAndServeGRPC() error { if len(a.config.GRPCAddrs) < 1 { return nil @@ -1380,12 +1327,6 @@ func (a *Agent) ShutdownAgent() error { // this should help them to be stopped more quickly a.autoConf.Stop() - if a.certMonitor != nil { - // this would be cancelled anyways (by the closing of the shutdown ch) - // but this should help them to be stopped more quickly - a.certMonitor.Stop() - } - // Stop the service manager (must happen before we take the stateLock to avoid deadlock) if a.serviceManager != nil { a.serviceManager.Stop() diff --git a/agent/auto-config/auto_config.go b/agent/auto-config/auto_config.go index 939879a76..c2dd942c6 100644 --- a/agent/auto-config/auto_config.go +++ b/agent/auto-config/auto_config.go @@ -4,62 +4,54 @@ import ( "context" "fmt" "io/ioutil" - "net" - "os" - "path/filepath" - "strconv" - "strings" + "sync" "time" + "github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/config" - "github.com/hashicorp/consul/agent/connect" - "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/lib" "github.com/hashicorp/consul/logging" "github.com/hashicorp/consul/proto/pbautoconf" - "github.com/hashicorp/go-discover" - discoverk8s "github.com/hashicorp/go-discover/provider/k8s" "github.com/hashicorp/go-hclog" - - "github.com/golang/protobuf/jsonpb" -) - -const ( - // autoConfigFileName is the name of the file that the agent auto-config settings are - // stored in within the data directory - autoConfigFileName = "auto-config.json" - - dummyTrustDomain = "dummytrustdomain" -) - -var ( - pbMarshaler = &jsonpb.Marshaler{ - OrigName: false, - EnumsAsInts: false, - Indent: " ", - EmitDefaults: true, - } - - pbUnmarshaler = &jsonpb.Unmarshaler{ - AllowUnknownFields: false, - } ) // AutoConfig is all the state necessary for being able to parse a configuration // as well as perform the necessary RPCs to perform Agent Auto Configuration. -// -// NOTE: This struct and methods on it are not currently thread/goroutine safe. -// However it doesn't spawn any of its own go routines yet and is used in a -// synchronous fashion. In the future if either of those two conditions change -// then we will need to add some locking here. I am deferring that for now -// to help ease the review of this already large PR. type AutoConfig struct { + sync.Mutex + acConfig Config logger hclog.Logger - certMonitor CertMonitor + cache Cache + waiter *lib.RetryWaiter config *config.RuntimeConfig autoConfigResponse *pbautoconf.AutoConfigResponse autoConfigSource config.Source + + running bool + done chan struct{} + // cancel is used to cancel the entire AutoConfig + // go routine. This is the main field protected + // by the mutex as it being non-nil indicates that + // the go routine has been started and is stoppable. + // note that it doesn't indcate that the go routine + // is currently running. + cancel context.CancelFunc + + // cancelWatches is used to cancel the existing + // cache watches regarding the agents certificate. This is + // mainly only necessary when the Agent token changes. + cancelWatches context.CancelFunc + + // cacheUpdates is the chan used to have the cache + // send us back events + cacheUpdates chan cache.UpdateEvent + + // tokenUpdates is the struct used to receive + // events from the token store when the Agent + // token is updated. + tokenUpdates token.Notifier } // New creates a new AutoConfig object for providing automatic Consul configuration. @@ -69,6 +61,19 @@ func New(config Config) (*AutoConfig, error) { return nil, fmt.Errorf("must provide a config loader") case config.DirectRPC == nil: return nil, fmt.Errorf("must provide a direct RPC delegate") + case config.Cache == nil: + return nil, fmt.Errorf("must provide a cache") + case config.TLSConfigurator == nil: + return nil, fmt.Errorf("must provide a TLS configurator") + case config.Tokens == nil: + return nil, fmt.Errorf("must provide a token store") + } + + if config.FallbackLeeway == 0 { + config.FallbackLeeway = 10 * time.Second + } + if config.FallbackRetry == 0 { + config.FallbackRetry = time.Minute } logger := config.Logger @@ -83,15 +88,16 @@ func New(config Config) (*AutoConfig, error) { } return &AutoConfig{ - acConfig: config, - logger: logger, - certMonitor: config.CertMonitor, + acConfig: config, + logger: logger, }, nil } // ReadConfig will parse the current configuration and inject any // auto-config sources if present into the correct place in the parsing chain. func (ac *AutoConfig) ReadConfig() (*config.RuntimeConfig, error) { + ac.Lock() + defer ac.Unlock() cfg, warnings, err := ac.acConfig.Loader(ac.autoConfigSource) if err != nil { return cfg, err @@ -105,46 +111,6 @@ func (ac *AutoConfig) ReadConfig() (*config.RuntimeConfig, error) { return cfg, nil } -// restorePersistedAutoConfig will attempt to load the persisted auto-config -// settings from the data directory. It returns true either when there was an -// unrecoverable error or when the configuration was successfully loaded from -// disk. Recoverable errors, such as "file not found" are suppressed and this -// method will return false for the first boolean. -func (ac *AutoConfig) restorePersistedAutoConfig() (bool, error) { - if ac.config.DataDir == "" { - // no data directory means we don't have anything to potentially load - return false, nil - } - - path := filepath.Join(ac.config.DataDir, autoConfigFileName) - ac.logger.Debug("attempting to restore any persisted configuration", "path", path) - - content, err := ioutil.ReadFile(path) - if err == nil { - rdr := strings.NewReader(string(content)) - - var resp pbautoconf.AutoConfigResponse - if err := pbUnmarshaler.Unmarshal(rdr, &resp); err != nil { - return false, fmt.Errorf("failed to decode persisted auto-config data: %w", err) - } - - if err := ac.update(&resp); err != nil { - return false, fmt.Errorf("error restoring persisted auto-config response: %w", err) - } - - ac.logger.Info("restored persisted configuration", "path", path) - return true, nil - } - - if !os.IsNotExist(err) { - return true, fmt.Errorf("failed to load %s: %w", path, err) - } - - // ignore non-existence errors as that is an indicator that we haven't - // performed the auto configuration before - return false, nil -} - // InitialConfiguration will perform a one-time RPC request to the configured servers // to retrieve various cluster wide configurations. See the proto/pbautoconf/auto_config.proto // file for a complete reference of what configurations can be applied in this manner. @@ -164,30 +130,49 @@ func (ac *AutoConfig) InitialConfiguration(ctx context.Context) (*config.Runtime ac.config = config } - if !ac.config.AutoConfig.Enabled { - return ac.config, nil - } - - ready, err := ac.restorePersistedAutoConfig() - if err != nil { - return nil, err - } - - if !ready { - ac.logger.Info("retrieving initial agent auto configuration remotely") - if err := ac.getInitialConfiguration(ctx); err != nil { + switch { + case ac.config.AutoConfig.Enabled: + resp, err := ac.readPersistedAutoConfig() + if err != nil { return nil, err } - } - // re-read the configuration now that we have our initial auto-config - config, err := ac.ReadConfig() - if err != nil { - return nil, err - } + if resp == nil { + ac.logger.Info("retrieving initial agent auto configuration remotely") + resp, err = ac.getInitialConfiguration(ctx) + if err != nil { + return nil, err + } + } - ac.config = config - return ac.config, nil + ac.logger.Debug("updating auto-config settings") + if err = ac.recordInitialConfiguration(resp); err != nil { + return nil, err + } + + // re-read the configuration now that we have our initial auto-config + config, err := ac.ReadConfig() + if err != nil { + return nil, err + } + + ac.config = config + return ac.config, nil + case ac.config.AutoEncryptTLS: + certs, err := ac.autoEncryptInitialCerts(ctx) + if err != nil { + return nil, err + } + + if err := ac.setInitialTLSCertificates(certs); err != nil { + return nil, err + } + + ac.logger.Info("automatically upgraded to TLS") + return ac.config, nil + default: + return ac.config, nil + } } // introToken is responsible for determining the correct intro token to use @@ -217,118 +202,45 @@ func (ac *AutoConfig) introToken() (string, error) { return token, nil } -// serverHosts is responsible for taking the list of server addresses and -// resolving any go-discover provider invocations. It will then return a list -// of hosts. These might be hostnames and is expected that DNS resolution may -// be performed after this function runs. Additionally these may contain ports -// so SplitHostPort could also be necessary. -func (ac *AutoConfig) serverHosts() ([]string, error) { - servers := ac.config.AutoConfig.ServerAddresses +// recordInitialConfiguration is responsible for recording the AutoConfigResponse from +// the AutoConfig.InitialConfiguration RPC. It is an all-in-one function to do the following +// * update the Agent token in the token store +func (ac *AutoConfig) recordInitialConfiguration(resp *pbautoconf.AutoConfigResponse) error { + ac.autoConfigResponse = resp - providers := make(map[string]discover.Provider) - for k, v := range discover.Providers { - providers[k] = v + ac.autoConfigSource = config.LiteralSource{ + Name: autoConfigFileName, + Config: translateConfig(resp.Config), } - providers["k8s"] = &discoverk8s.Provider{} - - disco, err := discover.New( - discover.WithUserAgent(lib.UserAgent()), - discover.WithProviders(providers), - ) + // we need to re-read the configuration to determine what the correct ACL + // token to push into the token store is. Any user provided token will override + // any AutoConfig generated token. + config, err := ac.ReadConfig() if err != nil { - return nil, fmt.Errorf("Failed to create go-discover resolver: %w", err) + return fmt.Errorf("failed to fully resolve configuration: %w", err) } - var addrs []string - for _, addr := range servers { - switch { - case strings.Contains(addr, "provider="): - resolved, err := disco.Addrs(addr, ac.logger.StandardLogger(&hclog.StandardLoggerOptions{InferLevels: true})) - if err != nil { - ac.logger.Error("failed to resolve go-discover auto-config servers", "configuration", addr, "err", err) - continue - } + // ignoring the return value which would indicate a change in the token + _ = ac.acConfig.Tokens.UpdateAgentToken(config.ACLAgentToken, token.TokenSourceConfig) - addrs = append(addrs, resolved...) - ac.logger.Debug("discovered auto-config servers", "servers", resolved) - default: - addrs = append(addrs, addr) - } - } - - if len(addrs) == 0 { - return nil, fmt.Errorf("no auto-config server addresses available for use") - } - - return addrs, nil -} - -// resolveHost will take a single host string and convert it to a list of TCPAddrs -// This will process any port in the input as well as looking up the hostname using -// normal DNS resolution. -func (ac *AutoConfig) resolveHost(hostPort string) []net.TCPAddr { - port := ac.config.ServerPort - host, portStr, err := net.SplitHostPort(hostPort) + // extra a structs.SignedResponse from the AutoConfigResponse for use in cache prepopulation + signed, err := extractSignedResponse(resp) if err != nil { - if strings.Contains(err.Error(), "missing port in address") { - host = hostPort - } else { - ac.logger.Warn("error splitting host address into IP and port", "address", hostPort, "error", err) - return nil - } - } else { - port, err = strconv.Atoi(portStr) - if err != nil { - ac.logger.Warn("Parsed port is not an integer", "port", portStr, "error", err) - return nil - } + return fmt.Errorf("failed to extract certificates from the auto-config response: %w", err) } - // resolve the host to a list of IPs - ips, err := net.LookupIP(host) - if err != nil { - ac.logger.Warn("IP resolution failed", "host", host, "error", err) - return nil + // prepopulate the cache + if err = ac.populateCertificateCache(signed); err != nil { + return fmt.Errorf("failed to populate the cache with certificate responses: %w", err) } - var addrs []net.TCPAddr - for _, ip := range ips { - addrs = append(addrs, net.TCPAddr{IP: ip, Port: port}) - } - - return addrs -} - -// recordResponse takes an AutoConfig RPC response records it with the agent -// This will persist the configuration to disk (unless in dev mode running without -// a data dir) and will reload the configuration. -func (ac *AutoConfig) recordResponse(resp *pbautoconf.AutoConfigResponse) error { - serialized, err := pbMarshaler.MarshalToString(resp) - if err != nil { - return fmt.Errorf("failed to encode auto-config response as JSON: %w", err) - } - - if err := ac.update(resp); err != nil { + // update the TLS configurator with the latest certificates + if err := ac.updateTLSFromResponse(resp); err != nil { return err } - // now that we know the configuration is generally fine including TLS certs go ahead and persist it to disk. - if ac.config.DataDir == "" { - ac.logger.Debug("not persisting auto-config settings because there is no data directory") - return nil - } - - path := filepath.Join(ac.config.DataDir, autoConfigFileName) - - err = ioutil.WriteFile(path, []byte(serialized), 0660) - if err != nil { - return fmt.Errorf("failed to write auto-config configurations: %w", err) - } - - ac.logger.Debug("auto-config settings were persisted to disk") - - return nil + return ac.persistAutoConfig(resp) } // getInitialConfigurationOnce will perform full server to TCPAddr resolution and @@ -352,7 +264,7 @@ func (ac *AutoConfig) getInitialConfigurationOnce(ctx context.Context, csr strin var resp pbautoconf.AutoConfigResponse - servers, err := ac.serverHosts() + servers, err := ac.autoConfigHosts() if err != nil { return nil, err } @@ -369,6 +281,7 @@ func (ac *AutoConfig) getInitialConfigurationOnce(ctx context.Context, csr strin ac.logger.Error("AutoConfig.InitialConfiguration RPC failed", "addr", addr.String(), "error", err) continue } + ac.logger.Debug("AutoConfig.InitialConfiguration RPC was successful") // update the Certificate with the private key we generated locally if resp.Certificate != nil { @@ -379,17 +292,17 @@ func (ac *AutoConfig) getInitialConfigurationOnce(ctx context.Context, csr strin } } - return nil, ctx.Err() + return nil, fmt.Errorf("No server successfully responded to the auto-config request") } // getInitialConfiguration implements a loop to retry calls to getInitialConfigurationOnce. // It uses the RetryWaiter on the AutoConfig object to control how often to attempt // the initial configuration process. It is also canceallable by cancelling the provided context. -func (ac *AutoConfig) getInitialConfiguration(ctx context.Context) error { +func (ac *AutoConfig) getInitialConfiguration(ctx context.Context) (*pbautoconf.AutoConfigResponse, error) { // generate a CSR csr, key, err := ac.generateCSR() if err != nil { - return err + return nil, err } // this resets the failures so that we will perform immediate request @@ -397,183 +310,95 @@ func (ac *AutoConfig) getInitialConfiguration(ctx context.Context) error { for { select { case <-wait: - resp, err := ac.getInitialConfigurationOnce(ctx, csr, key) - if resp != nil { - return ac.recordResponse(resp) + if resp, err := ac.getInitialConfigurationOnce(ctx, csr, key); err == nil && resp != nil { + return resp, nil } else if err != nil { ac.logger.Error(err.Error()) } else { - ac.logger.Error("No error returned when fetching the initial auto-configuration but no response was either") + ac.logger.Error("No error returned when fetching configuration from the servers but no response was either") } + wait = ac.acConfig.Waiter.Failed() case <-ctx.Done(): ac.logger.Info("interrupted during initial auto configuration", "err", ctx.Err()) - return ctx.Err() + return nil, ctx.Err() } } } -// generateCSR will generate a CSR for an Agent certificate. This should -// be sent along with the AutoConfig.InitialConfiguration RPC. The generated -// CSR does NOT have a real trust domain as when generating this we do -// not yet have the CA roots. The server will update the trust domain -// for us though. -func (ac *AutoConfig) generateCSR() (csr string, key string, err error) { - // We don't provide the correct host here, because we don't know any - // better at this point. Apart from the domain, we would need the - // ClusterID, which we don't have. This is why we go with - // dummyTrustDomain the first time. Subsequent CSRs will have the - // correct TrustDomain. - id := &connect.SpiffeIDAgent{ - // will be replaced - Host: dummyTrustDomain, - Datacenter: ac.config.Datacenter, - Agent: ac.config.NodeName, - } - - caConfig, err := ac.config.ConnectCAConfiguration() - if err != nil { - return "", "", fmt.Errorf("Cannot generate CSR: %w", err) - } - - conf, err := caConfig.GetCommonConfig() - if err != nil { - return "", "", fmt.Errorf("Failed to load common CA configuration: %w", err) - } - - if conf.PrivateKeyType == "" { - conf.PrivateKeyType = connect.DefaultPrivateKeyType - } - if conf.PrivateKeyBits == 0 { - conf.PrivateKeyBits = connect.DefaultPrivateKeyBits - } - - // Create a new private key - pk, pkPEM, err := connect.GeneratePrivateKeyWithConfig(conf.PrivateKeyType, conf.PrivateKeyBits) - if err != nil { - return "", "", fmt.Errorf("Failed to generate private key: %w", err) - } - - dnsNames := append([]string{"localhost"}, ac.config.AutoConfig.DNSSANs...) - ipAddresses := append([]net.IP{net.ParseIP("127.0.0.1"), net.ParseIP("::")}, ac.config.AutoConfig.IPSANs...) - - // Create a CSR. - // - // The Common Name includes the dummy trust domain for now but Server will - // override this when it is signed anyway so it's OK. - cn := connect.AgentCN(ac.config.NodeName, dummyTrustDomain) - csr, err = connect.CreateCSR(id, cn, pk, dnsNames, ipAddresses) - if err != nil { - return "", "", err - } - - return csr, pkPEM, nil -} - -// update will take an AutoConfigResponse and do all things necessary -// to restore those settings. This currently involves updating the -// config data to be used during a call to ReadConfig, updating the -// tls Configurator and prepopulating the cache. -func (ac *AutoConfig) update(resp *pbautoconf.AutoConfigResponse) error { - ac.autoConfigResponse = resp - - ac.autoConfigSource = config.LiteralSource{ - Name: autoConfigFileName, - Config: translateConfig(resp.Config), - } - - if err := ac.updateTLSFromResponse(resp); err != nil { - return err - } - - return nil -} - -// updateTLSFromResponse will update the TLS certificate and roots in the shared -// TLS configurator. -func (ac *AutoConfig) updateTLSFromResponse(resp *pbautoconf.AutoConfigResponse) error { - if ac.certMonitor == nil { - return nil - } - - roots, err := translateCARootsToStructs(resp.CARoots) - if err != nil { - return err - } - - cert, err := translateIssuedCertToStructs(resp.Certificate) - if err != nil { - return err - } - - update := &structs.SignedResponse{ - IssuedCert: *cert, - ConnectCARoots: *roots, - ManualCARoots: resp.ExtraCACertificates, - } - - if resp.Config != nil && resp.Config.TLS != nil { - update.VerifyServerHostname = resp.Config.TLS.VerifyServerHostname - } - - if err := ac.certMonitor.Update(update); err != nil { - return fmt.Errorf("failed to update the certificate monitor: %w", err) - } - - return nil -} - func (ac *AutoConfig) Start(ctx context.Context) error { - if ac.certMonitor == nil { + ac.Lock() + defer ac.Unlock() + + if !ac.config.AutoConfig.Enabled && !ac.config.AutoEncryptTLS { return nil } - if !ac.config.AutoConfig.Enabled { - return nil + if ac.running || ac.cancel != nil { + return fmt.Errorf("AutoConfig is already running") } - _, err := ac.certMonitor.Start(ctx) - return err + // create the top level context to control the go + // routine executing the `run` method + ctx, cancel := context.WithCancel(ctx) + + // create the channel to get cache update events through + // really we should only ever get 10 updates + ac.cacheUpdates = make(chan cache.UpdateEvent, 10) + + // setup the cache watches + cancelCertWatches, err := ac.setupCertificateCacheWatches(ctx) + if err != nil { + cancel() + return fmt.Errorf("error setting up cache watches: %w", err) + } + + // start the token update notifier + ac.tokenUpdates = ac.acConfig.Tokens.Notify(token.TokenKindAgent) + + // store the cancel funcs + ac.cancel = cancel + ac.cancelWatches = cancelCertWatches + + ac.running = true + ac.done = make(chan struct{}) + go ac.run(ctx, ac.done) + + ac.logger.Info("auto-config started") + return nil +} + +func (ac *AutoConfig) Done() <-chan struct{} { + ac.Lock() + defer ac.Unlock() + + if ac.done != nil { + return ac.done + } + + // return a closed channel to indicate that we are already done + done := make(chan struct{}) + close(done) + return done +} + +func (ac *AutoConfig) IsRunning() bool { + ac.Lock() + defer ac.Unlock() + return ac.running } func (ac *AutoConfig) Stop() bool { - if ac.certMonitor == nil { + ac.Lock() + defer ac.Unlock() + + if !ac.running { return false } - if !ac.config.AutoConfig.Enabled { - return false + if ac.cancel != nil { + ac.cancel() } - return ac.certMonitor.Stop() -} - -func (ac *AutoConfig) FallbackTLS(ctx context.Context) (*structs.SignedResponse, error) { - // generate a CSR - csr, key, err := ac.generateCSR() - if err != nil { - return nil, err - } - - resp, err := ac.getInitialConfigurationOnce(ctx, csr, key) - if err != nil { - return nil, err - } - - return extractSignedResponse(resp) -} - -func (ac *AutoConfig) RecordUpdatedCerts(resp *structs.SignedResponse) error { - var err error - ac.autoConfigResponse.ExtraCACertificates = resp.ManualCARoots - ac.autoConfigResponse.CARoots, err = translateCARootsToProtobuf(&resp.ConnectCARoots) - if err != nil { - return err - } - ac.autoConfigResponse.Certificate, err = translateIssuedCertToProtobuf(&resp.IssuedCert) - if err != nil { - return err - } - - return ac.recordResponse(ac.autoConfigResponse) + return true } diff --git a/agent/auto-config/auto_config_test.go b/agent/auto-config/auto_config_test.go index a421a45b7..e3469862a 100644 --- a/agent/auto-config/auto_config_test.go +++ b/agent/auto-config/auto_config_test.go @@ -5,115 +5,146 @@ import ( "fmt" "io/ioutil" "net" + "os" "path/filepath" - "strings" + "sync" "testing" "time" - "github.com/gogo/protobuf/types" + "github.com/hashicorp/consul/agent/cache" + cachetype "github.com/hashicorp/consul/agent/cache-types" "github.com/hashicorp/consul/agent/config" + "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/lib" "github.com/hashicorp/consul/proto/pbautoconf" "github.com/hashicorp/consul/proto/pbconfig" - "github.com/hashicorp/consul/proto/pbconnect" "github.com/hashicorp/consul/sdk/testutil" + "github.com/hashicorp/consul/sdk/testutil/retry" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" ) -type mockDirectRPC struct { - mock.Mock +type configLoader struct { + opts config.BuilderOpts } -func (m *mockDirectRPC) RPC(dc string, node string, addr net.Addr, method string, args interface{}, reply interface{}) error { - var retValues mock.Arguments - if method == "AutoConfig.InitialConfiguration" { - req := args.(*pbautoconf.AutoConfigRequest) - csr := req.CSR - req.CSR = "" - retValues = m.Called(dc, node, addr, method, args, reply) - req.CSR = csr - } else { - retValues = m.Called(dc, node, addr, method, args, reply) - } +func (c *configLoader) Load(source config.Source) (*config.RuntimeConfig, []string, error) { + return config.Load(c.opts, source) +} - switch ret := retValues.Get(0).(type) { - case error: - return ret - case func(interface{}): - ret(reply) - return nil +func (c *configLoader) addConfigHCL(cfg string) { + c.opts.HCL = append(c.opts.HCL, cfg) +} + +func requireChanNotReady(t *testing.T, ch <-chan struct{}) { + select { + case <-ch: + require.Fail(t, "chan is ready when it shouldn't be") default: - return fmt.Errorf("This should not happen, update mock direct rpc expectations") + return } } -type mockCertMonitor struct { - mock.Mock -} - -func (m *mockCertMonitor) Start(_ context.Context) (<-chan struct{}, error) { - ret := m.Called() - ch := ret.Get(0).(<-chan struct{}) - return ch, ret.Error(1) -} - -func (m *mockCertMonitor) Stop() bool { - return m.Called().Bool(0) -} - -func (m *mockCertMonitor) Update(resp *structs.SignedResponse) error { - var privKey string - // filter out real certificates as we cannot predict their values - if resp != nil && strings.HasPrefix(resp.IssuedCert.PrivateKeyPEM, "-----BEGIN") { - privKey = resp.IssuedCert.PrivateKeyPEM - resp.IssuedCert.PrivateKeyPEM = "" +func requireChanReady(t *testing.T, ch <-chan struct{}) { + select { + case <-ch: + return + default: + require.Fail(t, "chan is not ready when it should be") } - err := m.Called(resp).Error(0) - if privKey != "" { - resp.IssuedCert.PrivateKeyPEM = privKey +} + +func waitForChan(timer *time.Timer, ch <-chan struct{}) bool { + select { + case <-timer.C: + return false + case <-ch: + return true } - return err +} + +func waitForChans(timeout time.Duration, chans ...<-chan struct{}) bool { + timer := time.NewTimer(timeout) + defer timer.Stop() + + for _, ch := range chans { + if !waitForChan(timer, ch) { + return false + } + } + return true } func TestNew(t *testing.T) { type testCase struct { - config Config + modify func(*Config) err string validate func(t *testing.T, ac *AutoConfig) } cases := map[string]testCase{ "no-direct-rpc": { - config: Config{ - Loader: func(source config.Source) (cfg *config.RuntimeConfig, warnings []string, err error) { - return nil, nil, nil - }, + modify: func(c *Config) { + c.DirectRPC = nil }, err: "must provide a direct RPC delegate", }, - "no-config-loader": { + modify: func(c *Config) { + c.Loader = nil + }, err: "must provide a config loader", }, - "ok": { - config: Config{ - DirectRPC: &mockDirectRPC{}, - Loader: func(source config.Source) (cfg *config.RuntimeConfig, warnings []string, err error) { - return nil, nil, nil - }, + "no-cache": { + modify: func(c *Config) { + c.Cache = nil }, + err: "must provide a cache", + }, + "no-tls-configurator": { + modify: func(c *Config) { + c.TLSConfigurator = nil + }, + err: "must provide a TLS configurator", + }, + "no-tokens": { + modify: func(c *Config) { + c.Tokens = nil + }, + err: "must provide a token store", + }, + "ok": { validate: func(t *testing.T, ac *AutoConfig) { t.Helper() require.NotNil(t, ac.logger) + require.NotNil(t, ac.acConfig.Waiter) + require.Equal(t, time.Minute, ac.acConfig.FallbackRetry) + require.Equal(t, 10*time.Second, ac.acConfig.FallbackLeeway) }, }, } for name, tcase := range cases { t.Run(name, func(t *testing.T) { - ac, err := New(tcase.config) + cfg := Config{ + Loader: func(source config.Source) (cfg *config.RuntimeConfig, warnings []string, err error) { + return nil, nil, nil + }, + DirectRPC: newMockDirectRPC(t), + Tokens: newMockTokenStore(t), + Cache: newMockCache(t), + TLSConfigurator: newMockTLSConfigurator(t), + ServerProvider: newMockServerProvider(t), + } + + if tcase.modify != nil { + tcase.modify(&cfg) + } + + ac, err := New(cfg) if tcase.err != "" { testutil.RequireErrorContains(t, err, tcase.err) } else { @@ -157,32 +188,34 @@ func TestReadConfig(t *testing.T) { require.Same(t, ac.config, cfg) } -func setupRuntimeConfig(t *testing.T) *config.RuntimeConfig { +func setupRuntimeConfig(t *testing.T) *configLoader { t.Helper() dataDir := testutil.TempDir(t, "auto-config") - rtConfig := &config.RuntimeConfig{ - DataDir: dataDir, - Datacenter: "dc1", - NodeName: "autoconf", - BindAddr: &net.IPAddr{IP: net.ParseIP("127.0.0.1")}, + + opts := config.BuilderOpts{ + Config: config.Config{ + DataDir: &dataDir, + Datacenter: stringPointer("dc1"), + NodeName: stringPointer("autoconf"), + BindAddr: stringPointer("127.0.0.1"), + }, } - return rtConfig + return &configLoader{opts: opts} } func TestInitialConfiguration_disabled(t *testing.T) { - rtConfig := setupRuntimeConfig(t) + loader := setupRuntimeConfig(t) + loader.addConfigHCL(` + primary_datacenter = "primary" + auto_config = { + enabled = false + } + `) + + conf := newMockedConfig(t).Config + conf.Loader = loader.Load - directRPC := new(mockDirectRPC) - directRPC.Test(t) - conf := Config{ - DirectRPC: directRPC, - Loader: func(source config.Source) (*config.RuntimeConfig, []string, error) { - rtConfig.PrimaryDatacenter = "primary" - rtConfig.AutoConfig.Enabled = false - return rtConfig, nil, nil - }, - } ac, err := New(conf) require.NoError(t, err) require.NotNil(t, ac) @@ -191,38 +224,34 @@ func TestInitialConfiguration_disabled(t *testing.T) { require.NoError(t, err) require.NotNil(t, cfg) require.Equal(t, "primary", cfg.PrimaryDatacenter) - require.NoFileExists(t, filepath.Join(rtConfig.DataDir, autoConfigFileName)) - - // ensure no RPC was made - directRPC.AssertExpectations(t) + require.NoFileExists(t, filepath.Join(*loader.opts.Config.DataDir, autoConfigFileName)) } func TestInitialConfiguration_cancelled(t *testing.T) { - rtConfig := setupRuntimeConfig(t) + mcfg := newMockedConfig(t) + + loader := setupRuntimeConfig(t) + loader.addConfigHCL(` + primary_datacenter = "primary" + auto_config = { + enabled = true + intro_token = "blarg" + server_addresses = ["127.0.0.1:8300"] + } + verify_outgoing = true + `) + mcfg.Config.Loader = loader.Load - directRPC := new(mockDirectRPC) - directRPC.Test(t) expectedRequest := pbautoconf.AutoConfigRequest{ Datacenter: "dc1", Node: "autoconf", JWT: "blarg", } - directRPC.On("RPC", "dc1", "autoconf", &net.TCPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 8300}, "AutoConfig.InitialConfiguration", &expectedRequest, mock.Anything).Return(fmt.Errorf("injected error")).Times(0) - conf := Config{ - DirectRPC: directRPC, - Loader: func(source config.Source) (*config.RuntimeConfig, []string, error) { - rtConfig.PrimaryDatacenter = "primary" - rtConfig.AutoConfig = config.AutoConfig{ - Enabled: true, - IntroToken: "blarg", - ServerAddresses: []string{"127.0.0.1:8300"}, - } - rtConfig.VerifyOutgoing = true - return rtConfig, nil, nil - }, - } - ac, err := New(conf) + mcfg.directRPC.On("RPC", "dc1", "autoconf", &net.TCPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 8300}, "AutoConfig.InitialConfiguration", &expectedRequest, mock.Anything).Return(fmt.Errorf("injected error")).Times(0) + mcfg.serverProvider.On("FindLANServer").Return(nil).Times(0) + + ac, err := New(mcfg.Config) require.NoError(t, err) require.NotNil(t, ac) @@ -232,110 +261,55 @@ func TestInitialConfiguration_cancelled(t *testing.T) { cfg, err := ac.InitialConfiguration(ctx) testutil.RequireErrorContains(t, err, context.DeadlineExceeded.Error()) require.Nil(t, cfg) - - // ensure no RPC was made - directRPC.AssertExpectations(t) } func TestInitialConfiguration_restored(t *testing.T) { - rtConfig := setupRuntimeConfig(t) + mcfg := newMockedConfig(t) + + loader := setupRuntimeConfig(t) + loader.addConfigHCL(` + auto_config = { + enabled = true + intro_token ="blarg" + server_addresses = ["127.0.0.1:8300"] + } + verify_outgoing = true + `) + + mcfg.Config.Loader = loader.Load + + indexedRoots, cert, extraCACerts := mcfg.setupInitialTLS(t, "autoconf", "dc1", "secret") // persist an auto config response to the data dir where it is expected - persistedFile := filepath.Join(rtConfig.DataDir, autoConfigFileName) + persistedFile := filepath.Join(*loader.opts.Config.DataDir, autoConfigFileName) response := &pbautoconf.AutoConfigResponse{ Config: &pbconfig.Config{ PrimaryDatacenter: "primary", TLS: &pbconfig.TLS{ VerifyServerHostname: true, }, - }, - CARoots: &pbconnect.CARoots{ - ActiveRootID: "active", - TrustDomain: "trust", - Roots: []*pbconnect.CARoot{ - { - ID: "active", - Name: "foo", - SerialNumber: 42, - SigningKeyID: "blarg", - NotBefore: &types.Timestamp{Seconds: 5000, Nanos: 100}, - NotAfter: &types.Timestamp{Seconds: 10000, Nanos: 9009}, - RootCert: "not an actual cert", - Active: true, + ACL: &pbconfig.ACL{ + Tokens: &pbconfig.ACLTokens{ + Agent: "secret", }, }, }, - Certificate: &pbconnect.IssuedCert{ - SerialNumber: "1234", - CertPEM: "not a cert", - PrivateKeyPEM: "private", - Agent: "foo", - AgentURI: "spiffe://blarg/agent/client/dc/foo/id/foo", - ValidAfter: &types.Timestamp{Seconds: 6000}, - ValidBefore: &types.Timestamp{Seconds: 7000}, - }, - ExtraCACertificates: []string{"blarg"}, + CARoots: mustTranslateCARootsToProtobuf(t, indexedRoots), + Certificate: mustTranslateIssuedCertToProtobuf(t, cert), + ExtraCACertificates: extraCACerts, } data, err := pbMarshaler.MarshalToString(response) require.NoError(t, err) require.NoError(t, ioutil.WriteFile(persistedFile, []byte(data), 0600)) - directRPC := new(mockDirectRPC) - directRPC.Test(t) + // recording the initial configuration even when restoring is going to update + // the agent token in the token store + mcfg.tokens.On("UpdateAgentToken", "secret", token.TokenSourceConfig).Return(true).Once() - // setup the mock certificate monitor to ensure that the initial state gets - // updated appropriately during config restoration. - certMon := new(mockCertMonitor) - certMon.Test(t) - certMon.On("Update", &structs.SignedResponse{ - IssuedCert: structs.IssuedCert{ - SerialNumber: "1234", - CertPEM: "not a cert", - PrivateKeyPEM: "private", - Agent: "foo", - AgentURI: "spiffe://blarg/agent/client/dc/foo/id/foo", - ValidAfter: time.Unix(6000, 0), - ValidBefore: time.Unix(7000, 0), - }, - ConnectCARoots: structs.IndexedCARoots{ - ActiveRootID: "active", - TrustDomain: "trust", - Roots: []*structs.CARoot{ - { - ID: "active", - Name: "foo", - SerialNumber: 42, - SigningKeyID: "blarg", - NotBefore: time.Unix(5000, 100), - NotAfter: time.Unix(10000, 9009), - RootCert: "not an actual cert", - Active: true, - // the decoding process doesn't leave this nil - IntermediateCerts: []string{}, - }, - }, - }, - ManualCARoots: []string{"blarg"}, - VerifyServerHostname: true, - }).Return(nil).Once() + // prepopulation is going to grab the token to populate the correct cache key + mcfg.tokens.On("AgentToken").Return("secret").Times(0) - conf := Config{ - DirectRPC: directRPC, - Loader: func(source config.Source) (*config.RuntimeConfig, []string, error) { - if err := setPrimaryDatacenterFromSource(rtConfig, source); err != nil { - return nil, nil, err - } - rtConfig.AutoConfig = config.AutoConfig{ - Enabled: true, - IntroToken: "blarg", - ServerAddresses: []string{"127.0.0.1:8300"}, - } - rtConfig.VerifyOutgoing = true - return rtConfig, nil, nil - }, - CertMonitor: certMon, - } - ac, err := New(conf) + ac, err := New(mcfg.Config) require.NoError(t, err) require.NotNil(t, ac) @@ -343,64 +317,51 @@ func TestInitialConfiguration_restored(t *testing.T) { require.NoError(t, err, data) require.NotNil(t, cfg) require.Equal(t, "primary", cfg.PrimaryDatacenter) - - // ensure no RPC was made - directRPC.AssertExpectations(t) - certMon.AssertExpectations(t) -} - -func setPrimaryDatacenterFromSource(rtConfig *config.RuntimeConfig, source config.Source) error { - if source != nil { - cfg, _, err := source.Parse() - if err != nil { - return err - } - rtConfig.PrimaryDatacenter = *cfg.PrimaryDatacenter - } - return nil } func TestInitialConfiguration_success(t *testing.T) { - rtConfig := setupRuntimeConfig(t) + mcfg := newMockedConfig(t) + loader := setupRuntimeConfig(t) + loader.addConfigHCL(` + auto_config = { + enabled = true + intro_token ="blarg" + server_addresses = ["127.0.0.1:8300"] + } + verify_outgoing = true + `) + mcfg.Config.Loader = loader.Load - directRPC := new(mockDirectRPC) - directRPC.Test(t) + indexedRoots, cert, extraCerts := mcfg.setupInitialTLS(t, "autoconf", "dc1", "secret") - populateResponse := func(val interface{}) { - resp, ok := val.(*pbautoconf.AutoConfigResponse) + // this gets called when InitialConfiguration is invoked to record the token from the + // auto-config response + mcfg.tokens.On("UpdateAgentToken", "secret", token.TokenSourceConfig).Return(true).Once() + + // prepopulation is going to grab the token to populate the correct cache key + mcfg.tokens.On("AgentToken").Return("secret").Times(0) + + // no server provider + mcfg.serverProvider.On("FindLANServer").Return(nil).Times(0) + + populateResponse := func(args mock.Arguments) { + resp, ok := args.Get(5).(*pbautoconf.AutoConfigResponse) require.True(t, ok) resp.Config = &pbconfig.Config{ PrimaryDatacenter: "primary", TLS: &pbconfig.TLS{ VerifyServerHostname: true, }, - } - - resp.CARoots = &pbconnect.CARoots{ - ActiveRootID: "active", - TrustDomain: "trust", - Roots: []*pbconnect.CARoot{ - { - ID: "active", - Name: "foo", - SerialNumber: 42, - SigningKeyID: "blarg", - NotBefore: &types.Timestamp{Seconds: 5000, Nanos: 100}, - NotAfter: &types.Timestamp{Seconds: 10000, Nanos: 9009}, - RootCert: "not an actual cert", - Active: true, + ACL: &pbconfig.ACL{ + Tokens: &pbconfig.ACLTokens{ + Agent: "secret", }, }, } - resp.Certificate = &pbconnect.IssuedCert{ - SerialNumber: "1234", - CertPEM: "not a cert", - Agent: "foo", - AgentURI: "spiffe://blarg/agent/client/dc/foo/id/foo", - ValidAfter: &types.Timestamp{Seconds: 6000}, - ValidBefore: &types.Timestamp{Seconds: 7000}, - } - resp.ExtraCACertificates = []string{"blarg"} + + resp.CARoots = mustTranslateCARootsToProtobuf(t, indexedRoots) + resp.Certificate = mustTranslateIssuedCertToProtobuf(t, cert) + resp.ExtraCACertificates = extraCerts } expectedRequest := pbautoconf.AutoConfigRequest{ @@ -409,66 +370,16 @@ func TestInitialConfiguration_success(t *testing.T) { JWT: "blarg", } - directRPC.On( + mcfg.directRPC.On( "RPC", "dc1", "autoconf", &net.TCPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 8300}, "AutoConfig.InitialConfiguration", &expectedRequest, - &pbautoconf.AutoConfigResponse{}).Return(populateResponse) + &pbautoconf.AutoConfigResponse{}).Return(nil).Run(populateResponse) - // setup the mock certificate monitor to ensure that the initial state gets - // updated appropriately during config restoration. - certMon := new(mockCertMonitor) - certMon.Test(t) - certMon.On("Update", &structs.SignedResponse{ - IssuedCert: structs.IssuedCert{ - SerialNumber: "1234", - CertPEM: "not a cert", - PrivateKeyPEM: "", // the mock - Agent: "foo", - AgentURI: "spiffe://blarg/agent/client/dc/foo/id/foo", - ValidAfter: time.Unix(6000, 0), - ValidBefore: time.Unix(7000, 0), - }, - ConnectCARoots: structs.IndexedCARoots{ - ActiveRootID: "active", - TrustDomain: "trust", - Roots: []*structs.CARoot{ - { - ID: "active", - Name: "foo", - SerialNumber: 42, - SigningKeyID: "blarg", - NotBefore: time.Unix(5000, 100), - NotAfter: time.Unix(10000, 9009), - RootCert: "not an actual cert", - Active: true, - }, - }, - }, - ManualCARoots: []string{"blarg"}, - VerifyServerHostname: true, - }).Return(nil).Once() - - conf := Config{ - DirectRPC: directRPC, - Loader: func(source config.Source) (*config.RuntimeConfig, []string, error) { - if err := setPrimaryDatacenterFromSource(rtConfig, source); err != nil { - return nil, nil, err - } - rtConfig.AutoConfig = config.AutoConfig{ - Enabled: true, - IntroToken: "blarg", - ServerAddresses: []string{"127.0.0.1:8300"}, - } - rtConfig.VerifyOutgoing = true - return rtConfig, nil, nil - }, - CertMonitor: certMon, - } - ac, err := New(conf) + ac, err := New(mcfg.Config) require.NoError(t, err) require.NotNil(t, ac) @@ -478,26 +389,61 @@ func TestInitialConfiguration_success(t *testing.T) { require.Equal(t, "primary", cfg.PrimaryDatacenter) // the file was written to. - persistedFile := filepath.Join(rtConfig.DataDir, autoConfigFileName) + persistedFile := filepath.Join(*loader.opts.Config.DataDir, autoConfigFileName) require.FileExists(t, persistedFile) - - // ensure no RPC was made - directRPC.AssertExpectations(t) - certMon.AssertExpectations(t) } func TestInitialConfiguration_retries(t *testing.T) { - rtConfig := setupRuntimeConfig(t) + mcfg := newMockedConfig(t) + loader := setupRuntimeConfig(t) + loader.addConfigHCL(` + auto_config = { + enabled = true + intro_token ="blarg" + server_addresses = [ + "198.18.0.1:8300", + "198.18.0.2:8398", + "198.18.0.3:8399", + "127.0.0.1:1234" + ] + } + verify_outgoing = true + `) + mcfg.Config.Loader = loader.Load - directRPC := new(mockDirectRPC) - directRPC.Test(t) + // reduce the retry wait times to make this test run faster + mcfg.Config.Waiter = lib.NewRetryWaiter(2, 0, 1*time.Millisecond, nil) - populateResponse := func(val interface{}) { - resp, ok := val.(*pbautoconf.AutoConfigResponse) + indexedRoots, cert, extraCerts := mcfg.setupInitialTLS(t, "autoconf", "dc1", "secret") + + // this gets called when InitialConfiguration is invoked to record the token from the + // auto-config response + mcfg.tokens.On("UpdateAgentToken", "secret", token.TokenSourceConfig).Return(true).Once() + + // prepopulation is going to grab the token to populate the correct cache key + mcfg.tokens.On("AgentToken").Return("secret").Times(0) + + // no server provider + mcfg.serverProvider.On("FindLANServer").Return(nil).Times(0) + + populateResponse := func(args mock.Arguments) { + resp, ok := args.Get(5).(*pbautoconf.AutoConfigResponse) require.True(t, ok) resp.Config = &pbconfig.Config{ PrimaryDatacenter: "primary", + TLS: &pbconfig.TLS{ + VerifyServerHostname: true, + }, + ACL: &pbconfig.ACL{ + Tokens: &pbconfig.ACLTokens{ + Agent: "secret", + }, + }, } + + resp.CARoots = mustTranslateCARootsToProtobuf(t, indexedRoots) + resp.Certificate = mustTranslateIssuedCertToProtobuf(t, cert) + resp.ExtraCACertificates = extraCerts } expectedRequest := pbautoconf.AutoConfigRequest{ @@ -509,7 +455,7 @@ func TestInitialConfiguration_retries(t *testing.T) { // basically the 198.18.0.* addresses should fail indefinitely. the first time through the // outer loop we inject a failure for the DNS resolution of localhost to 127.0.0.1. Then // the second time through the outer loop we allow the localhost one to work. - directRPC.On( + mcfg.directRPC.On( "RPC", "dc1", "autoconf", @@ -517,7 +463,7 @@ func TestInitialConfiguration_retries(t *testing.T) { "AutoConfig.InitialConfiguration", &expectedRequest, &pbautoconf.AutoConfigResponse{}).Return(fmt.Errorf("injected failure")).Times(0) - directRPC.On( + mcfg.directRPC.On( "RPC", "dc1", "autoconf", @@ -525,7 +471,7 @@ func TestInitialConfiguration_retries(t *testing.T) { "AutoConfig.InitialConfiguration", &expectedRequest, &pbautoconf.AutoConfigResponse{}).Return(fmt.Errorf("injected failure")).Times(0) - directRPC.On( + mcfg.directRPC.On( "RPC", "dc1", "autoconf", @@ -533,7 +479,7 @@ func TestInitialConfiguration_retries(t *testing.T) { "AutoConfig.InitialConfiguration", &expectedRequest, &pbautoconf.AutoConfigResponse{}).Return(fmt.Errorf("injected failure")).Times(0) - directRPC.On( + mcfg.directRPC.On( "RPC", "dc1", "autoconf", @@ -541,37 +487,16 @@ func TestInitialConfiguration_retries(t *testing.T) { "AutoConfig.InitialConfiguration", &expectedRequest, &pbautoconf.AutoConfigResponse{}).Return(fmt.Errorf("injected failure")).Once() - directRPC.On( + mcfg.directRPC.On( "RPC", "dc1", "autoconf", &net.TCPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 1234}, "AutoConfig.InitialConfiguration", &expectedRequest, - &pbautoconf.AutoConfigResponse{}).Return(populateResponse) + &pbautoconf.AutoConfigResponse{}).Return(nil).Run(populateResponse).Once() - conf := Config{ - DirectRPC: directRPC, - Loader: func(source config.Source) (*config.RuntimeConfig, []string, error) { - if err := setPrimaryDatacenterFromSource(rtConfig, source); err != nil { - return nil, nil, err - } - rtConfig.AutoConfig = config.AutoConfig{ - Enabled: true, - IntroToken: "blarg", - ServerAddresses: []string{ - "198.18.0.1:8300", - "198.18.0.2:8398", - "198.18.0.3:8399", - "127.0.0.1:1234", - }, - } - rtConfig.VerifyOutgoing = true - return rtConfig, nil, nil - }, - Waiter: lib.NewRetryWaiter(2, 0, 1*time.Millisecond, nil), - } - ac, err := New(conf) + ac, err := New(mcfg.Config) require.NoError(t, err) require.NotNil(t, ac) @@ -581,102 +506,548 @@ func TestInitialConfiguration_retries(t *testing.T) { require.Equal(t, "primary", cfg.PrimaryDatacenter) // the file was written to. - persistedFile := filepath.Join(rtConfig.DataDir, autoConfigFileName) + persistedFile := filepath.Join(*loader.opts.Config.DataDir, autoConfigFileName) require.FileExists(t, persistedFile) - - // ensure no RPC was made - directRPC.AssertExpectations(t) } -func TestAutoConfig_StartStop(t *testing.T) { - // currently the only thing running for autoconf is just the cert monitor - // so this test only needs to ensure that the cert monitor is started and - // stopped and not that anything with regards to running the cert monitor - // actually work. Those are tested in the cert-monitor package. +func TestGoRoutineManagement(t *testing.T) { + mcfg := newMockedConfig(t) + loader := setupRuntimeConfig(t) + loader.addConfigHCL(` + auto_config = { + enabled = true + intro_token ="blarg" + server_addresses = ["127.0.0.1:8300"] + } + verify_outgoing = true + `) + mcfg.Config.Loader = loader.Load - rtConfig := setupRuntimeConfig(t) + // prepopulation is going to grab the token to populate the correct cache key + mcfg.tokens.On("AgentToken").Return("secret").Times(0) - directRPC := &mockDirectRPC{} - directRPC.Test(t) - certMon := &mockCertMonitor{} - certMon.Test(t) + ac, err := New(mcfg.Config) + require.NoError(t, err) - certMon.On("Start").Return((<-chan struct{})(nil), nil).Once() - certMon.On("Stop").Return(true).Once() + // priming the config so some other requests will work properly that need to + // read from the configuration. We are going to avoid doing InitialConfiguration + // for this test as we only are really concerned with the go routine management + _, err = ac.ReadConfig() + require.NoError(t, err) - conf := Config{ - DirectRPC: directRPC, - Loader: func(source config.Source) (*config.RuntimeConfig, []string, error) { - rtConfig.AutoConfig = config.AutoConfig{ - Enabled: true, - IntroToken: "blarg", - ServerAddresses: []string{ - "198.18.0.1", - "198.18.0.2:8398", - "198.18.0.3:8399", - "127.0.0.1:1234", - }, - } - rtConfig.VerifyOutgoing = true - return rtConfig, nil, nil - }, - CertMonitor: certMon, + var rootsCtx context.Context + var leafCtx context.Context + var ctxLock sync.Mutex + + rootsReq := ac.caRootsRequest() + mcfg.cache.On("Notify", + mock.Anything, + cachetype.ConnectCARootName, + &rootsReq, + rootsWatchID, + mock.Anything, + ).Return(nil).Times(2).Run(func(args mock.Arguments) { + ctxLock.Lock() + rootsCtx = args.Get(0).(context.Context) + ctxLock.Unlock() + }) + + leafReq := ac.leafCertRequest() + mcfg.cache.On("Notify", + mock.Anything, + cachetype.ConnectCALeafName, + &leafReq, + leafWatchID, + mock.Anything, + ).Return(nil).Times(2).Run(func(args mock.Arguments) { + ctxLock.Lock() + leafCtx = args.Get(0).(context.Context) + ctxLock.Unlock() + }) + + // we will start/stop things twice + mcfg.tokens.On("Notify", token.TokenKindAgent).Return(token.Notifier{}).Times(2) + mcfg.tokens.On("StopNotify", token.Notifier{}).Times(2) + + mcfg.tlsCfg.On("AutoEncryptCertNotAfter").Return(time.Now().Add(10 * time.Minute)).Times(0) + + // ensure that auto-config isn't running + require.False(t, ac.IsRunning()) + + // ensure that nothing bad happens and that it reports as stopped + require.False(t, ac.Stop()) + + // ensure that the Done chan also reports that things are not running + // in other words the chan is immediately selectable + requireChanReady(t, ac.Done()) + + // start auto-config + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + require.NoError(t, ac.Start(ctx)) + + waitForContexts := func() bool { + ctxLock.Lock() + defer ctxLock.Unlock() + return !(rootsCtx == nil || leafCtx == nil) } - ac, err := New(conf) - require.NoError(t, err) - require.NotNil(t, ac) - cfg, err := ac.ReadConfig() - require.NoError(t, err) - ac.config = cfg - require.NoError(t, ac.Start(context.Background())) + // wait for the cache notifications to get started + require.Eventually(t, waitForContexts, 100*time.Millisecond, 10*time.Millisecond) + + // hold onto the Done chan to test for the go routine exiting + done := ac.Done() + + // ensure we report as running + require.True(t, ac.IsRunning()) + + // ensure the done chan is not selectable yet + requireChanNotReady(t, done) + + // ensure we error if we attempt to start again + err = ac.Start(ctx) + testutil.RequireErrorContains(t, err, "AutoConfig is already running") + + // now stop things - it should return true indicating that it was running + // when we attempted to stop it. require.True(t, ac.Stop()) - certMon.AssertExpectations(t) - directRPC.AssertExpectations(t) + // ensure that the go routine shuts down - it will close the done chan. Also it should cancel + // the cache watches by cancelling the context it passed into the Notify call. + require.True(t, waitForChans(100*time.Millisecond, done, leafCtx.Done(), rootsCtx.Done()), "AutoConfig didn't shut down") + require.False(t, ac.IsRunning()) + + // restart it + require.NoError(t, ac.Start(ctx)) + + // get the new Done chan + done = ac.Done() + + // ensure that context cancellation causes us to stop as well + cancel() + require.True(t, waitForChans(100*time.Millisecond, done)) } -func TestFallBackTLS(t *testing.T) { - rtConfig := setupRuntimeConfig(t) +type testAutoConfig struct { + mcfg *mockedConfig + ac *AutoConfig + tokenUpdates chan struct{} + originalToken string - directRPC := new(mockDirectRPC) - directRPC.Test(t) + initialRoots *structs.IndexedCARoots + initialCert *structs.IssuedCert + extraCerts []string +} - populateResponse := func(val interface{}) { - resp, ok := val.(*pbautoconf.AutoConfigResponse) +func startedAutoConfig(t *testing.T, autoEncrypt bool) testAutoConfig { + t.Helper() + mcfg := newMockedConfig(t) + loader := setupRuntimeConfig(t) + if !autoEncrypt { + loader.addConfigHCL(` + auto_config = { + enabled = true + intro_token ="blarg" + server_addresses = ["127.0.0.1:8300"] + } + verify_outgoing = true + `) + } else { + loader.addConfigHCL(` + auto_encrypt { + tls = true + } + verify_outgoing = true + `) + } + mcfg.Config.Loader = loader.Load + mcfg.Config.FallbackLeeway = time.Nanosecond + + originalToken := "a5deaa25-11ca-48bf-a979-4c3a7aa4b9a9" + + if !autoEncrypt { + // this gets called when InitialConfiguration is invoked to record the token from the + // auto-config response + mcfg.tokens.On("UpdateAgentToken", originalToken, token.TokenSourceConfig).Return(true).Once() + } + + // we expect this to be retrieved twice: first during cache prepopulation + // and then again when setting up the cache watch for the leaf cert. + // However one of those expectations is setup in the expectInitialTLS + // method so we only need one more here + mcfg.tokens.On("AgentToken").Return(originalToken).Once() + + if autoEncrypt { + // when using AutoEncrypt we also have to grab the token once more + // when setting up the initial RPC as the ACL token is what is used + // to authorize the request. + mcfg.tokens.On("AgentToken").Return(originalToken).Once() + } + + // this is called once during Start to initialze the token watches + tokenUpdateCh := make(chan struct{}) + tokenNotifier := token.Notifier{ + Ch: tokenUpdateCh, + } + mcfg.tokens.On("Notify", token.TokenKindAgent).Once().Return(tokenNotifier) + mcfg.tokens.On("StopNotify", tokenNotifier).Once() + + // expect the roots watch on the cache + mcfg.cache.On("Notify", + mock.Anything, + cachetype.ConnectCARootName, + &structs.DCSpecificRequest{Datacenter: "dc1"}, + rootsWatchID, + mock.Anything, + ).Return(nil).Once() + + mcfg.cache.On("Notify", + mock.Anything, + cachetype.ConnectCALeafName, + &cachetype.ConnectCALeafRequest{ + Datacenter: "dc1", + Agent: "autoconf", + Token: originalToken, + DNSSAN: defaultDNSSANs, + IPSAN: defaultIPSANs, + }, + leafWatchID, + mock.Anything, + ).Return(nil).Once() + + // override the server provider - most of the other tests set it up so that this + // always returns no server (simulating a state where we haven't joined gossip). + // this seems like a good place to ensure this other way of finding server information + // works + mcfg.serverProvider.On("FindLANServer").Once().Return(&metadata.Server{ + Addr: &net.TCPAddr{IP: net.IPv4(198, 18, 0, 1), Port: 8300}, + }) + + indexedRoots, cert, extraCerts := mcfg.setupInitialTLS(t, "autoconf", "dc1", originalToken) + + mcfg.tlsCfg.On("AutoEncryptCertNotAfter").Return(cert.ValidBefore).Once() + + populateResponse := func(args mock.Arguments) { + method := args.String(3) + + switch method { + case "AutoConfig.InitialConfiguration": + resp, ok := args.Get(5).(*pbautoconf.AutoConfigResponse) + require.True(t, ok) + resp.Config = &pbconfig.Config{ + PrimaryDatacenter: "primary", + TLS: &pbconfig.TLS{ + VerifyServerHostname: true, + }, + ACL: &pbconfig.ACL{ + Tokens: &pbconfig.ACLTokens{ + Agent: originalToken, + }, + }, + } + + resp.CARoots = mustTranslateCARootsToProtobuf(t, indexedRoots) + resp.Certificate = mustTranslateIssuedCertToProtobuf(t, cert) + resp.ExtraCACertificates = extraCerts + case "AutoEncrypt.Sign": + resp, ok := args.Get(5).(*structs.SignedResponse) + require.True(t, ok) + *resp = structs.SignedResponse{ + VerifyServerHostname: true, + ConnectCARoots: *indexedRoots, + IssuedCert: *cert, + ManualCARoots: extraCerts, + } + } + } + + if !autoEncrypt { + expectedRequest := pbautoconf.AutoConfigRequest{ + Datacenter: "dc1", + Node: "autoconf", + JWT: "blarg", + } + + mcfg.directRPC.On( + "RPC", + "dc1", + "autoconf", + &net.TCPAddr{IP: net.IPv4(198, 18, 0, 1), Port: 8300}, + "AutoConfig.InitialConfiguration", + &expectedRequest, + &pbautoconf.AutoConfigResponse{}).Return(nil).Run(populateResponse).Once() + } else { + expectedRequest := structs.CASignRequest{ + WriteRequest: structs.WriteRequest{Token: originalToken}, + Datacenter: "dc1", + // TODO (autoconf) Maybe in the future we should populate a CSR + // and do some manual parsing/verification of the contents. The + // bits not having to do with the signing key such as the requested + // SANs and CN. For now though the mockDirectRPC type will empty + // the CSR so we have to pass in an empty string to the expectation. + CSR: "", + } + + mcfg.directRPC.On( + "RPC", + "dc1", + "autoconf", // reusing the same name to prevent needing more configurability + &net.TCPAddr{IP: net.IPv4(198, 18, 0, 1), Port: 8300}, + "AutoEncrypt.Sign", + &expectedRequest, + &structs.SignedResponse{}).Return(nil).Run(populateResponse) + } + + ac, err := New(mcfg.Config) + require.NoError(t, err) + require.NotNil(t, ac) + + cfg, err := ac.InitialConfiguration(context.Background()) + require.NoError(t, err) + require.NotNil(t, cfg) + if !autoEncrypt { + // auto-encrypt doesn't modify the config but rather sets the value + // in the TLS configurator + require.True(t, cfg.VerifyServerHostname) + } + + ctx, cancel := context.WithCancel(context.Background()) + require.NoError(t, ac.Start(ctx)) + t.Cleanup(func() { + done := ac.Done() + cancel() + timer := time.NewTimer(1 * time.Second) + defer timer.Stop() + select { + case <-done: + // do nothing + case <-timer.C: + t.Fatalf("AutoConfig wasn't stopped within 1 second after test completion") + } + }) + + return testAutoConfig{ + mcfg: mcfg, + ac: ac, + tokenUpdates: tokenUpdateCh, + originalToken: originalToken, + initialRoots: indexedRoots, + initialCert: cert, + extraCerts: extraCerts, + } +} + +// this test ensures that the cache watches are restarted with +// the updated token after receiving a token update +func TestTokenUpdate(t *testing.T) { + testAC := startedAutoConfig(t, false) + + newToken := "1a4cc445-86ed-46b4-a355-bbf5a11dddb0" + + rootsCtx, rootsCancel := context.WithCancel(context.Background()) + testAC.mcfg.cache.On("Notify", + mock.Anything, + cachetype.ConnectCARootName, + &structs.DCSpecificRequest{Datacenter: testAC.ac.config.Datacenter}, + rootsWatchID, + mock.Anything, + ).Return(nil).Once().Run(func(args mock.Arguments) { + rootsCancel() + }) + + leafCtx, leafCancel := context.WithCancel(context.Background()) + testAC.mcfg.cache.On("Notify", + mock.Anything, + cachetype.ConnectCALeafName, + &cachetype.ConnectCALeafRequest{ + Datacenter: "dc1", + Agent: "autoconf", + Token: newToken, + DNSSAN: defaultDNSSANs, + IPSAN: defaultIPSANs, + }, + leafWatchID, + mock.Anything, + ).Return(nil).Once().Run(func(args mock.Arguments) { + leafCancel() + }) + + // this will be retrieved once when resetting the leaf cert watch + testAC.mcfg.tokens.On("AgentToken").Return(newToken).Once() + + // send the notification about the token update + testAC.tokenUpdates <- struct{}{} + + // wait for the leaf cert watches + require.True(t, waitForChans(100*time.Millisecond, leafCtx.Done(), rootsCtx.Done()), "New cache watches were not started within 100ms") +} + +func TestRootsUpdate(t *testing.T) { + testAC := startedAutoConfig(t, false) + + secondCA := connect.TestCA(t, testAC.initialRoots.Roots[0]) + secondRoots := structs.IndexedCARoots{ + ActiveRootID: secondCA.ID, + TrustDomain: connect.TestClusterID, + Roots: []*structs.CARoot{ + secondCA, + testAC.initialRoots.Roots[0], + }, + QueryMeta: structs.QueryMeta{ + Index: 99, + }, + } + + updatedCtx, cancel := context.WithCancel(context.Background()) + testAC.mcfg.tlsCfg.On("UpdateAutoTLS", + testAC.extraCerts, + []string{secondCA.RootCert, testAC.initialRoots.Roots[0].RootCert}, + testAC.initialCert.CertPEM, + "redacted", + true, + ).Return(nil).Once().Run(func(args mock.Arguments) { + cancel() + }) + + // when a cache event comes in we end up recalculating the fallback timer which requires this call + testAC.mcfg.tlsCfg.On("AutoEncryptCertNotAfter").Return(time.Now().Add(10 * time.Minute)).Once() + + req := structs.DCSpecificRequest{Datacenter: "dc1"} + require.True(t, testAC.mcfg.cache.sendNotification(context.Background(), req.CacheInfo().Key, cache.UpdateEvent{ + CorrelationID: rootsWatchID, + Result: &secondRoots, + Meta: cache.ResultMeta{ + Index: secondRoots.Index, + }, + })) + + require.True(t, waitForChans(100*time.Millisecond, updatedCtx.Done()), "TLS certificates were not updated within the alotted time") + + // persisting these to disk happens right after the chan we are waiting for will have fired above + // however there is no deterministic way to know once its been written outside of maybe a filesystem + // event notifier. That seems a little heavy handed just for this and especially to do in any sort + // of cross platform way. + retry.Run(t, func(r *retry.R) { + resp, err := testAC.ac.readPersistedAutoConfig() + require.NoError(r, err) + require.Equal(r, secondRoots.ActiveRootID, resp.CARoots.GetActiveRootID()) + }) +} + +func TestCertUpdate(t *testing.T) { + testAC := startedAutoConfig(t, false) + secondCert := newLeaf(t, "autoconf", "dc1", testAC.initialRoots.Roots[0], 99, 10*time.Minute) + + updatedCtx, cancel := context.WithCancel(context.Background()) + testAC.mcfg.tlsCfg.On("UpdateAutoTLS", + testAC.extraCerts, + []string{testAC.initialRoots.Roots[0].RootCert}, + secondCert.CertPEM, + "redacted", + true, + ).Return(nil).Once().Run(func(args mock.Arguments) { + cancel() + }) + + // when a cache event comes in we end up recalculating the fallback timer which requires this call + testAC.mcfg.tlsCfg.On("AutoEncryptCertNotAfter").Return(secondCert.ValidBefore).Once() + + req := cachetype.ConnectCALeafRequest{ + Datacenter: "dc1", + Agent: "autoconf", + Token: testAC.originalToken, + DNSSAN: defaultDNSSANs, + IPSAN: defaultIPSANs, + } + require.True(t, testAC.mcfg.cache.sendNotification(context.Background(), req.CacheInfo().Key, cache.UpdateEvent{ + CorrelationID: leafWatchID, + Result: secondCert, + Meta: cache.ResultMeta{ + Index: secondCert.ModifyIndex, + }, + })) + + require.True(t, waitForChans(100*time.Millisecond, updatedCtx.Done()), "TLS certificates were not updated within the alotted time") + + // persisting these to disk happens after all the things we would wait for in assertCertUpdated + // will have fired. There is no deterministic way to know once its been written so we wrap + // this in a retry. + retry.Run(t, func(r *retry.R) { + resp, err := testAC.ac.readPersistedAutoConfig() + require.NoError(r, err) + + // ensure the roots got persisted to disk + require.Equal(r, secondCert.CertPEM, resp.Certificate.GetCertPEM()) + }) +} + +func TestFallback(t *testing.T) { + testAC := startedAutoConfig(t, false) + + // at this point everything is operating normally and we are just + // waiting for events. We are going to send a new cert that is basically + // already expired and then allow the fallback routine to kick in. + secondCert := newLeaf(t, "autoconf", "dc1", testAC.initialRoots.Roots[0], 100, time.Nanosecond) + secondCA := connect.TestCA(t, testAC.initialRoots.Roots[0]) + secondRoots := structs.IndexedCARoots{ + ActiveRootID: secondCA.ID, + TrustDomain: connect.TestClusterID, + Roots: []*structs.CARoot{ + secondCA, + testAC.initialRoots.Roots[0], + }, + QueryMeta: structs.QueryMeta{ + Index: 101, + }, + } + thirdCert := newLeaf(t, "autoconf", "dc1", secondCA, 102, 10*time.Minute) + + // setup the expectation for when the certs got updated initially + updatedCtx, updateCancel := context.WithCancel(context.Background()) + testAC.mcfg.tlsCfg.On("UpdateAutoTLS", + testAC.extraCerts, + []string{testAC.initialRoots.Roots[0].RootCert}, + secondCert.CertPEM, + "redacted", + true, + ).Return(nil).Once().Run(func(args mock.Arguments) { + updateCancel() + }) + + // when a cache event comes in we end up recalculating the fallback timer which requires this call + testAC.mcfg.tlsCfg.On("AutoEncryptCertNotAfter").Return(secondCert.ValidBefore).Once() + testAC.mcfg.tlsCfg.On("AutoEncryptCertExpired").Return(true).Once() + + fallbackCtx, fallbackCancel := context.WithCancel(context.Background()) + + // also testing here that we can change server IPs for ongoing operations + testAC.mcfg.serverProvider.On("FindLANServer").Once().Return(&metadata.Server{ + Addr: &net.TCPAddr{IP: net.IPv4(198, 18, 23, 2), Port: 8300}, + }) + + // after sending the notification for the cert update another InitialConfiguration RPC + // will be made to pull down the latest configuration. So we need to set up the response + // for the second RPC + populateResponse := func(args mock.Arguments) { + resp, ok := args.Get(5).(*pbautoconf.AutoConfigResponse) require.True(t, ok) resp.Config = &pbconfig.Config{ PrimaryDatacenter: "primary", TLS: &pbconfig.TLS{ VerifyServerHostname: true, }, - } - - resp.CARoots = &pbconnect.CARoots{ - ActiveRootID: "active", - TrustDomain: "trust", - Roots: []*pbconnect.CARoot{ - { - ID: "active", - Name: "foo", - SerialNumber: 42, - SigningKeyID: "blarg", - NotBefore: &types.Timestamp{Seconds: 5000, Nanos: 100}, - NotAfter: &types.Timestamp{Seconds: 10000, Nanos: 9009}, - RootCert: "not an actual cert", - Active: true, + ACL: &pbconfig.ACL{ + Tokens: &pbconfig.ACLTokens{ + Agent: testAC.originalToken, }, }, } - resp.Certificate = &pbconnect.IssuedCert{ - SerialNumber: "1234", - CertPEM: "not a cert", - Agent: "foo", - AgentURI: "spiffe://blarg/agent/client/dc/foo/id/foo", - ValidAfter: &types.Timestamp{Seconds: 6000}, - ValidBefore: &types.Timestamp{Seconds: 7000}, - } - resp.ExtraCACertificates = []string{"blarg"} + + resp.CARoots = mustTranslateCARootsToProtobuf(t, &secondRoots) + resp.Certificate = mustTranslateIssuedCertToProtobuf(t, thirdCert) + resp.ExtraCACertificates = testAC.extraCerts + + fallbackCancel() } expectedRequest := pbautoconf.AutoConfigRequest{ @@ -685,76 +1056,118 @@ func TestFallBackTLS(t *testing.T) { JWT: "blarg", } - directRPC.On( + testAC.mcfg.directRPC.On( "RPC", "dc1", "autoconf", - &net.TCPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 8300}, + &net.TCPAddr{IP: net.IPv4(198, 18, 23, 2), Port: 8300}, "AutoConfig.InitialConfiguration", &expectedRequest, - &pbautoconf.AutoConfigResponse{}).Return(populateResponse) + &pbautoconf.AutoConfigResponse{}).Return(nil).Run(populateResponse).Once() - // setup the mock certificate monitor we don't expect it to be used - // as the FallbackTLS method is mainly used by the certificate monitor - // if for some reason it fails to renew the TLS certificate in time. - certMon := new(mockCertMonitor) + // this gets called when InitialConfiguration is invoked to record the token from the + // auto-config response which is how the Fallback for auto-config works + testAC.mcfg.tokens.On("UpdateAgentToken", testAC.originalToken, token.TokenSourceConfig).Return(true).Once() - conf := Config{ - DirectRPC: directRPC, - Loader: func(source config.Source) (*config.RuntimeConfig, []string, error) { - rtConfig.AutoConfig = config.AutoConfig{ - Enabled: true, - IntroToken: "blarg", - ServerAddresses: []string{"127.0.0.1:8300"}, - } - rtConfig.VerifyOutgoing = true - return rtConfig, nil, nil - }, - CertMonitor: certMon, + testAC.mcfg.expectInitialTLS(t, "autoconf", "dc1", testAC.originalToken, secondCA, &secondRoots, thirdCert, testAC.extraCerts) + + // after the second RPC we now will use the new certs validity period in the next run loop iteration + testAC.mcfg.tlsCfg.On("AutoEncryptCertNotAfter").Return(time.Now().Add(10 * time.Minute)).Once() + + // now that all the mocks are set up we can trigger the whole thing by sending the second expired cert + // as a cache update event. + req := cachetype.ConnectCALeafRequest{ + Datacenter: "dc1", + Agent: "autoconf", + Token: testAC.originalToken, + DNSSAN: defaultDNSSANs, + IPSAN: defaultIPSANs, } - ac, err := New(conf) - require.NoError(t, err) - require.NotNil(t, ac) - ac.config, err = ac.ReadConfig() - require.NoError(t, err) + require.True(t, testAC.mcfg.cache.sendNotification(context.Background(), req.CacheInfo().Key, cache.UpdateEvent{ + CorrelationID: leafWatchID, + Result: secondCert, + Meta: cache.ResultMeta{ + Index: secondCert.ModifyIndex, + }, + })) - actual, err := ac.FallbackTLS(context.Background()) - require.NoError(t, err) - expected := &structs.SignedResponse{ - ConnectCARoots: structs.IndexedCARoots{ - ActiveRootID: "active", - TrustDomain: "trust", - Roots: []*structs.CARoot{ - { - ID: "active", - Name: "foo", - SerialNumber: 42, - SigningKeyID: "blarg", - NotBefore: time.Unix(5000, 100), - NotAfter: time.Unix(10000, 9009), - RootCert: "not an actual cert", - Active: true, + // wait for the TLS certificates to get updated + require.True(t, waitForChans(100*time.Millisecond, updatedCtx.Done()), "TLS certificates were not updated within the alotted time") + + // now wait for the fallback routine to be invoked + require.True(t, waitForChans(100*time.Millisecond, fallbackCtx.Done()), "fallback routines did not get invoked within the alotted time") + + // persisting these to disk happens after the RPC we waited on above will have fired + // There is no deterministic way to know once its been written so we wrap this in a retry. + retry.Run(t, func(r *retry.R) { + resp, err := testAC.ac.readPersistedAutoConfig() + require.NoError(r, err) + + // ensure the roots got persisted to disk + require.Equal(r, thirdCert.CertPEM, resp.Certificate.GetCertPEM()) + require.Equal(r, secondRoots.ActiveRootID, resp.CARoots.GetActiveRootID()) + }) +} + +func TestIntroToken(t *testing.T) { + tokenFile := testutil.TempFile(t, "intro-token") + t.Cleanup(func() { os.Remove(tokenFile.Name()) }) + + tokenFileEmpty := testutil.TempFile(t, "intro-token-empty") + t.Cleanup(func() { os.Remove(tokenFileEmpty.Name()) }) + + tokenFromFile := "8ae34d3a-8adf-446a-b236-69874597cb5b" + tokenFromConfig := "3ad9b572-ea42-4e47-9cd0-53a398a98abf" + require.NoError(t, ioutil.WriteFile(tokenFile.Name(), []byte(tokenFromFile), 0600)) + + type testCase struct { + config *config.RuntimeConfig + err string + token string + } + + cases := map[string]testCase{ + "config": { + config: &config.RuntimeConfig{ + AutoConfig: config.AutoConfig{ + IntroToken: tokenFromConfig, + IntroTokenFile: tokenFile.Name(), }, }, + token: tokenFromConfig, }, - IssuedCert: structs.IssuedCert{ - SerialNumber: "1234", - CertPEM: "not a cert", - Agent: "foo", - AgentURI: "spiffe://blarg/agent/client/dc/foo/id/foo", - ValidAfter: time.Unix(6000, 0), - ValidBefore: time.Unix(7000, 0), + "file": { + config: &config.RuntimeConfig{ + AutoConfig: config.AutoConfig{ + IntroTokenFile: tokenFile.Name(), + }, + }, + token: tokenFromFile, + }, + "file-empty": { + config: &config.RuntimeConfig{ + AutoConfig: config.AutoConfig{ + IntroTokenFile: tokenFileEmpty.Name(), + }, + }, + err: "intro_token_file did not contain any token", }, - ManualCARoots: []string{"blarg"}, - VerifyServerHostname: true, } - // have to just verify that the private key was put in here but we then - // must zero it out so that the remaining equality check will pass - require.NotEmpty(t, actual.IssuedCert.PrivateKeyPEM) - actual.IssuedCert.PrivateKeyPEM = "" - require.Equal(t, expected, actual) - // ensure no RPC was made - directRPC.AssertExpectations(t) - certMon.AssertExpectations(t) + for name, tcase := range cases { + t.Run(name, func(t *testing.T) { + ac := AutoConfig{ + config: tcase.config, + } + + token, err := ac.introToken() + if tcase.err != "" { + testutil.RequireErrorContains(t, err, tcase.err) + } else { + require.NoError(t, err) + require.Equal(t, tcase.token, token) + } + }) + } + } diff --git a/agent/auto-config/auto_encrypt.go b/agent/auto-config/auto_encrypt.go new file mode 100644 index 000000000..2290bb332 --- /dev/null +++ b/agent/auto-config/auto_encrypt.go @@ -0,0 +1,111 @@ +package autoconf + +import ( + "context" + "fmt" + "net" + "strings" + + "github.com/hashicorp/consul/agent/structs" +) + +func (ac *AutoConfig) autoEncryptInitialCerts(ctx context.Context) (*structs.SignedResponse, error) { + // generate a CSR + csr, key, err := ac.generateCSR() + if err != nil { + return nil, err + } + + // this resets the failures so that we will perform immediate request + wait := ac.acConfig.Waiter.Success() + for { + select { + case <-wait: + if resp, err := ac.autoEncryptInitialCertsOnce(ctx, csr, key); err == nil && resp != nil { + return resp, nil + } else if err != nil { + ac.logger.Error(err.Error()) + } else { + ac.logger.Error("No error returned when fetching certificates from the servers but no response was either") + } + + wait = ac.acConfig.Waiter.Failed() + case <-ctx.Done(): + ac.logger.Info("interrupted during retrieval of auto-encrypt certificates", "err", ctx.Err()) + return nil, ctx.Err() + } + } +} + +func (ac *AutoConfig) autoEncryptInitialCertsOnce(ctx context.Context, csr, key string) (*structs.SignedResponse, error) { + request := structs.CASignRequest{ + WriteRequest: structs.WriteRequest{Token: ac.acConfig.Tokens.AgentToken()}, + Datacenter: ac.config.Datacenter, + CSR: csr, + } + var resp structs.SignedResponse + + servers, err := ac.autoEncryptHosts() + if err != nil { + return nil, err + } + + for _, s := range servers { + // try each IP to see if we can successfully make the request + for _, addr := range ac.resolveHost(s) { + if ctx.Err() != nil { + return nil, ctx.Err() + } + + ac.logger.Debug("making AutoEncrypt.Sign RPC", "addr", addr.String()) + err = ac.acConfig.DirectRPC.RPC(ac.config.Datacenter, ac.config.NodeName, &addr, "AutoEncrypt.Sign", &request, &resp) + if err != nil { + ac.logger.Error("AutoEncrypt.Sign RPC failed", "addr", addr.String(), "error", err) + continue + } + + resp.IssuedCert.PrivateKeyPEM = key + return &resp, nil + } + } + return nil, fmt.Errorf("No servers successfully responded to the auto-encrypt request") +} + +func (ac *AutoConfig) autoEncryptHosts() ([]string, error) { + // use servers known to gossip if there are any + if ac.acConfig.ServerProvider != nil { + if srv := ac.acConfig.ServerProvider.FindLANServer(); srv != nil { + return []string{srv.Addr.String()}, nil + } + } + + hosts, err := ac.discoverServers(ac.config.RetryJoinLAN) + if err != nil { + return nil, err + } + + var addrs []string + + // The addresses we use for auto-encrypt are the retry join and start join + // addresses. These are for joining serf and therefore we cannot rely on the + // ports for these. This loop strips any port that may have been specified and + // will let subsequent resolveAddr calls add on the default RPC port. + for _, addr := range append(ac.config.StartJoinAddrsLAN, hosts...) { + host, _, err := net.SplitHostPort(addr) + if err != nil { + if strings.Contains(err.Error(), "missing port in address") { + host = addr + } else { + ac.logger.Warn("error splitting host address into IP and port", "address", addr, "error", err) + continue + } + } + addrs = append(addrs, host) + } + + if len(addrs) == 0 { + return nil, fmt.Errorf("no auto-encrypt server addresses available for use") + } + + return addrs, nil +} diff --git a/agent/auto-config/auto_encrypt_test.go b/agent/auto-config/auto_encrypt_test.go new file mode 100644 index 000000000..867db9441 --- /dev/null +++ b/agent/auto-config/auto_encrypt_test.go @@ -0,0 +1,562 @@ +package autoconf + +import ( + "context" + "crypto/x509" + "crypto/x509/pkix" + "encoding/asn1" + "fmt" + "net" + "net/url" + "testing" + "time" + + "github.com/hashicorp/consul/agent/cache" + cachetype "github.com/hashicorp/consul/agent/cache-types" + "github.com/hashicorp/consul/agent/config" + "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/metadata" + "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/lib" + "github.com/hashicorp/consul/sdk/testutil" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" +) + +func TestAutoEncrypt_generateCSR(t *testing.T) { + type testCase struct { + conf *config.RuntimeConfig + + // to validate the csr + expectedSubject pkix.Name + expectedSigAlg x509.SignatureAlgorithm + expectedPubAlg x509.PublicKeyAlgorithm + expectedDNSNames []string + expectedIPs []net.IP + expectedURIs []*url.URL + } + + cases := map[string]testCase{ + "ip-sans": { + conf: &config.RuntimeConfig{ + Datacenter: "dc1", + NodeName: "test-node", + AutoEncryptTLS: true, + AutoEncryptIPSAN: []net.IP{net.IPv4(198, 18, 0, 1), net.IPv4(198, 18, 0, 2)}, + }, + expectedSubject: pkix.Name{ + CommonName: connect.AgentCN("test-node", unknownTrustDomain), + Names: []pkix.AttributeTypeAndValue{ + { + // 2,5,4,3 is the CommonName type ASN1 identifier + Type: asn1.ObjectIdentifier{2, 5, 4, 3}, + Value: "testnode.agnt.unknown.consul", + }, + }, + }, + expectedSigAlg: x509.ECDSAWithSHA256, + expectedPubAlg: x509.ECDSA, + expectedDNSNames: defaultDNSSANs, + expectedIPs: append(defaultIPSANs, + net.IP{198, 18, 0, 1}, + net.IP{198, 18, 0, 2}, + ), + expectedURIs: []*url.URL{ + { + Scheme: "spiffe", + Host: unknownTrustDomain, + Path: "/agent/client/dc/dc1/id/test-node", + }, + }, + }, + "dns-sans": { + conf: &config.RuntimeConfig{ + Datacenter: "dc1", + NodeName: "test-node", + AutoEncryptTLS: true, + AutoEncryptDNSSAN: []string{"foo.local", "bar.local"}, + }, + expectedSubject: pkix.Name{ + CommonName: connect.AgentCN("test-node", unknownTrustDomain), + Names: []pkix.AttributeTypeAndValue{ + { + // 2,5,4,3 is the CommonName type ASN1 identifier + Type: asn1.ObjectIdentifier{2, 5, 4, 3}, + Value: "testnode.agnt.unknown.consul", + }, + }, + }, + expectedSigAlg: x509.ECDSAWithSHA256, + expectedPubAlg: x509.ECDSA, + expectedDNSNames: append(defaultDNSSANs, "foo.local", "bar.local"), + expectedIPs: defaultIPSANs, + expectedURIs: []*url.URL{ + { + Scheme: "spiffe", + Host: unknownTrustDomain, + Path: "/agent/client/dc/dc1/id/test-node", + }, + }, + }, + } + + for name, tcase := range cases { + t.Run(name, func(t *testing.T) { + ac := AutoConfig{config: tcase.conf} + + csr, _, err := ac.generateCSR() + require.NoError(t, err) + + request, err := connect.ParseCSR(csr) + require.NoError(t, err) + require.NotNil(t, request) + + require.Equal(t, tcase.expectedSubject, request.Subject) + require.Equal(t, tcase.expectedSigAlg, request.SignatureAlgorithm) + require.Equal(t, tcase.expectedPubAlg, request.PublicKeyAlgorithm) + require.Equal(t, tcase.expectedDNSNames, request.DNSNames) + require.Equal(t, tcase.expectedIPs, request.IPAddresses) + require.Equal(t, tcase.expectedURIs, request.URIs) + }) + } +} + +func TestAutoEncrypt_hosts(t *testing.T) { + type testCase struct { + serverProvider ServerProvider + config *config.RuntimeConfig + + hosts []string + err string + } + + providerNone := newMockServerProvider(t) + providerNone.On("FindLANServer").Return(nil).Times(0) + + providerWithServer := newMockServerProvider(t) + providerWithServer.On("FindLANServer").Return(&metadata.Server{Addr: &net.TCPAddr{IP: net.IPv4(198, 18, 0, 1), Port: 1234}}).Times(0) + + cases := map[string]testCase{ + "router-override": { + serverProvider: providerWithServer, + config: &config.RuntimeConfig{ + RetryJoinLAN: []string{"127.0.0.1:9876"}, + StartJoinAddrsLAN: []string{"192.168.1.2:4321"}, + }, + hosts: []string{"198.18.0.1:1234"}, + }, + "various-addresses": { + serverProvider: providerNone, + config: &config.RuntimeConfig{ + RetryJoinLAN: []string{"198.18.0.1", "foo.com", "[2001:db8::1234]:1234", "abc.local:9876"}, + StartJoinAddrsLAN: []string{"192.168.1.1:5432", "start.local", "[::ffff:172.16.5.4]", "main.dev:6789"}, + }, + hosts: []string{ + "192.168.1.1", + "start.local", + "[::ffff:172.16.5.4]", + "main.dev", + "198.18.0.1", + "foo.com", + "2001:db8::1234", + "abc.local", + }, + }, + "split-host-port-error": { + serverProvider: providerNone, + config: &config.RuntimeConfig{ + StartJoinAddrsLAN: []string{"this-is-not:a:ip:and_port"}, + }, + err: "no auto-encrypt server addresses available for use", + }, + } + + for name, tcase := range cases { + t.Run(name, func(t *testing.T) { + ac := AutoConfig{ + config: tcase.config, + logger: testutil.Logger(t), + acConfig: Config{ + ServerProvider: tcase.serverProvider, + }, + } + + hosts, err := ac.autoEncryptHosts() + if tcase.err != "" { + testutil.RequireErrorContains(t, err, tcase.err) + } else { + require.NoError(t, err) + require.Equal(t, tcase.hosts, hosts) + } + }) + } +} + +func TestAutoEncrypt_InitialCerts(t *testing.T) { + token := "1a148388-3dd7-4db4-9eea-520424b4a86a" + datacenter := "foo" + nodeName := "bar" + + mcfg := newMockedConfig(t) + + _, indexedRoots, cert := testCerts(t, nodeName, datacenter) + + // The following are called once for each round through the auto-encrypt initial certs outer loop + // (not the per-host direct rpc attempts but the one involving the RetryWaiter) + mcfg.tokens.On("AgentToken").Return(token).Times(2) + mcfg.serverProvider.On("FindLANServer").Return(nil).Times(2) + + request := structs.CASignRequest{ + WriteRequest: structs.WriteRequest{Token: token}, + Datacenter: datacenter, + // this gets removed by the mock code as its non-deterministic what it will be + CSR: "", + } + + // first failure + mcfg.directRPC.On("RPC", + datacenter, + nodeName, + &net.TCPAddr{IP: net.IPv4(198, 18, 0, 1), Port: 8300}, + "AutoEncrypt.Sign", + &request, + &structs.SignedResponse{}, + ).Once().Return(fmt.Errorf("injected error")) + // second failure + mcfg.directRPC.On("RPC", + datacenter, + nodeName, + &net.TCPAddr{IP: net.IPv4(198, 18, 0, 2), Port: 8300}, + "AutoEncrypt.Sign", + &request, + &structs.SignedResponse{}, + ).Once().Return(fmt.Errorf("injected error")) + // third times is successfuly (second attempt to first server) + mcfg.directRPC.On("RPC", + datacenter, + nodeName, + &net.TCPAddr{IP: net.IPv4(198, 18, 0, 1), Port: 8300}, + "AutoEncrypt.Sign", + &request, + &structs.SignedResponse{}, + ).Once().Return(nil).Run(func(args mock.Arguments) { + resp, ok := args.Get(5).(*structs.SignedResponse) + require.True(t, ok) + resp.ConnectCARoots = *indexedRoots + resp.IssuedCert = *cert + resp.VerifyServerHostname = true + }) + + mcfg.Config.Waiter = lib.NewRetryWaiter(2, 0, 1*time.Millisecond, nil) + + ac := AutoConfig{ + config: &config.RuntimeConfig{ + Datacenter: datacenter, + NodeName: nodeName, + RetryJoinLAN: []string{"198.18.0.1:1234", "198.18.0.2:3456"}, + ServerPort: 8300, + }, + acConfig: mcfg.Config, + logger: testutil.Logger(t), + } + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + resp, err := ac.autoEncryptInitialCerts(ctx) + require.NoError(t, err) + require.NotNil(t, resp) + require.True(t, resp.VerifyServerHostname) + require.NotEmpty(t, resp.IssuedCert.PrivateKeyPEM) + resp.IssuedCert.PrivateKeyPEM = "" + cert.PrivateKeyPEM = "" + require.Equal(t, cert, &resp.IssuedCert) + require.Equal(t, indexedRoots, &resp.ConnectCARoots) + require.Empty(t, resp.ManualCARoots) +} + +func TestAutoEncrypt_InitialConfiguration(t *testing.T) { + token := "010494ae-ee45-4433-903c-a58c91297714" + nodeName := "auto-encrypt" + datacenter := "dc1" + + mcfg := newMockedConfig(t) + loader := setupRuntimeConfig(t) + loader.addConfigHCL(` + auto_encrypt { + tls = true + } + `) + loader.opts.Config.NodeName = &nodeName + mcfg.Config.Loader = loader.Load + + indexedRoots, cert, extraCerts := mcfg.setupInitialTLS(t, nodeName, datacenter, token) + + // prepopulation is going to grab the token to populate the correct cache key + mcfg.tokens.On("AgentToken").Return(token).Times(0) + + // no server provider + mcfg.serverProvider.On("FindLANServer").Return(&metadata.Server{Addr: &net.TCPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 8300}}).Times(1) + + populateResponse := func(args mock.Arguments) { + resp, ok := args.Get(5).(*structs.SignedResponse) + require.True(t, ok) + *resp = structs.SignedResponse{ + VerifyServerHostname: true, + ConnectCARoots: *indexedRoots, + IssuedCert: *cert, + ManualCARoots: extraCerts, + } + } + + expectedRequest := structs.CASignRequest{ + WriteRequest: structs.WriteRequest{Token: token}, + Datacenter: datacenter, + // TODO (autoconf) Maybe in the future we should populate a CSR + // and do some manual parsing/verification of the contents. The + // bits not having to do with the signing key such as the requested + // SANs and CN. For now though the mockDirectRPC type will empty + // the CSR so we have to pass in an empty string to the expectation. + CSR: "", + } + + mcfg.directRPC.On( + "RPC", + datacenter, + nodeName, + &net.TCPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 8300}, + "AutoEncrypt.Sign", + &expectedRequest, + &structs.SignedResponse{}).Return(nil).Run(populateResponse) + + ac, err := New(mcfg.Config) + require.NoError(t, err) + require.NotNil(t, ac) + + cfg, err := ac.InitialConfiguration(context.Background()) + require.NoError(t, err) + require.NotNil(t, cfg) + +} + +func TestAutoEncrypt_TokenUpdate(t *testing.T) { + testAC := startedAutoConfig(t, true) + + newToken := "1a4cc445-86ed-46b4-a355-bbf5a11dddb0" + + rootsCtx, rootsCancel := context.WithCancel(context.Background()) + testAC.mcfg.cache.On("Notify", + mock.Anything, + cachetype.ConnectCARootName, + &structs.DCSpecificRequest{Datacenter: testAC.ac.config.Datacenter}, + rootsWatchID, + mock.Anything, + ).Return(nil).Once().Run(func(args mock.Arguments) { + rootsCancel() + }) + + leafCtx, leafCancel := context.WithCancel(context.Background()) + testAC.mcfg.cache.On("Notify", + mock.Anything, + cachetype.ConnectCALeafName, + &cachetype.ConnectCALeafRequest{ + Datacenter: "dc1", + Agent: "autoconf", + Token: newToken, + DNSSAN: defaultDNSSANs, + IPSAN: defaultIPSANs, + }, + leafWatchID, + mock.Anything, + ).Return(nil).Once().Run(func(args mock.Arguments) { + leafCancel() + }) + + // this will be retrieved once when resetting the leaf cert watch + testAC.mcfg.tokens.On("AgentToken").Return(newToken).Once() + + // send the notification about the token update + testAC.tokenUpdates <- struct{}{} + + // wait for the leaf cert watches + require.True(t, waitForChans(100*time.Millisecond, leafCtx.Done(), rootsCtx.Done()), "New cache watches were not started within 100ms") +} + +func TestAutoEncrypt_RootsUpdate(t *testing.T) { + testAC := startedAutoConfig(t, true) + + secondCA := connect.TestCA(t, testAC.initialRoots.Roots[0]) + secondRoots := structs.IndexedCARoots{ + ActiveRootID: secondCA.ID, + TrustDomain: connect.TestClusterID, + Roots: []*structs.CARoot{ + secondCA, + testAC.initialRoots.Roots[0], + }, + QueryMeta: structs.QueryMeta{ + Index: 99, + }, + } + + updatedCtx, cancel := context.WithCancel(context.Background()) + testAC.mcfg.tlsCfg.On("UpdateAutoTLSCA", + []string{secondCA.RootCert, testAC.initialRoots.Roots[0].RootCert}, + ).Return(nil).Once().Run(func(args mock.Arguments) { + cancel() + }) + + // when a cache event comes in we end up recalculating the fallback timer which requires this call + testAC.mcfg.tlsCfg.On("AutoEncryptCertNotAfter").Return(time.Now().Add(10 * time.Minute)).Once() + + req := structs.DCSpecificRequest{Datacenter: "dc1"} + require.True(t, testAC.mcfg.cache.sendNotification(context.Background(), req.CacheInfo().Key, cache.UpdateEvent{ + CorrelationID: rootsWatchID, + Result: &secondRoots, + Meta: cache.ResultMeta{ + Index: secondRoots.Index, + }, + })) + + require.True(t, waitForChans(100*time.Millisecond, updatedCtx.Done()), "TLS certificates were not updated within the alotted time") +} + +func TestAutoEncrypt_CertUpdate(t *testing.T) { + testAC := startedAutoConfig(t, true) + secondCert := newLeaf(t, "autoconf", "dc1", testAC.initialRoots.Roots[0], 99, 10*time.Minute) + + updatedCtx, cancel := context.WithCancel(context.Background()) + testAC.mcfg.tlsCfg.On("UpdateAutoTLSCert", + secondCert.CertPEM, + "redacted", + ).Return(nil).Once().Run(func(args mock.Arguments) { + cancel() + }) + + // when a cache event comes in we end up recalculating the fallback timer which requires this call + testAC.mcfg.tlsCfg.On("AutoEncryptCertNotAfter").Return(secondCert.ValidBefore).Once() + + req := cachetype.ConnectCALeafRequest{ + Datacenter: "dc1", + Agent: "autoconf", + Token: testAC.originalToken, + DNSSAN: defaultDNSSANs, + IPSAN: defaultIPSANs, + } + require.True(t, testAC.mcfg.cache.sendNotification(context.Background(), req.CacheInfo().Key, cache.UpdateEvent{ + CorrelationID: leafWatchID, + Result: secondCert, + Meta: cache.ResultMeta{ + Index: secondCert.ModifyIndex, + }, + })) + + require.True(t, waitForChans(100*time.Millisecond, updatedCtx.Done()), "TLS certificates were not updated within the alotted time") +} + +func TestAutoEncrypt_Fallback(t *testing.T) { + testAC := startedAutoConfig(t, true) + + // at this point everything is operating normally and we are just + // waiting for events. We are going to send a new cert that is basically + // already expired and then allow the fallback routine to kick in. + secondCert := newLeaf(t, "autoconf", "dc1", testAC.initialRoots.Roots[0], 100, time.Nanosecond) + secondCA := connect.TestCA(t, testAC.initialRoots.Roots[0]) + secondRoots := structs.IndexedCARoots{ + ActiveRootID: secondCA.ID, + TrustDomain: connect.TestClusterID, + Roots: []*structs.CARoot{ + secondCA, + testAC.initialRoots.Roots[0], + }, + QueryMeta: structs.QueryMeta{ + Index: 101, + }, + } + thirdCert := newLeaf(t, "autoconf", "dc1", secondCA, 102, 10*time.Minute) + + // setup the expectation for when the certs get updated initially + updatedCtx, updateCancel := context.WithCancel(context.Background()) + testAC.mcfg.tlsCfg.On("UpdateAutoTLSCert", + secondCert.CertPEM, + "redacted", + ).Return(nil).Once().Run(func(args mock.Arguments) { + updateCancel() + }) + + // when a cache event comes in we end up recalculating the fallback timer which requires this call + testAC.mcfg.tlsCfg.On("AutoEncryptCertNotAfter").Return(secondCert.ValidBefore).Once() + testAC.mcfg.tlsCfg.On("AutoEncryptCertExpired").Return(true).Once() + + fallbackCtx, fallbackCancel := context.WithCancel(context.Background()) + + // also testing here that we can change server IPs for ongoing operations + testAC.mcfg.serverProvider.On("FindLANServer").Once().Return(&metadata.Server{ + Addr: &net.TCPAddr{IP: net.IPv4(198, 18, 23, 2), Port: 8300}, + }) + + // after sending the notification for the cert update another InitialConfiguration RPC + // will be made to pull down the latest configuration. So we need to set up the response + // for the second RPC + populateResponse := func(args mock.Arguments) { + resp, ok := args.Get(5).(*structs.SignedResponse) + require.True(t, ok) + *resp = structs.SignedResponse{ + VerifyServerHostname: true, + ConnectCARoots: secondRoots, + IssuedCert: *thirdCert, + ManualCARoots: testAC.extraCerts, + } + + fallbackCancel() + } + + expectedRequest := structs.CASignRequest{ + WriteRequest: structs.WriteRequest{Token: testAC.originalToken}, + Datacenter: "dc1", + // TODO (autoconf) Maybe in the future we should populate a CSR + // and do some manual parsing/verification of the contents. The + // bits not having to do with the signing key such as the requested + // SANs and CN. For now though the mockDirectRPC type will empty + // the CSR so we have to pass in an empty string to the expectation. + CSR: "", + } + + // the fallback routine to perform auto-encrypt again will need to grab this + testAC.mcfg.tokens.On("AgentToken").Return(testAC.originalToken).Once() + + testAC.mcfg.directRPC.On( + "RPC", + "dc1", + "autoconf", + &net.TCPAddr{IP: net.IPv4(198, 18, 23, 2), Port: 8300}, + "AutoEncrypt.Sign", + &expectedRequest, + &structs.SignedResponse{}).Return(nil).Run(populateResponse).Once() + + testAC.mcfg.expectInitialTLS(t, "autoconf", "dc1", testAC.originalToken, secondCA, &secondRoots, thirdCert, testAC.extraCerts) + + // after the second RPC we now will use the new certs validity period in the next run loop iteration + testAC.mcfg.tlsCfg.On("AutoEncryptCertNotAfter").Return(time.Now().Add(10 * time.Minute)).Once() + + // now that all the mocks are set up we can trigger the whole thing by sending the second expired cert + // as a cache update event. + req := cachetype.ConnectCALeafRequest{ + Datacenter: "dc1", + Agent: "autoconf", + Token: testAC.originalToken, + DNSSAN: defaultDNSSANs, + IPSAN: defaultIPSANs, + } + require.True(t, testAC.mcfg.cache.sendNotification(context.Background(), req.CacheInfo().Key, cache.UpdateEvent{ + CorrelationID: leafWatchID, + Result: secondCert, + Meta: cache.ResultMeta{ + Index: secondCert.ModifyIndex, + }, + })) + + // wait for the TLS certificates to get updated + require.True(t, waitForChans(100*time.Millisecond, updatedCtx.Done()), "TLS certificates were not updated within the alotted time") + + // now wait for the fallback routine to be invoked + require.True(t, waitForChans(100*time.Millisecond, fallbackCtx.Done()), "fallback routines did not get invoked within the alotted time") +} diff --git a/agent/auto-config/config.go b/agent/auto-config/config.go index e6d729f4d..c812cae6a 100644 --- a/agent/auto-config/config.go +++ b/agent/auto-config/config.go @@ -3,9 +3,12 @@ package autoconf import ( "context" "net" + "time" + "github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/config" - "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/agent/metadata" + "github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/lib" "github.com/hashicorp/go-hclog" ) @@ -18,12 +21,35 @@ type DirectRPC interface { RPC(dc string, node string, addr net.Addr, method string, args interface{}, reply interface{}) error } -// CertMonitor is the interface that needs to be satisfied for AutoConfig to be able to -// setup monitoring of the Connect TLS certificate after we first get it. -type CertMonitor interface { - Update(*structs.SignedResponse) error - Start(context.Context) (<-chan struct{}, error) - Stop() bool +// Cache is an interface to represent the methods of the +// agent/cache.Cache struct that we care about +type Cache interface { + Notify(ctx context.Context, t string, r cache.Request, correlationID string, ch chan<- cache.UpdateEvent) error + Prepopulate(t string, result cache.FetchResult, dc string, token string, key string) error +} + +// ServerProvider is an interface that can be used to find one server in the local DC known to +// the agent via Gossip +type ServerProvider interface { + FindLANServer() *metadata.Server +} + +// TLSConfigurator is an interface of the methods on the tlsutil.Configurator that we will require at +// runtime. +type TLSConfigurator interface { + UpdateAutoTLS(manualCAPEMs, connectCAPEMs []string, pub, priv string, verifyServerHostname bool) error + UpdateAutoTLSCA([]string) error + UpdateAutoTLSCert(pub, priv string) error + AutoEncryptCertNotAfter() time.Time + AutoEncryptCertExpired() bool +} + +// TokenStore is an interface of the methods we will need to use from the token.Store. +type TokenStore interface { + AgentToken() string + UpdateAgentToken(secret string, source token.TokenSource) bool + Notify(kind token.TokenKind) token.Notifier + StopNotify(notifier token.Notifier) } // Config contains all the tunables for AutoConfig @@ -37,6 +63,10 @@ type Config struct { // configuration. Setting this field is required. DirectRPC DirectRPC + // ServerProvider is the interfaced to be used by AutoConfig to find any + // known servers during fallback operations. + ServerProvider ServerProvider + // Waiter is a RetryWaiter to be used during retrieval of the // initial configuration. When a round of requests fails we will // wait and eventually make another round of requests (1 round @@ -49,14 +79,28 @@ type Config struct { // having the test take minutes/hours to complete. Waiter *lib.RetryWaiter - // CertMonitor is the Connect TLS Certificate Monitor to be used for ongoing - // certificate renewals and connect CA roots updates. This field is not - // strictly required but if not provided the TLS certificates retrieved - // through by the AutoConfig.InitialConfiguration RPC will not be used - // or renewed. - CertMonitor CertMonitor - // Loader merges source with the existing FileSources and returns the complete // RuntimeConfig. Loader func(source config.Source) (cfg *config.RuntimeConfig, warnings []string, err error) + + // TLSConfigurator is the shared TLS Configurator. AutoConfig will update the + // auto encrypt/auto config certs as they are renewed. + TLSConfigurator TLSConfigurator + + // Cache is an object implementing our Cache interface. The Cache + // used at runtime must be able to handle Roots and Leaf Cert watches + Cache Cache + + // FallbackLeeway is the amount of time after certificate expiration before + // invoking the fallback routine. If not set this will default to 10s. + FallbackLeeway time.Duration + + // FallbackRetry is the duration between Fallback invocations when the configured + // fallback routine returns an error. If not set this will default to 1m. + FallbackRetry time.Duration + + // Tokens is the shared token store. It is used to retrieve the current + // agent token as well as getting notifications when that token is updated. + // This field is required. + Tokens TokenStore } diff --git a/agent/auto-config/config_translate.go b/agent/auto-config/config_translate.go index b7d04d5f6..ba8f940d1 100644 --- a/agent/auto-config/config_translate.go +++ b/agent/auto-config/config_translate.go @@ -22,9 +22,9 @@ import ( // package cannot import the agent/config package without running into import cycles. func translateConfig(c *pbconfig.Config) config.Config { result := config.Config{ - Datacenter: &c.Datacenter, - PrimaryDatacenter: &c.PrimaryDatacenter, - NodeName: &c.NodeName, + Datacenter: stringPtrOrNil(c.Datacenter), + PrimaryDatacenter: stringPtrOrNil(c.PrimaryDatacenter), + NodeName: stringPtrOrNil(c.NodeName), // only output the SegmentName in the configuration if its non-empty // this will avoid a warning later when parsing the persisted configuration SegmentName: stringPtrOrNil(c.SegmentName), @@ -42,13 +42,13 @@ func translateConfig(c *pbconfig.Config) config.Config { if a := c.ACL; a != nil { result.ACL = config.ACL{ Enabled: &a.Enabled, - PolicyTTL: &a.PolicyTTL, - RoleTTL: &a.RoleTTL, - TokenTTL: &a.TokenTTL, - DownPolicy: &a.DownPolicy, - DefaultPolicy: &a.DefaultPolicy, + PolicyTTL: stringPtrOrNil(a.PolicyTTL), + RoleTTL: stringPtrOrNil(a.RoleTTL), + TokenTTL: stringPtrOrNil(a.TokenTTL), + DownPolicy: stringPtrOrNil(a.DownPolicy), + DefaultPolicy: stringPtrOrNil(a.DefaultPolicy), EnableKeyListPolicy: &a.EnableKeyListPolicy, - DisabledTTL: &a.DisabledTTL, + DisabledTTL: stringPtrOrNil(a.DisabledTTL), EnableTokenPersistence: &a.EnableTokenPersistence, } @@ -76,7 +76,7 @@ func translateConfig(c *pbconfig.Config) config.Config { result.RetryJoinLAN = g.RetryJoinLAN if e := c.Gossip.Encryption; e != nil { - result.EncryptKey = &e.Key + result.EncryptKey = stringPtrOrNil(e.Key) result.EncryptVerifyIncoming = &e.VerifyIncoming result.EncryptVerifyOutgoing = &e.VerifyOutgoing } diff --git a/agent/auto-config/config_translate_test.go b/agent/auto-config/config_translate_test.go index 18a4270a8..fa6d8febf 100644 --- a/agent/auto-config/config_translate_test.go +++ b/agent/auto-config/config_translate_test.go @@ -1,10 +1,13 @@ package autoconf import ( + "fmt" "testing" "github.com/hashicorp/consul/agent/config" + "github.com/hashicorp/consul/agent/structs" pbconfig "github.com/hashicorp/consul/proto/pbconfig" + "github.com/hashicorp/consul/proto/pbconnect" "github.com/stretchr/testify/require" ) @@ -16,6 +19,38 @@ func boolPointer(b bool) *bool { return &b } +func translateCARootToProtobuf(in *structs.CARoot) (*pbconnect.CARoot, error) { + var out pbconnect.CARoot + if err := mapstructureTranslateToProtobuf(in, &out); err != nil { + return nil, fmt.Errorf("Failed to re-encode CA Roots: %w", err) + } + return &out, nil +} + +func mustTranslateCARootToProtobuf(t *testing.T, in *structs.CARoot) *pbconnect.CARoot { + out, err := translateCARootToProtobuf(in) + require.NoError(t, err) + return out +} + +func mustTranslateCARootsToStructs(t *testing.T, in *pbconnect.CARoots) *structs.IndexedCARoots { + out, err := translateCARootsToStructs(in) + require.NoError(t, err) + return out +} + +func mustTranslateCARootsToProtobuf(t *testing.T, in *structs.IndexedCARoots) *pbconnect.CARoots { + out, err := translateCARootsToProtobuf(in) + require.NoError(t, err) + return out +} + +func mustTranslateIssuedCertToProtobuf(t *testing.T, in *structs.IssuedCert) *pbconnect.IssuedCert { + out, err := translateIssuedCertToProtobuf(in) + require.NoError(t, err) + return out +} + func TestTranslateConfig(t *testing.T) { original := pbconfig.Config{ Datacenter: "abc", @@ -119,3 +154,9 @@ func TestTranslateConfig(t *testing.T) { translated := translateConfig(&original) require.Equal(t, expected, translated) } + +func TestCArootsTranslation(t *testing.T) { + _, indexedRoots, _ := testCerts(t, "autoconf", "dc1") + protoRoots := mustTranslateCARootsToProtobuf(t, indexedRoots) + require.Equal(t, indexedRoots, mustTranslateCARootsToStructs(t, protoRoots)) +} diff --git a/agent/auto-config/mock_test.go b/agent/auto-config/mock_test.go new file mode 100644 index 000000000..d828e6a84 --- /dev/null +++ b/agent/auto-config/mock_test.go @@ -0,0 +1,337 @@ +package autoconf + +import ( + "context" + "net" + "sync" + "testing" + "time" + + "github.com/hashicorp/consul/agent/cache" + cachetype "github.com/hashicorp/consul/agent/cache-types" + "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/metadata" + "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/agent/token" + "github.com/hashicorp/consul/proto/pbautoconf" + "github.com/hashicorp/consul/sdk/testutil" + "github.com/stretchr/testify/mock" +) + +type mockDirectRPC struct { + mock.Mock +} + +func newMockDirectRPC(t *testing.T) *mockDirectRPC { + m := mockDirectRPC{} + m.Test(t) + return &m +} + +func (m *mockDirectRPC) RPC(dc string, node string, addr net.Addr, method string, args interface{}, reply interface{}) error { + var retValues mock.Arguments + if method == "AutoConfig.InitialConfiguration" { + req := args.(*pbautoconf.AutoConfigRequest) + csr := req.CSR + req.CSR = "" + retValues = m.Called(dc, node, addr, method, args, reply) + req.CSR = csr + } else if method == "AutoEncrypt.Sign" { + req := args.(*structs.CASignRequest) + csr := req.CSR + req.CSR = "" + retValues = m.Called(dc, node, addr, method, args, reply) + req.CSR = csr + } else { + retValues = m.Called(dc, node, addr, method, args, reply) + } + + return retValues.Error(0) +} + +type mockTLSConfigurator struct { + mock.Mock +} + +func newMockTLSConfigurator(t *testing.T) *mockTLSConfigurator { + m := mockTLSConfigurator{} + m.Test(t) + return &m +} + +func (m *mockTLSConfigurator) UpdateAutoTLS(manualCAPEMs, connectCAPEMs []string, pub, priv string, verifyServerHostname bool) error { + if priv != "" { + priv = "redacted" + } + + ret := m.Called(manualCAPEMs, connectCAPEMs, pub, priv, verifyServerHostname) + return ret.Error(0) +} + +func (m *mockTLSConfigurator) UpdateAutoTLSCA(pems []string) error { + ret := m.Called(pems) + return ret.Error(0) +} +func (m *mockTLSConfigurator) UpdateAutoTLSCert(pub, priv string) error { + if priv != "" { + priv = "redacted" + } + ret := m.Called(pub, priv) + return ret.Error(0) +} +func (m *mockTLSConfigurator) AutoEncryptCertNotAfter() time.Time { + ret := m.Called() + ts, _ := ret.Get(0).(time.Time) + + return ts +} +func (m *mockTLSConfigurator) AutoEncryptCertExpired() bool { + ret := m.Called() + return ret.Bool(0) +} + +type mockServerProvider struct { + mock.Mock +} + +func newMockServerProvider(t *testing.T) *mockServerProvider { + m := mockServerProvider{} + m.Test(t) + return &m +} + +func (m *mockServerProvider) FindLANServer() *metadata.Server { + ret := m.Called() + srv, _ := ret.Get(0).(*metadata.Server) + return srv +} + +type mockWatcher struct { + ch chan<- cache.UpdateEvent + done <-chan struct{} +} + +type mockCache struct { + mock.Mock + + lock sync.Mutex + watchers map[string][]mockWatcher +} + +func newMockCache(t *testing.T) *mockCache { + m := mockCache{ + watchers: make(map[string][]mockWatcher), + } + m.Test(t) + return &m +} + +func (m *mockCache) Notify(ctx context.Context, t string, r cache.Request, correlationID string, ch chan<- cache.UpdateEvent) error { + ret := m.Called(ctx, t, r, correlationID, ch) + + err := ret.Error(0) + if err == nil { + m.lock.Lock() + key := r.CacheInfo().Key + m.watchers[key] = append(m.watchers[key], mockWatcher{ch: ch, done: ctx.Done()}) + m.lock.Unlock() + } + return err +} + +func (m *mockCache) Prepopulate(t string, result cache.FetchResult, dc string, token string, key string) error { + var restore string + cert, ok := result.Value.(*structs.IssuedCert) + if ok { + // we cannot know what the private key is prior to it being injected into the cache. + // therefore redact it here and all mock expectations should take that into account + restore = cert.PrivateKeyPEM + cert.PrivateKeyPEM = "redacted" + } + + ret := m.Called(t, result, dc, token, key) + + if ok && restore != "" { + cert.PrivateKeyPEM = restore + } + return ret.Error(0) +} + +func (m *mockCache) sendNotification(ctx context.Context, key string, u cache.UpdateEvent) bool { + m.lock.Lock() + defer m.lock.Unlock() + + watchers, ok := m.watchers[key] + if !ok || len(m.watchers) < 1 { + return false + } + + var newWatchers []mockWatcher + + for _, watcher := range watchers { + select { + case watcher.ch <- u: + newWatchers = append(newWatchers, watcher) + case <-watcher.done: + // do nothing, this watcher will be removed from the list + case <-ctx.Done(): + // return doesn't matter here really, the test is being cancelled + return true + } + } + + // this removes any already cancelled watches from being sent to + m.watchers[key] = newWatchers + + return true +} + +type mockTokenStore struct { + mock.Mock +} + +func newMockTokenStore(t *testing.T) *mockTokenStore { + m := mockTokenStore{} + m.Test(t) + return &m +} + +func (m *mockTokenStore) AgentToken() string { + ret := m.Called() + return ret.String(0) +} + +func (m *mockTokenStore) UpdateAgentToken(secret string, source token.TokenSource) bool { + return m.Called(secret, source).Bool(0) +} + +func (m *mockTokenStore) Notify(kind token.TokenKind) token.Notifier { + ret := m.Called(kind) + n, _ := ret.Get(0).(token.Notifier) + return n +} + +func (m *mockTokenStore) StopNotify(notifier token.Notifier) { + m.Called(notifier) +} + +type mockedConfig struct { + Config + + directRPC *mockDirectRPC + serverProvider *mockServerProvider + cache *mockCache + tokens *mockTokenStore + tlsCfg *mockTLSConfigurator +} + +func newMockedConfig(t *testing.T) *mockedConfig { + directRPC := newMockDirectRPC(t) + serverProvider := newMockServerProvider(t) + mcache := newMockCache(t) + tokens := newMockTokenStore(t) + tlsCfg := newMockTLSConfigurator(t) + + // I am not sure it is well defined behavior but in testing it + // out it does appear like Cleanup functions can fail tests + // Adding in the mock expectations assertions here saves us + // a bunch of code in the other test functions. + t.Cleanup(func() { + if !t.Failed() { + directRPC.AssertExpectations(t) + serverProvider.AssertExpectations(t) + mcache.AssertExpectations(t) + tokens.AssertExpectations(t) + tlsCfg.AssertExpectations(t) + } + }) + + return &mockedConfig{ + Config: Config{ + DirectRPC: directRPC, + ServerProvider: serverProvider, + Cache: mcache, + Tokens: tokens, + TLSConfigurator: tlsCfg, + Logger: testutil.Logger(t), + }, + directRPC: directRPC, + serverProvider: serverProvider, + cache: mcache, + tokens: tokens, + tlsCfg: tlsCfg, + } +} + +func (m *mockedConfig) expectInitialTLS(t *testing.T, agentName, datacenter, token string, ca *structs.CARoot, indexedRoots *structs.IndexedCARoots, cert *structs.IssuedCert, extraCerts []string) { + var pems []string + for _, root := range indexedRoots.Roots { + pems = append(pems, root.RootCert) + } + + // we should update the TLS configurator with the proper certs + m.tlsCfg.On("UpdateAutoTLS", + extraCerts, + pems, + cert.CertPEM, + // auto-config handles the CSR and Key so our tests don't have + // a way to know that the key is correct or not. We do replace + // a non empty PEM with "redacted" so we can ensure that some + // certificate is being sent + "redacted", + true, + ).Return(nil).Once() + + rootRes := cache.FetchResult{Value: indexedRoots, Index: indexedRoots.QueryMeta.Index} + rootsReq := structs.DCSpecificRequest{Datacenter: datacenter} + + // we should prepopulate the cache with the CA roots + m.cache.On("Prepopulate", + cachetype.ConnectCARootName, + rootRes, + datacenter, + "", + rootsReq.CacheInfo().Key, + ).Return(nil).Once() + + leafReq := cachetype.ConnectCALeafRequest{ + Token: token, + Agent: agentName, + Datacenter: datacenter, + } + + // copy the cert and redact the private key for the mock expectation + // the actual private key will not correspond to the cert but thats + // because AutoConfig is generated a key/csr internally and sending that + // on up with the request. + copy := *cert + copy.PrivateKeyPEM = "redacted" + leafRes := cache.FetchResult{ + Value: ©, + Index: copy.RaftIndex.ModifyIndex, + State: cachetype.ConnectCALeafSuccess(ca.SigningKeyID), + } + + // we should prepopulate the cache with the agents cert + m.cache.On("Prepopulate", + cachetype.ConnectCALeafName, + leafRes, + datacenter, + token, + leafReq.Key(), + ).Return(nil).Once() + + // when prepopulating the cert in the cache we grab the token so + // we should expec that here + m.tokens.On("AgentToken").Return(token).Once() +} + +func (m *mockedConfig) setupInitialTLS(t *testing.T, agentName, datacenter, token string) (*structs.IndexedCARoots, *structs.IssuedCert, []string) { + ca, indexedRoots, cert := testCerts(t, agentName, datacenter) + + ca2 := connect.TestCA(t, nil) + extraCerts := []string{ca2.RootCert} + + m.expectInitialTLS(t, agentName, datacenter, token, ca, indexedRoots, cert, extraCerts) + return indexedRoots, cert, extraCerts +} diff --git a/agent/auto-config/persist.go b/agent/auto-config/persist.go new file mode 100644 index 000000000..9f94f445c --- /dev/null +++ b/agent/auto-config/persist.go @@ -0,0 +1,86 @@ +package autoconf + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strings" + + "github.com/golang/protobuf/jsonpb" + "github.com/hashicorp/consul/proto/pbautoconf" +) + +const ( + // autoConfigFileName is the name of the file that the agent auto-config settings are + // stored in within the data directory + autoConfigFileName = "auto-config.json" +) + +var ( + pbMarshaler = &jsonpb.Marshaler{ + OrigName: false, + EnumsAsInts: false, + Indent: " ", + EmitDefaults: true, + } + + pbUnmarshaler = &jsonpb.Unmarshaler{ + AllowUnknownFields: false, + } +) + +func (ac *AutoConfig) readPersistedAutoConfig() (*pbautoconf.AutoConfigResponse, error) { + if ac.config.DataDir == "" { + // no data directory means we don't have anything to potentially load + return nil, nil + } + + path := filepath.Join(ac.config.DataDir, autoConfigFileName) + ac.logger.Debug("attempting to restore any persisted configuration", "path", path) + + content, err := ioutil.ReadFile(path) + if err == nil { + rdr := strings.NewReader(string(content)) + + var resp pbautoconf.AutoConfigResponse + if err := pbUnmarshaler.Unmarshal(rdr, &resp); err != nil { + return nil, fmt.Errorf("failed to decode persisted auto-config data: %w", err) + } + + ac.logger.Info("read persisted configuration", "path", path) + return &resp, nil + } + + if !os.IsNotExist(err) { + return nil, fmt.Errorf("failed to load %s: %w", path, err) + } + + // ignore non-existence errors as that is an indicator that we haven't + // performed the auto configuration before + return nil, nil +} + +func (ac *AutoConfig) persistAutoConfig(resp *pbautoconf.AutoConfigResponse) error { + // now that we know the configuration is generally fine including TLS certs go ahead and persist it to disk. + if ac.config.DataDir == "" { + ac.logger.Debug("not persisting auto-config settings because there is no data directory") + return nil + } + + serialized, err := pbMarshaler.MarshalToString(resp) + if err != nil { + return fmt.Errorf("failed to encode auto-config response as JSON: %w", err) + } + + path := filepath.Join(ac.config.DataDir, autoConfigFileName) + + err = ioutil.WriteFile(path, []byte(serialized), 0660) + if err != nil { + return fmt.Errorf("failed to write auto-config configurations: %w", err) + } + + ac.logger.Debug("auto-config settings were persisted to disk") + + return nil +} diff --git a/agent/auto-config/run.go b/agent/auto-config/run.go new file mode 100644 index 000000000..6155dc6be --- /dev/null +++ b/agent/auto-config/run.go @@ -0,0 +1,192 @@ +package autoconf + +import ( + "context" + "fmt" + "time" + + "github.com/hashicorp/consul/agent/cache" + "github.com/hashicorp/consul/agent/structs" +) + +// handleCacheEvent is used to handle event notifications from the cache for the roots +// or leaf cert watches. +func (ac *AutoConfig) handleCacheEvent(u cache.UpdateEvent) error { + switch u.CorrelationID { + case rootsWatchID: + ac.logger.Debug("roots watch fired - updating CA certificates") + if u.Err != nil { + return fmt.Errorf("root watch returned an error: %w", u.Err) + } + + roots, ok := u.Result.(*structs.IndexedCARoots) + if !ok { + return fmt.Errorf("invalid type for roots watch response: %T", u.Result) + } + + return ac.updateCARoots(roots) + case leafWatchID: + ac.logger.Debug("leaf certificate watch fired - updating TLS certificate") + if u.Err != nil { + return fmt.Errorf("leaf watch returned an error: %w", u.Err) + } + + leaf, ok := u.Result.(*structs.IssuedCert) + if !ok { + return fmt.Errorf("invalid type for agent leaf cert watch response: %T", u.Result) + } + + return ac.updateLeafCert(leaf) + } + + return nil +} + +// handleTokenUpdate is used when a notification about the agent token being updated +// is received and various watches need cancelling/restarting to use the new token. +func (ac *AutoConfig) handleTokenUpdate(ctx context.Context) error { + ac.logger.Debug("Agent token updated - resetting watches") + + // TODO (autoencrypt) Prepopulate the cache with the new token with + // the existing cache entry with the old token. The certificate doesn't + // need to change just because the token has. However there isn't a + // good way to make that happen and this behavior is benign enough + // that I am going to push off implementing it. + + // the agent token has been updated so we must update our leaf cert watch. + // this cancels the current watches before setting up new ones + ac.cancelWatches() + + // recreate the chan for cache updates. This is a precautionary measure to ensure + // that we don't accidentally get notified for the new watches being setup before + // a blocking query in the cache returns and sends data to the old chan. In theory + // the code in agent/cache/watch.go should prevent this where we specifically check + // for context cancellation prior to sending the event. However we could cancel + // it after that check and finish setting up the new watches before getting the old + // events. Both the go routine scheduler and the OS thread scheduler would have to + // be acting up for this to happen. Regardless the way to ensure we don't get events + // for the old watches is to simply replace the chan we are expecting them from. + close(ac.cacheUpdates) + ac.cacheUpdates = make(chan cache.UpdateEvent, 10) + + // restart watches - this will be done with the correct token + cancelWatches, err := ac.setupCertificateCacheWatches(ctx) + if err != nil { + return fmt.Errorf("failed to restart watches after agent token update: %w", err) + } + ac.cancelWatches = cancelWatches + return nil +} + +// handleFallback is used when the current TLS certificate has expired and the normal +// updating mechanisms have failed to renew it quickly enough. This function will +// use the configured fallback mechanism to retrieve a new cert and start monitoring +// that one. +func (ac *AutoConfig) handleFallback(ctx context.Context) error { + ac.logger.Warn("agent's client certificate has expired") + // Background because the context is mainly useful when the agent is first starting up. + switch { + case ac.config.AutoConfig.Enabled: + resp, err := ac.getInitialConfiguration(ctx) + if err != nil { + return fmt.Errorf("error while retrieving new agent certificates via auto-config: %w", err) + } + + return ac.recordInitialConfiguration(resp) + case ac.config.AutoEncryptTLS: + reply, err := ac.autoEncryptInitialCerts(ctx) + if err != nil { + return fmt.Errorf("error while retrieving new agent certificate via auto-encrypt: %w", err) + } + return ac.setInitialTLSCertificates(reply) + default: + return fmt.Errorf("logic error: either auto-encrypt or auto-config must be enabled") + } +} + +// run is the private method to be spawn by the Start method for +// executing the main monitoring loop. +func (ac *AutoConfig) run(ctx context.Context, exit chan struct{}) { + // The fallbackTimer is used to notify AFTER the agents + // leaf certificate has expired and where we need + // to fall back to the less secure RPC endpoint just like + // if the agent was starting up new. + // + // Check 10sec (fallback leeway duration) after cert + // expires. The agent cache should be handling the expiration + // and renew it before then. + // + // If there is no cert, AutoEncryptCertNotAfter returns + // a value in the past which immediately triggers the + // renew, but this case shouldn't happen because at + // this point, auto_encrypt was just being setup + // successfully. + calcFallbackInterval := func() time.Duration { + certExpiry := ac.acConfig.TLSConfigurator.AutoEncryptCertNotAfter() + return certExpiry.Add(ac.acConfig.FallbackLeeway).Sub(time.Now()) + } + fallbackTimer := time.NewTimer(calcFallbackInterval()) + + // cleanup for once we are stopped + defer func() { + // cancel the go routines performing the cache watches + ac.cancelWatches() + // ensure we don't leak the timers go routine + fallbackTimer.Stop() + // stop receiving notifications for token updates + ac.acConfig.Tokens.StopNotify(ac.tokenUpdates) + + ac.logger.Debug("auto-config has been stopped") + + ac.Lock() + ac.cancel = nil + ac.running = false + // this should be the final cleanup task as its what notifies + // the rest of the world that this go routine has exited. + close(exit) + ac.Unlock() + }() + + for { + select { + case <-ctx.Done(): + ac.logger.Debug("stopping auto-config") + return + case <-ac.tokenUpdates.Ch: + ac.logger.Debug("handling a token update event") + + if err := ac.handleTokenUpdate(ctx); err != nil { + ac.logger.Error("error in handling token update event", "error", err) + } + case u := <-ac.cacheUpdates: + ac.logger.Debug("handling a cache update event", "correlation_id", u.CorrelationID) + + if err := ac.handleCacheEvent(u); err != nil { + ac.logger.Error("error in handling cache update event", "error", err) + } + + // reset the fallback timer as the certificate may have been updated + fallbackTimer.Stop() + fallbackTimer = time.NewTimer(calcFallbackInterval()) + case <-fallbackTimer.C: + // This is a safety net in case the cert doesn't get renewed + // in time. The agent would be stuck in that case because the watches + // never use the AutoEncrypt.Sign endpoint. + + // check auto encrypt client cert expiration + if ac.acConfig.TLSConfigurator.AutoEncryptCertExpired() { + if err := ac.handleFallback(ctx); err != nil { + ac.logger.Error("error when handling a certificate expiry event", "error", err) + fallbackTimer = time.NewTimer(ac.acConfig.FallbackRetry) + } else { + fallbackTimer = time.NewTimer(calcFallbackInterval()) + } + } else { + // this shouldn't be possible. We calculate the timer duration to be the certificate + // expiration time + some leeway (10s default). So whenever we get here the certificate + // should be expired. Regardless its probably worth resetting the timer. + fallbackTimer = time.NewTimer(calcFallbackInterval()) + } + } + } +} diff --git a/agent/auto-config/server_addr.go b/agent/auto-config/server_addr.go new file mode 100644 index 000000000..98af4ae55 --- /dev/null +++ b/agent/auto-config/server_addr.go @@ -0,0 +1,111 @@ +package autoconf + +import ( + "fmt" + "net" + "strconv" + "strings" + + "github.com/hashicorp/consul/lib" + "github.com/hashicorp/go-discover" + discoverk8s "github.com/hashicorp/go-discover/provider/k8s" + + "github.com/hashicorp/go-hclog" +) + +func (ac *AutoConfig) discoverServers(servers []string) ([]string, error) { + providers := make(map[string]discover.Provider) + for k, v := range discover.Providers { + providers[k] = v + } + providers["k8s"] = &discoverk8s.Provider{} + + disco, err := discover.New( + discover.WithUserAgent(lib.UserAgent()), + discover.WithProviders(providers), + ) + + if err != nil { + return nil, fmt.Errorf("Failed to create go-discover resolver: %w", err) + } + + var addrs []string + for _, addr := range servers { + switch { + case strings.Contains(addr, "provider="): + resolved, err := disco.Addrs(addr, ac.logger.StandardLogger(&hclog.StandardLoggerOptions{InferLevels: true})) + if err != nil { + ac.logger.Error("failed to resolve go-discover auto-config servers", "configuration", addr, "err", err) + continue + } + + addrs = append(addrs, resolved...) + ac.logger.Debug("discovered auto-config servers", "servers", resolved) + default: + addrs = append(addrs, addr) + } + } + + return addrs, nil +} + +// autoConfigHosts is responsible for taking the list of server addresses +// and resolving any go-discover provider invocations. It will then return +// a list of hosts. These might be hostnames and is expected that DNS resolution +// may be performed after this function runs. Additionally these may contain +// ports so SplitHostPort could also be necessary. +func (ac *AutoConfig) autoConfigHosts() ([]string, error) { + // use servers known to gossip if there are any + if ac.acConfig.ServerProvider != nil { + if srv := ac.acConfig.ServerProvider.FindLANServer(); srv != nil { + return []string{srv.Addr.String()}, nil + } + } + + addrs, err := ac.discoverServers(ac.config.AutoConfig.ServerAddresses) + if err != nil { + return nil, err + } + + if len(addrs) == 0 { + return nil, fmt.Errorf("no auto-config server addresses available for use") + } + + return addrs, nil +} + +// resolveHost will take a single host string and convert it to a list of TCPAddrs +// This will process any port in the input as well as looking up the hostname using +// normal DNS resolution. +func (ac *AutoConfig) resolveHost(hostPort string) []net.TCPAddr { + port := ac.config.ServerPort + host, portStr, err := net.SplitHostPort(hostPort) + if err != nil { + if strings.Contains(err.Error(), "missing port in address") { + host = hostPort + } else { + ac.logger.Warn("error splitting host address into IP and port", "address", hostPort, "error", err) + return nil + } + } else { + port, err = strconv.Atoi(portStr) + if err != nil { + ac.logger.Warn("Parsed port is not an integer", "port", portStr, "error", err) + return nil + } + } + + // resolve the host to a list of IPs + ips, err := net.LookupIP(host) + if err != nil { + ac.logger.Warn("IP resolution failed", "host", host, "error", err) + return nil + } + + var addrs []net.TCPAddr + for _, ip := range ips { + addrs = append(addrs, net.TCPAddr{IP: ip, Port: port}) + } + + return addrs +} diff --git a/agent/auto-config/tls.go b/agent/auto-config/tls.go new file mode 100644 index 000000000..380c9f9f8 --- /dev/null +++ b/agent/auto-config/tls.go @@ -0,0 +1,280 @@ +package autoconf + +import ( + "context" + "fmt" + "net" + + "github.com/hashicorp/consul/agent/cache" + cachetype "github.com/hashicorp/consul/agent/cache-types" + "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/proto/pbautoconf" +) + +const ( + // ID of the roots watch + rootsWatchID = "roots" + + // ID of the leaf watch + leafWatchID = "leaf" + + unknownTrustDomain = "unknown" +) + +var ( + defaultDNSSANs = []string{"localhost"} + + defaultIPSANs = []net.IP{{127, 0, 0, 1}, net.ParseIP("::1")} +) + +func extractPEMs(roots *structs.IndexedCARoots) []string { + var pems []string + for _, root := range roots.Roots { + pems = append(pems, root.RootCert) + } + return pems +} + +// updateTLSFromResponse will update the TLS certificate and roots in the shared +// TLS configurator. +func (ac *AutoConfig) updateTLSFromResponse(resp *pbautoconf.AutoConfigResponse) error { + var pems []string + for _, root := range resp.GetCARoots().GetRoots() { + pems = append(pems, root.RootCert) + } + + err := ac.acConfig.TLSConfigurator.UpdateAutoTLS( + resp.ExtraCACertificates, + pems, + resp.Certificate.GetCertPEM(), + resp.Certificate.GetPrivateKeyPEM(), + resp.Config.GetTLS().GetVerifyServerHostname(), + ) + + if err != nil { + return fmt.Errorf("Failed to update the TLS configurator with new certificates: %w", err) + } + + return nil +} + +func (ac *AutoConfig) setInitialTLSCertificates(certs *structs.SignedResponse) error { + if certs == nil { + return nil + } + + if err := ac.populateCertificateCache(certs); err != nil { + return fmt.Errorf("error populating cache with certificates: %w", err) + } + + connectCAPems := extractPEMs(&certs.ConnectCARoots) + + err := ac.acConfig.TLSConfigurator.UpdateAutoTLS( + certs.ManualCARoots, + connectCAPems, + certs.IssuedCert.CertPEM, + certs.IssuedCert.PrivateKeyPEM, + certs.VerifyServerHostname, + ) + + if err != nil { + return fmt.Errorf("error updating TLS configurator with certificates: %w", err) + } + + return nil +} + +func (ac *AutoConfig) populateCertificateCache(certs *structs.SignedResponse) error { + cert, err := connect.ParseCert(certs.IssuedCert.CertPEM) + if err != nil { + return fmt.Errorf("Failed to parse certificate: %w", err) + } + + // prepolutate roots cache + rootRes := cache.FetchResult{Value: &certs.ConnectCARoots, Index: certs.ConnectCARoots.QueryMeta.Index} + rootsReq := ac.caRootsRequest() + // getting the roots doesn't require a token so in order to potentially share the cache with another + if err := ac.acConfig.Cache.Prepopulate(cachetype.ConnectCARootName, rootRes, ac.config.Datacenter, "", rootsReq.CacheInfo().Key); err != nil { + return err + } + + leafReq := ac.leafCertRequest() + + // prepolutate leaf cache + certRes := cache.FetchResult{ + Value: &certs.IssuedCert, + Index: certs.IssuedCert.RaftIndex.ModifyIndex, + State: cachetype.ConnectCALeafSuccess(connect.EncodeSigningKeyID(cert.AuthorityKeyId)), + } + if err := ac.acConfig.Cache.Prepopulate(cachetype.ConnectCALeafName, certRes, leafReq.Datacenter, leafReq.Token, leafReq.Key()); err != nil { + return err + } + + return nil +} + +func (ac *AutoConfig) setupCertificateCacheWatches(ctx context.Context) (context.CancelFunc, error) { + notificationCtx, cancel := context.WithCancel(ctx) + + rootsReq := ac.caRootsRequest() + err := ac.acConfig.Cache.Notify(notificationCtx, cachetype.ConnectCARootName, &rootsReq, rootsWatchID, ac.cacheUpdates) + if err != nil { + cancel() + return nil, err + } + + leafReq := ac.leafCertRequest() + err = ac.acConfig.Cache.Notify(notificationCtx, cachetype.ConnectCALeafName, &leafReq, leafWatchID, ac.cacheUpdates) + if err != nil { + cancel() + return nil, err + } + + return cancel, nil +} + +func (ac *AutoConfig) updateCARoots(roots *structs.IndexedCARoots) error { + switch { + case ac.config.AutoConfig.Enabled: + ac.Lock() + defer ac.Unlock() + var err error + ac.autoConfigResponse.CARoots, err = translateCARootsToProtobuf(roots) + if err != nil { + return err + } + + if err := ac.updateTLSFromResponse(ac.autoConfigResponse); err != nil { + return err + } + return ac.persistAutoConfig(ac.autoConfigResponse) + case ac.config.AutoEncryptTLS: + pems := extractPEMs(roots) + + if err := ac.acConfig.TLSConfigurator.UpdateAutoTLSCA(pems); err != nil { + return fmt.Errorf("failed to update Connect CA certificates: %w", err) + } + return nil + default: + return nil + } +} + +func (ac *AutoConfig) updateLeafCert(cert *structs.IssuedCert) error { + switch { + case ac.config.AutoConfig.Enabled: + ac.Lock() + defer ac.Unlock() + var err error + ac.autoConfigResponse.Certificate, err = translateIssuedCertToProtobuf(cert) + if err != nil { + return err + } + + if err := ac.updateTLSFromResponse(ac.autoConfigResponse); err != nil { + return err + } + return ac.persistAutoConfig(ac.autoConfigResponse) + case ac.config.AutoEncryptTLS: + if err := ac.acConfig.TLSConfigurator.UpdateAutoTLSCert(cert.CertPEM, cert.PrivateKeyPEM); err != nil { + return fmt.Errorf("failed to update the agent leaf cert: %w", err) + } + return nil + default: + return nil + } +} + +func (ac *AutoConfig) caRootsRequest() structs.DCSpecificRequest { + return structs.DCSpecificRequest{Datacenter: ac.config.Datacenter} +} + +func (ac *AutoConfig) leafCertRequest() cachetype.ConnectCALeafRequest { + return cachetype.ConnectCALeafRequest{ + Datacenter: ac.config.Datacenter, + Agent: ac.config.NodeName, + DNSSAN: ac.getDNSSANs(), + IPSAN: ac.getIPSANs(), + Token: ac.acConfig.Tokens.AgentToken(), + } +} + +// generateCSR will generate a CSR for an Agent certificate. This should +// be sent along with the AutoConfig.InitialConfiguration RPC or the +// AutoEncrypt.Sign RPC. The generated CSR does NOT have a real trust domain +// as when generating this we do not yet have the CA roots. The server will +// update the trust domain for us though. +func (ac *AutoConfig) generateCSR() (csr string, key string, err error) { + // We don't provide the correct host here, because we don't know any + // better at this point. Apart from the domain, we would need the + // ClusterID, which we don't have. This is why we go with + // unknownTrustDomain the first time. Subsequent CSRs will have the + // correct TrustDomain. + id := &connect.SpiffeIDAgent{ + // will be replaced + Host: unknownTrustDomain, + Datacenter: ac.config.Datacenter, + Agent: ac.config.NodeName, + } + + caConfig, err := ac.config.ConnectCAConfiguration() + if err != nil { + return "", "", fmt.Errorf("Cannot generate CSR: %w", err) + } + + conf, err := caConfig.GetCommonConfig() + if err != nil { + return "", "", fmt.Errorf("Failed to load common CA configuration: %w", err) + } + + if conf.PrivateKeyType == "" { + conf.PrivateKeyType = connect.DefaultPrivateKeyType + } + if conf.PrivateKeyBits == 0 { + conf.PrivateKeyBits = connect.DefaultPrivateKeyBits + } + + // Create a new private key + pk, pkPEM, err := connect.GeneratePrivateKeyWithConfig(conf.PrivateKeyType, conf.PrivateKeyBits) + if err != nil { + return "", "", fmt.Errorf("Failed to generate private key: %w", err) + } + + dnsNames := ac.getDNSSANs() + ipAddresses := ac.getIPSANs() + + // Create a CSR. + // + // The Common Name includes the dummy trust domain for now but Server will + // override this when it is signed anyway so it's OK. + cn := connect.AgentCN(ac.config.NodeName, unknownTrustDomain) + csr, err = connect.CreateCSR(id, cn, pk, dnsNames, ipAddresses) + if err != nil { + return "", "", err + } + + return csr, pkPEM, nil +} + +func (ac *AutoConfig) getDNSSANs() []string { + sans := defaultDNSSANs + switch { + case ac.config.AutoConfig.Enabled: + sans = append(sans, ac.config.AutoConfig.DNSSANs...) + case ac.config.AutoEncryptTLS: + sans = append(sans, ac.config.AutoEncryptDNSSAN...) + } + return sans +} + +func (ac *AutoConfig) getIPSANs() []net.IP { + sans := defaultIPSANs + switch { + case ac.config.AutoConfig.Enabled: + sans = append(sans, ac.config.AutoConfig.IPSANs...) + case ac.config.AutoEncryptTLS: + sans = append(sans, ac.config.AutoEncryptIPSAN...) + } + return sans +} diff --git a/agent/auto-config/tls_test.go b/agent/auto-config/tls_test.go new file mode 100644 index 000000000..400d7be0d --- /dev/null +++ b/agent/auto-config/tls_test.go @@ -0,0 +1,56 @@ +package autoconf + +import ( + "testing" + "time" + + "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/structs" + "github.com/stretchr/testify/require" +) + +func newLeaf(t *testing.T, agentName, datacenter string, ca *structs.CARoot, idx uint64, expiration time.Duration) *structs.IssuedCert { + t.Helper() + + pub, priv, err := connect.TestAgentLeaf(t, agentName, datacenter, ca, expiration) + require.NoError(t, err) + cert, err := connect.ParseCert(pub) + require.NoError(t, err) + + spiffeID, err := connect.ParseCertURI(cert.URIs[0]) + require.NoError(t, err) + + agentID, ok := spiffeID.(*connect.SpiffeIDAgent) + require.True(t, ok, "certificate doesn't have an agent leaf cert URI") + + return &structs.IssuedCert{ + SerialNumber: cert.SerialNumber.String(), + CertPEM: pub, + PrivateKeyPEM: priv, + ValidAfter: cert.NotBefore, + ValidBefore: cert.NotAfter, + Agent: agentID.Agent, + AgentURI: agentID.URI().String(), + EnterpriseMeta: *structs.DefaultEnterpriseMeta(), + RaftIndex: structs.RaftIndex{ + CreateIndex: idx, + ModifyIndex: idx, + }, + } +} + +func testCerts(t *testing.T, agentName, datacenter string) (*structs.CARoot, *structs.IndexedCARoots, *structs.IssuedCert) { + ca := connect.TestCA(t, nil) + ca.IntermediateCerts = make([]string, 0) + cert := newLeaf(t, agentName, datacenter, ca, 1, 10*time.Minute) + indexedRoots := structs.IndexedCARoots{ + ActiveRootID: ca.ID, + TrustDomain: connect.TestClusterID, + Roots: []*structs.CARoot{ + ca, + }, + QueryMeta: structs.QueryMeta{Index: 1}, + } + + return ca, &indexedRoots, cert +} diff --git a/agent/cert-monitor/cert_monitor.go b/agent/cert-monitor/cert_monitor.go deleted file mode 100644 index 0ad50e8a1..000000000 --- a/agent/cert-monitor/cert_monitor.go +++ /dev/null @@ -1,505 +0,0 @@ -package certmon - -import ( - "context" - "fmt" - "io/ioutil" - "sync" - "time" - - "github.com/hashicorp/consul/agent/cache" - cachetype "github.com/hashicorp/consul/agent/cache-types" - "github.com/hashicorp/consul/agent/connect" - "github.com/hashicorp/consul/agent/structs" - "github.com/hashicorp/consul/agent/token" - "github.com/hashicorp/consul/tlsutil" - "github.com/hashicorp/go-hclog" -) - -const ( - // ID of the roots watch - rootsWatchID = "roots" - - // ID of the leaf watch - leafWatchID = "leaf" -) - -// Cache is an interface to represent the methods of the -// agent/cache.Cache struct that we care about -type Cache interface { - Notify(ctx context.Context, t string, r cache.Request, correlationID string, ch chan<- cache.UpdateEvent) error - Prepopulate(t string, result cache.FetchResult, dc string, token string, key string) error -} - -// CertMonitor will setup the proper watches to ensure that -// the Agent's Connect TLS certificate remains up to date -type CertMonitor struct { - logger hclog.Logger - cache Cache - tlsConfigurator *tlsutil.Configurator - tokens *token.Store - leafReq cachetype.ConnectCALeafRequest - rootsReq structs.DCSpecificRequest - persist PersistFunc - fallback FallbackFunc - fallbackLeeway time.Duration - fallbackRetry time.Duration - - l sync.Mutex - running bool - // cancel is used to cancel the entire CertMonitor - // go routine. This is the main field protected - // by the mutex as it being non-nil indicates that - // the go routine has been started and is stoppable. - // note that it doesn't indcate that the go routine - // is currently running. - cancel context.CancelFunc - - // cancelWatches is used to cancel the existing - // cache watches. This is mainly only necessary - // when the Agent token changes - cancelWatches context.CancelFunc - - // cacheUpdates is the chan used to have the cache - // send us back events - cacheUpdates chan cache.UpdateEvent - // tokenUpdates is the struct used to receive - // events from the token store when the Agent - // token is updated. - tokenUpdates token.Notifier - - // this is used to keep a local copy of the certs - // keys and ca certs. It will be used to persist - // all of the local state at once. - certs structs.SignedResponse -} - -// New creates a new CertMonitor for automatically rotating -// an Agent's Connect Certificate -func New(config *Config) (*CertMonitor, error) { - logger := config.Logger - if logger == nil { - logger = hclog.New(&hclog.LoggerOptions{ - Level: 0, - Output: ioutil.Discard, - }) - } - - if config.FallbackLeeway == 0 { - config.FallbackLeeway = 10 * time.Second - } - if config.FallbackRetry == 0 { - config.FallbackRetry = time.Minute - } - - if config.Cache == nil { - return nil, fmt.Errorf("CertMonitor creation requires a Cache") - } - - if config.TLSConfigurator == nil { - return nil, fmt.Errorf("CertMonitor creation requires a TLS Configurator") - } - - if config.Fallback == nil { - return nil, fmt.Errorf("CertMonitor creation requires specifying a FallbackFunc") - } - - if config.Datacenter == "" { - return nil, fmt.Errorf("CertMonitor creation requires specifying the datacenter") - } - - if config.NodeName == "" { - return nil, fmt.Errorf("CertMonitor creation requires specifying the agent's node name") - } - - if config.Tokens == nil { - return nil, fmt.Errorf("CertMonitor creation requires specifying a token store") - } - - return &CertMonitor{ - logger: logger, - cache: config.Cache, - tokens: config.Tokens, - tlsConfigurator: config.TLSConfigurator, - persist: config.Persist, - fallback: config.Fallback, - fallbackLeeway: config.FallbackLeeway, - fallbackRetry: config.FallbackRetry, - rootsReq: structs.DCSpecificRequest{Datacenter: config.Datacenter}, - leafReq: cachetype.ConnectCALeafRequest{ - Datacenter: config.Datacenter, - Agent: config.NodeName, - DNSSAN: config.DNSSANs, - IPSAN: config.IPSANs, - }, - }, nil -} - -// Update is responsible for priming the cache with the certificates -// as well as injecting them into the TLS configurator -func (m *CertMonitor) Update(certs *structs.SignedResponse) error { - if certs == nil { - return nil - } - - m.certs = *certs - - if err := m.populateCache(certs); err != nil { - return fmt.Errorf("error populating cache with certificates: %w", err) - } - - connectCAPems := []string{} - for _, ca := range certs.ConnectCARoots.Roots { - connectCAPems = append(connectCAPems, ca.RootCert) - } - - // Note that its expected that the private key be within the IssuedCert in the - // SignedResponse. This isn't how a server would send back the response and requires - // that the recipient of the response who also has access to the private key will - // have filled it in. The Cache definitely does this but auto-encrypt/auto-config - // will need to ensure the original response is setup this way too. - err := m.tlsConfigurator.UpdateAutoTLS( - certs.ManualCARoots, - connectCAPems, - certs.IssuedCert.CertPEM, - certs.IssuedCert.PrivateKeyPEM, - certs.VerifyServerHostname) - - if err != nil { - return fmt.Errorf("error updating TLS configurator with certificates: %w", err) - } - - return nil -} - -// populateCache is responsible for inserting the certificates into the cache -func (m *CertMonitor) populateCache(resp *structs.SignedResponse) error { - cert, err := connect.ParseCert(resp.IssuedCert.CertPEM) - if err != nil { - return fmt.Errorf("Failed to parse certificate: %w", err) - } - - // prepolutate roots cache - rootRes := cache.FetchResult{Value: &resp.ConnectCARoots, Index: resp.ConnectCARoots.QueryMeta.Index} - // getting the roots doesn't require a token so in order to potentially share the cache with another - if err := m.cache.Prepopulate(cachetype.ConnectCARootName, rootRes, m.rootsReq.Datacenter, "", m.rootsReq.CacheInfo().Key); err != nil { - return err - } - - // copy the template and update the token - leafReq := m.leafReq - leafReq.Token = m.tokens.AgentToken() - - // prepolutate leaf cache - certRes := cache.FetchResult{ - Value: &resp.IssuedCert, - Index: resp.ConnectCARoots.QueryMeta.Index, - State: cachetype.ConnectCALeafSuccess(connect.EncodeSigningKeyID(cert.AuthorityKeyId)), - } - if err := m.cache.Prepopulate(cachetype.ConnectCALeafName, certRes, leafReq.Datacenter, leafReq.Token, leafReq.Key()); err != nil { - return err - } - return nil -} - -// Start spawns the go routine to monitor the certificate and ensure it is -// rotated/renewed as necessary. The chan will indicate once the started -// go routine has exited -func (m *CertMonitor) Start(ctx context.Context) (<-chan struct{}, error) { - m.l.Lock() - defer m.l.Unlock() - - if m.running || m.cancel != nil { - return nil, fmt.Errorf("the CertMonitor is already running") - } - - // create the top level context to control the go - // routine executing the `run` method - ctx, cancel := context.WithCancel(ctx) - - // create the channel to get cache update events through - // really we should only ever get 10 updates - m.cacheUpdates = make(chan cache.UpdateEvent, 10) - - // setup the cache watches - cancelWatches, err := m.setupCacheWatches(ctx) - if err != nil { - cancel() - return nil, fmt.Errorf("error setting up cache watches: %w", err) - } - - // start the token update notifier - m.tokenUpdates = m.tokens.Notify(token.TokenKindAgent) - - // store the cancel funcs - m.cancel = cancel - m.cancelWatches = cancelWatches - - m.running = true - exit := make(chan struct{}) - go m.run(ctx, exit) - - m.logger.Info("certificate monitor started") - return exit, nil -} - -// Stop manually stops the go routine spawned by Start and -// returns whether the go routine was still running before -// cancelling. -// -// Note that cancelling the context passed into Start will -// also cause the go routine to stop -func (m *CertMonitor) Stop() bool { - m.l.Lock() - defer m.l.Unlock() - - if !m.running { - return false - } - - if m.cancel != nil { - m.cancel() - } - - return true -} - -// IsRunning returns whether the go routine to perform certificate monitoring -// is already running. -func (m *CertMonitor) IsRunning() bool { - m.l.Lock() - defer m.l.Unlock() - return m.running -} - -// setupCacheWatches will start both the roots and leaf cert watch with a new child -// context and an up to date ACL token. The watches are started with a new child context -// whose CancelFunc is also returned. -func (m *CertMonitor) setupCacheWatches(ctx context.Context) (context.CancelFunc, error) { - notificationCtx, cancel := context.WithCancel(ctx) - - // copy the request - rootsReq := m.rootsReq - - err := m.cache.Notify(notificationCtx, cachetype.ConnectCARootName, &rootsReq, rootsWatchID, m.cacheUpdates) - if err != nil { - cancel() - return nil, err - } - - // copy the request - leafReq := m.leafReq - leafReq.Token = m.tokens.AgentToken() - - err = m.cache.Notify(notificationCtx, cachetype.ConnectCALeafName, &leafReq, leafWatchID, m.cacheUpdates) - if err != nil { - cancel() - return nil, err - } - - return cancel, nil -} - -// handleCacheEvent is used to handle event notifications from the cache for the roots -// or leaf cert watches. -func (m *CertMonitor) handleCacheEvent(u cache.UpdateEvent) error { - switch u.CorrelationID { - case rootsWatchID: - m.logger.Debug("roots watch fired - updating CA certificates") - if u.Err != nil { - return fmt.Errorf("root watch returned an error: %w", u.Err) - } - - roots, ok := u.Result.(*structs.IndexedCARoots) - if !ok { - return fmt.Errorf("invalid type for roots watch response: %T", u.Result) - } - - m.certs.ConnectCARoots = *roots - - var pems []string - for _, root := range roots.Roots { - pems = append(pems, root.RootCert) - } - - if err := m.tlsConfigurator.UpdateAutoTLSCA(pems); err != nil { - return fmt.Errorf("failed to update Connect CA certificates: %w", err) - } - - if m.persist != nil { - copy := m.certs - if err := m.persist(©); err != nil { - return fmt.Errorf("failed to persist certificate package: %w", err) - } - } - case leafWatchID: - m.logger.Debug("leaf certificate watch fired - updating TLS certificate") - if u.Err != nil { - return fmt.Errorf("leaf watch returned an error: %w", u.Err) - } - - leaf, ok := u.Result.(*structs.IssuedCert) - if !ok { - return fmt.Errorf("invalid type for agent leaf cert watch response: %T", u.Result) - } - - m.certs.IssuedCert = *leaf - - if err := m.tlsConfigurator.UpdateAutoTLSCert(leaf.CertPEM, leaf.PrivateKeyPEM); err != nil { - return fmt.Errorf("failed to update the agent leaf cert: %w", err) - } - - if m.persist != nil { - copy := m.certs - if err := m.persist(©); err != nil { - return fmt.Errorf("failed to persist certificate package: %w", err) - } - } - } - - return nil -} - -// handleTokenUpdate is used when a notification about the agent token being updated -// is received and various watches need cancelling/restarting to use the new token. -func (m *CertMonitor) handleTokenUpdate(ctx context.Context) error { - m.logger.Debug("Agent token updated - resetting watches") - - // TODO (autoencrypt) Prepopulate the cache with the new token with - // the existing cache entry with the old token. The certificate doesn't - // need to change just because the token has. However there isn't a - // good way to make that happen and this behavior is benign enough - // that I am going to push off implementing it. - - // the agent token has been updated so we must update our leaf cert watch. - // this cancels the current watches before setting up new ones - m.cancelWatches() - - // recreate the chan for cache updates. This is a precautionary measure to ensure - // that we don't accidentally get notified for the new watches being setup before - // a blocking query in the cache returns and sends data to the old chan. In theory - // the code in agent/cache/watch.go should prevent this where we specifically check - // for context cancellation prior to sending the event. However we could cancel - // it after that check and finish setting up the new watches before getting the old - // events. Both the go routine scheduler and the OS thread scheduler would have to - // be acting up for this to happen. Regardless the way to ensure we don't get events - // for the old watches is to simply replace the chan we are expecting them from. - close(m.cacheUpdates) - m.cacheUpdates = make(chan cache.UpdateEvent, 10) - - // restart watches - this will be done with the correct token - cancelWatches, err := m.setupCacheWatches(ctx) - if err != nil { - return fmt.Errorf("failed to restart watches after agent token update: %w", err) - } - m.cancelWatches = cancelWatches - return nil -} - -// handleFallback is used when the current TLS certificate has expired and the normal -// updating mechanisms have failed to renew it quickly enough. This function will -// use the configured fallback mechanism to retrieve a new cert and start monitoring -// that one. -func (m *CertMonitor) handleFallback(ctx context.Context) error { - m.logger.Warn("agent's client certificate has expired") - // Background because the context is mainly useful when the agent is first starting up. - reply, err := m.fallback(ctx) - if err != nil { - return fmt.Errorf("error when getting new agent certificate: %w", err) - } - - if m.persist != nil { - if err := m.persist(reply); err != nil { - return fmt.Errorf("failed to persist certificate package: %w", err) - } - } - return m.Update(reply) -} - -// run is the private method to be spawn by the Start method for -// executing the main monitoring loop. -func (m *CertMonitor) run(ctx context.Context, exit chan struct{}) { - // The fallbackTimer is used to notify AFTER the agents - // leaf certificate has expired and where we need - // to fall back to the less secure RPC endpoint just like - // if the agent was starting up new. - // - // Check 10sec (fallback leeway duration) after cert - // expires. The agent cache should be handling the expiration - // and renew it before then. - // - // If there is no cert, AutoEncryptCertNotAfter returns - // a value in the past which immediately triggers the - // renew, but this case shouldn't happen because at - // this point, auto_encrypt was just being setup - // successfully. - calcFallbackInterval := func() time.Duration { - certExpiry := m.tlsConfigurator.AutoEncryptCertNotAfter() - return certExpiry.Add(m.fallbackLeeway).Sub(time.Now()) - } - fallbackTimer := time.NewTimer(calcFallbackInterval()) - - // cleanup for once we are stopped - defer func() { - // cancel the go routines performing the cache watches - m.cancelWatches() - // ensure we don't leak the timers go routine - fallbackTimer.Stop() - // stop receiving notifications for token updates - m.tokens.StopNotify(m.tokenUpdates) - - m.logger.Debug("certificate monitor has been stopped") - - m.l.Lock() - m.cancel = nil - m.running = false - m.l.Unlock() - - // this should be the final cleanup task as its what notifies - // the rest of the world that this go routine has exited. - close(exit) - }() - - for { - select { - case <-ctx.Done(): - m.logger.Debug("stopping the certificate monitor") - return - case <-m.tokenUpdates.Ch: - m.logger.Debug("handling a token update event") - - if err := m.handleTokenUpdate(ctx); err != nil { - m.logger.Error("error in handling token update event", "error", err) - } - case u := <-m.cacheUpdates: - m.logger.Debug("handling a cache update event", "correlation_id", u.CorrelationID) - - if err := m.handleCacheEvent(u); err != nil { - m.logger.Error("error in handling cache update event", "error", err) - } - - // reset the fallback timer as the certificate may have been updated - fallbackTimer.Stop() - fallbackTimer = time.NewTimer(calcFallbackInterval()) - case <-fallbackTimer.C: - // This is a safety net in case the auto_encrypt cert doesn't get renewed - // in time. The agent would be stuck in that case because the watches - // never use the AutoEncrypt.Sign endpoint. - - // check auto encrypt client cert expiration - if m.tlsConfigurator.AutoEncryptCertExpired() { - if err := m.handleFallback(ctx); err != nil { - m.logger.Error("error when handling a certificate expiry event", "error", err) - fallbackTimer = time.NewTimer(m.fallbackRetry) - } else { - fallbackTimer = time.NewTimer(calcFallbackInterval()) - } - } else { - // this shouldn't be possible. We calculate the timer duration to be the certificate - // expiration time + some leeway (10s default). So whenever we get here the certificate - // should be expired. Regardless its probably worth resetting the timer. - fallbackTimer = time.NewTimer(calcFallbackInterval()) - } - } - } -} diff --git a/agent/cert-monitor/cert_monitor_test.go b/agent/cert-monitor/cert_monitor_test.go deleted file mode 100644 index 2b6ea76d8..000000000 --- a/agent/cert-monitor/cert_monitor_test.go +++ /dev/null @@ -1,731 +0,0 @@ -package certmon - -import ( - "context" - "crypto/tls" - "fmt" - "net" - "sync" - "testing" - "time" - - "github.com/hashicorp/consul/agent/cache" - cachetype "github.com/hashicorp/consul/agent/cache-types" - "github.com/hashicorp/consul/agent/connect" - "github.com/hashicorp/consul/agent/structs" - "github.com/hashicorp/consul/agent/token" - "github.com/hashicorp/consul/sdk/testutil" - "github.com/hashicorp/consul/sdk/testutil/retry" - "github.com/hashicorp/consul/tlsutil" - "github.com/hashicorp/go-uuid" - - "github.com/stretchr/testify/mock" - "github.com/stretchr/testify/require" -) - -type mockFallback struct { - mock.Mock -} - -func (m *mockFallback) fallback(ctx context.Context) (*structs.SignedResponse, error) { - ret := m.Called() - resp, _ := ret.Get(0).(*structs.SignedResponse) - return resp, ret.Error(1) -} - -type mockPersist struct { - mock.Mock -} - -func (m *mockPersist) persist(resp *structs.SignedResponse) error { - return m.Called(resp).Error(0) -} - -type mockWatcher struct { - ch chan<- cache.UpdateEvent - done <-chan struct{} -} - -type mockCache struct { - mock.Mock - - lock sync.Mutex - watchers map[string][]mockWatcher -} - -func (m *mockCache) Notify(ctx context.Context, t string, r cache.Request, correlationID string, ch chan<- cache.UpdateEvent) error { - m.lock.Lock() - key := r.CacheInfo().Key - m.watchers[key] = append(m.watchers[key], mockWatcher{ch: ch, done: ctx.Done()}) - m.lock.Unlock() - ret := m.Called(t, r, correlationID) - return ret.Error(0) -} - -func (m *mockCache) Prepopulate(t string, result cache.FetchResult, dc string, token string, key string) error { - ret := m.Called(t, result, dc, token, key) - return ret.Error(0) -} - -func (m *mockCache) sendNotification(ctx context.Context, key string, u cache.UpdateEvent) bool { - m.lock.Lock() - defer m.lock.Unlock() - - watchers, ok := m.watchers[key] - if !ok || len(m.watchers) < 1 { - return false - } - - var newWatchers []mockWatcher - - for _, watcher := range watchers { - select { - case watcher.ch <- u: - newWatchers = append(newWatchers, watcher) - case <-watcher.done: - // do nothing, this watcher will be removed from the list - case <-ctx.Done(): - // return doesn't matter here really, the test is being cancelled - return true - } - } - - // this removes any already cancelled watches from being sent to - m.watchers[key] = newWatchers - - return true -} - -func newMockCache(t *testing.T) *mockCache { - mcache := mockCache{watchers: make(map[string][]mockWatcher)} - mcache.Test(t) - return &mcache -} - -func waitForChan(timer *time.Timer, ch <-chan struct{}) bool { - select { - case <-timer.C: - return false - case <-ch: - return true - } -} - -func waitForChans(timeout time.Duration, chans ...<-chan struct{}) bool { - timer := time.NewTimer(timeout) - defer timer.Stop() - - for _, ch := range chans { - if !waitForChan(timer, ch) { - return false - } - } - return true -} - -func testTLSConfigurator(t *testing.T) *tlsutil.Configurator { - t.Helper() - logger := testutil.Logger(t) - cfg, err := tlsutil.NewConfigurator(tlsutil.Config{AutoTLS: true}, logger) - require.NoError(t, err) - return cfg -} - -func newLeaf(t *testing.T, ca *structs.CARoot, idx uint64, expiration time.Duration) *structs.IssuedCert { - t.Helper() - - pub, priv, err := connect.TestAgentLeaf(t, "node", "foo", ca, expiration) - require.NoError(t, err) - cert, err := connect.ParseCert(pub) - require.NoError(t, err) - - spiffeID, err := connect.ParseCertURI(cert.URIs[0]) - require.NoError(t, err) - - agentID, ok := spiffeID.(*connect.SpiffeIDAgent) - require.True(t, ok, "certificate doesn't have an agent leaf cert URI") - - return &structs.IssuedCert{ - SerialNumber: cert.SerialNumber.String(), - CertPEM: pub, - PrivateKeyPEM: priv, - ValidAfter: cert.NotBefore, - ValidBefore: cert.NotAfter, - Agent: agentID.Agent, - AgentURI: agentID.URI().String(), - EnterpriseMeta: *structs.DefaultEnterpriseMeta(), - RaftIndex: structs.RaftIndex{ - CreateIndex: idx, - ModifyIndex: idx, - }, - } -} - -type testCertMonitor struct { - monitor *CertMonitor - mcache *mockCache - tls *tlsutil.Configurator - tokens *token.Store - fallback *mockFallback - persist *mockPersist - - extraCACerts []string - initialCert *structs.IssuedCert - initialRoots *structs.IndexedCARoots - - // these are some variables that the CertMonitor was created with - datacenter string - nodeName string - dns []string - ips []net.IP - verifyServerHostname bool -} - -func newTestCertMonitor(t *testing.T) testCertMonitor { - t.Helper() - - tlsConfigurator := testTLSConfigurator(t) - tokens := new(token.Store) - - id, err := uuid.GenerateUUID() - require.NoError(t, err) - tokens.UpdateAgentToken(id, token.TokenSourceConfig) - - ca := connect.TestCA(t, nil) - manualCA := connect.TestCA(t, nil) - // this cert is setup to not expire quickly. this will prevent - // the test from accidentally running the fallback routine - // before we want to force that to happen. - issued := newLeaf(t, ca, 1, 10*time.Minute) - - indexedRoots := structs.IndexedCARoots{ - ActiveRootID: ca.ID, - TrustDomain: connect.TestClusterID, - Roots: []*structs.CARoot{ - ca, - }, - QueryMeta: structs.QueryMeta{ - Index: 1, - }, - } - - initialCerts := &structs.SignedResponse{ - ConnectCARoots: indexedRoots, - IssuedCert: *issued, - ManualCARoots: []string{manualCA.RootCert}, - VerifyServerHostname: true, - } - - dnsSANs := []string{"test.dev"} - ipSANs := []net.IP{net.IPv4(198, 18, 0, 1)} - - fallback := &mockFallback{} - fallback.Test(t) - persist := &mockPersist{} - persist.Test(t) - - mcache := newMockCache(t) - rootRes := cache.FetchResult{Value: &indexedRoots, Index: 1} - rootsReq := structs.DCSpecificRequest{Datacenter: "foo"} - mcache.On("Prepopulate", cachetype.ConnectCARootName, rootRes, "foo", "", rootsReq.CacheInfo().Key).Return(nil).Once() - - leafReq := cachetype.ConnectCALeafRequest{ - Token: tokens.AgentToken(), - Agent: "node", - Datacenter: "foo", - DNSSAN: dnsSANs, - IPSAN: ipSANs, - } - leafRes := cache.FetchResult{ - Value: issued, - Index: 1, - State: cachetype.ConnectCALeafSuccess(ca.SigningKeyID), - } - mcache.On("Prepopulate", cachetype.ConnectCALeafName, leafRes, "foo", tokens.AgentToken(), leafReq.Key()).Return(nil).Once() - - // we can assert more later but this should always be done. - defer mcache.AssertExpectations(t) - - cfg := new(Config). - WithCache(mcache). - WithLogger(testutil.Logger(t)). - WithTLSConfigurator(tlsConfigurator). - WithTokens(tokens). - WithFallback(fallback.fallback). - WithDNSSANs(dnsSANs). - WithIPSANs(ipSANs). - WithDatacenter("foo"). - WithNodeName("node"). - WithFallbackLeeway(time.Nanosecond). - WithFallbackRetry(time.Millisecond). - WithPersistence(persist.persist) - - monitor, err := New(cfg) - require.NoError(t, err) - require.NotNil(t, monitor) - - require.NoError(t, monitor.Update(initialCerts)) - - return testCertMonitor{ - monitor: monitor, - tls: tlsConfigurator, - tokens: tokens, - mcache: mcache, - persist: persist, - fallback: fallback, - extraCACerts: []string{manualCA.RootCert}, - initialCert: issued, - initialRoots: &indexedRoots, - datacenter: "foo", - nodeName: "node", - dns: dnsSANs, - ips: ipSANs, - verifyServerHostname: true, - } -} - -func tlsCertificateFromIssued(t *testing.T, issued *structs.IssuedCert) *tls.Certificate { - t.Helper() - - cert, err := tls.X509KeyPair([]byte(issued.CertPEM), []byte(issued.PrivateKeyPEM)) - require.NoError(t, err) - return &cert -} - -// convenience method to get a TLS Certificate from the intial issued certificate and priv key -func (cm *testCertMonitor) initialTLSCertificate(t *testing.T) *tls.Certificate { - t.Helper() - return tlsCertificateFromIssued(t, cm.initialCert) -} - -// just a convenience method to get a list of all the CA pems that we set up regardless -// of manual vs connect. -func (cm *testCertMonitor) initialCACerts() []string { - pems := cm.extraCACerts - for _, root := range cm.initialRoots.Roots { - pems = append(pems, root.RootCert) - } - return pems -} - -func (cm *testCertMonitor) assertExpectations(t *testing.T) { - cm.mcache.AssertExpectations(t) - cm.fallback.AssertExpectations(t) - cm.persist.AssertExpectations(t) -} - -func TestCertMonitor_InitialCerts(t *testing.T) { - // this also ensures that the cache was prepopulated properly - cm := newTestCertMonitor(t) - - // verify that the certificate was injected into the TLS configurator correctly - require.Equal(t, cm.initialTLSCertificate(t), cm.tls.Cert()) - // verify that the CA certs (both Connect and manual ones) were injected correctly - require.ElementsMatch(t, cm.initialCACerts(), cm.tls.CAPems()) - // verify that the auto-tls verify server hostname setting was injected correctly - require.Equal(t, cm.verifyServerHostname, cm.tls.VerifyServerHostname()) -} - -func TestCertMonitor_GoRoutineManagement(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - cm := newTestCertMonitor(t) - - // ensure that the monitor is not running - require.False(t, cm.monitor.IsRunning()) - - // ensure that nothing bad happens and that it reports as stopped - require.False(t, cm.monitor.Stop()) - - // we will never send notifications so these just ignore everything - cm.mcache.On("Notify", cachetype.ConnectCARootName, &structs.DCSpecificRequest{Datacenter: cm.datacenter}, rootsWatchID).Return(nil).Times(2) - cm.mcache.On("Notify", cachetype.ConnectCALeafName, - &cachetype.ConnectCALeafRequest{ - Token: cm.tokens.AgentToken(), - Datacenter: cm.datacenter, - Agent: cm.nodeName, - DNSSAN: cm.dns, - IPSAN: cm.ips, - }, - leafWatchID, - ).Return(nil).Times(2) - - done, err := cm.monitor.Start(ctx) - require.NoError(t, err) - require.True(t, cm.monitor.IsRunning()) - _, err = cm.monitor.Start(ctx) - testutil.RequireErrorContains(t, err, "the CertMonitor is already running") - require.True(t, cm.monitor.Stop()) - - require.True(t, waitForChans(100*time.Millisecond, done), "monitor didn't shut down") - require.False(t, cm.monitor.IsRunning()) - done, err = cm.monitor.Start(ctx) - require.NoError(t, err) - - // ensure that context cancellation causes us to stop as well - cancel() - require.True(t, waitForChans(100*time.Millisecond, done)) - - cm.assertExpectations(t) -} - -func startedCertMonitor(t *testing.T) (context.Context, testCertMonitor) { - ctx, cancel := context.WithCancel(context.Background()) - t.Cleanup(cancel) - - cm := newTestCertMonitor(t) - - rootsCtx, rootsCancel := context.WithCancel(ctx) - defer rootsCancel() - leafCtx, leafCancel := context.WithCancel(ctx) - defer leafCancel() - - // initial roots watch - cm.mcache.On("Notify", cachetype.ConnectCARootName, - &structs.DCSpecificRequest{ - Datacenter: cm.datacenter, - }, - rootsWatchID). - Return(nil). - Once(). - Run(func(_ mock.Arguments) { - rootsCancel() - }) - // the initial watch after starting the monitor - cm.mcache.On("Notify", cachetype.ConnectCALeafName, - &cachetype.ConnectCALeafRequest{ - Token: cm.tokens.AgentToken(), - Datacenter: cm.datacenter, - Agent: cm.nodeName, - DNSSAN: cm.dns, - IPSAN: cm.ips, - }, - leafWatchID). - Return(nil). - Once(). - Run(func(_ mock.Arguments) { - leafCancel() - }) - - done, err := cm.monitor.Start(ctx) - require.NoError(t, err) - // this prevents logs after the test finishes - t.Cleanup(func() { - cm.monitor.Stop() - <-done - }) - - require.True(t, - waitForChans(100*time.Millisecond, rootsCtx.Done(), leafCtx.Done()), - "not all watches were started within the alotted time") - - return ctx, cm -} - -// This test ensures that the cache watches are restarted with the updated -// token after receiving a token update -func TestCertMonitor_TokenUpdate(t *testing.T) { - ctx, cm := startedCertMonitor(t) - - rootsCtx, rootsCancel := context.WithCancel(ctx) - defer rootsCancel() - leafCtx, leafCancel := context.WithCancel(ctx) - defer leafCancel() - - newToken := "8e4fe8db-162d-42d8-81ca-710fb2280ad0" - - // we expect a new roots watch because when the leaf cert watch is restarted so is the root cert watch - cm.mcache.On("Notify", cachetype.ConnectCARootName, - &structs.DCSpecificRequest{ - Datacenter: cm.datacenter, - }, - rootsWatchID). - Return(nil). - Once(). - Run(func(_ mock.Arguments) { - rootsCancel() - }) - - secondWatch := &cachetype.ConnectCALeafRequest{ - Token: newToken, - Datacenter: cm.datacenter, - Agent: cm.nodeName, - DNSSAN: cm.dns, - IPSAN: cm.ips, - } - // the new watch after updating the token - cm.mcache.On("Notify", cachetype.ConnectCALeafName, secondWatch, leafWatchID). - Return(nil). - Once(). - Run(func(args mock.Arguments) { - leafCancel() - }) - - cm.tokens.UpdateAgentToken(newToken, token.TokenSourceAPI) - - require.True(t, - waitForChans(100*time.Millisecond, rootsCtx.Done(), leafCtx.Done()), - "not all watches were restarted within the alotted time") - - cm.assertExpectations(t) -} - -func TestCertMonitor_RootsUpdate(t *testing.T) { - ctx, cm := startedCertMonitor(t) - - secondCA := connect.TestCA(t, cm.initialRoots.Roots[0]) - secondRoots := structs.IndexedCARoots{ - ActiveRootID: secondCA.ID, - TrustDomain: connect.TestClusterID, - Roots: []*structs.CARoot{ - secondCA, - cm.initialRoots.Roots[0], - }, - QueryMeta: structs.QueryMeta{ - Index: 99, - }, - } - - cm.persist.On("persist", &structs.SignedResponse{ - IssuedCert: *cm.initialCert, - ManualCARoots: cm.extraCACerts, - ConnectCARoots: secondRoots, - VerifyServerHostname: cm.verifyServerHostname, - }).Return(nil).Once() - - // assert value of the CA certs prior to updating - require.ElementsMatch(t, cm.initialCACerts(), cm.tls.CAPems()) - - req := structs.DCSpecificRequest{Datacenter: cm.datacenter} - require.True(t, cm.mcache.sendNotification(ctx, req.CacheInfo().Key, cache.UpdateEvent{ - CorrelationID: rootsWatchID, - Result: &secondRoots, - Meta: cache.ResultMeta{ - Index: secondRoots.Index, - }, - })) - - expectedCAs := append(cm.extraCACerts, secondCA.RootCert, cm.initialRoots.Roots[0].RootCert) - - // this will wait up to 200ms (8 x 25 ms waits between the 9 requests) - retry.RunWith(&retry.Counter{Count: 9, Wait: 25 * time.Millisecond}, t, func(r *retry.R) { - require.ElementsMatch(r, expectedCAs, cm.tls.CAPems()) - }) - - cm.assertExpectations(t) -} - -func TestCertMonitor_CertUpdate(t *testing.T) { - ctx, cm := startedCertMonitor(t) - - secondCert := newLeaf(t, cm.initialRoots.Roots[0], 100, 10*time.Minute) - - cm.persist.On("persist", &structs.SignedResponse{ - IssuedCert: *secondCert, - ManualCARoots: cm.extraCACerts, - ConnectCARoots: *cm.initialRoots, - VerifyServerHostname: cm.verifyServerHostname, - }).Return(nil).Once() - - // assert value of cert prior to updating the leaf - require.Equal(t, cm.initialTLSCertificate(t), cm.tls.Cert()) - - key := cm.monitor.leafReq.CacheInfo().Key - - // send the new certificate - this notifies only the watchers utilizing - // the new ACL token - require.True(t, cm.mcache.sendNotification(ctx, key, cache.UpdateEvent{ - CorrelationID: leafWatchID, - Result: secondCert, - Meta: cache.ResultMeta{ - Index: secondCert.ModifyIndex, - }, - })) - - tlsCert := tlsCertificateFromIssued(t, secondCert) - - // this will wait up to 200ms (8 x 25 ms waits between the 9 requests) - retry.RunWith(&retry.Counter{Count: 9, Wait: 25 * time.Millisecond}, t, func(r *retry.R) { - require.Equal(r, tlsCert, cm.tls.Cert()) - }) - - cm.assertExpectations(t) -} - -func TestCertMonitor_Fallback(t *testing.T) { - ctx, cm := startedCertMonitor(t) - - // at this point everything is operating normally and the monitor is just - // waiting for events. We are going to send a new cert that is basically - // already expired and then allow the fallback routine to kick in. - secondCert := newLeaf(t, cm.initialRoots.Roots[0], 100, time.Nanosecond) - secondCA := connect.TestCA(t, cm.initialRoots.Roots[0]) - secondRoots := structs.IndexedCARoots{ - ActiveRootID: secondCA.ID, - TrustDomain: connect.TestClusterID, - Roots: []*structs.CARoot{ - secondCA, - cm.initialRoots.Roots[0], - }, - QueryMeta: structs.QueryMeta{ - Index: 101, - }, - } - thirdCert := newLeaf(t, secondCA, 102, 10*time.Minute) - - // inject a fallback routine error to check that we rerun it quickly - cm.fallback.On("fallback").Return(nil, fmt.Errorf("induced error")).Once() - - fallbackResp := &structs.SignedResponse{ - ConnectCARoots: secondRoots, - IssuedCert: *thirdCert, - ManualCARoots: cm.extraCACerts, - VerifyServerHostname: true, - } - // expect the fallback routine to be executed and setup the return - cm.fallback.On("fallback").Return(fallbackResp, nil).Once() - - cm.persist.On("persist", &structs.SignedResponse{ - IssuedCert: *secondCert, - ConnectCARoots: *cm.initialRoots, - ManualCARoots: cm.extraCACerts, - VerifyServerHostname: cm.verifyServerHostname, - }).Return(nil).Once() - - cm.persist.On("persist", fallbackResp).Return(nil).Once() - - // Add another roots cache prepopulation expectation which should happen - // in response to executing the fallback mechanism - rootRes := cache.FetchResult{Value: &secondRoots, Index: 101} - rootsReq := structs.DCSpecificRequest{Datacenter: cm.datacenter} - cm.mcache.On("Prepopulate", cachetype.ConnectCARootName, rootRes, cm.datacenter, "", rootsReq.CacheInfo().Key).Return(nil).Once() - - // add another leaf cert cache prepopulation expectation which should happen - // in response to executing the fallback mechanism - leafReq := cachetype.ConnectCALeafRequest{ - Token: cm.tokens.AgentToken(), - Agent: cm.nodeName, - Datacenter: cm.datacenter, - DNSSAN: cm.dns, - IPSAN: cm.ips, - } - leafRes := cache.FetchResult{ - Value: thirdCert, - Index: 101, - State: cachetype.ConnectCALeafSuccess(secondCA.SigningKeyID), - } - cm.mcache.On("Prepopulate", cachetype.ConnectCALeafName, leafRes, leafReq.Datacenter, leafReq.Token, leafReq.Key()).Return(nil).Once() - - // nothing in the monitor should be looking at this as its only done - // in response to sending token updates, no need to synchronize - key := cm.monitor.leafReq.CacheInfo().Key - // send the new certificate - this notifies only the watchers utilizing - // the new ACL token - require.True(t, cm.mcache.sendNotification(ctx, key, cache.UpdateEvent{ - CorrelationID: leafWatchID, - Result: secondCert, - Meta: cache.ResultMeta{ - Index: secondCert.ModifyIndex, - }, - })) - - // if all went well we would have updated the first certificate which was pretty much expired - // causing the fallback handler to be invoked almost immediately. The fallback routine will - // return the response containing the third cert and second CA roots so now we should wait - // a little while and ensure they were applied to the TLS Configurator - tlsCert := tlsCertificateFromIssued(t, thirdCert) - expectedCAs := append(cm.extraCACerts, secondCA.RootCert, cm.initialRoots.Roots[0].RootCert) - - // this will wait up to 200ms (8 x 25 ms waits between the 9 requests) - retry.RunWith(&retry.Counter{Count: 9, Wait: 25 * time.Millisecond}, t, func(r *retry.R) { - require.Equal(r, tlsCert, cm.tls.Cert()) - require.ElementsMatch(r, expectedCAs, cm.tls.CAPems()) - }) - - cm.assertExpectations(t) -} - -func TestCertMonitor_New_Errors(t *testing.T) { - type testCase struct { - cfg Config - err string - } - - fallback := func(_ context.Context) (*structs.SignedResponse, error) { - return nil, fmt.Errorf("Unimplemented") - } - - tokens := new(token.Store) - - cases := map[string]testCase{ - "no-cache": { - cfg: Config{ - TLSConfigurator: testTLSConfigurator(t), - Fallback: fallback, - Tokens: tokens, - Datacenter: "foo", - NodeName: "bar", - }, - err: "CertMonitor creation requires a Cache", - }, - "no-tls-configurator": { - cfg: Config{ - Cache: cache.New(cache.Options{}), - Fallback: fallback, - Tokens: tokens, - Datacenter: "foo", - NodeName: "bar", - }, - err: "CertMonitor creation requires a TLS Configurator", - }, - "no-fallback": { - cfg: Config{ - Cache: cache.New(cache.Options{}), - TLSConfigurator: testTLSConfigurator(t), - Tokens: tokens, - Datacenter: "foo", - NodeName: "bar", - }, - err: "CertMonitor creation requires specifying a FallbackFunc", - }, - "no-tokens": { - cfg: Config{ - Cache: cache.New(cache.Options{}), - TLSConfigurator: testTLSConfigurator(t), - Fallback: fallback, - Datacenter: "foo", - NodeName: "bar", - }, - err: "CertMonitor creation requires specifying a token store", - }, - "no-datacenter": { - cfg: Config{ - Cache: cache.New(cache.Options{}), - TLSConfigurator: testTLSConfigurator(t), - Fallback: fallback, - Tokens: tokens, - NodeName: "bar", - }, - err: "CertMonitor creation requires specifying the datacenter", - }, - "no-node-name": { - cfg: Config{ - Cache: cache.New(cache.Options{}), - TLSConfigurator: testTLSConfigurator(t), - Fallback: fallback, - Tokens: tokens, - Datacenter: "foo", - }, - err: "CertMonitor creation requires specifying the agent's node name", - }, - } - - for name, tcase := range cases { - t.Run(name, func(t *testing.T) { - monitor, err := New(&tcase.cfg) - testutil.RequireErrorContains(t, err, tcase.err) - require.Nil(t, monitor) - }) - } -} diff --git a/agent/cert-monitor/config.go b/agent/cert-monitor/config.go deleted file mode 100644 index 2e4bcc57c..000000000 --- a/agent/cert-monitor/config.go +++ /dev/null @@ -1,150 +0,0 @@ -package certmon - -import ( - "context" - "net" - "time" - - "github.com/hashicorp/consul/agent/structs" - "github.com/hashicorp/consul/agent/token" - "github.com/hashicorp/consul/tlsutil" - "github.com/hashicorp/go-hclog" -) - -// FallbackFunc is used when the normal cache watch based Certificate -// updating fails to update the Certificate in time and a different -// method of updating the certificate is required. -type FallbackFunc func(context.Context) (*structs.SignedResponse, error) - -// PersistFunc is used to persist the data from a signed response -type PersistFunc func(*structs.SignedResponse) error - -type Config struct { - // Logger is the logger to be used while running. If not set - // then no logging will be performed. - Logger hclog.Logger - - // TLSConfigurator is where the certificates and roots are set when - // they are updated. This field is required. - TLSConfigurator *tlsutil.Configurator - - // Cache is an object implementing our Cache interface. The Cache - // used at runtime must be able to handle Roots and Leaf Cert watches - Cache Cache - - // Tokens is the shared token store. It is used to retrieve the current - // agent token as well as getting notifications when that token is updated. - // This field is required. - Tokens *token.Store - - // Persist is a function to run when there are new certs or keys - Persist PersistFunc - - // Fallback is a function to run when the normal cache updating of the - // agent's certificates has failed to work for one reason or another. - // This field is required. - Fallback FallbackFunc - - // FallbackLeeway is the amount of time after certificate expiration before - // invoking the fallback routine. If not set this will default to 10s. - FallbackLeeway time.Duration - - // FallbackRetry is the duration between Fallback invocations when the configured - // fallback routine returns an error. If not set this will default to 1m. - FallbackRetry time.Duration - - // DNSSANs is a list of DNS SANs that certificate requests should include. This - // field is optional and no extra DNS SANs will be requested if unset. 'localhost' - // is unconditionally requested by the cache implementation. - DNSSANs []string - - // IPSANs is a list of IP SANs to include in the certificate signing request. This - // field is optional and no extra IP SANs will be requested if unset. Both '127.0.0.1' - // and '::1' IP SANs are unconditionally requested by the cache implementation. - IPSANs []net.IP - - // Datacenter is the datacenter to request certificates within. This filed is required - Datacenter string - - // NodeName is the agent's node name to use when requesting certificates. This field - // is required. - NodeName string -} - -// WithCache will cause the created CertMonitor type to use the provided Cache -func (cfg *Config) WithCache(cache Cache) *Config { - cfg.Cache = cache - return cfg -} - -// WithLogger will cause the created CertMonitor type to use the provided logger -func (cfg *Config) WithLogger(logger hclog.Logger) *Config { - cfg.Logger = logger - return cfg -} - -// WithTLSConfigurator will cause the created CertMonitor type to use the provided configurator -func (cfg *Config) WithTLSConfigurator(tlsConfigurator *tlsutil.Configurator) *Config { - cfg.TLSConfigurator = tlsConfigurator - return cfg -} - -// WithTokens will cause the created CertMonitor type to use the provided token store -func (cfg *Config) WithTokens(tokens *token.Store) *Config { - cfg.Tokens = tokens - return cfg -} - -// WithFallback configures a fallback function to use if the normal update mechanisms -// fail to renew the certificate in time. -func (cfg *Config) WithFallback(fallback FallbackFunc) *Config { - cfg.Fallback = fallback - return cfg -} - -// WithDNSSANs configures the CertMonitor to request these DNS SANs when requesting a new -// certificate -func (cfg *Config) WithDNSSANs(sans []string) *Config { - cfg.DNSSANs = sans - return cfg -} - -// WithIPSANs configures the CertMonitor to request these IP SANs when requesting a new -// certificate -func (cfg *Config) WithIPSANs(sans []net.IP) *Config { - cfg.IPSANs = sans - return cfg -} - -// WithDatacenter configures the CertMonitor to request Certificates in this DC -func (cfg *Config) WithDatacenter(dc string) *Config { - cfg.Datacenter = dc - return cfg -} - -// WithNodeName configures the CertMonitor to request Certificates with this agent name -func (cfg *Config) WithNodeName(name string) *Config { - cfg.NodeName = name - return cfg -} - -// WithFallbackLeeway configures how long after a certificate expires before attempting to -// generarte a new certificate using the fallback mechanism. The default is 10s. -func (cfg *Config) WithFallbackLeeway(leeway time.Duration) *Config { - cfg.FallbackLeeway = leeway - return cfg -} - -// WithFallbackRetry controls how quickly we will make subsequent invocations of -// the fallback func in the case of it erroring out. -func (cfg *Config) WithFallbackRetry(after time.Duration) *Config { - cfg.FallbackRetry = after - return cfg -} - -// WithPersistence will configure the CertMonitor to use this callback for persisting -// a new TLS configuration. -func (cfg *Config) WithPersistence(persist PersistFunc) *Config { - cfg.Persist = persist - return cfg -} diff --git a/agent/consul/auto_encrypt.go b/agent/consul/auto_encrypt.go deleted file mode 100644 index 0684e7f71..000000000 --- a/agent/consul/auto_encrypt.go +++ /dev/null @@ -1,239 +0,0 @@ -package consul - -import ( - "context" - "fmt" - "net" - "strings" - "time" - - "github.com/hashicorp/consul/agent/connect" - "github.com/hashicorp/consul/agent/structs" - "github.com/hashicorp/consul/lib" - "github.com/hashicorp/go-hclog" - "github.com/miekg/dns" -) - -const ( - dummyTrustDomain = "dummy.trustdomain" - retryJitterWindow = 30 * time.Second -) - -func (c *Client) autoEncryptCSR(extraDNSSANs []string, extraIPSANs []net.IP) (string, string, error) { - // We don't provide the correct host here, because we don't know any - // better at this point. Apart from the domain, we would need the - // ClusterID, which we don't have. This is why we go with - // dummyTrustDomain the first time. Subsequent CSRs will have the - // correct TrustDomain. - id := &connect.SpiffeIDAgent{ - Host: dummyTrustDomain, - Datacenter: c.config.Datacenter, - Agent: c.config.NodeName, - } - - conf, err := c.config.CAConfig.GetCommonConfig() - if err != nil { - return "", "", err - } - - if conf.PrivateKeyType == "" { - conf.PrivateKeyType = connect.DefaultPrivateKeyType - } - if conf.PrivateKeyBits == 0 { - conf.PrivateKeyBits = connect.DefaultPrivateKeyBits - } - - // Create a new private key - pk, pkPEM, err := connect.GeneratePrivateKeyWithConfig(conf.PrivateKeyType, conf.PrivateKeyBits) - if err != nil { - return "", "", err - } - - dnsNames := append([]string{"localhost"}, extraDNSSANs...) - ipAddresses := append([]net.IP{net.ParseIP("127.0.0.1"), net.ParseIP("::1")}, extraIPSANs...) - - // Create a CSR. - // - // The Common Name includes the dummy trust domain for now but Server will - // override this when it is signed anyway so it's OK. - cn := connect.AgentCN(c.config.NodeName, dummyTrustDomain) - csr, err := connect.CreateCSR(id, cn, pk, dnsNames, ipAddresses) - if err != nil { - return "", "", err - } - - return pkPEM, csr, nil -} - -func (c *Client) RequestAutoEncryptCerts(ctx context.Context, servers []string, port int, token string, extraDNSSANs []string, extraIPSANs []net.IP) (*structs.SignedResponse, error) { - errFn := func(err error) (*structs.SignedResponse, error) { - return nil, err - } - - // Check if we know about a server already through gossip. Depending on - // how the agent joined, there might already be one. Also in case this - // gets called because the cert expired. - server := c.router.FindLANServer() - if server != nil { - servers = []string{server.Addr.String()} - } - - if len(servers) == 0 { - return errFn(fmt.Errorf("No servers to request AutoEncrypt.Sign")) - } - - pkPEM, csr, err := c.autoEncryptCSR(extraDNSSANs, extraIPSANs) - if err != nil { - return errFn(err) - } - - // Prepare request and response so that it can be passed to - // RPCInsecure. - args := structs.CASignRequest{ - WriteRequest: structs.WriteRequest{Token: token}, - Datacenter: c.config.Datacenter, - CSR: csr, - } - var reply structs.SignedResponse - - // Retry implementation modeled after https://github.com/hashicorp/consul/pull/5228. - // TLDR; there is a 30s window from which a random time is picked. - // Repeat until the call is successful. - attempts := 0 - for { - select { - case <-ctx.Done(): - return errFn(fmt.Errorf("aborting AutoEncrypt because interrupted: %w", ctx.Err())) - default: - } - - // Translate host to net.TCPAddr to make life easier for - // RPCInsecure. - for _, s := range servers { - ips, err := resolveAddr(s, c.logger) - if err != nil { - c.logger.Warn("AutoEncrypt resolveAddr failed", "error", err) - continue - } - - for _, ip := range ips { - addr := net.TCPAddr{IP: ip, Port: port} - - if err = c.connPool.RPC(c.config.Datacenter, c.config.NodeName, &addr, "AutoEncrypt.Sign", &args, &reply); err == nil { - reply.IssuedCert.PrivateKeyPEM = pkPEM - return &reply, nil - } else { - c.logger.Warn("AutoEncrypt failed", "error", err) - } - } - } - attempts++ - - delay := lib.RandomStagger(retryJitterWindow) - interval := (time.Duration(attempts) * delay) + delay - c.logger.Warn("retrying AutoEncrypt", "retry_interval", interval) - select { - case <-time.After(interval): - continue - case <-ctx.Done(): - return errFn(fmt.Errorf("aborting AutoEncrypt because interrupted: %w", ctx.Err())) - case <-c.shutdownCh: - return errFn(fmt.Errorf("aborting AutoEncrypt because shutting down")) - } - } -} - -func missingPortError(host string, err error) bool { - return err != nil && err.Error() == fmt.Sprintf("address %s: missing port in address", host) -} - -// resolveAddr is used to resolve the host into IPs and error. -func resolveAddr(rawHost string, logger hclog.Logger) ([]net.IP, error) { - host, _, err := net.SplitHostPort(rawHost) - if err != nil { - // In case we encounter this error, we proceed with the - // rawHost. This is fine since -start-join and -retry-join - // take only hosts anyways and this is an expected case. - if missingPortError(rawHost, err) { - host = rawHost - } else { - return nil, err - } - } - - if ip := net.ParseIP(host); ip != nil { - return []net.IP{ip}, nil - } - - // First try TCP so we have the best chance for the largest list of - // hosts to join. If this fails it's not fatal since this isn't a standard - // way to query DNS, and we have a fallback below. - if ips, err := tcpLookupIP(host, logger); err != nil { - logger.Debug("TCP-first lookup failed for host, falling back to UDP", "host", host, "error", err) - } else if len(ips) > 0 { - return ips, nil - } - - // If TCP didn't yield anything then use the normal Go resolver which - // will try UDP, then might possibly try TCP again if the UDP response - // indicates it was truncated. - ips, err := net.LookupIP(host) - if err != nil { - return nil, err - } - return ips, nil -} - -// tcpLookupIP is a helper to initiate a TCP-based DNS lookup for the given host. -// The built-in Go resolver will do a UDP lookup first, and will only use TCP if -// the response has the truncate bit set, which isn't common on DNS servers like -// Consul's. By doing the TCP lookup directly, we get the best chance for the -// largest list of hosts to join. Since joins are relatively rare events, it's ok -// to do this rather expensive operation. -func tcpLookupIP(host string, logger hclog.Logger) ([]net.IP, error) { - // Don't attempt any TCP lookups against non-fully qualified domain - // names, since those will likely come from the resolv.conf file. - if !strings.Contains(host, ".") { - return nil, nil - } - - // Make sure the domain name is terminated with a dot (we know there's - // at least one character at this point). - dn := host - if dn[len(dn)-1] != '.' { - dn = dn + "." - } - - // See if we can find a server to try. - cc, err := dns.ClientConfigFromFile("/etc/resolv.conf") - if err != nil { - return nil, err - } - if len(cc.Servers) > 0 { - // Do the lookup. - c := new(dns.Client) - c.Net = "tcp" - msg := new(dns.Msg) - msg.SetQuestion(dn, dns.TypeANY) - in, _, err := c.Exchange(msg, cc.Servers[0]) - if err != nil { - return nil, err - } - - // Handle any IPs we get back that we can attempt to join. - var ips []net.IP - for _, r := range in.Answer { - switch rr := r.(type) { - case (*dns.A): - ips = append(ips, rr.A) - case (*dns.AAAA): - ips = append(ips, rr.AAAA) - case (*dns.CNAME): - logger.Debug("Ignoring CNAME RR in TCP-first answer for host", "host", host) - } - } - return ips, nil - } - - return nil, nil -} diff --git a/agent/consul/auto_encrypt_test.go b/agent/consul/auto_encrypt_test.go deleted file mode 100644 index 8dd04e416..000000000 --- a/agent/consul/auto_encrypt_test.go +++ /dev/null @@ -1,205 +0,0 @@ -package consul - -import ( - "context" - "crypto/x509" - "crypto/x509/pkix" - "encoding/asn1" - "net" - "net/url" - "os" - "testing" - "time" - - "github.com/hashicorp/consul/agent/connect" - "github.com/hashicorp/consul/agent/structs" - "github.com/hashicorp/consul/sdk/testutil" - "github.com/hashicorp/go-hclog" - "github.com/stretchr/testify/require" -) - -func TestAutoEncrypt_resolveAddr(t *testing.T) { - type args struct { - rawHost string - logger hclog.Logger - } - logger := testutil.Logger(t) - - tests := []struct { - name string - args args - ips []net.IP - wantErr bool - }{ - { - name: "host without port", - args: args{ - "127.0.0.1", - logger, - }, - ips: []net.IP{net.IPv4(127, 0, 0, 1)}, - wantErr: false, - }, - { - name: "host with port", - args: args{ - "127.0.0.1:1234", - logger, - }, - ips: []net.IP{net.IPv4(127, 0, 0, 1)}, - wantErr: false, - }, - { - name: "host with broken port", - args: args{ - "127.0.0.1:xyz", - logger, - }, - ips: []net.IP{net.IPv4(127, 0, 0, 1)}, - wantErr: false, - }, - { - name: "not an address", - args: args{ - "abc", - logger, - }, - ips: nil, - wantErr: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - ips, err := resolveAddr(tt.args.rawHost, tt.args.logger) - if (err != nil) != tt.wantErr { - t.Errorf("resolveAddr error: %v, wantErr: %v", err, tt.wantErr) - return - } - require.Equal(t, tt.ips, ips) - }) - } -} - -func TestAutoEncrypt_missingPortError(t *testing.T) { - host := "127.0.0.1" - _, _, err := net.SplitHostPort(host) - require.True(t, missingPortError(host, err)) - - host = "127.0.0.1:1234" - _, _, err = net.SplitHostPort(host) - require.False(t, missingPortError(host, err)) -} - -func TestAutoEncrypt_RequestAutoEncryptCerts(t *testing.T) { - dir1, c1 := testClient(t) - defer os.RemoveAll(dir1) - defer c1.Shutdown() - servers := []string{"localhost"} - port := 8301 - token := "" - - ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(75*time.Millisecond)) - defer cancel() - - doneCh := make(chan struct{}) - var err error - go func() { - _, err = c1.RequestAutoEncryptCerts(ctx, servers, port, token, nil, nil) - close(doneCh) - }() - select { - case <-doneCh: - // since there are no servers at this port, we shouldn't be - // done and this should be an error of some sorts that happened - // in the setup phase before entering the for loop in - // RequestAutoEncryptCerts. - require.NoError(t, err) - case <-ctx.Done(): - // this is the happy case since auto encrypt is in its loop to - // try to request certs. - } -} - -func TestAutoEncrypt_autoEncryptCSR(t *testing.T) { - type testCase struct { - conf *Config - extraDNSSANs []string - extraIPSANs []net.IP - err string - - // to validate the csr - expectedSubject pkix.Name - expectedSigAlg x509.SignatureAlgorithm - expectedPubAlg x509.PublicKeyAlgorithm - expectedDNSNames []string - expectedIPs []net.IP - expectedURIs []*url.URL - } - - cases := map[string]testCase{ - "sans": { - conf: &Config{ - Datacenter: "dc1", - NodeName: "test-node", - CAConfig: &structs.CAConfiguration{}, - }, - extraDNSSANs: []string{"foo.local", "bar.local"}, - extraIPSANs: []net.IP{net.IPv4(198, 18, 0, 1), net.IPv4(198, 18, 0, 2)}, - expectedSubject: pkix.Name{ - CommonName: connect.AgentCN("test-node", dummyTrustDomain), - Names: []pkix.AttributeTypeAndValue{ - { - // 2,5,4,3 is the CommonName type ASN1 identifier - Type: asn1.ObjectIdentifier{2, 5, 4, 3}, - Value: "testnode.agnt.dummy.tr.consul", - }, - }, - }, - expectedSigAlg: x509.ECDSAWithSHA256, - expectedPubAlg: x509.ECDSA, - expectedDNSNames: []string{ - "localhost", - "foo.local", - "bar.local", - }, - expectedIPs: []net.IP{ - {127, 0, 0, 1}, - net.ParseIP("::1"), - {198, 18, 0, 1}, - {198, 18, 0, 2}, - }, - expectedURIs: []*url.URL{ - { - Scheme: "spiffe", - Host: dummyTrustDomain, - Path: "/agent/client/dc/dc1/id/test-node", - }, - }, - }, - } - - for name, tcase := range cases { - t.Run(name, func(t *testing.T) { - client := Client{config: tcase.conf} - - _, csr, err := client.autoEncryptCSR(tcase.extraDNSSANs, tcase.extraIPSANs) - if tcase.err == "" { - require.NoError(t, err) - - request, err := connect.ParseCSR(csr) - require.NoError(t, err) - require.NotNil(t, request) - - require.Equal(t, tcase.expectedSubject, request.Subject) - require.Equal(t, tcase.expectedSigAlg, request.SignatureAlgorithm) - require.Equal(t, tcase.expectedPubAlg, request.PublicKeyAlgorithm) - require.Equal(t, tcase.expectedDNSNames, request.DNSNames) - require.Equal(t, tcase.expectedIPs, request.IPAddresses) - require.Equal(t, tcase.expectedURIs, request.URIs) - } else { - require.Error(t, err) - require.Empty(t, csr) - } - }) - } -} diff --git a/agent/setup.go b/agent/setup.go index 4ed282322..b807e7f7e 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -1,7 +1,6 @@ package agent import ( - "context" "fmt" "io" "net" @@ -10,11 +9,9 @@ import ( autoconf "github.com/hashicorp/consul/agent/auto-config" "github.com/hashicorp/consul/agent/cache" - certmon "github.com/hashicorp/consul/agent/cert-monitor" "github.com/hashicorp/consul/agent/config" "github.com/hashicorp/consul/agent/pool" "github.com/hashicorp/consul/agent/router" - "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/ipaddr" "github.com/hashicorp/consul/lib" @@ -86,38 +83,21 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer) (BaseDeps, error) d.Cache = cache.New(cfg.Cache) d.ConnPool = newConnPool(cfg, d.Logger, d.TLSConfigurator) - deferredAC := &deferredAutoConfig{} - d.Router = router.NewRouter(d.Logger, cfg.Datacenter, fmt.Sprintf("%s.%s", cfg.NodeName, cfg.Datacenter)) - cmConf := new(certmon.Config). - WithCache(d.Cache). - WithTLSConfigurator(d.TLSConfigurator). - WithDNSSANs(cfg.AutoConfig.DNSSANs). - WithIPSANs(cfg.AutoConfig.IPSANs). - WithDatacenter(cfg.Datacenter). - WithNodeName(cfg.NodeName). - WithFallback(deferredAC.autoConfigFallbackTLS). - WithLogger(d.Logger.Named(logging.AutoConfig)). - WithTokens(d.Tokens). - WithPersistence(deferredAC.autoConfigPersist) - acCertMon, err := certmon.New(cmConf) - if err != nil { - return d, err - } - acConf := autoconf.Config{ - DirectRPC: d.ConnPool, - Logger: d.Logger, - CertMonitor: acCertMon, - Loader: configLoader, + DirectRPC: d.ConnPool, + Logger: d.Logger, + Loader: configLoader, + ServerProvider: d.Router, + TLSConfigurator: d.TLSConfigurator, + Cache: d.Cache, + Tokens: d.Tokens, } d.AutoConfig, err = autoconf.New(acConf) if err != nil { return d, err } - // TODO: can this cyclic dependency be un-cycled? - deferredAC.autoConf = d.AutoConfig return d, nil } @@ -144,21 +124,3 @@ func newConnPool(config *config.RuntimeConfig, logger hclog.Logger, tls *tlsutil } return pool } - -type deferredAutoConfig struct { - autoConf *autoconf.AutoConfig // TODO: use an interface -} - -func (a *deferredAutoConfig) autoConfigFallbackTLS(ctx context.Context) (*structs.SignedResponse, error) { - if a.autoConf == nil { - return nil, fmt.Errorf("AutoConfig manager has not been created yet") - } - return a.autoConf.FallbackTLS(ctx) -} - -func (a *deferredAutoConfig) autoConfigPersist(resp *structs.SignedResponse) error { - if a.autoConf == nil { - return fmt.Errorf("AutoConfig manager has not been created yet") - } - return a.autoConf.RecordUpdatedCerts(resp) -} diff --git a/proto/translate.go b/proto/translate.go index 3619a0e6e..6ee90c084 100644 --- a/proto/translate.go +++ b/proto/translate.go @@ -12,6 +12,8 @@ var ( timePtrType = reflect.TypeOf((*time.Time)(nil)) timeType = timePtrType.Elem() mapStrInf = reflect.TypeOf((map[string]interface{})(nil)) + + epoch1970 = time.Date(1970, 1, 1, 0, 0, 0, 0, time.UTC) ) // HookPBTimestampToTime is a mapstructure decode hook to translate a protobuf timestamp @@ -19,7 +21,10 @@ var ( func HookPBTimestampToTime(from, to reflect.Type, data interface{}) (interface{}, error) { if to == timeType && from == tsType { ts := data.(*types.Timestamp) - return time.Unix(ts.Seconds, int64(ts.Nanos)), nil + if ts.Seconds == 0 && ts.Nanos == 0 { + return time.Time{}, nil + } + return time.Unix(ts.Seconds, int64(ts.Nanos)).UTC(), nil } return data, nil @@ -39,6 +44,13 @@ func HookTimeToPBTimestamp(from, to reflect.Type, data interface{}) (interface{} // seeing a *time.Time instead of a time.Time. if from == timePtrType && to == mapStrInf { ts := data.(*time.Time) + + // protobuf only supports times from Jan 1 1970 onward but the time.Time type + // can represent values back to year 1. Basically + if ts.Before(epoch1970) { + return map[string]interface{}{}, nil + } + nanos := ts.UnixNano() if nanos < 0 { return map[string]interface{}{}, nil diff --git a/proto/translate_test.go b/proto/translate_test.go index cd88d8933..0fbfa2b9b 100644 --- a/proto/translate_test.go +++ b/proto/translate_test.go @@ -27,7 +27,7 @@ func TestHookPBTimestampToTime(t *testing.T) { } expected := timeTSWrapper{ - Timestamp: time.Unix(1000, 42), + Timestamp: time.Unix(1000, 42).UTC(), } var actual timeTSWrapper @@ -43,7 +43,7 @@ func TestHookPBTimestampToTime(t *testing.T) { func TestHookTimeToPBTimestamp(t *testing.T) { in := timeTSWrapper{ - Timestamp: time.Unix(999999, 123456), + Timestamp: time.Unix(999999, 123456).UTC(), } expected := pbTSWrapper{ @@ -63,3 +63,24 @@ func TestHookTimeToPBTimestamp(t *testing.T) { require.Equal(t, expected, actual) } + +func TestHookTimeToPBTimestamp_ZeroTime(t *testing.T) { + in := timeTSWrapper{} + + expected := pbTSWrapper{ + Timestamp: &types.Timestamp{ + Seconds: 0, + Nanos: 0, + }, + } + + var actual pbTSWrapper + decoder, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{ + DecodeHook: HookTimeToPBTimestamp, + Result: &actual, + }) + require.NoError(t, err) + require.NoError(t, decoder.Decode(in)) + + require.Equal(t, expected, actual) +}