From 30d3a087dc88b22c347f0115f992c8a49be6cfb3 Mon Sep 17 00:00:00 2001 From: John Eikenberry Date: Fri, 7 Apr 2023 20:38:07 +0000 Subject: [PATCH] log warning about certificate expiring sooner and with more details The old setting of 24 hours was not enough time to deal with an expiring certificates. This change ups it to 28 days OR 40% of the full cert duration, whichever is shorter. It also adds details to the log message to indicate which certificate it is logging about and a suggested action. --- agent/consul/leader_metrics.go | 86 +++++++++++++++++++++-------- agent/consul/leader_metrics_test.go | 37 +++++++++++++ agent/metrics.go | 10 ++-- 3 files changed, 107 insertions(+), 26 deletions(-) create mode 100644 agent/consul/leader_metrics_test.go diff --git a/agent/consul/leader_metrics.go b/agent/consul/leader_metrics.go index bd36a52f6..188e409e3 100644 --- a/agent/consul/leader_metrics.go +++ b/agent/consul/leader_metrics.go @@ -19,8 +19,10 @@ import ( "github.com/hashicorp/consul/logging" ) -var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"} -var metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"} +var ( + metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"} + metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"} +) var LeaderCertExpirationGauges = []prometheus.GaugeDefinition{ { @@ -37,30 +39,31 @@ func rootCAExpiryMonitor(s *Server) CertExpirationMonitor { return CertExpirationMonitor{ Key: metricsKeyMeshRootCAExpiry, Logger: s.logger.Named(logging.Connect), - Query: func() (time.Duration, error) { + Query: func() (time.Duration, time.Duration, error) { return getRootCAExpiry(s) }, } } -func getRootCAExpiry(s *Server) (time.Duration, error) { +func getRootCAExpiry(s *Server) (time.Duration, time.Duration, error) { state := s.fsm.State() _, root, err := state.CARootActive(nil) switch { case err != nil: - return 0, fmt.Errorf("failed to retrieve root CA: %w", err) + return 0, 0, fmt.Errorf("failed to retrieve root CA: %w", err) case root == nil: - return 0, fmt.Errorf("no active root CA") + return 0, 0, fmt.Errorf("no active root CA") } - return time.Until(root.NotAfter), nil + lifetime := time.Since(root.NotBefore) + time.Until(root.NotAfter) + return lifetime, time.Until(root.NotAfter), nil } func signingCAExpiryMonitor(s *Server) CertExpirationMonitor { return CertExpirationMonitor{ Key: metricsKeyMeshActiveSigningCAExpiry, Logger: s.logger.Named(logging.Connect), - Query: func() (time.Duration, error) { + Query: func() (time.Duration, time.Duration, error) { if s.caManager.isIntermediateUsedToSignLeaf() { return getActiveIntermediateExpiry(s) } @@ -69,26 +72,28 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor { } } -func getActiveIntermediateExpiry(s *Server) (time.Duration, error) { +func getActiveIntermediateExpiry(s *Server) (time.Duration, time.Duration, error) { state := s.fsm.State() _, root, err := state.CARootActive(nil) switch { case err != nil: - return 0, fmt.Errorf("failed to retrieve root CA: %w", err) + return 0, 0, fmt.Errorf("failed to retrieve root CA: %w", err) case root == nil: - return 0, fmt.Errorf("no active root CA") + return 0, 0, fmt.Errorf("no active root CA") } // the CA used in a secondary DC is the active intermediate, // which is the last in the IntermediateCerts stack if len(root.IntermediateCerts) == 0 { - return 0, errors.New("no intermediate available") + return 0, 0, errors.New("no intermediate available") } cert, err := connect.ParseCert(root.IntermediateCerts[len(root.IntermediateCerts)-1]) if err != nil { - return 0, err + return 0, 0, err } - return time.Until(cert.NotAfter), nil + + lifetime := time.Since(cert.NotBefore) + time.Until(cert.NotAfter) + return lifetime, time.Until(cert.NotAfter), nil } type CertExpirationMonitor struct { @@ -99,9 +104,11 @@ type CertExpirationMonitor struct { // then the metrics will expire before they are emitted again. Labels []metrics.Label Logger hclog.Logger - // Query is called at each interval. It should return the duration until the - // certificate expires, or an error if the query failed. - Query func() (time.Duration, error) + // Query is called at each interval. It should return 2 durations, the full + // lifespan of the certificate (NotBefore -> NotAfter) and the duration + // until the certificate expires (Now -> NotAfter), or an error if the + // query failed. + Query func() (time.Duration, time.Duration, error) } const certExpirationMonitorInterval = time.Hour @@ -113,18 +120,37 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error { logger := m.Logger.With("metric", strings.Join(m.Key, ".")) emitMetric := func() { - d, err := m.Query() + lifetime, untilAfter, err := m.Query() if err != nil { logger.Warn("failed to emit certificate expiry metric", "error", err) return } - if d < 24*time.Hour { - logger.Warn("certificate will expire soon", - "time_to_expiry", d, "expiration", time.Now().Add(d)) + if expiresSoon(lifetime, untilAfter) { + key := strings.Join(m.Key, ":") + switch key { + case "mesh:active-root-ca:expiry": + logger.Warn("root certificate will expire soon", + "time_to_expiry", untilAfter, + "expiration", time.Now().Add(untilAfter), + "suggested_action", "manually rotate the root certificate", + ) + case "mesh:active-signing-ca:expiry": + logger.Warn("signing (intermediate) certificate will expire soon", + "time_to_expiry", untilAfter, + "expiration", time.Now().Add(untilAfter), + "suggested_action", "check consul logs for rotation issues", + ) + case "agent:tls:cert:expiry": + logger.Warn("agent TLS certificate will expire soon", + "time_to_expiry", untilAfter, + "expiration", time.Now().Add(untilAfter), + "suggested_action", "manually rotate this agent's certificate", + ) + } } - expiry := d / time.Second + expiry := untilAfter / time.Second metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels) } @@ -153,3 +179,19 @@ func initLeaderMetrics() { metrics.SetGaugeWithLabels(g.Name, float32(math.NaN()), g.ConstLabels) } } + +// expiresSoon checks to see if we are close enough to the cert expiring that +// we should send out a WARN log message. +// It returns true if the cert will expire within 28 days or 40% of the +// certificate's total duration (whichever is shorter). +func expiresSoon(lifetime, untilAfter time.Duration) bool { + defaultPeriod := 28 * (24 * time.Hour) // 28 days + fortyPercent := (lifetime / 10) * 4 // 40% of total duration + + warningPeriod := defaultPeriod + if fortyPercent < defaultPeriod { + warningPeriod = fortyPercent + } + + return untilAfter < warningPeriod +} diff --git a/agent/consul/leader_metrics_test.go b/agent/consul/leader_metrics_test.go new file mode 100644 index 000000000..3227bcd05 --- /dev/null +++ b/agent/consul/leader_metrics_test.go @@ -0,0 +1,37 @@ +package consul + +import ( + "testing" + "time" +) + +const ( + day = time.Hour * 24 + year = day * 365 +) + +func TestExpiresSoon(t *testing.T) { + // ExpiresSoon() should return true if 'untilAfter' is <= 28 days + // OR if 40% of lifetime if it is less than 28 days + testCases := []struct { + name string + lifetime, untilAfter time.Duration + expiresSoon bool + }{ + {name: "base-pass", lifetime: year, untilAfter: year, expiresSoon: false}, + {name: "base-expire", lifetime: year, untilAfter: (day * 27), expiresSoon: true}, + {name: "expires", lifetime: (day * 70), untilAfter: (day * 20), expiresSoon: true}, + {name: "passes", lifetime: (day * 70), untilAfter: (day * 50), expiresSoon: false}, + {name: "just-expires", lifetime: (day * 70), untilAfter: (day * 27), expiresSoon: true}, + {name: "just-passes", lifetime: (day * 70), untilAfter: (day * 43), expiresSoon: false}, + {name: "40%-expire", lifetime: (day * 30), untilAfter: (day * 10), expiresSoon: true}, + {name: "40%-pass", lifetime: (day * 30), untilAfter: (day * 12), expiresSoon: false}, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if expiresSoon(tc.lifetime, tc.untilAfter) != tc.expiresSoon { + t.Errorf("test case failed, should return `%t`", tc.expiresSoon) + } + }) + } +} diff --git a/agent/metrics.go b/agent/metrics.go index 8fd3f87a2..d9294eb25 100644 --- a/agent/metrics.go +++ b/agent/metrics.go @@ -30,17 +30,19 @@ func tlsCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger) cons return consul.CertExpirationMonitor{ Key: metricsKeyAgentTLSCertExpiry, Logger: logger, - Query: func() (time.Duration, error) { + Query: func() (time.Duration, time.Duration, error) { raw := c.Cert() if raw == nil { - return 0, fmt.Errorf("tls not enabled") + return 0, 0, fmt.Errorf("tls not enabled") } cert, err := x509.ParseCertificate(raw.Certificate[0]) if err != nil { - return 0, fmt.Errorf("failed to parse agent tls cert: %w", err) + return 0, 0, fmt.Errorf("failed to parse agent tls cert: %w", err) } - return time.Until(cert.NotAfter), nil + + lifetime := time.Since(cert.NotBefore) + time.Until(cert.NotAfter) + return lifetime, time.Until(cert.NotAfter), nil }, } }