diff --git a/agent/consul/leader_metrics.go b/agent/consul/leader_metrics.go index bd36a52f6..188e409e3 100644 --- a/agent/consul/leader_metrics.go +++ b/agent/consul/leader_metrics.go @@ -19,8 +19,10 @@ import ( "github.com/hashicorp/consul/logging" ) -var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"} -var metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"} +var ( + metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"} + metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"} +) var LeaderCertExpirationGauges = []prometheus.GaugeDefinition{ { @@ -37,30 +39,31 @@ func rootCAExpiryMonitor(s *Server) CertExpirationMonitor { return CertExpirationMonitor{ Key: metricsKeyMeshRootCAExpiry, Logger: s.logger.Named(logging.Connect), - Query: func() (time.Duration, error) { + Query: func() (time.Duration, time.Duration, error) { return getRootCAExpiry(s) }, } } -func getRootCAExpiry(s *Server) (time.Duration, error) { +func getRootCAExpiry(s *Server) (time.Duration, time.Duration, error) { state := s.fsm.State() _, root, err := state.CARootActive(nil) switch { case err != nil: - return 0, fmt.Errorf("failed to retrieve root CA: %w", err) + return 0, 0, fmt.Errorf("failed to retrieve root CA: %w", err) case root == nil: - return 0, fmt.Errorf("no active root CA") + return 0, 0, fmt.Errorf("no active root CA") } - return time.Until(root.NotAfter), nil + lifetime := time.Since(root.NotBefore) + time.Until(root.NotAfter) + return lifetime, time.Until(root.NotAfter), nil } func signingCAExpiryMonitor(s *Server) CertExpirationMonitor { return CertExpirationMonitor{ Key: metricsKeyMeshActiveSigningCAExpiry, Logger: s.logger.Named(logging.Connect), - Query: func() (time.Duration, error) { + Query: func() (time.Duration, time.Duration, error) { if s.caManager.isIntermediateUsedToSignLeaf() { return getActiveIntermediateExpiry(s) } @@ -69,26 +72,28 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor { } } -func getActiveIntermediateExpiry(s *Server) (time.Duration, error) { +func getActiveIntermediateExpiry(s *Server) (time.Duration, time.Duration, error) { state := s.fsm.State() _, root, err := state.CARootActive(nil) switch { case err != nil: - return 0, fmt.Errorf("failed to retrieve root CA: %w", err) + return 0, 0, fmt.Errorf("failed to retrieve root CA: %w", err) case root == nil: - return 0, fmt.Errorf("no active root CA") + return 0, 0, fmt.Errorf("no active root CA") } // the CA used in a secondary DC is the active intermediate, // which is the last in the IntermediateCerts stack if len(root.IntermediateCerts) == 0 { - return 0, errors.New("no intermediate available") + return 0, 0, errors.New("no intermediate available") } cert, err := connect.ParseCert(root.IntermediateCerts[len(root.IntermediateCerts)-1]) if err != nil { - return 0, err + return 0, 0, err } - return time.Until(cert.NotAfter), nil + + lifetime := time.Since(cert.NotBefore) + time.Until(cert.NotAfter) + return lifetime, time.Until(cert.NotAfter), nil } type CertExpirationMonitor struct { @@ -99,9 +104,11 @@ type CertExpirationMonitor struct { // then the metrics will expire before they are emitted again. Labels []metrics.Label Logger hclog.Logger - // Query is called at each interval. It should return the duration until the - // certificate expires, or an error if the query failed. - Query func() (time.Duration, error) + // Query is called at each interval. It should return 2 durations, the full + // lifespan of the certificate (NotBefore -> NotAfter) and the duration + // until the certificate expires (Now -> NotAfter), or an error if the + // query failed. + Query func() (time.Duration, time.Duration, error) } const certExpirationMonitorInterval = time.Hour @@ -113,18 +120,37 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error { logger := m.Logger.With("metric", strings.Join(m.Key, ".")) emitMetric := func() { - d, err := m.Query() + lifetime, untilAfter, err := m.Query() if err != nil { logger.Warn("failed to emit certificate expiry metric", "error", err) return } - if d < 24*time.Hour { - logger.Warn("certificate will expire soon", - "time_to_expiry", d, "expiration", time.Now().Add(d)) + if expiresSoon(lifetime, untilAfter) { + key := strings.Join(m.Key, ":") + switch key { + case "mesh:active-root-ca:expiry": + logger.Warn("root certificate will expire soon", + "time_to_expiry", untilAfter, + "expiration", time.Now().Add(untilAfter), + "suggested_action", "manually rotate the root certificate", + ) + case "mesh:active-signing-ca:expiry": + logger.Warn("signing (intermediate) certificate will expire soon", + "time_to_expiry", untilAfter, + "expiration", time.Now().Add(untilAfter), + "suggested_action", "check consul logs for rotation issues", + ) + case "agent:tls:cert:expiry": + logger.Warn("agent TLS certificate will expire soon", + "time_to_expiry", untilAfter, + "expiration", time.Now().Add(untilAfter), + "suggested_action", "manually rotate this agent's certificate", + ) + } } - expiry := d / time.Second + expiry := untilAfter / time.Second metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels) } @@ -153,3 +179,19 @@ func initLeaderMetrics() { metrics.SetGaugeWithLabels(g.Name, float32(math.NaN()), g.ConstLabels) } } + +// expiresSoon checks to see if we are close enough to the cert expiring that +// we should send out a WARN log message. +// It returns true if the cert will expire within 28 days or 40% of the +// certificate's total duration (whichever is shorter). +func expiresSoon(lifetime, untilAfter time.Duration) bool { + defaultPeriod := 28 * (24 * time.Hour) // 28 days + fortyPercent := (lifetime / 10) * 4 // 40% of total duration + + warningPeriod := defaultPeriod + if fortyPercent < defaultPeriod { + warningPeriod = fortyPercent + } + + return untilAfter < warningPeriod +} diff --git a/agent/consul/leader_metrics_test.go b/agent/consul/leader_metrics_test.go new file mode 100644 index 000000000..3227bcd05 --- /dev/null +++ b/agent/consul/leader_metrics_test.go @@ -0,0 +1,37 @@ +package consul + +import ( + "testing" + "time" +) + +const ( + day = time.Hour * 24 + year = day * 365 +) + +func TestExpiresSoon(t *testing.T) { + // ExpiresSoon() should return true if 'untilAfter' is <= 28 days + // OR if 40% of lifetime if it is less than 28 days + testCases := []struct { + name string + lifetime, untilAfter time.Duration + expiresSoon bool + }{ + {name: "base-pass", lifetime: year, untilAfter: year, expiresSoon: false}, + {name: "base-expire", lifetime: year, untilAfter: (day * 27), expiresSoon: true}, + {name: "expires", lifetime: (day * 70), untilAfter: (day * 20), expiresSoon: true}, + {name: "passes", lifetime: (day * 70), untilAfter: (day * 50), expiresSoon: false}, + {name: "just-expires", lifetime: (day * 70), untilAfter: (day * 27), expiresSoon: true}, + {name: "just-passes", lifetime: (day * 70), untilAfter: (day * 43), expiresSoon: false}, + {name: "40%-expire", lifetime: (day * 30), untilAfter: (day * 10), expiresSoon: true}, + {name: "40%-pass", lifetime: (day * 30), untilAfter: (day * 12), expiresSoon: false}, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if expiresSoon(tc.lifetime, tc.untilAfter) != tc.expiresSoon { + t.Errorf("test case failed, should return `%t`", tc.expiresSoon) + } + }) + } +} diff --git a/agent/metrics.go b/agent/metrics.go index 8fd3f87a2..d9294eb25 100644 --- a/agent/metrics.go +++ b/agent/metrics.go @@ -30,17 +30,19 @@ func tlsCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger) cons return consul.CertExpirationMonitor{ Key: metricsKeyAgentTLSCertExpiry, Logger: logger, - Query: func() (time.Duration, error) { + Query: func() (time.Duration, time.Duration, error) { raw := c.Cert() if raw == nil { - return 0, fmt.Errorf("tls not enabled") + return 0, 0, fmt.Errorf("tls not enabled") } cert, err := x509.ParseCertificate(raw.Certificate[0]) if err != nil { - return 0, fmt.Errorf("failed to parse agent tls cert: %w", err) + return 0, 0, fmt.Errorf("failed to parse agent tls cert: %w", err) } - return time.Until(cert.NotAfter), nil + + lifetime := time.Since(cert.NotBefore) + time.Until(cert.NotAfter) + return lifetime, time.Until(cert.NotAfter), nil }, } }