30d3a087dc
The old setting of 24 hours was not enough time to deal with an expiring certificates. This change ups it to 28 days OR 40% of the full cert duration, whichever is shorter. It also adds details to the log message to indicate which certificate it is logging about and a suggested action.
198 lines
6 KiB
Go
198 lines
6 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: MPL-2.0
|
|
|
|
package consul
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"math"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/armon/go-metrics"
|
|
"github.com/armon/go-metrics/prometheus"
|
|
"github.com/hashicorp/go-hclog"
|
|
|
|
"github.com/hashicorp/consul/agent/connect"
|
|
"github.com/hashicorp/consul/logging"
|
|
)
|
|
|
|
var (
|
|
metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
|
|
metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"}
|
|
)
|
|
|
|
var LeaderCertExpirationGauges = []prometheus.GaugeDefinition{
|
|
{
|
|
Name: metricsKeyMeshRootCAExpiry,
|
|
Help: "Seconds until the service mesh root certificate expires. Updated every hour",
|
|
},
|
|
{
|
|
Name: metricsKeyMeshActiveSigningCAExpiry,
|
|
Help: "Seconds until the service mesh signing certificate expires. Updated every hour",
|
|
},
|
|
}
|
|
|
|
func rootCAExpiryMonitor(s *Server) CertExpirationMonitor {
|
|
return CertExpirationMonitor{
|
|
Key: metricsKeyMeshRootCAExpiry,
|
|
Logger: s.logger.Named(logging.Connect),
|
|
Query: func() (time.Duration, time.Duration, error) {
|
|
return getRootCAExpiry(s)
|
|
},
|
|
}
|
|
}
|
|
|
|
func getRootCAExpiry(s *Server) (time.Duration, time.Duration, error) {
|
|
state := s.fsm.State()
|
|
_, root, err := state.CARootActive(nil)
|
|
switch {
|
|
case err != nil:
|
|
return 0, 0, fmt.Errorf("failed to retrieve root CA: %w", err)
|
|
case root == nil:
|
|
return 0, 0, fmt.Errorf("no active root CA")
|
|
}
|
|
|
|
lifetime := time.Since(root.NotBefore) + time.Until(root.NotAfter)
|
|
return lifetime, time.Until(root.NotAfter), nil
|
|
}
|
|
|
|
func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
|
|
return CertExpirationMonitor{
|
|
Key: metricsKeyMeshActiveSigningCAExpiry,
|
|
Logger: s.logger.Named(logging.Connect),
|
|
Query: func() (time.Duration, time.Duration, error) {
|
|
if s.caManager.isIntermediateUsedToSignLeaf() {
|
|
return getActiveIntermediateExpiry(s)
|
|
}
|
|
return getRootCAExpiry(s)
|
|
},
|
|
}
|
|
}
|
|
|
|
func getActiveIntermediateExpiry(s *Server) (time.Duration, time.Duration, error) {
|
|
state := s.fsm.State()
|
|
_, root, err := state.CARootActive(nil)
|
|
switch {
|
|
case err != nil:
|
|
return 0, 0, fmt.Errorf("failed to retrieve root CA: %w", err)
|
|
case root == nil:
|
|
return 0, 0, fmt.Errorf("no active root CA")
|
|
}
|
|
|
|
// the CA used in a secondary DC is the active intermediate,
|
|
// which is the last in the IntermediateCerts stack
|
|
if len(root.IntermediateCerts) == 0 {
|
|
return 0, 0, errors.New("no intermediate available")
|
|
}
|
|
cert, err := connect.ParseCert(root.IntermediateCerts[len(root.IntermediateCerts)-1])
|
|
if err != nil {
|
|
return 0, 0, err
|
|
}
|
|
|
|
lifetime := time.Since(cert.NotBefore) + time.Until(cert.NotAfter)
|
|
return lifetime, time.Until(cert.NotAfter), nil
|
|
}
|
|
|
|
type CertExpirationMonitor struct {
|
|
Key []string
|
|
// Labels to be emitted along with the metric. It is very important that these
|
|
// labels be included in the pre-declaration as well. Otherwise, if
|
|
// telemetry.prometheus_retention_time is less than certExpirationMonitorInterval
|
|
// then the metrics will expire before they are emitted again.
|
|
Labels []metrics.Label
|
|
Logger hclog.Logger
|
|
// Query is called at each interval. It should return 2 durations, the full
|
|
// lifespan of the certificate (NotBefore -> NotAfter) and the duration
|
|
// until the certificate expires (Now -> NotAfter), or an error if the
|
|
// query failed.
|
|
Query func() (time.Duration, time.Duration, error)
|
|
}
|
|
|
|
const certExpirationMonitorInterval = time.Hour
|
|
|
|
func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
|
|
ticker := time.NewTicker(certExpirationMonitorInterval)
|
|
defer ticker.Stop()
|
|
|
|
logger := m.Logger.With("metric", strings.Join(m.Key, "."))
|
|
|
|
emitMetric := func() {
|
|
lifetime, untilAfter, err := m.Query()
|
|
if err != nil {
|
|
logger.Warn("failed to emit certificate expiry metric", "error", err)
|
|
return
|
|
}
|
|
|
|
if expiresSoon(lifetime, untilAfter) {
|
|
key := strings.Join(m.Key, ":")
|
|
switch key {
|
|
case "mesh:active-root-ca:expiry":
|
|
logger.Warn("root certificate will expire soon",
|
|
"time_to_expiry", untilAfter,
|
|
"expiration", time.Now().Add(untilAfter),
|
|
"suggested_action", "manually rotate the root certificate",
|
|
)
|
|
case "mesh:active-signing-ca:expiry":
|
|
logger.Warn("signing (intermediate) certificate will expire soon",
|
|
"time_to_expiry", untilAfter,
|
|
"expiration", time.Now().Add(untilAfter),
|
|
"suggested_action", "check consul logs for rotation issues",
|
|
)
|
|
case "agent:tls:cert:expiry":
|
|
logger.Warn("agent TLS certificate will expire soon",
|
|
"time_to_expiry", untilAfter,
|
|
"expiration", time.Now().Add(untilAfter),
|
|
"suggested_action", "manually rotate this agent's certificate",
|
|
)
|
|
}
|
|
}
|
|
|
|
expiry := untilAfter / time.Second
|
|
metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)
|
|
}
|
|
|
|
// emit the metric immediately so that if a cert was just updated the
|
|
// new metric will be updated to the new expiration time.
|
|
emitMetric()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
// "Zero-out" the metric on exit so that when prometheus scrapes this
|
|
// metric from a non-leader, it does not get a stale value.
|
|
metrics.SetGaugeWithLabels(m.Key, float32(math.NaN()), m.Labels)
|
|
return nil
|
|
case <-ticker.C:
|
|
emitMetric()
|
|
}
|
|
}
|
|
}
|
|
|
|
// initLeaderMetrics sets all metrics that are emitted only on leaders to a NaN
|
|
// value so that they don't incorrectly report 0 when a server starts as a
|
|
// follower.
|
|
func initLeaderMetrics() {
|
|
for _, g := range LeaderCertExpirationGauges {
|
|
metrics.SetGaugeWithLabels(g.Name, float32(math.NaN()), g.ConstLabels)
|
|
}
|
|
}
|
|
|
|
// expiresSoon checks to see if we are close enough to the cert expiring that
|
|
// we should send out a WARN log message.
|
|
// It returns true if the cert will expire within 28 days or 40% of the
|
|
// certificate's total duration (whichever is shorter).
|
|
func expiresSoon(lifetime, untilAfter time.Duration) bool {
|
|
defaultPeriod := 28 * (24 * time.Hour) // 28 days
|
|
fortyPercent := (lifetime / 10) * 4 // 40% of total duration
|
|
|
|
warningPeriod := defaultPeriod
|
|
if fortyPercent < defaultPeriod {
|
|
warningPeriod = fortyPercent
|
|
}
|
|
|
|
return untilAfter < warningPeriod
|
|
}
|