log warning about certificate expiring sooner and with more details

The old setting of 24 hours was not enough time to deal with an expiring certificates. This change ups it to 28 days OR 40% of the full cert duration, whichever is shorter. It also adds details to the log message to indicate which certificate it is logging about and a suggested action.
This commit is contained in:
John Eikenberry 2023-04-07 20:38:07 +00:00 committed by GitHub
parent 7933543096
commit 30d3a087dc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 107 additions and 26 deletions

View File

@ -19,8 +19,10 @@ import (
"github.com/hashicorp/consul/logging"
)
var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
var metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"}
var (
metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"}
)
var LeaderCertExpirationGauges = []prometheus.GaugeDefinition{
{
@ -37,30 +39,31 @@ func rootCAExpiryMonitor(s *Server) CertExpirationMonitor {
return CertExpirationMonitor{
Key: metricsKeyMeshRootCAExpiry,
Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) {
Query: func() (time.Duration, time.Duration, error) {
return getRootCAExpiry(s)
},
}
}
func getRootCAExpiry(s *Server) (time.Duration, error) {
func getRootCAExpiry(s *Server) (time.Duration, time.Duration, error) {
state := s.fsm.State()
_, root, err := state.CARootActive(nil)
switch {
case err != nil:
return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
return 0, 0, fmt.Errorf("failed to retrieve root CA: %w", err)
case root == nil:
return 0, fmt.Errorf("no active root CA")
return 0, 0, fmt.Errorf("no active root CA")
}
return time.Until(root.NotAfter), nil
lifetime := time.Since(root.NotBefore) + time.Until(root.NotAfter)
return lifetime, time.Until(root.NotAfter), nil
}
func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
return CertExpirationMonitor{
Key: metricsKeyMeshActiveSigningCAExpiry,
Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) {
Query: func() (time.Duration, time.Duration, error) {
if s.caManager.isIntermediateUsedToSignLeaf() {
return getActiveIntermediateExpiry(s)
}
@ -69,26 +72,28 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
}
}
func getActiveIntermediateExpiry(s *Server) (time.Duration, error) {
func getActiveIntermediateExpiry(s *Server) (time.Duration, time.Duration, error) {
state := s.fsm.State()
_, root, err := state.CARootActive(nil)
switch {
case err != nil:
return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
return 0, 0, fmt.Errorf("failed to retrieve root CA: %w", err)
case root == nil:
return 0, fmt.Errorf("no active root CA")
return 0, 0, fmt.Errorf("no active root CA")
}
// the CA used in a secondary DC is the active intermediate,
// which is the last in the IntermediateCerts stack
if len(root.IntermediateCerts) == 0 {
return 0, errors.New("no intermediate available")
return 0, 0, errors.New("no intermediate available")
}
cert, err := connect.ParseCert(root.IntermediateCerts[len(root.IntermediateCerts)-1])
if err != nil {
return 0, err
return 0, 0, err
}
return time.Until(cert.NotAfter), nil
lifetime := time.Since(cert.NotBefore) + time.Until(cert.NotAfter)
return lifetime, time.Until(cert.NotAfter), nil
}
type CertExpirationMonitor struct {
@ -99,9 +104,11 @@ type CertExpirationMonitor struct {
// then the metrics will expire before they are emitted again.
Labels []metrics.Label
Logger hclog.Logger
// Query is called at each interval. It should return the duration until the
// certificate expires, or an error if the query failed.
Query func() (time.Duration, error)
// Query is called at each interval. It should return 2 durations, the full
// lifespan of the certificate (NotBefore -> NotAfter) and the duration
// until the certificate expires (Now -> NotAfter), or an error if the
// query failed.
Query func() (time.Duration, time.Duration, error)
}
const certExpirationMonitorInterval = time.Hour
@ -113,18 +120,37 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
logger := m.Logger.With("metric", strings.Join(m.Key, "."))
emitMetric := func() {
d, err := m.Query()
lifetime, untilAfter, err := m.Query()
if err != nil {
logger.Warn("failed to emit certificate expiry metric", "error", err)
return
}
if d < 24*time.Hour {
logger.Warn("certificate will expire soon",
"time_to_expiry", d, "expiration", time.Now().Add(d))
if expiresSoon(lifetime, untilAfter) {
key := strings.Join(m.Key, ":")
switch key {
case "mesh:active-root-ca:expiry":
logger.Warn("root certificate will expire soon",
"time_to_expiry", untilAfter,
"expiration", time.Now().Add(untilAfter),
"suggested_action", "manually rotate the root certificate",
)
case "mesh:active-signing-ca:expiry":
logger.Warn("signing (intermediate) certificate will expire soon",
"time_to_expiry", untilAfter,
"expiration", time.Now().Add(untilAfter),
"suggested_action", "check consul logs for rotation issues",
)
case "agent:tls:cert:expiry":
logger.Warn("agent TLS certificate will expire soon",
"time_to_expiry", untilAfter,
"expiration", time.Now().Add(untilAfter),
"suggested_action", "manually rotate this agent's certificate",
)
}
}
expiry := d / time.Second
expiry := untilAfter / time.Second
metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)
}
@ -153,3 +179,19 @@ func initLeaderMetrics() {
metrics.SetGaugeWithLabels(g.Name, float32(math.NaN()), g.ConstLabels)
}
}
// expiresSoon checks to see if we are close enough to the cert expiring that
// we should send out a WARN log message.
// It returns true if the cert will expire within 28 days or 40% of the
// certificate's total duration (whichever is shorter).
func expiresSoon(lifetime, untilAfter time.Duration) bool {
defaultPeriod := 28 * (24 * time.Hour) // 28 days
fortyPercent := (lifetime / 10) * 4 // 40% of total duration
warningPeriod := defaultPeriod
if fortyPercent < defaultPeriod {
warningPeriod = fortyPercent
}
return untilAfter < warningPeriod
}

View File

@ -0,0 +1,37 @@
package consul
import (
"testing"
"time"
)
const (
day = time.Hour * 24
year = day * 365
)
func TestExpiresSoon(t *testing.T) {
// ExpiresSoon() should return true if 'untilAfter' is <= 28 days
// OR if 40% of lifetime if it is less than 28 days
testCases := []struct {
name string
lifetime, untilAfter time.Duration
expiresSoon bool
}{
{name: "base-pass", lifetime: year, untilAfter: year, expiresSoon: false},
{name: "base-expire", lifetime: year, untilAfter: (day * 27), expiresSoon: true},
{name: "expires", lifetime: (day * 70), untilAfter: (day * 20), expiresSoon: true},
{name: "passes", lifetime: (day * 70), untilAfter: (day * 50), expiresSoon: false},
{name: "just-expires", lifetime: (day * 70), untilAfter: (day * 27), expiresSoon: true},
{name: "just-passes", lifetime: (day * 70), untilAfter: (day * 43), expiresSoon: false},
{name: "40%-expire", lifetime: (day * 30), untilAfter: (day * 10), expiresSoon: true},
{name: "40%-pass", lifetime: (day * 30), untilAfter: (day * 12), expiresSoon: false},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
if expiresSoon(tc.lifetime, tc.untilAfter) != tc.expiresSoon {
t.Errorf("test case failed, should return `%t`", tc.expiresSoon)
}
})
}
}

View File

@ -30,17 +30,19 @@ func tlsCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger) cons
return consul.CertExpirationMonitor{
Key: metricsKeyAgentTLSCertExpiry,
Logger: logger,
Query: func() (time.Duration, error) {
Query: func() (time.Duration, time.Duration, error) {
raw := c.Cert()
if raw == nil {
return 0, fmt.Errorf("tls not enabled")
return 0, 0, fmt.Errorf("tls not enabled")
}
cert, err := x509.ParseCertificate(raw.Certificate[0])
if err != nil {
return 0, fmt.Errorf("failed to parse agent tls cert: %w", err)
return 0, 0, fmt.Errorf("failed to parse agent tls cert: %w", err)
}
return time.Until(cert.NotAfter), nil
lifetime := time.Since(cert.NotBefore) + time.Until(cert.NotAfter)
return lifetime, time.Until(cert.NotAfter), nil
},
}
}