telemetry: add a metric for agent TLS cert expiry

This commit is contained in:
Daniel Nephin 2021-08-04 13:05:10 -04:00
parent 57aabe3455
commit 1673b3a68c
5 changed files with 66 additions and 24 deletions

4
.changelog/10768.txt Normal file
View File

@ -0,0 +1,4 @@
```release-note:improvement
telemetry: add a new `agent.tls.cert.expiry` metric for tracking when the Agent TLS certificate expires.
```

View File

@ -639,6 +639,11 @@ func (a *Agent) Start(ctx context.Context) error {
a.logger.Warn("DEPRECATED Backwards compatibility with pre-1.9 metrics enabled. These metrics will be removed in a future version of Consul. Set `telemetry { disable_compat_1.9 = true }` to disable them.") a.logger.Warn("DEPRECATED Backwards compatibility with pre-1.9 metrics enabled. These metrics will be removed in a future version of Consul. Set `telemetry { disable_compat_1.9 = true }` to disable them.")
} }
if a.tlsConfigurator.Cert() != nil {
m := consul.AgentTLSCertExpirationMonitor(a.tlsConfigurator, a.logger, a.config.Datacenter)
go m.Monitor(&lib.StopChannelContext{StopCh: a.shutdownCh})
}
// consul version metric with labels // consul version metric with labels
metrics.SetGaugeWithLabels([]string{"version"}, 1, []metrics.Label{ metrics.SetGaugeWithLabels([]string{"version"}, 1, []metrics.Label{
{Name: "version", Value: a.config.Version}, {Name: "version", Value: a.config.Version},

View File

@ -34,8 +34,8 @@ func (s *Server) startConnectLeader(ctx context.Context) error {
s.caManager.Start(ctx) s.caManager.Start(ctx)
s.leaderRoutineManager.Start(ctx, caRootPruningRoutineName, s.runCARootPruning) s.leaderRoutineManager.Start(ctx, caRootPruningRoutineName, s.runCARootPruning)
s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).monitor) s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).Monitor)
s.leaderRoutineManager.Start(ctx, caSigningMetricRoutineName, signingCAExpiryMonitor(s).monitor) s.leaderRoutineManager.Start(ctx, caSigningMetricRoutineName, signingCAExpiryMonitor(s).Monitor)
return s.startIntentionConfigEntryMigration(ctx) return s.startIntentionConfigEntryMigration(ctx)
} }

View File

@ -2,18 +2,19 @@ package consul
import ( import (
"context" "context"
"crypto/x509"
"errors" "errors"
"fmt" "fmt"
"time" "time"
"github.com/hashicorp/consul/agent/connect/ca"
"github.com/hashicorp/consul/agent/connect"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus" "github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/go-hclog" "github.com/hashicorp/go-hclog"
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/connect/ca"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/consul/tlsutil"
) )
var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"} var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
@ -28,10 +29,14 @@ var CertExpirationGauges = []prometheus.GaugeDefinition{
Name: metricsKeyMeshActiveSigningCAExpiry, Name: metricsKeyMeshActiveSigningCAExpiry,
Help: "Seconds until the service mesh signing certificate expires. Updated every hour", Help: "Seconds until the service mesh signing certificate expires. Updated every hour",
}, },
{
Name: metricsKeyAgentTLSCertExpiry,
Help: "Seconds until the agent tls certificate expires. Updated every hour",
},
} }
func rootCAExpiryMonitor(s *Server) certExpirationMonitor { func rootCAExpiryMonitor(s *Server) CertExpirationMonitor {
return certExpirationMonitor{ return CertExpirationMonitor{
Key: metricsKeyMeshRootCAExpiry, Key: metricsKeyMeshRootCAExpiry,
Labels: []metrics.Label{ Labels: []metrics.Label{
{Name: "datacenter", Value: s.config.Datacenter}, {Name: "datacenter", Value: s.config.Datacenter},
@ -56,10 +61,10 @@ func getRootCAExpiry(s *Server) (time.Duration, error) {
return time.Until(root.NotAfter), nil return time.Until(root.NotAfter), nil
} }
func signingCAExpiryMonitor(s *Server) certExpirationMonitor { func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
isPrimary := s.config.Datacenter == s.config.PrimaryDatacenter isPrimary := s.config.Datacenter == s.config.PrimaryDatacenter
if isPrimary { if isPrimary {
return certExpirationMonitor{ return CertExpirationMonitor{
Key: metricsKeyMeshActiveSigningCAExpiry, Key: metricsKeyMeshActiveSigningCAExpiry,
Labels: []metrics.Label{ Labels: []metrics.Label{
{Name: "datacenter", Value: s.config.Datacenter}, {Name: "datacenter", Value: s.config.Datacenter},
@ -76,17 +81,17 @@ func signingCAExpiryMonitor(s *Server) certExpirationMonitor {
}, },
} }
} else { }
return certExpirationMonitor{
Key: metricsKeyMeshActiveSigningCAExpiry, return CertExpirationMonitor{
Labels: []metrics.Label{ Key: metricsKeyMeshActiveSigningCAExpiry,
{Name: "datacenter", Value: s.config.Datacenter}, Labels: []metrics.Label{
}, {Name: "datacenter", Value: s.config.Datacenter},
Logger: s.logger.Named(logging.Connect), },
Query: func() (time.Duration, error) { Logger: s.logger.Named(logging.Connect),
return getActiveIntermediateExpiry(s) Query: func() (time.Duration, error) {
}, return getActiveIntermediateExpiry(s)
} },
} }
} }
@ -109,7 +114,7 @@ func getActiveIntermediateExpiry(s *Server) (time.Duration, error) {
return time.Until(cert.NotAfter), nil return time.Until(cert.NotAfter), nil
} }
type certExpirationMonitor struct { type CertExpirationMonitor struct {
Key []string Key []string
Labels []metrics.Label Labels []metrics.Label
Logger hclog.Logger Logger hclog.Logger
@ -120,7 +125,7 @@ type certExpirationMonitor struct {
const certExpirationMonitorInterval = time.Hour const certExpirationMonitorInterval = time.Hour
func (m certExpirationMonitor) monitor(ctx context.Context) error { func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
ticker := time.NewTicker(certExpirationMonitorInterval) ticker := time.NewTicker(certExpirationMonitorInterval)
defer ticker.Stop() defer ticker.Stop()
@ -138,3 +143,30 @@ func (m certExpirationMonitor) monitor(ctx context.Context) error {
} }
} }
} }
// metricsKeyAgentTLSCertExpiry is the key of the gauge that reports the
// seconds until the agent TLS certificate expires.
var metricsKeyAgentTLSCertExpiry = []string{"agent", "tls", "cert", "expiry"}
// AgentTLSCertExpirationMonitor returns a CertExpirationMonitor which will
// monitor the expiration of the certificate used for agent TLS.
//
// The returned monitor reports under metricsKeyAgentTLSCertExpiry and is
// labelled with the node name taken from c.Base() and with the datacenter dc.
// Both label values are captured once, when this function is called. logger
// is stored on the monitor unchanged.
//
// Every call to the monitor's Query re-reads the certificate from c and
// returns the time remaining until its NotAfter. Query returns an error when
// c has no certificate, when the certificate chain is empty, or when the
// first certificate in the chain cannot be parsed.
func AgentTLSCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger, dc string) CertExpirationMonitor {
	return CertExpirationMonitor{
		Key: metricsKeyAgentTLSCertExpiry,
		Labels: []metrics.Label{
			{Name: "node", Value: c.Base().NodeName},
			{Name: "datacenter", Value: dc},
		},
		Logger: logger,
		Query: func() (time.Duration, error) {
			raw := c.Cert()
			if raw == nil {
				return 0, fmt.Errorf("tls not enabled")
			}
			// Guard the index below: an empty chain would otherwise panic
			// rather than surface as an error to the caller of Query.
			if len(raw.Certificate) == 0 {
				return 0, fmt.Errorf("agent tls cert has an empty certificate chain")
			}

			cert, err := x509.ParseCertificate(raw.Certificate[0])
			if err != nil {
				return 0, fmt.Errorf("failed to parse agent tls cert: %w", err)
			}
			return time.Until(cert.NotAfter), nil
		},
	}
}

View File

@ -480,6 +480,7 @@ These metrics give insight into the health of the cluster as a whole.
| `consul.catalog.connect.not-found.` | Increments for each connect-based catalog query where the given service could not be found. | queries | counter | | `consul.catalog.connect.not-found.` | Increments for each connect-based catalog query where the given service could not be found. | queries | counter |
| `consul.mesh.active-root-ca.expiry` | The number of seconds until the root CA expires, updated every hour. | seconds | gauge | | `consul.mesh.active-root-ca.expiry` | The number of seconds until the root CA expires, updated every hour. | seconds | gauge |
| `consul.mesh.active-signing-ca.expiry` | The number of seconds until the signing CA expires, updated every hour. | seconds | gauge | | `consul.mesh.active-signing-ca.expiry` | The number of seconds until the signing CA expires, updated every hour. | seconds | gauge |
| `consul.agent.tls.cert.expiry` | The number of seconds until the Agent TLS certificate expires, updated every hour. | seconds | gauge |
## Connect Built-in Proxy Metrics ## Connect Built-in Proxy Metrics