diff --git a/.changelog/10768.txt b/.changelog/10768.txt new file mode 100644 index 000000000..022205856 --- /dev/null +++ b/.changelog/10768.txt @@ -0,0 +1,4 @@ +```release-note:improvement +telemetry: add a new `agent.tls.cert.expiry` metric for tracking when the Agent TLS certificate expires. +``` + diff --git a/agent/agent.go b/agent/agent.go index c13553629..8dc6c1f0a 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -639,6 +639,11 @@ func (a *Agent) Start(ctx context.Context) error { a.logger.Warn("DEPRECATED Backwards compatibility with pre-1.9 metrics enabled. These metrics will be removed in a future version of Consul. Set `telemetry { disable_compat_1.9 = true }` to disable them.") } + if a.tlsConfigurator.Cert() != nil { + m := consul.AgentTLSCertExpirationMonitor(a.tlsConfigurator, a.logger, a.config.Datacenter) + go m.Monitor(&lib.StopChannelContext{StopCh: a.shutdownCh}) + } + // consul version metric with labels metrics.SetGaugeWithLabels([]string{"version"}, 1, []metrics.Label{ {Name: "version", Value: a.config.Version}, diff --git a/agent/consul/leader_connect.go b/agent/consul/leader_connect.go index e25edf394..a90194ec5 100644 --- a/agent/consul/leader_connect.go +++ b/agent/consul/leader_connect.go @@ -34,8 +34,8 @@ func (s *Server) startConnectLeader(ctx context.Context) error { s.caManager.Start(ctx) s.leaderRoutineManager.Start(ctx, caRootPruningRoutineName, s.runCARootPruning) - s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).monitor) - s.leaderRoutineManager.Start(ctx, caSigningMetricRoutineName, signingCAExpiryMonitor(s).monitor) + s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).Monitor) + s.leaderRoutineManager.Start(ctx, caSigningMetricRoutineName, signingCAExpiryMonitor(s).Monitor) return s.startIntentionConfigEntryMigration(ctx) } diff --git a/agent/consul/leader_metrics.go b/agent/consul/leader_metrics.go index 1d40b6293..42ac50c37 100644 --- a/agent/consul/leader_metrics.go +++ b/agent/consul/leader_metrics.go @@ -2,18 +2,20 @@ package consul import ( "context" + "crypto/x509" "errors" "fmt" + "strings" "time" - "github.com/hashicorp/consul/agent/connect/ca" - - "github.com/hashicorp/consul/agent/connect" - "github.com/armon/go-metrics" "github.com/armon/go-metrics/prometheus" - "github.com/hashicorp/consul/logging" "github.com/hashicorp/go-hclog" + + "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/connect/ca" + "github.com/hashicorp/consul/logging" + "github.com/hashicorp/consul/tlsutil" ) var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"} @@ -28,10 +30,14 @@ var CertExpirationGauges = []prometheus.GaugeDefinition{ Name: metricsKeyMeshActiveSigningCAExpiry, Help: "Seconds until the service mesh signing certificate expires. Updated every hour", }, + { + Name: metricsKeyAgentTLSCertExpiry, + Help: "Seconds until the agent tls certificate expires. Updated every hour", + }, } -func rootCAExpiryMonitor(s *Server) certExpirationMonitor { - return certExpirationMonitor{ +func rootCAExpiryMonitor(s *Server) CertExpirationMonitor { + return CertExpirationMonitor{ Key: metricsKeyMeshRootCAExpiry, Labels: []metrics.Label{ {Name: "datacenter", Value: s.config.Datacenter}, @@ -56,10 +62,10 @@ func getRootCAExpiry(s *Server) (time.Duration, error) { return time.Until(root.NotAfter), nil } -func signingCAExpiryMonitor(s *Server) certExpirationMonitor { +func signingCAExpiryMonitor(s *Server) CertExpirationMonitor { isPrimary := s.config.Datacenter == s.config.PrimaryDatacenter if isPrimary { - return certExpirationMonitor{ + return CertExpirationMonitor{ Key: metricsKeyMeshActiveSigningCAExpiry, Labels: []metrics.Label{ {Name: "datacenter", Value: s.config.Datacenter}, @@ -68,25 +74,23 @@ func signingCAExpiryMonitor(s *Server) certExpirationMonitor { Query: func() (time.Duration, error) { provider, _ := s.caManager.getCAProvider() - if _, ok := provider.(ca.PrimaryUsesIntermediate); !ok { + if _, ok := provider.(ca.PrimaryUsesIntermediate); ok { return getActiveIntermediateExpiry(s) } - return getRootCAExpiry(s) + }, + } + } - }, - } - } else { - return certExpirationMonitor{ - Key: metricsKeyMeshActiveSigningCAExpiry, - Labels: []metrics.Label{ - {Name: "datacenter", Value: s.config.Datacenter}, - }, - Logger: s.logger.Named(logging.Connect), - Query: func() (time.Duration, error) { - return getActiveIntermediateExpiry(s) - }, - } + return CertExpirationMonitor{ + Key: metricsKeyMeshActiveSigningCAExpiry, + Labels: []metrics.Label{ + {Name: "datacenter", Value: s.config.Datacenter}, + }, + Logger: s.logger.Named(logging.Connect), + Query: func() (time.Duration, error) { + return getActiveIntermediateExpiry(s) + }, } } @@ -109,7 +113,7 @@ func getActiveIntermediateExpiry(s *Server) (time.Duration, error) { return time.Until(cert.NotAfter), nil } -type certExpirationMonitor struct { +type CertExpirationMonitor struct { Key []string Labels []metrics.Label Logger hclog.Logger @@ -120,10 +124,12 @@ type certExpirationMonitor struct { const certExpirationMonitorInterval = time.Hour -func (m certExpirationMonitor) monitor(ctx context.Context) error { +func (m CertExpirationMonitor) Monitor(ctx context.Context) error { ticker := time.NewTicker(certExpirationMonitorInterval) defer ticker.Stop() + logger := m.Logger.With("metric", strings.Join(m.Key, ".")) + for { select { case <-ctx.Done(): @@ -131,10 +137,38 @@ func (m certExpirationMonitor) monitor(ctx context.Context) error { case <-ticker.C: d, err := m.Query() if err != nil { - m.Logger.Warn("failed to emit certificate expiry metric", "error", err) + logger.Warn("failed to emit certificate expiry metric", "error", err) + continue } expiry := d / time.Second metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels) } } } + +var metricsKeyAgentTLSCertExpiry = []string{"agent", "tls", "cert", "expiry"} + +// AgentTLSCertExpirationMonitor returns a CertExpirationMonitor which will +// monitor the expiration of the certificate used for agent TLS. +func AgentTLSCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger, dc string) CertExpirationMonitor { + return CertExpirationMonitor{ + Key: metricsKeyAgentTLSCertExpiry, + Labels: []metrics.Label{ + {Name: "node", Value: c.Base().NodeName}, + {Name: "datacenter", Value: dc}, + }, + Logger: logger, + Query: func() (time.Duration, error) { + raw := c.Cert() + if raw == nil { + return 0, fmt.Errorf("tls not enabled") + } + + cert, err := x509.ParseCertificate(raw.Certificate[0]) + if err != nil { + return 0, fmt.Errorf("failed to parse agent tls cert: %w", err) + } + return time.Until(cert.NotAfter), nil + }, + } +} diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index bd1dedc51..8d2438569 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -480,6 +480,7 @@ These metrics give insight into the health of the cluster as a whole. | `consul.catalog.connect.not-found.` | Increments for each connect-based catalog query where the given service could not be found. | queries | counter | | `consul.mesh.active-root-ca.expiry` | The number of seconds until the root CA expires, updated every hour. | seconds | gauge | | `consul.mesh.active-signing-ca.expiry` | The number of seconds until the signing CA expires, updated every hour. | seconds | gauge | +| `consul.agent.tls.cert.expiry` | The number of seconds until the Agent TLS certificate expires, updated every hour. | seconds | gauge | ## Connect Built-in Proxy Metrics