From 1673b3a68c86b6951f16033a74000342571b1c52 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Wed, 4 Aug 2021 13:05:10 -0400 Subject: [PATCH 1/2] telemetry: add a metric for agent TLS cert expiry --- .changelog/10768.txt | 4 ++ agent/agent.go | 5 ++ agent/consul/leader_connect.go | 4 +- agent/consul/leader_metrics.go | 76 +++++++++++++++++------- website/content/docs/agent/telemetry.mdx | 1 + 5 files changed, 66 insertions(+), 24 deletions(-) create mode 100644 .changelog/10768.txt diff --git a/.changelog/10768.txt b/.changelog/10768.txt new file mode 100644 index 000000000..022205856 --- /dev/null +++ b/.changelog/10768.txt @@ -0,0 +1,4 @@ +```release-note:improvement +telemetry: add a new `agent.tls.cert.expiry` metric for tracking when the Agent TLS certificate expires. +``` + diff --git a/agent/agent.go b/agent/agent.go index c13553629..8dc6c1f0a 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -639,6 +639,11 @@ func (a *Agent) Start(ctx context.Context) error { a.logger.Warn("DEPRECATED Backwards compatibility with pre-1.9 metrics enabled. These metrics will be removed in a future version of Consul. Set `telemetry { disable_compat_1.9 = true }` to disable them.") } + if a.tlsConfigurator.Cert() != nil { + m := consul.AgentTLSCertExpirationMonitor(a.tlsConfigurator, a.logger, a.config.Datacenter) + go m.Monitor(&lib.StopChannelContext{StopCh: a.shutdownCh}) + } + // consul version metric with labels metrics.SetGaugeWithLabels([]string{"version"}, 1, []metrics.Label{ {Name: "version", Value: a.config.Version}, diff --git a/agent/consul/leader_connect.go b/agent/consul/leader_connect.go index e25edf394..a90194ec5 100644 --- a/agent/consul/leader_connect.go +++ b/agent/consul/leader_connect.go @@ -34,8 +34,8 @@ func (s *Server) startConnectLeader(ctx context.Context) error { s.caManager.Start(ctx) s.leaderRoutineManager.Start(ctx, caRootPruningRoutineName, s.runCARootPruning) - s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).monitor) - s.leaderRoutineManager.Start(ctx, caSigningMetricRoutineName, signingCAExpiryMonitor(s).monitor) + s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).Monitor) + s.leaderRoutineManager.Start(ctx, caSigningMetricRoutineName, signingCAExpiryMonitor(s).Monitor) return s.startIntentionConfigEntryMigration(ctx) } diff --git a/agent/consul/leader_metrics.go b/agent/consul/leader_metrics.go index 1d40b6293..fb1eaa9cd 100644 --- a/agent/consul/leader_metrics.go +++ b/agent/consul/leader_metrics.go @@ -2,18 +2,19 @@ package consul import ( "context" + "crypto/x509" "errors" "fmt" "time" - "github.com/hashicorp/consul/agent/connect/ca" - - "github.com/hashicorp/consul/agent/connect" - "github.com/armon/go-metrics" "github.com/armon/go-metrics/prometheus" - "github.com/hashicorp/consul/logging" "github.com/hashicorp/go-hclog" + + "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/connect/ca" + "github.com/hashicorp/consul/logging" + "github.com/hashicorp/consul/tlsutil" ) var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"} @@ -28,10 +29,14 @@ var CertExpirationGauges = []prometheus.GaugeDefinition{ Name: metricsKeyMeshActiveSigningCAExpiry, Help: "Seconds until the service mesh signing certificate expires. Updated every hour", }, + { + Name: metricsKeyAgentTLSCertExpiry, + Help: "Seconds until the agent tls certificate expires. Updated every hour", + }, } -func rootCAExpiryMonitor(s *Server) certExpirationMonitor { - return certExpirationMonitor{ +func rootCAExpiryMonitor(s *Server) CertExpirationMonitor { + return CertExpirationMonitor{ Key: metricsKeyMeshRootCAExpiry, Labels: []metrics.Label{ {Name: "datacenter", Value: s.config.Datacenter}, @@ -56,10 +61,10 @@ func getRootCAExpiry(s *Server) (time.Duration, error) { return time.Until(root.NotAfter), nil } -func signingCAExpiryMonitor(s *Server) certExpirationMonitor { +func signingCAExpiryMonitor(s *Server) CertExpirationMonitor { isPrimary := s.config.Datacenter == s.config.PrimaryDatacenter if isPrimary { - return certExpirationMonitor{ + return CertExpirationMonitor{ Key: metricsKeyMeshActiveSigningCAExpiry, Labels: []metrics.Label{ {Name: "datacenter", Value: s.config.Datacenter}, @@ -76,17 +81,17 @@ func signingCAExpiryMonitor(s *Server) certExpirationMonitor { }, } - } else { - return certExpirationMonitor{ - Key: metricsKeyMeshActiveSigningCAExpiry, - Labels: []metrics.Label{ - {Name: "datacenter", Value: s.config.Datacenter}, - }, - Logger: s.logger.Named(logging.Connect), - Query: func() (time.Duration, error) { - return getActiveIntermediateExpiry(s) - }, - } + } + + return CertExpirationMonitor{ + Key: metricsKeyMeshActiveSigningCAExpiry, + Labels: []metrics.Label{ + {Name: "datacenter", Value: s.config.Datacenter}, + }, + Logger: s.logger.Named(logging.Connect), + Query: func() (time.Duration, error) { + return getActiveIntermediateExpiry(s) + }, } } @@ -109,7 +114,7 @@ func getActiveIntermediateExpiry(s *Server) (time.Duration, error) { return time.Until(cert.NotAfter), nil } -type certExpirationMonitor struct { +type CertExpirationMonitor struct { Key []string Labels []metrics.Label Logger hclog.Logger @@ -120,7 +125,7 @@ type certExpirationMonitor struct { const certExpirationMonitorInterval = time.Hour -func (m certExpirationMonitor) monitor(ctx context.Context) error { +func (m CertExpirationMonitor) Monitor(ctx context.Context) error { ticker := time.NewTicker(certExpirationMonitorInterval) defer ticker.Stop() @@ -138,3 +143,30 @@ func (m certExpirationMonitor) monitor(ctx context.Context) error { } } } + +var metricsKeyAgentTLSCertExpiry = []string{"agent", "tls", "cert", "expiry"} + +// AgentTLSCertExpirationMonitor returns a CertExpirationMonitor which will +// monitor the expiration of the certificate used for agent TLS. +func AgentTLSCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger, dc string) CertExpirationMonitor { + return CertExpirationMonitor{ + Key: metricsKeyAgentTLSCertExpiry, + Labels: []metrics.Label{ + {Name: "node", Value: c.Base().NodeName}, + {Name: "datacenter", Value: dc}, + }, + Logger: logger, + Query: func() (time.Duration, error) { + raw := c.Cert() + if raw == nil { + return 0, fmt.Errorf("tls not enabled") + } + + cert, err := x509.ParseCertificate(raw.Certificate[0]) + if err != nil { + return 0, fmt.Errorf("failed to parse agent tls cert: %w", err) + } + return time.Until(cert.NotAfter), nil + }, + } +} diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index bd1dedc51..8d2438569 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -480,6 +480,7 @@ These metrics give insight into the health of the cluster as a whole. | `consul.catalog.connect.not-found.` | Increments for each connect-based catalog query where the given service could not be found. | queries | counter | | `consul.mesh.active-root-ca.expiry` | The number of seconds until the root CA expires, updated every hour. | seconds | gauge | | `consul.mesh.active-signing-ca.expiry` | The number of seconds until the signing CA expires, updated every hour. | seconds | gauge | +| `consul.agent.tls.cert.expiry` | The number of seconds until the Agent TLS certificate expires, updated every hour. | seconds | gauge | ## Connect Built-in Proxy Metrics From 13aa7b70d59c63e13a7184e44338b11edab11de2 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Wed, 4 Aug 2021 13:26:36 -0400 Subject: [PATCH 2/2] telemetry: fix a couple bugs in cert expiry metrics 1. do not emit the metric if Query fails 2. properly check for PrimaryUsersIntermediate, the logic was inverted Also improve the logging by including the metric name in the log message --- agent/consul/leader_metrics.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/agent/consul/leader_metrics.go b/agent/consul/leader_metrics.go index fb1eaa9cd..42ac50c37 100644 --- a/agent/consul/leader_metrics.go +++ b/agent/consul/leader_metrics.go @@ -5,6 +5,7 @@ import ( "crypto/x509" "errors" "fmt" + "strings" "time" "github.com/armon/go-metrics" @@ -73,12 +74,10 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor { Query: func() (time.Duration, error) { provider, _ := s.caManager.getCAProvider() - if _, ok := provider.(ca.PrimaryUsesIntermediate); !ok { + if _, ok := provider.(ca.PrimaryUsesIntermediate); ok { return getActiveIntermediateExpiry(s) } - return getRootCAExpiry(s) - }, } } @@ -129,6 +128,8 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error { ticker := time.NewTicker(certExpirationMonitorInterval) defer ticker.Stop() + logger := m.Logger.With("metric", strings.Join(m.Key, ".")) + for { select { case <-ctx.Done(): @@ -136,7 +137,8 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error { case <-ticker.C: d, err := m.Query() if err != nil { - m.Logger.Warn("failed to emit certificate expiry metric", "error", err) + logger.Warn("failed to emit certificate expiry metric", "error", err) + continue } expiry := d / time.Second metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)