diff --git a/.changelog/10504.txt b/.changelog/10504.txt
new file mode 100644
index 000000000..9ace691ea
--- /dev/null
+++ b/.changelog/10504.txt
@@ -0,0 +1,3 @@
+```release-note:enhancement
+telemetry: added metrics to track certificate expiry.
+```
diff --git a/agent/consul/leader_connect.go b/agent/consul/leader_connect.go
index 5f662e8b9..6ca14e4ce 100644
--- a/agent/consul/leader_connect.go
+++ b/agent/consul/leader_connect.go
@@ -37,6 +37,7 @@ func (s *Server) startConnectLeader(ctx context.Context) error {
 	s.caManager.Start(ctx)
 	s.leaderRoutineManager.Start(ctx, caRootPruningRoutineName, s.runCARootPruning)
 	s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).monitor)
+	s.leaderRoutineManager.Start(ctx, caSigningMetricRoutineName, signingCAExpiryMonitor(s).monitor)
 
 	return s.startIntentionConfigEntryMigration(ctx)
 }
@@ -46,6 +47,8 @@ func (s *Server) stopConnectLeader() {
 	s.caManager.Stop()
 	s.leaderRoutineManager.Stop(intentionMigrationRoutineName)
 	s.leaderRoutineManager.Stop(caRootPruningRoutineName)
+	s.leaderRoutineManager.Stop(caRootMetricRoutineName)
+	s.leaderRoutineManager.Stop(caSigningMetricRoutineName)
 
 	// If the provider implements NeedsStop, we call Stop to perform any shutdown actions.
 	provider, _ := s.caManager.getCAProvider()
diff --git a/agent/consul/leader_metrics.go b/agent/consul/leader_metrics.go
index ea936a0a9..1d40b6293 100644
--- a/agent/consul/leader_metrics.go
+++ b/agent/consul/leader_metrics.go
@@ -2,25 +2,34 @@ package consul
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"time"
 
+	"github.com/hashicorp/consul/agent/connect/ca"
+
+	"github.com/hashicorp/consul/agent/connect"
+
 	"github.com/armon/go-metrics"
 	"github.com/armon/go-metrics/prometheus"
-	"github.com/hashicorp/go-hclog"
-
 	"github.com/hashicorp/consul/logging"
+	"github.com/hashicorp/go-hclog"
 )
 
+var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
+var metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"}
+
 var CertExpirationGauges = []prometheus.GaugeDefinition{
 	{
 		Name: metricsKeyMeshRootCAExpiry,
-		Help: "Seconds until the service mesh root certificate expires.",
+		Help: "Seconds until the service mesh root certificate expires. Updated every hour",
+	},
+	{
+		Name: metricsKeyMeshActiveSigningCAExpiry,
+		Help: "Seconds until the service mesh signing certificate expires. Updated every hour",
 	},
 }
 
-var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
-
 func rootCAExpiryMonitor(s *Server) certExpirationMonitor {
 	return certExpirationMonitor{
 		Key: metricsKeyMeshRootCAExpiry,
@@ -29,20 +38,79 @@ func rootCAExpiryMonitor(s *Server) certExpirationMonitor {
 		},
 		Logger: s.logger.Named(logging.Connect),
 		Query: func() (time.Duration, error) {
-			state := s.fsm.State()
-			_, root, err := state.CARootActive(nil)
-			switch {
-			case err != nil:
-				return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
-			case root == nil:
-				return 0, fmt.Errorf("no active root CA")
-			}
-
-			return time.Until(root.NotAfter), nil
+			return getRootCAExpiry(s)
 		},
 	}
 }
 
+// getRootCAExpiry returns the time remaining until the active root CA
+// certificate expires. It fails if the root cannot be read from the state
+// store or if no root is active.
+func getRootCAExpiry(s *Server) (time.Duration, error) {
+	state := s.fsm.State()
+	_, root, err := state.CARootActive(nil)
+	switch {
+	case err != nil:
+		return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
+	case root == nil:
+		return 0, fmt.Errorf("no active root CA")
+	}
+
+	return time.Until(root.NotAfter), nil
+}
+
+// signingCAExpiryMonitor builds the monitor for the expiry of the certificate
+// that signs leaf certificates in this datacenter.
+func signingCAExpiryMonitor(s *Server) certExpirationMonitor {
+	isPrimary := s.config.Datacenter == s.config.PrimaryDatacenter
+	return certExpirationMonitor{
+		Key: metricsKeyMeshActiveSigningCAExpiry,
+		Labels: []metrics.Label{
+			{Name: "datacenter", Value: s.config.Datacenter},
+		},
+		Logger: s.logger.Named(logging.Connect),
+		Query: func() (time.Duration, error) {
+			// A secondary datacenter signs with its active intermediate.
+			if !isPrimary {
+				return getActiveIntermediateExpiry(s)
+			}
+
+			// The primary signs with an intermediate only when the provider
+			// implements ca.PrimaryUsesIntermediate; otherwise the root signs.
+			provider, _ := s.caManager.getCAProvider()
+			if _, ok := provider.(ca.PrimaryUsesIntermediate); ok {
+				return getActiveIntermediateExpiry(s)
+			}
+			return getRootCAExpiry(s)
+		},
+	}
+}
+
+// getActiveIntermediateExpiry returns the time remaining until the active
+// intermediate certificate of the active root CA expires.
+func getActiveIntermediateExpiry(s *Server) (time.Duration, error) {
+	state := s.fsm.State()
+	_, root, err := state.CARootActive(nil)
+	switch {
+	case err != nil:
+		return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
+	case root == nil:
+		// Guard against a nil root before reading IntermediateCerts.
+		return 0, errors.New("no active root CA")
+	}
+
+	// the CA used in a secondary DC is the active intermediate,
+	// which is the last in the IntermediateCerts stack
+	if len(root.IntermediateCerts) == 0 {
+		return 0, errors.New("no intermediate available")
+	}
+	cert, err := connect.ParseCert(root.IntermediateCerts[len(root.IntermediateCerts)-1])
+	if err != nil {
+		return 0, fmt.Errorf("failed to parse active intermediate certificate: %w", err)
+	}
+	return time.Until(cert.NotAfter), nil
+}
+
 type certExpirationMonitor struct {
 	Key    []string
 	Labels []metrics.Label
diff --git a/agent/consul/server.go b/agent/consul/server.go
index 210f61016..9b44b090a 100644
--- a/agent/consul/server.go
+++ b/agent/consul/server.go
@@ -103,6 +103,7 @@ const (
 	aclUpgradeRoutineName                 = "legacy ACL token upgrade"
 	caRootPruningRoutineName              = "CA root pruning"
 	caRootMetricRoutineName               = "CA root expiration metric"
+	caSigningMetricRoutineName            = "CA signing expiration metric"
 	configReplicationRoutineName          = "config entry replication"
 	federationStateReplicationRoutineName = "federation state replication"
 	federationStateAntiEntropyRoutineName = "federation state anti-entropy"
diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx
index 06e2b3884..c0156bfa9 100644
--- a/website/content/docs/agent/telemetry.mdx
+++ b/website/content/docs/agent/telemetry.mdx
@@ -479,6 +479,7 @@ These metrics give insight into the health of the cluster as a whole.
 | `consul.catalog.connect.query-tags.<service>.<tags>` | Increments for each connect-based catalog query for the given service with the given tags. | queries | counter |
 | `consul.catalog.connect.not-found.<service>` | Increments for each connect-based catalog query where the given service could not be found. | queries | counter |
 | `consul.mesh.active-root-ca.expiry` | The number of seconds until the root CA expires, updated every hour. | seconds | gauge |
+| `consul.mesh.active-signing-ca.expiry` | The number of seconds until the signing CA expires, updated every hour. | seconds | gauge |
 
 ## Connect Built-in Proxy Metrics
 