Add ca certificate metrics (#10504)
* add intermediate ca metric routine * add Gauge config for intermediate cert * Stop metrics routine when stopping leader * add changelog entry * update changelog Co-authored-by: Daniel Nephin <dnephin@hashicorp.com> * use variables instead of a map * go imports sort * Add metrics for primary and secondary ca * start metrics routine in the right DC * add telemetry documentation * update docs * extract expiry fetching in a func * merge metrics for primary and secondary into signing ca metric Co-authored-by: Daniel Nephin <dnephin@hashicorp.com>
This commit is contained in:
parent
83c543cd6b
commit
e5dbf5e55b
|
@ -0,0 +1,3 @@
|
|||
```release-note:enhancement
|
||||
telemetry: added metrics to track certificate expiry.
|
||||
```
|
|
@ -37,6 +37,7 @@ func (s *Server) startConnectLeader(ctx context.Context) error {
|
|||
s.caManager.Start(ctx)
|
||||
s.leaderRoutineManager.Start(ctx, caRootPruningRoutineName, s.runCARootPruning)
|
||||
s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).monitor)
|
||||
s.leaderRoutineManager.Start(ctx, caSigningMetricRoutineName, signingCAExpiryMonitor(s).monitor)
|
||||
|
||||
return s.startIntentionConfigEntryMigration(ctx)
|
||||
}
|
||||
|
@ -46,6 +47,8 @@ func (s *Server) stopConnectLeader() {
|
|||
s.caManager.Stop()
|
||||
s.leaderRoutineManager.Stop(intentionMigrationRoutineName)
|
||||
s.leaderRoutineManager.Stop(caRootPruningRoutineName)
|
||||
s.leaderRoutineManager.Stop(caRootMetricRoutineName)
|
||||
s.leaderRoutineManager.Stop(caSigningMetricRoutineName)
|
||||
|
||||
// If the provider implements NeedsStop, we call Stop to perform any shutdown actions.
|
||||
provider, _ := s.caManager.getCAProvider()
|
||||
|
|
|
@ -2,25 +2,34 @@ package consul
|
|||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/consul/agent/connect/ca"
|
||||
|
||||
"github.com/hashicorp/consul/agent/connect"
|
||||
|
||||
"github.com/armon/go-metrics"
|
||||
"github.com/armon/go-metrics/prometheus"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
|
||||
"github.com/hashicorp/consul/logging"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
)
|
||||
|
||||
var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
|
||||
var metricsKeyMeshActiveSigningCAExpiry = []string{"mesh", "active-signing-ca", "expiry"}
|
||||
|
||||
var CertExpirationGauges = []prometheus.GaugeDefinition{
|
||||
{
|
||||
Name: metricsKeyMeshRootCAExpiry,
|
||||
Help: "Seconds until the service mesh root certificate expires.",
|
||||
Help: "Seconds until the service mesh root certificate expires. Updated every hour",
|
||||
},
|
||||
{
|
||||
Name: metricsKeyMeshActiveSigningCAExpiry,
|
||||
Help: "Seconds until the service mesh signing certificate expires. Updated every hour",
|
||||
},
|
||||
}
|
||||
|
||||
var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
|
||||
|
||||
func rootCAExpiryMonitor(s *Server) certExpirationMonitor {
|
||||
return certExpirationMonitor{
|
||||
Key: metricsKeyMeshRootCAExpiry,
|
||||
|
@ -29,20 +38,77 @@ func rootCAExpiryMonitor(s *Server) certExpirationMonitor {
|
|||
},
|
||||
Logger: s.logger.Named(logging.Connect),
|
||||
Query: func() (time.Duration, error) {
|
||||
state := s.fsm.State()
|
||||
_, root, err := state.CARootActive(nil)
|
||||
switch {
|
||||
case err != nil:
|
||||
return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
|
||||
case root == nil:
|
||||
return 0, fmt.Errorf("no active root CA")
|
||||
}
|
||||
|
||||
return time.Until(root.NotAfter), nil
|
||||
return getRootCAExpiry(s)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func getRootCAExpiry(s *Server) (time.Duration, error) {
|
||||
state := s.fsm.State()
|
||||
_, root, err := state.CARootActive(nil)
|
||||
switch {
|
||||
case err != nil:
|
||||
return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
|
||||
case root == nil:
|
||||
return 0, fmt.Errorf("no active root CA")
|
||||
}
|
||||
|
||||
return time.Until(root.NotAfter), nil
|
||||
}
|
||||
|
||||
func signingCAExpiryMonitor(s *Server) certExpirationMonitor {
|
||||
isPrimary := s.config.Datacenter == s.config.PrimaryDatacenter
|
||||
if isPrimary {
|
||||
return certExpirationMonitor{
|
||||
Key: metricsKeyMeshActiveSigningCAExpiry,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "datacenter", Value: s.config.Datacenter},
|
||||
},
|
||||
Logger: s.logger.Named(logging.Connect),
|
||||
Query: func() (time.Duration, error) {
|
||||
provider, _ := s.caManager.getCAProvider()
|
||||
|
||||
if _, ok := provider.(ca.PrimaryUsesIntermediate); !ok {
|
||||
return getActiveIntermediateExpiry(s)
|
||||
}
|
||||
|
||||
return getRootCAExpiry(s)
|
||||
|
||||
},
|
||||
}
|
||||
} else {
|
||||
return certExpirationMonitor{
|
||||
Key: metricsKeyMeshActiveSigningCAExpiry,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "datacenter", Value: s.config.Datacenter},
|
||||
},
|
||||
Logger: s.logger.Named(logging.Connect),
|
||||
Query: func() (time.Duration, error) {
|
||||
return getActiveIntermediateExpiry(s)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func getActiveIntermediateExpiry(s *Server) (time.Duration, error) {
|
||||
state := s.fsm.State()
|
||||
_, root, err := state.CARootActive(nil)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
// the CA used in a secondary DC is the active intermediate,
|
||||
// which is the last in the IntermediateCerts stack
|
||||
if len(root.IntermediateCerts) == 0 {
|
||||
return 0, errors.New("no intermediate available")
|
||||
}
|
||||
cert, err := connect.ParseCert(root.IntermediateCerts[len(root.IntermediateCerts)-1])
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return time.Until(cert.NotAfter), nil
|
||||
}
|
||||
|
||||
type certExpirationMonitor struct {
|
||||
Key []string
|
||||
Labels []metrics.Label
|
||||
|
|
|
@ -103,6 +103,7 @@ const (
|
|||
aclUpgradeRoutineName = "legacy ACL token upgrade"
|
||||
caRootPruningRoutineName = "CA root pruning"
|
||||
caRootMetricRoutineName = "CA root expiration metric"
|
||||
caSigningMetricRoutineName = "CA signing expiration metric"
|
||||
configReplicationRoutineName = "config entry replication"
|
||||
federationStateReplicationRoutineName = "federation state replication"
|
||||
federationStateAntiEntropyRoutineName = "federation state anti-entropy"
|
||||
|
|
|
@ -479,6 +479,7 @@ These metrics give insight into the health of the cluster as a whole.
|
|||
| `consul.catalog.connect.query-tags..` | Increments for each connect-based catalog query for the given service with the given tags. | queries | counter |
|
||||
| `consul.catalog.connect.not-found.` | Increments for each connect-based catalog query where the given service could not be found. | queries | counter |
|
||||
| `consul.mesh.active-root-ca.expiry` | The number of seconds until the root CA expires, updated every hour. | seconds | gauge |
|
||||
| `consul.mesh.active-signing-ca.expiry` | The number of seconds until the signing CA expires, updated every hour. | seconds | gauge |
|
||||
|
||||
## Connect Built-in Proxy Metrics
|
||||
|
||||
|
|
Loading…
Reference in New Issue