Merge pull request #9924 from hashicorp/dnephin/cert-expiration-metric
connect: emit a metric for the seconds until root CA expiry
This commit is contained in:
commit
f4c1f982d1
|
@ -0,0 +1,4 @@
|
|||
```release-note:improvement
|
||||
telemetry: add a new `mesh.active-root-ca.expiry` metric for tracking when the root certificate expires.
|
||||
```
|
||||
|
|
@ -36,6 +36,7 @@ func (s *Server) startConnectLeader(ctx context.Context) error {
|
|||
|
||||
s.caManager.Start(ctx)
|
||||
s.leaderRoutineManager.Start(ctx, caRootPruningRoutineName, s.runCARootPruning)
|
||||
s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).monitor)
|
||||
|
||||
return s.startIntentionConfigEntryMigration(ctx)
|
||||
}
|
||||
|
|
|
@ -0,0 +1,74 @@
|
|||
package consul
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/armon/go-metrics"
|
||||
"github.com/armon/go-metrics/prometheus"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
|
||||
"github.com/hashicorp/consul/logging"
|
||||
)
|
||||
|
||||
var CertExpirationGauges = []prometheus.GaugeDefinition{
|
||||
{
|
||||
Name: metricsKeyMeshRootCAExpiry,
|
||||
Help: "Seconds until the service mesh root certificate expires.",
|
||||
},
|
||||
}
|
||||
|
||||
var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
|
||||
|
||||
func rootCAExpiryMonitor(s *Server) certExpirationMonitor {
|
||||
return certExpirationMonitor{
|
||||
Key: metricsKeyMeshRootCAExpiry,
|
||||
Labels: []metrics.Label{
|
||||
{Name: "datacenter", Value: s.config.Datacenter},
|
||||
},
|
||||
Logger: s.logger.Named(logging.Connect),
|
||||
Query: func() (time.Duration, error) {
|
||||
state := s.fsm.State()
|
||||
_, root, err := state.CARootActive(nil)
|
||||
switch {
|
||||
case err != nil:
|
||||
return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
|
||||
case root == nil:
|
||||
return 0, fmt.Errorf("no active root CA")
|
||||
}
|
||||
|
||||
return time.Until(root.NotAfter), nil
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
type certExpirationMonitor struct {
|
||||
Key []string
|
||||
Labels []metrics.Label
|
||||
Logger hclog.Logger
|
||||
// Query is called at each interval. It should return the duration until the
|
||||
// certificate expires, or an error if the query failed.
|
||||
Query func() (time.Duration, error)
|
||||
}
|
||||
|
||||
// certExpirationMonitorInterval is the period of the ticker that drives
// certExpirationMonitor.monitor; each tick triggers one Query call.
const certExpirationMonitorInterval = time.Hour
|
||||
|
||||
func (m certExpirationMonitor) monitor(ctx context.Context) error {
|
||||
ticker := time.NewTicker(certExpirationMonitorInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil
|
||||
case <-ticker.C:
|
||||
d, err := m.Query()
|
||||
if err != nil {
|
||||
m.Logger.Warn("failed to emit certificate expiry metric", "error", err)
|
||||
}
|
||||
expiry := d / time.Second
|
||||
metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -102,6 +102,7 @@ const (
|
|||
aclTokenReapingRoutineName = "acl token reaping"
|
||||
aclUpgradeRoutineName = "legacy ACL token upgrade"
|
||||
caRootPruningRoutineName = "CA root pruning"
|
||||
caRootMetricRoutineName = "CA root expiration metric"
|
||||
configReplicationRoutineName = "config entry replication"
|
||||
federationStateReplicationRoutineName = "federation state replication"
|
||||
federationStateAntiEntropyRoutineName = "federation state anti-entropy"
|
||||
|
|
|
@ -194,6 +194,7 @@ func getPrometheusDefs(cfg lib.TelemetryConfig) ([]prometheus.GaugeDefinition, [
|
|||
xds.StatsGauges,
|
||||
usagemetrics.Gauges,
|
||||
consul.ReplicationGauges,
|
||||
consul.CertExpirationGauges,
|
||||
Gauges,
|
||||
raftGauges,
|
||||
}
|
||||
|
|
|
@ -478,6 +478,7 @@ These metrics give insight into the health of the cluster as a whole.
|
|||
| `consul.catalog.connect.query-tag.<service>.<tag>` | Increments for each connect-based catalog query for the given service with the given tag. | queries | counter |
|
||||
| `consul.catalog.connect.query-tags.<service>.<tags>` | Increments for each connect-based catalog query for the given service with the given tags. | queries | counter |
|
||||
| `consul.catalog.connect.not-found.<service>` | Increments for each connect-based catalog query where the given service could not be found. | queries | counter |
|
||||
| `consul.mesh.active-root-ca.expiry` | The number of seconds until the root CA expires, updated every hour. | seconds | gauge |
|
||||
|
||||
## Connect Built-in Proxy Metrics
|
||||
|
||||
|
|
Loading…
Reference in New Issue