From a7fcf14c5ceff6e8fd5008d5fd5d61e379fdc458 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Wed, 20 Oct 2021 11:54:11 -0400 Subject: [PATCH] telemetry: fix cert expiry metrics by removing labels These labels should be set by whatever process scrapes Consul (for prometheus), or by the agent that receives them (for datadog/statsd). We need to remove them here because the labels are part of the "metric key", so we'd have to pre-declare the metrics with the labels. We could do that, but that is extra work for labels that should be added from elsewhere. Also renames the closure to be more descriptive. --- agent/consul/leader_metrics.go | 35 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/agent/consul/leader_metrics.go b/agent/consul/leader_metrics.go index faf8f68eb..6da2d70a9 100644 --- a/agent/consul/leader_metrics.go +++ b/agent/consul/leader_metrics.go @@ -42,10 +42,7 @@ var AgentCertExpirationGauges = []prometheus.GaugeDefinition{ func rootCAExpiryMonitor(s *Server) CertExpirationMonitor { return CertExpirationMonitor{ - Key: metricsKeyMeshRootCAExpiry, - Labels: []metrics.Label{ - {Name: "datacenter", Value: s.config.Datacenter}, - }, + Key: metricsKeyMeshRootCAExpiry, Logger: s.logger.Named(logging.Connect), Query: func() (time.Duration, error) { return getRootCAExpiry(s) @@ -70,10 +67,7 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor { isPrimary := s.config.Datacenter == s.config.PrimaryDatacenter if isPrimary { return CertExpirationMonitor{ - Key: metricsKeyMeshActiveSigningCAExpiry, - Labels: []metrics.Label{ - {Name: "datacenter", Value: s.config.Datacenter}, - }, + Key: metricsKeyMeshActiveSigningCAExpiry, Logger: s.logger.Named(logging.Connect), Query: func() (time.Duration, error) { provider, _ := s.caManager.getCAProvider() @@ -87,10 +81,7 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor { } return CertExpirationMonitor{ - Key: metricsKeyMeshActiveSigningCAExpiry, - Labels: []metrics.Label{ - {Name: "datacenter", Value: s.config.Datacenter}, - }, + Key: metricsKeyMeshActiveSigningCAExpiry, Logger: s.logger.Named(logging.Connect), Query: func() (time.Duration, error) { return getActiveIntermediateExpiry(s) @@ -121,7 +112,11 @@ func getActiveIntermediateExpiry(s *Server) (time.Duration, error) { } type CertExpirationMonitor struct { - Key []string + Key []string + // Labels to be emitted along with the metric. It is very important that these + // labels be included in the pre-declaration as well. Otherwise, if + // telemetry.prometheus_retention_time is less than certExpirationMonitorInterval + // then the metrics will expire before they are emitted again. Labels []metrics.Label Logger hclog.Logger // Query is called at each interval. It should return the duration until the @@ -137,7 +132,7 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error { logger := m.Logger.With("metric", strings.Join(m.Key, ".")) - fn := func() { + emitMetric := func() { d, err := m.Query() if err != nil { logger.Warn("failed to emit certificate expiry metric", "error", err) @@ -155,17 +150,17 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error { // emit the metric immediately so that if a cert was just updated the // new metric will be updated to the new expiration time. - fn() + emitMetric() for { select { case <-ctx.Done(): // "Zero-out" the metric on exit so that when prometheus scrapes this // metric from a non-leader, it does not get a stale value. - metrics.SetGauge(m.Key, float32(math.NaN())) + metrics.SetGaugeWithLabels(m.Key, float32(math.NaN()), m.Labels) return nil case <-ticker.C: - fn() + emitMetric() } } } @@ -176,11 +171,7 @@ var metricsKeyAgentTLSCertExpiry = []string{"agent", "tls", "cert", "expiry"} // monitor the expiration of the certificate used for agent TLS. func AgentTLSCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger, dc string) CertExpirationMonitor { return CertExpirationMonitor{ - Key: metricsKeyAgentTLSCertExpiry, - Labels: []metrics.Label{ - {Name: "node", Value: c.Base().NodeName}, - {Name: "datacenter", Value: dc}, - }, + Key: metricsKeyAgentTLSCertExpiry, Logger: logger, Query: func() (time.Duration, error) { raw := c.Cert()