telemetry: improve cert expiry metrics

Emit the metric immediately so that after restarting an agent, the new expiry time is
emitted right away. This is particularly important when this metric is being alerted on,
because we want the alert to resolve itself immediately rather than waiting for the first
ticker interval.
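
For reference, a minimal, self-contained sketch of the pattern this change adopts, outside the Consul code: call the emit closure once before entering the ticker loop, so a freshly restarted process reports the current expiry without waiting for the first interval. The monitorExpiry, query, and emit names below are hypothetical stand-ins, not the agent's actual API.

package main

import (
	"context"
	"fmt"
	"time"
)

// monitorExpiry emits once up front, then again on every tick.
// query and emit are hypothetical stand-ins for the real metric
// query and gauge used by the agent.
func monitorExpiry(ctx context.Context, interval time.Duration,
	query func() (time.Duration, error), emit func(time.Duration)) {

	fn := func() {
		d, err := query()
		if err != nil {
			fmt.Println("failed to query certificate expiry:", err)
			return
		}
		emit(d)
	}

	// Emit immediately so a restarted process publishes the fresh expiry
	// (and a firing alert can resolve) before the first tick arrives.
	fn()

	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			fn()
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()

	query := func() (time.Duration, error) { return 720 * time.Hour, nil }
	emit := func(d time.Duration) { fmt.Println("seconds until expiry:", d/time.Second) }

	monitorExpiry(ctx, time.Second, query, emit)
}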

Also fix a bug that was exposed in one of these metrics: the active CARoot can be nil, so we
have to handle that case.
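
The fix follows the usual Go pattern for a lookup that can return a nil value alongside a nil error: treat the nil result as a failure before dereferencing it. A small self-contained sketch of that shape, where caRoot, activeRoot, and timeToExpiry are hypothetical stand-ins rather than the Consul state store API:

package main

import (
	"fmt"
	"time"
)

// caRoot is a hypothetical stand-in for the state store's CA root record.
type caRoot struct {
	NotAfter time.Time
}

// activeRoot mimics a lookup that can legitimately return (nil, nil),
// e.g. before the CA has been initialized.
func activeRoot() (*caRoot, error) {
	return nil, nil
}

// timeToExpiry treats both an error and a missing root as failures,
// so it never dereferences a nil root.
func timeToExpiry() (time.Duration, error) {
	root, err := activeRoot()
	switch {
	case err != nil:
		return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
	case root == nil:
		return 0, fmt.Errorf("no active root CA")
	}
	return time.Until(root.NotAfter), nil
}

func main() {
	if _, err := timeToExpiry(); err != nil {
		fmt.Println("skipping expiry metric:", err)
	}
}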
Daniel Nephin 2021-08-04 16:34:01 -04:00
parent ae76144f55
commit 616cc9b6f8
1 changed file with 26 additions and 15 deletions


@@ -97,8 +97,11 @@ func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
 func getActiveIntermediateExpiry(s *Server) (time.Duration, error) {
     state := s.fsm.State()
     _, root, err := state.CARootActive(nil)
-    if err != nil {
-        return 0, err
+    switch {
+    case err != nil:
+        return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
+    case root == nil:
+        return 0, fmt.Errorf("no active root CA")
     }
     // the CA used in a secondary DC is the active intermediate,
@@ -130,24 +133,32 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
     logger := m.Logger.With("metric", strings.Join(m.Key, "."))
+    fn := func() {
+        d, err := m.Query()
+        if err != nil {
+            logger.Warn("failed to emit certificate expiry metric", "error", err)
+            return
+        }
+        if d < 24*time.Hour {
+            logger.Warn("certificate will expire soon",
+                "time_to_expiry", d, "expiration", time.Now().Add(d))
+        }
+        expiry := d / time.Second
+        metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)
+    }
+    // emit the metric immediately so that if a cert was just updated the
+    // new metric will be updated to the new expiration time.
+    fn()
     for {
         select {
         case <-ctx.Done():
             return nil
         case <-ticker.C:
-            d, err := m.Query()
-            if err != nil {
-                logger.Warn("failed to emit certificate expiry metric", "error", err)
-                continue
-            }
-            if d < 24*time.Hour {
-                logger.Warn("certificate will expire soon",
-                    "time_to_expiry", d, "expiration", time.Now().Add(d))
-            }
-            expiry := d / time.Second
-            metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)
+            fn()
         }
     }
 }