fix consul_autopilot_healthy metric emission (#11231)

https://github.com/hashicorp/consul/issues/10730
This commit is contained in:
FFMMM 2021-10-08 10:31:50 -07:00 committed by GitHub
parent b729414c05
commit 7f28301212
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 76 additions and 2 deletions

3
.changelog/11231.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
telemetry: fixes a bug where the Prometheus consul_autopilot_healthy metric reported 0 instead of NaN on servers that are not the leader.
```

View File

@ -9,6 +9,7 @@ import (
"github.com/hashicorp/raft"
autopilot "github.com/hashicorp/raft-autopilot"
"github.com/hashicorp/serf/serf"
"math"
"github.com/hashicorp/consul/agent/metadata"
"github.com/hashicorp/consul/agent/structs"
@ -52,6 +53,12 @@ func (d *AutopilotDelegate) NotifyState(state *autopilot.State) {
} else {
metrics.SetGauge([]string{"autopilot", "healthy"}, 0)
}
} else {
// if we are not a leader, emit NaN per
// https://www.consul.io/docs/agent/telemetry#autopilot
metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN()))
}
}
@ -74,6 +81,8 @@ func (s *Server) initAutopilot(config *Config) {
autopilot.WithUpdateInterval(config.ServerHealthInterval),
autopilot.WithPromoter(s.autopilotPromoter()),
)
metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN()))
}
func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server {

53
agent/metrics_test.go Normal file
View File

@ -0,0 +1,53 @@
package agent
import (
"net/http"
"net/http/httptest"
"strings"
"testing"
)
// TestHTTPHandlers_AgentMetrics_ConsulAutopilotHealthy_Prometheus exercises the
// Prometheus-formatted /v1/agent/metrics endpoint and checks the
// consul_autopilot_healthy metric documented at
// https://www.consul.io/docs/agent/telemetry#autopilot
func TestHTTPHandlers_AgentMetrics_ConsulAutopilotHealthy_Prometheus(t *testing.T) {
	checkForShortTesting(t)
	// Deliberately not marked t.Parallel(): the test modifies global state,
	// namely the global metrics instance.

	// bootstrap is disabled so that the agent does not become a leader.
	const agentHCL = `
telemetry = {
prometheus_retention_time = "5s",
disable_hostname = true
}
bootstrap = false
`
	testAgent := StartTestAgent(t, TestAgent{HCL: agentHCL})
	defer testAgent.Shutdown()

	metricsReq, err := http.NewRequest("GET", "/v1/agent/metrics?format=prometheus", nil)
	if err != nil {
		t.Fatalf("Failed to generate new http request. err: %v", err)
	}

	recorder := httptest.NewRecorder()
	if _, err = testAgent.srv.AgentMetrics(recorder, metricsReq); err != nil {
		t.Fatalf("Failed to serve agent metrics. err: %v", err)
	}

	t.Run("Check consul_autopilot_healthy metric value on startup", func(t *testing.T) {
		// The exposition line is "<metric name> <value>"; look for it verbatim.
		const want = "consul_autopilot_healthy NaN"
		if strings.Contains(recorder.Body.String(), want) {
			return
		}
		// Split only to report the name and value separately in the failure.
		parts := strings.Split(want, " ")
		t.Fatalf("Could not find the metric \"%s\" with value \"%s\" in the /v1/agent/metrics response", parts[0], parts[1])
	})
}
// checkForShortTesting skips the calling test when the test binary is run
// with -short (testing.Short() reports true). Otherwise it returns without
// doing anything.
func checkForShortTesting(t *testing.T) {
	// Mark this function as a helper so the skip message is attributed to the
	// calling test's line instead of this function.
	t.Helper()
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}
}

View File

@ -185,8 +185,13 @@ func (a *TestAgent) Start(t *testing.T) error {
}
result, err := config.Load(opts)
if result.RuntimeConfig != nil {
// If prom metrics need to be enabled, do not disable telemetry
if result.RuntimeConfig.Telemetry.PrometheusOpts.Expiration > 0 {
result.RuntimeConfig.Telemetry.Disable = false
} else {
result.RuntimeConfig.Telemetry.Disable = true
}
}
return result, err
}
bd, err := NewBaseDeps(loader, logOutput)
@ -195,7 +200,11 @@ func (a *TestAgent) Start(t *testing.T) error {
}
bd.Logger = logger
// if we are not testing telemetry things, let's use a "mock" sink for metrics
if bd.RuntimeConfig.Telemetry.Disable {
bd.MetricsHandler = metrics.NewInmemSink(1*time.Second, time.Minute)
}
a.Config = bd.RuntimeConfig
agent, err := New(bd)