fix consul_autopilot_healthy metric emission (#11231)
https://github.com/hashicorp/consul/issues/10730
This commit is contained in:
parent
b729414c05
commit
7f28301212
|
@ -0,0 +1,3 @@
|
||||||
|
```release-note:bug
|
||||||
|
telemetry: fixes a bug with Prometheus consul_autopilot_healthy metric where 0 is reported instead of NaN on servers.
|
||||||
|
```
|
|
@ -9,6 +9,7 @@ import (
|
||||||
"github.com/hashicorp/raft"
|
"github.com/hashicorp/raft"
|
||||||
autopilot "github.com/hashicorp/raft-autopilot"
|
autopilot "github.com/hashicorp/raft-autopilot"
|
||||||
"github.com/hashicorp/serf/serf"
|
"github.com/hashicorp/serf/serf"
|
||||||
|
"math"
|
||||||
|
|
||||||
"github.com/hashicorp/consul/agent/metadata"
|
"github.com/hashicorp/consul/agent/metadata"
|
||||||
"github.com/hashicorp/consul/agent/structs"
|
"github.com/hashicorp/consul/agent/structs"
|
||||||
|
@ -52,6 +53,12 @@ func (d *AutopilotDelegate) NotifyState(state *autopilot.State) {
|
||||||
} else {
|
} else {
|
||||||
metrics.SetGauge([]string{"autopilot", "healthy"}, 0)
|
metrics.SetGauge([]string{"autopilot", "healthy"}, 0)
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
|
||||||
|
// if we are not a leader, emit NaN per
|
||||||
|
// https://www.consul.io/docs/agent/telemetry#autopilot
|
||||||
|
metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN()))
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -74,6 +81,8 @@ func (s *Server) initAutopilot(config *Config) {
|
||||||
autopilot.WithUpdateInterval(config.ServerHealthInterval),
|
autopilot.WithUpdateInterval(config.ServerHealthInterval),
|
||||||
autopilot.WithPromoter(s.autopilotPromoter()),
|
autopilot.WithPromoter(s.autopilotPromoter()),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN()))
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server {
|
func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server {
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
package agent
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestHTTPHandlers_AgentMetrics_ConsulAutopilotHealthy_Prometheus adds testing around
|
||||||
|
// the published autopilot metrics on https://www.consul.io/docs/agent/telemetry#autopilot
|
||||||
|
func TestHTTPHandlers_AgentMetrics_ConsulAutopilotHealthy_Prometheus(t *testing.T) {
|
||||||
|
checkForShortTesting(t)
|
||||||
|
// This test cannot use t.Parallel() since we modify global state, ie the global metrics instance
|
||||||
|
|
||||||
|
// don't bootstrap agent so as not to
|
||||||
|
// become a leader
|
||||||
|
hcl := `
|
||||||
|
telemetry = {
|
||||||
|
prometheus_retention_time = "5s",
|
||||||
|
disable_hostname = true
|
||||||
|
}
|
||||||
|
bootstrap = false
|
||||||
|
`
|
||||||
|
|
||||||
|
a := StartTestAgent(t, TestAgent{HCL: hcl})
|
||||||
|
defer a.Shutdown()
|
||||||
|
|
||||||
|
req, err := http.NewRequest("GET", "/v1/agent/metrics?format=prometheus", nil)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to generate new http request. err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
respRec := httptest.NewRecorder()
|
||||||
|
_, err = a.srv.AgentMetrics(respRec, req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to serve agent metrics. err: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Run("Check consul_autopilot_healthy metric value on startup", func(t *testing.T) {
|
||||||
|
target := "consul_autopilot_healthy NaN"
|
||||||
|
keyValue := strings.Split(target, " ")
|
||||||
|
if !strings.Contains(respRec.Body.String(), target) {
|
||||||
|
t.Fatalf("Could not find the metric \"%s\" with value \"%s\" in the /v1/agent/metrics response", keyValue[0], keyValue[1])
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func checkForShortTesting(t *testing.T) {
|
||||||
|
if testing.Short() {
|
||||||
|
t.Skip("too slow for testing.Short")
|
||||||
|
}
|
||||||
|
}
|
|
@ -185,7 +185,12 @@ func (a *TestAgent) Start(t *testing.T) error {
|
||||||
}
|
}
|
||||||
result, err := config.Load(opts)
|
result, err := config.Load(opts)
|
||||||
if result.RuntimeConfig != nil {
|
if result.RuntimeConfig != nil {
|
||||||
result.RuntimeConfig.Telemetry.Disable = true
|
// If prom metrics need to be enabled, do not disable telemetry
|
||||||
|
if result.RuntimeConfig.Telemetry.PrometheusOpts.Expiration > 0 {
|
||||||
|
result.RuntimeConfig.Telemetry.Disable = false
|
||||||
|
} else {
|
||||||
|
result.RuntimeConfig.Telemetry.Disable = true
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return result, err
|
return result, err
|
||||||
}
|
}
|
||||||
|
@ -195,7 +200,11 @@ func (a *TestAgent) Start(t *testing.T) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
bd.Logger = logger
|
bd.Logger = logger
|
||||||
bd.MetricsHandler = metrics.NewInmemSink(1*time.Second, time.Minute)
|
// if we are not testing telemetry things, let's use a "mock" sink for metrics
|
||||||
|
if bd.RuntimeConfig.Telemetry.Disable {
|
||||||
|
bd.MetricsHandler = metrics.NewInmemSink(1*time.Second, time.Minute)
|
||||||
|
}
|
||||||
|
|
||||||
a.Config = bd.RuntimeConfig
|
a.Config = bd.RuntimeConfig
|
||||||
|
|
||||||
agent, err := New(bd)
|
agent, err := New(bd)
|
||||||
|
|
Loading…
Reference in New Issue