fix autopilot_failure_tolerance, add autopilot metrics test case (#11399)

Signed-off-by: FFMMM <FFMMM@users.noreply.github.com>
This commit is contained in:
FFMMM 2021-10-25 10:55:59 -07:00 committed by GitHub
parent 96afd767a1
commit 6433a57d3c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 90 additions and 39 deletions

3
.changelog/11399.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
telemetry: fixes a bug with Prometheus consul_autopilot_failure_tolerance metric where 0 is reported instead of NaN on follower servers.
```

View File

@ -59,6 +59,9 @@ func (d *AutopilotDelegate) NotifyState(state *autopilot.State) {
// https://www.consul.io/docs/agent/telemetry#autopilot
metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN()))
// also emit NaN for failure tolerance to be backwards compatible
metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(math.NaN()))
}
}
@ -83,6 +86,7 @@ func (s *Server) initAutopilot(config *Config) {
)
metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN()))
metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(math.NaN()))
}
func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server {

View File

@ -1,53 +1,97 @@
package agent
import (
"github.com/stretchr/testify/require"
"net/http"
"net/http/httptest"
"strings"
"testing"
)
// TestHTTPHandlers_AgentMetrics_ConsulAutopilotHealthy_Prometheus adds testing around
// the published autopilot metrics on https://www.consul.io/docs/agent/telemetry#autopilot
func TestHTTPHandlers_AgentMetrics_ConsulAutopilotHealthy_Prometheus(t *testing.T) {
checkForShortTesting(t)
// This test cannot use t.Parallel() since we modify global state, ie the global metrics instance
// don't bootstrap agent so as not to
// become a leader
hcl := `
telemetry = {
prometheus_retention_time = "5s",
disable_hostname = true
}
bootstrap = false
`
a := StartTestAgent(t, TestAgent{HCL: hcl})
defer a.Shutdown()
req, err := http.NewRequest("GET", "/v1/agent/metrics?format=prometheus", nil)
if err != nil {
t.Fatalf("Failed to generate new http request. err: %v", err)
}
respRec := httptest.NewRecorder()
_, err = a.srv.AgentMetrics(respRec, req)
if err != nil {
t.Fatalf("Failed to serve agent metrics. err: %v", err)
}
t.Run("Check consul_autopilot_healthy metric value on startup", func(t *testing.T) {
target := "consul_autopilot_healthy NaN"
keyValue := strings.Split(target, " ")
if !strings.Contains(respRec.Body.String(), target) {
t.Fatalf("Could not find the metric \"%s\" with value \"%s\" in the /v1/agent/metrics response", keyValue[0], keyValue[1])
}
})
}
func checkForShortTesting(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
}
}
func recordPromMetrics(t *testing.T, a *TestAgent, respRec *httptest.ResponseRecorder) {
req, err := http.NewRequest("GET", "/v1/agent/metrics?format=prometheus", nil)
require.NoError(t, err, "Failed to generate new http request.")
_, err = a.srv.AgentMetrics(respRec, req)
require.NoError(t, err, "Failed to serve agent metrics")
}
func assertMetricExistsWithValue(t *testing.T, respRec *httptest.ResponseRecorder, metric string, value string) {
if respRec.Body.String() == "" {
t.Fatalf("Response body is empty.")
}
// eg "consul_autopilot_healthy NaN"
target := metric + " " + value
if !strings.Contains(respRec.Body.String(), target) {
t.Fatalf("Could not find the metric \"%s\" with value \"%s\" in the /v1/agent/metrics response", metric, value)
}
}
func assertMetricNotExists(t *testing.T, respRec *httptest.ResponseRecorder, metric string) {
if respRec.Body.String() == "" {
t.Fatalf("Response body is empty.")
}
if strings.Contains(respRec.Body.String(), metric) {
t.Fatalf("Didn't expect to find the metric \"%s\" in the /v1/agent/metrics response", metric)
}
}
// TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus adds testing around
// the published autopilot metrics on https://www.consul.io/docs/agent/telemetry#autopilot
func TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus(t *testing.T) {
checkForShortTesting(t)
// This test cannot use t.Parallel() since we modify global state, ie the global metrics instance
t.Run("Check consul_autopilot_* are not emitted metrics on clients", func(t *testing.T) {
hcl := `
telemetry = {
prometheus_retention_time = "5s"
disable_hostname = true
metrics_prefix = "agent_1"
}
bootstrap = false
server = false
`
a := StartTestAgent(t, TestAgent{HCL: hcl})
defer a.Shutdown()
respRec := httptest.NewRecorder()
recordPromMetrics(t, a, respRec)
assertMetricNotExists(t, respRec, "agent_1_autopilot_healthy")
assertMetricNotExists(t, respRec, "agent_1_autopilot_failure_tolerance")
})
t.Run("Check consul_autopilot_healthy metric value on startup", func(t *testing.T) {
// don't bootstrap agent so as not to
// become a leader
hcl := `
telemetry = {
prometheus_retention_time = "5s",
disable_hostname = true
metrics_prefix = "agent_2"
}
bootstrap = false
`
a := StartTestAgent(t, TestAgent{HCL: hcl})
defer a.Shutdown()
respRec := httptest.NewRecorder()
recordPromMetrics(t, a, respRec)
assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_healthy", "NaN")
assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_failure_tolerance", "NaN")
})
}