Enable running autopilot state updates on all servers (#12617)

* Fixes a lint warning about t.Errorf not supporting %w * Enable running autopilot on all servers. On the non-leader servers, autopilot only updates its state and does not attempt any modifications. * Fix the RPC conn limiting tests. Technically they were relying on racy behavior before. Now they should be reliable.
2022-04-07 10:48:48 -04:00 · 2022-04-07 10:48:48 -04:00 · 3447880091
parent 9516b96d92
commit 3447880091
10 changed files with 56 additions and 44 deletions
--- a/.changelog/12617.txt
+++ b/.changelog/12617.txt
@ -0,0 +1,9 @@
+```release-note:improvement
+autopilot: Autopilot state is now tracked on Raft followers in addition to the leader. 
+Stale queries may be used to query a non-leader's state.
+```
+
+```release-note:improvement
+autopilot: The `autopilot.healthy` and `autopilot.failure_tolerance` metrics are now 
+regularly emitted by all servers.
+```
--- a/agent/consul/autopilot.go
+++ b/agent/consul/autopilot.go
@ -9,10 +9,10 @@ import (
 	"github.com/hashicorp/raft"
 	autopilot "github.com/hashicorp/raft-autopilot"
 	"github.com/hashicorp/serf/serf"
-	"math"

 	"github.com/hashicorp/consul/agent/metadata"
 	"github.com/hashicorp/consul/agent/structs"
+	"github.com/hashicorp/consul/logging"
 	"github.com/hashicorp/consul/types"
 )

@ -33,7 +33,7 @@ type AutopilotDelegate struct {
 }

 func (d *AutopilotDelegate) AutopilotConfig() *autopilot.Config {
-	return d.server.getOrCreateAutopilotConfig().ToAutopilotLibraryConfig()
+	return d.server.getAutopilotConfigOrDefault().ToAutopilotLibraryConfig()
 }

 func (d *AutopilotDelegate) KnownServers() map[raft.ServerID]*autopilot.Server {
@ -45,24 +45,12 @@ func (d *AutopilotDelegate) FetchServerStats(ctx context.Context, servers map[ra
 }

 func (d *AutopilotDelegate) NotifyState(state *autopilot.State) {
-	// emit metrics if we are the leader regarding overall healthiness and the failure tolerance
-	if d.server.raft.State() == raft.Leader {
 	metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(state.FailureTolerance))
 	if state.Healthy {
 		metrics.SetGauge([]string{"autopilot", "healthy"}, 1)
 	} else {
 		metrics.SetGauge([]string{"autopilot", "healthy"}, 0)
 	}
-	} else {
-
-		// if we are not a leader, emit NaN per
-		// https://www.consul.io/docs/agent/telemetry#autopilot
-		metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN()))
-
-		// also emit NaN for failure tolerance to be backwards compatible
-		metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(math.NaN()))
-
-	}
 }

 func (d *AutopilotDelegate) RemoveFailedServer(srv *autopilot.Server) {
@ -84,10 +72,8 @@ func (s *Server) initAutopilot(config *Config) {
 		autopilot.WithReconcileInterval(config.AutopilotInterval),
 		autopilot.WithUpdateInterval(config.ServerHealthInterval),
 		autopilot.WithPromoter(s.autopilotPromoter()),
+		autopilot.WithReconciliationDisabled(),
 	)
-
-	metrics.SetGauge([]string{"autopilot", "healthy"}, float32(math.NaN()))
-	metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(math.NaN()))
 }

 func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server {
@ -154,3 +140,22 @@ func (s *Server) autopilotServerFromMetadata(srv *metadata.Server) (*autopilot.S

 	return server, nil
 }
+
+func (s *Server) getAutopilotConfigOrDefault() *structs.AutopilotConfig {
+	logger := s.loggers.Named(logging.Autopilot)
+	state := s.fsm.State()
+	_, config, err := state.AutopilotConfig()
+	if err != nil {
+		logger.Error("failed to get config", "error", err)
+		return nil
+	}
+
+	if config != nil {
+		return config
+	}
+
+	// autopilot may start running prior to there ever being a leader
+	// and having an autopilot configuration created. In that case
+	// use the one from the local configuration for now.
+	return s.config.AutopilotConfig
+}
--- a/agent/consul/leader.go
+++ b/agent/consul/leader.go
@ -297,7 +297,7 @@ func (s *Server) establishLeadership(ctx context.Context) error {
 	}

 	s.getOrCreateAutopilotConfig()
-	s.autopilot.Start(ctx)
+	s.autopilot.EnableReconciliation()

 	s.startConfigReplication(ctx)

@ -350,9 +350,7 @@ func (s *Server) revokeLeadership() {

 	s.resetConsistentReadReady()

-	// Stop returns a chan and we want to block until it is closed
-	// which indicates that autopilot is actually stopped.
-	<-s.autopilot.Stop()
+	s.autopilot.DisableReconciliation()
 }

 // initializeACLs is used to setup the ACLs if we are the leader
--- a/agent/consul/operator_autopilot_endpoint.go
+++ b/agent/consul/operator_autopilot_endpoint.go
@ -2,6 +2,7 @@ package consul

 import (
 	"fmt"
+
 	autopilot "github.com/hashicorp/raft-autopilot"
 	"github.com/hashicorp/serf/serf"

@ -75,10 +76,6 @@ func (op *Operator) AutopilotSetConfiguration(args *structs.AutopilotSetConfigRe

 // ServerHealth is used to get the current health of the servers.
 func (op *Operator) ServerHealth(args *structs.DCSpecificRequest, reply *structs.AutopilotHealthReply) error {
-	// This must be sent to the leader, so we fix the args since we are
-	// re-using a structure where we don't support all the options.
-	args.RequireConsistent = true
-	args.AllowStale = false
 	if done, err := op.srv.ForwardRPC("Operator.ServerHealth", args, reply); done {
 		return err
 	}
@ -143,10 +140,6 @@ func (op *Operator) ServerHealth(args *structs.DCSpecificRequest, reply *structs
 }

 func (op *Operator) AutopilotState(args *structs.DCSpecificRequest, reply *autopilot.State) error {
-	// This must be sent to the leader, so we fix the args since we are
-	// re-using a structure where we don't support all the options.
-	args.RequireConsistent = true
-	args.AllowStale = false
 	if done, err := op.srv.ForwardRPC("Operator.AutopilotState", args, reply); done {
 		return err
 	}
--- a/agent/consul/rpc_test.go
+++ b/agent/consul/rpc_test.go
@ -817,7 +817,8 @@ func TestRPC_RPCMaxConnsPerClient(t *testing.T) {
 		tc := tc
 		t.Run(tc.name, func(t *testing.T) {
 			dir1, s1 := testServerWithConfig(t, func(c *Config) {
-				c.RPCMaxConnsPerClient = 2
+				// we have to set this to 3 because autopilot is going to keep a connection open
+				c.RPCMaxConnsPerClient = 3
 				if tc.tlsEnabled {
 					c.TLSConfig.InternalRPC.CAFile = "../../test/hostname/CertAuth.crt"
 					c.TLSConfig.InternalRPC.CertFile = "../../test/hostname/Alice.crt"
@ -831,6 +832,8 @@ func TestRPC_RPCMaxConnsPerClient(t *testing.T) {
 			defer os.RemoveAll(dir1)
 			defer s1.Shutdown()

+			waitForLeaderEstablishment(t, s1)
+
 			// Connect to the server with bare TCP
 			conn1 := connectClient(t, s1, tc.magicByte, tc.tlsEnabled, true, "conn1")
 			defer conn1.Close()
@ -847,7 +850,7 @@ func TestRPC_RPCMaxConnsPerClient(t *testing.T) {
 			addr := conn1.RemoteAddr()
 			conn1.Close()
 			retry.Run(t, func(r *retry.R) {
-				if n := s1.rpcConnLimiter.NumOpen(addr); n >= 2 {
+				if n := s1.rpcConnLimiter.NumOpen(addr); n >= 3 {
 					r.Fatal("waiting for open conns to drop")
 				}
 			})
@ -1736,7 +1739,7 @@ func rpcBlockingQueryTestHarness(
 				return
 			case err := <-errCh:
 				if err != nil {
-					t.Errorf("[%d] unexpected error: %w", i, err)
+					t.Errorf("[%d] unexpected error: %v", i, err)
 					return
 				}
 			}
--- a/agent/consul/server.go
+++ b/agent/consul/server.go
@ -674,6 +674,10 @@ func NewServer(config *Config, flat Deps, publicGRPCServer *grpc.Server) (*Serve
 		go s.listen(listener)
 	}

+	// start autopilot - this must happen after the RPC listeners are set up,
+	// or else it may block
+	s.autopilot.Start(&lib.StopChannelContext{StopCh: s.shutdownCh})
+
 	// Start the metrics handlers.
 	go s.updateMetrics()

--- a/agent/metrics_test.go
+++ b/agent/metrics_test.go
@ -250,8 +250,8 @@ func TestHTTPHandlers_AgentMetrics_ConsulAutopilot_Prometheus(t *testing.T) {
 		respRec := httptest.NewRecorder()
 		recordPromMetrics(t, a, respRec)

-		assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_healthy", "NaN")
-		assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_failure_tolerance", "NaN")
+		assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_healthy", "1")
+		assertMetricExistsWithValue(t, respRec, "agent_2_autopilot_failure_tolerance", "0")
 	})
 }

--- a/go.mod
+++ b/go.mod
@ -54,7 +54,7 @@ require (
 	github.com/hashicorp/hil v0.0.0-20200423225030-a18a1cd20038
 	github.com/hashicorp/memberlist v0.3.1
 	github.com/hashicorp/raft v1.3.6
-	github.com/hashicorp/raft-autopilot v0.1.5
+	github.com/hashicorp/raft-autopilot v0.1.6
 	github.com/hashicorp/raft-boltdb v0.0.0-20211202195631-7d34b9fb3f42 // indirect
 	github.com/hashicorp/raft-boltdb/v2 v2.2.2
 	github.com/hashicorp/serf v0.9.7
--- a/go.sum
+++ b/go.sum
@ -363,8 +363,8 @@ github.com/hashicorp/raft v1.1.1/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7
 github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
 github.com/hashicorp/raft v1.3.6 h1:v5xW5KzByoerQlN/o31VJrFNiozgzGyDoMgDJgXpsto=
 github.com/hashicorp/raft v1.3.6/go.mod h1:4Ak7FSPnuvmb0GV6vgIAJ4vYT4bek9bb6Q+7HVbyzqM=
-github.com/hashicorp/raft-autopilot v0.1.5 h1:onEfMH5uHVdXQqtas36zXUHEZxLdsJVu/nXHLcLdL1I=
-github.com/hashicorp/raft-autopilot v0.1.5/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw=
+github.com/hashicorp/raft-autopilot v0.1.6 h1:C1q3RNF2FfXNZfHWbvVAu0QixaQK8K5pX4O5lh+9z4I=
+github.com/hashicorp/raft-autopilot v0.1.6/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw=
 github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk=
 github.com/hashicorp/raft-boltdb v0.0.0-20210409134258-03c10cc3d4ea/go.mod h1:qRd6nFJYYS6Iqnc/8HcUmko2/2Gw8qTFEmxDLii6W5I=
 github.com/hashicorp/raft-boltdb v0.0.0-20211202195631-7d34b9fb3f42 h1:Ye8SofeDHJzu9xvvaMmpMkqHELWW7rTcXwdUR0CWW48=
--- a/website/content/docs/agent/telemetry.mdx
+++ b/website/content/docs/agent/telemetry.mdx
@ -94,7 +94,7 @@ These are some metrics emitted that can help you understand the health of your c

 | Metric Name                | Description                                                                                                                                                                  | Unit         | Type  |
 | :------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------- | :---- |
-| `consul.autopilot.healthy` | Tracks the overall health of the local server cluster. If all servers are considered healthy by Autopilot, this will be set to 1. If any are unhealthy, this will be 0. All non-leader servers will report `NaN`. | health state | gauge |
+| `consul.autopilot.healthy` | Tracks the overall health of the local server cluster. If all servers are considered healthy by Autopilot, this will be set to 1. If any are unhealthy, this will be 0. | health state | gauge |

 **Why it's important:** Autopilot can expose the overall health of your cluster with a simple boolean.

@ -592,8 +592,8 @@ These metrics give insight into the health of the cluster as a whole.
 | `consul.serf.member.left`             | Increments when an agent leaves the cluster.                                                                                                                                                                                                                                                                                                                                                                                       | leaves / interval                                   | counter |
 | `consul.serf.events`                  | Increments when an agent processes an [event](/commands/event). Consul uses events internally so there may be additional events showing in telemetry. There are also a per-event counters emitted as `consul.serf.events.`.                                                                                                                                                                                                        | events / interval                                   | counter |
 | `consul.serf.msgs.sent`               | This metric is sample of the number of bytes of messages broadcast to the cluster. In a given time interval, the sum of this metric is the total number of bytes sent and the count is the number of messages sent.                                                                                                                                                                                                                | message bytes / interval                            | counter |
-| `consul.autopilot.failure_tolerance`  | Tracks the number of voting servers that the cluster can lose while continuing to function.                                                                                                                                                                                                                                                                                                                                        | servers                                             | gauge   |
-| `consul.autopilot.healthy`            | Tracks the overall health of the local server cluster. If all servers are considered healthy by Autopilot, this will be set to 1. If any are unhealthy, this will be 0. All non-leader servers will report `NaN`.                                                                                                                                                                                                                  | boolean                                             | gauge   |
+| `consul.autopilot.failure_tolerance`  | Tracks the number of voting servers that the cluster can lose while continuing to function.                                                                                                                                                                                                                                                                                                                                        | servers                                             | gauge   |
+| `consul.autopilot.healthy`            | Tracks the overall health of the local server cluster. If all servers are considered healthy by Autopilot, this will be set to 1. If any are unhealthy, this will be 0.                                                                                                                                                                                                                                                            | boolean                                             | gauge   |
 | `consul.session_ttl.active`           | Tracks the active number of sessions being tracked.                                                                                                                                                                                                                                                                                                                                                                                | sessions                                            | gauge   |
 | `consul.catalog.service.query.`       | Increments for each catalog query for the given service.                                                                                                                                                                                                                                                                                                                                                                           | queries                                             | counter |
 | `consul.catalog.service.query-tag..`  | Increments for each catalog query for the given service with the given tag.                                                                                                                                                                                                                                                                                                                                                        | queries                                             | counter |