Make Raft trailing logs and snapshot timing reloadable (#10129)

* WIP reloadable raft config

* Pre-define new raft gauges

* Update go-metrics to change gauge reset behaviour

* Update raft to pull in new metric and reloadable config

* Add snapshot persistence timing and installSnapshot to our 'protected' list, as they can be infrequent but are important

* Update telemetry docs

* Update config and telemetry docs

* Add note to oldestLogAge on when it is visible

* Add changelog entry

* Update website/content/docs/agent/options.mdx

Co-authored-by: Matt Keeler <mkeeler@users.noreply.github.com>

Paul Banks 2021-05-04 15:36:53 +01:00 committed by GitHub
parent eb84a856c4
commit d47eea3a3f
32 changed files with 805 additions and 258 deletions

.changelog/10129.txt

@ -0,0 +1,4 @@
```release-note:improvement
raft: allow reloading of raft trailing logs and snapshot timing to allow recovery from some [replication failure modes](https://github.com/hashicorp/consul/issues/9609).
telemetry: add metrics and documentation for [monitoring for replication issues](https://consul.io/docs/agent/telemetry#raft-replication-capacity-issues).
```

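The new fields surface in `consul.ReloadableConfig` and are applied by `Server.ReloadConfig`, both shown in the hunks below. A minimal sketch of that call, assuming the `github.com/hashicorp/consul/agent/consul` import path; values are illustrative, and in the real reload path every field is populated from the agent's RuntimeConfig rather than just the raft ones:

```go
package example

import (
	"time"

	"github.com/hashicorp/consul/agent/consul"
)

// reloadRaftTuning sketches how the new reloadable raft settings are passed
// through consul.ReloadableConfig. In the agent this struct is built from the
// freshly parsed RuntimeConfig (see the agent.go hunk below); only the new
// raft fields are spelled out here.
func reloadRaftTuning(srv *consul.Server) error {
	rc := consul.ReloadableConfig{
		RaftSnapshotThreshold: 16384,            // outstanding logs before a snapshot is taken
		RaftSnapshotInterval:  30 * time.Second, // how often to check whether to snapshot
		RaftTrailingLogs:      100000,           // logs kept after a snapshot so followers can catch up
	}
	// Zero-valued raft fields fall back to Consul's compiled-in defaults
	// (see computeRaftReloadableConfig below).
	return srv.ReloadConfig(rc)
}
```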

@ -3639,10 +3639,13 @@ func (a *Agent) reloadConfigInternal(newCfg *config.RuntimeConfig) error {
}
cc := consul.ReloadableConfig{
RPCRateLimit: newCfg.RPCRateLimit,
RPCMaxBurst: newCfg.RPCMaxBurst,
RPCMaxConnsPerClient: newCfg.RPCMaxConnsPerClient,
ConfigEntryBootstrap: newCfg.ConfigEntryBootstrap,
RPCRateLimit: newCfg.RPCRateLimit,
RPCMaxBurst: newCfg.RPCMaxBurst,
RPCMaxConnsPerClient: newCfg.RPCMaxConnsPerClient,
ConfigEntryBootstrap: newCfg.ConfigEntryBootstrap,
RaftSnapshotThreshold: newCfg.RaftSnapshotThreshold,
RaftSnapshotInterval: newCfg.RaftSnapshotInterval,
RaftTrailingLogs: newCfg.RaftTrailingLogs,
}
if err := a.delegate.ReloadConfig(cc); err != nil {
return err


@ -659,8 +659,11 @@ type RPCConfig struct {
// ReloadableConfig is the configuration that is passed to ReloadConfig when
// application config is reloaded.
type ReloadableConfig struct {
RPCRateLimit rate.Limit
RPCMaxBurst int
RPCMaxConnsPerClient int
ConfigEntryBootstrap []structs.ConfigEntry
RPCRateLimit rate.Limit
RPCMaxBurst int
RPCMaxConnsPerClient int
ConfigEntryBootstrap []structs.ConfigEntry
RaftSnapshotThreshold int
RaftSnapshotInterval time.Duration
RaftTrailingLogs int
}


@ -1387,6 +1387,13 @@ func (s *Server) GetLANCoordinate() (lib.CoordinateSet, error) {
// ReloadConfig is used to have the Server do an online reload of
// relevant configuration information
func (s *Server) ReloadConfig(config ReloadableConfig) error {
// Reload raft config first before updating any other state since it could
// error if the new config is invalid.
raftCfg := computeRaftReloadableConfig(config)
if err := s.raft.ReloadConfig(raftCfg); err != nil {
return err
}
s.rpcLimiter.Store(rate.NewLimiter(config.RPCRateLimit, config.RPCMaxBurst))
s.rpcConnLimiter.SetConfig(connlimit.Config{
MaxConnsPerClientIP: config.RPCMaxConnsPerClient,
@ -1401,6 +1408,33 @@ func (s *Server) ReloadConfig(config ReloadableConfig) error {
return nil
}
// computeRaftReloadableConfig works out the correct reloadable config for raft.
// We reload raft even if nothing has changed since it's cheap and simpler than
// trying to work out if it's different from the current raft config. This
// function is separate to make it cheap to table test thoroughly without a full
// raft instance.
func computeRaftReloadableConfig(config ReloadableConfig) raft.ReloadableConfig {
// We use the raw defaults _not_ the current values so that you can reload
// back to a zero value having previously started Consul with a custom value
// for one of these fields.
defaultConf := DefaultConfig()
raftCfg := raft.ReloadableConfig{
TrailingLogs: defaultConf.RaftConfig.TrailingLogs,
SnapshotInterval: defaultConf.RaftConfig.SnapshotInterval,
SnapshotThreshold: defaultConf.RaftConfig.SnapshotThreshold,
}
if config.RaftSnapshotThreshold != 0 {
raftCfg.SnapshotThreshold = uint64(config.RaftSnapshotThreshold)
}
if config.RaftSnapshotInterval != 0 {
raftCfg.SnapshotInterval = config.RaftSnapshotInterval
}
if config.RaftTrailingLogs != 0 {
raftCfg.TrailingLogs = uint64(config.RaftTrailingLogs)
}
return raftCfg
}
// Atomically sets a readiness state flag when leadership is obtained, to indicate that server is past its barrier write
func (s *Server) setConsistentReadReady() {
atomic.StoreInt32(&s.readyForConsistentReads, 1)


@ -14,6 +14,7 @@ import (
"github.com/google/tcpproxy"
"github.com/hashicorp/memberlist"
"github.com/hashicorp/raft"
"github.com/hashicorp/consul/agent/connect/ca"
"github.com/hashicorp/consul/ipaddr"
@ -1466,6 +1467,9 @@ func TestServer_ReloadConfig(t *testing.T) {
c.Build = "1.5.0"
c.RPCRateLimit = 500
c.RPCMaxBurst = 5000
// Set one raft param to be non-default in the initial config, others are
// default.
c.RaftConfig.TrailingLogs = 1234
})
defer os.RemoveAll(dir1)
defer s.Shutdown()
@ -1480,6 +1484,14 @@ func TestServer_ReloadConfig(t *testing.T) {
RPCRateLimit: 1000,
RPCMaxBurst: 10000,
ConfigEntryBootstrap: []structs.ConfigEntry{entryInit},
// Reset the custom one to default by removing it from the config file (it
// will be a zero value here).
RaftTrailingLogs: 0,
// Set a different Raft param to something custom now
RaftSnapshotThreshold: 4321,
// Leave other raft fields default
}
require.NoError(t, s.ReloadConfig(rc))
@ -1496,6 +1508,98 @@ func TestServer_ReloadConfig(t *testing.T) {
limiter = s.rpcLimiter.Load().(*rate.Limiter)
require.Equal(t, rate.Limit(1000), limiter.Limit())
require.Equal(t, 10000, limiter.Burst())
// Check raft config
defaults := DefaultConfig()
got := s.raft.ReloadableConfig()
require.Equal(t, uint64(4321), got.SnapshotThreshold,
"should have be reloaded to new value")
require.Equal(t, defaults.RaftConfig.SnapshotInterval, got.SnapshotInterval,
"should have remained the default interval")
require.Equal(t, defaults.RaftConfig.TrailingLogs, got.TrailingLogs,
"should have reloaded to default trailing_logs")
// Now check that updating each of those raft fields separately works correctly
// too.
}
func TestServer_computeRaftReloadableConfig(t *testing.T) {
defaults := DefaultConfig().RaftConfig
cases := []struct {
name string
rc ReloadableConfig
want raft.ReloadableConfig
}{
{
// This case is the common path - reload is called with a ReloadableConfig
// populated from the RuntimeConfig which has zero values for the fields.
// On startup we selectively pick non-zero runtime config fields to
// override defaults so we need to do the same.
name: "Still defaults",
rc: ReloadableConfig{},
want: raft.ReloadableConfig{
SnapshotThreshold: defaults.SnapshotThreshold,
SnapshotInterval: defaults.SnapshotInterval,
TrailingLogs: defaults.TrailingLogs,
},
},
{
name: "Threshold set",
rc: ReloadableConfig{
RaftSnapshotThreshold: 123456,
},
want: raft.ReloadableConfig{
SnapshotThreshold: 123456,
SnapshotInterval: defaults.SnapshotInterval,
TrailingLogs: defaults.TrailingLogs,
},
},
{
name: "interval set",
rc: ReloadableConfig{
RaftSnapshotInterval: 13 * time.Minute,
},
want: raft.ReloadableConfig{
SnapshotThreshold: defaults.SnapshotThreshold,
SnapshotInterval: 13 * time.Minute,
TrailingLogs: defaults.TrailingLogs,
},
},
{
name: "trailing logs set",
rc: ReloadableConfig{
RaftTrailingLogs: 78910,
},
want: raft.ReloadableConfig{
SnapshotThreshold: defaults.SnapshotThreshold,
SnapshotInterval: defaults.SnapshotInterval,
TrailingLogs: 78910,
},
},
{
name: "all set",
rc: ReloadableConfig{
RaftSnapshotThreshold: 123456,
RaftSnapshotInterval: 13 * time.Minute,
RaftTrailingLogs: 78910,
},
want: raft.ReloadableConfig{
SnapshotThreshold: 123456,
SnapshotInterval: 13 * time.Minute,
TrailingLogs: 78910,
},
},
}
for _, tc := range cases {
tc := tc
t.Run(tc.name, func(t *testing.T) {
got := computeRaftReloadableConfig(tc.rc)
require.Equal(t, tc.want, got)
})
}
}
func TestServer_RPC_RateLimit(t *testing.T) {


@ -175,6 +175,19 @@ func registerWithGRPC(b grpcresolver.Builder) {
// getPrometheusDefs reaches into every slice of prometheus defs we've defined in each part of the agent, and appends
// all of our slices into one nice slice of definitions per metric type for the Consul agent to pass to go-metrics.
func getPrometheusDefs(cfg lib.TelemetryConfig) ([]prometheus.GaugeDefinition, []prometheus.CounterDefinition, []prometheus.SummaryDefinition) {
// TODO: "raft..." metrics come from the raft lib and we should migrate these to a telemetry
// package within it. In the meantime, we're going to define a few here because they're key to monitoring Consul.
raftGauges := []prometheus.GaugeDefinition{
{
Name: []string{"raft", "fsm", "lastRestoreDuration"},
Help: "This measures how long the last FSM restore (from disk or leader) took.",
},
{
Name: []string{"raft", "leader", "oldestLogAge"},
Help: "This measures how old the oldest log in the leader's log store is.",
},
}
// Build slice of slices for all gauge definitions
var gauges = [][]prometheus.GaugeDefinition{
cache.Gauges,
@ -185,7 +198,9 @@ func getPrometheusDefs(cfg lib.TelemetryConfig) ([]prometheus.GaugeDefinition, [
usagemetrics.Gauges,
consul.ReplicationGauges,
Gauges,
raftGauges,
}
// Flatten definitions
// NOTE(kit): Do we actually want to create a set here so we can ensure definition names are unique?
var gaugeDefs []prometheus.GaugeDefinition
@ -252,6 +267,14 @@ func getPrometheusDefs(cfg lib.TelemetryConfig) ([]prometheus.GaugeDefinition, [
Name: []string{"raft", "leader", "lastContact"},
Help: "Measures the time since the leader was last able to contact the follower nodes when checking its leader lease.",
},
{
Name: []string{"raft", "snapshot", "persist"},
Help: "Measures the time it takes raft to write a new snapshot to disk.",
},
{
Name: []string{"raft", "rpc", "installSnapshot"},
Help: "Measures the time it takes the raft leader to install a snapshot on a follower that is catching up after being down or has just joined the cluster.",
},
}
var summaries = [][]prometheus.SummaryDefinition{

go.mod

@ -12,7 +12,7 @@ require (
github.com/Microsoft/go-winio v0.4.3 // indirect
github.com/NYTimes/gziphandler v1.0.1
github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e
github.com/armon/go-metrics v0.3.6
github.com/armon/go-metrics v0.3.7
github.com/armon/go-radix v1.0.0
github.com/aws/aws-sdk-go v1.25.41
github.com/coredns/coredns v1.1.2
@ -52,7 +52,7 @@ require (
github.com/hashicorp/mdns v1.0.4 // indirect
github.com/hashicorp/memberlist v0.2.3
github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69
github.com/hashicorp/raft v1.2.0
github.com/hashicorp/raft v1.3.0
github.com/hashicorp/raft-autopilot v0.1.2
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea
github.com/hashicorp/serf v0.9.5

go.sum

@ -58,8 +58,8 @@ github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5
github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY=
github.com/armon/go-metrics v0.0.0-20190430140413-ec5e00d3c878/go.mod h1:3AMJUQhVx52RsWOnlkpikZr01T/yAVN2gn0861vByNg=
github.com/armon/go-metrics v0.3.0/go.mod h1:zXjbSimjXTd7vOpY8B0/2LpvNvDoXBuplAD+gJD3GYs=
github.com/armon/go-metrics v0.3.6 h1:x/tmtOF9cDBoXH7XoAGOz2qqm1DknFD1590XmD/DUJ8=
github.com/armon/go-metrics v0.3.6/go.mod h1:4O98XIr/9W0sxpJ8UaYkvjk10Iff7SnFrb4QAOwNTFc=
github.com/armon/go-metrics v0.3.7 h1:c/oCtWzYpboy6+6f6LjXRlyW7NwA2SWf+a9KMlHq/bM=
github.com/armon/go-metrics v0.3.7/go.mod h1:4O98XIr/9W0sxpJ8UaYkvjk10Iff7SnFrb4QAOwNTFc=
github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8=
github.com/armon/go-radix v1.0.0 h1:F4z6KzEeeQIMeLFa97iZU6vupzoecKdU5TX24SNppXI=
github.com/armon/go-radix v1.0.0/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8=
@ -279,8 +279,9 @@ github.com/hashicorp/memberlist v0.2.3/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOn
github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69 h1:lc3c72qGlIMDqQpQH82Y4vaglRMMFdJbziYWriR4UcE=
github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69/go.mod h1:/z+jUGRBlwVpUZfjute9jWaF6/HuhjuFQuL1YXzVD1Q=
github.com/hashicorp/raft v1.1.1/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
github.com/hashicorp/raft v1.2.0 h1:mHzHIrF0S91d3A7RPBvuqkgB4d/7oFJZyvf1Q4m7GA0=
github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
github.com/hashicorp/raft v1.3.0 h1:Wox4J4R7J2FOJLtTa6hdk0VJfiNUSP32pYoYR738bkE=
github.com/hashicorp/raft v1.3.0/go.mod h1:4Ak7FSPnuvmb0GV6vgIAJ4vYT4bek9bb6Q+7HVbyzqM=
github.com/hashicorp/raft-autopilot v0.1.2 h1:yeqdUjWLjVJkBM+mcVxqwxi+w+aHsb9cEON2dz69OCs=
github.com/hashicorp/raft-autopilot v0.1.2/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw=
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea h1:xykPFhrBAS2J0VBzVa5e80b5ZtYuNQtgXjN40qBZlD4=


@ -5,7 +5,6 @@ package prometheus
import (
"fmt"
"log"
"math"
"regexp"
"strings"
"sync"
@ -31,17 +30,16 @@ type PrometheusOpts struct {
Expiration time.Duration
Registerer prometheus.Registerer
// Gauges, Summaries, and Counters allow us to pre-declare metrics by giving their Name, Help, and ConstLabels to
// the PrometheusSink when it is created. Metrics declared in this way will be initialized at zero and will not be
// deleted when their expiry is reached.
// - Gauges and Summaries will be set to NaN when they expire.
// - Counters continue to Collect their last known value.
// Ex:
// PrometheusOpts{
// Gauges, Summaries, and Counters allow us to pre-declare metrics by giving
// their Name, Help, and ConstLabels to the PrometheusSink when it is created.
// Metrics declared in this way will be initialized at zero and will not be
// deleted or altered when their expiry is reached.
//
// Ex: PrometheusOpts{
// Expiration: 10 * time.Second,
// Gauges: []GaugeDefinition{
// {
// Name: []string{ "application", "component", "measurement"},
// Name: []string{ "application", "component", "measurement"},
// Help: "application_component_measurement provides an example of how to declare static metrics",
// ConstLabels: []metrics.Label{ { Name: "my_label", Value: "does_not_change" }, },
// },
@ -139,21 +137,24 @@ func (p *PrometheusSink) Describe(c chan<- *prometheus.Desc) {
// logic to clean up ephemeral metrics if their value haven't been set for a
// duration exceeding our allowed expiration time.
func (p *PrometheusSink) Collect(c chan<- prometheus.Metric) {
p.collectAtTime(c, time.Now())
}
// collectAtTime allows internal testing of the expiry based logic here without
// mocking clocks or making tests timing sensitive.
func (p *PrometheusSink) collectAtTime(c chan<- prometheus.Metric, t time.Time) {
expire := p.expiration != 0
now := time.Now()
p.gauges.Range(func(k, v interface{}) bool {
if v == nil {
return true
}
g := v.(*gauge)
lastUpdate := g.updatedAt
if expire && lastUpdate.Add(p.expiration).Before(now) {
if expire && lastUpdate.Add(p.expiration).Before(t) {
if g.canDelete {
p.gauges.Delete(k)
return true
}
// We have not observed the gauge this interval so we don't know its value.
g.Set(math.NaN())
}
g.Collect(c)
return true
@ -164,13 +165,11 @@ func (p *PrometheusSink) Collect(c chan<- prometheus.Metric) {
}
s := v.(*summary)
lastUpdate := s.updatedAt
if expire && lastUpdate.Add(p.expiration).Before(now) {
if expire && lastUpdate.Add(p.expiration).Before(t) {
if s.canDelete {
p.summaries.Delete(k)
return true
}
// We have observed nothing in this interval.
s.Observe(math.NaN())
}
s.Collect(c)
return true
@ -181,12 +180,11 @@ func (p *PrometheusSink) Collect(c chan<- prometheus.Metric) {
}
count := v.(*counter)
lastUpdate := count.updatedAt
if expire && lastUpdate.Add(p.expiration).Before(now) {
if expire && lastUpdate.Add(p.expiration).Before(t) {
if count.canDelete {
p.counters.Delete(k)
return true
}
// Counters remain at their previous value when not observed, so we do not set it to NaN.
}
count.Collect(c)
return true

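For context, the definition slices built in `getPrometheusDefs` above are handed to this sink when the agent's telemetry is initialized, so the new raft gauges exist (at zero) before raft ever emits them and, per the updated comment, are neither deleted nor altered once `Expiration` passes. A minimal sketch, assuming the go-metrics v0.3.7 constructor and field names (`NewPrometheusSinkFrom`, `GaugeDefinitions`):

```go
package example

import (
	"time"

	"github.com/armon/go-metrics/prometheus"
)

// newSink pre-declares the new raft gauges so they are exported as 0 before
// the first observation and survive the expiry pass in Collect above.
func newSink() (*prometheus.PrometheusSink, error) {
	return prometheus.NewPrometheusSinkFrom(prometheus.PrometheusOpts{
		Expiration: 60 * time.Second,
		GaugeDefinitions: []prometheus.GaugeDefinition{
			{
				Name: []string{"raft", "leader", "oldestLogAge"},
				Help: "This measures how old the oldest log in the leader's log store is.",
			},
			{
				Name: []string{"raft", "fsm", "lastRestoreDuration"},
				Help: "This measures how long the last FSM restore (from disk or leader) took.",
			},
		},
	})
}
```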

@ -1,5 +1,21 @@
# UNRELEASED
# 1.3.0 (April 22nd, 2021)
IMPROVEMENTS
* Added metrics for `oldestLogAge` and `lastRestoreDuration` to monitor capacity issues that can cause unrecoverable cluster failure [[GH-452](https://github.com/hashicorp/raft/pull/452)][[GH-454](https://github.com/hashicorp/raft/pull/454/files)]
* Made `TrailingLogs`, `SnapshotInterval` and `SnapshotThreshold` reloadable at runtime using a new `ReloadConfig` method. This allows recovery from cases where there are not enough logs retained for followers to catch up after a restart. [[GH-444](https://github.com/hashicorp/raft/pull/444)]
* Inclusify the repository by switching to main [[GH-446](https://github.com/hashicorp/raft/pull/446)]
* Add option for a buffered `ApplyCh` if `MaxAppendEntries` is enabled [[GH-445](https://github.com/hashicorp/raft/pull/445)]
* Add string to `LogType` for more human readable debugging [[GH-442](https://github.com/hashicorp/raft/pull/442)]
* Extract fuzzy testing into its own module [[GH-459](https://github.com/hashicorp/raft/pull/459)]
BUG FIXES
* Update LogCache `StoreLogs()` to capture an error that would previously cause a panic [[GH-460](https://github.com/hashicorp/raft/pull/460)]
# 1.2.0 (October 5th, 2020)
IMPROVEMENTS
* Remove `StartAsLeader` configuration option [[GH-364](https://github.com/hashicorp/raft/pull/386)]
@ -85,4 +101,4 @@ v1.0.0 takes the changes that were staged in the library-v2-stage-one branch. Th
# 0.1.0 (September 29th, 2017)
v0.1.0 is the original stable version of the library that was in master and has been maintained with no breaking API changes. This was in use by Consul prior to version 0.7.0.
v0.1.0 is the original stable version of the library that was in main and has been maintained with no breaking API changes. This was in use by Consul prior to version 0.7.0.


@ -16,28 +16,28 @@ endif
TEST_RESULTS_DIR?=/tmp/test-results
test:
GOTRACEBACK=all go test $(TESTARGS) -timeout=60s -race .
GOTRACEBACK=all go test $(TESTARGS) -timeout=60s -tags batchtest -race .
GOTRACEBACK=all go test $(TESTARGS) -timeout=180s -race .
GOTRACEBACK=all go test $(TESTARGS) -timeout=180s -tags batchtest -race .
integ: test
INTEG_TESTS=yes go test $(TESTARGS) -timeout=25s -run=Integ .
INTEG_TESTS=yes go test $(TESTARGS) -timeout=25s -tags batchtest -run=Integ .
INTEG_TESTS=yes go test $(TESTARGS) -timeout=60s -run=Integ .
INTEG_TESTS=yes go test $(TESTARGS) -timeout=60s -tags batchtest -run=Integ .
ci.test-norace:
gotestsum --format=short-verbose --junitfile $(TEST_RESULTS_DIR)/gotestsum-report-test.xml -- -timeout=60s
gotestsum --format=short-verbose --junitfile $(TEST_RESULTS_DIR)/gotestsum-report-test.xml -- -timeout=60s -tags batchtest
gotestsum --format=short-verbose --junitfile $(TEST_RESULTS_DIR)/gotestsum-report-test.xml -- -timeout=180s
gotestsum --format=short-verbose --junitfile $(TEST_RESULTS_DIR)/gotestsum-report-test.xml -- -timeout=180s -tags batchtest
ci.test:
gotestsum --format=short-verbose --junitfile $(TEST_RESULTS_DIR)/gotestsum-report-test.xml -- -timeout=60s -race .
gotestsum --format=short-verbose --junitfile $(TEST_RESULTS_DIR)/gotestsum-report-test.xml -- -timeout=60s -race -tags batchtest .
gotestsum --format=short-verbose --junitfile $(TEST_RESULTS_DIR)/gotestsum-report-test.xml -- -timeout=180s -race .
gotestsum --format=short-verbose --junitfile $(TEST_RESULTS_DIR)/gotestsum-report-test.xml -- -timeout=180s -race -tags batchtest .
ci.integ: ci.test
INTEG_TESTS=yes gotestsum --format=short-verbose --junitfile $(TEST_RESULTS_DIR)/gotestsum-report-integ.xml -- -timeout=25s -run=Integ .
INTEG_TESTS=yes gotestsum --format=short-verbose --junitfile $(TEST_RESULTS_DIR)/gotestsum-report-integ.xml -- -timeout=25s -run=Integ -tags batchtest .
INTEG_TESTS=yes gotestsum --format=short-verbose --junitfile $(TEST_RESULTS_DIR)/gotestsum-report-integ.xml -- -timeout=60s -run=Integ .
INTEG_TESTS=yes gotestsum --format=short-verbose --junitfile $(TEST_RESULTS_DIR)/gotestsum-report-integ.xml -- -timeout=60s -run=Integ -tags batchtest .
fuzz:
go test $(TESTARGS) -timeout=20m ./fuzzy
go test $(TESTARGS) -timeout=20m -tags batchtest ./fuzzy
cd ./fuzzy && go test $(TESTARGS) -timeout=20m .
cd ./fuzzy && go test $(TESTARGS) -timeout=20m -tags batchtest .
deps:
go get -t -d -v ./...


@ -28,16 +28,21 @@ To prevent complications with cgo, the primary backend `MDBStore` is in a separa
called [raft-mdb](http://github.com/hashicorp/raft-mdb). That is the recommended implementation
for the `LogStore` and `StableStore`.
A pure Go backend using [BoltDB](https://github.com/boltdb/bolt) is also available called
A pure Go backend using [Bbolt](https://github.com/etcd-io/bbolt) is also available called
[raft-boltdb](https://github.com/hashicorp/raft-boltdb). It can also be used as a `LogStore`
and `StableStore`.
## Community Contributed Examples
[Raft gRPC Example](https://github.com/Jille/raft-grpc-example) - Utilizing the Raft repository with gRPC
## Tagged Releases
As of September 2017, HashiCorp will start using tags for this library to clearly indicate
major version updates. We recommend you vendor your application's dependency on this library.
* v0.1.0 is the original stable version of the library that was in master and has been maintained
* v0.1.0 is the original stable version of the library that was in main and has been maintained
with no breaking API changes. This was in use by Consul prior to version 0.7.0.
* v1.0.0 takes the changes that were staged in the library-v2-stage-one branch. This version
@ -104,4 +109,3 @@ greatly sacrificing performance.
In terms of performance, Raft is comparable to Paxos. Assuming stable leadership,
committing a log entry requires a single round trip to half of the cluster.
Thus performance is bound by disk I/O and network latency.


@ -81,8 +81,15 @@ type Raft struct {
// be committed and applied to the FSM.
applyCh chan *logFuture
// Configuration provided at Raft initialization
conf Config
// conf stores the current configuration to use. This is the most recent one
// provided. All reads of config values should use the config() helper method
// to read this safely.
conf atomic.Value
// confReloadMu ensures that only one thread can reload config at once since
// we need to read-modify-write the atomic. It is NOT necessary to hold this
// for any other operation e.g. reading config using config().
confReloadMu sync.Mutex
// FSM is the client state machine to apply commands to
fsm FSM
@ -199,7 +206,7 @@ type Raft struct {
// server. Any further attempts to bootstrap will return an error that can be
// safely ignored.
//
// One sane approach is to bootstrap a single server with a configuration
// One approach is to bootstrap a single server with a configuration
// listing just itself as a Voter, then invoke AddVoter() on it to add other
// servers to the cluster.
func BootstrapCluster(conf *Config, logs LogStore, stable StableStore,
@ -316,6 +323,12 @@ func RecoverCluster(conf *Config, fsm FSM, logs LogStore, stable StableStore,
continue
}
// Note this is the one place we call fsm.Restore without the
// fsmRestoreAndMeasure wrapper since this function should only be called to
// reset state on disk and the FSM passed will not be used for a running
// server instance. If the same process will eventually become a Raft peer
// then it will call NewRaft and restore from disk again, which will report
// metrics.
err = fsm.Restore(source)
// Close the source after the restore has completed
source.Close()
@ -385,9 +398,9 @@ func RecoverCluster(conf *Config, fsm FSM, logs LogStore, stable StableStore,
return nil
}
// GetConfiguration returns the configuration of the Raft cluster without
// starting a Raft instance or connecting to the cluster
// This function has identical behavior to Raft.GetConfiguration
// GetConfiguration returns the persisted configuration of the Raft cluster
// without starting a Raft instance or connecting to the cluster. This function
// has identical behavior to Raft.GetConfiguration.
func GetConfiguration(conf *Config, fsm FSM, logs LogStore, stable StableStore,
snaps SnapshotStore, trans Transport) (Configuration, error) {
conf.skipStartup = true
@ -486,7 +499,7 @@ func NewRaft(conf *Config, fsm FSM, logs LogStore, stable StableStore, snaps Sna
// Make sure we have a valid server address and ID.
protocolVersion := conf.ProtocolVersion
localAddr := ServerAddress(trans.LocalAddr())
localAddr := trans.LocalAddr()
localID := conf.LocalID
// TODO (slackpad) - When we deprecate protocol version 2, remove this
@ -495,11 +508,16 @@ func NewRaft(conf *Config, fsm FSM, logs LogStore, stable StableStore, snaps Sna
return nil, fmt.Errorf("when running with ProtocolVersion < 3, LocalID must be set to the network address")
}
// Buffer applyCh to MaxAppendEntries if the option is enabled
applyCh := make(chan *logFuture)
if conf.BatchApplyCh {
applyCh = make(chan *logFuture, conf.MaxAppendEntries)
}
// Create Raft struct.
r := &Raft{
protocolVersion: protocolVersion,
applyCh: make(chan *logFuture),
conf: *conf,
applyCh: applyCh,
fsm: fsm,
fsmMutateCh: make(chan interface{}, 128),
fsmSnapshotCh: make(chan *reqSnapshotFuture),
@ -524,6 +542,8 @@ func NewRaft(conf *Config, fsm FSM, logs LogStore, stable StableStore, snaps Sna
leadershipTransferCh: make(chan *leadershipTransferFuture, 1),
}
r.conf.Store(*conf)
// Initialize as a follower.
r.setState(Follower)
@ -577,23 +597,23 @@ func (r *Raft) restoreSnapshot() error {
// Try to load in order of newest to oldest
for _, snapshot := range snapshots {
if !r.conf.NoSnapshotRestoreOnStart {
if !r.config().NoSnapshotRestoreOnStart {
_, source, err := r.snapshots.Open(snapshot.ID)
if err != nil {
r.logger.Error("failed to open snapshot", "id", snapshot.ID, "error", err)
continue
}
err = r.fsm.Restore(source)
// Close the source after the restore has completed
source.Close()
if err != nil {
if err := fsmRestoreAndMeasure(r.fsm, source); err != nil {
source.Close()
r.logger.Error("failed to restore snapshot", "id", snapshot.ID, "error", err)
continue
}
source.Close()
r.logger.Info("restored from snapshot", "id", snapshot.ID)
}
// Update the lastApplied so we don't replay old logs
r.setLastApplied(snapshot.Index)
@ -624,6 +644,45 @@ func (r *Raft) restoreSnapshot() error {
return nil
}
func (r *Raft) config() Config {
return r.conf.Load().(Config)
}
// ReloadConfig updates the configuration of a running raft node. If the new
// configuration is invalid an error is returned and no changes made to the
// instance. All fields will be copied from rc into the new configuration, even
// if they are zero valued.
func (r *Raft) ReloadConfig(rc ReloadableConfig) error {
r.confReloadMu.Lock()
defer r.confReloadMu.Unlock()
// Load the current config (note we are under a lock so it can't be changed
// between this read and a later Store).
oldCfg := r.config()
// Set the reloadable fields
newCfg := rc.apply(oldCfg)
if err := ValidateConfig(&newCfg); err != nil {
return err
}
r.conf.Store(newCfg)
return nil
}
// ReloadableConfig returns the current state of the reloadable fields in Raft's
// configuration. This is useful for programs to discover the current state for
// reporting to users or tests. It is safe to call from any goroutine. It is
// intended for reporting and testing purposes primarily; external
// synchronization would be required to safely use this in a read-modify-write
// pattern for reloadable configuration options.
func (r *Raft) ReloadableConfig() ReloadableConfig {
cfg := r.config()
var rc ReloadableConfig
rc.fromConfig(cfg)
return rc
}
// BootstrapCluster is equivalent to non-member BootstrapCluster but can be
// called on an un-bootstrapped Raft instance after it has been created. This
// should only be called at the beginning of time for the cluster with an

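A short usage sketch of the `ReloadConfig`/`ReloadableConfig` pair added above, following the read-modify-write pattern the doc comment describes (callers provide their own synchronization if several goroutines may do this concurrently; values are illustrative):

```go
package example

import (
	"time"

	"github.com/hashicorp/raft"
)

// growTrailingLogs bumps only the reloadable fields on a running node. Fields
// not present in ReloadableConfig are left untouched by ReloadConfig, and an
// invalid combination is rejected without changing the instance.
func growTrailingLogs(r *raft.Raft) error {
	rc := r.ReloadableConfig() // current TrailingLogs / Snapshot* values
	rc.TrailingLogs = 50000    // retain more logs so a slow follower can catch up
	rc.SnapshotInterval = 2 * time.Minute
	return r.ReloadConfig(rc)
}
```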

@ -23,7 +23,7 @@ type commitment struct {
startIndex uint64
}
// newCommitment returns an commitment struct that notifies the provided
// newCommitment returns a commitment struct that notifies the provided
// channel when log entries have been committed. A new commitment struct is
// created each time this server becomes leader for a particular term.
// 'configuration' is the servers in the cluster.


@ -151,25 +151,36 @@ type Config struct {
// an inconsistent log.
MaxAppendEntries int
// BatchApplyCh indicates whether we should buffer applyCh
// to size MaxAppendEntries. This enables batch log commitment,
// but breaks the timeout guarantee on Apply. Specifically,
// a log can be added to the applyCh buffer but not actually be
// processed until after the specified timeout.
BatchApplyCh bool
// If we are a member of a cluster, and RemovePeer is invoked for the
// local node, then we forget all peers and transition into the follower state.
// If ShutdownOnRemove is is set, we additional shutdown Raft. Otherwise,
// If ShutdownOnRemove is set, we additionally shut down Raft. Otherwise,
// we can become a leader of a cluster containing only this node.
ShutdownOnRemove bool
// TrailingLogs controls how many logs we leave after a snapshot. This is
// used so that we can quickly replay logs on a follower instead of being
// forced to send an entire snapshot.
// TrailingLogs controls how many logs we leave after a snapshot. This is used
// so that we can quickly replay logs on a follower instead of being forced to
// send an entire snapshot. The value passed here is the initial setting used.
// This can be tuned during operation using ReloadConfig.
TrailingLogs uint64
// SnapshotInterval controls how often we check if we should perform a snapshot.
// We randomly stagger between this value and 2x this value to avoid the entire
// cluster from performing a snapshot at once.
// SnapshotInterval controls how often we check if we should perform a
// snapshot. We randomly stagger between this value and 2x this value to
// prevent the entire cluster from performing a snapshot at once. The value passed
// here is the initial setting used. This can be tuned during operation using
// ReloadConfig.
SnapshotInterval time.Duration
// SnapshotThreshold controls how many outstanding logs there must be before
// we perform a snapshot. This is to prevent excessive snapshots when we can
// just replay a small set of logs.
// we perform a snapshot. This is to prevent excessive snapshotting by
// replaying a small set of logs instead. The value passed here is the initial
// setting used. This can be tuned during operation using ReloadConfig.
SnapshotThreshold uint64
// LeaderLeaseTimeout is used to control how long the "lease" lasts
@ -178,7 +189,7 @@ type Config struct {
// step down as leader.
LeaderLeaseTimeout time.Duration
// The unique ID for this server across all time. When running with
// LocalID is a unique ID for this server across all time. When running with
// ProtocolVersion < 3, you must set this to be the same as the network
// address of your transport.
LocalID ServerID
@ -192,25 +203,65 @@ type Config struct {
// Defaults to os.Stderr.
LogOutput io.Writer
// LogLevel represents a log level. If a no matching string is specified,
// hclog.NoLevel is assumed.
// LogLevel represents a log level. If the value does not match a known
// logging level hclog.NoLevel is used.
LogLevel string
// Logger is a user-provided hc-log logger. If nil, a logger writing to
// Logger is a user-provided logger. If nil, a logger writing to
// LogOutput with LogLevel is used.
Logger hclog.Logger
// NoSnapshotRestoreOnStart controls if raft will restore a snapshot to the
// FSM on start. This is useful if your FSM recovers from other mechanisms
// than raft snapshotting. Snapshot metadata will still be used to initialize
// raft's configuration and index values. This is used in NewRaft and
// RestoreCluster.
// raft's configuration and index values.
NoSnapshotRestoreOnStart bool
// skipStartup allows NewRaft() to bypass all background work goroutines
skipStartup bool
}
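As a side note, the new `BatchApplyCh` option documented above pairs with `MaxAppendEntries`; a minimal, illustrative sketch of enabling it (the ID and sizes are placeholders):

```go
package example

import "github.com/hashicorp/raft"

// newBatchingConfig enables the buffered applyCh: logs are batched up to
// MaxAppendEntries per commit, at the cost of the strict timeout guarantee
// on Apply described in the field comment above.
func newBatchingConfig(id raft.ServerID) *raft.Config {
	conf := raft.DefaultConfig()
	conf.LocalID = id
	conf.MaxAppendEntries = 64 // also the applyCh buffer size when batching
	conf.BatchApplyCh = true
	return conf
}
```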
// ReloadableConfig is the subset of Config that may be reconfigured during
// runtime using raft.ReloadConfig. We choose to duplicate fields over embedding