From 141eb60f061f7d392fa4bdf47c919b28962c47cc Mon Sep 17 00:00:00 2001 From: Matt Keeler Date: Thu, 8 Oct 2020 15:02:19 -0400 Subject: [PATCH] Add per-agent reconnect timeouts (#8781) This allows for client agent to be run in a more stateless manner where they may be abruptly terminated and not expected to come back. If advertising a per-agent reconnect timeout using the advertise_reconnect_timeout configuration when that agent leaves, other agents will wait only that amount of time for the agent to come back before reaping it. This has the advantageous side effect of causing servers to deregister the node/services/checks for that agent sooner than if the global reconnect_timeout was used. --- .changelog/8781.txt | 3 + agent/agent.go | 2 + agent/config/builder.go | 15 ++-- agent/config/config.go | 1 + agent/config/runtime.go | 7 ++ agent/config/runtime_test.go | 24 +++++- agent/consul/acl_client.go | 4 +- agent/consul/acl_server.go | 6 +- agent/consul/client_serf.go | 6 ++ agent/consul/client_test.go | 31 +++++++ agent/consul/config.go | 11 ++- agent/consul/server_serf.go | 3 + api/go.mod | 2 +- api/go.sum | 4 +- go.mod | 2 +- go.sum | 5 +- lib/serf.go | 44 ---------- lib/serf/serf.go | 82 +++++++++++++++++++ .../hashicorp/serf/coordinate/phantom.go | 4 +- .../github.com/hashicorp/serf/serf/config.go | 10 +++ .../hashicorp/serf/serf/merge_delegate.go | 37 +++++++-- vendor/github.com/hashicorp/serf/serf/serf.go | 40 ++++++++- vendor/modules.txt | 2 +- website/pages/docs/agent/options.mdx | 5 ++ 24 files changed, 275 insertions(+), 75 deletions(-) create mode 100644 .changelog/8781.txt delete mode 100644 lib/serf.go create mode 100644 lib/serf/serf.go diff --git a/.changelog/8781.txt b/.changelog/8781.txt new file mode 100644 index 000000000..0ad1d7f5b --- /dev/null +++ b/.changelog/8781.txt @@ -0,0 +1,3 @@ +```release-note:feature +agent: Allow client agents to be configured with an advertised reconnect timeout to control how long until the nodes are reaped by others in the cluster. +``` \ No newline at end of file diff --git a/agent/agent.go b/agent/agent.go index 9edeb5344..ae64eb611 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1025,6 +1025,8 @@ func newConsulConfig(runtimeCfg *config.RuntimeConfig, logger hclog.Logger) (*co cfg.SerfWANConfig = nil } + cfg.AdvertiseReconnectTimeout = runtimeCfg.AdvertiseReconnectTimeout + cfg.RPCAddr = runtimeCfg.RPCBindAddr cfg.RPCAdvertise = runtimeCfg.RPCAdvertiseAddr diff --git a/agent/config/builder.go b/agent/config/builder.go index 666ff0b65..868743bf6 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -949,11 +949,12 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) { }, // Agent - AdvertiseAddrLAN: advertiseAddrLAN, - AdvertiseAddrWAN: advertiseAddrWAN, - BindAddr: bindAddr, - Bootstrap: b.boolVal(c.Bootstrap), - BootstrapExpect: b.intVal(c.BootstrapExpect), + AdvertiseAddrLAN: advertiseAddrLAN, + AdvertiseAddrWAN: advertiseAddrWAN, + AdvertiseReconnectTimeout: b.durationVal("advertise_reconnect_timeout", c.AdvertiseReconnectTimeout), + BindAddr: bindAddr, + Bootstrap: b.boolVal(c.Bootstrap), + BootstrapExpect: b.intVal(c.BootstrapExpect), Cache: cache.Options{ EntryFetchRate: rate.Limit( b.float64ValWithDefault(c.Cache.EntryFetchRate, float64(cache.DefaultEntryFetchRate)), @@ -1389,6 +1390,10 @@ func (b *Builder) Validate(rt RuntimeConfig) error { return fmt.Errorf("auto_encrypt.allow_tls can only be used on a server.") } + if rt.ServerMode && rt.AdvertiseReconnectTimeout != 0 { + return fmt.Errorf("advertise_reconnect_timeout can only be used on a client") + } + // ---------------------------------------------------------------- // warnings // diff --git a/agent/config/config.go b/agent/config/config.go index b66bcd0d8..346603e34 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -142,6 +142,7 @@ type Config struct { AdvertiseAddrWAN *string `json:"advertise_addr_wan,omitempty" hcl:"advertise_addr_wan" mapstructure:"advertise_addr_wan"` AdvertiseAddrWANIPv4 *string `json:"advertise_addr_wan_ipv4,omitempty" hcl:"advertise_addr_wan_ipv4" mapstructure:"advertise_addr_wan_ipv4"` AdvertiseAddrWANIPv6 *string `json:"advertise_addr_wan_ipv6,omitempty" hcl:"advertise_addr_wan_ipv6" mapstructure:"advertise_addr_ipv6"` + AdvertiseReconnectTimeout *string `json:"advertise_reconnect_timeout,omitempty" hcl:"advertise_reconnect_timeout" mapstructure:"advertise_reconnect_timeout"` AutoConfig AutoConfigRaw `json:"auto_config,omitempty" hcl:"auto_config" mapstructure:"auto_config"` Autopilot Autopilot `json:"autopilot,omitempty" hcl:"autopilot" mapstructure:"autopilot"` BindAddr *string `json:"bind_addr,omitempty" hcl:"bind_addr" mapstructure:"bind_addr"` diff --git a/agent/config/runtime.go b/agent/config/runtime.go index 34b870c2a..417b22495 100644 --- a/agent/config/runtime.go +++ b/agent/config/runtime.go @@ -980,6 +980,13 @@ type RuntimeConfig struct { // hcl: reconnect_timeout = "duration" ReconnectTimeoutWAN time.Duration + // AdvertiseReconnectTimeout specifies the amount of time other agents should + // wait for us to reconnect before deciding we are permanently gone. This + // should only be set for client agents that are run in a stateless or + // ephemeral manner in order to realize their deletion sooner than we + // would otherwise. + AdvertiseReconnectTimeout time.Duration + // RejoinAfterLeave controls our interaction with the cluster after leave. // When set to false (default), a leave causes Consul to not rejoin // the cluster until an explicit join is received. If this is set to diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index 9a0536484..29f145dd1 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -51,7 +51,7 @@ type configTest struct { // should check one option at a time if possible and should use generic // values, e.g. 'a' or 1 instead of 'servicex' or 3306. -func TestBuilder_BuildAndValide_ConfigFlagsAndEdgecases(t *testing.T) { +func TestBuilder_BuildAndValidate_ConfigFlagsAndEdgecases(t *testing.T) { dataDir := testutil.TempDir(t, "consul") defaultEntMeta := structs.DefaultEnterpriseMeta() @@ -4438,7 +4438,6 @@ func TestBuilder_BuildAndValide_ConfigFlagsAndEdgecases(t *testing.T) { rt.CertFile = "foo" }, }, - // UI Config tests { desc: "ui config deprecated", @@ -4601,6 +4600,23 @@ func TestBuilder_BuildAndValide_ConfigFlagsAndEdgecases(t *testing.T) { `}, err: `ui_config.dashboard_url_templates values must be a valid http or https URL.`, }, + + // Per node reconnect timeout test + { + desc: "server and advertised reconnect timeout error", + args: []string{ + `-data-dir=` + dataDir, + `-server`, + }, + hcl: []string{` + advertise_reconnect_timeout = "5s" + `}, + json: []string{` + { + "advertise_reconnect_timeout": "5s" + }`}, + err: "advertise_reconnect_timeout can only be used on a client", + }, } testConfig(t, tests, dataDir) @@ -4834,6 +4850,7 @@ func TestFullConfig(t *testing.T) { }, "advertise_addr": "17.99.29.16", "advertise_addr_wan": "78.63.37.19", + "advertise_reconnect_timeout": "0s", "audit": { "enabled": false }, @@ -5515,6 +5532,7 @@ func TestFullConfig(t *testing.T) { } advertise_addr = "17.99.29.16" advertise_addr_wan = "78.63.37.19" + advertise_reconnect_timeout = "0s" audit = { enabled = false } @@ -6295,6 +6313,7 @@ func TestFullConfig(t *testing.T) { ACLTokenReplication: true, AdvertiseAddrLAN: ipAddr("17.99.29.16"), AdvertiseAddrWAN: ipAddr("78.63.37.19"), + AdvertiseReconnectTimeout: 0 * time.Second, AutopilotCleanupDeadServers: true, AutopilotDisableUpgradeMigration: true, AutopilotLastContactThreshold: 12705 * time.Second, @@ -7278,6 +7297,7 @@ func TestSanitize(t *testing.T) { "AEInterval": "0s", "AdvertiseAddrLAN": "", "AdvertiseAddrWAN": "", + "AdvertiseReconnectTimeout": "0s", "AutopilotCleanupDeadServers": false, "AutopilotDisableUpgradeMigration": false, "AutopilotLastContactThreshold": "0s", diff --git a/agent/consul/acl_client.go b/agent/consul/acl_client.go index 17a961ef1..1b9c6a46d 100644 --- a/agent/consul/acl_client.go +++ b/agent/consul/acl_client.go @@ -6,7 +6,7 @@ import ( "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/structs" - "github.com/hashicorp/consul/lib" + "github.com/hashicorp/consul/lib/serf" ) var clientACLCacheConfig *structs.ACLCachesConfig = &structs.ACLCachesConfig{ @@ -123,5 +123,5 @@ func (c *Client) ResolveTokenAndDefaultMeta(token string, entMeta *structs.Enter func (c *Client) updateSerfTags(key, value string) { // Update the LAN serf - lib.UpdateSerfTag(c.serf, key, value) + serf.UpdateTag(c.serf, key, value) } diff --git a/agent/consul/acl_server.go b/agent/consul/acl_server.go index a2f1790f4..a495a312b 100644 --- a/agent/consul/acl_server.go +++ b/agent/consul/acl_server.go @@ -6,7 +6,7 @@ import ( "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/structs" - "github.com/hashicorp/consul/lib" + "github.com/hashicorp/consul/lib/serf" ) var serverACLCacheConfig *structs.ACLCachesConfig = &structs.ACLCachesConfig{ @@ -86,10 +86,10 @@ func (s *Server) checkBindingRuleUUID(id string) (bool, error) { func (s *Server) updateSerfTags(key, value string) { // Update the LAN serf - lib.UpdateSerfTag(s.serfLAN, key, value) + serf.UpdateTag(s.serfLAN, key, value) if s.serfWAN != nil { - lib.UpdateSerfTag(s.serfWAN, key, value) + serf.UpdateTag(s.serfWAN, key, value) } s.updateEnterpriseSerfTags(key, value) diff --git a/agent/consul/client_serf.go b/agent/consul/client_serf.go index fb0b999e1..1fa28ab4d 100644 --- a/agent/consul/client_serf.go +++ b/agent/consul/client_serf.go @@ -8,6 +8,7 @@ import ( "github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/lib" + libserf "github.com/hashicorp/consul/lib/serf" "github.com/hashicorp/consul/logging" "github.com/hashicorp/consul/types" "github.com/hashicorp/go-hclog" @@ -27,6 +28,9 @@ func (c *Client) setupSerf(conf *serf.Config, ch chan serf.Event, path string) ( conf.Tags["vsn_min"] = fmt.Sprintf("%d", ProtocolVersionMin) conf.Tags["vsn_max"] = fmt.Sprintf("%d", ProtocolVersionMax) conf.Tags["build"] = c.config.Build + if c.config.AdvertiseReconnectTimeout != 0 { + conf.Tags[libserf.ReconnectTimeoutTag] = c.config.AdvertiseReconnectTimeout.String() + } if c.acls.ACLsEnabled() { // we start in legacy mode and then transition to normal // mode once we know the cluster can handle it. @@ -65,6 +69,8 @@ func (c *Client) setupSerf(conf *serf.Config, ch chan serf.Event, path string) ( c.addEnterpriseSerfTags(conf.Tags) + conf.ReconnectTimeoutOverride = libserf.NewReconnectOverride(c.logger) + return serf.Create(conf) } diff --git a/agent/consul/client_test.go b/agent/consul/client_test.go index 6fe726681..4af8a4629 100644 --- a/agent/consul/client_test.go +++ b/agent/consul/client_test.go @@ -747,3 +747,34 @@ func TestClient_Reload(t *testing.T) { require.Equal(t, rate.Limit(1000), limiter.Limit()) require.Equal(t, 10000, limiter.Burst()) } + +func TestClient_ShortReconnectTimeout(t *testing.T) { + cluster := newTestCluster(t, &testClusterConfig{ + Datacenter: "dc1", + Servers: 1, + Clients: 2, + ServerConf: func(c *Config) { + c.SerfLANConfig.ReapInterval = 50 * time.Millisecond + }, + ClientConf: func(c *Config) { + c.SerfLANConfig.ReapInterval = 50 * time.Millisecond + c.AdvertiseReconnectTimeout = 100 * time.Millisecond + }, + }) + + // shutdown the client + cluster.Clients[1].Shutdown() + + // Now wait for it to be reaped. We set the advertised reconnect + // timeout to 100ms so we are going to check every 50 ms and allow + // up to 10x the time in the case of slow CI. + require.Eventually(t, + func() bool { + return len(cluster.Servers[0].LANMembers()) == 2 && + len(cluster.Clients[0].LANMembers()) == 2 + + }, + time.Second, + 50*time.Millisecond, + "The client node was not reaped within the alotted time") +} diff --git a/agent/consul/config.go b/agent/consul/config.go index 72d1fac05..ea77edcba 100644 --- a/agent/consul/config.go +++ b/agent/consul/config.go @@ -9,7 +9,7 @@ import ( "github.com/hashicorp/consul/agent/checks" "github.com/hashicorp/consul/agent/consul/autopilot" "github.com/hashicorp/consul/agent/structs" - "github.com/hashicorp/consul/lib" + libserf "github.com/hashicorp/consul/lib/serf" "github.com/hashicorp/consul/tlsutil" "github.com/hashicorp/consul/types" "github.com/hashicorp/consul/version" @@ -219,6 +219,11 @@ type Config struct { // true, we ignore the leave, and rejoin the cluster on start. RejoinAfterLeave bool + // AdvertiseReconnectTimeout is the duration after which this node should be + // assumed to not be returning and thus should be reaped within Serf. This + // can only be set for Client agents + AdvertiseReconnectTimeout time.Duration + // Build is a string that is gossiped around, and can be used to help // operators track which versions are actively deployed Build string @@ -544,8 +549,8 @@ func DefaultConfig() *Config { NodeName: hostname, RPCAddr: DefaultRPCAddr, RaftConfig: raft.DefaultConfig(), - SerfLANConfig: lib.SerfDefaultConfig(), - SerfWANConfig: lib.SerfDefaultConfig(), + SerfLANConfig: libserf.DefaultConfig(), + SerfWANConfig: libserf.DefaultConfig(), SerfFloodInterval: 60 * time.Second, ReconcileInterval: 60 * time.Second, ProtocolVersion: ProtocolVersion2Compatible, diff --git a/agent/consul/server_serf.go b/agent/consul/server_serf.go index 46e3110d4..a014d13bd 100644 --- a/agent/consul/server_serf.go +++ b/agent/consul/server_serf.go @@ -11,6 +11,7 @@ import ( "github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/lib" + libserf "github.com/hashicorp/consul/lib/serf" "github.com/hashicorp/consul/logging" "github.com/hashicorp/go-hclog" "github.com/hashicorp/memberlist" @@ -168,6 +169,8 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string, w return nil, err } + conf.ReconnectTimeoutOverride = libserf.NewReconnectOverride(s.logger) + s.addEnterpriseSerfTags(conf.Tags) if s.config.OverrideInitialSerfTags != nil { diff --git a/api/go.mod b/api/go.mod index d9902d403..becc75b41 100644 --- a/api/go.mod +++ b/api/go.mod @@ -10,7 +10,7 @@ require ( github.com/hashicorp/go-hclog v0.12.0 github.com/hashicorp/go-rootcerts v1.0.2 github.com/hashicorp/go-uuid v1.0.1 - github.com/hashicorp/serf v0.9.3 + github.com/hashicorp/serf v0.9.5 github.com/mitchellh/mapstructure v1.1.2 github.com/stretchr/testify v1.4.0 ) diff --git a/api/go.sum b/api/go.sum index 3c26420cd..57ef54399 100644 --- a/api/go.sum +++ b/api/go.sum @@ -41,8 +41,8 @@ github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO github.com/hashicorp/mdns v1.0.1/go.mod h1:4gW7WsVCke5TE7EPeYliwHlRUyBtfCwuFwuMg2DmyNY= github.com/hashicorp/memberlist v0.2.2 h1:5+RffWKwqJ71YPu9mWsF7ZOscZmwfasdA8kbdC7AO2g= github.com/hashicorp/memberlist v0.2.2/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE= -github.com/hashicorp/serf v0.9.3 h1:AVF6JDQQens6nMHT9OGERBvK0f8rPrAGILnsKLr6lzM= -github.com/hashicorp/serf v0.9.3/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk= +github.com/hashicorp/serf v0.9.5 h1:EBWvyu9tcRszt3Bxp3KNssBMP1KuHWyO51lz9+786iM= +github.com/hashicorp/serf v0.9.5/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk= github.com/kr/pretty v0.2.0 h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs= github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= diff --git a/go.mod b/go.mod index 2b8583aef..4308e153b 100644 --- a/go.mod +++ b/go.mod @@ -56,7 +56,7 @@ require ( github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69 github.com/hashicorp/raft v1.1.2 github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea - github.com/hashicorp/serf v0.9.4 + github.com/hashicorp/serf v0.9.5 github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086 github.com/hashicorp/yamux v0.0.0-20181012175058-2f1d1f20f75d github.com/imdario/mergo v0.3.6 diff --git a/go.sum b/go.sum index a22da45ca..e585b7470 100644 --- a/go.sum +++ b/go.sum @@ -288,9 +288,8 @@ github.com/hashicorp/raft v1.1.2 h1:oxEL5DDeurYxLd3UbcY/hccgSPhLLpiBZ1YxtWEq59c= github.com/hashicorp/raft v1.1.2/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8= github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea h1:xykPFhrBAS2J0VBzVa5e80b5ZtYuNQtgXjN40qBZlD4= github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk= -github.com/hashicorp/serf v0.9.3/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk= -github.com/hashicorp/serf v0.9.4 h1:xrZ4ZR0wT5Dz8oQHHdfOzr0ei1jMToWlFFz3hh/DI7I= -github.com/hashicorp/serf v0.9.4/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk= +github.com/hashicorp/serf v0.9.5 h1:EBWvyu9tcRszt3Bxp3KNssBMP1KuHWyO51lz9+786iM= +github.com/hashicorp/serf v0.9.5/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk= github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086 h1:OKsyxKi2sNmqm1Gv93adf2AID2FOBFdCbbZn9fGtIdg= github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086/go.mod h1:R3Umvhlxi2TN7Ex2hzOowyeNb+SfbVWI973N+ctaFMk= github.com/hashicorp/vault/sdk v0.1.14-0.20200519221838-e0cfd64bc267 h1:e1ok06zGrWJW91rzRroyl5nRNqraaBe4d5hiKcVZuHM= diff --git a/lib/serf.go b/lib/serf.go deleted file mode 100644 index c0b6e5126..000000000 --- a/lib/serf.go +++ /dev/null @@ -1,44 +0,0 @@ -package lib - -import ( - "time" - - "github.com/hashicorp/serf/serf" -) - -// SerfDefaultConfig returns a Consul-flavored Serf default configuration, -// suitable as a basis for a LAN, WAN, segment, or area. -func SerfDefaultConfig() *serf.Config { - base := serf.DefaultConfig() - - // This effectively disables the annoying queue depth warnings. - base.QueueDepthWarning = 1000000 - - // This enables dynamic sizing of the message queue depth based on the - // cluster size. - base.MinQueueDepth = 4096 - - // This gives leaves some time to propagate through the cluster before - // we shut down. The value was chosen to be reasonably short, but to - // allow a leave to get to over 99.99% of the cluster with 100k nodes - // (using https://www.serf.io/docs/internals/simulator.html). - base.LeavePropagateDelay = 3 * time.Second - - return base -} - -func GetSerfTags(serf *serf.Serf) map[string]string { - tags := make(map[string]string) - for tag, value := range serf.LocalMember().Tags { - tags[tag] = value - } - - return tags -} - -func UpdateSerfTag(serf *serf.Serf, tag, value string) { - tags := GetSerfTags(serf) - tags[tag] = value - - serf.SetTags(tags) -} diff --git a/lib/serf/serf.go b/lib/serf/serf.go new file mode 100644 index 000000000..8f97e24d8 --- /dev/null +++ b/lib/serf/serf.go @@ -0,0 +1,82 @@ +package serf + +import ( + "time" + + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/serf/serf" +) + +const ( + ReconnectTimeoutTag = "rc_tm" +) + +// DefaultConfig returns a Consul-flavored Serf default configuration, +// suitable as a basis for a LAN, WAN, segment, or area. +func DefaultConfig() *serf.Config { + base := serf.DefaultConfig() + + // This effectively disables the annoying queue depth warnings. + base.QueueDepthWarning = 1000000 + + // This enables dynamic sizing of the message queue depth based on the + // cluster size. + base.MinQueueDepth = 4096 + + // This gives leaves some time to propagate through the cluster before + // we shut down. The value was chosen to be reasonably short, but to + // allow a leave to get to over 99.99% of the cluster with 100k nodes + // (using https://www.serf.io/docs/internals/simulator.html). + base.LeavePropagateDelay = 3 * time.Second + + return base +} + +func GetTags(serf *serf.Serf) map[string]string { + tags := make(map[string]string) + for tag, value := range serf.LocalMember().Tags { + tags[tag] = value + } + + return tags +} + +func UpdateTag(serf *serf.Serf, tag, value string) { + tags := GetTags(serf) + tags[tag] = value + + serf.SetTags(tags) +} + +type ReconnectOverride struct { + logger hclog.Logger +} + +func NewReconnectOverride(logger hclog.Logger) *ReconnectOverride { + if logger == nil { + logger = hclog.Default() + } + + return &ReconnectOverride{ + logger: logger, + } +} + +func (r *ReconnectOverride) ReconnectTimeout(m *serf.Member, timeout time.Duration) time.Duration { + val, ok := m.Tags[ReconnectTimeoutTag] + if !ok { + return timeout + } + newTimeout, err := time.ParseDuration(val) + if err != nil { + r.logger.Warn("Member is advertising a malformed reconnect timeout", "member", m.Name, "rc_tm", val) + return timeout + } + + // ignore a timeout of 0 as that indicates the default should be used + if newTimeout == 0 { + return timeout + } + + return newTimeout +} diff --git a/vendor/github.com/hashicorp/serf/coordinate/phantom.go b/vendor/github.com/hashicorp/serf/coordinate/phantom.go index 6fb033c0c..66da4e2e9 100644 --- a/vendor/github.com/hashicorp/serf/coordinate/phantom.go +++ b/vendor/github.com/hashicorp/serf/coordinate/phantom.go @@ -11,7 +11,7 @@ import ( // given config. func GenerateClients(nodes int, config *Config) ([]*Client, error) { clients := make([]*Client, nodes) - for i, _ := range clients { + for i := range clients { client, err := NewClient(config) if err != nil { return nil, err @@ -146,7 +146,7 @@ func Simulate(clients []*Client, truth [][]time.Duration, cycles int) { nodes := len(clients) for cycle := 0; cycle < cycles; cycle++ { - for i, _ := range clients { + for i := range clients { if j := rand.Intn(nodes); j != i { c := clients[j].GetCoordinate() rtt := truth[i][j] diff --git a/vendor/github.com/hashicorp/serf/serf/config.go b/vendor/github.com/hashicorp/serf/serf/config.go index 2875e1d60..57b5a98e5 100644 --- a/vendor/github.com/hashicorp/serf/serf/config.go +++ b/vendor/github.com/hashicorp/serf/serf/config.go @@ -253,6 +253,15 @@ type Config struct { // // WARNING: this should ONLY be used in tests messageDropper func(typ messageType) bool + + // ReconnectTimeoutOverride is an optional interface which when present allows + // the application to cause reaping of a node to happen when it otherwise wouldn't + ReconnectTimeoutOverride ReconnectTimeoutOverrider + + // ValidateNodeNames controls whether nodenames only + // contain alphanumeric, dashes and '.'characters + // and sets maximum length to 128 characters + ValidateNodeNames bool } // Init allocates the subdata structures @@ -298,6 +307,7 @@ func DefaultConfig() *Config { QuerySizeLimit: 1024, EnableNameConflictResolution: true, DisableCoordinates: false, + ValidateNodeNames: false, UserEventSizeLimit: 512, } } diff --git a/vendor/github.com/hashicorp/serf/serf/merge_delegate.go b/vendor/github.com/hashicorp/serf/serf/merge_delegate.go index 36eb52f7d..2e1e7c5b5 100644 --- a/vendor/github.com/hashicorp/serf/serf/merge_delegate.go +++ b/vendor/github.com/hashicorp/serf/serf/merge_delegate.go @@ -1,6 +1,7 @@ package serf import ( + "fmt" "net" "github.com/hashicorp/memberlist" @@ -17,22 +18,31 @@ type mergeDelegate struct { func (m *mergeDelegate) NotifyMerge(nodes []*memberlist.Node) error { members := make([]*Member, len(nodes)) for idx, n := range nodes { - members[idx] = m.nodeToMember(n) + var err error + members[idx], err = m.nodeToMember(n) + if err != nil { + return err + } } return m.serf.config.Merge.NotifyMerge(members) } func (m *mergeDelegate) NotifyAlive(peer *memberlist.Node) error { - member := m.nodeToMember(peer) + member, err := m.nodeToMember(peer) + if err != nil { + return err + } return m.serf.config.Merge.NotifyMerge([]*Member{member}) } -func (m *mergeDelegate) nodeToMember(n *memberlist.Node) *Member { +func (m *mergeDelegate) nodeToMember(n *memberlist.Node) (*Member, error) { status := StatusNone if n.State == memberlist.StateLeft { status = StatusLeft } - + if err := m.validateMemberInfo(n); err != nil { + return nil, err + } return &Member{ Name: n.Name, Addr: net.IP(n.Addr), @@ -45,5 +55,22 @@ func (m *mergeDelegate) nodeToMember(n *memberlist.Node) *Member { DelegateMin: n.DMin, DelegateMax: n.DMax, DelegateCur: n.DCur, - } + }, nil +} + +// validateMemberInfo checks that the data we are sending is valid +func (m *mergeDelegate) validateMemberInfo(n *memberlist.Node) error { + if err := m.serf.validateNodeName(n.Name); err != nil { + return err + } + + if len(n.Addr) != 4 && len(n.Addr) != 16 { + return fmt.Errorf("IP byte length is invalid: %d bytes is not either 4 or 16", len(n.Addr)) + } + + if len(n.Meta) > memberlist.MetaMaxSize { + return fmt.Errorf("Encoded length of tags exceeds limit of %d bytes", + memberlist.MetaMaxSize) + } + return nil } diff --git a/vendor/github.com/hashicorp/serf/serf/serf.go b/vendor/github.com/hashicorp/serf/serf/serf.go index c04719082..8a7c069c2 100644 --- a/vendor/github.com/hashicorp/serf/serf/serf.go +++ b/vendor/github.com/hashicorp/serf/serf/serf.go @@ -11,6 +11,7 @@ import ( "math/rand" "net" "os" + "regexp" "strconv" "sync" "sync/atomic" @@ -36,6 +37,8 @@ const ( tagMagicByte uint8 = 255 ) +const MaxNodeNameLength int = 128 + var ( // FeatureNotSupported is returned if a feature cannot be used // due to an older protocol version being used. @@ -47,6 +50,12 @@ func init() { rand.Seed(time.Now().UnixNano()) } +// ReconnectTimeoutOverrider is an interface that can be implemented to allow overriding +// the reconnect timeout for individual members. +type ReconnectTimeoutOverrider interface { + ReconnectTimeout(member *Member, timeout time.Duration) time.Duration +} + // Serf is a single node that is part of a single cluster that gets // events about joins/leaves/failures/etc. It is created with the Create // method. @@ -269,6 +278,9 @@ func Create(conf *Config) (*Serf, error) { if len(serf.encodeTags(conf.Tags)) > memberlist.MetaMaxSize { return nil, fmt.Errorf("Encoded length of tags exceeds limit of %d bytes", memberlist.MetaMaxSize) } + if err := serf.ValidateNodeNames(); err != nil { + return nil, err + } // Check if serf member event coalescing is enabled if conf.CoalescePeriod > 0 && conf.QuiescentPeriod > 0 && conf.EventCh != nil { @@ -1571,8 +1583,13 @@ func (s *Serf) reap(old []*memberState, now time.Time, timeout time.Duration) [] for i := 0; i < n; i++ { m := old[i] + memberTimeout := timeout + if s.config.ReconnectTimeoutOverride != nil { + memberTimeout = s.config.ReconnectTimeoutOverride.ReconnectTimeout(&m.Member, memberTimeout) + } + // Skip if the timeout is not yet reached - if now.Sub(m.leaveTime) <= timeout { + if now.Sub(m.leaveTime) <= memberTimeout { continue } @@ -1884,3 +1901,24 @@ func (s *Serf) NumNodes() (numNodes int) { return numNodes } + +// ValidateNodeNames verifies the NodeName contains +// only alphanumeric, -, or . and is under 128 chracters +func (s *Serf) ValidateNodeNames() error { + return s.validateNodeName(s.config.NodeName) +} + +func (s *Serf) validateNodeName(name string) error { + if s.config.ValidateNodeNames { + var InvalidNameRe = regexp.MustCompile(`[^A-Za-z0-9\-\.]+`) + if InvalidNameRe.MatchString(name) { + return fmt.Errorf("Node name contains invalid characters %v , Valid characters include "+ + "all alpha-numerics and dashes and '.' ", name) + } + if len(name) > MaxNodeNameLength { + return fmt.Errorf("Node name is %v characters. "+ + "Valid length is between 1 and 128 characters", len(name)) + } + } + return nil +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 5e436ebbb..04b5d1c51 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -280,7 +280,7 @@ github.com/hashicorp/net-rpc-msgpackrpc github.com/hashicorp/raft # github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea github.com/hashicorp/raft-boltdb -# github.com/hashicorp/serf v0.9.4 +# github.com/hashicorp/serf v0.9.5 github.com/hashicorp/serf/coordinate github.com/hashicorp/serf/serf # github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086 diff --git a/website/pages/docs/agent/options.mdx b/website/pages/docs/agent/options.mdx index a369400a8..3e8ca4600 100644 --- a/website/pages/docs/agent/options.mdx +++ b/website/pages/docs/agent/options.mdx @@ -834,6 +834,11 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'." - `advertise_addr_wan_ipv6` This was added together with [`advertise_addr_wan_ipv4`](#advertise_addr_wan_ipv4) to support dual stack IPv4/IPv6 environments. Using this, both IPv4 and IPv6 addresses can be specified and requested during eg service discovery. +- `advertise_reconnect_timeout` This is a per-agent setting of the [`reconnect_timeout`](#reconnect_timeout) parameter. + This agent will advertise to all other nodes in the cluster that after this timeout, the node may be completely + removed from the cluster. This may only be set on client agents and if unset then other nodes will use the main + `reconnect_timeout` setting when determing when this node may be removed from the cluster. + - `serf_lan` ((#serf_lan_bind)) Equivalent to the [`-serf-lan-bind` command-line flag](#_serf_lan_bind). - `serf_lan_allowed_cidrs` ((#serf_lan_allowed_cidrs)) Equivalent to the [`-serf-lan-allowed-cidrs` command-line flag](#_serf_lan_allowed_cidrs).