diff --git a/.changelog/8781.txt b/.changelog/8781.txt new file mode 100644 index 000000000..0ad1d7f5b --- /dev/null +++ b/.changelog/8781.txt @@ -0,0 +1,3 @@ +```release-note:feature +agent: Allow client agents to be configured with an advertised reconnect timeout to control how long until the nodes are reaped by others in the cluster. +``` \ No newline at end of file diff --git a/agent/agent.go b/agent/agent.go index 9edeb5344..ae64eb611 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1025,6 +1025,8 @@ func newConsulConfig(runtimeCfg *config.RuntimeConfig, logger hclog.Logger) (*co cfg.SerfWANConfig = nil } + cfg.AdvertiseReconnectTimeout = runtimeCfg.AdvertiseReconnectTimeout + cfg.RPCAddr = runtimeCfg.RPCBindAddr cfg.RPCAdvertise = runtimeCfg.RPCAdvertiseAddr diff --git a/agent/config/builder.go b/agent/config/builder.go index 666ff0b65..868743bf6 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -949,11 +949,12 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) { }, // Agent - AdvertiseAddrLAN: advertiseAddrLAN, - AdvertiseAddrWAN: advertiseAddrWAN, - BindAddr: bindAddr, - Bootstrap: b.boolVal(c.Bootstrap), - BootstrapExpect: b.intVal(c.BootstrapExpect), + AdvertiseAddrLAN: advertiseAddrLAN, + AdvertiseAddrWAN: advertiseAddrWAN, + AdvertiseReconnectTimeout: b.durationVal("advertise_reconnect_timeout", c.AdvertiseReconnectTimeout), + BindAddr: bindAddr, + Bootstrap: b.boolVal(c.Bootstrap), + BootstrapExpect: b.intVal(c.BootstrapExpect), Cache: cache.Options{ EntryFetchRate: rate.Limit( b.float64ValWithDefault(c.Cache.EntryFetchRate, float64(cache.DefaultEntryFetchRate)), @@ -1389,6 +1390,10 @@ func (b *Builder) Validate(rt RuntimeConfig) error { return fmt.Errorf("auto_encrypt.allow_tls can only be used on a server.") } + if rt.ServerMode && rt.AdvertiseReconnectTimeout != 0 { + return fmt.Errorf("advertise_reconnect_timeout can only be used on a client") + } + // ---------------------------------------------------------------- // warnings // diff --git a/agent/config/config.go b/agent/config/config.go index b66bcd0d8..346603e34 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -142,6 +142,7 @@ type Config struct { AdvertiseAddrWAN *string `json:"advertise_addr_wan,omitempty" hcl:"advertise_addr_wan" mapstructure:"advertise_addr_wan"` AdvertiseAddrWANIPv4 *string `json:"advertise_addr_wan_ipv4,omitempty" hcl:"advertise_addr_wan_ipv4" mapstructure:"advertise_addr_wan_ipv4"` AdvertiseAddrWANIPv6 *string `json:"advertise_addr_wan_ipv6,omitempty" hcl:"advertise_addr_wan_ipv6" mapstructure:"advertise_addr_ipv6"` + AdvertiseReconnectTimeout *string `json:"advertise_reconnect_timeout,omitempty" hcl:"advertise_reconnect_timeout" mapstructure:"advertise_reconnect_timeout"` AutoConfig AutoConfigRaw `json:"auto_config,omitempty" hcl:"auto_config" mapstructure:"auto_config"` Autopilot Autopilot `json:"autopilot,omitempty" hcl:"autopilot" mapstructure:"autopilot"` BindAddr *string `json:"bind_addr,omitempty" hcl:"bind_addr" mapstructure:"bind_addr"` diff --git a/agent/config/runtime.go b/agent/config/runtime.go index 34b870c2a..417b22495 100644 --- a/agent/config/runtime.go +++ b/agent/config/runtime.go @@ -980,6 +980,13 @@ type RuntimeConfig struct { // hcl: reconnect_timeout = "duration" ReconnectTimeoutWAN time.Duration + // AdvertiseReconnectTimeout specifies the amount of time other agents should + // wait for us to reconnect before deciding we are permanently gone. This + // should only be set for client agents that are run in a stateless or + // ephemeral manner in order to realize their deletion sooner than we + // would otherwise. + AdvertiseReconnectTimeout time.Duration + // RejoinAfterLeave controls our interaction with the cluster after leave. // When set to false (default), a leave causes Consul to not rejoin // the cluster until an explicit join is received. If this is set to diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index 9a0536484..29f145dd1 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -51,7 +51,7 @@ type configTest struct { // should check one option at a time if possible and should use generic // values, e.g. 'a' or 1 instead of 'servicex' or 3306. -func TestBuilder_BuildAndValide_ConfigFlagsAndEdgecases(t *testing.T) { +func TestBuilder_BuildAndValidate_ConfigFlagsAndEdgecases(t *testing.T) { dataDir := testutil.TempDir(t, "consul") defaultEntMeta := structs.DefaultEnterpriseMeta() @@ -4438,7 +4438,6 @@ func TestBuilder_BuildAndValide_ConfigFlagsAndEdgecases(t *testing.T) { rt.CertFile = "foo" }, }, - // UI Config tests { desc: "ui config deprecated", @@ -4601,6 +4600,23 @@ func TestBuilder_BuildAndValide_ConfigFlagsAndEdgecases(t *testing.T) { `}, err: `ui_config.dashboard_url_templates values must be a valid http or https URL.`, }, + + // Per node reconnect timeout test + { + desc: "server and advertised reconnect timeout error", + args: []string{ + `-data-dir=` + dataDir, + `-server`, + }, + hcl: []string{` + advertise_reconnect_timeout = "5s" + `}, + json: []string{` + { + "advertise_reconnect_timeout": "5s" + }`}, + err: "advertise_reconnect_timeout can only be used on a client", + }, } testConfig(t, tests, dataDir) @@ -4834,6 +4850,7 @@ func TestFullConfig(t *testing.T) { }, "advertise_addr": "17.99.29.16", "advertise_addr_wan": "78.63.37.19", + "advertise_reconnect_timeout": "0s", "audit": { "enabled": false }, @@ -5515,6 +5532,7 @@ func TestFullConfig(t *testing.T) { } advertise_addr = "17.99.29.16" advertise_addr_wan = "78.63.37.19" + advertise_reconnect_timeout = "0s" audit = { enabled = false } @@ -6295,6 +6313,7 @@ func TestFullConfig(t *testing.T) { ACLTokenReplication: true, AdvertiseAddrLAN: ipAddr("17.99.29.16"), AdvertiseAddrWAN: ipAddr("78.63.37.19"), + AdvertiseReconnectTimeout: 0 * time.Second, AutopilotCleanupDeadServers: true, AutopilotDisableUpgradeMigration: true, AutopilotLastContactThreshold: 12705 * time.Second, @@ -7278,6 +7297,7 @@ func TestSanitize(t *testing.T) { "AEInterval": "0s", "AdvertiseAddrLAN": "", "AdvertiseAddrWAN": "", + "AdvertiseReconnectTimeout": "0s", "AutopilotCleanupDeadServers": false, "AutopilotDisableUpgradeMigration": false, "AutopilotLastContactThreshold": "0s", diff --git a/agent/consul/acl_client.go b/agent/consul/acl_client.go index 17a961ef1..1b9c6a46d 100644 --- a/agent/consul/acl_client.go +++ b/agent/consul/acl_client.go @@ -6,7 +6,7 @@ import ( "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/structs" - "github.com/hashicorp/consul/lib" + "github.com/hashicorp/consul/lib/serf" ) var clientACLCacheConfig *structs.ACLCachesConfig = &structs.ACLCachesConfig{ @@ -123,5 +123,5 @@ func (c *Client) ResolveTokenAndDefaultMeta(token string, entMeta *structs.Enter func (c *Client) updateSerfTags(key, value string) { // Update the LAN serf - lib.UpdateSerfTag(c.serf, key, value) + serf.UpdateTag(c.serf, key, value) } diff --git a/agent/consul/acl_server.go b/agent/consul/acl_server.go index a2f1790f4..a495a312b 100644 --- a/agent/consul/acl_server.go +++ b/agent/consul/acl_server.go @@ -6,7 +6,7 @@ import ( "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/structs" - "github.com/hashicorp/consul/lib" + "github.com/hashicorp/consul/lib/serf" ) var serverACLCacheConfig *structs.ACLCachesConfig = &structs.ACLCachesConfig{ @@ -86,10 +86,10 @@ func (s *Server) checkBindingRuleUUID(id string) (bool, error) { func (s *Server) updateSerfTags(key, value string) { // Update the LAN serf - lib.UpdateSerfTag(s.serfLAN, key, value) + serf.UpdateTag(s.serfLAN, key, value) if s.serfWAN != nil { - lib.UpdateSerfTag(s.serfWAN, key, value) + serf.UpdateTag(s.serfWAN, key, value) } s.updateEnterpriseSerfTags(key, value) diff --git a/agent/consul/client_serf.go b/agent/consul/client_serf.go index fb0b999e1..1fa28ab4d 100644 --- a/agent/consul/client_serf.go +++ b/agent/consul/client_serf.go @@ -8,6 +8,7 @@ import ( "github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/lib" + libserf "github.com/hashicorp/consul/lib/serf" "github.com/hashicorp/consul/logging" "github.com/hashicorp/consul/types" "github.com/hashicorp/go-hclog" @@ -27,6 +28,9 @@ func (c *Client) setupSerf(conf *serf.Config, ch chan serf.Event, path string) ( conf.Tags["vsn_min"] = fmt.Sprintf("%d", ProtocolVersionMin) conf.Tags["vsn_max"] = fmt.Sprintf("%d", ProtocolVersionMax) conf.Tags["build"] = c.config.Build + if c.config.AdvertiseReconnectTimeout != 0 { + conf.Tags[libserf.ReconnectTimeoutTag] = c.config.AdvertiseReconnectTimeout.String() + } if c.acls.ACLsEnabled() { // we start in legacy mode and then transition to normal // mode once we know the cluster can handle it. @@ -65,6 +69,8 @@ func (c *Client) setupSerf(conf *serf.Config, ch chan serf.Event, path string) ( c.addEnterpriseSerfTags(conf.Tags) + conf.ReconnectTimeoutOverride = libserf.NewReconnectOverride(c.logger) + return serf.Create(conf) } diff --git a/agent/consul/client_test.go b/agent/consul/client_test.go index 6fe726681..4af8a4629 100644 --- a/agent/consul/client_test.go +++ b/agent/consul/client_test.go @@ -747,3 +747,34 @@ func TestClient_Reload(t *testing.T) { require.Equal(t, rate.Limit(1000), limiter.Limit()) require.Equal(t, 10000, limiter.Burst()) } + +func TestClient_ShortReconnectTimeout(t *testing.T) { + cluster := newTestCluster(t, &testClusterConfig{ + Datacenter: "dc1", + Servers: 1, + Clients: 2, + ServerConf: func(c *Config) { + c.SerfLANConfig.ReapInterval = 50 * time.Millisecond + }, + ClientConf: func(c *Config) { + c.SerfLANConfig.ReapInterval = 50 * time.Millisecond + c.AdvertiseReconnectTimeout = 100 * time.Millisecond + }, + }) + + // shutdown the client + cluster.Clients[1].Shutdown() + + // Now wait for it to be reaped. We set the advertised reconnect + // timeout to 100ms so we are going to check every 50 ms and allow + // up to 10x the time in the case of slow CI. + require.Eventually(t, + func() bool { + return len(cluster.Servers[0].LANMembers()) == 2 && + len(cluster.Clients[0].LANMembers()) == 2 + + }, + time.Second, + 50*time.Millisecond, + "The client node was not reaped within the alotted time") +} diff --git a/agent/consul/config.go b/agent/consul/config.go index 72d1fac05..ea77edcba 100644 --- a/agent/consul/config.go +++ b/agent/consul/config.go @@ -9,7 +9,7 @@ import ( "github.com/hashicorp/consul/agent/checks" "github.com/hashicorp/consul/agent/consul/autopilot" "github.com/hashicorp/consul/agent/structs" - "github.com/hashicorp/consul/lib" + libserf "github.com/hashicorp/consul/lib/serf" "github.com/hashicorp/consul/tlsutil" "github.com/hashicorp/consul/types" "github.com/hashicorp/consul/version" @@ -219,6 +219,11 @@ type Config struct { // true, we ignore the leave, and rejoin the cluster on start. RejoinAfterLeave bool + // AdvertiseReconnectTimeout is the duration after which this node should be + // assumed to not be returning and thus should be reaped within Serf. This + // can only be set for Client agents + AdvertiseReconnectTimeout time.Duration + // Build is a string that is gossiped around, and can be used to help // operators track which versions are actively deployed Build string @@ -544,8 +549,8 @@ func DefaultConfig() *Config { NodeName: hostname, RPCAddr: DefaultRPCAddr, RaftConfig: raft.DefaultConfig(), - SerfLANConfig: lib.SerfDefaultConfig(), - SerfWANConfig: lib.SerfDefaultConfig(), + SerfLANConfig: libserf.DefaultConfig(), + SerfWANConfig: libserf.DefaultConfig(), SerfFloodInterval: 60 * time.Second, ReconcileInterval: 60 * time.Second, ProtocolVersion: ProtocolVersion2Compatible, diff --git a/agent/consul/server_serf.go b/agent/consul/server_serf.go index 46e3110d4..a014d13bd 100644 --- a/agent/consul/server_serf.go +++ b/agent/consul/server_serf.go @@ -11,6 +11,7 @@ import ( "github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/lib" + libserf "github.com/hashicorp/consul/lib/serf" "github.com/hashicorp/consul/logging" "github.com/hashicorp/go-hclog" "github.com/hashicorp/memberlist" @@ -168,6 +169,8 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string, w return nil, err } + conf.ReconnectTimeoutOverride = libserf.NewReconnectOverride(s.logger) + s.addEnterpriseSerfTags(conf.Tags) if s.config.OverrideInitialSerfTags != nil { diff --git a/api/go.mod b/api/go.mod index d9902d403..becc75b41 100644 --- a/api/go.mod +++ b/api/go.mod @@ -10,7 +10,7 @@ require ( github.com/hashicorp/go-hclog v0.12.0 github.com/hashicorp/go-rootcerts v1.0.2 github.com/hashicorp/go-uuid v1.0.1 - github.com/hashicorp/serf v0.9.3 + github.com/hashicorp/serf v0.9.5 github.com/mitchellh/mapstructure v1.1.2 github.com/stretchr/testify v1.4.0 ) diff --git a/api/go.sum b/api/go.sum index 3c26420cd..57ef54399 100644 --- a/api/go.sum +++ b/api/go.sum @@ -41,8 +41,8 @@ github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO github.com/hashicorp/mdns v1.0.1/go.mod h1:4gW7WsVCke5TE7EPeYliwHlRUyBtfCwuFwuMg2DmyNY= github.com/hashicorp/memberlist v0.2.2 h1:5+RffWKwqJ71YPu9mWsF7ZOscZmwfasdA8kbdC7AO2g= github.com/hashicorp/memberlist v0.2.2/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE= -github.com/hashicorp/serf v0.9.3 h1:AVF6JDQQens6nMHT9OGERBvK0f8rPrAGILnsKLr6lzM= -github.com/hashicorp/serf v0.9.3/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk= +github.com/hashicorp/serf v0.9.5 h1:EBWvyu9tcRszt3Bxp3KNssBMP1KuHWyO51lz9+786iM= +github.com/hashicorp/serf v0.9.5/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk= github.com/kr/pretty v0.2.0 h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs= github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= diff --git a/go.mod b/go.mod index 2b8583aef..4308e153b 100644 --- a/go.mod +++ b/go.mod @@ -56,7 +56,7 @@ require ( github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69 github.com/hashicorp/raft v1.1.2 github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea - github.com/hashicorp/serf v0.9.4 + github.com/hashicorp/serf v0.9.5 github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086 github.com/hashicorp/yamux v0.0.0-20181012175058-2f1d1f20f75d github.com/imdario/mergo v0.3.6 diff --git a/go.sum b/go.sum index a22da45ca..e585b7470 100644 --- a/go.sum +++ b/go.sum @@ -288,9 +288,8 @@ github.com/hashicorp/raft v1.1.2 h1:oxEL5DDeurYxLd3UbcY/hccgSPhLLpiBZ1YxtWEq59c= github.com/hashicorp/raft v1.1.2/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8= github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea h1:xykPFhrBAS2J0VBzVa5e80b5ZtYuNQtgXjN40qBZlD4= github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk= -github.com/hashicorp/serf v0.9.3/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk= -github.com/hashicorp/serf v0.9.4 h1:xrZ4ZR0wT5Dz8oQHHdfOzr0ei1jMToWlFFz3hh/DI7I= -github.com/hashicorp/serf v0.9.4/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk= +github.com/hashicorp/serf v0.9.5 h1:EBWvyu9tcRszt3Bxp3KNssBMP1KuHWyO51lz9+786iM= +github.com/hashicorp/serf v0.9.5/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk= github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086 h1:OKsyxKi2sNmqm1Gv93adf2AID2FOBFdCbbZn9fGtIdg= github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086/go.mod h1:R3Umvhlxi2TN7Ex2hzOowyeNb+SfbVWI973N+ctaFMk= github.com/hashicorp/vault/sdk v0.1.14-0.20200519221838-e0cfd64bc267 h1:e1ok06zGrWJW91rzRroyl5nRNqraaBe4d5hiKcVZuHM= diff --git a/lib/serf.go b/lib/serf.go deleted file mode 100644 index c0b6e5126..000000000 --- a/lib/serf.go +++ /dev/null @@ -1,44 +0,0 @@ -package lib - -import ( - "time" - - "github.com/hashicorp/serf/serf" -) - -// SerfDefaultConfig returns a Consul-flavored Serf default configuration, -// suitable as a basis for a LAN, WAN, segment, or area. -func SerfDefaultConfig() *serf.Config { - base := serf.DefaultConfig() - - // This effectively disables the annoying queue depth warnings. - base.QueueDepthWarning = 1000000 - - // This enables dynamic sizing of the message queue depth based on the - // cluster size. - base.MinQueueDepth = 4096 - - // This gives leaves some time to propagate through the cluster before - // we shut down. The value was chosen to be reasonably short, but to - // allow a leave to get to over 99.99% of the cluster with 100k nodes - // (using https://www.serf.io/docs/internals/simulator.html). - base.LeavePropagateDelay = 3 * time.Second - - return base -} - -func GetSerfTags(serf *serf.Serf) map[string]string { - tags := make(map[string]string) - for tag, value := range serf.LocalMember().Tags { - tags[tag] = value - } - - return tags -} - -func UpdateSerfTag(serf *serf.Serf, tag, value string) { - tags := GetSerfTags(serf) - tags[tag] = value - - serf.SetTags(tags) -} diff --git a/lib/serf/serf.go b/lib/serf/serf.go new file mode 100644 index 000000000..8f97e24d8 --- /dev/null +++ b/lib/serf/serf.go @@ -0,0 +1,82 @@ +package serf + +import ( + "time" + + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/serf/serf" +) + +const ( + ReconnectTimeoutTag = "rc_tm" +) + +// DefaultConfig returns a Consul-flavored Serf default configuration, +// suitable as a basis for a LAN, WAN, segment, or area. +func DefaultConfig() *serf.Config { + base := serf.DefaultConfig() + + // This effectively disables the annoying queue depth warnings. + base.QueueDepthWarning = 1000000 + + // This enables dynamic sizing of the message queue depth based on the + // cluster size. + base.MinQueueDepth = 4096 + + // This gives leaves some time to propagate through the cluster before + // we shut down. The value was chosen to be reasonably short, but to + // allow a leave to get to over 99.99% of the cluster with 100k nodes + // (using https://www.serf.io/docs/internals/simulator.html). + base.LeavePropagateDelay = 3 * time.Second + + return base +} + +func GetTags(serf *serf.Serf) map[string]string { + tags := make(map[string]string) + for tag, value := range serf.LocalMember().Tags { + tags[tag] = value + } + + return tags +} + +func UpdateTag(serf *serf.Serf, tag, value string) { + tags := GetTags(serf) + tags[tag] = value + + serf.SetTags(tags) +} + +type ReconnectOverride struct { + logger hclog.Logger +} + +func NewReconnectOverride(logger hclog.Logger) *ReconnectOverride { + if logger == nil { + logger = hclog.Default() + } + + return &ReconnectOverride{ + logger: logger, + } +} + +func (r *ReconnectOverride) ReconnectTimeout(m *serf.Member, timeout time.Duration) time.Duration { + val, ok := m.Tags[ReconnectTimeoutTag] + if !ok { + return timeout + } + newTimeout, err := time.ParseDuration(val) + if err != nil { + r.logger.Warn("Member is advertising a malformed reconnect timeout", "member", m.Name, "rc_tm", val) + return timeout + } + + // ignore a timeout of 0 as that indicates the default should be used + if newTimeout == 0 { + return timeout + } + + return newTimeout +} diff --git a/vendor/github.com/hashicorp/serf/coordinate/phantom.go b/vendor/github.com/hashicorp/serf/coordinate/phantom.go index 6fb033c0c..66da4e2e9 100644 --- a/vendor/github.com/hashicorp/serf/coordinate/phantom.go +++ b/vendor/github.com/hashicorp/serf/coordinate/phantom.go @@ -11,7 +11,7 @@ import ( // given config. func GenerateClients(nodes int, config *Config) ([]*Client, error) { clients := make([]*Client, nodes) - for i, _ := range clients { + for i := range clients { client, err := NewClient(config) if err != nil { return nil, err @@ -146,7 +146,7 @@ func Simulate(clients []*Client, truth [][]time.Duration, cycles int) { nodes := len(clients) for cycle := 0; cycle < cycles; cycle++ { - for i, _ := range clients { + for i := range clients { if j := rand.Intn(nodes); j != i { c := clients[j].GetCoordinate() rtt := truth[i][j] diff --git a/vendor/github.com/hashicorp/serf/serf/config.go b/vendor/github.com/hashicorp/serf/serf/config.go index 2875e1d60..57b5a98e5 100644 --- a/vendor/github.com/hashicorp/serf/serf/config.go +++ b/vendor/github.com/hashicorp/serf/serf/config.go @@ -253,6 +253,15 @@ type Config struct { // // WARNING: this should ONLY be used in tests messageDropper func(typ messageType) bool + + // ReconnectTimeoutOverride is an optional interface which when present allows + // the application to cause reaping of a node to happen when it otherwise wouldn't + ReconnectTimeoutOverride ReconnectTimeoutOverrider + + // ValidateNodeNames controls whether nodenames only + // contain alphanumeric, dashes and '.'characters + // and sets maximum length to 128 characters + ValidateNodeNames bool } // Init allocates the subdata structures @@ -298,6 +307,7 @@ func DefaultConfig() *Config { QuerySizeLimit: 1024, EnableNameConflictResolution: true, DisableCoordinates: false, + ValidateNodeNames: false, UserEventSizeLimit: 512, } } diff --git a/vendor/github.com/hashicorp/serf/serf/merge_delegate.go b/vendor/github.com/hashicorp/serf/serf/merge_delegate.go index 36eb52f7d..2e1e7c5b5 100644 --- a/vendor/github.com/hashicorp/serf/serf/merge_delegate.go +++ b/vendor/github.com/hashicorp/serf/serf/merge_delegate.go @@ -1,6 +1,7 @@ package serf import ( + "fmt" "net" "github.com/hashicorp/memberlist" @@ -17,22 +18,31 @@ type mergeDelegate struct { func (m *mergeDelegate) NotifyMerge(nodes []*memberlist.Node) error { members := make([]*Member, len(nodes)) for idx, n := range nodes { - members[idx] = m.nodeToMember(n) + var err error + members[idx], err = m.nodeToMember(n) + if err != nil { + return err + } } return m.serf.config.Merge.NotifyMerge(members) } func (m *mergeDelegate) NotifyAlive(peer *memberlist.Node) error { - member := m.nodeToMember(peer) + member, err := m.nodeToMember(peer) + if err != nil { + return err + } return m.serf.config.Merge.NotifyMerge([]*Member{member}) } -func (m *mergeDelegate) nodeToMember(n *memberlist.Node) *Member { +func (m *mergeDelegate) nodeToMember(n *memberlist.Node) (*Member, error) { status := StatusNone if n.State == memberlist.StateLeft { status = StatusLeft } - + if err := m.validateMemberInfo(n); err != nil { + return nil, err + } return &Member{ Name: n.Name, Addr: net.IP(n.Addr), @@ -45,5 +55,22 @@ func (m *mergeDelegate) nodeToMember(n *memberlist.Node) *Member { DelegateMin: n.DMin, DelegateMax: n.DMax, DelegateCur: n.DCur, - } + }, nil +} + +// validateMemberInfo checks that the data we are sending is valid +func (m *mergeDelegate) validateMemberInfo(n *memberlist.Node) error { + if err := m.serf.validateNodeName(n.Name); err != nil { + return err + } + + if len(n.Addr) != 4 && len(n.Addr) != 16 { + return fmt.Errorf("IP byte length is invalid: %d bytes is not either 4 or 16", len(n.Addr)) + } + + if len(n.Meta) > memberlist.MetaMaxSize { + return fmt.Errorf("Encoded length of tags exceeds limit of %d bytes", + memberlist.MetaMaxSize) + } + return nil } diff --git a/vendor/github.com/hashicorp/serf/serf/serf.go b/vendor/github.com/hashicorp/serf/serf/serf.go index c04719082..8a7c069c2 100644 --- a/vendor/github.com/hashicorp/serf/serf/serf.go +++ b/vendor/github.com/hashicorp/serf/serf/serf.go @@ -11,6 +11,7 @@ import ( "math/rand" "net" "os" + "regexp" "strconv" "sync" "sync/atomic" @@ -36,6 +37,8 @@ const ( tagMagicByte uint8 = 255 ) +const MaxNodeNameLength int = 128 + var ( // FeatureNotSupported is returned if a feature cannot be used // due to an older protocol version being used. @@ -47,6 +50,12 @@ func init() { rand.Seed(time.Now().UnixNano()) } +// ReconnectTimeoutOverrider is an interface that can be implemented to allow overriding +// the reconnect timeout for individual members. +type ReconnectTimeoutOverrider interface { + ReconnectTimeout(member *Member, timeout time.Duration) time.Duration +} + // Serf is a single node that is part of a single cluster that gets // events about joins/leaves/failures/etc. It is created with the Create // method. @@ -269,6 +278,9 @@ func Create(conf *Config) (*Serf, error) { if len(serf.encodeTags(conf.Tags)) > memberlist.MetaMaxSize { return nil, fmt.Errorf("Encoded length of tags exceeds limit of %d bytes", memberlist.MetaMaxSize) } + if err := serf.ValidateNodeNames(); err != nil { + return nil, err + } // Check if serf member event coalescing is enabled if conf.CoalescePeriod > 0 && conf.QuiescentPeriod > 0 && conf.EventCh != nil { @@ -1571,8 +1583,13 @@ func (s *Serf) reap(old []*memberState, now time.Time, timeout time.Duration) [] for i := 0; i < n; i++ { m := old[i] + memberTimeout := timeout + if s.config.ReconnectTimeoutOverride != nil { + memberTimeout = s.config.ReconnectTimeoutOverride.ReconnectTimeout(&m.Member, memberTimeout) + } + // Skip if the timeout is not yet reached - if now.Sub(m.leaveTime) <= timeout { + if now.Sub(m.leaveTime) <= memberTimeout { continue } @@ -1884,3 +1901,24 @@ func (s *Serf) NumNodes() (numNodes int) { return numNodes } + +// ValidateNodeNames verifies the NodeName contains +// only alphanumeric, -, or . and is under 128 chracters +func (s *Serf) ValidateNodeNames() error { + return s.validateNodeName(s.config.NodeName) +} + +func (s *Serf) validateNodeName(name string) error { + if s.config.ValidateNodeNames { + var InvalidNameRe = regexp.MustCompile(`[^A-Za-z0-9\-\.]+`) + if InvalidNameRe.MatchString(name) { + return fmt.Errorf("Node name contains invalid characters %v , Valid characters include "+ + "all alpha-numerics and dashes and '.' ", name) + } + if len(name) > MaxNodeNameLength { + return fmt.Errorf("Node name is %v characters. "+ + "Valid length is between 1 and 128 characters", len(name)) + } + } + return nil +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 5e436ebbb..04b5d1c51 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -280,7 +280,7 @@ github.com/hashicorp/net-rpc-msgpackrpc github.com/hashicorp/raft # github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea github.com/hashicorp/raft-boltdb -# github.com/hashicorp/serf v0.9.4 +# github.com/hashicorp/serf v0.9.5 github.com/hashicorp/serf/coordinate github.com/hashicorp/serf/serf # github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086 diff --git a/website/pages/docs/agent/options.mdx b/website/pages/docs/agent/options.mdx index a369400a8..3e8ca4600 100644 --- a/website/pages/docs/agent/options.mdx +++ b/website/pages/docs/agent/options.mdx @@ -834,6 +834,11 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'." - `advertise_addr_wan_ipv6` This was added together with [`advertise_addr_wan_ipv4`](#advertise_addr_wan_ipv4) to support dual stack IPv4/IPv6 environments. Using this, both IPv4 and IPv6 addresses can be specified and requested during eg service discovery. +- `advertise_reconnect_timeout` This is a per-agent setting of the [`reconnect_timeout`](#reconnect_timeout) parameter. + This agent will advertise to all other nodes in the cluster that after this timeout, the node may be completely + removed from the cluster. This may only be set on client agents and if unset then other nodes will use the main + `reconnect_timeout` setting when determing when this node may be removed from the cluster. + - `serf_lan` ((#serf_lan_bind)) Equivalent to the [`-serf-lan-bind` command-line flag](#_serf_lan_bind). - `serf_lan_allowed_cidrs` ((#serf_lan_allowed_cidrs)) Equivalent to the [`-serf-lan-allowed-cidrs` command-line flag](#_serf_lan_allowed_cidrs).