Add per-agent reconnect timeouts (#8781)
This allows for client agent to be run in a more stateless manner where they may be abruptly terminated and not expected to come back. If advertising a per-agent reconnect timeout using the advertise_reconnect_timeout configuration when that agent leaves, other agents will wait only that amount of time for the agent to come back before reaping it. This has the advantageous side effect of causing servers to deregister the node/services/checks for that agent sooner than if the global reconnect_timeout was used.
This commit is contained in:
parent
8dd0fb836c
commit
141eb60f06
|
@ -0,0 +1,3 @@
|
|||
```release-note:feature
|
||||
agent: Allow client agents to be configured with an advertised reconnect timeout to control how long until the nodes are reaped by others in the cluster.
|
||||
```
|
|
@ -1025,6 +1025,8 @@ func newConsulConfig(runtimeCfg *config.RuntimeConfig, logger hclog.Logger) (*co
|
|||
cfg.SerfWANConfig = nil
|
||||
}
|
||||
|
||||
cfg.AdvertiseReconnectTimeout = runtimeCfg.AdvertiseReconnectTimeout
|
||||
|
||||
cfg.RPCAddr = runtimeCfg.RPCBindAddr
|
||||
cfg.RPCAdvertise = runtimeCfg.RPCAdvertiseAddr
|
||||
|
||||
|
|
|
@ -951,6 +951,7 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
|
|||
// Agent
|
||||
AdvertiseAddrLAN: advertiseAddrLAN,
|
||||
AdvertiseAddrWAN: advertiseAddrWAN,
|
||||
AdvertiseReconnectTimeout: b.durationVal("advertise_reconnect_timeout", c.AdvertiseReconnectTimeout),
|
||||
BindAddr: bindAddr,
|
||||
Bootstrap: b.boolVal(c.Bootstrap),
|
||||
BootstrapExpect: b.intVal(c.BootstrapExpect),
|
||||
|
@ -1389,6 +1390,10 @@ func (b *Builder) Validate(rt RuntimeConfig) error {
|
|||
return fmt.Errorf("auto_encrypt.allow_tls can only be used on a server.")
|
||||
}
|
||||
|
||||
if rt.ServerMode && rt.AdvertiseReconnectTimeout != 0 {
|
||||
return fmt.Errorf("advertise_reconnect_timeout can only be used on a client")
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// warnings
|
||||
//
|
||||
|
|
|
@ -142,6 +142,7 @@ type Config struct {
|
|||
AdvertiseAddrWAN *string `json:"advertise_addr_wan,omitempty" hcl:"advertise_addr_wan" mapstructure:"advertise_addr_wan"`
|
||||
AdvertiseAddrWANIPv4 *string `json:"advertise_addr_wan_ipv4,omitempty" hcl:"advertise_addr_wan_ipv4" mapstructure:"advertise_addr_wan_ipv4"`
|
||||
AdvertiseAddrWANIPv6 *string `json:"advertise_addr_wan_ipv6,omitempty" hcl:"advertise_addr_wan_ipv6" mapstructure:"advertise_addr_ipv6"`
|
||||
AdvertiseReconnectTimeout *string `json:"advertise_reconnect_timeout,omitempty" hcl:"advertise_reconnect_timeout" mapstructure:"advertise_reconnect_timeout"`
|
||||
AutoConfig AutoConfigRaw `json:"auto_config,omitempty" hcl:"auto_config" mapstructure:"auto_config"`
|
||||
Autopilot Autopilot `json:"autopilot,omitempty" hcl:"autopilot" mapstructure:"autopilot"`
|
||||
BindAddr *string `json:"bind_addr,omitempty" hcl:"bind_addr" mapstructure:"bind_addr"`
|
||||
|
|
|
@ -980,6 +980,13 @@ type RuntimeConfig struct {
|
|||
// hcl: reconnect_timeout = "duration"
|
||||
ReconnectTimeoutWAN time.Duration
|
||||
|
||||
// AdvertiseReconnectTimeout specifies the amount of time other agents should
|
||||
// wait for us to reconnect before deciding we are permanently gone. This
|
||||
// should only be set for client agents that are run in a stateless or
|
||||
// ephemeral manner in order to realize their deletion sooner than we
|
||||
// would otherwise.
|
||||
AdvertiseReconnectTimeout time.Duration
|
||||
|
||||
// RejoinAfterLeave controls our interaction with the cluster after leave.
|
||||
// When set to false (default), a leave causes Consul to not rejoin
|
||||
// the cluster until an explicit join is received. If this is set to
|
||||
|
|
|
@ -51,7 +51,7 @@ type configTest struct {
|
|||
// should check one option at a time if possible and should use generic
|
||||
// values, e.g. 'a' or 1 instead of 'servicex' or 3306.
|
||||
|
||||
func TestBuilder_BuildAndValide_ConfigFlagsAndEdgecases(t *testing.T) {
|
||||
func TestBuilder_BuildAndValidate_ConfigFlagsAndEdgecases(t *testing.T) {
|
||||
dataDir := testutil.TempDir(t, "consul")
|
||||
|
||||
defaultEntMeta := structs.DefaultEnterpriseMeta()
|
||||
|
@ -4438,7 +4438,6 @@ func TestBuilder_BuildAndValide_ConfigFlagsAndEdgecases(t *testing.T) {
|
|||
rt.CertFile = "foo"
|
||||
},
|
||||
},
|
||||
|
||||
// UI Config tests
|
||||
{
|
||||
desc: "ui config deprecated",
|
||||
|
@ -4601,6 +4600,23 @@ func TestBuilder_BuildAndValide_ConfigFlagsAndEdgecases(t *testing.T) {
|
|||
`},
|
||||
err: `ui_config.dashboard_url_templates values must be a valid http or https URL.`,
|
||||
},
|
||||
|
||||
// Per node reconnect timeout test
|
||||
{
|
||||
desc: "server and advertised reconnect timeout error",
|
||||
args: []string{
|
||||
`-data-dir=` + dataDir,
|
||||
`-server`,
|
||||
},
|
||||
hcl: []string{`
|
||||
advertise_reconnect_timeout = "5s"
|
||||
`},
|
||||
json: []string{`
|
||||
{
|
||||
"advertise_reconnect_timeout": "5s"
|
||||
}`},
|
||||
err: "advertise_reconnect_timeout can only be used on a client",
|
||||
},
|
||||
}
|
||||
|
||||
testConfig(t, tests, dataDir)
|
||||
|
@ -4834,6 +4850,7 @@ func TestFullConfig(t *testing.T) {
|
|||
},
|
||||
"advertise_addr": "17.99.29.16",
|
||||
"advertise_addr_wan": "78.63.37.19",
|
||||
"advertise_reconnect_timeout": "0s",
|
||||
"audit": {
|
||||
"enabled": false
|
||||
},
|
||||
|
@ -5515,6 +5532,7 @@ func TestFullConfig(t *testing.T) {
|
|||
}
|
||||
advertise_addr = "17.99.29.16"
|
||||
advertise_addr_wan = "78.63.37.19"
|
||||
advertise_reconnect_timeout = "0s"
|
||||
audit = {
|
||||
enabled = false
|
||||
}
|
||||
|
@ -6295,6 +6313,7 @@ func TestFullConfig(t *testing.T) {
|
|||
ACLTokenReplication: true,
|
||||
AdvertiseAddrLAN: ipAddr("17.99.29.16"),
|
||||
AdvertiseAddrWAN: ipAddr("78.63.37.19"),
|
||||
AdvertiseReconnectTimeout: 0 * time.Second,
|
||||
AutopilotCleanupDeadServers: true,
|
||||
AutopilotDisableUpgradeMigration: true,
|
||||
AutopilotLastContactThreshold: 12705 * time.Second,
|
||||
|
@ -7278,6 +7297,7 @@ func TestSanitize(t *testing.T) {
|
|||
"AEInterval": "0s",
|
||||
"AdvertiseAddrLAN": "",
|
||||
"AdvertiseAddrWAN": "",
|
||||
"AdvertiseReconnectTimeout": "0s",
|
||||
"AutopilotCleanupDeadServers": false,
|
||||
"AutopilotDisableUpgradeMigration": false,
|
||||
"AutopilotLastContactThreshold": "0s",
|
||||
|
|
|
@ -6,7 +6,7 @@ import (
|
|||
|
||||
"github.com/hashicorp/consul/acl"
|
||||
"github.com/hashicorp/consul/agent/structs"
|
||||
"github.com/hashicorp/consul/lib"
|
||||
"github.com/hashicorp/consul/lib/serf"
|
||||
)
|
||||
|
||||
var clientACLCacheConfig *structs.ACLCachesConfig = &structs.ACLCachesConfig{
|
||||
|
@ -123,5 +123,5 @@ func (c *Client) ResolveTokenAndDefaultMeta(token string, entMeta *structs.Enter
|
|||
|
||||
func (c *Client) updateSerfTags(key, value string) {
|
||||
// Update the LAN serf
|
||||
lib.UpdateSerfTag(c.serf, key, value)
|
||||
serf.UpdateTag(c.serf, key, value)
|
||||
}
|
||||
|
|
|
@ -6,7 +6,7 @@ import (
|
|||
|
||||
"github.com/hashicorp/consul/acl"
|
||||
"github.com/hashicorp/consul/agent/structs"
|
||||
"github.com/hashicorp/consul/lib"
|
||||
"github.com/hashicorp/consul/lib/serf"
|
||||
)
|
||||
|
||||
var serverACLCacheConfig *structs.ACLCachesConfig = &structs.ACLCachesConfig{
|
||||
|
@ -86,10 +86,10 @@ func (s *Server) checkBindingRuleUUID(id string) (bool, error) {
|
|||
|
||||
func (s *Server) updateSerfTags(key, value string) {
|
||||
// Update the LAN serf
|
||||
lib.UpdateSerfTag(s.serfLAN, key, value)
|
||||
serf.UpdateTag(s.serfLAN, key, value)
|
||||
|
||||
if s.serfWAN != nil {
|
||||
lib.UpdateSerfTag(s.serfWAN, key, value)
|
||||
serf.UpdateTag(s.serfWAN, key, value)
|
||||
}
|
||||
|
||||
s.updateEnterpriseSerfTags(key, value)
|
||||
|
|
|
@ -8,6 +8,7 @@ import (
|
|||
"github.com/hashicorp/consul/agent/metadata"
|
||||
"github.com/hashicorp/consul/agent/structs"
|
||||
"github.com/hashicorp/consul/lib"
|
||||
libserf "github.com/hashicorp/consul/lib/serf"
|
||||
"github.com/hashicorp/consul/logging"
|
||||
"github.com/hashicorp/consul/types"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
|
@ -27,6 +28,9 @@ func (c *Client) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (
|
|||
conf.Tags["vsn_min"] = fmt.Sprintf("%d", ProtocolVersionMin)
|
||||
conf.Tags["vsn_max"] = fmt.Sprintf("%d", ProtocolVersionMax)
|
||||
conf.Tags["build"] = c.config.Build
|
||||
if c.config.AdvertiseReconnectTimeout != 0 {
|
||||
conf.Tags[libserf.ReconnectTimeoutTag] = c.config.AdvertiseReconnectTimeout.String()
|
||||
}
|
||||
if c.acls.ACLsEnabled() {
|
||||
// we start in legacy mode and then transition to normal
|
||||
// mode once we know the cluster can handle it.
|
||||
|
@ -65,6 +69,8 @@ func (c *Client) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (
|
|||
|
||||
c.addEnterpriseSerfTags(conf.Tags)
|
||||
|
||||
conf.ReconnectTimeoutOverride = libserf.NewReconnectOverride(c.logger)
|
||||
|
||||
return serf.Create(conf)
|
||||
}
|
||||
|
||||
|
|
|
@ -747,3 +747,34 @@ func TestClient_Reload(t *testing.T) {
|
|||
require.Equal(t, rate.Limit(1000), limiter.Limit())
|
||||
require.Equal(t, 10000, limiter.Burst())
|
||||
}
|
||||
|
||||
func TestClient_ShortReconnectTimeout(t *testing.T) {
|
||||
cluster := newTestCluster(t, &testClusterConfig{
|
||||
Datacenter: "dc1",
|
||||
Servers: 1,
|
||||
Clients: 2,
|
||||
ServerConf: func(c *Config) {
|
||||
c.SerfLANConfig.ReapInterval = 50 * time.Millisecond
|
||||
},
|
||||
ClientConf: func(c *Config) {
|
||||
c.SerfLANConfig.ReapInterval = 50 * time.Millisecond
|
||||
c.AdvertiseReconnectTimeout = 100 * time.Millisecond
|
||||
},
|
||||
})
|
||||
|
||||
// shutdown the client
|
||||
cluster.Clients[1].Shutdown()
|
||||
|
||||
// Now wait for it to be reaped. We set the advertised reconnect
|
||||
// timeout to 100ms so we are going to check every 50 ms and allow
|
||||
// up to 10x the time in the case of slow CI.
|
||||
require.Eventually(t,
|
||||
func() bool {
|
||||
return len(cluster.Servers[0].LANMembers()) == 2 &&
|
||||
len(cluster.Clients[0].LANMembers()) == 2
|
||||
|
||||
},
|
||||
time.Second,
|
||||
50*time.Millisecond,
|
||||
"The client node was not reaped within the alotted time")
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@ import (
|
|||
"github.com/hashicorp/consul/agent/checks"
|
||||
"github.com/hashicorp/consul/agent/consul/autopilot"
|
||||
"github.com/hashicorp/consul/agent/structs"
|
||||
"github.com/hashicorp/consul/lib"
|
||||
libserf "github.com/hashicorp/consul/lib/serf"
|
||||
"github.com/hashicorp/consul/tlsutil"
|
||||
"github.com/hashicorp/consul/types"
|
||||
"github.com/hashicorp/consul/version"
|
||||
|
@ -219,6 +219,11 @@ type Config struct {
|
|||
// true, we ignore the leave, and rejoin the cluster on start.
|
||||
RejoinAfterLeave bool
|
||||
|
||||
// AdvertiseReconnectTimeout is the duration after which this node should be
|
||||
// assumed to not be returning and thus should be reaped within Serf. This
|
||||
// can only be set for Client agents
|
||||
AdvertiseReconnectTimeout time.Duration
|
||||
|
||||
// Build is a string that is gossiped around, and can be used to help
|
||||
// operators track which versions are actively deployed
|
||||
Build string
|
||||
|
@ -544,8 +549,8 @@ func DefaultConfig() *Config {
|
|||
NodeName: hostname,
|
||||
RPCAddr: DefaultRPCAddr,
|
||||
RaftConfig: raft.DefaultConfig(),
|
||||
SerfLANConfig: lib.SerfDefaultConfig(),
|
||||
SerfWANConfig: lib.SerfDefaultConfig(),
|
||||
SerfLANConfig: libserf.DefaultConfig(),
|
||||
SerfWANConfig: libserf.DefaultConfig(),
|
||||
SerfFloodInterval: 60 * time.Second,
|
||||
ReconcileInterval: 60 * time.Second,
|
||||
ProtocolVersion: ProtocolVersion2Compatible,
|
||||
|
|
|
@ -11,6 +11,7 @@ import (
|
|||
"github.com/hashicorp/consul/agent/metadata"
|
||||
"github.com/hashicorp/consul/agent/structs"
|
||||
"github.com/hashicorp/consul/lib"
|
||||
libserf "github.com/hashicorp/consul/lib/serf"
|
||||
"github.com/hashicorp/consul/logging"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/memberlist"
|
||||
|
@ -168,6 +169,8 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string, w
|
|||
return nil, err
|
||||
}
|
||||
|
||||
conf.ReconnectTimeoutOverride = libserf.NewReconnectOverride(s.logger)
|
||||
|
||||
s.addEnterpriseSerfTags(conf.Tags)
|
||||
|
||||
if s.config.OverrideInitialSerfTags != nil {
|
||||
|
|
|
@ -10,7 +10,7 @@ require (
|
|||
github.com/hashicorp/go-hclog v0.12.0
|
||||
github.com/hashicorp/go-rootcerts v1.0.2
|
||||
github.com/hashicorp/go-uuid v1.0.1
|
||||
github.com/hashicorp/serf v0.9.3
|
||||
github.com/hashicorp/serf v0.9.5
|
||||
github.com/mitchellh/mapstructure v1.1.2
|
||||
github.com/stretchr/testify v1.4.0
|
||||
)
|
||||
|
|
|
@ -41,8 +41,8 @@ github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO
|
|||
github.com/hashicorp/mdns v1.0.1/go.mod h1:4gW7WsVCke5TE7EPeYliwHlRUyBtfCwuFwuMg2DmyNY=
|
||||
github.com/hashicorp/memberlist v0.2.2 h1:5+RffWKwqJ71YPu9mWsF7ZOscZmwfasdA8kbdC7AO2g=
|
||||
github.com/hashicorp/memberlist v0.2.2/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE=
|
||||
github.com/hashicorp/serf v0.9.3 h1:AVF6JDQQens6nMHT9OGERBvK0f8rPrAGILnsKLr6lzM=
|
||||
github.com/hashicorp/serf v0.9.3/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk=
|
||||
github.com/hashicorp/serf v0.9.5 h1:EBWvyu9tcRszt3Bxp3KNssBMP1KuHWyO51lz9+786iM=
|
||||
github.com/hashicorp/serf v0.9.5/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk=
|
||||
github.com/kr/pretty v0.2.0 h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs=
|
||||
github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
|
||||
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
|
||||
|
|
2
go.mod
2
go.mod
|
@ -56,7 +56,7 @@ require (
|
|||
github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69
|
||||
github.com/hashicorp/raft v1.1.2
|
||||
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea
|
||||
github.com/hashicorp/serf v0.9.4
|
||||
github.com/hashicorp/serf v0.9.5
|
||||
github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086
|
||||
github.com/hashicorp/yamux v0.0.0-20181012175058-2f1d1f20f75d
|
||||
github.com/imdario/mergo v0.3.6
|
||||
|
|
5
go.sum
5
go.sum
|
@ -288,9 +288,8 @@ github.com/hashicorp/raft v1.1.2 h1:oxEL5DDeurYxLd3UbcY/hccgSPhLLpiBZ1YxtWEq59c=
|
|||
github.com/hashicorp/raft v1.1.2/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
|
||||
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea h1:xykPFhrBAS2J0VBzVa5e80b5ZtYuNQtgXjN40qBZlD4=
|
||||
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk=
|
||||
github.com/hashicorp/serf v0.9.3/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk=
|
||||
github.com/hashicorp/serf v0.9.4 h1:xrZ4ZR0wT5Dz8oQHHdfOzr0ei1jMToWlFFz3hh/DI7I=
|
||||
github.com/hashicorp/serf v0.9.4/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk=
|
||||
github.com/hashicorp/serf v0.9.5 h1:EBWvyu9tcRszt3Bxp3KNssBMP1KuHWyO51lz9+786iM=
|
||||
github.com/hashicorp/serf v0.9.5/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk=
|
||||
github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086 h1:OKsyxKi2sNmqm1Gv93adf2AID2FOBFdCbbZn9fGtIdg=
|
||||
github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086/go.mod h1:R3Umvhlxi2TN7Ex2hzOowyeNb+SfbVWI973N+ctaFMk=
|
||||
github.com/hashicorp/vault/sdk v0.1.14-0.20200519221838-e0cfd64bc267 h1:e1ok06zGrWJW91rzRroyl5nRNqraaBe4d5hiKcVZuHM=
|
||||
|
|
44
lib/serf.go
44
lib/serf.go
|
@ -1,44 +0,0 @@
|
|||
package lib
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/serf/serf"
|
||||
)
|
||||
|
||||
// SerfDefaultConfig returns a Consul-flavored Serf default configuration,
|
||||
// suitable as a basis for a LAN, WAN, segment, or area.
|
||||
func SerfDefaultConfig() *serf.Config {
|
||||
base := serf.DefaultConfig()
|
||||
|
||||
// This effectively disables the annoying queue depth warnings.
|
||||
base.QueueDepthWarning = 1000000
|
||||
|
||||
// This enables dynamic sizing of the message queue depth based on the
|
||||
// cluster size.
|
||||
base.MinQueueDepth = 4096
|
||||
|
||||
// This gives leaves some time to propagate through the cluster before
|
||||
// we shut down. The value was chosen to be reasonably short, but to
|
||||
// allow a leave to get to over 99.99% of the cluster with 100k nodes
|
||||
// (using https://www.serf.io/docs/internals/simulator.html).
|
||||
base.LeavePropagateDelay = 3 * time.Second
|
||||
|
||||
return base
|
||||
}
|
||||
|
||||
func GetSerfTags(serf *serf.Serf) map[string]string {
|
||||
tags := make(map[string]string)
|
||||
for tag, value := range serf.LocalMember().Tags {
|
||||
tags[tag] = value
|
||||
}
|
||||
|
||||
return tags
|
||||
}
|
||||
|
||||
func UpdateSerfTag(serf *serf.Serf, tag, value string) {
|
||||
tags := GetSerfTags(serf)
|
||||
tags[tag] = value
|
||||
|
||||
serf.SetTags(tags)
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
package serf
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/serf/serf"
|
||||
)
|
||||
|
||||
const (
|
||||
ReconnectTimeoutTag = "rc_tm"
|
||||
)
|
||||
|
||||
// DefaultConfig returns a Consul-flavored Serf default configuration,
|
||||
// suitable as a basis for a LAN, WAN, segment, or area.
|
||||
func DefaultConfig() *serf.Config {
|
||||
base := serf.DefaultConfig()
|
||||
|
||||
// This effectively disables the annoying queue depth warnings.
|
||||
base.QueueDepthWarning = 1000000
|
||||
|
||||
// This enables dynamic sizing of the message queue depth based on the
|
||||
// cluster size.
|
||||
base.MinQueueDepth = 4096
|
||||
|
||||
// This gives leaves some time to propagate through the cluster before
|
||||
// we shut down. The value was chosen to be reasonably short, but to
|
||||
// allow a leave to get to over 99.99% of the cluster with 100k nodes
|
||||
// (using https://www.serf.io/docs/internals/simulator.html).
|
||||
base.LeavePropagateDelay = 3 * time.Second
|
||||
|
||||
return base
|
||||
}
|
||||
|
||||
func GetTags(serf *serf.Serf) map[string]string {
|
||||
tags := make(map[string]string)
|
||||
for tag, value := range serf.LocalMember().Tags {
|
||||
tags[tag] = value
|
||||
}
|
||||
|
||||
return tags
|
||||
}
|
||||
|
||||
func UpdateTag(serf *serf.Serf, tag, value string) {
|
||||
tags := GetTags(serf)
|
||||
tags[tag] = value
|
||||
|
||||
serf.SetTags(tags)
|
||||
}
|
||||
|
||||
type ReconnectOverride struct {
|
||||
logger hclog.Logger
|
||||
}
|
||||
|
||||
func NewReconnectOverride(logger hclog.Logger) *ReconnectOverride {
|
||||
if logger == nil {
|
||||
logger = hclog.Default()
|
||||
}
|
||||
|
||||
return &ReconnectOverride{
|
||||
logger: logger,
|
||||
}
|
||||
}
|
||||
|
||||
func (r *ReconnectOverride) ReconnectTimeout(m *serf.Member, timeout time.Duration) time.Duration {
|
||||
val, ok := m.Tags[ReconnectTimeoutTag]
|
||||
if !ok {
|
||||
return timeout
|
||||
}
|
||||
newTimeout, err := time.ParseDuration(val)
|
||||
if err != nil {
|
||||
r.logger.Warn("Member is advertising a malformed reconnect timeout", "member", m.Name, "rc_tm", val)
|
||||
return timeout
|
||||
}
|
||||
|
||||
// ignore a timeout of 0 as that indicates the default should be used
|
||||
if newTimeout == 0 {
|
||||
return timeout
|
||||
}
|
||||
|
||||
return newTimeout
|
||||
}
|
|
@ -11,7 +11,7 @@ import (
|
|||
// given config.
|
||||
func GenerateClients(nodes int, config *Config) ([]*Client, error) {
|
||||
clients := make([]*Client, nodes)
|
||||
for i, _ := range clients {
|
||||
for i := range clients {
|
||||
client, err := NewClient(config)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -146,7 +146,7 @@ func Simulate(clients []*Client, truth [][]time.Duration, cycles int) {
|
|||
|
||||
nodes := len(clients)
|
||||
for cycle := 0; cycle < cycles; cycle++ {
|
||||
for i, _ := range clients {
|
||||
for i := range clients {
|
||||
if j := rand.Intn(nodes); j != i {
|
||||
c := clients[j].GetCoordinate()
|
||||
rtt := truth[i][j]
|
||||
|
|
|
@ -253,6 +253,15 @@ type Config struct {
|
|||
//
|
||||
// WARNING: this should ONLY be used in tests
|
||||
messageDropper func(typ messageType) bool
|
||||
|
||||
// ReconnectTimeoutOverride is an optional interface which when present allows
|
||||
// the application to cause reaping of a node to happen when it otherwise wouldn't
|
||||
ReconnectTimeoutOverride ReconnectTimeoutOverrider
|
||||
|
||||
// ValidateNodeNames controls whether nodenames only
|
||||
// contain alphanumeric, dashes and '.'characters
|
||||
// and sets maximum length to 128 characters
|
||||
ValidateNodeNames bool
|
||||
}
|
||||
|
||||
// Init allocates the subdata structures
|
||||
|
@ -298,6 +307,7 @@ func DefaultConfig() *Config {
|
|||
QuerySizeLimit: 1024,
|
||||
EnableNameConflictResolution: true,
|
||||
DisableCoordinates: false,
|
||||
ValidateNodeNames: false,
|
||||
UserEventSizeLimit: 512,
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package serf
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
|
||||
"github.com/hashicorp/memberlist"
|
||||
|
@ -17,22 +18,31 @@ type mergeDelegate struct {
|
|||
func (m *mergeDelegate) NotifyMerge(nodes []*memberlist.Node) error {
|
||||
members := make([]*Member, len(nodes))
|
||||
for idx, n := range nodes {
|
||||
members[idx] = m.nodeToMember(n)
|
||||
var err error
|
||||
members[idx], err = m.nodeToMember(n)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return m.serf.config.Merge.NotifyMerge(members)
|
||||
}
|
||||
|
||||
func (m *mergeDelegate) NotifyAlive(peer *memberlist.Node) error {
|
||||
member := m.nodeToMember(peer)
|
||||
member, err := m.nodeToMember(peer)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return m.serf.config.Merge.NotifyMerge([]*Member{member})
|
||||
}
|
||||
|
||||
func (m *mergeDelegate) nodeToMember(n *memberlist.Node) *Member {
|
||||
func (m *mergeDelegate) nodeToMember(n *memberlist.Node) (*Member, error) {
|
||||
status := StatusNone
|
||||
if n.State == memberlist.StateLeft {
|
||||
status = StatusLeft
|
||||
}
|
||||
|
||||
if err := m.validateMemberInfo(n); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &Member{
|
||||
Name: n.Name,
|
||||
Addr: net.IP(n.Addr),
|
||||
|
@ -45,5 +55,22 @@ func (m *mergeDelegate) nodeToMember(n *memberlist.Node) *Member {
|
|||
DelegateMin: n.DMin,
|
||||
DelegateMax: n.DMax,
|
||||
DelegateCur: n.DCur,
|
||||
}
|
||||
}, nil
|
||||
}
|
||||
|
||||
// validateMemberInfo checks that the data we are sending is valid
|
||||
func (m *mergeDelegate) validateMemberInfo(n *memberlist.Node) error {
|
||||
if err := m.serf.validateNodeName(n.Name); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if len(n.Addr) != 4 && len(n.Addr) != 16 {
|
||||
return fmt.Errorf("IP byte length is invalid: %d bytes is not either 4 or 16", len(n.Addr))
|
||||
}
|
||||
|
||||
if len(n.Meta) > memberlist.MetaMaxSize {
|
||||
return fmt.Errorf("Encoded length of tags exceeds limit of %d bytes",
|
||||
memberlist.MetaMaxSize)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@ import (
|
|||
"math/rand"
|
||||
"net"
|
||||
"os"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
@ -36,6 +37,8 @@ const (
|
|||
tagMagicByte uint8 = 255
|
||||
)
|
||||
|
||||
const MaxNodeNameLength int = 128
|
||||
|
||||
var (
|
||||
// FeatureNotSupported is returned if a feature cannot be used
|
||||
// due to an older protocol version being used.
|
||||
|
@ -47,6 +50,12 @@ func init() {
|
|||
rand.Seed(time.Now().UnixNano())
|
||||
}
|
||||
|
||||
// ReconnectTimeoutOverrider is an interface that can be implemented to allow overriding
|
||||
// the reconnect timeout for individual members.
|
||||
type ReconnectTimeoutOverrider interface {
|
||||
ReconnectTimeout(member *Member, timeout time.Duration) time.Duration
|
||||
}
|
||||
|
||||
// Serf is a single node that is part of a single cluster that gets
|
||||
// events about joins/leaves/failures/etc. It is created with the Create
|
||||
// method.
|
||||
|
@ -269,6 +278,9 @@ func Create(conf *Config) (*Serf, error) {
|
|||
if len(serf.encodeTags(conf.Tags)) > memberlist.MetaMaxSize {
|
||||
return nil, fmt.Errorf("Encoded length of tags exceeds limit of %d bytes", memberlist.MetaMaxSize)
|
||||
}
|
||||
if err := serf.ValidateNodeNames(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Check if serf member event coalescing is enabled
|
||||
if conf.CoalescePeriod > 0 && conf.QuiescentPeriod > 0 && conf.EventCh != nil {
|
||||
|
@ -1571,8 +1583,13 @@ func (s *Serf) reap(old []*memberState, now time.Time, timeout time.Duration) []
|
|||
for i := 0; i < n; i++ {
|
||||
m := old[i]
|
||||
|
||||
memberTimeout := timeout
|
||||
if s.config.ReconnectTimeoutOverride != nil {
|
||||
memberTimeout = s.config.ReconnectTimeoutOverride.ReconnectTimeout(&m.Member, memberTimeout)
|
||||
}
|
||||
|
||||
// Skip if the timeout is not yet reached
|
||||
if now.Sub(m.leaveTime) <= timeout {
|
||||
if now.Sub(m.leaveTime) <= memberTimeout {
|
||||
continue
|
||||
}
|
||||
|
||||
|
@ -1884,3 +1901,24 @@ func (s *Serf) NumNodes() (numNodes int) {
|
|||
|
||||
return numNodes
|
||||
}
|
||||
|
||||
// ValidateNodeNames verifies the NodeName contains
|
||||
// only alphanumeric, -, or . and is under 128 chracters
|
||||
func (s *Serf) ValidateNodeNames() error {
|
||||
return s.validateNodeName(s.config.NodeName)
|
||||
}
|
||||
|
||||
func (s *Serf) validateNodeName(name string) error {
|
||||
if s.config.ValidateNodeNames {
|
||||
var InvalidNameRe = regexp.MustCompile(`[^A-Za-z0-9\-\.]+`)
|
||||
if InvalidNameRe.MatchString(name) {
|
||||
return fmt.Errorf("Node name contains invalid characters %v , Valid characters include "+
|
||||
"all alpha-numerics and dashes and '.' ", name)
|
||||
}
|
||||
if len(name) > MaxNodeNameLength {
|
||||
return fmt.Errorf("Node name is %v characters. "+
|
||||
"Valid length is between 1 and 128 characters", len(name))
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -280,7 +280,7 @@ github.com/hashicorp/net-rpc-msgpackrpc
|
|||
github.com/hashicorp/raft
|
||||
# github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea
|
||||
github.com/hashicorp/raft-boltdb
|
||||
# github.com/hashicorp/serf v0.9.4
|
||||
# github.com/hashicorp/serf v0.9.5
|
||||
github.com/hashicorp/serf/coordinate
|
||||
github.com/hashicorp/serf/serf
|
||||
# github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086
|
||||
|
|
|
@ -834,6 +834,11 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'."
|
|||
|
||||
- `advertise_addr_wan_ipv6` This was added together with [`advertise_addr_wan_ipv4`](#advertise_addr_wan_ipv4) to support dual stack IPv4/IPv6 environments. Using this, both IPv4 and IPv6 addresses can be specified and requested during eg service discovery.
|
||||
|
||||
- `advertise_reconnect_timeout` This is a per-agent setting of the [`reconnect_timeout`](#reconnect_timeout) parameter.
|
||||
This agent will advertise to all other nodes in the cluster that after this timeout, the node may be completely
|
||||
removed from the cluster. This may only be set on client agents and if unset then other nodes will use the main
|
||||
`reconnect_timeout` setting when determing when this node may be removed from the cluster.
|
||||
|
||||
- `serf_lan` ((#serf_lan_bind)) Equivalent to the [`-serf-lan-bind` command-line flag](#_serf_lan_bind).
|
||||
|
||||
- `serf_lan_allowed_cidrs` ((#serf_lan_allowed_cidrs)) Equivalent to the [`-serf-lan-allowed-cidrs` command-line flag](#_serf_lan_allowed_cidrs).
|
||||
|
|
Loading…
Reference in New Issue