Update serf to pick up graceful leave fix

commit ca5758741b
parent b8c071e0f9
Author: Preetha Appan
Date:   2018-05-10 11:16:24 -05:00
5 changed files with 27 additions and 5 deletions

nomad/server.go

@@ -1245,6 +1245,10 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (
    }
    conf.ProtocolVersion = protocolVersionMap[s.config.ProtocolVersion]
    conf.RejoinAfterLeave = true
+   // LeavePropagateDelay is used to make sure broadcasted leave intents propagate
+   // This value was tuned using https://www.serf.io/docs/internals/simulator.html to
+   // allow for convergence in 99.9% of nodes in a 10 node cluster
+   conf.LeavePropagateDelay = 1 * time.Second
    conf.Merge = &serfMergeDelegate{}
    // Until Nomad supports this fully, we disable automatic resolution.
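
For readers who want to see the knob in isolation: a minimal sketch of setting LeavePropagateDelay through serf's public API, outside of Nomad's setupSerf (the node name below is made up for illustration):

    package main

    import (
        "log"
        "time"

        "github.com/hashicorp/serf/serf"
    )

    func main() {
        conf := serf.DefaultConfig()
        conf.NodeName = "example-node" // hypothetical node name
        // Stay up this long after broadcasting a leave intent so the
        // intent can spread before this node stops answering probes.
        conf.LeavePropagateDelay = 1 * time.Second

        s, err := serf.Create(conf)
        if err != nil {
            log.Fatal(err)
        }
        defer s.Shutdown()
    }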

vendor/github.com/hashicorp/serf/serf/config.go

@@ -55,6 +55,13 @@ type Config struct {
    // set, a timeout of 5 seconds will be set.
    BroadcastTimeout time.Duration

+   // LeavePropagateDelay is for our leave (node dead) message to propagate
+   // through the cluster. In particular, we want to stay up long enough to
+   // service any probes from other nodes before they learn about us
+   // leaving and stop probing. Otherwise, we risk getting node failures as
+   // we leave.
+   LeavePropagateDelay time.Duration
+
    // The settings below relate to Serf's event coalescence feature. Serf
    // is able to coalesce multiple events into single events in order to
    // reduce the amount of noise that is sent along the EventCh. For example

@@ -255,6 +262,7 @@ func DefaultConfig() *Config {
    return &Config{
        NodeName:            hostname,
        BroadcastTimeout:    5 * time.Second,
+       LeavePropagateDelay: 1 * time.Second,
        EventBuffer:         512,
        QueryBuffer:         512,
        LogOutput:           os.Stderr,
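
The two durations in DefaultConfig play different roles during a graceful leave. A small sketch (not part of the commit) that just prints the defaults side by side:

    package main

    import (
        "fmt"

        "github.com/hashicorp/serf/serf"
    )

    func main() {
        conf := serf.DefaultConfig()
        // BroadcastTimeout bounds how long the leave message may sit in
        // our own broadcast queue; LeavePropagateDelay is the extra time
        // we stay up afterwards so the message can spread to other nodes.
        fmt.Println(conf.BroadcastTimeout)    // 5s
        fmt.Println(conf.LeavePropagateDelay) // 1s
    }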

vendor/github.com/hashicorp/serf/serf/delegate.go

@@ -223,13 +223,16 @@ func (d *delegate) MergeRemoteState(buf []byte, isJoin bool) {
        d.serf.queryClock.Witness(pp.QueryLTime - 1)
    }

-   // Process the left nodes first to avoid the LTimes from being increment
-   // in the wrong order
+   // Process the left nodes first to avoid the LTimes from incrementing
+   // in the wrong order. Note that we don't have the actual Lamport time
+   // for the leave message, so we go one past the join time, since the
+   // leave must have been accepted after that to get onto the left members
+   // list. If we didn't do this then the message would not get processed.
    leftMap := make(map[string]struct{}, len(pp.LeftMembers))
    leave := messageLeave{}
    for _, name := range pp.LeftMembers {
        leftMap[name] = struct{}{}
-       leave.LTime = pp.StatusLTimes[name]
+       leave.LTime = pp.StatusLTimes[name] + 1
        leave.Node = name
        d.serf.handleNodeLeaveIntent(&leave)
    }
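
The "+ 1" matters because serf drops intents whose Lamport time is not newer than the member's recorded status time. A self-contained sketch of that comparison (a paraphrase of the intent check, not serf's actual code):

    package main

    import (
        "fmt"

        "github.com/hashicorp/serf/serf"
    )

    func main() {
        // A member's recorded status Lamport time (its join time).
        statusLTime := serf.LamportTime(10)

        // A leave intent is ignored unless its LTime is strictly newer
        // than the recorded status time.
        stale := statusLTime     // replaying the join time verbatim
        fresh := statusLTime + 1 // what MergeRemoteState now sends

        fmt.Println(stale <= statusLTime) // true  -> intent dropped as stale
        fmt.Println(fresh <= statusLTime) // false -> intent processed
    }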

vendor/github.com/hashicorp/serf/serf/serf.go

@@ -691,6 +691,13 @@ func (s *Serf) Leave() error {
        return err
    }

+   // Wait for the leave to propagate through the cluster. The broadcast
+   // timeout is how long we wait for the message to go out from our own
+   // queue, but this wait is for that message to propagate through the
+   // cluster. In particular, we want to stay up long enough to service
+   // any probes from other nodes before they learn about us leaving.
+   time.Sleep(s.config.LeavePropagateDelay)
+
    // Transition to Left only if we not already shutdown
    s.stateLock.Lock()
    if s.state != SerfShutdown {
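
Putting it together, a hedged sketch of the graceful-shutdown sequence a caller gets after this change; in the worst case Leave now blocks for roughly BroadcastTimeout plus LeavePropagateDelay before the caller tears the node down:

    package main

    import (
        "log"

        "github.com/hashicorp/serf/serf"
    )

    func main() {
        s, err := serf.Create(serf.DefaultConfig())
        if err != nil {
            log.Fatal(err)
        }

        // Leave broadcasts the intent, waits up to BroadcastTimeout for
        // it to drain from our queue, then sleeps LeavePropagateDelay so
        // other nodes hear about the leave while we can still answer
        // their probes. Only then do we tear everything down.
        if err := s.Leave(); err != nil {
            log.Printf("leave failed: %v", err)
        }
        if err := s.Shutdown(); err != nil {
            log.Fatal(err)
        }
    }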

vendor/vendor.json

@@ -180,8 +180,8 @@
    {"path":"github.com/hashicorp/net-rpc-msgpackrpc","revision":"a14192a58a694c123d8fe5481d4a4727d6ae82f3"},
    {"path":"github.com/hashicorp/raft","checksumSHA1":"zkA9uvbj1BdlveyqXpVTh1N6ers=","revision":"077966dbc90f342107eb723ec52fdb0463ec789b","revisionTime":"2018-01-17T20:29:25Z","version":"master","versionExact":"master"},
    {"path":"github.com/hashicorp/raft-boltdb","checksumSHA1":"QAxukkv54/iIvLfsUP6IK4R0m/A=","revision":"d1e82c1ec3f15ee991f7cc7ffd5b67ff6f5bbaee","revisionTime":"2015-02-01T20:08:39Z"},
-   {"path":"github.com/hashicorp/serf/coordinate","checksumSHA1":"0PeWsO2aI+2PgVYlYlDPKfzCLEQ=","revision":"fc4bdedf2366c64984e280c6eefc703ca7812585","revisionTime":"2018-04-11T17:01:37Z"},
-   {"path":"github.com/hashicorp/serf/serf","checksumSHA1":"YzJaaeIJpxLfVDZYT1X2hpd8IK8=","revision":"fc4bdedf2366c64984e280c6eefc703ca7812585","revisionTime":"2018-04-11T17:01:37Z"},
+   {"path":"github.com/hashicorp/serf/coordinate","checksumSHA1":"0PeWsO2aI+2PgVYlYlDPKfzCLEQ=","revision":"80ab48778deee28e4ea2dc4ef1ebb2c5f4063996","revisionTime":"2018-05-07T23:19:28Z"},
+   {"path":"github.com/hashicorp/serf/serf","checksumSHA1":"QrT+nzyXsD/MmhTjjhcPdnALZ1I=","revision":"80ab48778deee28e4ea2dc4ef1ebb2c5f4063996","revisionTime":"2018-05-07T23:19:28Z"},
    {"path":"github.com/hashicorp/vault","checksumSHA1":"eGzvBRMFD6ZB3A6uO750np7Om/E=","revision":"182ba68a9589d4cef95234134aaa498a686e3de3","revisionTime":"2016-08-21T23:40:57Z"},
    {"path":"github.com/hashicorp/vault/api","checksumSHA1":"mKN4rEIWyflT6aqJyjgu9m1tPXI=","revision":"3ddd3bd20cec0588788547aecd15e91461b9d546","revisionTime":"2018-04-03T21:11:47Z"},
    {"path":"github.com/hashicorp/vault/helper/compressutil","checksumSHA1":"jHVLe8KMdEpb/ZALp0zu+tenADo=","revision":"3ddd3bd20cec0588788547aecd15e91461b9d546","revisionTime":"2018-04-03T21:11:47Z"},