Update serf to pick up clean leave fix

This commit is contained in:
Preetha Appan 2018-05-04 15:51:55 -05:00
parent 705b20d9bc
commit 98a04a0af9
No known key found for this signature in database
GPG Key ID: 9F7C19990A50EAFC
5 changed files with 42 additions and 11 deletions

View File

@ -2,6 +2,7 @@ package lib
import ( import (
"github.com/hashicorp/serf/serf" "github.com/hashicorp/serf/serf"
"time"
) )
// SerfDefaultConfig returns a Consul-flavored Serf default configuration, // SerfDefaultConfig returns a Consul-flavored Serf default configuration,
@ -16,5 +17,12 @@ func SerfDefaultConfig() *serf.Config {
// cluster size. // cluster size.
base.MinQueueDepth = 4096 base.MinQueueDepth = 4096
// This gives leaves some time to propagate through the cluster before
// we shut down. The value was chosen to be reasonably short, but to
// allow a leave to get to over 99.99% of the cluster with 100k nodes
// (using https://www.serf.io/docs/internals/simulator.html).
base.LeavePropagateDelay = 3 * time.Second
return base return base
} }

View File

@ -55,6 +55,13 @@ type Config struct {
// set, a timeout of 5 seconds will be set. // set, a timeout of 5 seconds will be set.
BroadcastTimeout time.Duration BroadcastTimeout time.Duration
// LeavePropagateDelay is how long we wait for our leave (node dead) message
// to propagate through the cluster. In particular, we want to stay up long
// enough to service any probes from other nodes before they learn about us
// leaving and stop probing. Otherwise, we risk getting node failures as
// we leave.
LeavePropagateDelay time.Duration
// The settings below relate to Serf's event coalescence feature. Serf // The settings below relate to Serf's event coalescence feature. Serf
// is able to coalesce multiple events into single events in order to // is able to coalesce multiple events into single events in order to
// reduce the amount of noise that is sent along the EventCh. For example // reduce the amount of noise that is sent along the EventCh. For example
@ -255,6 +262,7 @@ func DefaultConfig() *Config {
return &Config{ return &Config{
NodeName: hostname, NodeName: hostname,
BroadcastTimeout: 5 * time.Second, BroadcastTimeout: 5 * time.Second,
LeavePropagateDelay: 1 * time.Second,
EventBuffer: 512, EventBuffer: 512,
QueryBuffer: 512, QueryBuffer: 512,
LogOutput: os.Stderr, LogOutput: os.Stderr,

View File

@ -223,13 +223,16 @@ func (d *delegate) MergeRemoteState(buf []byte, isJoin bool) {
d.serf.queryClock.Witness(pp.QueryLTime - 1) d.serf.queryClock.Witness(pp.QueryLTime - 1)
} }
// Process the left nodes first to avoid the LTimes from being increment // Process the left nodes first to avoid the LTimes from incrementing
// in the wrong order // in the wrong order. Note that we don't have the actual Lamport time
// for the leave message, so we go one past the join time, since the
// leave must have been accepted after that to get onto the left members
// list. If we didn't do this then the message would not get processed.
leftMap := make(map[string]struct{}, len(pp.LeftMembers)) leftMap := make(map[string]struct{}, len(pp.LeftMembers))
leave := messageLeave{} leave := messageLeave{}
for _, name := range pp.LeftMembers { for _, name := range pp.LeftMembers {
leftMap[name] = struct{}{} leftMap[name] = struct{}{}
leave.LTime = pp.StatusLTimes[name] leave.LTime = pp.StatusLTimes[name] + 1
leave.Node = name leave.Node = name
d.serf.handleNodeLeaveIntent(&leave) d.serf.handleNodeLeaveIntent(&leave)
} }

View File

@ -691,6 +691,13 @@ func (s *Serf) Leave() error {
return err return err
} }
// Wait for the leave to propagate through the cluster. The broadcast
// timeout is how long we wait for the message to go out from our own
// queue, but this wait is for that message to propagate through the
// cluster. In particular, we want to stay up long enough to service
// any probes from other nodes before they learn about us leaving.
time.Sleep(s.config.LeavePropagateDelay)
// Transition to Left only if we are not already shut down // Transition to Left only if we are not already shut down
s.stateLock.Lock() s.stateLock.Lock()
if s.state != SerfShutdown { if s.state != SerfShutdown {
@ -1670,12 +1677,17 @@ func (s *Serf) Stats() map[string]string {
return strconv.FormatUint(v, 10) return strconv.FormatUint(v, 10)
} }
s.memberLock.RLock() s.memberLock.RLock()
defer s.memberLock.RUnlock() members := toString(uint64(len(s.members)))
failed := toString(uint64(len(s.failedMembers)))
left := toString(uint64(len(s.leftMembers)))
health_score := toString(uint64(s.memberlist.GetHealthScore()))
s.memberLock.RUnlock()
stats := map[string]string{ stats := map[string]string{
"members": toString(uint64(len(s.members))), "members": members,
"failed": toString(uint64(len(s.failedMembers))), "failed": failed,
"left": toString(uint64(len(s.leftMembers))), "left": left,
"health_score": toString(uint64(s.memberlist.GetHealthScore())), "health_score": health_score,
"member_time": toString(uint64(s.clock.Time())), "member_time": toString(uint64(s.clock.Time())),
"event_time": toString(uint64(s.eventClock.Time())), "event_time": toString(uint64(s.eventClock.Time())),
"query_time": toString(uint64(s.queryClock.Time())), "query_time": toString(uint64(s.queryClock.Time())),

6
vendor/vendor.json vendored
View File

@ -72,8 +72,8 @@
{"path":"github.com/hashicorp/net-rpc-msgpackrpc","checksumSHA1":"qnlqWJYV81ENr61SZk9c65R1mDo=","revision":"a14192a58a694c123d8fe5481d4a4727d6ae82f3","revisionTime":"2015-11-16T02:03:38Z"}, {"path":"github.com/hashicorp/net-rpc-msgpackrpc","checksumSHA1":"qnlqWJYV81ENr61SZk9c65R1mDo=","revision":"a14192a58a694c123d8fe5481d4a4727d6ae82f3","revisionTime":"2015-11-16T02:03:38Z"},
{"path":"github.com/hashicorp/raft","checksumSHA1":"JjJtGJi1ywWhVhs/PvTXxe4TeD8=","revision":"6d14f0c70869faabd9e60ba7ed88a6cbbd6a661f","revisionTime":"2017-10-03T22:09:13Z","version":"v1.0.0","versionExact":"v1.0.0"}, {"path":"github.com/hashicorp/raft","checksumSHA1":"JjJtGJi1ywWhVhs/PvTXxe4TeD8=","revision":"6d14f0c70869faabd9e60ba7ed88a6cbbd6a661f","revisionTime":"2017-10-03T22:09:13Z","version":"v1.0.0","versionExact":"v1.0.0"},
{"path":"github.com/hashicorp/raft-boltdb","checksumSHA1":"QAxukkv54/iIvLfsUP6IK4R0m/A=","revision":"d1e82c1ec3f15ee991f7cc7ffd5b67ff6f5bbaee","revisionTime":"2015-02-01T20:08:39Z"}, {"path":"github.com/hashicorp/raft-boltdb","checksumSHA1":"QAxukkv54/iIvLfsUP6IK4R0m/A=","revision":"d1e82c1ec3f15ee991f7cc7ffd5b67ff6f5bbaee","revisionTime":"2015-02-01T20:08:39Z"},
{"path":"github.com/hashicorp/serf/coordinate","checksumSHA1":"0PeWsO2aI+2PgVYlYlDPKfzCLEQ=","comment":"v0.7.0-66-g6c4672d","revision":"b6017ae61f4420ed0c02d5eeeb9ff3fc02953f14","revisionTime":"2018-01-19T22:43:00Z"}, {"path":"github.com/hashicorp/serf/coordinate","checksumSHA1":"0PeWsO2aI+2PgVYlYlDPKfzCLEQ=","revision":"4b67f2c2b2bb5b748d934a6d48221062e43d2274","revisionTime":"2018-05-04T20:06:40Z"},
{"path":"github.com/hashicorp/serf/serf","checksumSHA1":"QGImnWfhk0ILLZszcf3vRs/Ft7g=","comment":"v0.7.0-66-g6c4672d","revision":"b6017ae61f4420ed0c02d5eeeb9ff3fc02953f14","revisionTime":"2018-01-19T22:43:00Z"}, {"path":"github.com/hashicorp/serf/serf","checksumSHA1":"QrT+nzyXsD/MmhTjjhcPdnALZ1I=","revision":"4b67f2c2b2bb5b748d934a6d48221062e43d2274","revisionTime":"2018-05-04T20:06:40Z"},
{"path":"github.com/hashicorp/yamux","checksumSHA1":"NnWv17i1tpvBNJtpdRRWpE6j4LY=","revision":"2658be15c5f05e76244154714161f17e3e77de2e","revisionTime":"2018-03-14T20:07:45Z"}, {"path":"github.com/hashicorp/yamux","checksumSHA1":"NnWv17i1tpvBNJtpdRRWpE6j4LY=","revision":"2658be15c5f05e76244154714161f17e3e77de2e","revisionTime":"2018-03-14T20:07:45Z"},
{"path":"github.com/mattn/go-isatty","checksumSHA1":"xZuhljnmBysJPta/lMyYmJdujCg=","revision":"66b8e73f3f5cda9f96b69efd03dd3d7fc4a5cdb8","revisionTime":"2016-08-06T12:27:52Z"}, {"path":"github.com/mattn/go-isatty","checksumSHA1":"xZuhljnmBysJPta/lMyYmJdujCg=","revision":"66b8e73f3f5cda9f96b69efd03dd3d7fc4a5cdb8","revisionTime":"2016-08-06T12:27:52Z"},
{"path":"github.com/miekg/dns","checksumSHA1":"XTeOihCDhjG6ltUKExoJ2uEzShk=","revision":"5364553f1ee9cddc7ac8b62dce148309c386695b","revisionTime":"2018-01-25T10:38:03Z","version":"v1.0.4","versionExact":"v1.0.4"}, {"path":"github.com/miekg/dns","checksumSHA1":"XTeOihCDhjG6ltUKExoJ2uEzShk=","revision":"5364553f1ee9cddc7ac8b62dce148309c386695b","revisionTime":"2018-01-25T10:38:03Z","version":"v1.0.4","versionExact":"v1.0.4"},
@ -86,9 +86,9 @@
{"path":"github.com/mitchellh/reflectwalk","checksumSHA1":"mrqMlK6gqe//WsJSrJ1HgkPM0lM=","revision":"eecf4c70c626c7cfbb95c90195bc34d386c74ac6","revisionTime":"2015-05-27T15:31:53Z"}, {"path":"github.com/mitchellh/reflectwalk","checksumSHA1":"mrqMlK6gqe//WsJSrJ1HgkPM0lM=","revision":"eecf4c70c626c7cfbb95c90195bc34d386c74ac6","revisionTime":"2015-05-27T15:31:53Z"},
{"path":"github.com/pascaldekloe/goe/verify","checksumSHA1":"5h+ERzHw3Rl2G0kFPxoJzxiA9s0=","revision":"07ebd1e2481f616a278ab431cf04cc5cf5ab3ebe","revisionTime":"2017-03-28T18:37:59Z"}, {"path":"github.com/pascaldekloe/goe/verify","checksumSHA1":"5h+ERzHw3Rl2G0kFPxoJzxiA9s0=","revision":"07ebd1e2481f616a278ab431cf04cc5cf5ab3ebe","revisionTime":"2017-03-28T18:37:59Z"},
{"path":"github.com/pkg/errors","checksumSHA1":"ynJSWoF6v+3zMnh9R0QmmG6iGV8=","revision":"ff09b135c25aae272398c51a07235b90a75aa4f0","revisionTime":"2017-03-16T20:15:38Z","tree":true}, {"path":"github.com/pkg/errors","checksumSHA1":"ynJSWoF6v+3zMnh9R0QmmG6iGV8=","revision":"ff09b135c25aae272398c51a07235b90a75aa4f0","revisionTime":"2017-03-16T20:15:38Z","tree":true},
{"path": "github.com/prometheus/client_golang/prometheus/promhttp", "checksumSHA1": "BM771aKU6hC+5rap48aqvMXczII=", "revision": "f504d69affe11ec1ccb2e5948127f86878c9fd57", "revisionTime": "2018-03-28T13:04:30Z"},
{"path":"github.com/pmezard/go-difflib/difflib","checksumSHA1":"LuFv4/jlrmFNnDb/5SCSEPAM9vU=","revision":"792786c7400a136282c1664665ae0a8db921c6c2","revisionTime":"2016-01-10T10:55:54Z"}, {"path":"github.com/pmezard/go-difflib/difflib","checksumSHA1":"LuFv4/jlrmFNnDb/5SCSEPAM9vU=","revision":"792786c7400a136282c1664665ae0a8db921c6c2","revisionTime":"2016-01-10T10:55:54Z"},
{"path":"github.com/posener/complete","checksumSHA1":"Nt4Ol6ZM2n0XD5zatxjwEYBpQnw=","revision":"dc2bc5a81accba8782bebea28628224643a8286a","revisionTime":"2017-11-04T09:57:02Z","version":"=v1.1","versionExact":"v1.1"}, {"path":"github.com/posener/complete","checksumSHA1":"Nt4Ol6ZM2n0XD5zatxjwEYBpQnw=","revision":"dc2bc5a81accba8782bebea28628224643a8286a","revisionTime":"2017-11-04T09:57:02Z","version":"=v1.1","versionExact":"v1.1"},
{"path":"github.com/prometheus/client_golang/prometheus/promhttp","checksumSHA1":"BM771aKU6hC+5rap48aqvMXczII=","revision":"f504d69affe11ec1ccb2e5948127f86878c9fd57","revisionTime":"2018-03-28T13:04:30Z"},
{"path":"github.com/ryanuber/columnize","checksumSHA1":"ExnVEVNT8APpFTm26cUb5T09yR4=","comment":"v2.0.1-8-g983d3a5","revision":"9b3edd62028f107d7cabb19353292afd29311a4e","revisionTime":"2016-07-12T16:32:29Z"}, {"path":"github.com/ryanuber/columnize","checksumSHA1":"ExnVEVNT8APpFTm26cUb5T09yR4=","comment":"v2.0.1-8-g983d3a5","revision":"9b3edd62028f107d7cabb19353292afd29311a4e","revisionTime":"2016-07-12T16:32:29Z"},
{"path":"github.com/sean-/seed","checksumSHA1":"A/YUMbGg1LHIeK2+NLZBt+MIAao=","revision":"3c72d44db0c567f7c901f9c5da5fe68392227750","revisionTime":"2017-02-08T16:47:21Z"}, {"path":"github.com/sean-/seed","checksumSHA1":"A/YUMbGg1LHIeK2+NLZBt+MIAao=","revision":"3c72d44db0c567f7c901f9c5da5fe68392227750","revisionTime":"2017-02-08T16:47:21Z"},
{"path":"github.com/sergi/go-diff/diffmatchpatch","checksumSHA1":"v7C+aJ1D/z3MEeCte6bxvpoGjM4=","revision":"feef008d51ad2b3778f85d387ccf91735543008d","revisionTime":"2017-04-09T07:17:39Z"}, {"path":"github.com/sergi/go-diff/diffmatchpatch","checksumSHA1":"v7C+aJ1D/z3MEeCte6bxvpoGjM4=","revision":"feef008d51ad2b3778f85d387ccf91735543008d","revisionTime":"2017-04-09T07:17:39Z"},