Scale heartbeat retrying based on remaining heartbeat time

This commit is contained in:
Alex Dadgar 2018-04-05 10:58:13 -07:00
parent 7941f4eb2d
commit 279b5c22e5
1 changed files with 45 additions and 2 deletions

View File

@ -1170,11 +1170,11 @@ func (c *Client) registerAndHeartbeat() {
c.retryRegisterNode()
heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
} else {
intv := c.retryIntv(registerRetryIntv)
intv := c.getHeartbeatRetryIntv()
c.logger.Printf("[ERR] client: heartbeating failed. Retrying in %v: %v", intv, err)
heartbeat = time.After(intv)
// if heartbeating fails, trigger Consul discovery
// If heartbeating fails, trigger Consul discovery
c.triggerDiscovery()
}
} else {
@ -1185,6 +1185,49 @@ func (c *Client) registerAndHeartbeat() {
}
}
// getHeartbeatRetryIntv is used to retrieve the time to wait before attempting
// another heartbeat.
func (c *Client) getHeartbeatRetryIntv() time.Duration {
if c.config.DevMode {
return devModeRetryIntv
}
// Collect the useful heartbeat info
c.heartbeatLock.Lock()
haveHeartbeated := c.haveHeartbeated
last := c.lastHeartbeat
ttl := c.heartbeatTTL
c.heartbeatLock.Unlock()
// Haven't even successfully heartbeated once so treat it as a registration.
if !haveHeartbeated {
return c.retryIntv(registerRetryIntv)
}
// Determine how much time we have left to heartbeat
left := last.Add(ttl).Sub(time.Now())
// Logic for retrying is:
// * Do not retry faster than once a second
// * Do not retry less that once every 30 seconds
// * Use the absolute time on how long you have left since we may completely
// miss a heartbeat and do not want to start retrying every second.
abs := left
if abs < 0 {
abs *= -1
}
stagger := lib.RandomStagger(abs)
switch {
case stagger < time.Second:
return time.Second
case stagger > 30*time.Second:
return 30 * time.Second
default:
return stagger
}
}
// periodicSnapshot is a long lived goroutine used to periodically snapshot the
// state of the client
func (c *Client) periodicSnapshot() {