diff --git a/.changelog/18200.txt b/.changelog/18200.txt
new file mode 100644
index 000000000..378043a65
--- /dev/null
+++ b/.changelog/18200.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+core: Fixed a bug where exponential backoff could result in excessive CPU usage
+```
diff --git a/client/allocrunner/taskrunner/stats_hook.go b/client/allocrunner/taskrunner/stats_hook.go
index 1968e99fa..dcd097287 100644
--- a/client/allocrunner/taskrunner/stats_hook.go
+++ b/client/allocrunner/taskrunner/stats_hook.go
@@ -11,6 +11,7 @@ import (
     hclog "github.com/hashicorp/go-hclog"
     "github.com/hashicorp/nomad/client/allocrunner/interfaces"
     cstructs "github.com/hashicorp/nomad/client/structs"
+    "github.com/hashicorp/nomad/helper"
     "github.com/hashicorp/nomad/nomad/structs"
     bstructs "github.com/hashicorp/nomad/plugins/base/structs"
 )
@@ -127,7 +128,9 @@ MAIN:
 //
 // It logs the errors with appropriate log levels; don't log returned error
 func (h *statsHook) callStatsWithRetry(ctx context.Context, handle interfaces.DriverStats) (<-chan *cstructs.TaskResourceUsage, error) {
-    var retry int
+    var retry uint64
+    var backoff time.Duration
+    limit := time.Second * 5
 
 MAIN:
     if ctx.Err() != nil {
@@ -162,13 +165,7 @@ MAIN:
         h.logger.Error("failed to start stats collection for task", "error", err)
     }
 
-    limit := time.Second * 5
-    backoff := 1 << (2 * uint64(retry)) * time.Second
-    if backoff > limit || retry > 5 {
-        backoff = limit
-    }
-
-    // Increment retry counter
+    backoff = helper.Backoff(time.Second, limit, retry)
     retry++
 
     time.Sleep(backoff)
diff --git a/client/allocrunner/taskrunner/vault_hook.go b/client/allocrunner/taskrunner/vault_hook.go
index b979e055c..713223475 100644
--- a/client/allocrunner/taskrunner/vault_hook.go
+++ b/client/allocrunner/taskrunner/vault_hook.go
@@ -18,6 +18,7 @@ import (
     "github.com/hashicorp/nomad/client/allocrunner/interfaces"
     ti "github.com/hashicorp/nomad/client/allocrunner/taskrunner/interfaces"
     "github.com/hashicorp/nomad/client/vaultclient"
+    "github.com/hashicorp/nomad/helper"
     "github.com/hashicorp/nomad/nomad/structs"
 )
 
@@ -311,7 +312,8 @@ OUTER:
 // deriveVaultToken derives the Vault token using exponential backoffs. It
 // returns the Vault token and whether the manager should exit.
 func (h *vaultHook) deriveVaultToken() (token string, exit bool) {
-    attempts := 0
+    var attempts uint64
+    var backoff time.Duration
     for {
         tokens, err := h.client.DeriveToken(h.alloc, []string{h.taskName})
         if err == nil {
@@ -339,14 +341,11 @@ func (h *vaultHook) deriveVaultToken() (token string, exit bool) {
         }
 
         // Handle the retry case
-        backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
-        if backoff > vaultBackoffLimit {
-            backoff = vaultBackoffLimit
-        }
-        h.logger.Error("failed to derive Vault token", "error", err, "recoverable", true, "backoff", backoff)
-
+        backoff = helper.Backoff(vaultBackoffBaseline, vaultBackoffLimit, attempts)
         attempts++
 
+        h.logger.Error("failed to derive Vault token", "error", err, "recoverable", true, "backoff", backoff)
+
         // Wait till retrying
         select {
         case <-h.ctx.Done():
diff --git a/client/devicemanager/instance.go b/client/devicemanager/instance.go
index 89553fed7..deb29d2c3 100644
--- a/client/devicemanager/instance.go
+++ b/client/devicemanager/instance.go
@@ -11,6 +11,7 @@ import (
     log "github.com/hashicorp/go-hclog"
     multierror "github.com/hashicorp/go-multierror"
+    "github.com/hashicorp/nomad/helper"
     "github.com/hashicorp/nomad/helper/pluginutils/loader"
     "github.com/hashicorp/nomad/helper/pluginutils/singleton"
     "github.com/hashicorp/nomad/nomad/structs"
@@ -450,7 +451,8 @@ func (i *instanceManager) handleFingerprint(f *device.FingerprintResponse) error
 // collectStats is a long lived goroutine for collecting device statistics. It
 // handles errors by backing off exponentially and retrying.
 func (i *instanceManager) collectStats() {
-    attempt := 0
+    var attempt uint64
+    var backoff time.Duration
 
 START:
     // Get a device plugin
@@ -495,10 +497,7 @@ START:
             }
 
             // Retry with an exponential backoff
-            backoff := (1 << (2 * uint64(attempt))) * statsBackoffBaseline
-            if backoff > statsBackoffLimit {
-                backoff = statsBackoffLimit
-            }
+            backoff = helper.Backoff(statsBackoffBaseline, statsBackoffLimit, attempt)
             attempt++
 
             i.logger.Error("stats returned an error", "error", err, "retry", backoff)
@@ -511,7 +510,7 @@ START:
             }
         }
 
-        // Reset the attempt since we got statistics
+        // Reset the attempts since we got statistics
         attempt = 0
 
         // Store the new stats
diff --git a/client/pluginmanager/drivermanager/instance.go b/client/pluginmanager/drivermanager/instance.go
index 8b7a5a85e..0ef3fab04 100644
--- a/client/pluginmanager/drivermanager/instance.go
+++ b/client/pluginmanager/drivermanager/instance.go
@@ -10,6 +10,7 @@ import (
     "time"
 
     log "github.com/hashicorp/go-hclog"
+    "github.com/hashicorp/nomad/helper"
     "github.com/hashicorp/nomad/helper/pluginutils/loader"
     "github.com/hashicorp/nomad/helper/pluginutils/singleton"
     "github.com/hashicorp/nomad/nomad/structs"
@@ -290,7 +291,7 @@ func (i *instanceManager) fingerprint() {
     // backoff and retry used if the RPC is closed by the other end
     var backoff time.Duration
-    var retry int
+    var retry uint64
     for {
         if backoff > 0 {
             select {
@@ -329,11 +330,7 @@ func (i *instanceManager) fingerprint() {
             i.handleFingerprintError()
 
             // Calculate the new backoff
-            backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline
-            if backoff > driverFPBackoffLimit {
-                backoff = driverFPBackoffLimit
-            }
-            // Increment retry counter
+            backoff = helper.Backoff(driverFPBackoffBaseline, driverFPBackoffLimit, retry)
             retry++
             continue
         }
@@ -426,7 +423,7 @@ func (i *instanceManager) handleEvents() {
     }
 
     var backoff time.Duration
-    var retry int
+    var retry uint64
     for {
         if backoff > 0 {
             select {
@@ -453,10 +450,7 @@ func (i *instanceManager) handleEvents() {
             i.logger.Warn("failed to receive task events, retrying", "error", err, "retry", retry)
 
             // Calculate the new backoff
-            backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline
-            if backoff > driverFPBackoffLimit {
-                backoff = driverFPBackoffLimit
-            }
+            backoff = helper.Backoff(driverFPBackoffBaseline, driverFPBackoffLimit, retry)
             retry++
             continue
         }
diff --git a/command/agent/consul/version_checker.go b/command/agent/consul/version_checker.go
index 2e0fb637e..90501843e 100644
--- a/command/agent/consul/version_checker.go
+++ b/command/agent/consul/version_checker.go
@@ -23,11 +23,12 @@ func checkConsulTLSSkipVerify(ctx context.Context, logger log.Logger, client Age
     defer close(done)
 
-    i := uint64(0)
-
     timer, stop := helper.NewSafeTimer(limit)
     defer stop()
 
+    var attempts uint64
+    var backoff time.Duration
+
     for {
         self, err := client.Self()
         if err == nil {
@@ -40,13 +41,8 @@ func checkConsulTLSSkipVerify(ctx context.Context, logger log.Logger, client Age
             return
         }
 
-        backoff := (1 << (2 * i)) * baseline
-        if backoff > limit {
-            backoff = limit
-        } else {
-            i++
-        }
-
+        backoff = helper.Backoff(baseline, limit, attempts)
+        attempts++
         timer.Reset(backoff)
 
         select {
diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go
index 49624a53a..cb7e6ae2b 100644
--- a/drivers/docker/driver.go
+++ b/drivers/docker/driver.go
@@ -483,7 +483,9 @@ type createContainerClient interface {
 func (d *Driver) createContainer(client createContainerClient, config docker.CreateContainerOptions,
     image string) (*docker.Container, error) {
     // Create a container
-    attempted := 0
+    var attempted uint64
+    var backoff time.Duration
+
 CREATE:
     container, createErr := client.CreateContainer(config)
     if createErr == nil {
@@ -533,16 +535,19 @@ CREATE:
 
         if attempted < 5 {
             attempted++
-            time.Sleep(nextBackoff(attempted))
+            backoff = helper.Backoff(50*time.Millisecond, time.Minute, attempted)
+            time.Sleep(backoff)
             goto CREATE
         }
+
     } else if strings.Contains(strings.ToLower(createErr.Error()), "no such image") {
         // There is still a very small chance this is possible even with the
         // coordinator so retry.
         return nil, nstructs.NewRecoverableError(createErr, true)
     } else if isDockerTransientError(createErr) && attempted < 5 {
         attempted++
-        time.Sleep(nextBackoff(attempted))
+        backoff = helper.Backoff(50*time.Millisecond, time.Minute, attempted)
+        time.Sleep(backoff)
         goto CREATE
     }
 
@@ -557,8 +562,9 @@ func (d *Driver) startContainer(c *docker.Container) error {
         return err
     }
 
-    // Start a container
-    attempted := 0
+    var attempted uint64
+    var backoff time.Duration
+
 START:
     startErr := dockerClient.StartContainer(c.ID, c.HostConfig)
     if startErr == nil || strings.Contains(startErr.Error(), "Container already running") {
@@ -570,7 +576,8 @@ START:
     if isDockerTransientError(startErr) {
         if attempted < 5 {
             attempted++
-            time.Sleep(nextBackoff(attempted))
+            backoff = helper.Backoff(50*time.Millisecond, time.Minute, attempted)
+            time.Sleep(backoff)
             goto START
         }
         return nstructs.NewRecoverableError(startErr, true)
@@ -579,13 +586,6 @@ START:
     return recoverableErrTimeouts(startErr)
 }
 
-// nextBackoff returns appropriate docker backoff durations after attempted attempts.
-func nextBackoff(attempted int) time.Duration {
-    // attempts in 200ms, 800ms, 3.2s, 12.8s, 51.2s
-    // TODO: add randomization factor and extract to a helper
-    return 1 << (2 * uint64(attempted)) * 50 * time.Millisecond
-}
-
 // createImage creates a docker image either by pulling it from a registry or by
 // loading it from the file system
 func (d *Driver) createImage(task *drivers.TaskConfig, driverConfig *TaskConfig, client *docker.Client) (string, error) {
diff --git a/drivers/docker/stats.go b/drivers/docker/stats.go
index 23097ee99..c3401a04d 100644
--- a/drivers/docker/stats.go
+++ b/drivers/docker/stats.go
@@ -95,8 +95,8 @@ func (h *taskHandle) collectStats(ctx context.Context, destCh *usageSender, inte
     defer destCh.close()
 
     // backoff and retry used if the docker stats API returns an error
-    var backoff time.Duration = 0
-    var retry int
+    var backoff time.Duration
+    var retry uint64
 
     // create an interval timer
     timer, stop := helper.NewSafeTimer(backoff)
@@ -136,11 +136,7 @@ func (h *taskHandle) collectStats(ctx context.Context, destCh *usageSender, inte
             h.logger.Debug("error collecting stats from container", "error", err)
 
             // Calculate the new backoff
-            backoff = (1 << (2 * uint64(retry))) * statsCollectorBackoffBaseline
-            if backoff > statsCollectorBackoffLimit {
-                backoff = statsCollectorBackoffLimit
-            }
-            // Increment retry counter
+            backoff = helper.Backoff(statsCollectorBackoffBaseline, statsCollectorBackoffLimit, retry)
             retry++
             continue
         }
diff --git a/helper/backoff.go b/helper/backoff.go
new file mode 100644
index 000000000..e25e1f1c2
--- /dev/null
+++ b/helper/backoff.go
@@ -0,0 +1,31 @@
+// Copyright (c) HashiCorp, Inc.
+// SPDX-License-Identifier: BUSL-1.1
+
+package helper
+
+import (
+    "time"
+)
+
+func Backoff(backoffBase time.Duration, backoffLimit time.Duration, attempt uint64) time.Duration {
+    const MaxUint = ^uint64(0)
+    const MaxInt = int64(MaxUint >> 1)
+
+    // Ensure lack of non-positive backoffs since these make no sense
+    if backoffBase.Nanoseconds() <= 0 {
+        return max(backoffBase, 0*time.Second)
+    }
+
+    // Ensure that a large attempt will not cause an overflow
+    if attempt > 62 || MaxInt/backoffBase.Nanoseconds() < (1<<attempt) {
+        return backoffLimit
+    }
+
+    // Compute the backoff and clamp it to the limit
+    deadline := 1 << attempt * backoffBase
+    if deadline > backoffLimit {
+        deadline = backoffLimit
+    }
+
+    return deadline
+}
diff --git a/helper/backoff_test.go b/helper/backoff_test.go
new file mode 100644
index 000000000..29af33f7b
--- /dev/null
+++ b/helper/backoff_test.go
@@ -0,0 +1,72 @@
+// Copyright (c) HashiCorp, Inc.
+// SPDX-License-Identifier: BUSL-1.1
+
+package helper
+
+import (
+    "testing"
+    "time"
+
+    "github.com/shoenig/test/must"
+)
+
+func Test_Backoff(t *testing.T) {
+    const MaxUint = ^uint64(0)
+    const MaxInt = int64(MaxUint >> 1)
+
+    cases := []struct {
+        name           string
+        backoffBase    time.Duration
+        backoffLimit   time.Duration
+        attempt        uint64
+        expectedResult time.Duration
+    }{
+        {
+            name:           "backoff limit clamps for high base",
+            backoffBase:    time.Hour,
+            backoffLimit:   time.Minute,
+            attempt:        1,
+            expectedResult: time.Minute,
+        },
+        {
+            name:           "backoff limit clamps for boundary attempt",
+            backoffBase:    time.Hour,
+            backoffLimit:   time.Minute,
+            attempt:        63,
+            expectedResult: time.Minute,
+        },
+        {
+            name:           "small retry value",
+            backoffBase:    time.Minute,
+            backoffLimit:   time.Hour,
+            attempt:        0,
+            expectedResult: time.Minute,
+        },
+        {
+            name:           "first retry value",
+            backoffBase:    time.Minute,
+            backoffLimit:   time.Hour,
+            attempt:        1,
+            expectedResult: 2 * time.Minute,
+        },
+        {
+            name:           "fifth retry value",
+            backoffBase:    time.Minute,
+            backoffLimit:   time.Hour,
+            attempt:        5,
+            expectedResult: 32 * time.Minute,
+        },
+        {
+            name:           "sixth retry value",
+            backoffBase:    time.Minute,
+            backoffLimit:   time.Hour,
+            attempt:        6,
+            expectedResult: time.Hour,
+        },
+    }
+
+    for _, tc := range cases {
+        result := Backoff(tc.backoffBase, tc.backoffLimit, tc.attempt)
+        must.Eq(t, tc.expectedResult, result)
+    }
+}
diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go
index 43c1dd1a5..7dd7c595d 100644
--- a/nomad/state/state_store.go
+++ b/nomad/state/state_store.go
@@ -17,6 +17,7 @@ import (
     "github.com/hashicorp/go-memdb"
     "github.com/hashicorp/go-multierror"
     "github.com/hashicorp/go-set"
+    "github.com/hashicorp/nomad/helper"
     "github.com/hashicorp/nomad/helper/pointer"
     "github.com/hashicorp/nomad/lib/lang"
     "github.com/hashicorp/nomad/nomad/stream"
@@ -254,8 +255,9 @@ func (s *StateStore) SnapshotMinIndex(ctx context.Context, index uint64) (*State
     const backoffBase = 20 * time.Millisecond
     const backoffLimit = 1 * time.Second
 
-    var retries uint
+    var retries uint64
     var retryTimer *time.Timer
+    var deadline time.Duration
 
     // XXX: Potential optimization is to set up a watch on the state
     // store's index table and only unblock via a trigger rather than
@@ -273,16 +275,13 @@ func (s *StateStore) SnapshotMinIndex(ctx context.Context, index uint64) (*State
         }
 
         // Exponential back off
-        retries++
         if retryTimer == nil {
             // First retry, start at baseline
             retryTimer = time.NewTimer(backoffBase)
         } else {
             // Subsequent retry, reset timer
-            deadline := 1 << (2 * retries) * backoffBase
-            if deadline > backoffLimit {
-                deadline = backoffLimit
-            }
+            deadline = helper.Backoff(backoffBase, backoffLimit, retries)
+            retries++
             retryTimer.Reset(deadline)
         }
diff --git a/nomad/worker.go b/nomad/worker.go
index 8caf93d70..928b73a9a 100644
--- a/nomad/worker.go
+++ b/nomad/worker.go
@@ -16,6 +16,7 @@ import (
     log "github.com/hashicorp/go-hclog"
     memdb "github.com/hashicorp/go-memdb"
     "github.com/hashicorp/go-version"
+    "github.com/hashicorp/nomad/helper"
     "github.com/hashicorp/nomad/helper/uuid"
     "github.com/hashicorp/nomad/nomad/state"
     "github.com/hashicorp/nomad/nomad/structs"
@@ -109,8 +110,9 @@ type Worker struct {
     // failures is the count of errors encountered while dequeueing evaluations
     // and is used to calculate backoff.
-    failures  uint
-    evalToken string
+    failures       uint64
+    failureBackoff time.Duration
+    evalToken      string
 
     // snapshotIndex is the index of the snapshot in which the scheduler was
     // first invoked. It is used to mark the SnapshotIndex of evaluations
@@ -133,6 +135,7 @@ func newWorker(ctx context.Context, srv *Server, args SchedulerWorkerPoolArgs) *
         start:             time.Now(),
         status:            WorkerStarting,
         enabledSchedulers: make([]string, len(args.EnabledSchedulers)),
+        failureBackoff:    time.Duration(0),
     }
     copy(w.enabledSchedulers, args.EnabledSchedulers)
 
@@ -874,12 +877,10 @@ func (w *Worker) shouldResubmit(err error) bool {
 // backoff if the server or the worker is shutdown.
 func (w *Worker) backoffErr(base, limit time.Duration) bool {
     w.setWorkloadStatus(WorkloadBackoff)
-    backoff := (1 << (2 * w.failures)) * base
-    if backoff > limit {
-        backoff = limit
-    } else {
-        w.failures++
-    }
+
+    backoff := helper.Backoff(base, limit, w.failures)
+    w.failures++
+
    select {
    case <-time.After(backoff):
        return false
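The call sites in this diff all converge on the same pattern: keep a `uint64` attempt counter, ask `helper.Backoff` for the next delay, then increment the counter and wait (or reset a timer). Below is a minimal sketch of that usage; `pollOnce`, `pollWithBackoff`, and the `base`/`limit` constants are illustrative stand-ins, and only `helper.Backoff` itself comes from this change.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/hashicorp/nomad/helper"
)

// pollOnce is a stand-in for the real work (a stats RPC, DeriveToken, etc.).
func pollOnce() error { return errors.New("not ready") }

// pollWithBackoff retries pollOnce until it succeeds or ctx is cancelled,
// sleeping helper.Backoff(base, limit, attempt) between attempts.
func pollWithBackoff(ctx context.Context) error {
	const base = 50 * time.Millisecond // illustrative baseline
	const limit = 5 * time.Second      // illustrative cap

	var attempt uint64
	for {
		if err := pollOnce(); err == nil {
			return nil
		}

		// Delay grows as (1 << attempt) * base and is clamped to limit.
		backoff := helper.Backoff(base, limit, attempt)
		attempt++

		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(backoff):
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	fmt.Println(pollWithBackoff(ctx)) // prints "context deadline exceeded"
}
```

Because the counter is incremented after the call, the first wait equals the baseline, which matches the "small retry value" case in the new test.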