core: backoff considerably when worker is behind raft (#15523)

Upon dequeuing an evaluation, workers snapshot their state store at the
eval's wait index or later. This ensures we process an eval at a point
in time at or after it was created or updated. Processing an eval on an
old snapshot could cause any number of problems, such as:

1. Since job registration atomically updates an eval and its job in a
   single raft entry, a snapshot taken before that entry may be missing
   the eval's job entirely or contain an older version of it.
2. The older the scheduler's snapshot, the higher the likelihood that
   something in the cluster state has changed and will cause the plan
   applier to reject the scheduler's plan. This could waste work or
   even cause evals to be failed needlessly.

However, on a new server the workers start in parallel with the server
pulling cluster state from a peer. During this time, which may last many
minutes, the state store is likely far behind the minimum index required
to process evaluations.

This PR addresses that by adding an additional long backoff period after
an eval is nacked. If the scheduler's state store catches up to the
required index within that additional backoff, it unblocks early and
dequeues the next eval.
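
The flow described above, condensed into a self-contained sketch (the `waitForIndex` helper, the simulated index counter, and the timing values are illustrative stand-ins, not Nomad APIs; the real worker blocks on the state store via `snapshotMinIndex`):

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"sync/atomic"
	"time"
)

// waitForIndex stands in for Worker.snapshotMinIndex: it blocks until the
// observed index reaches waitIndex or the timeout elapses.
func waitForIndex(ctx context.Context, current *atomic.Uint64, waitIndex uint64, timeout time.Duration) error {
	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()
	for current.Load() < waitIndex {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(5 * time.Millisecond):
		}
	}
	return nil
}

func main() {
	raftSyncLimit := 50 * time.Millisecond // stand-in for the package-level limit

	var index atomic.Uint64
	go func() { // simulate a server slowly replicating cluster state
		for i := 0; i < 20; i++ {
			time.Sleep(20 * time.Millisecond)
			index.Add(1)
		}
	}()

	waitIndex := uint64(10)
	err := waitForIndex(context.Background(), &index, waitIndex, raftSyncLimit)
	if errors.Is(err, context.DeadlineExceeded) {
		fmt.Println("state store behind raft: nack the eval, then back off")

		// Extended backoff: wait up to 10x longer before dequeuing again,
		// unblocking early as soon as the required index is reached.
		slowServerSyncLimit := 10 * raftSyncLimit
		if err := waitForIndex(context.Background(), &index, waitIndex, slowServerSyncLimit); err != nil {
			fmt.Println("server is still unable to catch up:", err)
			return
		}
		fmt.Println("caught up during backoff: dequeue the next eval")
	}
}
```

The point of the second, longer wait is that it is bounded but returns early, so a server that catches up quickly goes straight back to dequeuing rather than sleeping for a fixed interval.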

When the server shuts down, we'll get a `context.Canceled` error from the state
store method. We need to bubble this error up so that callers can detect it.
Handle this case separately when waiting after dequeue so that we can warn on
shutdown instead of emitting an ambiguous error message containing only the
text "canceled".

While there may be more precise ways to block scheduling until the
server catches up, this approach adds little risk and covers additional
cases where a server may be temporarily behind due to a spike in load or
a saturated network.

For testing, we make `raftSyncLimit` a parameter of the worker's `run` method
so that we can exercise the backoff path without waiting 30+ seconds. We haven't
followed through and turned all of the worker's package-level globals into
parameters, since there isn't much use for that outside of testing, but we can
consider it in the future.
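
The change is plain dependency injection of the timeout; roughly (simplified stand-in types, and the 30-second default is only a placeholder for the real package-level value):

```go
package main

import (
	"fmt"
	"time"
)

// Package-level default used by the production path (placeholder value).
var raftSyncLimit = 30 * time.Second

type worker struct{}

// Start is the production entry point and passes the package-level limit.
func (w *worker) Start() { go w.run(raftSyncLimit) }

// run accepts the limit as a parameter so tests can shrink it.
func (w *worker) run(syncLimit time.Duration) {
	fmt.Println("waiting for raft with limit", syncLimit)
	// ... dequeue, snapshotMinIndex(waitIndex, syncLimit), invoke scheduler ...
}

func main() {
	w := &worker{}
	w.run(time.Millisecond) // what the backoff test does instead of waiting 30+ seconds
}
```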

Co-authored-by: Tim Gross <tgross@hashicorp.com>
Michael Schurter, 2023-01-24 05:56:35 -08:00, committed by GitHub
commit ace5faf948, parent 92c7d96e0a
3 changed files with 115 additions and 17 deletions


@@ -242,7 +242,7 @@ func (s *StateStore) SnapshotMinIndex(ctx context.Context, index uint64) (*State
 	// Get the states current index
 	snapshotIndex, err := s.LatestIndex()
 	if err != nil {
-		return nil, fmt.Errorf("failed to determine state store's index: %v", err)
+		return nil, fmt.Errorf("failed to determine state store's index: %w", err)
 	}
 
 	// We only need the FSM state to be as recent as the given index


@@ -3,6 +3,7 @@ package nomad
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"strings"
 	"sync"
@@ -148,7 +149,7 @@ func (w *Worker) ID() string {
 // to see if it paused using IsStarted()
 func (w *Worker) Start() {
 	w.setStatus(WorkerStarting)
-	go w.run()
+	go w.run(raftSyncLimit)
 }
 
 // Pause transitions a worker to the pausing state. Check
@@ -383,7 +384,7 @@ func (w *Worker) workerShuttingDown() bool {
 // ----------------------------------
 // run is the long-lived goroutine which is used to run the worker
-func (w *Worker) run() {
+func (w *Worker) run(raftSyncLimit time.Duration) {
 	defer func() {
 		w.markStopped()
 	}()
@@ -401,11 +402,12 @@ func (w *Worker) run() {
 			return
 		}
 
-		// since dequeue takes time, we could have shutdown the server after getting an eval that
-		// needs to be nacked before we exit. Explicitly checking the server to allow this eval
-		// to be processed on worker shutdown.
+		// since dequeue takes time, we could have shutdown the server after
+		// getting an eval that needs to be nacked before we exit. Explicitly
+		// check the server whether to allow this eval to be processed.
 		if w.srv.IsShutdown() {
-			w.logger.Error("nacking eval because the server is shutting down", "eval", log.Fmt("%#v", eval))
+			w.logger.Warn("nacking eval because the server is shutting down",
+				"eval", log.Fmt("%#v", eval))
 			w.sendNack(eval, token)
 			return
 		}
@@ -414,8 +416,34 @@ func (w *Worker) run() {
 		w.setWorkloadStatus(WorkloadWaitingForRaft)
 		snap, err := w.snapshotMinIndex(waitIndex, raftSyncLimit)
 		if err != nil {
-			w.logger.Error("error waiting for Raft index", "error", err, "index", waitIndex)
-			w.sendNack(eval, token)
+			var timeoutErr ErrMinIndexDeadlineExceeded
+			if errors.As(err, &timeoutErr) {
+				w.logger.Warn("timeout waiting for Raft index required by eval",
+					"eval", eval.ID, "index", waitIndex, "timeout", raftSyncLimit)
+				w.sendNack(eval, token)
+
+				// Timing out above means this server is woefully behind the
+				// leader's index. This can happen when a new server is added to
+				// a cluster and must initially sync the cluster state.
+				// Backoff dequeuing another eval until there's some indication
+				// this server would be up to date enough to process it.
+				slowServerSyncLimit := 10 * raftSyncLimit
+				if _, err := w.snapshotMinIndex(waitIndex, slowServerSyncLimit); err != nil {
+					w.logger.Warn("server is unable to catch up to last eval's index", "error", err)
+				}
+
+			} else if errors.Is(err, context.Canceled) {
+				// If the server has shutdown while we're waiting, we'll get the
+				// Canceled error from the worker's context. We need to nack any
+				// dequeued evals before we exit.
+				w.logger.Warn("nacking eval because the server is shutting down", "eval", eval.ID)
+				w.sendNack(eval, token)
+				return
+			} else {
+				w.logger.Error("error waiting for Raft index", "error", err, "index", waitIndex)
+				w.sendNack(eval, token)
+			}
+
 			continue
 		}
@@ -533,17 +561,35 @@ func (w *Worker) sendAck(eval *structs.Evaluation, token string) {
 	w.sendAcknowledgement(eval, token, true)
 }
 
+type ErrMinIndexDeadlineExceeded struct {
+	waitIndex uint64
+	timeout   time.Duration
+}
+
+// Unwrapping an ErrMinIndexDeadlineExceeded always return
+// context.DeadlineExceeded
+func (ErrMinIndexDeadlineExceeded) Unwrap() error {
+	return context.DeadlineExceeded
+}
+
+func (e ErrMinIndexDeadlineExceeded) Error() string {
+	return fmt.Sprintf("timed out after %s waiting for index=%d", e.timeout, e.waitIndex)
+}
+
 // snapshotMinIndex times calls to StateStore.SnapshotAfter which may block.
 func (w *Worker) snapshotMinIndex(waitIndex uint64, timeout time.Duration) (*state.StateSnapshot, error) {
-	defer metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, time.Now())
+	start := time.Now()
 	ctx, cancel := context.WithTimeout(w.ctx, timeout)
 	snap, err := w.srv.fsm.State().SnapshotMinIndex(ctx, waitIndex)
 	cancel()
+	metrics.MeasureSince([]string{"nomad", "worker", "wait_for_index"}, start)
 
-	// Wrap error to ensure callers don't disregard timeouts.
-	if err == context.DeadlineExceeded {
-		err = fmt.Errorf("timed out after %s waiting for index=%d", timeout, waitIndex)
+	// Wrap error to ensure callers can detect timeouts.
+	if errors.Is(err, context.DeadlineExceeded) {
+		return nil, ErrMinIndexDeadlineExceeded{
+			waitIndex: waitIndex,
+			timeout:   timeout,
+		}
 	}
 
 	return snap, err


@@ -2,6 +2,7 @@ package nomad
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"reflect"
 	"sync"
@@ -11,6 +12,7 @@ import (
 	log "github.com/hashicorp/go-hclog"
 	"github.com/hashicorp/go-memdb"
 	"github.com/hashicorp/nomad/ci"
+	"github.com/hashicorp/nomad/helper"
 	"github.com/shoenig/test/must"
 	"github.com/stretchr/testify/require"
@@ -55,9 +57,10 @@ func init() {
 // NewTestWorker returns the worker without calling it's run method.
 func NewTestWorker(shutdownCtx context.Context, srv *Server) *Worker {
 	w := &Worker{
-		srv:   srv,
-		start: time.Now(),
-		id:    uuid.Generate(),
+		srv:               srv,
+		start:             time.Now(),
+		id:                uuid.Generate(),
+		enabledSchedulers: srv.config.EnabledSchedulers,
 	}
 	w.logger = srv.logger.ResetNamed("worker").With("worker_id", w.id)
 	w.pauseCond = sync.NewCond(&w.pauseLock)
@@ -338,6 +341,54 @@ func TestWorker_sendAck(t *testing.T) {
 	}
 }
 
+func TestWorker_runBackoff(t *testing.T) {
+	ci.Parallel(t)
+
+	srv, cleanupSrv := TestServer(t, func(c *Config) {
+		c.NumSchedulers = 0
+		c.EnabledSchedulers = []string{structs.JobTypeService}
+	})
+	defer cleanupSrv()
+	testutil.WaitForLeader(t, srv.RPC)
+
+	eval1 := mock.Eval()
+	eval1.ModifyIndex = 1000
+	srv.evalBroker.Enqueue(eval1)
+	must.Eq(t, 1, srv.evalBroker.Stats().TotalReady)
+
+	// make a new context here so we can still check the broker's state after
+	// we've shut down the worker
+	workerCtx, workerCancel := context.WithCancel(srv.shutdownCtx)
+	defer workerCancel()
+
+	w := NewTestWorker(workerCtx, srv)
+	doneCh := make(chan struct{})
+
+	go func() {
+		w.run(time.Millisecond)
+		doneCh <- struct{}{}
+	}()
+
+	// We expect to be paused for 10ms + 1ms but otherwise can't be all that
+	// precise here because of concurrency. But checking coverage for this test
+	// shows we've covered the logic
+	t1, cancelT1 := helper.NewSafeTimer(100 * time.Millisecond)
+	defer cancelT1()
+	select {
+	case <-doneCh:
+		t.Fatal("returned early")
+	case <-t1.C:
+	}
+
+	workerCancel()
+	<-doneCh
+
+	must.Eq(t, 1, srv.evalBroker.Stats().TotalWaiting)
+	must.Eq(t, 0, srv.evalBroker.Stats().TotalReady)
+	must.Eq(t, 0, srv.evalBroker.Stats().TotalPending)
+	must.Eq(t, 0, srv.evalBroker.Stats().TotalUnacked)
+}
+
 func TestWorker_waitForIndex(t *testing.T) {
 	ci.Parallel(t)
@@ -376,6 +427,7 @@ func TestWorker_waitForIndex(t *testing.T) {
 	require.Nil(t, snap)
 	require.EqualError(t, err,
 		fmt.Sprintf("timed out after %s waiting for index=%d", timeout, waitIndex))
+	require.True(t, errors.Is(err, context.DeadlineExceeded), "expect error to wrap DeadlineExceeded")
 }
 
 func TestWorker_invokeScheduler(t *testing.T) {